Symaptic: os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/ubrk.h@260cb5ec6c19

     1 /*

     2 * Copyright (C) 1996-2005, International Business Machines Corporation and others. All Rights Reserved.

     3 *****************************************************************************************

     4 */

     6 #ifndef UBRK_H

     7 #define UBRK_H

     9 #include "unicode/utypes.h"

    10 #include "unicode/uloc.h"

    11 #include "unicode/utext.h"

    13 /**

    14  * A text-break iterator.

    15  *  For usage in C programs.

    16  */

    17 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR

    18 #   define UBRK_TYPEDEF_UBREAK_ITERATOR

    19     /**

    20      *  Opaque type representing an ICU Break iterator object.

    21      *  @stable ICU 2.0

    22      */

    23     typedef void UBreakIterator;

    24 #endif

    26 #if !UCONFIG_NO_BREAK_ITERATION

    28 #include "unicode/parseerr.h"

    30 /**

    31  * \file

    32  * \brief C API: BreakIterator

    33  *

    34  * <h2> BreakIterator C API </h2>

    35  *

    36  * The BreakIterator C API defines  methods for finding the location

    37  * of boundaries in text. A UBreakIterator maintains a

    38  * current position and scans over text, returning the index of characters

    39  * where boundaries occur.

    40  * <P>

    41  * Line boundary analysis determines where a text string can be broken

    42  * when line-wrapping. The mechanism correctly handles punctuation and

    43  * hyphenated words.

    44  * <P>

    45  * Sentence boundary analysis allows selection with correct

    46  * interpretation of periods within numbers and abbreviations, and

    47  * trailing punctuation marks such as quotation marks and parentheses.

    48  * <P>

    49  * Word boundary analysis is used by search and replace functions, as

    50  * well as within text editing applications that allow the user to

    51  * select words with a double click. Word selection provides correct

    52  * interpretation of punctuation marks within and following

    53  * words. Characters that are not part of a word, such as symbols or

    54  * punctuation marks, have word-breaks on both sides.

    55  * <P>

    56  * Character boundary analysis allows users to interact with

    57  * characters as they expect to, for example, when moving the cursor

    58  * through a text string. Character boundary analysis provides correct

    59  * navigation through character strings, regardless of how the

    60  * character is stored.  For example, an accented character might be

    61  * stored as a base character and a diacritical mark. What users

    62  * consider to be a character can differ between languages.

    63  * <P>

    64  * Title boundary analysis locates all positions,

    65  * typically starts of words, that should be set to Title Case

    66  * when title casing the text.

    67  * <P>

    68  *

    69  * This is the interface for all text boundaries.

    70  * <P>

    71  * Examples:

    72  * <P>

    73  * Helper function to output text

    74  * <pre>

    75  * \code

    76  *    void printTextRange(UChar* str, int32_t start, int32_t end ) {

    77  *         UChar* result;

    78  *         UChar* temp;

    79  *         char* res;

    80  *         temp=(UChar*)malloc(sizeof(UChar) * ((u_strlen(str)-start)+1));

    81  *         result=(UChar*)malloc(sizeof(UChar) * ((end-start)+1));

    82  *         u_strcpy(temp, &str[start]);

    83  *         u_strncpy(result, temp, end-start); result[end-start] = 0;

    84  *         res=(char*)malloc(sizeof(char) * (u_strlen(result)+1));

    85  *         u_austrcpy(res, result);

    86  *         printf("%s\n", res);

    87  *    }

    88  * \endcode

    89  * </pre>

    90  * Print each element in order:

    91  * <pre>

    92  * \code

    93  *    void printEachForward( UBreakIterator* boundary, UChar* str) {

    94  *       int32_t end;

    95  *       int32_t start = ubrk_first(boundary);

    96  *       for (end = ubrk_next(boundary); end != UBRK_DONE; start = end, end = ubrk_next(boundary)) {

    97  *             printTextRange(str, start, end );

    98  *         }

    99  *    }

   100  * \endcode

   101  * </pre>

   102  * Print each element in reverse order:

   103  * <pre>

   104  * \code

   105  *    void printEachBackward( UBreakIterator* boundary, UChar* str) {

   106  *       int32_t start;

   107  *       int32_t end = ubrk_last(boundary);

   108  *       for (start = ubrk_previous(boundary); start != UBRK_DONE;  end = start, start =ubrk_previous(boundary)) {

   109  *             printTextRange( str, start, end );

   110  *         }

   111  *    }

   112  * \endcode

   113  * </pre>

   114  * Print first element

   115  * <pre>

   116  * \code

   117  *    void printFirst(UBreakIterator* boundary, UChar* str) {

   118  *        int32_t end;

   119  *        int32_t start = ubrk_first(boundary);

   120  *        end = ubrk_next(boundary);

   121  *        printTextRange( str, start, end );

   122  *    }

   123  * \endcode

   124  * </pre>

   125  * Print last element

   126  * <pre>

   127  * \code

   128  *    void printLast(UBreakIterator* boundary, UChar* str) {

   129  *        int32_t start;

   130  *        int32_t end = ubrk_last(boundary);

   131  *        start = ubrk_previous(boundary);

   132  *        printTextRange(str, start, end );

   133  *    }

   134  * \endcode

   135  * </pre>

   136  * Print the element at a specified position

   137  * <pre>

   138  * \code

   139  *    void printAt(UBreakIterator* boundary, int32_t pos , UChar* str) {

   140  *        int32_t start;

   141  *        int32_t end = ubrk_following(boundary, pos);

   142  *        start = ubrk_previous(boundary);

   143  *        printTextRange(str, start, end );

   144  *    }

   145  * \endcode

   146  * </pre>

   147  * Creating and using text boundaries

   148  * <pre>

   149  * \code

   150  *       void BreakIterator_Example( void ) {

   151  *           UBreakIterator* boundary;

   152  *           UChar *stringToExamine; UErrorCode status = U_ZERO_ERROR;

   153  *           stringToExamine=(UChar*)malloc(sizeof(UChar) * (strlen("Aaa bbb ccc. Ddd eee fff.")+1) );

   154  *           u_uastrcpy(stringToExamine, "Aaa bbb ccc. Ddd eee fff.");

   155  *           printf("Examining: Aaa bbb ccc. Ddd eee fff.\n");

   156  *

   157  *           //print each sentence in forward and reverse order

   158  *           boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, u_strlen(stringToExamine), &status);

   159  *           printf("----- forward: -----------\n");

   160  *           printEachForward(boundary, stringToExamine);

   161  *           printf("----- backward: ----------\n");

   162  *           printEachBackward(boundary, stringToExamine);

   163  *           ubrk_close(boundary);

   164  *

   165  *           //print each word in order

   166  *           boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status);

   167  *           printf("----- forward: -----------\n");

   168  *           printEachForward(boundary, stringToExamine);

   169  *           printf("----- backward: ----------\n");

   170  *           printEachBackward(boundary, stringToExamine);

   171  *           //print first element

   172  *           printf("----- first: -------------\n");

   173  *           printFirst(boundary, stringToExamine);

   174  *           //print last element

   175  *           printf("----- last: --------------\n");

   176  *           printLast(boundary, stringToExamine);

   177  *           //print word at charpos 10

   178  *           printf("----- at pos 10: ---------\n");

   179  *           printAt(boundary, 10 , stringToExamine);

   180  *

   181  *           ubrk_close(boundary);

   182  *       }

   183  * \endcode

   184  * </pre>

   185  */

   187 /** The possible types of text boundaries, as passed to ubrk_open().  @stable ICU 2.0 */

   188 typedef enum UBreakIteratorType {

   189   /** Character breaks  @stable ICU 2.0 */

   190   UBRK_CHARACTER,

   191   /** Word breaks @stable ICU 2.0 */

   192   UBRK_WORD,

   193   /** Line breaks @stable ICU 2.0 */

   194   UBRK_LINE,

   195   /** Sentence breaks @stable ICU 2.0 */

   196   UBRK_SENTENCE,

   198 #ifndef U_HIDE_DEPRECATED_API

   199   /**

   200    * Title Case breaks

   201    * The iterator created using this type locates title boundaries as described for

   202    * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,

   203    * please use the word boundary iterator (UBRK_WORD).

   204    *

   205    * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.

   206    */

   207   UBRK_TITLE

   208 #endif /* U_HIDE_DEPRECATED_API -- NOTE(review): when this is defined, UBRK_SENTENCE above is left with a trailing comma, which C89/C++98 reject */

   210 } UBreakIteratorType;

   212 /** Value indicating all text boundaries have been returned.

   213  *  @stable ICU 2.0

   214  */

   215 #define UBRK_DONE ((int32_t) -1)

   218 /**

   219  *  Enum constants for the word break tags returned by

   220  *  ubrk_getRuleStatus().  A range of values is defined for each category of

   221  *  word, to allow for further subdivisions of a category in future releases.

   222  *  Applications should check for tag values falling within the range, rather

   223  *  than for single individual values.  Each _LIMIT value equals the lower bound of the following category, so a range is: lower bound <= tag < _LIMIT.

   224  *  @stable ICU 2.2

   225 */

   226 typedef enum UWordBreak {

   227     /** Tag value for "words" that do not fit into any of the other categories.

   228      *  Includes spaces and most punctuation. */

   229     UBRK_WORD_NONE           = 0,

   230     /** Upper bound for tags for uncategorized words. */

   231     UBRK_WORD_NONE_LIMIT     = 100,

   232     /** Tag value for words that appear to be numbers, lower limit.    */

   233     UBRK_WORD_NUMBER         = 100,

   234     /** Tag value for words that appear to be numbers, upper limit.    */

   235     UBRK_WORD_NUMBER_LIMIT   = 200,

   236     /** Tag value for words that contain letters, excluding

   237      *  hiragana, katakana or ideographic characters, lower limit.    */

   238     UBRK_WORD_LETTER         = 200,

   239     /** Tag value for words containing letters, upper limit  */

   240     UBRK_WORD_LETTER_LIMIT   = 300,

   241     /** Tag value for words containing kana characters, lower limit */

   242     UBRK_WORD_KANA           = 300,

   243     /** Tag value for words containing kana characters, upper limit */

   244     UBRK_WORD_KANA_LIMIT     = 400,

   245     /** Tag value for words containing ideographic characters, lower limit */

   246     UBRK_WORD_IDEO           = 400,

   247     /** Tag value for words containing ideographic characters, upper limit */

   248     UBRK_WORD_IDEO_LIMIT     = 500

   249 } UWordBreak;

   251 /**

   252  *  Enum constants for the line break tags returned by ubrk_getRuleStatus().

   253  *  A range of values is defined for each category of

   254  *  line break, to allow for further subdivisions of a category in future releases.

   255  *  Applications should check for tag values falling within the range, rather

   256  *  than for single individual values.

   257  *  @stable ICU 2.8

   258 */

   259 typedef enum ULineBreakTag {

   260     /** Tag value for soft line breaks, positions at which a line break

   261       *  is acceptable but not required                */

   262     UBRK_LINE_SOFT            = 0,

   263     /** Upper bound for soft line breaks.              */

   264     UBRK_LINE_SOFT_LIMIT      = 100,

   265     /** Tag value for a hard, or mandatory line break  */

   266     UBRK_LINE_HARD            = 100,

   267     /** Upper bound for hard line breaks.              */

   268     UBRK_LINE_HARD_LIMIT      = 200

   269 } ULineBreakTag;

   273 /**

   274  *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().

   275  *  A range of values is defined for each category of

   276  *  sentence, to allow for further subdivisions of a category in future releases.

   277  *  Applications should check for tag values falling within the range, rather

   278  *  than for single individual values.

   279  *  @stable ICU 2.8

   280 */

   281 typedef enum USentenceBreakTag {

   282     /** Tag value for sentences ending with a sentence terminator

   283       * ('.', '?', '!', etc.) character, possibly followed by a

   284       * hard separator (CR, LF, PS, etc.)

   285       */

   286     UBRK_SENTENCE_TERM       = 0,

   287     /** Upper bound for tags for sentences ended by sentence terminators.    */

   288     UBRK_SENTENCE_TERM_LIMIT = 100,

   289     /** Tag value for sentences that do not contain an ending

   290       * sentence terminator ('.', '?', '!', etc.) character, but

   291       * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.

   292       */

   293     UBRK_SENTENCE_SEP        = 100,

   294     /** Upper bound for tags for sentences ended by a separator.              */

   295     UBRK_SENTENCE_SEP_LIMIT  = 200

   296     /* End of the sentence break tags. */

   297 } USentenceBreakTag;

   300 /**

   301  * Open a new UBreakIterator for locating text boundaries for a specified locale.

   302  * A UBreakIterator may be used for detecting character, line, word,

   303  * and sentence breaks in text.

   304  * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,

   305  * UBRK_LINE, UBRK_SENTENCE

   306  * @param locale The locale specifying the text-breaking conventions.

   307  * @param text The text to be iterated over.

   308  * @param textLength The number of characters in text, or -1 if null-terminated.

   309  * @param status A UErrorCode to receive any errors.

   310  * @return A UBreakIterator for the specified locale.

   311  * @see ubrk_openRules

   312  * @stable ICU 2.0

   313  */

   314 U_STABLE UBreakIterator* U_EXPORT2

   315 ubrk_open(UBreakIteratorType type,

   316       const char *locale,

   317       const UChar *text,

   318       int32_t textLength,

   319       UErrorCode *status);

   321 /**

   322  * Open a new UBreakIterator for locating text boundaries using specified breaking rules.

   323  * The rule syntax is ... (TBD)

   324  * @param rules A set of rules specifying the text breaking conventions.

   325  * @param rulesLength The number of characters in rules, or -1 if null-terminated.

   326  * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is

   327  *        used to specify the text to be iterated.

   328  * @param textLength The number of characters in text, or -1 if null-terminated.

   329  * @param parseErr   Receives position and context information for any syntax errors

   330  *                   detected while parsing the rules.

   331  * @param status A UErrorCode to receive any errors.

   332  * @return A UBreakIterator for the specified rules.

   333  * @see ubrk_open

   334  * @stable ICU 2.2

   335  */

   336 U_STABLE UBreakIterator* U_EXPORT2

   337 ubrk_openRules(const UChar     *rules,

   338                int32_t         rulesLength,

   339                const UChar     *text,

   340                int32_t          textLength,

   341                UParseError     *parseErr,

   342                UErrorCode      *status);

   344 /**

   345  * Thread safe cloning operation

   346  * @param bi iterator to be cloned

   347  * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.

   348  *  If buffer is not large enough, new memory will be allocated.

   349  *  Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.

   350  * @param pBufferSize pointer to size of allocated space.

   351  *  If *pBufferSize == 0, a sufficient size for use in cloning will

   352  *  be returned ('pre-flighting')

   353  *  If *pBufferSize is not enough for a stack-based safe clone,

   354  *  new memory will be allocated.

   355  * @param status to indicate whether the operation went on smoothly or there were errors

   356  *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.

   357  * @return pointer to the new clone

   358  * @stable ICU 2.0

   359  */

   360 U_STABLE UBreakIterator * U_EXPORT2

   361 ubrk_safeClone(

   362           const UBreakIterator *bi,

   363           void *stackBuffer,

   364           int32_t *pBufferSize,

   365           UErrorCode *status);

   367 /**

   368   * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().

   369   * @stable ICU 2.0

   370   */

   371 #define U_BRK_SAFECLONE_BUFFERSIZE 512

   373 /**

   374 * Close a UBreakIterator.

   375 * Once closed, a UBreakIterator may no longer be used.

   376 * @param bi The break iterator to close.

   377  * @stable ICU 2.0

   378 */

   379 U_STABLE void U_EXPORT2

   380 ubrk_close(UBreakIterator *bi);

   382 /**

   383  * Sets an existing iterator to point to a new piece of text

   384  * @param bi The iterator to use

   385  * @param text The text to be set

   386  * @param textLength The length of the text

   387  * @param status The error code

   388  * @stable ICU 2.0

   389  */

   390 U_STABLE void U_EXPORT2

   391 ubrk_setText(UBreakIterator* bi,

   392              const UChar*    text,

   393              int32_t         textLength,

   394              UErrorCode*     status);

   397 /**

   398  * Sets an existing iterator to point to a new piece of text, supplied as a UText

   399  * @param bi The iterator to use

   400  * @param text The text to be set

   401  * @param status The error code

   402  * @draft ICU 3.4

   403  */

   404 U_DRAFT void U_EXPORT2

   405 ubrk_setUText(UBreakIterator* bi,

   406              UText*          text,

   407              UErrorCode*     status);

   411 /**

   412  * Determine the most recently-returned text boundary.

   413  *

   414  * @param bi The break iterator to use.

   415  * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,

   416  * \ref ubrk_first, or \ref ubrk_last.

   417  * @stable ICU 2.0

   418  */

   419 U_STABLE int32_t U_EXPORT2

   420 ubrk_current(const UBreakIterator *bi);

   422 /**

   423  * Determine the text boundary following the current text boundary.

   424  *

   425  * @param bi The break iterator to use.

   426  * @return The character index of the next text boundary, or UBRK_DONE

   427  * if all text boundaries have been returned.

   428  * @see ubrk_previous

   429  * @stable ICU 2.0

   430  */

   431 U_STABLE int32_t U_EXPORT2

   432 ubrk_next(UBreakIterator *bi);

   434 /**

   435  * Determine the text boundary preceding the current text boundary.

   436  *

   437  * @param bi The break iterator to use.

   438  * @return The character index of the preceding text boundary, or UBRK_DONE

   439  * if all text boundaries have been returned.

   440  * @see ubrk_next

   441  * @stable ICU 2.0

   442  */

   443 U_STABLE int32_t U_EXPORT2

   444 ubrk_previous(UBreakIterator *bi);

   446 /**

   447  * Determine the index of the first character in the text being scanned.

   448  * This is not always the same as index 0 of the text.

   449  * @param bi The break iterator to use.

   450  * @return The character index of the first character in the text being scanned.

   451  * @see ubrk_last

   452  * @stable ICU 2.0

   453  */

   454 U_STABLE int32_t U_EXPORT2

   455 ubrk_first(UBreakIterator *bi);

   457 /**

   458  * Determine the index immediately <EM>beyond</EM> the last character in the text being

   459  * scanned.

   460  * This is not the same as the last character.

   461  * @param bi The break iterator to use.

   462  * @return The character offset immediately <EM>beyond</EM> the last character in the

   463  * text being scanned.

   464  * @see ubrk_first

   465  * @stable ICU 2.0

   466  */

   467 U_STABLE int32_t U_EXPORT2

   468 ubrk_last(UBreakIterator *bi);

   470 /**

   471  * Determine the text boundary preceding the specified offset.

   472  * The value returned is always smaller than offset, or UBRK_DONE.

   473  * @param bi The break iterator to use.

   474  * @param offset The offset to begin scanning.

   475  * @return The text boundary preceding offset, or UBRK_DONE.

   476  * @see ubrk_following

   477  * @stable ICU 2.0

   478  */

   479 U_STABLE int32_t U_EXPORT2

   480 ubrk_preceding(UBreakIterator *bi,

   481            int32_t offset);

   483 /**

   484  * Determine the text boundary following the specified offset.

   485  * The value returned is always greater than offset, or UBRK_DONE.

   486  * @param bi The break iterator to use.

   487  * @param offset The offset to begin scanning.

   488  * @return The text boundary following offset, or UBRK_DONE.

   489  * @see ubrk_preceding

   490  * @stable ICU 2.0

   491  */

   492 U_STABLE int32_t U_EXPORT2

   493 ubrk_following(UBreakIterator *bi,

   494            int32_t offset);

   496 /**

   497 * Get a locale for which text breaking information is available.

   498 * A UBreakIterator in a locale returned by this function will perform the correct

   499 * text breaking for the locale.

   500 * @param index The index of the desired locale.

   501 * @return A locale for which text breaking information is available, or 0 if none.

   502 * @see ubrk_countAvailable

   503 * @stable ICU 2.0

   504 */

   505 U_STABLE const char* U_EXPORT2

   506 ubrk_getAvailable(int32_t index);

   508 /**

   509 * Determine how many locales have text breaking information available.

   510 * This function is most useful for determining the loop ending condition for

   511 * calls to \ref ubrk_getAvailable.

   512 * @return The number of locales for which text breaking information is available.

   513 * @see ubrk_getAvailable

   514 * @stable ICU 2.0

   515 */

   516 U_STABLE int32_t U_EXPORT2

   517 ubrk_countAvailable(void);

   520 /**

   521 * Returns true if the specified position is a boundary position.  As a side

   522 * effect, leaves the iterator pointing to the first boundary position at

   523 * or after "offset".

   524 * @param bi The break iterator to use.

   525 * @param offset the offset to check.

   526 * @return True if "offset" is a boundary position.

   527 * @stable ICU 2.0

   528 */

   529 U_STABLE  UBool U_EXPORT2

   530 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);

   532 /**

   533  * Return the status from the break rule that determined the most recently

   534  * returned break position.  The values appear in the rule source

   535  * within brackets, {123}, for example.  For rules that do not specify a

   536  * status, a default value of 0 is returned.

   537  * <p>

   538  * For word break iterators, the possible values are defined in enum UWordBreak.

   539  * @stable ICU 2.2

   540  */

   541 U_STABLE  int32_t U_EXPORT2

   542 ubrk_getRuleStatus(UBreakIterator *bi);

   544 /**

   545  * Get the statuses from the break rules that determined the most recently

   546  * returned break position.  The values appear in the rule source

   547  * within brackets, {123}, for example.  The default status value for rules

   548  * that do not explicitly provide one is zero.

   549  * <p>

   550  * For word break iterators, the possible values are defined in enum UWordBreak.

   551  * @param bi        The break iterator to use

   552  * @param fillInVec an array to be filled in with the status values.

   553  * @param capacity  the length of the supplied vector.  A length of zero causes

   554  *                  the function to return the number of status values, in the

   555  *                  normal way, without attempting to store any values.

   556  * @param status    receives error codes.

   557  * @return          The number of rule status values from rules that determined

   558  *                  the most recent boundary returned by the break iterator.

   559  * @draft ICU 3.0

   560  */

   561 U_DRAFT  int32_t U_EXPORT2

   562 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);

   564 /**

   565  * Return the locale of the break iterator. You can choose between the valid and

   566  * the actual locale.

   567  * @param bi break iterator

   568  * @param type locale type (valid or actual)

   569  * @param status error code

   570  * @return locale string

   571  * @draft ICU 2.8 likely to change after ICU 3.0, based on feedback

   572  */

   573 U_DRAFT const char* U_EXPORT2

   574 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);

   577 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

   579 #endif

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--