os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/ubrk.h
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 * Copyright (C) 1996-2005, International Business Machines Corporation and others. All Rights Reserved.
     3 *****************************************************************************************
     4 */
     5 
     6 #ifndef UBRK_H
     7 #define UBRK_H
     8 
     9 #include "unicode/utypes.h"
    10 #include "unicode/uloc.h"
    11 #include "unicode/utext.h"
    12 
    13 /**
    14  * A text-break iterator, for use in C programs.
    15  * The UBRK_TYPEDEF_UBREAK_ITERATOR guard ensures the typedef below is declared only once.
    16  */
    17 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
    18 #   define UBRK_TYPEDEF_UBREAK_ITERATOR
    19     /**
    20      *  Opaque type (a typedef of void) representing an ICU break iterator object.
    21      *  @stable ICU 2.0
    22      */
    23     typedef void UBreakIterator;
    24 #endif
    25 
    26 #if !UCONFIG_NO_BREAK_ITERATION
    27 
    28 #include "unicode/parseerr.h"
    29 
    30 /**
    31  * \file
    32  * \brief C API: BreakIterator
    33  *
    34  * <h2> BreakIterator C API </h2>
    35  *
    36  * The BreakIterator C API defines methods for finding the location
    37  * of boundaries in text. A UBreakIterator, accessed through a pointer,
    38  * maintains a current position and scans over text, returning the index
    39  * of characters where boundaries occur.
    40  * <P>
    41  * Line boundary analysis determines where a text string can be broken
    42  * when line-wrapping. The mechanism correctly handles punctuation and
    43  * hyphenated words.
    44  * <P>
    45  * Sentence boundary analysis allows selection with correct
    46  * interpretation of periods within numbers and abbreviations, and
    47  * trailing punctuation marks such as quotation marks and parentheses.
    48  * <P>
    49  * Word boundary analysis is used by search and replace functions, as
    50  * well as within text editing applications that allow the user to
    51  * select words with a double click. Word selection provides correct
    52  * interpretation of punctuation marks within and following
    53  * words. Characters that are not part of a word, such as symbols or
    54  * punctuation marks, have word-breaks on both sides.
    55  * <P>
    56  * Character boundary analysis allows users to interact with
    57  * characters as they expect to, for example, when moving the cursor
    58  * through a text string. Character boundary analysis provides correct
    59  * navigation through character strings, regardless of how the
    60  * character is stored.  For example, an accented character might be
    61  * stored as a base character and a diacritical mark. What users
    62  * consider to be a character can differ between languages.
    63  * <P>
    64  * Title boundary analysis locates all positions,
    65  * typically starts of words, that should be set to Title Case
    66  * when title casing the text.
    67  * <P>
    68  *
    69  * This is the interface for all text boundaries.
    70  * <P>
    71  * Examples:
    72  * <P>
    73  * Helper function to output text
    74  * <pre>
    75  * \code
    76  *    void printTextRange(UChar* str, int32_t start, int32_t end ) {
    77  *         UChar* result;
    78  *         char* res;
    79  *         result=(UChar*)malloc(sizeof(UChar) * ((end-start)+1));
    80  *         u_strncpy(result, &str[start], end-start);
    81  *         result[end-start]=0;
    82  *         res=(char*)malloc(sizeof(char) * (u_strlen(result)+1));
    83  *         u_austrcpy(res, result);
    84  *         printf("%s\n", res);
    85  *         free(res);
    86  *         free(result);
    87  *    }
    88  * \endcode
    89  * </pre>
    90  * Print each element in order:
    91  * <pre>
    92  * \code
    93  *    void printEachForward( UBreakIterator* boundary, UChar* str) {
    94  *       int32_t end;
    95  *       int32_t start = ubrk_first(boundary);
    96  *       for (end = ubrk_next(boundary); end != UBRK_DONE; start = end, end = ubrk_next(boundary)) {
    97  *             printTextRange(str, start, end );
    98  *         }
    99  *    }
   100  * \endcode
   101  * </pre>
   102  * Print each element in reverse order:
   103  * <pre>
   104  * \code
   105  *    void printEachBackward( UBreakIterator* boundary, UChar* str) {
   106  *       int32_t start;
   107  *       int32_t end = ubrk_last(boundary);
   108  *       for (start = ubrk_previous(boundary); start != UBRK_DONE;  end = start, start =ubrk_previous(boundary)) {
   109  *             printTextRange( str, start, end );
   110  *         }
   111  *    }
   112  * \endcode
   113  * </pre>
   114  * Print first element
   115  * <pre>
   116  * \code
   117  *    void printFirst(UBreakIterator* boundary, UChar* str) {
   118  *        int32_t end;
   119  *        int32_t start = ubrk_first(boundary);
   120  *        end = ubrk_next(boundary);
   121  *        printTextRange( str, start, end );
   122  *    }
   123  * \endcode
   124  * </pre>
   125  * Print last element
   126  * <pre>
   127  * \code
   128  *    void printLast(UBreakIterator* boundary, UChar* str) {
   129  *        int32_t start;
   130  *        int32_t end = ubrk_last(boundary);
   131  *        start = ubrk_previous(boundary);
   132  *        printTextRange(str, start, end );
   133  *    }
   134  * \endcode
   135  * </pre>
   136  * Print the element at a specified position
   137  * <pre>
   138  * \code
   139  *    void printAt(UBreakIterator* boundary, int32_t pos , UChar* str) {
   140  *        int32_t start;
   141  *        int32_t end = ubrk_following(boundary, pos);
   142  *        start = ubrk_previous(boundary);
   143  *        printTextRange(str, start, end );
   144  *    }
   145  * \endcode
   146  * </pre>
   147  * Creating and using text boundaries
   148  * <pre>
   149  * \code
   150  *       void BreakIterator_Example( void ) {
   151  *           UBreakIterator* boundary;
   152  *           UErrorCode status = U_ZERO_ERROR;
   153  *           UChar *stringToExamine;
   154  *           stringToExamine=(UChar*)malloc(sizeof(UChar) * (strlen("Aaa bbb ccc. Ddd eee fff.")+1) );
   155  *           u_uastrcpy(stringToExamine, "Aaa bbb ccc. Ddd eee fff.");
   156  *           printf("Examining: \"Aaa bbb ccc. Ddd eee fff.\"\n");
   157  *           //print each sentence in forward and reverse order
   158  *           boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
   159  *           printf("----- forward: -----------\n");
   160  *           printEachForward(boundary, stringToExamine);
   161  *           printf("----- backward: ----------\n");
   162  *           printEachBackward(boundary, stringToExamine);
   163  *           ubrk_close(boundary);
   164  *
   165  *           //print each word in order
   166  *           boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
   167  *           printf("----- forward: -----------\n");
   168  *           printEachForward(boundary, stringToExamine);
   169  *           printf("----- backward: ----------\n");
   170  *           printEachBackward(boundary, stringToExamine);
   171  *           //print first element
   172  *           printf("----- first: -------------\n");
   173  *           printFirst(boundary, stringToExamine);
   174  *           //print last element
   175  *           printf("----- last: --------------\n");
   176  *           printLast(boundary, stringToExamine);
   177  *           //print word at charpos 10
   178  *           printf("----- at pos 10: ---------\n");
   179  *           printAt(boundary, 10 , stringToExamine);
   180  *
   181  *           ubrk_close(boundary);
   182  *       }
   183  * \endcode
   184  * </pre>
   185  */
   186 
   187 /** The possible types of text boundaries, as passed to ubrk_open().  @stable ICU 2.0 */
   188 typedef enum UBreakIteratorType {
   189   /** Character breaks  @stable ICU 2.0 */
   190   UBRK_CHARACTER,
   191   /** Word breaks @stable ICU 2.0 */
   192   UBRK_WORD,
   193   /** Line breaks @stable ICU 2.0 */
   194   UBRK_LINE,
   195   /** Sentence breaks @stable ICU 2.0 */
   196   UBRK_SENTENCE,
   197   /* NOTE(review): when U_HIDE_DEPRECATED_API is defined, the comma after UBRK_SENTENCE becomes a trailing comma, which C89 does not allow. */
   198 #ifndef U_HIDE_DEPRECATED_API
   199   /**
   200    * Title Case breaks.
   201    * The iterator created using this type locates title boundaries as described for
   202    * Unicode 3.2 only. For title boundary iteration with Unicode 4.0 and above,
   203    * please use the word boundary iterator.
   204    *
   205    * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
   206    */
   207   UBRK_TITLE
   208 #endif /* U_HIDE_DEPRECATED_API */
   209 
   210 } UBreakIteratorType;
   211 
   212 /** Value returned by ubrk_next, ubrk_previous, ubrk_preceding and ubrk_following once all text boundaries have been returned.
   213  *  @stable ICU 2.0
   214  */
   215 #define UBRK_DONE ((int32_t) -1)
   216 
   217 
   218 /**
   219  *  Enum constants for the word break tags returned by ubrk_getRuleStatus().
   220  *  A range of values is defined for each category of word, to allow for further
   221  *  subdivisions of a category in future releases.  Applications should check for tag
   222  *  values falling within the range, rather than for single individual values.
   223  *  Upper limits are exclusive: e.g. UBRK_WORD_NONE_LIMIT has the same value as UBRK_WORD_NUMBER.
   224  *  @stable ICU 2.2
   225 */
   226 typedef enum UWordBreak {
   227     /** Tag value for "words" that do not fit into any of the other categories.
   228      *  Includes spaces and most punctuation. */
   229     UBRK_WORD_NONE           = 0,
   230     /** Upper bound for tags for uncategorized words. */
   231     UBRK_WORD_NONE_LIMIT     = 100,
   232     /** Tag value for words that appear to be numbers, lower limit.    */
   233     UBRK_WORD_NUMBER         = 100,
   234     /** Tag value for words that appear to be numbers, upper limit.    */
   235     UBRK_WORD_NUMBER_LIMIT   = 200,
   236     /** Tag value for words that contain letters, excluding
   237      *  hiragana, katakana or ideographic characters, lower limit.    */
   238     UBRK_WORD_LETTER         = 200,
   239     /** Tag value for words containing letters, upper limit  */
   240     UBRK_WORD_LETTER_LIMIT   = 300,
   241     /** Tag value for words containing kana characters, lower limit */
   242     UBRK_WORD_KANA           = 300,
   243     /** Tag value for words containing kana characters, upper limit */
   244     UBRK_WORD_KANA_LIMIT     = 400,
   245     /** Tag value for words containing ideographic characters, lower limit */
   246     UBRK_WORD_IDEO           = 400,
   247     /** Tag value for words containing ideographic characters, upper limit */
   248     UBRK_WORD_IDEO_LIMIT     = 500
   249 } UWordBreak;
   250 
   251 /**
   252  *  Enum constants for the line break tags returned by ubrk_getRuleStatus().
   253  *  A range of values is defined for each category of line break, to allow for
   254  *  further subdivisions of a category in future releases.  Applications should check for
   255  *  tag values falling within the range, rather than for single individual values.
   256  *  Upper limits are exclusive: UBRK_LINE_SOFT_LIMIT has the same value as UBRK_LINE_HARD.
   257  *  @stable ICU 2.8
   258 */
   259 typedef enum ULineBreakTag {
   260     /** Tag value for soft line breaks, positions at which a line break
   261       *  is acceptable but not required                */
   262     UBRK_LINE_SOFT            = 0,
   263     /** Upper bound for soft line breaks.              */
   264     UBRK_LINE_SOFT_LIMIT      = 100,
   265     /** Tag value for a hard, or mandatory line break  */
   266     UBRK_LINE_HARD            = 100,
   267     /** Upper bound for hard line breaks.              */
   268     UBRK_LINE_HARD_LIMIT      = 200
   269 } ULineBreakTag;
   270 
   271 
   272 
   273 /**
   274  *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
   275  *  A range of values is defined for each category of sentence, to allow for
   276  *  further subdivisions of a category in future releases.  Applications should check for
   277  *  tag values falling within the range, rather than for single individual values.
   278  *  Upper limits are exclusive: UBRK_SENTENCE_TERM_LIMIT has the same value as UBRK_SENTENCE_SEP.
   279  *  @stable ICU 2.8
   280 */
   281 typedef enum USentenceBreakTag {
   282     /** Tag value for sentences ending with a sentence terminator
   283       * ('.', '?', '!', etc.) character, possibly followed by a
   284       * hard separator (CR, LF, PS, etc.)
   285       */
   286     UBRK_SENTENCE_TERM       = 0,
   287     /** Upper bound for tags for sentences ended by sentence terminators.    */
   288     UBRK_SENTENCE_TERM_LIMIT = 100,
   289     /** Tag value for sentences that do not contain an ending
   290       * sentence terminator ('.', '?', '!', etc.) character, but
   291       * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
   292       */
   293     UBRK_SENTENCE_SEP        = 100,
   294     /** Upper bound for tags for sentences ended by a separator.              */
   295     UBRK_SENTENCE_SEP_LIMIT  = 200
   296 
   297 } USentenceBreakTag;
   298 
   299 
   300 /**
   301  * Open a new UBreakIterator for locating text boundaries for a specified locale.
   302  * A UBreakIterator may be used for detecting character, line, word, and sentence breaks in text.
   303  * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
   304  *             UBRK_LINE, UBRK_SENTENCE.
   305  * @param locale The locale specifying the text-breaking conventions.
   306  * @param text The text to be iterated over.
   307  * @param textLength The number of characters in text, or -1 if null-terminated.
   308  * @param status A UErrorCode to receive any errors.
   309  * @return A UBreakIterator for the specified locale.
   310  * @see ubrk_openRules
   311  * @see ubrk_close
   312  * @stable ICU 2.0
   313  */
   314 U_STABLE UBreakIterator* U_EXPORT2
   315 ubrk_open(UBreakIteratorType type,
   316       const char *locale,
   317       const UChar *text,
   318       int32_t textLength,
   319       UErrorCode *status);
   320 
   321 /**
   322  * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
   323  * The rule syntax is not described in this header (TBD).
   324  * @param rules A set of rules specifying the text breaking conventions.
   325  * @param rulesLength The number of characters in rules, or -1 if null-terminated.
   326  * @param text The text to be iterated over.  May be NULL, in which case the text to be
   327  *        iterated is specified with ubrk_setText().
   328  * @param textLength The number of characters in text, or -1 if null-terminated.
   329  * @param parseErr   Receives position and context information for any syntax errors
   330  *                   detected while parsing the rules.
   331  * @param status A UErrorCode to receive any errors.
   332  * @return A UBreakIterator for the specified rules.
   333  * @see ubrk_open
   334  * @stable ICU 2.2
   335  */
   336 U_STABLE UBreakIterator* U_EXPORT2
   337 ubrk_openRules(const UChar     *rules,
   338                int32_t         rulesLength,
   339                const UChar     *text,
   340                int32_t          textLength,
   341                UParseError     *parseErr,
   342                UErrorCode      *status);
   343 
   344 /**
   345  * Thread-safe cloning operation.
   346  * @param bi The iterator to be cloned.
   347  * @param stackBuffer User-allocated space for the new clone. If NULL, new memory will be allocated.
   348  *  If the buffer is not large enough, new memory will be allocated.
   349  *  Clients can use U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.
   350  * @param pBufferSize Pointer to the size of the allocated space.
   351  *  If *pBufferSize == 0, a sufficient size for use in cloning will
   352  *  be returned ('pre-flighting').
   353  *  If *pBufferSize is not enough for a stack-based safe clone,
   354  *  new memory will be allocated.
   355  * @param status Indicates whether the operation went smoothly or there were errors.
   356  *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
   357  * @return Pointer to the new clone.
   358  * @stable ICU 2.0
   359  */
   360 U_STABLE UBreakIterator * U_EXPORT2
   361 ubrk_safeClone(
   362           const UBreakIterator *bi,
   363           void *stackBuffer,
   364           int32_t *pBufferSize,
   365           UErrorCode *status);
   366 
   367 /**
   368   * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
   369   * @stable ICU 2.0
   370   */
   371 #define U_BRK_SAFECLONE_BUFFERSIZE 512
   372 
   373 /**
   374  * Close a UBreakIterator.
   375  * Once closed, a UBreakIterator may no longer be used.
   376  * @param bi The break iterator to close.
   377  * @stable ICU 2.0
   378  */
   379 U_STABLE void U_EXPORT2
   380 ubrk_close(UBreakIterator *bi);
   381 
   382 /**
   383  * Sets an existing iterator to point to a new piece of text, supplied as a UChar string.
   384  * @param bi The iterator to use.
   385  * @param text The text to be set.
   386  * @param textLength The length of the text.
   387  * @param status A UErrorCode to receive any errors.
   388  * @stable ICU 2.0
   389  */
   390 U_STABLE void U_EXPORT2
   391 ubrk_setText(UBreakIterator* bi,
   392              const UChar*    text,
   393              int32_t         textLength,
   394              UErrorCode*     status);
   395 
   396 
   397 /**
   398  * Sets an existing iterator to point to a new piece of text, supplied as a UText.
   399  * @param bi The iterator to use.
   400  * @param text The UText to be set.
   401  * @param status A UErrorCode to receive any errors.
   402  * @draft ICU 3.4
   403  */
   404 U_DRAFT void U_EXPORT2
   405 ubrk_setUText(UBreakIterator* bi,
   406              UText*          text,
   407              UErrorCode*     status);
   408 
   409 
   410 
   411 /**
   412  * Determine the most recently returned text boundary.
   413  *
   414  * @param bi The break iterator to use.
   415  * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
   416  * \ref ubrk_first, or \ref ubrk_last.
   417  * @stable ICU 2.0
   418  */
   419 U_STABLE int32_t U_EXPORT2
   420 ubrk_current(const UBreakIterator *bi);
   421 
   422 /**
   423  * Determine the text boundary following the current text boundary.
   424  *
   425  * @param bi The break iterator to use.
   426  * @return The character index of the next text boundary, or UBRK_DONE
   427  * if all text boundaries have been returned.
   428  * @see ubrk_previous, ubrk_following
   429  * @stable ICU 2.0
   430  */
   431 U_STABLE int32_t U_EXPORT2
   432 ubrk_next(UBreakIterator *bi);
   433 
   434 /**
   435  * Determine the text boundary preceding the current text boundary.
   436  *
   437  * @param bi The break iterator to use.
   438  * @return The character index of the preceding text boundary, or UBRK_DONE
   439  * if all text boundaries have been returned.
   440  * @see ubrk_next, ubrk_preceding
   441  * @stable ICU 2.0
   442  */
   443 U_STABLE int32_t U_EXPORT2
   444 ubrk_previous(UBreakIterator *bi);
   445 
   446 /**
   447  * Determine the index of the first character in the text being scanned.
   448  * This is not always the same as index 0 of the text.
   449  * @param bi The break iterator to use.
   450  * @return The character index of the first character in the text being scanned.
   451  * @see ubrk_last
   452  * @stable ICU 2.0
   453  */
   454 U_STABLE int32_t U_EXPORT2
   455 ubrk_first(UBreakIterator *bi);
   456 
   457 /**
   458  * Determine the index immediately <EM>beyond</EM> the last character in the text being
   459  * scanned.
   460  * This is not the same as the index of the last character.
   461  * @param bi The break iterator to use.
   462  * @return The character offset immediately <EM>beyond</EM> the last character in the
   463  * text being scanned.
   464  * @see ubrk_first
   465  * @stable ICU 2.0
   466  */
   467 U_STABLE int32_t U_EXPORT2
   468 ubrk_last(UBreakIterator *bi);
   469 
   470 /**
   471  * Determine the text boundary preceding the specified offset.
   472  * The value returned is always smaller than offset, or is UBRK_DONE.
   473  * @param bi The break iterator to use.
   474  * @param offset The offset to begin scanning.
   475  * @return The text boundary preceding offset, or UBRK_DONE.
   476  * @see ubrk_following
   477  * @stable ICU 2.0
   478  */
   479 U_STABLE int32_t U_EXPORT2
   480 ubrk_preceding(UBreakIterator *bi,
   481            int32_t offset);
   482 
   483 /**
   484  * Determine the text boundary following the specified offset.
   485  * The value returned is always greater than offset, or is UBRK_DONE.
   486  * @param bi The break iterator to use.
   487  * @param offset The offset to begin scanning.
   488  * @return The text boundary following offset, or UBRK_DONE.
   489  * @see ubrk_preceding
   490  * @stable ICU 2.0
   491  */
   492 U_STABLE int32_t U_EXPORT2
   493 ubrk_following(UBreakIterator *bi,
   494            int32_t offset);
   495 
   496 /**
   497  * Get a locale for which text breaking information is available.
   498  * A UBreakIterator in a locale returned by this function will perform the correct
   499  * text breaking for the locale.
   500  * @param index The index of the desired locale.
   501  * @return A locale for which text breaking information is available, or 0 if none.
   502  * @see ubrk_countAvailable
   503  * @stable ICU 2.0
   504  */
   505 U_STABLE const char* U_EXPORT2
   506 ubrk_getAvailable(int32_t index);
   507 
   508 /**
   509  * Determine how many locales have text breaking information available.
   510  * This function is most useful for determining the loop ending condition for
   511  * calls to \ref ubrk_getAvailable.
   512  * @return The number of locales for which text breaking information is available.
   513  * @see ubrk_getAvailable
   514  * @stable ICU 2.0
   515  */
   516 U_STABLE int32_t U_EXPORT2
   517 ubrk_countAvailable(void);
   518 
   519 
   520 /**
   521  * Returns true if the specified position is a boundary position.  As a side
   522  * effect, leaves the iterator pointing to the first boundary position at
   523  * or after "offset".
   524  * @param bi The break iterator to use.
   525  * @param offset The offset to check.
   526  * @return True if "offset" is a boundary position.
   527  * @stable ICU 2.0
   528  */
   529 U_STABLE  UBool U_EXPORT2
   530 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
   531 
   532 /**
   533  * Return the status from the break rule that determined the most recently returned
   534  * break position.  The values appear in the rule source within brackets, {123}, for
   535  * example.  For rules that do not specify a status, a default value of 0 is returned.
   536  * <p>For word break iterators, the possible values are defined in enum UWordBreak.
   537  * @param bi The break iterator to use.
   538  * @return The status value from the break rule that determined the most recent break position.
   539  * @stable ICU 2.2
   540  */
   541 U_STABLE  int32_t U_EXPORT2
   542 ubrk_getRuleStatus(UBreakIterator *bi);
   543 
   544 /**
   545  * Get the statuses from the break rules that determined the most recently
   546  * returned break position.  The values appear in the rule source
   547  * within brackets, {123}, for example.  The default status value for rules
   548  * that do not explicitly provide one is zero.
   549  * <p>
   550  * For word break iterators, the possible values are defined in enum UWordBreak.
   551  * @param bi        The break iterator to use.
   552  * @param fillInVec An array to be filled in with the status values.
   553  * @param capacity  The length of the supplied vector.  A length of zero causes
   554  *                  the function to return the number of status values, in the
   555  *                  normal way, without attempting to store any values.
   556  * @param status    Receives error codes.
   557  * @return          The number of rule status values from rules that determined
   558  *                  the most recent boundary returned by the break iterator.
   559  * @draft ICU 3.0
   560  */
   561 U_DRAFT  int32_t U_EXPORT2
   562 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
   563 
   564 /**
   565  * Return the locale of the break iterator. You can choose between the valid and
   566  * the actual locale.
   567  * @param bi The break iterator to query.
   568  * @param type The locale type to return (valid or actual).
   569  * @param status A UErrorCode to receive any errors.
   570  * @return The locale string.
   571  * @draft ICU 2.8 likely to change after ICU 3.0, based on feedback
   572  */
   573 U_DRAFT const char* U_EXPORT2
   574 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
   575 
   576 
   577 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   578 
   579 #endif