Symaptic: os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/normlzr.h@260cb5ec6c19

     1 /*

     2  ********************************************************************

     3  * COPYRIGHT:

     4  * Copyright (c) 1996-2005, International Business Machines Corporation and

     5  * others. All Rights Reserved.

     6  ********************************************************************

     7  */

     9 #ifndef NORMLZR_H

    10 #define NORMLZR_H

    12 #include "unicode/utypes.h"

    14 /**

    15  * \file

    16  * \brief C++ API: Unicode Normalization

    17  */

    19 #if !UCONFIG_NO_NORMALIZATION

    21 #include "unicode/uobject.h"

    22 #include "unicode/unistr.h"

    23 #include "unicode/chariter.h"

    24 #include "unicode/unorm.h"

    27 struct UCharIterator; // forward declaration; the Normalizer class declared in this header holds it only by pointer (member "text")

    28 typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */

    30 U_NAMESPACE_BEGIN

    31 /**

    32  *

    33  * The Normalizer class consists of two parts:

    34  * - static functions that normalize strings or test if strings are normalized

    35  * - a Normalizer object is an iterator that takes any kind of text and

    36  *   provides iteration over its normalized form

    37  *

    38  * The Normalizer class is not suitable for subclassing.

    39  *

    40  * The static functions are basically wrappers around the C implementation,

    41  * using UnicodeString instead of UChar*.

    42  * For basic information about normalization forms and details about the C API

    43  * please see the documentation in unorm.h.

    44  *

    45  * The iterator API with the Normalizer constructors and the non-static functions

    46  * uses a CharacterIterator as input. It is possible to pass a string which

    47  * is then internally wrapped in a CharacterIterator.

    48  * The input text is not normalized all at once, but incrementally where needed

    49  * (providing efficient random access).

    50  * This makes it possible to pass in a large text but spend only a small amount of time

    51  * normalizing a small part of that text.

    52  * However, if the entire text is normalized, then the iterator will be

    53  * slower than normalizing the entire text at once and iterating over the result.

    54  * A possible use of the Normalizer iterator is also to report an index into the

    55  * original text that is close to where the normalized characters come from.

    56  *

    57  * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.

    58  * The earlier implementation reported the getIndex() inconsistently,

    59  * and previous() could not be used after setIndex(), next(), first(), and current().

    60  *

    61  * Normalizer allows normalization to start from anywhere in the input text by

    62  * calling setIndexOnly(), first(), or last().

    63  * Without calling any of these, the iterator will start at the beginning of the text.

    64  *

    65  * At any time, next() returns the next normalized code point (UChar32),

    66  * with post-increment semantics (like CharacterIterator::next32PostInc()).

    67  * previous() returns the previous normalized code point (UChar32),

    68  * with pre-decrement semantics (like CharacterIterator::previous32()).

    69  *

    70  * current() returns the current code point

    71  * (respectively the one at the newly set index) without moving

    72  * the getIndex(). Note that if the text at the current position

    73  * needs to be normalized, then these functions will do that.

    74  * (This is why current() is not const.)

    75  * It is more efficient to call setIndexOnly() instead, which does not

    76  * normalize.

    77  *

    78  * getIndex() always refers to the position in the input text where the normalized

    79  * code points are returned from. It does not always change with each returned

    80  * code point.

    81  * The code point that is returned from any of the functions

    82  * corresponds to text at or after getIndex(), according to the

    83  * function's iteration semantics (post-increment or pre-decrement).

    84  *

    85  * next() returns a code point from at or after the getIndex()

    86  * from before the next() call. After the next() call, the getIndex()

    87  * might have moved to where the next code point will be returned from

    88  * (from a next() or current() call).

    89  * This is semantically equivalent to array access with array[index++]

    90  * (post-increment semantics).

    91  *

    92  * previous() returns a code point from at or after the getIndex()

    93  * from after the previous() call.

    94  * This is semantically equivalent to array access with array[--index]

    95  * (pre-decrement semantics).

    96  *

    97  * Internally, the Normalizer iterator normalizes a small piece of text

    98  * starting at the getIndex() and ending at a following "safe" index.

    99  * The normalized result is stored in an internal string buffer, and

   100  * the code points are iterated from there.

   101  * With multiple iteration calls, this is repeated until the next piece

   102  * of text needs to be normalized, and the getIndex() needs to be moved.

   103  *

   104  * The following "safe" index, the internal buffer, and the secondary

   105  * iteration index into that buffer are not exposed on the API.

   106  * This also means that it is currently not practical to return to

   107  * a particular, arbitrary position in the text because one would need to

   108  * know, and be able to set, in addition to the getIndex(), at least also the

   109  * current index into the internal buffer.

   110  * It is currently only possible to observe when getIndex() changes

   111  * (with careful consideration of the iteration semantics),

   112  * at which time the internal index will be 0.

   113  * For example, if getIndex() is different after next() than before it,

   114  * then the internal index is 0 and one can return to this getIndex()

   115  * later with setIndexOnly().

   116  *

   117  * @author Laura Werner, Mark Davis, Markus Scherer

   118  * @stable ICU 2.0

   119  */

   120 class U_COMMON_API Normalizer : public UObject {

   121 public:

   122   /**

   123    * If DONE is returned from an iteration function that returns a code point,

   124    * then there are no more normalization results available.

   125    * @stable ICU 2.0

   126    */

   127   enum {

   128       DONE=0xffff

   129   };

   131   // Constructors

   133   /**

   134    * Creates a new <code>Normalizer</code> object for iterating over the

   135    * normalized form of a given string.

   136    * <p>

   137    * @param str   The string to be normalized.  The normalization

   138    *              will start at the beginning of the string.

   139    *

   140    * @param mode  The normalization mode.

   141    * @stable ICU 2.0

   142    */

   143   Normalizer(const UnicodeString& str, UNormalizationMode mode);

   145   /**

   146    * Creates a new <code>Normalizer</code> object for iterating over the

   147    * normalized form of a given string.

   148    * <p>

   149    * @param str   The string to be normalized.  The normalization

   150    *              will start at the beginning of the string.

   151    *

   152    * @param length Length of the string, or -1 if NUL-terminated.

   153    * @param mode  The normalization mode.

   154    * @stable ICU 2.0

   155    */

   156   Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);

   158   /**

   159    * Creates a new <code>Normalizer</code> object for iterating over the

   160    * normalized form of the given text.

   161    * <p>

   162    * @param iter  The input text to be normalized.  The normalization

   163    *              will start at the beginning of the string.

   164    *

   165    * @param mode  The normalization mode.

   166    * @stable ICU 2.0

   167    */

   168   Normalizer(const CharacterIterator& iter, UNormalizationMode mode);

   170   /**

   171    * Copy constructor.

   172    * @param copy The object to be copied.

   173    * @stable ICU 2.0

   174    */

   175   Normalizer(const Normalizer& copy);

   177   /**

   178    * Destructor

   179    * @stable ICU 2.0

   180    */

   181   virtual ~Normalizer();

   184   //-------------------------------------------------------------------------

   185   // Static utility methods

   186   //-------------------------------------------------------------------------

   188   /**

   189    * Normalizes a <code>UnicodeString</code> according to the specified normalization mode.

   190    * This is a wrapper for unorm_normalize(), using UnicodeString's.

   191    *

   192    * The <code>options</code> parameter specifies which optional

   193    * <code>Normalizer</code> features are to be enabled for this operation.

   194    *

   195    * @param source    the input string to be normalized.

   196    * @param mode      the normalization mode

   197    * @param options   the optional features to be enabled (0 for no options)

   198    * @param result    The normalized string (on output).

   199    * @param status    The error code.

   200    * @stable ICU 2.0

   201    */

   202   static void U_EXPORT2 normalize(const UnicodeString& source,

   203                         UNormalizationMode mode, int32_t options,

   204                         UnicodeString& result,

   205                         UErrorCode &status);

   207   /**

   208    * Compose a <code>UnicodeString</code>.

   209    * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC.

   210    * This is a wrapper for unorm_normalize(), using UnicodeString's.

   211    *

   212    * The <code>options</code> parameter specifies which optional

   213    * <code>Normalizer</code> features are to be enabled for this operation.

   214    *

   215    * @param source    the string to be composed.

   216    * @param compat    Perform compatibility decomposition before composition.

   217    *                  If this argument is <code>FALSE</code>, only canonical

   218    *                  decomposition will be performed.

   219    * @param options   the optional features to be enabled (0 for no options)

   220    * @param result    The composed string (on output).

   221    * @param status    The error code.

   222    * @stable ICU 2.0

   223    */

   224   static void U_EXPORT2 compose(const UnicodeString& source,

   225                       UBool compat, int32_t options,

   226                       UnicodeString& result,

   227                       UErrorCode &status);

   229   /**

   230    * Static method to decompose a <code>UnicodeString</code>.

   231    * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD.

   232    * This is a wrapper for unorm_normalize(), using UnicodeString's.

   233    *

   234    * The <code>options</code> parameter specifies which optional

   235    * <code>Normalizer</code> features are to be enabled for this operation.

   236    *

   237    * @param source    the string to be decomposed.

   238    * @param compat    Perform compatibility decomposition.

   239    *                  If this argument is <code>FALSE</code>, only canonical

   240    *                  decomposition will be performed.

   241    * @param options   the optional features to be enabled (0 for no options)

   242    * @param result    The decomposed string (on output).

   243    * @param status    The error code.

   244    * @stable ICU 2.0

   245    */

   246   static void U_EXPORT2 decompose(const UnicodeString& source,

   247                         UBool compat, int32_t options,

   248                         UnicodeString& result,

   249                         UErrorCode &status);

   251   /**

   252    * Performing quick check on a string, to quickly determine if the string is

   253    * in a particular normalization format.

   254    * This is a wrapper for unorm_quickCheck(), using a UnicodeString.

   255    *

   256    * Three types of result can be returned UNORM_YES, UNORM_NO or

   257    * UNORM_MAYBE. Result UNORM_YES indicates that the argument

   258    * string is in the desired normalized format, UNORM_NO determines that

   259    * argument string is not in the desired normalized format. A

   260    * UNORM_MAYBE result indicates that a more thorough check is required,

   261    * the user may have to put the string in its normalized form and compare the

   262    * results.

   263    * @param source       string for determining if it is in a normalized format

   264    * @param mode         normalization format

   265    * @param status A reference to a UErrorCode to receive any errors

   266    * @return UNORM_YES, UNORM_NO or UNORM_MAYBE

   267    *

   268    * @see isNormalized

   269    * @stable ICU 2.0

   270    */

   271   static inline UNormalizationCheckResult

   272   quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);

   274   /**

   275    * Performing quick check on a string; same as the other version of quickCheck

   276    * but takes an extra options parameter like most normalization functions.

   277    *

   278    * @param source       string for determining if it is in a normalized format

   279    * @param mode         normalization format

   280    * @param options      the optional features to be enabled (0 for no options)

   281    * @param status A reference to a UErrorCode to receive any errors

   282    * @return UNORM_YES, UNORM_NO or UNORM_MAYBE

   283    *

   284    * @see isNormalized

   285    * @stable ICU 2.6

   286    */

   287   static inline UNormalizationCheckResult

   288   quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);

   290   /**

   291    * Test if a string is in a given normalization form.

   292    * This is semantically equivalent to source.equals(normalize(source, mode)) .

   293    *

   294    * Unlike unorm_quickCheck(), this function returns a definitive result,

   295    * never a "maybe".

   296    * For NFD, NFKD, and FCD, both functions work exactly the same.

   297    * For NFC and NFKC where quickCheck may return "maybe", this function will

   298    * perform further tests to arrive at a TRUE/FALSE result.

   299    *

   300    * @param src        String that is to be tested if it is in a normalization format.

   301    * @param mode       Which normalization form to test for.

   302    * @param errorCode  ICU error code in/out parameter.

   303    *                   Must fulfill U_SUCCESS before the function call.

   304    * @return Boolean value indicating whether the source string is in the

   305    *         "mode" normalization form.

   306    *

   307    * @see quickCheck

   308    * @stable ICU 2.2

   309    */

   310   static inline UBool

   311   isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);

   313   /**

   314    * Test if a string is in a given normalization form; same as the other version of isNormalized

   315    * but takes an extra options parameter like most normalization functions.

   316    *

   317    * @param src        String that is to be tested if it is in a normalization format.

   318    * @param mode       Which normalization form to test for.

   319    * @param options      the optional features to be enabled (0 for no options)

   320    * @param errorCode  ICU error code in/out parameter.

   321    *                   Must fulfill U_SUCCESS before the function call.

   322    * @return Boolean value indicating whether the source string is in the

   323    *         "mode" normalization form.

   324    *

   325    * @see quickCheck

   326    * @stable ICU 2.6

   327    */

   328   static inline UBool

   329   isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);

   331   /**

   332    * Concatenate normalized strings, making sure that the result is normalized as well.

   333    *

   334    * If both the left and the right strings are in

   335    * the normalization form according to "mode/options",

   336    * then the result will be

   337    *

   338    * \code

   339    *     dest=normalize(left+right, mode, options)

   340    * \endcode

   341    *

   342    * For details see unorm_concatenate in unorm.h.

   343    *

   344    * @param left Left source string.

   345    * @param right Right source string.

   346    * @param result The output string.

   347    * @param mode The normalization mode.

   348    * @param options A bit set of normalization options.

   349    * @param errorCode ICU error code in/out parameter.

   350    *                   Must fulfill U_SUCCESS before the function call.

   351    * @return result

   352    *

   353    * @see unorm_concatenate

   354    * @see normalize

   355    * @see unorm_next

   356    * @see unorm_previous

   357    *

   358    * @stable ICU 2.1

   359    */

   360   static UnicodeString &

   361   U_EXPORT2 concatenate(UnicodeString &left, UnicodeString &right,

   362               UnicodeString &result,

   363               UNormalizationMode mode, int32_t options,

   364               UErrorCode &errorCode);

   366   /**

   367    * Compare two strings for canonical equivalence.

   368    * Further options include case-insensitive comparison and

   369    * code point order (as opposed to code unit order).

   370    *

   371    * Canonical equivalence between two strings is defined as their normalized

   372    * forms (NFD or NFC) being identical.

   373    * This function compares strings incrementally instead of normalizing

   374    * (and optionally case-folding) both strings entirely,

   375    * improving performance significantly.

   376    *

   377    * Bulk normalization is only necessary if the strings do not fulfill the FCD

   378    * conditions. Only in this case, and only if the strings are relatively long,

   379    * is memory allocated temporarily.

   380    * For FCD strings and short non-FCD strings there is no memory allocation.

   381    *

   382    * Semantically, this is equivalent to

   383    *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))

   384    * where code point order and foldCase are all optional.

   385    *

   386    * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match

   387    * the case folding must be performed first, then the normalization.

   388    *

   389    * @param s1 First source string.

   390    * @param s2 Second source string.

   391    *

   392    * @param options A bit set of options:

   393    *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:

   394    *     Case-sensitive comparison in code unit order, and the input strings

   395    *     are quick-checked for FCD.

   396    *

   397    *   - UNORM_INPUT_IS_FCD

   398    *     Set if the caller knows that both s1 and s2 fulfill the FCD conditions.

   399    *     If not set, the function will quickCheck for FCD

   400    *     and normalize if necessary.

   401    *

   402    *   - U_COMPARE_CODE_POINT_ORDER

   403    *     Set to choose code point order instead of code unit order

   404    *     (see u_strCompare for details).

   405    *

   406    *   - U_COMPARE_IGNORE_CASE

   407    *     Set to compare strings case-insensitively using case folding,

   408    *     instead of case-sensitively.

   409    *     If set, then the following case folding options are used.

   410    *

   411    *   - Options as used with case-insensitive comparisons, currently:

   412    *

   413    *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I

   414    *    (see u_strCaseCompare for details)

   415    *

   416    *   - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT

   417    *

   418    * @param errorCode ICU error code in/out parameter.

   419    *                  Must fulfill U_SUCCESS before the function call.

   420    * @return <0 or 0 or >0 as usual for string comparisons

   421    *

   422    * @see unorm_compare

   423    * @see normalize

   424    * @see UNORM_FCD

   425    * @see u_strCompare

   426    * @see u_strCaseCompare

   427    *

   428    * @stable ICU 2.2

   429    */

   430   static inline int32_t

   431   compare(const UnicodeString &s1, const UnicodeString &s2,

   432           uint32_t options,

   433           UErrorCode &errorCode);

   435   //-------------------------------------------------------------------------

   436   // Iteration API

   437   //-------------------------------------------------------------------------

   439   /**

   440    * Return the current character in the normalized text.

   441    * current() may need to normalize some text at getIndex().

   442    * The getIndex() is not changed.

   443    *

   444    * @return the current normalized code point

   445    * @stable ICU 2.0

   446    */

   447   UChar32              current(void);

   449   /**

   450    * Return the first character in the normalized text.

   451    * This is equivalent to setIndexOnly(startIndex()) followed by next().

   452    * (Post-increment semantics.)

   453    *

   454    * @return the first normalized code point

   455    * @stable ICU 2.0

   456    */

   457   UChar32              first(void);

   459   /**

   460    * Return the last character in the normalized text.

   461    * This is equivalent to setIndexOnly(endIndex()) followed by previous().

   462    * (Pre-decrement semantics.)

   463    *

   464    * @return the last normalized code point

   465    * @stable ICU 2.0

   466    */

   467   UChar32              last(void);

   469   /**

   470    * Return the next character in the normalized text.

   471    * (Post-increment semantics.)

   472    * If the end of the text has already been reached, DONE is returned.

   473    * The DONE value could be confused with a U+FFFF non-character code point

   474    * in the text. If this is possible, you can test getIndex()<endIndex()

   475    * before calling next(), or (getIndex()<endIndex() || last()!=DONE)

   476    * after calling next(). (Calling last() will change the iterator state!)

   477    *

   478    * The C API unorm_next() is more efficient and does not have this ambiguity.

   479    *

   480    * @return the next normalized code point

   481    * @stable ICU 2.0

   482    */

   483   UChar32              next(void);

   485   /**

   486    * Return the previous character in the normalized text and decrement.

   487    * (Pre-decrement semantics.)

   488    * If the beginning of the text has already been reached, DONE is returned.

   489    * The DONE value could be confused with a U+FFFF non-character code point

   490    * in the text. If this is possible, you can test

   491    * (getIndex()>startIndex() || first()!=DONE). (Calling first() will change

   492    * the iterator state!)

   493    *

   494    * The C API unorm_previous() is more efficient and does not have this ambiguity.

   495    *

   496    * @return the previous normalized code point

   497    * @stable ICU 2.0

   498    */

   499   UChar32              previous(void);

   501   /**

   502    * Set the iteration position in the input text that is being normalized,

   503    * without any immediate normalization.

   504    * After setIndexOnly(), getIndex() will return the same index that is

   505    * specified here.

   506    *

   507    * @param index the desired index in the input text.

   508    * @stable ICU 2.0

   509    */

   510   void                 setIndexOnly(int32_t index);

   512   /**

   513    * Reset the index to the beginning of the text.

   514    * This is equivalent to setIndexOnly(startIndex)).

   515    * @stable ICU 2.0

   516    */

   517   void                reset(void);

   519   /**

   520    * Retrieve the current iteration position in the input text that is

   521    * being normalized.

   522    *

   523    * A following call to next() will return a normalized code point from

   524    * the input text at or after this index.

   525    *

   526    * After a call to previous(), getIndex() will point at or before the

   527    * position in the input text where the normalized code point

   528    * was returned from with previous().

   529    *

   530    * @return the current index in the input text

   531    * @stable ICU 2.0

   532    */

   533   int32_t            getIndex(void) const;

   535   /**

   536    * Retrieve the index of the start of the input text. This is the begin index

   537    * of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string

   538    * over which this <code>Normalizer</code> is iterating.

   539    *

   540    * @return the smallest index in the input text where the Normalizer operates

   541    * @stable ICU 2.0

   542    */

   543   int32_t            startIndex(void) const;

   545   /**

   546    * Retrieve the index of the end of the input text. This is the end index

   547    * of the <code>CharacterIterator</code> or the length of the string

   548    * over which this <code>Normalizer</code> is iterating.

   549    * This end index is exclusive, i.e., the Normalizer operates only on characters

   550    * before this index.

   551    *

   552    * @return the first index in the input text where the Normalizer does not operate

   553    * @stable ICU 2.0

   554    */

   555   int32_t            endIndex(void) const;

   557   /**

   558    * Returns TRUE when both iterators refer to the same character in the same

   559    * input text.

   560    *

   561    * @param that a Normalizer object to compare this one to

   562    * @return comparison result

   563    * @stable ICU 2.0

   564    */

   565   UBool        operator==(const Normalizer& that) const;

   567   /**

   568    * Returns FALSE when both iterators refer to the same character in the same

   569    * input text.

   570    *

   571    * @param that a Normalizer object to compare this one to

   572    * @return comparison result

   573    * @stable ICU 2.0

   574    */

   575   inline UBool        operator!=(const Normalizer& that) const;

   577   /**

   578    * Returns a pointer to a new Normalizer that is a clone of this one.

   579    * The caller is responsible for deleting the new clone.

   580    * @return a pointer to a new Normalizer

   581    * @stable ICU 2.0

   582    */

   583   Normalizer*        clone(void) const;

   585   /**

   586    * Generates a hash code for this iterator.

   587    *

   588    * @return the hash code

   589    * @stable ICU 2.0

   590    */

   591   int32_t                hashCode(void) const;

   593   //-------------------------------------------------------------------------

   594   // Property access methods

   595   //-------------------------------------------------------------------------

   597   /**

   598    * Set the normalization mode for this object.

   599    * <p>

   600    * <b>Note:</b>If the normalization mode is changed while iterating

   601    * over a string, calls to {@link #next() } and {@link #previous() } may

   602    * return previously buffers characters in the old normalization mode

   603    * until the iteration is able to re-sync at the next base character.

   604    * It is safest to call {@link #setIndexOnly }, {@link #reset() },

   605    * {@link #setText }, {@link #first() },

   606    * {@link #last() }, etc. after calling <code>setMode</code>.

   607    * <p>

   608    * @param newMode the new mode for this <code>Normalizer</code>.

   609    * @see #getUMode

   610    * @stable ICU 2.0

   611    */

   612   void setMode(UNormalizationMode newMode);

   614   /**

   615    * Return the normalization mode for this object.

   616    *

   617    * This is an unusual name because there used to be a getMode() that

   618    * returned a different type.

   619    *

   620    * @return the mode for this <code>Normalizer</code>

   621    * @see #setMode

   622    * @stable ICU 2.0

   623    */

   624   UNormalizationMode getUMode(void) const;

   626   /**

   627    * Set options that affect this <code>Normalizer</code>'s operation.

   628    * Options do not change the basic composition or decomposition operation

   629    * that is being performed, but they control whether

   630    * certain optional portions of the operation are done.

   631    * Currently the only available option is obsolete.

   632    *

   633    * It is possible to specify multiple options that are all turned on or off.

   634    *

   635    * @param   option  the option(s) whose value is/are to be set.

   636    * @param   value   the new setting for the option.  Use <code>TRUE</code> to

   637    *                  turn the option(s) on and <code>FALSE</code> to turn it/them off.

   638    *

   639    * @see #getOption

   640    * @stable ICU 2.0

   641    */

   642   void setOption(int32_t option,

   643          UBool value);

   645   /**

   646    * Determine whether an option is turned on or off.

   647    * If multiple options are specified, then the result is TRUE if any

   648    * of them are set.

   649    * <p>

   650    * @param option the option(s) that are to be checked

   651    * @return TRUE if any of the option(s) are set

   652    * @see #setOption

   653    * @stable ICU 2.0

   654    */

   655   UBool getOption(int32_t option) const;

   657   /**

   658    * Set the input text over which this <code>Normalizer</code> will iterate.

   659    * The iteration position is set to the beginning.

   660    *

   661    * @param newText a string that replaces the current input text

   662    * @param status a UErrorCode

   663    * @stable ICU 2.0

   664    */

   665   void setText(const UnicodeString& newText,

   666            UErrorCode &status);

   668   /**

   669    * Set the input text over which this <code>Normalizer</code> will iterate.

   670    * The iteration position is set to the beginning.

   671    *

   672    * @param newText a CharacterIterator object that replaces the current input text

   673    * @param status a UErrorCode

   674    * @stable ICU 2.0

   675    */

   676   void setText(const CharacterIterator& newText,

   677            UErrorCode &status);

   679   /**

   680    * Set the input text over which this <code>Normalizer</code> will iterate.

   681    * The iteration position is set to the beginning.

   682    *

   683    * @param newText a string that replaces the current input text

   684    * @param length the length of the string, or -1 if NUL-terminated

   685    * @param status a UErrorCode

   686    * @stable ICU 2.0

   687    */

   688   void setText(const UChar* newText,

   689                     int32_t length,

   690             UErrorCode &status);

   691   /**

   692    * Copies the input text into the UnicodeString argument.

   693    *

   694    * @param result Receives a copy of the text under iteration.

   695    * @stable ICU 2.0

   696    */

   697   void            getText(UnicodeString&  result);

   699   /**

   700    * ICU "poor man's RTTI", returns a UClassID for this class.

   701    * @returns a UClassID for this class.

   702    * @stable ICU 2.2

   703    */

   704   static UClassID U_EXPORT2 getStaticClassID();

   706   /**

   707    * ICU "poor man's RTTI", returns a UClassID for the actual class. Declared virtual and const: the call is dispatched on the object's dynamic type and may be made on a const Normalizer.

   708    * @return a UClassID for the actual class.

   709    * @stable ICU 2.2

   710    */

   711   virtual UClassID getDynamicClassID() const;

   713 private:

   714   //-------------------------------------------------------------------------

   715   // Private functions (not callable from outside the class)

   716   //-------------------------------------------------------------------------

   718   Normalizer(); // default constructor: declared private and not implemented, so a Normalizer cannot be default-constructed

   719   Normalizer &operator=(const Normalizer &that); // assignment operator: declared private and not implemented, so Normalizer objects cannot be assigned

   721   // Private utility methods for iteration; each of the two below returns a UBool.

   722   // They are not documented in this header. For documentation, see the source code

   723   UBool nextNormalize();

   724   UBool previousNormalize();

   726   void    init(CharacterIterator *iter); // NOTE(review): whether init() takes ownership of iter is not visible from this header - confirm in the implementation

   727   void    clearBuffer(void); // NOTE(review): presumably resets the normalization buffer state declared below - confirm in the implementation

   729   //-------------------------------------------------------------------------

   730   // Private data

   731   //-------------------------------------------------------------------------

   733   UNormalizationMode  fUMode; // the normalization mode; the normalization forms are documented in unorm.h

   734   int32_t             fOptions; // NOTE(review): presumably the normalization option bits - confirm in the implementation

   736   // The input text and our position in it (held through a pointer to a UCharIterator)

   737   UCharIterator       *text;

   739   // The normalization buffer is the result of normalization

   740   // of the source in the half-open range [currentIndex..nextIndex[ , i.e. nextIndex itself is excluded.

   741   int32_t         currentIndex, nextIndex;

   743   // A buffer for holding intermediate results

   744   UnicodeString       buffer;

   745   int32_t         bufferPos; // NOTE(review): presumably the current position within buffer - confirm in the implementation

   747 };

   749 //-------------------------------------------------------------------------

   750 // Inline implementations

   751 //-------------------------------------------------------------------------

   753 inline UBool

   754 Normalizer::operator!= (const Normalizer& other) const

   755 { return ! operator==(other); }

   757 inline UNormalizationCheckResult

   758 Normalizer::quickCheck(const UnicodeString& source,

   759                        UNormalizationMode mode,

   760                        UErrorCode &status) {

   761     if(U_FAILURE(status)) {

   762         return UNORM_MAYBE;

   763     }

   765     return unorm_quickCheck(source.getBuffer(), source.length(),

   766                             mode, &status);

   767 }

   769 inline UNormalizationCheckResult

   770 Normalizer::quickCheck(const UnicodeString& source,

   771                        UNormalizationMode mode, int32_t options,

   772                        UErrorCode &status) {

   773     if(U_FAILURE(status)) {

   774         return UNORM_MAYBE;

   775     }

   777     return unorm_quickCheckWithOptions(source.getBuffer(), source.length(),

   778                                        mode, options, &status);

   779 }

   781 inline UBool

   782 Normalizer::isNormalized(const UnicodeString& source,

   783                          UNormalizationMode mode,

   784                          UErrorCode &status) {

   785     if(U_FAILURE(status)) {

   786         return FALSE;

   787     }

   789     return unorm_isNormalized(source.getBuffer(), source.length(),

   790                               mode, &status);

   791 }

   793 inline UBool

   794 Normalizer::isNormalized(const UnicodeString& source,

   795                          UNormalizationMode mode, int32_t options,

   796                          UErrorCode &status) {

   797     if(U_FAILURE(status)) {

   798         return FALSE;

   799     }

   801     return unorm_isNormalizedWithOptions(source.getBuffer(), source.length(),

   802                                          mode, options, &status);

   803 }

   805 inline int32_t

   806 Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,

   807                     uint32_t options,

   808                     UErrorCode &errorCode) {

   809   // all argument checking is done in unorm_compare

   810   return unorm_compare(s1.getBuffer(), s1.length(),

   811                        s2.getBuffer(), s2.length(),

   812                        options,

   813                        &errorCode);

   814 }

   816 U_NAMESPACE_END

   818 #endif /* #if !UCONFIG_NO_NORMALIZATION */

   820 #endif // NORMLZR_H

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--