sl@0: /* sl@0: ******************************************************************** sl@0: * COPYRIGHT: sl@0: * Copyright (c) 1996-2005, International Business Machines Corporation and sl@0: * others. All Rights Reserved. sl@0: ******************************************************************** sl@0: */ sl@0: sl@0: #ifndef NORMLZR_H sl@0: #define NORMLZR_H sl@0: sl@0: #include "unicode/utypes.h" sl@0: sl@0: /** sl@0: * \file sl@0: * \brief C++ API: Unicode Normalization sl@0: */ sl@0: sl@0: #if !UCONFIG_NO_NORMALIZATION sl@0: sl@0: #include "unicode/uobject.h" sl@0: #include "unicode/unistr.h" sl@0: #include "unicode/chariter.h" sl@0: #include "unicode/unorm.h" sl@0: sl@0: sl@0: struct UCharIterator; sl@0: typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */ sl@0: sl@0: U_NAMESPACE_BEGIN sl@0: /** sl@0: * sl@0: * The Normalizer class consists of two parts: sl@0: * - static functions that normalize strings or test if strings are normalized sl@0: * - a Normalizer object is an iterator that takes any kind of text and sl@0: * provides iteration over its normalized form sl@0: * sl@0: * The Normalizer class is not suitable for subclassing. sl@0: * sl@0: * The static functions are basically wrappers around the C implementation, sl@0: * using UnicodeString instead of UChar*. sl@0: * For basic information about normalization forms and details about the C API sl@0: * please see the documentation in unorm.h. sl@0: * sl@0: * The iterator API with the Normalizer constructors and the non-static functions sl@0: * uses a CharacterIterator as input. It is possible to pass a string which sl@0: * is then internally wrapped in a CharacterIterator. sl@0: * The input text is not normalized all at once, but incrementally where needed sl@0: * (providing efficient random access). sl@0: * This allows to pass in a large text but spend only a small amount of time sl@0: * normalizing a small part of that text. sl@0: * However, if the entire text is normalized, then the iterator will be sl@0: * slower than normalizing the entire text at once and iterating over the result. sl@0: * A possible use of the Normalizer iterator is also to report an index into the sl@0: * original text that is close to where the normalized characters come from. sl@0: * sl@0: * Important: The iterator API was cleaned up significantly for ICU 2.0. sl@0: * The earlier implementation reported the getIndex() inconsistently, sl@0: * and previous() could not be used after setIndex(), next(), first(), and current(). sl@0: * sl@0: * Normalizer allows to start normalizing from anywhere in the input text by sl@0: * calling setIndexOnly(), first(), or last(). sl@0: * Without calling any of these, the iterator will start at the beginning of the text. sl@0: * sl@0: * At any time, next() returns the next normalized code point (UChar32), sl@0: * with post-increment semantics (like CharacterIterator::next32PostInc()). sl@0: * previous() returns the previous normalized code point (UChar32), sl@0: * with pre-decrement semantics (like CharacterIterator::previous32()). sl@0: * sl@0: * current() returns the current code point sl@0: * (respectively the one at the newly set index) without moving sl@0: * the getIndex(). Note that if the text at the current position sl@0: * needs to be normalized, then these functions will do that. sl@0: * (This is why current() is not const.) sl@0: * It is more efficient to call setIndexOnly() instead, which does not sl@0: * normalize. sl@0: * sl@0: * getIndex() always refers to the position in the input text where the normalized sl@0: * code points are returned from. It does not always change with each returned sl@0: * code point. sl@0: * The code point that is returned from any of the functions sl@0: * corresponds to text at or after getIndex(), according to the sl@0: * function's iteration semantics (post-increment or pre-decrement). sl@0: * sl@0: * next() returns a code point from at or after the getIndex() sl@0: * from before the next() call. After the next() call, the getIndex() sl@0: * might have moved to where the next code point will be returned from sl@0: * (from a next() or current() call). sl@0: * This is semantically equivalent to array access with array[index++] sl@0: * (post-increment semantics). sl@0: * sl@0: * previous() returns a code point from at or after the getIndex() sl@0: * from after the previous() call. sl@0: * This is semantically equivalent to array access with array[--index] sl@0: * (pre-decrement semantics). sl@0: * sl@0: * Internally, the Normalizer iterator normalizes a small piece of text sl@0: * starting at the getIndex() and ending at a following "safe" index. sl@0: * The normalized results is stored in an internal string buffer, and sl@0: * the code points are iterated from there. sl@0: * With multiple iteration calls, this is repeated until the next piece sl@0: * of text needs to be normalized, and the getIndex() needs to be moved. sl@0: * sl@0: * The following "safe" index, the internal buffer, and the secondary sl@0: * iteration index into that buffer are not exposed on the API. sl@0: * This also means that it is currently not practical to return to sl@0: * a particular, arbitrary position in the text because one would need to sl@0: * know, and be able to set, in addition to the getIndex(), at least also the sl@0: * current index into the internal buffer. sl@0: * It is currently only possible to observe when getIndex() changes sl@0: * (with careful consideration of the iteration semantics), sl@0: * at which time the internal index will be 0. sl@0: * For example, if getIndex() is different after next() than before it, sl@0: * then the internal index is 0 and one can return to this getIndex() sl@0: * later with setIndexOnly(). sl@0: * sl@0: * @author Laura Werner, Mark Davis, Markus Scherer sl@0: * @stable ICU 2.0 sl@0: */ sl@0: class U_COMMON_API Normalizer : public UObject { sl@0: public: sl@0: /** sl@0: * If DONE is returned from an iteration function that returns a code point, sl@0: * then there are no more normalization results available. sl@0: * @stable ICU 2.0 sl@0: */ sl@0: enum { sl@0: DONE=0xffff sl@0: }; sl@0: sl@0: // Constructors sl@0: sl@0: /** sl@0: * Creates a new Normalizer object for iterating over the sl@0: * normalized form of a given string. sl@0: *

sl@0: * @param str The string to be normalized. The normalization sl@0: * will start at the beginning of the string. sl@0: * sl@0: * @param mode The normalization mode. sl@0: * @stable ICU 2.0 sl@0: */ sl@0: Normalizer(const UnicodeString& str, UNormalizationMode mode); sl@0: sl@0: /** sl@0: * Creates a new Normalizer object for iterating over the sl@0: * normalized form of a given string. sl@0: *

sl@0: * @param str The string to be normalized. The normalization sl@0: * will start at the beginning of the string. sl@0: * sl@0: * @param length Length of the string, or -1 if NUL-terminated. sl@0: * @param mode The normalization mode. sl@0: * @stable ICU 2.0 sl@0: */ sl@0: Normalizer(const UChar* str, int32_t length, UNormalizationMode mode); sl@0: sl@0: /** sl@0: * Creates a new Normalizer object for iterating over the sl@0: * normalized form of the given text. sl@0: *

sl@0: * @param iter The input text to be normalized. The normalization sl@0: * will start at the beginning of the string. sl@0: * sl@0: * @param mode The normalization mode. sl@0: * @stable ICU 2.0 sl@0: */ sl@0: Normalizer(const CharacterIterator& iter, UNormalizationMode mode); sl@0: sl@0: /** sl@0: * Copy constructor. sl@0: * @param copy The object to be copied. sl@0: * @stable ICU 2.0 sl@0: */ sl@0: Normalizer(const Normalizer& copy); sl@0: sl@0: /** sl@0: * Destructor sl@0: * @stable ICU 2.0 sl@0: */ sl@0: virtual ~Normalizer(); sl@0: sl@0: sl@0: //------------------------------------------------------------------------- sl@0: // Static utility methods sl@0: //------------------------------------------------------------------------- sl@0: sl@0: /** sl@0: * Normalizes a UnicodeString according to the specified normalization mode. sl@0: * This is a wrapper for unorm_normalize(), using UnicodeString's. sl@0: * sl@0: * The options parameter specifies which optional sl@0: * Normalizer features are to be enabled for this operation. sl@0: * sl@0: * @param source the input string to be normalized. sl@0: * @param mode the normalization mode sl@0: * @param options the optional features to be enabled (0 for no options) sl@0: * @param result The normalized string (on output). sl@0: * @param status The error code. sl@0: * @stable ICU 2.0 sl@0: */ sl@0: static void U_EXPORT2 normalize(const UnicodeString& source, sl@0: UNormalizationMode mode, int32_t options, sl@0: UnicodeString& result, sl@0: UErrorCode &status); sl@0: sl@0: /** sl@0: * Compose a UnicodeString. sl@0: * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC. sl@0: * This is a wrapper for unorm_normalize(), using UnicodeString's. sl@0: * sl@0: * The options parameter specifies which optional sl@0: * Normalizer features are to be enabled for this operation. sl@0: * sl@0: * @param source the string to be composed. sl@0: * @param compat Perform compatibility decomposition before composition. sl@0: * If this argument is FALSE, only canonical sl@0: * decomposition will be performed. sl@0: * @param options the optional features to be enabled (0 for no options) sl@0: * @param result The composed string (on output). sl@0: * @param status The error code. sl@0: * @stable ICU 2.0 sl@0: */ sl@0: static void U_EXPORT2 compose(const UnicodeString& source, sl@0: UBool compat, int32_t options, sl@0: UnicodeString& result, sl@0: UErrorCode &status); sl@0: sl@0: /** sl@0: * Static method to decompose a UnicodeString. sl@0: * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD. sl@0: * This is a wrapper for unorm_normalize(), using UnicodeString's. sl@0: * sl@0: * The options parameter specifies which optional sl@0: * Normalizer features are to be enabled for this operation. sl@0: * sl@0: * @param source the string to be decomposed. sl@0: * @param compat Perform compatibility decomposition. sl@0: * If this argument is FALSE, only canonical sl@0: * decomposition will be performed. sl@0: * @param options the optional features to be enabled (0 for no options) sl@0: * @param result The decomposed string (on output). sl@0: * @param status The error code. sl@0: * @stable ICU 2.0 sl@0: */ sl@0: static void U_EXPORT2 decompose(const UnicodeString& source, sl@0: UBool compat, int32_t options, sl@0: UnicodeString& result, sl@0: UErrorCode &status); sl@0: sl@0: /** sl@0: * Performing quick check on a string, to quickly determine if the string is sl@0: * in a particular normalization format. sl@0: * This is a wrapper for unorm_quickCheck(), using a UnicodeString. sl@0: * sl@0: * Three types of result can be returned UNORM_YES, UNORM_NO or sl@0: * UNORM_MAYBE. Result UNORM_YES indicates that the argument sl@0: * string is in the desired normalized format, UNORM_NO determines that sl@0: * argument string is not in the desired normalized format. A sl@0: * UNORM_MAYBE result indicates that a more thorough check is required, sl@0: * the user may have to put the string in its normalized form and compare the sl@0: * results. sl@0: * @param source string for determining if it is in a normalized format sl@0: * @param mode normalization format sl@0: * @param status A reference to a UErrorCode to receive any errors sl@0: * @return UNORM_YES, UNORM_NO or UNORM_MAYBE sl@0: * sl@0: * @see isNormalized sl@0: * @stable ICU 2.0 sl@0: */ sl@0: static inline UNormalizationCheckResult sl@0: quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status); sl@0: sl@0: /** sl@0: * Performing quick check on a string; same as the other version of quickCheck sl@0: * but takes an extra options parameter like most normalization functions. sl@0: * sl@0: * @param source string for determining if it is in a normalized format sl@0: * @param mode normalization format sl@0: * @param options the optional features to be enabled (0 for no options) sl@0: * @param status A reference to a UErrorCode to receive any errors sl@0: * @return UNORM_YES, UNORM_NO or UNORM_MAYBE sl@0: * sl@0: * @see isNormalized sl@0: * @stable ICU 2.6 sl@0: */ sl@0: static inline UNormalizationCheckResult sl@0: quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status); sl@0: sl@0: /** sl@0: * Test if a string is in a given normalization form. sl@0: * This is semantically equivalent to source.equals(normalize(source, mode)) . sl@0: * sl@0: * Unlike unorm_quickCheck(), this function returns a definitive result, sl@0: * never a "maybe". sl@0: * For NFD, NFKD, and FCD, both functions work exactly the same. sl@0: * For NFC and NFKC where quickCheck may return "maybe", this function will sl@0: * perform further tests to arrive at a TRUE/FALSE result. sl@0: * sl@0: * @param src String that is to be tested if it is in a normalization format. sl@0: * @param mode Which normalization form to test for. sl@0: * @param errorCode ICU error code in/out parameter. sl@0: * Must fulfill U_SUCCESS before the function call. sl@0: * @return Boolean value indicating whether the source string is in the sl@0: * "mode" normalization form. sl@0: * sl@0: * @see quickCheck sl@0: * @stable ICU 2.2 sl@0: */ sl@0: static inline UBool sl@0: isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode); sl@0: sl@0: /** sl@0: * Test if a string is in a given normalization form; same as the other version of isNormalized sl@0: * but takes an extra options parameter like most normalization functions. sl@0: * sl@0: * @param src String that is to be tested if it is in a normalization format. sl@0: * @param mode Which normalization form to test for. sl@0: * @param options the optional features to be enabled (0 for no options) sl@0: * @param errorCode ICU error code in/out parameter. sl@0: * Must fulfill U_SUCCESS before the function call. sl@0: * @return Boolean value indicating whether the source string is in the sl@0: * "mode" normalization form. sl@0: * sl@0: * @see quickCheck sl@0: * @stable ICU 2.6 sl@0: */ sl@0: static inline UBool sl@0: isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode); sl@0: sl@0: /** sl@0: * Concatenate normalized strings, making sure that the result is normalized as well. sl@0: * sl@0: * If both the left and the right strings are in sl@0: * the normalization form according to "mode/options", sl@0: * then the result will be sl@0: * sl@0: * \code sl@0: * dest=normalize(left+right, mode, options) sl@0: * \endcode sl@0: * sl@0: * For details see unorm_concatenate in unorm.h. sl@0: * sl@0: * @param left Left source string. sl@0: * @param right Right source string. sl@0: * @param result The output string. sl@0: * @param mode The normalization mode. sl@0: * @param options A bit set of normalization options. sl@0: * @param errorCode ICU error code in/out parameter. sl@0: * Must fulfill U_SUCCESS before the function call. sl@0: * @return result sl@0: * sl@0: * @see unorm_concatenate sl@0: * @see normalize sl@0: * @see unorm_next sl@0: * @see unorm_previous sl@0: * sl@0: * @stable ICU 2.1 sl@0: */ sl@0: static UnicodeString & sl@0: U_EXPORT2 concatenate(UnicodeString &left, UnicodeString &right, sl@0: UnicodeString &result, sl@0: UNormalizationMode mode, int32_t options, sl@0: UErrorCode &errorCode); sl@0: sl@0: /** sl@0: * Compare two strings for canonical equivalence. sl@0: * Further options include case-insensitive comparison and sl@0: * code point order (as opposed to code unit order). sl@0: * sl@0: * Canonical equivalence between two strings is defined as their normalized sl@0: * forms (NFD or NFC) being identical. sl@0: * This function compares strings incrementally instead of normalizing sl@0: * (and optionally case-folding) both strings entirely, sl@0: * improving performance significantly. sl@0: * sl@0: * Bulk normalization is only necessary if the strings do not fulfill the FCD sl@0: * conditions. Only in this case, and only if the strings are relatively long, sl@0: * is memory allocated temporarily. sl@0: * For FCD strings and short non-FCD strings there is no memory allocation. sl@0: * sl@0: * Semantically, this is equivalent to sl@0: * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) sl@0: * where code point order and foldCase are all optional. sl@0: * sl@0: * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match sl@0: * the case folding must be performed first, then the normalization. sl@0: * sl@0: * @param s1 First source string. sl@0: * @param s2 Second source string. sl@0: * sl@0: * @param options A bit set of options: sl@0: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: sl@0: * Case-sensitive comparison in code unit order, and the input strings sl@0: * are quick-checked for FCD. sl@0: * sl@0: * - UNORM_INPUT_IS_FCD sl@0: * Set if the caller knows that both s1 and s2 fulfill the FCD conditions. sl@0: * If not set, the function will quickCheck for FCD sl@0: * and normalize if necessary. sl@0: * sl@0: * - U_COMPARE_CODE_POINT_ORDER sl@0: * Set to choose code point order instead of code unit order sl@0: * (see u_strCompare for details). sl@0: * sl@0: * - U_COMPARE_IGNORE_CASE sl@0: * Set to compare strings case-insensitively using case folding, sl@0: * instead of case-sensitively. sl@0: * If set, then the following case folding options are used. sl@0: * sl@0: * - Options as used with case-insensitive comparisons, currently: sl@0: * sl@0: * - U_FOLD_CASE_EXCLUDE_SPECIAL_I sl@0: * (see u_strCaseCompare for details) sl@0: * sl@0: * - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT sl@0: * sl@0: * @param errorCode ICU error code in/out parameter. sl@0: * Must fulfill U_SUCCESS before the function call. sl@0: * @return <0 or 0 or >0 as usual for string comparisons sl@0: * sl@0: * @see unorm_compare sl@0: * @see normalize sl@0: * @see UNORM_FCD sl@0: * @see u_strCompare sl@0: * @see u_strCaseCompare sl@0: * sl@0: * @stable ICU 2.2 sl@0: */ sl@0: static inline int32_t sl@0: compare(const UnicodeString &s1, const UnicodeString &s2, sl@0: uint32_t options, sl@0: UErrorCode &errorCode); sl@0: sl@0: //------------------------------------------------------------------------- sl@0: // Iteration API sl@0: //------------------------------------------------------------------------- sl@0: sl@0: /** sl@0: * Return the current character in the normalized text. sl@0: * current() may need to normalize some text at getIndex(). sl@0: * The getIndex() is not changed. sl@0: * sl@0: * @return the current normalized code point sl@0: * @stable ICU 2.0 sl@0: */ sl@0: UChar32 current(void); sl@0: sl@0: /** sl@0: * Return the first character in the normalized text. sl@0: * This is equivalent to setIndexOnly(startIndex()) followed by next(). sl@0: * (Post-increment semantics.) sl@0: * sl@0: * @return the first normalized code point sl@0: * @stable ICU 2.0 sl@0: */ sl@0: UChar32 first(void); sl@0: sl@0: /** sl@0: * Return the last character in the normalized text. sl@0: * This is equivalent to setIndexOnly(endIndex()) followed by previous(). sl@0: * (Pre-decrement semantics.) sl@0: * sl@0: * @return the last normalized code point sl@0: * @stable ICU 2.0 sl@0: */ sl@0: UChar32 last(void); sl@0: sl@0: /** sl@0: * Return the next character in the normalized text. sl@0: * (Post-increment semantics.) sl@0: * If the end of the text has already been reached, DONE is returned. sl@0: * The DONE value could be confused with a U+FFFF non-character code point sl@0: * in the text. If this is possible, you can test getIndex()startIndex() || first()!=DONE). (Calling first() will change sl@0: * the iterator state!) sl@0: * sl@0: * The C API unorm_previous() is more efficient and does not have this ambiguity. sl@0: * sl@0: * @return the previous normalized code point sl@0: * @stable ICU 2.0 sl@0: */ sl@0: UChar32 previous(void); sl@0: sl@0: /** sl@0: * Set the iteration position in the input text that is being normalized, sl@0: * without any immediate normalization. sl@0: * After setIndexOnly(), getIndex() will return the same index that is sl@0: * specified here. sl@0: * sl@0: * @param index the desired index in the input text. sl@0: * @stable ICU 2.0 sl@0: */ sl@0: void setIndexOnly(int32_t index); sl@0: sl@0: /** sl@0: * Reset the index to the beginning of the text. sl@0: * This is equivalent to setIndexOnly(startIndex)). sl@0: * @stable ICU 2.0 sl@0: */ sl@0: void reset(void); sl@0: sl@0: /** sl@0: * Retrieve the current iteration position in the input text that is sl@0: * being normalized. sl@0: * sl@0: * A following call to next() will return a normalized code point from sl@0: * the input text at or after this index. sl@0: * sl@0: * After a call to previous(), getIndex() will point at or before the sl@0: * position in the input text where the normalized code point sl@0: * was returned from with previous(). sl@0: * sl@0: * @return the current index in the input text sl@0: * @stable ICU 2.0 sl@0: */ sl@0: int32_t getIndex(void) const; sl@0: sl@0: /** sl@0: * Retrieve the index of the start of the input text. This is the begin index sl@0: * of the CharacterIterator or the start (i.e. index 0) of the string sl@0: * over which this Normalizer is iterating. sl@0: * sl@0: * @return the smallest index in the input text where the Normalizer operates sl@0: * @stable ICU 2.0 sl@0: */ sl@0: int32_t startIndex(void) const; sl@0: sl@0: /** sl@0: * Retrieve the index of the end of the input text. This is the end index sl@0: * of the CharacterIterator or the length of the string sl@0: * over which this Normalizer is iterating. sl@0: * This end index is exclusive, i.e., the Normalizer operates only on characters sl@0: * before this index. sl@0: * sl@0: * @return the first index in the input text where the Normalizer does not operate sl@0: * @stable ICU 2.0 sl@0: */ sl@0: int32_t endIndex(void) const; sl@0: sl@0: /** sl@0: * Returns TRUE when both iterators refer to the same character in the same sl@0: * input text. sl@0: * sl@0: * @param that a Normalizer object to compare this one to sl@0: * @return comparison result sl@0: * @stable ICU 2.0 sl@0: */ sl@0: UBool operator==(const Normalizer& that) const; sl@0: sl@0: /** sl@0: * Returns FALSE when both iterators refer to the same character in the same sl@0: * input text. sl@0: * sl@0: * @param that a Normalizer object to compare this one to sl@0: * @return comparison result sl@0: * @stable ICU 2.0 sl@0: */ sl@0: inline UBool operator!=(const Normalizer& that) const; sl@0: sl@0: /** sl@0: * Returns a pointer to a new Normalizer that is a clone of this one. sl@0: * The caller is responsible for deleting the new clone. sl@0: * @return a pointer to a new Normalizer sl@0: * @stable ICU 2.0 sl@0: */ sl@0: Normalizer* clone(void) const; sl@0: sl@0: /** sl@0: * Generates a hash code for this iterator. sl@0: * sl@0: * @return the hash code sl@0: * @stable ICU 2.0 sl@0: */ sl@0: int32_t hashCode(void) const; sl@0: sl@0: //------------------------------------------------------------------------- sl@0: // Property access methods sl@0: //------------------------------------------------------------------------- sl@0: sl@0: /** sl@0: * Set the normalization mode for this object. sl@0: *

sl@0: * Note:If the normalization mode is changed while iterating sl@0: * over a string, calls to {@link #next() } and {@link #previous() } may sl@0: * return previously buffers characters in the old normalization mode sl@0: * until the iteration is able to re-sync at the next base character. sl@0: * It is safest to call {@link #setIndexOnly }, {@link #reset() }, sl@0: * {@link #setText }, {@link #first() }, sl@0: * {@link #last() }, etc. after calling setMode. sl@0: *

sl@0: * @param newMode the new mode for this Normalizer. sl@0: * @see #getUMode sl@0: * @stable ICU 2.0 sl@0: */ sl@0: void setMode(UNormalizationMode newMode); sl@0: sl@0: /** sl@0: * Return the normalization mode for this object. sl@0: * sl@0: * This is an unusual name because there used to be a getMode() that sl@0: * returned a different type. sl@0: * sl@0: * @return the mode for this Normalizer sl@0: * @see #setMode sl@0: * @stable ICU 2.0 sl@0: */ sl@0: UNormalizationMode getUMode(void) const; sl@0: sl@0: /** sl@0: * Set options that affect this Normalizer's operation. sl@0: * Options do not change the basic composition or decomposition operation sl@0: * that is being performed, but they control whether sl@0: * certain optional portions of the operation are done. sl@0: * Currently the only available option is obsolete. sl@0: * sl@0: * It is possible to specify multiple options that are all turned on or off. sl@0: * sl@0: * @param option the option(s) whose value is/are to be set. sl@0: * @param value the new setting for the option. Use TRUE to sl@0: * turn the option(s) on and FALSE to turn it/them off. sl@0: * sl@0: * @see #getOption sl@0: * @stable ICU 2.0 sl@0: */ sl@0: void setOption(int32_t option, sl@0: UBool value); sl@0: sl@0: /** sl@0: * Determine whether an option is turned on or off. sl@0: * If multiple options are specified, then the result is TRUE if any sl@0: * of them are set. sl@0: *

sl@0: * @param option the option(s) that are to be checked sl@0: * @return TRUE if any of the option(s) are set sl@0: * @see #setOption sl@0: * @stable ICU 2.0 sl@0: */ sl@0: UBool getOption(int32_t option) const; sl@0: sl@0: /** sl@0: * Set the input text over which this Normalizer will iterate. sl@0: * The iteration position is set to the beginning. sl@0: * sl@0: * @param newText a string that replaces the current input text sl@0: * @param status a UErrorCode sl@0: * @stable ICU 2.0 sl@0: */ sl@0: void setText(const UnicodeString& newText, sl@0: UErrorCode &status); sl@0: sl@0: /** sl@0: * Set the input text over which this Normalizer will iterate. sl@0: * The iteration position is set to the beginning. sl@0: * sl@0: * @param newText a CharacterIterator object that replaces the current input text sl@0: * @param status a UErrorCode sl@0: * @stable ICU 2.0 sl@0: */ sl@0: void setText(const CharacterIterator& newText, sl@0: UErrorCode &status); sl@0: sl@0: /** sl@0: * Set the input text over which this Normalizer will iterate. sl@0: * The iteration position is set to the beginning. sl@0: * sl@0: * @param newText a string that replaces the current input text sl@0: * @param length the length of the string, or -1 if NUL-terminated sl@0: * @param status a UErrorCode sl@0: * @stable ICU 2.0 sl@0: */ sl@0: void setText(const UChar* newText, sl@0: int32_t length, sl@0: UErrorCode &status); sl@0: /** sl@0: * Copies the input text into the UnicodeString argument. sl@0: * sl@0: * @param result Receives a copy of the text under iteration. sl@0: * @stable ICU 2.0 sl@0: */ sl@0: void getText(UnicodeString& result); sl@0: sl@0: /** sl@0: * ICU "poor man's RTTI", returns a UClassID for this class. sl@0: * @returns a UClassID for this class. sl@0: * @stable ICU 2.2 sl@0: */ sl@0: static UClassID U_EXPORT2 getStaticClassID(); sl@0: sl@0: /** sl@0: * ICU "poor man's RTTI", returns a UClassID for the actual class. sl@0: * @return a UClassID for the actual class. sl@0: * @stable ICU 2.2 sl@0: */ sl@0: virtual UClassID getDynamicClassID() const; sl@0: sl@0: private: sl@0: //------------------------------------------------------------------------- sl@0: // Private functions sl@0: //------------------------------------------------------------------------- sl@0: sl@0: Normalizer(); // default constructor not implemented sl@0: Normalizer &operator=(const Normalizer &that); // assignment operator not implemented sl@0: sl@0: // Private utility methods for iteration sl@0: // For documentation, see the source code sl@0: UBool nextNormalize(); sl@0: UBool previousNormalize(); sl@0: sl@0: void init(CharacterIterator *iter); sl@0: void clearBuffer(void); sl@0: sl@0: //------------------------------------------------------------------------- sl@0: // Private data sl@0: //------------------------------------------------------------------------- sl@0: sl@0: UNormalizationMode fUMode; sl@0: int32_t fOptions; sl@0: sl@0: // The input text and our position in it sl@0: UCharIterator *text; sl@0: sl@0: // The normalization buffer is the result of normalization sl@0: // of the source in [currentIndex..nextIndex[ . sl@0: int32_t currentIndex, nextIndex; sl@0: sl@0: // A buffer for holding intermediate results sl@0: UnicodeString buffer; sl@0: int32_t bufferPos; sl@0: sl@0: }; sl@0: sl@0: //------------------------------------------------------------------------- sl@0: // Inline implementations sl@0: //------------------------------------------------------------------------- sl@0: sl@0: inline UBool sl@0: Normalizer::operator!= (const Normalizer& other) const sl@0: { return ! operator==(other); } sl@0: sl@0: inline UNormalizationCheckResult sl@0: Normalizer::quickCheck(const UnicodeString& source, sl@0: UNormalizationMode mode, sl@0: UErrorCode &status) { sl@0: if(U_FAILURE(status)) { sl@0: return UNORM_MAYBE; sl@0: } sl@0: sl@0: return unorm_quickCheck(source.getBuffer(), source.length(), sl@0: mode, &status); sl@0: } sl@0: sl@0: inline UNormalizationCheckResult sl@0: Normalizer::quickCheck(const UnicodeString& source, sl@0: UNormalizationMode mode, int32_t options, sl@0: UErrorCode &status) { sl@0: if(U_FAILURE(status)) { sl@0: return UNORM_MAYBE; sl@0: } sl@0: sl@0: return unorm_quickCheckWithOptions(source.getBuffer(), source.length(), sl@0: mode, options, &status); sl@0: } sl@0: sl@0: inline UBool sl@0: Normalizer::isNormalized(const UnicodeString& source, sl@0: UNormalizationMode mode, sl@0: UErrorCode &status) { sl@0: if(U_FAILURE(status)) { sl@0: return FALSE; sl@0: } sl@0: sl@0: return unorm_isNormalized(source.getBuffer(), source.length(), sl@0: mode, &status); sl@0: } sl@0: sl@0: inline UBool sl@0: Normalizer::isNormalized(const UnicodeString& source, sl@0: UNormalizationMode mode, int32_t options, sl@0: UErrorCode &status) { sl@0: if(U_FAILURE(status)) { sl@0: return FALSE; sl@0: } sl@0: sl@0: return unorm_isNormalizedWithOptions(source.getBuffer(), source.length(), sl@0: mode, options, &status); sl@0: } sl@0: sl@0: inline int32_t sl@0: Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2, sl@0: uint32_t options, sl@0: UErrorCode &errorCode) { sl@0: // all argument checking is done in unorm_compare sl@0: return unorm_compare(s1.getBuffer(), s1.length(), sl@0: s2.getBuffer(), s2.length(), sl@0: options, sl@0: &errorCode); sl@0: } sl@0: sl@0: U_NAMESPACE_END sl@0: sl@0: #endif /* #if !UCONFIG_NO_NORMALIZATION */ sl@0: sl@0: #endif // NORMLZR_H