1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/normlzr.h Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,820 @@
1.4 +/*
1.5 + ********************************************************************
1.6 + * COPYRIGHT:
1.7 + * Copyright (c) 1996-2005, International Business Machines Corporation and
1.8 + * others. All Rights Reserved.
1.9 + ********************************************************************
1.10 + */
1.11 +
1.12 +#ifndef NORMLZR_H
1.13 +#define NORMLZR_H
1.14 +
1.15 +#include "unicode/utypes.h"
1.16 +
1.17 +/**
1.18 + * \file
1.19 + * \brief C++ API: Unicode Normalization
1.20 + */
1.21 +
1.22 +#if !UCONFIG_NO_NORMALIZATION
1.23 +
1.24 +#include "unicode/uobject.h"
1.25 +#include "unicode/unistr.h"
1.26 +#include "unicode/chariter.h"
1.27 +#include "unicode/unorm.h"
1.28 +
1.29 +
1.30 +struct UCharIterator; // forward declaration suffices: this header only stores a UCharIterator pointer (Normalizer::text)
1.31 +typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
1.32 +
1.33 +U_NAMESPACE_BEGIN
1.34 +/**
1.35 + *
1.36 + * The Normalizer class consists of two parts:
1.37 + * - static functions that normalize strings or test if strings are normalized
1.38 + * - a Normalizer object is an iterator that takes any kind of text and
1.39 + * provides iteration over its normalized form
1.40 + *
1.41 + * The Normalizer class is not suitable for subclassing.
1.42 + *
1.43 + * The static functions are basically wrappers around the C implementation,
1.44 + * using UnicodeString instead of UChar*.
1.45 + * For basic information about normalization forms and details about the C API
1.46 + * please see the documentation in unorm.h.
1.47 + *
1.48 + * The iterator API with the Normalizer constructors and the non-static functions
1.49 + * uses a CharacterIterator as input. It is possible to pass a string which
1.50 + * is then internally wrapped in a CharacterIterator.
1.51 + * The input text is not normalized all at once, but incrementally where needed
1.52 + * (providing efficient random access).
1.53 + * This makes it possible to pass in a large text but spend only a small amount of time
1.54 + * normalizing a small part of that text.
1.55 + * However, if the entire text is normalized, then the iterator will be
1.56 + * slower than normalizing the entire text at once and iterating over the result.
1.57 + * A possible use of the Normalizer iterator is also to report an index into the
1.58 + * original text that is close to where the normalized characters come from.
1.59 + *
1.60 + * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
1.61 + * The earlier implementation reported the getIndex() inconsistently,
1.62 + * and previous() could not be used after setIndex(), next(), first(), and current().
1.63 + *
1.64 + * Normalizer allows you to start normalizing from anywhere in the input text by
1.65 + * calling setIndexOnly(), first(), or last().
1.66 + * Without calling any of these, the iterator will start at the beginning of the text.
1.67 + *
1.68 + * At any time, next() returns the next normalized code point (UChar32),
1.69 + * with post-increment semantics (like CharacterIterator::next32PostInc()).
1.70 + * previous() returns the previous normalized code point (UChar32),
1.71 + * with pre-decrement semantics (like CharacterIterator::previous32()).
1.72 + *
1.73 + * current() returns the current code point
1.74 + * (respectively the one at the newly set index) without moving
1.75 + * the getIndex(). Note that if the text at the current position
1.76 + * needs to be normalized, then these functions will do that.
1.77 + * (This is why current() is not const.)
1.78 + * It is more efficient to call setIndexOnly() instead, which does not
1.79 + * normalize.
1.80 + *
1.81 + * getIndex() always refers to the position in the input text where the normalized
1.82 + * code points are returned from. It does not always change with each returned
1.83 + * code point.
1.84 + * The code point that is returned from any of the functions
1.85 + * corresponds to text at or after getIndex(), according to the
1.86 + * function's iteration semantics (post-increment or pre-decrement).
1.87 + *
1.88 + * next() returns a code point from at or after the getIndex()
1.89 + * from before the next() call. After the next() call, the getIndex()
1.90 + * might have moved to where the next code point will be returned from
1.91 + * (from a next() or current() call).
1.92 + * This is semantically equivalent to array access with array[index++]
1.93 + * (post-increment semantics).
1.94 + *
1.95 + * previous() returns a code point from at or after the getIndex()
1.96 + * from after the previous() call.
1.97 + * This is semantically equivalent to array access with array[--index]
1.98 + * (pre-decrement semantics).
1.99 + *
1.100 + * Internally, the Normalizer iterator normalizes a small piece of text
1.101 + * starting at the getIndex() and ending at a following "safe" index.
1.102 + * The normalized result is stored in an internal string buffer, and
1.103 + * the code points are iterated from there.
1.104 + * With multiple iteration calls, this is repeated until the next piece
1.105 + * of text needs to be normalized, and the getIndex() needs to be moved.
1.106 + *
1.107 + * The following "safe" index, the internal buffer, and the secondary
1.108 + * iteration index into that buffer are not exposed on the API.
1.109 + * This also means that it is currently not practical to return to
1.110 + * a particular, arbitrary position in the text because one would need to
1.111 + * know, and be able to set, in addition to the getIndex(), at least also the
1.112 + * current index into the internal buffer.
1.113 + * It is currently only possible to observe when getIndex() changes
1.114 + * (with careful consideration of the iteration semantics),
1.115 + * at which time the internal index will be 0.
1.116 + * For example, if getIndex() is different after next() than before it,
1.117 + * then the internal index is 0 and one can return to this getIndex()
1.118 + * later with setIndexOnly().
1.119 + *
1.120 + * @author Laura Werner, Mark Davis, Markus Scherer
1.121 + * @stable ICU 2.0
1.122 + */
1.123 +class U_COMMON_API Normalizer : public UObject {
1.124 +public:
1.125 + /**
1.126 + * If DONE is returned from an iteration function that returns a code point,
1.127 + * then there are no more normalization results available.
1.128 + * @stable ICU 2.0
1.129 + */
1.130 + enum {
1.131 + DONE=0xffff // same value as the non-character U+FFFF; see next()/previous() for how to disambiguate
1.132 + };
1.133 +
1.134 + // Constructors
1.135 +
1.136 + /**
1.137 + * Creates a new <code>Normalizer</code> object for iterating over the
1.138 + * normalized form of a given string.
1.139 + * <p>
1.140 + * @param str The string to be normalized. The normalization
1.141 + * will start at the beginning of the string.
1.142 + *
1.143 + * @param mode The normalization mode.
1.144 + * @stable ICU 2.0
1.145 + */
1.146 + Normalizer(const UnicodeString& str, UNormalizationMode mode);
1.147 +
1.148 + /**
1.149 + * Creates a new <code>Normalizer</code> object for iterating over the
1.150 + * normalized form of a given string.
1.151 + * <p>
1.152 + * @param str The string to be normalized. The normalization
1.153 + * will start at the beginning of the string.
1.154 + *
1.155 + * @param length Length of the string, or -1 if NUL-terminated.
1.156 + * @param mode The normalization mode.
1.157 + * @stable ICU 2.0
1.158 + */
1.159 + Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);
1.160 +
1.161 + /**
1.162 + * Creates a new <code>Normalizer</code> object for iterating over the
1.163 + * normalized form of the given text.
1.164 + * <p>
1.165 + * @param iter The input text to be normalized. The normalization
1.166 + * will start at the beginning of the string.
1.167 + *
1.168 + * @param mode The normalization mode.
1.169 + * @stable ICU 2.0
1.170 + */
1.171 + Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
1.172 +
1.173 + /**
1.174 + * Copy constructor.
1.175 + * @param copy The object to be copied.
1.176 + * @stable ICU 2.0
1.177 + */
1.178 + Normalizer(const Normalizer& copy);
1.179 +
1.180 + /**
1.181 + * Destructor
1.182 + * @stable ICU 2.0
1.183 + */
1.184 + virtual ~Normalizer();
1.185 +
1.186 +
1.187 + //-------------------------------------------------------------------------
1.188 + // Static utility methods
1.189 + //-------------------------------------------------------------------------
1.190 +
1.191 + /**
1.192 + * Normalizes a <code>UnicodeString</code> according to the specified normalization mode.
1.193 + * This is a wrapper for unorm_normalize(), using UnicodeString's.
1.194 + *
1.195 + * The <code>options</code> parameter specifies which optional
1.196 + * <code>Normalizer</code> features are to be enabled for this operation.
1.197 + *
1.198 + * @param source the input string to be normalized.
1.199 + * @param mode the normalization mode
1.200 + * @param options the optional features to be enabled (0 for no options)
1.201 + * @param result The normalized string (on output).
1.202 + * @param status The error code.
1.203 + * @stable ICU 2.0
1.204 + */
1.205 + static void U_EXPORT2 normalize(const UnicodeString& source,
1.206 + UNormalizationMode mode, int32_t options,
1.207 + UnicodeString& result,
1.208 + UErrorCode &status);
1.209 +
1.210 + /**
1.211 + * Compose a <code>UnicodeString</code>.
1.212 + * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC.
1.213 + * This is a wrapper for unorm_normalize(), using UnicodeString's.
1.214 + *
1.215 + * The <code>options</code> parameter specifies which optional
1.216 + * <code>Normalizer</code> features are to be enabled for this operation.
1.217 + *
1.218 + * @param source the string to be composed.
1.219 + * @param compat Perform compatibility decomposition before composition.
1.220 + * If this argument is <code>FALSE</code>, only canonical
1.221 + * decomposition will be performed.
1.222 + * @param options the optional features to be enabled (0 for no options)
1.223 + * @param result The composed string (on output).
1.224 + * @param status The error code.
1.225 + * @stable ICU 2.0
1.226 + */
1.227 + static void U_EXPORT2 compose(const UnicodeString& source,
1.228 + UBool compat, int32_t options,
1.229 + UnicodeString& result,
1.230 + UErrorCode &status);
1.231 +
1.232 + /**
1.233 + * Static method to decompose a <code>UnicodeString</code>.
1.234 + * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD.
1.235 + * This is a wrapper for unorm_normalize(), using UnicodeString's.
1.236 + *
1.237 + * The <code>options</code> parameter specifies which optional
1.238 + * <code>Normalizer</code> features are to be enabled for this operation.
1.239 + *
1.240 + * @param source the string to be decomposed.
1.241 + * @param compat Perform compatibility decomposition.
1.242 + * If this argument is <code>FALSE</code>, only canonical
1.243 + * decomposition will be performed.
1.244 + * @param options the optional features to be enabled (0 for no options)
1.245 + * @param result The decomposed string (on output).
1.246 + * @param status The error code.
1.247 + * @stable ICU 2.0
1.248 + */
1.249 + static void U_EXPORT2 decompose(const UnicodeString& source,
1.250 + UBool compat, int32_t options,
1.251 + UnicodeString& result,
1.252 + UErrorCode &status);
1.253 +
1.254 + /**
1.255 + * Performs a quick check on a string to determine whether the string is
1.256 + * in a particular normalization format.
1.257 + * This is a wrapper for unorm_quickCheck(), using a UnicodeString.
1.258 + *
1.259 + * Three types of result can be returned: UNORM_YES, UNORM_NO or
1.260 + * UNORM_MAYBE. Result UNORM_YES indicates that the argument
1.261 + * string is in the desired normalized format, UNORM_NO indicates that the
1.262 + * argument string is not in the desired normalized format. A
1.263 + * UNORM_MAYBE result indicates that a more thorough check is required;
1.264 + * the user may have to put the string in its normalized form and compare the
1.265 + * results.
1.266 + * @param source string for determining if it is in a normalized format
1.267 + * @param mode normalization format
1.268 + * @param status A reference to a UErrorCode to receive any errors
1.269 + * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
1.270 + *
1.271 + * @see isNormalized
1.272 + * @stable ICU 2.0
1.273 + */
1.274 + static inline UNormalizationCheckResult
1.275 + quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
1.276 +
1.277 + /**
1.278 + * Performs a quick check on a string; same as the other version of quickCheck
1.279 + * but takes an extra options parameter like most normalization functions.
1.280 + *
1.281 + * @param source string for determining if it is in a normalized format
1.282 + * @param mode normalization format
1.283 + * @param options the optional features to be enabled (0 for no options)
1.284 + * @param status A reference to a UErrorCode to receive any errors
1.285 + * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
1.286 + *
1.287 + * @see isNormalized
1.288 + * @stable ICU 2.6
1.289 + */
1.290 + static inline UNormalizationCheckResult
1.291 + quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
1.292 +
1.293 + /**
1.294 + * Test if a string is in a given normalization form.
1.295 + * This is semantically equivalent to source.equals(normalize(source, mode)) .
1.296 + *
1.297 + * Unlike unorm_quickCheck(), this function returns a definitive result,
1.298 + * never a "maybe".
1.299 + * For NFD, NFKD, and FCD, both functions work exactly the same.
1.300 + * For NFC and NFKC where quickCheck may return "maybe", this function will
1.301 + * perform further tests to arrive at a TRUE/FALSE result.
1.302 + *
1.303 + * @param src String that is to be tested if it is in a normalization format.
1.304 + * @param mode Which normalization form to test for.
1.305 + * @param errorCode ICU error code in/out parameter.
1.306 + * Must fulfill U_SUCCESS before the function call.
1.307 + * @return Boolean value indicating whether the source string is in the
1.308 + * "mode" normalization form.
1.309 + *
1.310 + * @see quickCheck
1.311 + * @stable ICU 2.2
1.312 + */
1.313 + static inline UBool
1.314 + isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
1.315 +
1.316 + /**
1.317 + * Test if a string is in a given normalization form; same as the other version of isNormalized
1.318 + * but takes an extra options parameter like most normalization functions.
1.319 + *
1.320 + * @param src String that is to be tested if it is in a normalization format.
1.321 + * @param mode Which normalization form to test for.
1.322 + * @param options the optional features to be enabled (0 for no options)
1.323 + * @param errorCode ICU error code in/out parameter.
1.324 + * Must fulfill U_SUCCESS before the function call.
1.325 + * @return Boolean value indicating whether the source string is in the
1.326 + * "mode" normalization form.
1.327 + *
1.328 + * @see quickCheck
1.329 + * @stable ICU 2.6
1.330 + */
1.331 + static inline UBool
1.332 + isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
1.333 +
1.334 + /**
1.335 + * Concatenate normalized strings, making sure that the result is normalized as well.
1.336 + *
1.337 + * If both the left and the right strings are in
1.338 + * the normalization form according to "mode/options",
1.339 + * then the result will be
1.340 + *
1.341 + * \code
1.342 + * dest=normalize(left+right, mode, options)
1.343 + * \endcode
1.344 + *
1.345 + * For details see unorm_concatenate in unorm.h.
1.346 + *
1.347 + * @param left Left source string.
1.348 + * @param right Right source string.
1.349 + * @param result The output string.
1.350 + * @param mode The normalization mode.
1.351 + * @param options A bit set of normalization options.
1.352 + * @param errorCode ICU error code in/out parameter.
1.353 + * Must fulfill U_SUCCESS before the function call.
1.354 + * @return result
1.355 + *
1.356 + * @see unorm_concatenate
1.357 + * @see normalize
1.358 + * @see unorm_next
1.359 + * @see unorm_previous
1.360 + *
1.361 + * @stable ICU 2.1
1.362 + */
1.363 + static UnicodeString &
1.364 + U_EXPORT2 concatenate(UnicodeString &left, UnicodeString &right,
1.365 + UnicodeString &result,
1.366 + UNormalizationMode mode, int32_t options,
1.367 + UErrorCode &errorCode);
1.368 +
1.369 + /**
1.370 + * Compare two strings for canonical equivalence.
1.371 + * Further options include case-insensitive comparison and
1.372 + * code point order (as opposed to code unit order).
1.373 + *
1.374 + * Canonical equivalence between two strings is defined as their normalized
1.375 + * forms (NFD or NFC) being identical.
1.376 + * This function compares strings incrementally instead of normalizing
1.377 + * (and optionally case-folding) both strings entirely,
1.378 + * improving performance significantly.
1.379 + *
1.380 + * Bulk normalization is only necessary if the strings do not fulfill the FCD
1.381 + * conditions. Only in this case, and only if the strings are relatively long,
1.382 + * is memory allocated temporarily.
1.383 + * For FCD strings and short non-FCD strings there is no memory allocation.
1.384 + *
1.385 + * Semantically, this is equivalent to
1.386 + * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
1.387 + * where code point order and foldCase are all optional.
1.388 + *
1.389 + * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
1.390 + * the case folding must be performed first, then the normalization.
1.391 + *
1.392 + * @param s1 First source string.
1.393 + * @param s2 Second source string.
1.394 + *
1.395 + * @param options A bit set of options:
1.396 + * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
1.397 + * Case-sensitive comparison in code unit order, and the input strings
1.398 + * are quick-checked for FCD.
1.399 + *
1.400 + * - UNORM_INPUT_IS_FCD
1.401 + * Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
1.402 + * If not set, the function will quickCheck for FCD
1.403 + * and normalize if necessary.
1.404 + *
1.405 + * - U_COMPARE_CODE_POINT_ORDER
1.406 + * Set to choose code point order instead of code unit order
1.407 + * (see u_strCompare for details).
1.408 + *
1.409 + * - U_COMPARE_IGNORE_CASE
1.410 + * Set to compare strings case-insensitively using case folding,
1.411 + * instead of case-sensitively.
1.412 + * If set, then the following case folding options are used.
1.413 + *
1.414 + * - Options as used with case-insensitive comparisons, currently:
1.415 + *
1.416 + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
1.417 + * (see u_strCaseCompare for details)
1.418 + *
1.419 + * - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
1.420 + *
1.421 + * @param errorCode ICU error code in/out parameter.
1.422 + * Must fulfill U_SUCCESS before the function call.
1.423 + * @return <0 or 0 or >0 as usual for string comparisons
1.424 + *
1.425 + * @see unorm_compare
1.426 + * @see normalize
1.427 + * @see UNORM_FCD
1.428 + * @see u_strCompare
1.429 + * @see u_strCaseCompare
1.430 + *
1.431 + * @stable ICU 2.2
1.432 + */
1.433 + static inline int32_t
1.434 + compare(const UnicodeString &s1, const UnicodeString &s2,
1.435 + uint32_t options,
1.436 + UErrorCode &errorCode);
1.437 +
1.438 + //-------------------------------------------------------------------------
1.439 + // Iteration API
1.440 + //-------------------------------------------------------------------------
1.441 +
1.442 + /**
1.443 + * Return the current character in the normalized text.
1.444 + * current() may need to normalize some text at getIndex().
1.445 + * The getIndex() is not changed.
1.446 + *
1.447 + * @return the current normalized code point
1.448 + * @stable ICU 2.0
1.449 + */
1.450 + UChar32 current(void);
1.451 +
1.452 + /**
1.453 + * Return the first character in the normalized text.
1.454 + * This is equivalent to setIndexOnly(startIndex()) followed by next().
1.455 + * (Post-increment semantics.)
1.456 + *
1.457 + * @return the first normalized code point
1.458 + * @stable ICU 2.0
1.459 + */
1.460 + UChar32 first(void);
1.461 +
1.462 + /**
1.463 + * Return the last character in the normalized text.
1.464 + * This is equivalent to setIndexOnly(endIndex()) followed by previous().
1.465 + * (Pre-decrement semantics.)
1.466 + *
1.467 + * @return the last normalized code point
1.468 + * @stable ICU 2.0
1.469 + */
1.470 + UChar32 last(void);
1.471 +
1.472 + /**
1.473 + * Return the next character in the normalized text.
1.474 + * (Post-increment semantics.)
1.475 + * If the end of the text has already been reached, DONE is returned.
1.476 + * The DONE value could be confused with a U+FFFF non-character code point
1.477 + * in the text. If this is possible, you can test getIndex()<endIndex()
1.478 + * before calling next(), or (getIndex()<endIndex() || last()!=DONE)
1.479 + * after calling next(). (Calling last() will change the iterator state!)
1.480 + *
1.481 + * The C API unorm_next() is more efficient and does not have this ambiguity.
1.482 + *
1.483 + * @return the next normalized code point
1.484 + * @stable ICU 2.0
1.485 + */
1.486 + UChar32 next(void);
1.487 +
1.488 + /**
1.489 + * Return the previous character in the normalized text and decrement.
1.490 + * (Pre-decrement semantics.)
1.491 + * If the beginning of the text has already been reached, DONE is returned.
1.492 + * The DONE value could be confused with a U+FFFF non-character code point
1.493 + * in the text. If this is possible, you can test
1.494 + * (getIndex()>startIndex() || first()!=DONE). (Calling first() will change
1.495 + * the iterator state!)
1.496 + *
1.497 + * The C API unorm_previous() is more efficient and does not have this ambiguity.
1.498 + *
1.499 + * @return the previous normalized code point
1.500 + * @stable ICU 2.0
1.501 + */
1.502 + UChar32 previous(void);
1.503 +
1.504 + /**
1.505 + * Set the iteration position in the input text that is being normalized,
1.506 + * without any immediate normalization.
1.507 + * After setIndexOnly(), getIndex() will return the same index that is
1.508 + * specified here.
1.509 + *
1.510 + * @param index the desired index in the input text.
1.511 + * @stable ICU 2.0
1.512 + */
1.513 + void setIndexOnly(int32_t index);
1.514 +
1.515 + /**
1.516 + * Reset the index to the beginning of the text.
1.517 + * This is equivalent to setIndexOnly(startIndex()).
1.518 + * @stable ICU 2.0
1.519 + */
1.520 + void reset(void);
1.521 +
1.522 + /**
1.523 + * Retrieve the current iteration position in the input text that is
1.524 + * being normalized.
1.525 + *
1.526 + * A following call to next() will return a normalized code point from
1.527 + * the input text at or after this index.
1.528 + *
1.529 + * After a call to previous(), getIndex() will point at or before the
1.530 + * position in the input text where the normalized code point
1.531 + * was returned from with previous().
1.532 + *
1.533 + * @return the current index in the input text
1.534 + * @stable ICU 2.0
1.535 + */
1.536 + int32_t getIndex(void) const;
1.537 +
1.538 + /**
1.539 + * Retrieve the index of the start of the input text. This is the begin index
1.540 + * of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string
1.541 + * over which this <code>Normalizer</code> is iterating.
1.542 + *
1.543 + * @return the smallest index in the input text where the Normalizer operates
1.544 + * @stable ICU 2.0
1.545 + */
1.546 + int32_t startIndex(void) const;
1.547 +
1.548 + /**
1.549 + * Retrieve the index of the end of the input text. This is the end index
1.550 + * of the <code>CharacterIterator</code> or the length of the string
1.551 + * over which this <code>Normalizer</code> is iterating.
1.552 + * This end index is exclusive, i.e., the Normalizer operates only on characters
1.553 + * before this index.
1.554 + *
1.555 + * @return the first index in the input text where the Normalizer does not operate
1.556 + * @stable ICU 2.0
1.557 + */
1.558 + int32_t endIndex(void) const;
1.559 +
1.560 + /**
1.561 + * Returns TRUE when both iterators refer to the same character in the same
1.562 + * input text.
1.563 + *
1.564 + * @param that a Normalizer object to compare this one to
1.565 + * @return comparison result
1.566 + * @stable ICU 2.0
1.567 + */
1.568 + UBool operator==(const Normalizer& that) const;
1.569 +
1.570 + /**
1.571 + * Returns FALSE when both iterators refer to the same character in the same
1.572 + * input text.
1.573 + *
1.574 + * @param that a Normalizer object to compare this one to
1.575 + * @return comparison result
1.576 + * @stable ICU 2.0
1.577 + */
1.578 + inline UBool operator!=(const Normalizer& that) const;
1.579 +
1.580 + /**
1.581 + * Returns a pointer to a new Normalizer that is a clone of this one.
1.582 + * The caller is responsible for deleting the new clone.
1.583 + * @return a pointer to a new Normalizer
1.584 + * @stable ICU 2.0
1.585 + */
1.586 + Normalizer* clone(void) const;
1.587 +
1.588 + /**
1.589 + * Generates a hash code for this iterator.
1.590 + *
1.591 + * @return the hash code
1.592 + * @stable ICU 2.0
1.593 + */
1.594 + int32_t hashCode(void) const;
1.595 +
1.596 + //-------------------------------------------------------------------------
1.597 + // Property access methods
1.598 + //-------------------------------------------------------------------------
1.599 +
1.600 + /**
1.601 + * Set the normalization mode for this object.
1.602 + * <p>
1.603 + * <b>Note:</b> If the normalization mode is changed while iterating
1.604 + * over a string, calls to {@link #next() } and {@link #previous() } may
1.605 + * return previously buffered characters in the old normalization mode
1.606 + * until the iteration is able to re-sync at the next base character.
1.607 + * It is safest to call {@link #setIndexOnly }, {@link #reset() },
1.608 + * {@link #setText }, {@link #first() },
1.609 + * {@link #last() }, etc. after calling <code>setMode</code>.
1.610 + * <p>
1.611 + * @param newMode the new mode for this <code>Normalizer</code>.
1.612 + * @see #getUMode
1.613 + * @stable ICU 2.0
1.614 + */
1.615 + void setMode(UNormalizationMode newMode);
1.616 +
1.617 + /**
1.618 + * Return the normalization mode for this object.
1.619 + *
1.620 + * This is an unusual name because there used to be a getMode() that
1.621 + * returned a different type.
1.622 + *
1.623 + * @return the mode for this <code>Normalizer</code>
1.624 + * @see #setMode
1.625 + * @stable ICU 2.0
1.626 + */
1.627 + UNormalizationMode getUMode(void) const;
1.628 +
1.629 + /**
1.630 + * Set options that affect this <code>Normalizer</code>'s operation.
1.631 + * Options do not change the basic composition or decomposition operation
1.632 + * that is being performed, but they control whether
1.633 + * certain optional portions of the operation are done.
1.634 + * Currently the only available option is obsolete.
1.635 + *
1.636 + * It is possible to specify multiple options that are all turned on or off.
1.637 + *
1.638 + * @param option the option(s) whose value is/are to be set.
1.639 + * @param value the new setting for the option. Use <code>TRUE</code> to
1.640 + * turn the option(s) on and <code>FALSE</code> to turn it/them off.
1.641 + *
1.642 + * @see #getOption
1.643 + * @stable ICU 2.0
1.644 + */
1.645 + void setOption(int32_t option,
1.646 + UBool value);
1.647 +
1.648 + /**
1.649 + * Determine whether an option is turned on or off.
1.650 + * If multiple options are specified, then the result is TRUE if any
1.651 + * of them are set.
1.652 + * <p>
1.653 + * @param option the option(s) that are to be checked
1.654 + * @return TRUE if any of the option(s) are set
1.655 + * @see #setOption
1.656 + * @stable ICU 2.0
1.657 + */
1.658 + UBool getOption(int32_t option) const;
1.659 +
1.660 + /**
1.661 + * Set the input text over which this <code>Normalizer</code> will iterate.
1.662 + * The iteration position is set to the beginning.
1.663 + *
1.664 + * @param newText a string that replaces the current input text
1.665 + * @param status a UErrorCode
1.666 + * @stable ICU 2.0
1.667 + */
1.668 + void setText(const UnicodeString& newText,
1.669 + UErrorCode &status);
1.670 +
1.671 + /**
1.672 + * Set the input text over which this <code>Normalizer</code> will iterate.
1.673 + * The iteration position is set to the beginning.
1.674 + *
1.675 + * @param newText a CharacterIterator object that replaces the current input text
1.676 + * @param status a UErrorCode
1.677 + * @stable ICU 2.0
1.678 + */
1.679 + void setText(const CharacterIterator& newText,
1.680 + UErrorCode &status);
1.681 +
1.682 + /**
1.683 + * Set the input text over which this <code>Normalizer</code> will iterate.
1.684 + * The iteration position is set to the beginning.
1.685 + *
1.686 + * @param newText a string that replaces the current input text
1.687 + * @param length the length of the string, or -1 if NUL-terminated
1.688 + * @param status a UErrorCode
1.689 + * @stable ICU 2.0
1.690 + */
1.691 + void setText(const UChar* newText,
1.692 + int32_t length,
1.693 + UErrorCode &status);
1.694 + /**
1.695 + * Copies the input text into the UnicodeString argument.
1.696 + *
1.697 + * @param result Receives a copy of the text under iteration.
1.698 + * @stable ICU 2.0
1.699 + */
1.700 + void getText(UnicodeString& result);
1.701 +
1.702 + /**
1.703 + * ICU "poor man's RTTI", returns a UClassID for this class.
1.704 + * @return a UClassID for this class.
1.705 + * @stable ICU 2.2
1.706 + */
1.707 + static UClassID U_EXPORT2 getStaticClassID();
1.708 +
1.709 + /**
1.710 + * ICU "poor man's RTTI", returns a UClassID for the actual class.
1.711 + * @return a UClassID for the actual class.
1.712 + * @stable ICU 2.2
1.713 + */
1.714 + virtual UClassID getDynamicClassID() const;
1.715 +
1.716 +private:
1.717 + //-------------------------------------------------------------------------
1.718 + // Private functions
1.719 + //-------------------------------------------------------------------------
1.720 +
1.721 + Normalizer(); // default constructor not implemented
1.722 + Normalizer &operator=(const Normalizer &that); // assignment operator not implemented
1.723 +
1.724 + // Private utility methods for iteration
1.725 + // For documentation, see the source code
1.726 + UBool nextNormalize();
1.727 + UBool previousNormalize();
1.728 +
1.729 + void init(CharacterIterator *iter);
1.730 + void clearBuffer(void);
1.731 +
1.732 + //-------------------------------------------------------------------------
1.733 + // Private data
1.734 + //-------------------------------------------------------------------------
1.735 +
1.736 + UNormalizationMode fUMode;
1.737 + int32_t fOptions;
1.738 +
1.739 + // The input text and our position in it
1.740 + UCharIterator *text;
1.741 +
1.742 + // The normalization buffer is the result of normalization
1.743 + // of the source in [currentIndex..nextIndex[ .
1.744 + int32_t currentIndex, nextIndex;
1.745 +
1.746 + // A buffer for holding intermediate results
1.747 + UnicodeString buffer;
1.748 + int32_t bufferPos;
1.749 +
1.750 +};
1.751 +
1.752 +//-------------------------------------------------------------------------
1.753 +// Inline implementations
1.754 +//-------------------------------------------------------------------------
1.755 +
1.756 +inline UBool // Inequality operator, implemented as the logical negation of operator==.
1.757 +Normalizer::operator!= (const Normalizer& other) const
1.758 +{ return ! operator==(other); } // delegating keeps == and != from ever disagreeing
1.759 +
1.760 +inline UNormalizationCheckResult // Inline UnicodeString wrapper that forwards to the C function unorm_quickCheck().
1.761 +Normalizer::quickCheck(const UnicodeString& source,
1.762 + UNormalizationMode mode,
1.763 + UErrorCode &status) {
1.764 + if(U_FAILURE(status)) { // an error is already pending: source is not examined and status is left as is
1.765 + return UNORM_MAYBE; // returned without inspecting source
1.766 + }
1.767 +
1.768 + return unorm_quickCheck(source.getBuffer(), source.length(), // hand the string's buffer and length to the C API
1.769 + mode, &status); // the C API takes the error code by pointer
1.770 +}
1.771 +
1.772 +inline UNormalizationCheckResult // Inline UnicodeString wrapper that forwards to unorm_quickCheckWithOptions().
1.773 +Normalizer::quickCheck(const UnicodeString& source,
1.774 + UNormalizationMode mode, int32_t options,
1.775 + UErrorCode &status) {
1.776 + if(U_FAILURE(status)) { // an error is already pending: source is not examined and status is left as is
1.777 + return UNORM_MAYBE; // returned without inspecting source
1.778 + }
1.779 +
1.780 + return unorm_quickCheckWithOptions(source.getBuffer(), source.length(), // hand the string's buffer and length to the C API
1.781 + mode, options, &status); // options is passed through unchanged
1.782 +}
1.783 +
1.784 +inline UBool // Inline UnicodeString wrapper that forwards to the C function unorm_isNormalized().
1.785 +Normalizer::isNormalized(const UnicodeString& source,
1.786 + UNormalizationMode mode,
1.787 + UErrorCode &status) {
1.788 + if(U_FAILURE(status)) { // an error is already pending: source is not examined and status is left as is
1.789 + return FALSE; // returned without inspecting source
1.790 + }
1.791 +
1.792 + return unorm_isNormalized(source.getBuffer(), source.length(), // hand the string's buffer and length to the C API
1.793 + mode, &status); // the C API takes the error code by pointer
1.794 +}
1.795 +
1.796 +inline UBool // Inline UnicodeString wrapper that forwards to unorm_isNormalizedWithOptions().
1.797 +Normalizer::isNormalized(const UnicodeString& source,
1.798 + UNormalizationMode mode, int32_t options,
1.799 + UErrorCode &status) {
1.800 + if(U_FAILURE(status)) { // an error is already pending: source is not examined and status is left as is
1.801 + return FALSE; // returned without inspecting source
1.802 + }
1.803 +
1.804 + return unorm_isNormalizedWithOptions(source.getBuffer(), source.length(), // hand the string's buffer and length to the C API
1.805 + mode, options, &status); // options is passed through unchanged
1.806 +}
1.807 +
1.808 +inline int32_t // Inline UnicodeString wrapper that forwards to the C function unorm_compare().
1.809 +Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
1.810 + uint32_t options,
1.811 + UErrorCode &errorCode) {
1.812 + // no U_FAILURE() guard here: all argument checking is done in unorm_compare
1.813 + return unorm_compare(s1.getBuffer(), s1.length(), // first string: buffer and length
1.814 + s2.getBuffer(), s2.length(), // second string: buffer and length
1.815 + options, // comparison option bits, passed through unchanged
1.816 + &errorCode); // the C API takes the error code by pointer
1.817 +}
1.818 +
1.819 +U_NAMESPACE_END
1.820 +
1.821 +#endif /* #if !UCONFIG_NO_NORMALIZATION */
1.822 +
1.823 +#endif // NORMLZR_H