Symaptic: os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/normlzr.h@260cb5ec6c19 (annotated)

sl@0	1	/*
sl@0	2	********************************************************************
sl@0	3	* COPYRIGHT:
sl@0	4	* Copyright (c) 1996-2005, International Business Machines Corporation and
sl@0	5	* others. All Rights Reserved.
sl@0	6	********************************************************************
sl@0	7	*/
sl@0	8
sl@0	9	#ifndef NORMLZR_H
sl@0	10	#define NORMLZR_H
sl@0	11
sl@0	12	#include "unicode/utypes.h"
sl@0	13
sl@0	14	/**
sl@0	15	* \file
sl@0	16	* \brief C++ API: Unicode Normalization
sl@0	17	*/
sl@0	18
sl@0	19	#if !UCONFIG_NO_NORMALIZATION
sl@0	20
sl@0	21	#include "unicode/uobject.h"
sl@0	22	#include "unicode/unistr.h"
sl@0	23	#include "unicode/chariter.h"
sl@0	24	#include "unicode/unorm.h"
sl@0	25
sl@0	26
sl@0	27	struct UCharIterator;
sl@0	28	typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
sl@0	29
sl@0	30	U_NAMESPACE_BEGIN
sl@0	31	/**
sl@0	32	*
sl@0	33	* The Normalizer class consists of two parts:
sl@0	34	* - static functions that normalize strings or test if strings are normalized
sl@0	35	* - a Normalizer object is an iterator that takes any kind of text and
sl@0	36	* provides iteration over its normalized form
sl@0	37	*
sl@0	38	* The Normalizer class is not suitable for subclassing.
sl@0	39	*
sl@0	40	* The static functions are basically wrappers around the C implementation,
sl@0	41	* using UnicodeString instead of UChar*.
sl@0	42	* For basic information about normalization forms and details about the C API
sl@0	43	* please see the documentation in unorm.h.
sl@0	44	*
sl@0	45	* The iterator API with the Normalizer constructors and the non-static functions
sl@0	46	* uses a CharacterIterator as input. It is possible to pass a string which
sl@0	47	* is then internally wrapped in a CharacterIterator.
sl@0	48	* The input text is not normalized all at once, but incrementally where needed
sl@0	49	* (providing efficient random access).
sl@0	50	* This makes it possible to pass in a large text but spend only a small amount of time
sl@0	51	* normalizing a small part of that text.
sl@0	52	* However, if the entire text is normalized, then the iterator will be
sl@0	53	* slower than normalizing the entire text at once and iterating over the result.
sl@0	54	* A possible use of the Normalizer iterator is also to report an index into the
sl@0	55	* original text that is close to where the normalized characters come from.
sl@0	56	*
sl@0	57	* <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
sl@0	58	* The earlier implementation reported the getIndex() inconsistently,
sl@0	59	* and previous() could not be used after setIndex(), next(), first(), and current().
sl@0	60	*
sl@0	61	* Normalizer allows you to start normalizing from anywhere in the input text by
sl@0	62	* calling setIndexOnly(), first(), or last().
sl@0	63	* Without calling any of these, the iterator will start at the beginning of the text.
sl@0	64	*
sl@0	65	* At any time, next() returns the next normalized code point (UChar32),
sl@0	66	* with post-increment semantics (like CharacterIterator::next32PostInc()).
sl@0	67	* previous() returns the previous normalized code point (UChar32),
sl@0	68	* with pre-decrement semantics (like CharacterIterator::previous32()).
sl@0	69	*
sl@0	70	* current() returns the current code point
sl@0	71	* (respectively the one at the newly set index) without moving
sl@0	72	* the getIndex(). Note that if the text at the current position
sl@0	73	* needs to be normalized, then these functions will do that.
sl@0	74	* (This is why current() is not const.)
sl@0	75	* It is more efficient to call setIndexOnly() instead, which does not
sl@0	76	* normalize.
sl@0	77	*
sl@0	78	* getIndex() always refers to the position in the input text where the normalized
sl@0	79	* code points are returned from. It does not always change with each returned
sl@0	80	* code point.
sl@0	81	* The code point that is returned from any of the functions
sl@0	82	* corresponds to text at or after getIndex(), according to the
sl@0	83	* function's iteration semantics (post-increment or pre-decrement).
sl@0	84	*
sl@0	85	* next() returns a code point from at or after the getIndex()
sl@0	86	* from before the next() call. After the next() call, the getIndex()
sl@0	87	* might have moved to where the next code point will be returned from
sl@0	88	* (from a next() or current() call).
sl@0	89	* This is semantically equivalent to array access with array[index++]
sl@0	90	* (post-increment semantics).
sl@0	91	*
sl@0	92	* previous() returns a code point from at or after the getIndex()
sl@0	93	* from after the previous() call.
sl@0	94	* This is semantically equivalent to array access with array[--index]
sl@0	95	* (pre-decrement semantics).
sl@0	96	*
sl@0	97	* Internally, the Normalizer iterator normalizes a small piece of text
sl@0	98	* starting at the getIndex() and ending at a following "safe" index.
sl@0	99	* The normalized result is stored in an internal string buffer, and
sl@0	100	* the code points are iterated from there.
sl@0	101	* With multiple iteration calls, this is repeated until the next piece
sl@0	102	* of text needs to be normalized, and the getIndex() needs to be moved.
sl@0	103	*
sl@0	104	* The following "safe" index, the internal buffer, and the secondary
sl@0	105	* iteration index into that buffer are not exposed on the API.
sl@0	106	* This also means that it is currently not practical to return to
sl@0	107	* a particular, arbitrary position in the text because one would need to
sl@0	108	* know, and be able to set, in addition to the getIndex(), at least also the
sl@0	109	* current index into the internal buffer.
sl@0	110	* It is currently only possible to observe when getIndex() changes
sl@0	111	* (with careful consideration of the iteration semantics),
sl@0	112	* at which time the internal index will be 0.
sl@0	113	* For example, if getIndex() is different after next() than before it,
sl@0	114	* then the internal index is 0 and one can return to this getIndex()
sl@0	115	* later with setIndexOnly().
sl@0	116	*
sl@0	117	* @author Laura Werner, Mark Davis, Markus Scherer
sl@0	118	* @stable ICU 2.0
sl@0	119	*/
sl@0	120	class U_COMMON_API Normalizer : public UObject {
sl@0	121	public:
sl@0	122	/**
sl@0	123	* If DONE is returned from an iteration function that returns a code point,
sl@0	124	* then there are no more normalization results available.
sl@0	125	* @stable ICU 2.0
sl@0	126	*/
sl@0	127	enum {
sl@0	128	DONE=0xffff
sl@0	129	};
sl@0	130
sl@0	131	// Constructors
sl@0	132
sl@0	133	/**
sl@0	134	* Creates a new <code>Normalizer</code> object for iterating over the
sl@0	135	* normalized form of a given string.
sl@0	136	* <p>
sl@0	137	* @param str The string to be normalized. The normalization
sl@0	138	* will start at the beginning of the string.
sl@0	139	*
sl@0	140	* @param mode The normalization mode.
sl@0	141	* @stable ICU 2.0
sl@0	142	*/
sl@0	143	Normalizer(const UnicodeString& str, UNormalizationMode mode);
sl@0	144
sl@0	145	/**
sl@0	146	* Creates a new <code>Normalizer</code> object for iterating over the
sl@0	147	* normalized form of a given string.
sl@0	148	* <p>
sl@0	149	* @param str The string to be normalized. The normalization
sl@0	150	* will start at the beginning of the string.
sl@0	151	*
sl@0	152	* @param length Length of the string, or -1 if NUL-terminated.
sl@0	153	* @param mode The normalization mode.
sl@0	154	* @stable ICU 2.0
sl@0	155	*/
sl@0	156	Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);
sl@0	157
sl@0	158	/**
sl@0	159	* Creates a new <code>Normalizer</code> object for iterating over the
sl@0	160	* normalized form of the given text.
sl@0	161	* <p>
sl@0	162	* @param iter The input text to be normalized. The normalization
sl@0	163	* will start at the beginning of the string.
sl@0	164	*
sl@0	165	* @param mode The normalization mode.
sl@0	166	* @stable ICU 2.0
sl@0	167	*/
sl@0	168	Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
sl@0	169
sl@0	170	/**
sl@0	171	* Copy constructor.
sl@0	172	* @param copy The object to be copied.
sl@0	173	* @stable ICU 2.0
sl@0	174	*/
sl@0	175	Normalizer(const Normalizer& copy);
sl@0	176
sl@0	177	/**
sl@0	178	* Destructor
sl@0	179	* @stable ICU 2.0
sl@0	180	*/
sl@0	181	virtual ~Normalizer();
sl@0	182
sl@0	183
sl@0	184	//-------------------------------------------------------------------------
sl@0	185	// Static utility methods
sl@0	186	//-------------------------------------------------------------------------
sl@0	187
sl@0	188	/**
sl@0	189	* Normalizes a <code>UnicodeString</code> according to the specified normalization mode.
sl@0	190	* This is a wrapper for unorm_normalize(), using UnicodeString's.
sl@0	191	*
sl@0	192	* The <code>options</code> parameter specifies which optional
sl@0	193	* <code>Normalizer</code> features are to be enabled for this operation.
sl@0	194	*
sl@0	195	* @param source the input string to be normalized.
sl@0	196	* @param mode the normalization mode
sl@0	197	* @param options the optional features to be enabled (0 for no options)
sl@0	198	* @param result The normalized string (on output).
sl@0	199	* @param status The error code.
sl@0	200	* @stable ICU 2.0
sl@0	201	*/
sl@0	202	static void U_EXPORT2 normalize(const UnicodeString& source,
sl@0	203	UNormalizationMode mode, int32_t options,
sl@0	204	UnicodeString& result,
sl@0	205	UErrorCode &status);
sl@0	206
sl@0	207	/**
sl@0	208	* Compose a <code>UnicodeString</code>.
sl@0	209	* This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC.
sl@0	210	* This is a wrapper for unorm_normalize(), using UnicodeString's.
sl@0	211	*
sl@0	212	* The <code>options</code> parameter specifies which optional
sl@0	213	* <code>Normalizer</code> features are to be enabled for this operation.
sl@0	214	*
sl@0	215	* @param source the string to be composed.
sl@0	216	* @param compat Perform compatibility decomposition before composition.
sl@0	217	* If this argument is <code>FALSE</code>, only canonical
sl@0	218	* decomposition will be performed.
sl@0	219	* @param options the optional features to be enabled (0 for no options)
sl@0	220	* @param result The composed string (on output).
sl@0	221	* @param status The error code.
sl@0	222	* @stable ICU 2.0
sl@0	223	*/
sl@0	224	static void U_EXPORT2 compose(const UnicodeString& source,
sl@0	225	UBool compat, int32_t options,
sl@0	226	UnicodeString& result,
sl@0	227	UErrorCode &status);
sl@0	228
sl@0	229	/**
sl@0	230	* Static method to decompose a <code>UnicodeString</code>.
sl@0	231	* This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD.
sl@0	232	* This is a wrapper for unorm_normalize(), using UnicodeString's.
sl@0	233	*
sl@0	234	* The <code>options</code> parameter specifies which optional
sl@0	235	* <code>Normalizer</code> features are to be enabled for this operation.
sl@0	236	*
sl@0	237	* @param source the string to be decomposed.
sl@0	238	* @param compat Perform compatibility decomposition.
sl@0	239	* If this argument is <code>FALSE</code>, only canonical
sl@0	240	* decomposition will be performed.
sl@0	241	* @param options the optional features to be enabled (0 for no options)
sl@0	242	* @param result The decomposed string (on output).
sl@0	243	* @param status The error code.
sl@0	244	* @stable ICU 2.0
sl@0	245	*/
sl@0	246	static void U_EXPORT2 decompose(const UnicodeString& source,
sl@0	247	UBool compat, int32_t options,
sl@0	248	UnicodeString& result,
sl@0	249	UErrorCode &status);
sl@0	250
sl@0	251	/**
sl@0	252	* Performs a quick check on a string, to quickly determine if the string is
sl@0	253	* in a particular normalization format.
sl@0	254	* This is a wrapper for unorm_quickCheck(), using a UnicodeString.
sl@0	255	*
sl@0	256	* Three types of result can be returned: UNORM_YES, UNORM_NO or
sl@0	257	* UNORM_MAYBE. Result UNORM_YES indicates that the argument
sl@0	258	* string is in the desired normalized format; UNORM_NO indicates that the
sl@0	259	* argument string is not in the desired normalized format. A
sl@0	260	* UNORM_MAYBE result indicates that a more thorough check is required,
sl@0	261	* the user may have to put the string in its normalized form and compare the
sl@0	262	* results.
sl@0	263	* @param source string for determining if it is in a normalized format
sl@0	264	* @param mode normalization format
sl@0	265	* @param status A reference to a UErrorCode to receive any errors
sl@0	266	* @return UNORM_YES, UNORM_NO or UNORM_MAYBE
sl@0	267	*
sl@0	268	* @see isNormalized
sl@0	269	* @stable ICU 2.0
sl@0	270	*/
sl@0	271	static inline UNormalizationCheckResult
sl@0	272	quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
sl@0	273
sl@0	274	/**
sl@0	275	* Performs a quick check on a string; same as the other version of quickCheck
sl@0	276	* but takes an extra options parameter like most normalization functions.
sl@0	277	*
sl@0	278	* @param source string for determining if it is in a normalized format
sl@0	279	* @param mode normalization format
sl@0	280	* @param options the optional features to be enabled (0 for no options)
sl@0	281	* @param status A reference to a UErrorCode to receive any errors
sl@0	282	* @return UNORM_YES, UNORM_NO or UNORM_MAYBE
sl@0	283	*
sl@0	284	* @see isNormalized
sl@0	285	* @stable ICU 2.6
sl@0	286	*/
sl@0	287	static inline UNormalizationCheckResult
sl@0	288	quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
sl@0	289
sl@0	290	/**
sl@0	291	* Test if a string is in a given normalization form.
sl@0	292	* This is semantically equivalent to source.equals(normalize(source, mode)) .
sl@0	293	*
sl@0	294	* Unlike unorm_quickCheck(), this function returns a definitive result,
sl@0	295	* never a "maybe".
sl@0	296	* For NFD, NFKD, and FCD, both functions work exactly the same.
sl@0	297	* For NFC and NFKC where quickCheck may return "maybe", this function will
sl@0	298	* perform further tests to arrive at a TRUE/FALSE result.
sl@0	299	*
sl@0	300	* @param src String that is to be tested if it is in a normalization format.
sl@0	301	* @param mode Which normalization form to test for.
sl@0	302	* @param errorCode ICU error code in/out parameter.
sl@0	303	* Must fulfill U_SUCCESS before the function call.
sl@0	304	* @return Boolean value indicating whether the source string is in the
sl@0	305	* "mode" normalization form.
sl@0	306	*
sl@0	307	* @see quickCheck
sl@0	308	* @stable ICU 2.2
sl@0	309	*/
sl@0	310	static inline UBool
sl@0	311	isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
sl@0	312
sl@0	313	/**
sl@0	314	* Test if a string is in a given normalization form; same as the other version of isNormalized
sl@0	315	* but takes an extra options parameter like most normalization functions.
sl@0	316	*
sl@0	317	* @param src String that is to be tested if it is in a normalization format.
sl@0	318	* @param mode Which normalization form to test for.
sl@0	319	* @param options the optional features to be enabled (0 for no options)
sl@0	320	* @param errorCode ICU error code in/out parameter.
sl@0	321	* Must fulfill U_SUCCESS before the function call.
sl@0	322	* @return Boolean value indicating whether the source string is in the
sl@0	323	* "mode" normalization form.
sl@0	324	*
sl@0	325	* @see quickCheck
sl@0	326	* @stable ICU 2.6
sl@0	327	*/
sl@0	328	static inline UBool
sl@0	329	isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
sl@0	330
sl@0	331	/**
sl@0	332	* Concatenate normalized strings, making sure that the result is normalized as well.
sl@0	333	*
sl@0	334	* If both the left and the right strings are in
sl@0	335	* the normalization form according to "mode/options",
sl@0	336	* then the result will be
sl@0	337	*
sl@0	338	* \code
sl@0	339	* dest=normalize(left+right, mode, options)
sl@0	340	* \endcode
sl@0	341	*
sl@0	342	* For details see unorm_concatenate in unorm.h.
sl@0	343	*
sl@0	344	* @param left Left source string.
sl@0	345	* @param right Right source string.
sl@0	346	* @param result The output string.
sl@0	347	* @param mode The normalization mode.
sl@0	348	* @param options A bit set of normalization options.
sl@0	349	* @param errorCode ICU error code in/out parameter.
sl@0	350	* Must fulfill U_SUCCESS before the function call.
sl@0	351	* @return result
sl@0	352	*
sl@0	353	* @see unorm_concatenate
sl@0	354	* @see normalize
sl@0	355	* @see unorm_next
sl@0	356	* @see unorm_previous
sl@0	357	*
sl@0	358	* @stable ICU 2.1
sl@0	359	*/
sl@0	360	static UnicodeString &
sl@0	361	U_EXPORT2 concatenate(UnicodeString &left, UnicodeString &right,
sl@0	362	UnicodeString &result,
sl@0	363	UNormalizationMode mode, int32_t options,
sl@0	364	UErrorCode &errorCode);
sl@0	365
sl@0	366	/**
sl@0	367	* Compare two strings for canonical equivalence.
sl@0	368	* Further options include case-insensitive comparison and
sl@0	369	* code point order (as opposed to code unit order).
sl@0	370	*
sl@0	371	* Canonical equivalence between two strings is defined as their normalized
sl@0	372	* forms (NFD or NFC) being identical.
sl@0	373	* This function compares strings incrementally instead of normalizing
sl@0	374	* (and optionally case-folding) both strings entirely,
sl@0	375	* improving performance significantly.
sl@0	376	*
sl@0	377	* Bulk normalization is only necessary if the strings do not fulfill the FCD
sl@0	378	* conditions. Only in this case, and only if the strings are relatively long,
sl@0	379	* is memory allocated temporarily.
sl@0	380	* For FCD strings and short non-FCD strings there is no memory allocation.
sl@0	381	*
sl@0	382	* Semantically, this is equivalent to
sl@0	383	* strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
sl@0	384	* where code point order and foldCase are all optional.
sl@0	385	*
sl@0	386	* UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
sl@0	387	* the case folding must be performed first, then the normalization.
sl@0	388	*
sl@0	389	* @param s1 First source string.
sl@0	390	* @param s2 Second source string.
sl@0	391	*
sl@0	392	* @param options A bit set of options:
sl@0	393	* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
sl@0	394	* Case-sensitive comparison in code unit order, and the input strings
sl@0	395	* are quick-checked for FCD.
sl@0	396	*
sl@0	397	* - UNORM_INPUT_IS_FCD
sl@0	398	* Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
sl@0	399	* If not set, the function will quickCheck for FCD
sl@0	400	* and normalize if necessary.
sl@0	401	*
sl@0	402	* - U_COMPARE_CODE_POINT_ORDER
sl@0	403	* Set to choose code point order instead of code unit order
sl@0	404	* (see u_strCompare for details).
sl@0	405	*
sl@0	406	* - U_COMPARE_IGNORE_CASE
sl@0	407	* Set to compare strings case-insensitively using case folding,
sl@0	408	* instead of case-sensitively.
sl@0	409	* If set, then the following case folding options are used.
sl@0	410	*
sl@0	411	* - Options as used with case-insensitive comparisons, currently:
sl@0	412	*
sl@0	413	* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
sl@0	414	* (see u_strCaseCompare for details)
sl@0	415	*
sl@0	416	* - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
sl@0	417	*
sl@0	418	* @param errorCode ICU error code in/out parameter.
sl@0	419	* Must fulfill U_SUCCESS before the function call.
sl@0	420	* @return <0 or 0 or >0 as usual for string comparisons
sl@0	421	*
sl@0	422	* @see unorm_compare
sl@0	423	* @see normalize
sl@0	424	* @see UNORM_FCD
sl@0	425	* @see u_strCompare
sl@0	426	* @see u_strCaseCompare
sl@0	427	*
sl@0	428	* @stable ICU 2.2
sl@0	429	*/
sl@0	430	static inline int32_t
sl@0	431	compare(const UnicodeString &s1, const UnicodeString &s2,
sl@0	432	uint32_t options,
sl@0	433	UErrorCode &errorCode);
sl@0	434
sl@0	435	//-------------------------------------------------------------------------
sl@0	436	// Iteration API
sl@0	437	//-------------------------------------------------------------------------
sl@0	438
sl@0	439	/**
sl@0	440	* Return the current character in the normalized text.
sl@0	441	* current() may need to normalize some text at getIndex().
sl@0	442	* The getIndex() is not changed.
sl@0	443	*
sl@0	444	* @return the current normalized code point
sl@0	445	* @stable ICU 2.0
sl@0	446	*/
sl@0	447	UChar32 current(void);
sl@0	448
sl@0	449	/**
sl@0	450	* Return the first character in the normalized text.
sl@0	451	* This is equivalent to setIndexOnly(startIndex()) followed by next().
sl@0	452	* (Post-increment semantics.)
sl@0	453	*
sl@0	454	* @return the first normalized code point
sl@0	455	* @stable ICU 2.0
sl@0	456	*/
sl@0	457	UChar32 first(void);
sl@0	458
sl@0	459	/**
sl@0	460	* Return the last character in the normalized text.
sl@0	461	* This is equivalent to setIndexOnly(endIndex()) followed by previous().
sl@0	462	* (Pre-decrement semantics.)
sl@0	463	*
sl@0	464	* @return the last normalized code point
sl@0	465	* @stable ICU 2.0
sl@0	466	*/
sl@0	467	UChar32 last(void);
sl@0	468
sl@0	469	/**
sl@0	470	* Return the next character in the normalized text.
sl@0	471	* (Post-increment semantics.)
sl@0	472	* If the end of the text has already been reached, DONE is returned.
sl@0	473	* The DONE value could be confused with a U+FFFF non-character code point
sl@0	474	* in the text. If this is possible, you can test getIndex()<endIndex()
sl@0	475	* before calling next(), or (getIndex()<endIndex() || last()!=DONE)
sl@0	476	* after calling next(). (Calling last() will change the iterator state!)
sl@0	477	*
sl@0	478	* The C API unorm_next() is more efficient and does not have this ambiguity.
sl@0	479	*
sl@0	480	* @return the next normalized code point
sl@0	481	* @stable ICU 2.0
sl@0	482	*/
sl@0	483	UChar32 next(void);
sl@0	484
sl@0	485	/**
sl@0	486	* Return the previous character in the normalized text and decrement.
sl@0	487	* (Pre-decrement semantics.)
sl@0	488	* If the beginning of the text has already been reached, DONE is returned.
sl@0	489	* The DONE value could be confused with a U+FFFF non-character code point
sl@0	490	* in the text. If this is possible, you can test
sl@0	491	* (getIndex()>startIndex() || first()!=DONE). (Calling first() will change
sl@0	492	* the iterator state!)
sl@0	493	*
sl@0	494	* The C API unorm_previous() is more efficient and does not have this ambiguity.
sl@0	495	*
sl@0	496	* @return the previous normalized code point
sl@0	497	* @stable ICU 2.0
sl@0	498	*/
sl@0	499	UChar32 previous(void);
sl@0	500
sl@0	501	/**
sl@0	502	* Set the iteration position in the input text that is being normalized,
sl@0	503	* without any immediate normalization.
sl@0	504	* After setIndexOnly(), getIndex() will return the same index that is
sl@0	505	* specified here.
sl@0	506	*
sl@0	507	* @param index the desired index in the input text.
sl@0	508	* @stable ICU 2.0
sl@0	509	*/
sl@0	510	void setIndexOnly(int32_t index);
sl@0	511
sl@0	512	/**
sl@0	513	* Reset the index to the beginning of the text.
sl@0	514	* This is equivalent to setIndexOnly(startIndex()).
sl@0	515	* @stable ICU 2.0
sl@0	516	*/
sl@0	517	void reset(void);
sl@0	518
sl@0	519	/**
sl@0	520	* Retrieve the current iteration position in the input text that is
sl@0	521	* being normalized.
sl@0	522	*
sl@0	523	* A following call to next() will return a normalized code point from
sl@0	524	* the input text at or after this index.
sl@0	525	*
sl@0	526	* After a call to previous(), getIndex() will point at or before the
sl@0	527	* position in the input text where the normalized code point
sl@0	528	* was returned from with previous().
sl@0	529	*
sl@0	530	* @return the current index in the input text
sl@0	531	* @stable ICU 2.0
sl@0	532	*/
sl@0	533	int32_t getIndex(void) const;
sl@0	534
sl@0	535	/**
sl@0	536	* Retrieve the index of the start of the input text. This is the begin index
sl@0	537	* of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string
sl@0	538	* over which this <code>Normalizer</code> is iterating.
sl@0	539	*
sl@0	540	* @return the smallest index in the input text where the Normalizer operates
sl@0	541	* @stable ICU 2.0
sl@0	542	*/
sl@0	543	int32_t startIndex(void) const;
sl@0	544
sl@0	545	/**
sl@0	546	* Retrieve the index of the end of the input text. This is the end index
sl@0	547	* of the <code>CharacterIterator</code> or the length of the string
sl@0	548	* over which this <code>Normalizer</code> is iterating.
sl@0	549	* This end index is exclusive, i.e., the Normalizer operates only on characters
sl@0	550	* before this index.
sl@0	551	*
sl@0	552	* @return the first index in the input text where the Normalizer does not operate
sl@0	553	* @stable ICU 2.0
sl@0	554	*/
sl@0	555	int32_t endIndex(void) const;
sl@0	556
sl@0	557	/**
sl@0	558	* Returns TRUE when both iterators refer to the same character in the same
sl@0	559	* input text.
sl@0	560	*
sl@0	561	* @param that a Normalizer object to compare this one to
sl@0	562	* @return comparison result
sl@0	563	* @stable ICU 2.0
sl@0	564	*/
sl@0	565	UBool operator==(const Normalizer& that) const;
sl@0	566
sl@0	567	/**
sl@0	568	* Returns FALSE when both iterators refer to the same character in the same
sl@0	569	* input text.
sl@0	570	*
sl@0	571	* @param that a Normalizer object to compare this one to
sl@0	572	* @return comparison result
sl@0	573	* @stable ICU 2.0
sl@0	574	*/
sl@0	575	inline UBool operator!=(const Normalizer& that) const;
sl@0	576
sl@0	577	/**
sl@0	578	* Returns a pointer to a new Normalizer that is a clone of this one.
sl@0	579	* The caller is responsible for deleting the new clone.
sl@0	580	* @return a pointer to a new Normalizer
sl@0	581	* @stable ICU 2.0
sl@0	582	*/
sl@0	583	Normalizer* clone(void) const;
sl@0	584
sl@0	585	/**
sl@0	586	* Generates a hash code for this iterator.
sl@0	587	*
sl@0	588	* @return the hash code
sl@0	589	* @stable ICU 2.0
sl@0	590	*/
sl@0	591	int32_t hashCode(void) const;
sl@0	592
sl@0	593	//-------------------------------------------------------------------------
sl@0	594	// Property access methods
sl@0	595	//-------------------------------------------------------------------------
sl@0	596
sl@0	597	/**
sl@0	598	* Set the normalization mode for this object.
sl@0	599	* <p>
sl@0	600	* <b>Note:</b>If the normalization mode is changed while iterating
sl@0	601	* over a string, calls to {@link #next() } and {@link #previous() } may
sl@0	602	* return previously buffered characters in the old normalization mode
sl@0	603	* until the iteration is able to re-sync at the next base character.
sl@0	604	* It is safest to call {@link #setIndexOnly }, {@link #reset() },
sl@0	605	* {@link #setText }, {@link #first() },
sl@0	606	* {@link #last() }, etc. after calling <code>setMode</code>.
sl@0	607	* <p>
sl@0	608	* @param newMode the new mode for this <code>Normalizer</code>.
sl@0	609	* @see #getUMode
sl@0	610	* @stable ICU 2.0
sl@0	611	*/
sl@0	612	void setMode(UNormalizationMode newMode);
sl@0	613
sl@0	614	/**
sl@0	615	* Return the normalization mode for this object.
sl@0	616	*
sl@0	617	* This is an unusual name because there used to be a getMode() that
sl@0	618	* returned a different type.
sl@0	619	*
sl@0	620	* @return the mode for this <code>Normalizer</code>
sl@0	621	* @see #setMode
sl@0	622	* @stable ICU 2.0
sl@0	623	*/
sl@0	624	UNormalizationMode getUMode(void) const;
sl@0	625
sl@0	626	/**
sl@0	627	* Set options that affect this <code>Normalizer</code>'s operation.
sl@0	628	* Options do not change the basic composition or decomposition operation
sl@0	629	* that is being performed, but they control whether
sl@0	630	* certain optional portions of the operation are done.
sl@0	631	* Currently the only available option is obsolete.
sl@0	632	*
sl@0	633	* It is possible to specify multiple options that are all turned on or off.
sl@0	634	*
sl@0	635	* @param option the option(s) whose value is/are to be set.
sl@0	636	* @param value the new setting for the option. Use <code>TRUE</code> to
sl@0	637	* turn the option(s) on and <code>FALSE</code> to turn it/them off.
sl@0	638	*
sl@0	639	* @see #getOption
sl@0	640	* @stable ICU 2.0
sl@0	641	*/
sl@0	642	void setOption(int32_t option,
sl@0	643	UBool value);
sl@0	644
sl@0	645	/**
sl@0	646	* Determine whether an option is turned on or off.
sl@0	647	* If multiple options are specified, then the result is TRUE if any
sl@0	648	* of them are set.
sl@0	649	* <p>
sl@0	650	* @param option the option(s) that are to be checked
sl@0	651	* @return TRUE if any of the option(s) are set
sl@0	652	* @see #setOption
sl@0	653	* @stable ICU 2.0
sl@0	654	*/
sl@0	655	UBool getOption(int32_t option) const;
sl@0	656
sl@0	657	/**
sl@0	658	* Set the input text over which this <code>Normalizer</code> will iterate.
sl@0	659	* The iteration position is set to the beginning.
sl@0	660	*
sl@0	661	* @param newText a string that replaces the current input text
sl@0	662	* @param status a UErrorCode
sl@0	663	* @stable ICU 2.0
sl@0	664	*/
sl@0	665	void setText(const UnicodeString& newText,
sl@0	666	UErrorCode &status);
sl@0	667
sl@0	668	/**
sl@0	669	* Set the input text over which this <code>Normalizer</code> will iterate.
sl@0	670	* The iteration position is set to the beginning.
sl@0	671	*
sl@0	672	* @param newText a CharacterIterator object that replaces the current input text
sl@0	673	* @param status a UErrorCode
sl@0	674	* @stable ICU 2.0
sl@0	675	*/
sl@0	676	void setText(const CharacterIterator& newText,
sl@0	677	UErrorCode &status);
sl@0	678
sl@0	679	/**
sl@0	680	* Set the input text over which this <code>Normalizer</code> will iterate.
sl@0	681	* The iteration position is set to the beginning.
sl@0	682	*
sl@0	683	* @param newText a string that replaces the current input text
sl@0	684	* @param length the length of the string, or -1 if NUL-terminated
sl@0	685	* @param status a UErrorCode
sl@0	686	* @stable ICU 2.0
sl@0	687	*/
sl@0	688	void setText(const UChar* newText,
sl@0	689	int32_t length,
sl@0	690	UErrorCode &status);
sl@0	691	/**
sl@0	692	* Copies the input text into the UnicodeString argument.
sl@0	693	*
sl@0	694	* @param result Receives a copy of the text under iteration.
sl@0	695	* @stable ICU 2.0
sl@0	696	*/
sl@0	697	void getText(UnicodeString& result);
sl@0	698
sl@0	699	/**
sl@0	700	* ICU "poor man's RTTI", returns a UClassID for this class.
sl@0	701	* @return a UClassID for this class.
sl@0	702	* @stable ICU 2.2
sl@0	703	*/
sl@0	704	static UClassID U_EXPORT2 getStaticClassID();
sl@0	705
sl@0	706	/**
sl@0	707	* ICU "poor man's RTTI", returns a UClassID for the actual class.
sl@0	708	* @return a UClassID for the actual class.
sl@0	709	* @stable ICU 2.2
sl@0	710	*/
sl@0	711	virtual UClassID getDynamicClassID() const;
sl@0	712
sl@0	713	private:
sl@0	714	//-------------------------------------------------------------------------
sl@0	715	// Private functions
sl@0	716	//-------------------------------------------------------------------------
sl@0	717
sl@0	718	Normalizer(); // default constructor: declared private, not implemented
sl@0	719	Normalizer &operator=(const Normalizer &that); // assignment operator: declared private, not implemented
sl@0	720
sl@0	721	// Private utility methods for iteration
sl@0	722	// For documentation, see the source code
sl@0	723	UBool nextNormalize();
sl@0	724	UBool previousNormalize();
sl@0	725
sl@0	726	void init(CharacterIterator *iter);
sl@0	727	void clearBuffer(void);
sl@0	728
sl@0	729	//-------------------------------------------------------------------------
sl@0	730	// Private data
sl@0	731	//-------------------------------------------------------------------------
sl@0	732
sl@0	733	UNormalizationMode fUMode;
sl@0	734	int32_t fOptions;
sl@0	735
sl@0	736	// The input text and our position in it
sl@0	737	UCharIterator *text;
sl@0	738
sl@0	739	// The normalization buffer is the result of normalization
sl@0	740	// of the source in the half-open range [currentIndex..nextIndex[ .
sl@0	741	int32_t currentIndex, nextIndex;
sl@0	742
sl@0	743	// A buffer for holding intermediate results
sl@0	744	UnicodeString buffer;
sl@0	745	int32_t bufferPos;
sl@0	746
sl@0	747	};
sl@0	748
sl@0	749	//-------------------------------------------------------------------------
sl@0	750	// Inline implementations
sl@0	751	//-------------------------------------------------------------------------
sl@0	752
sl@0	753	inline UBool
sl@0	754	Normalizer::operator!=(const Normalizer& other) const
sl@0	755	{ return !(*this == other); } // inequality is the negation of operator==
sl@0	756
sl@0	757	inline UNormalizationCheckResult
sl@0	758	Normalizer::quickCheck(const UnicodeString& source,
sl@0	759	UNormalizationMode mode,
sl@0	760	UErrorCode &status) {
sl@0	761	// Forward to the C API only while no earlier failure is recorded in status.
sl@0	762	if(!U_FAILURE(status)) {
sl@0	763	return unorm_quickCheck(source.getBuffer(), source.length(),
sl@0	764	mode, &status);
sl@0	765	}
sl@0	766	return UNORM_MAYBE; // status already failed: the string is not examined
sl@0	767	}
sl@0	768
sl@0	769	inline UNormalizationCheckResult
sl@0	770	Normalizer::quickCheck(const UnicodeString& source,
sl@0	771	UNormalizationMode mode, int32_t options,
sl@0	772	UErrorCode &status) {
sl@0	773	// Forward to the C API only while no earlier failure is recorded in status.
sl@0	774	if(!U_FAILURE(status)) {
sl@0	775	return unorm_quickCheckWithOptions(source.getBuffer(), source.length(),
sl@0	776	mode, options, &status);
sl@0	777	}
sl@0	778	return UNORM_MAYBE; // status already failed: the string is not examined
sl@0	779	}
sl@0	780
sl@0	781	inline UBool
sl@0	782	Normalizer::isNormalized(const UnicodeString& source,
sl@0	783	UNormalizationMode mode,
sl@0	784	UErrorCode &status) {
sl@0	785	// Forward to the C API only while no earlier failure is recorded in status.
sl@0	786	if(!U_FAILURE(status)) {
sl@0	787	return unorm_isNormalized(source.getBuffer(), source.length(),
sl@0	788	mode, &status);
sl@0	789	}
sl@0	790	return FALSE; // status already failed: the string is not examined
sl@0	791	}
sl@0	792
sl@0	793	inline UBool
sl@0	794	Normalizer::isNormalized(const UnicodeString& source,
sl@0	795	UNormalizationMode mode, int32_t options,
sl@0	796	UErrorCode &status) {
sl@0	797	// Forward to the C API only while no earlier failure is recorded in status.
sl@0	798	if(!U_FAILURE(status)) {
sl@0	799	return unorm_isNormalizedWithOptions(source.getBuffer(), source.length(),
sl@0	800	mode, options, &status);
sl@0	801	}
sl@0	802	return FALSE; // status already failed: the string is not examined
sl@0	803	}
sl@0	804
sl@0	805	inline int32_t
sl@0	806	Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
sl@0	807	uint32_t options,
sl@0	808	UErrorCode &errorCode) {
sl@0	809	// No checks here: all argument checking is done in unorm_compare.
sl@0	810	int32_t order = unorm_compare(s1.getBuffer(), s1.length(),
sl@0	811	s2.getBuffer(), s2.length(),
sl@0	812	options, &errorCode);
sl@0	813	return order;
sl@0	814	}
sl@0	815
sl@0	816	U_NAMESPACE_END
sl@0	817
sl@0	818	#endif /* #if !UCONFIG_NO_NORMALIZATION */
sl@0	819
sl@0	820	#endif // NORMLZR_H

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--