sl@0
|
1 |
/*
|
sl@0
|
2 |
*******************************************************************************
|
sl@0
|
3 |
* Copyright (c) 1996-2005, International Business Machines Corporation
|
sl@0
|
4 |
* and others. All Rights Reserved.
|
sl@0
|
5 |
*******************************************************************************
|
sl@0
|
6 |
* File unorm.h
|
sl@0
|
7 |
*
|
sl@0
|
8 |
* Created by: Vladimir Weinstein 12052000
|
sl@0
|
9 |
*
|
sl@0
|
10 |
* Modification history :
|
sl@0
|
11 |
*
|
sl@0
|
12 |
* Date Name Description
|
sl@0
|
13 |
* 02/01/01 synwee Added normalization quickcheck enum and method.
|
sl@0
|
14 |
*/
|
sl@0
|
15 |
#ifndef UNORM_H
|
sl@0
|
16 |
#define UNORM_H
|
sl@0
|
17 |
|
sl@0
|
18 |
#include "unicode/utypes.h"
|
sl@0
|
19 |
|
sl@0
|
20 |
#if !UCONFIG_NO_NORMALIZATION
|
sl@0
|
21 |
|
sl@0
|
22 |
#include "unicode/uiter.h"
|
sl@0
|
23 |
|
sl@0
|
24 |
/**
|
sl@0
|
25 |
* \file
|
sl@0
|
26 |
* \brief C API: Unicode Normalization
|
sl@0
|
27 |
*
|
sl@0
|
28 |
* <h2>Unicode normalization API</h2>
|
sl@0
|
29 |
*
|
sl@0
|
30 |
* <code>unorm_normalize</code> transforms Unicode text into an equivalent composed or
|
sl@0
|
31 |
* decomposed form, allowing for easier sorting and searching of text.
|
sl@0
|
32 |
* <code>unorm_normalize</code> supports the standard normalization forms described in
|
sl@0
|
33 |
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
|
sl@0
|
34 |
* Unicode Standard Annex #15: Unicode Normalization Forms</a>.
|
sl@0
|
35 |
*
|
sl@0
|
36 |
* Characters with accents or other adornments can be encoded in
|
sl@0
|
37 |
* several different ways in Unicode. For example, take the character A-acute.
|
sl@0
|
38 |
* In Unicode, this can be encoded as a single character (the
|
sl@0
|
39 |
* "composed" form):
|
sl@0
|
40 |
*
|
sl@0
|
41 |
* \code
|
sl@0
|
42 |
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
|
sl@0
|
43 |
* \endcode
|
sl@0
|
44 |
*
|
sl@0
|
45 |
* or as two separate characters (the "decomposed" form):
|
sl@0
|
46 |
*
|
sl@0
|
47 |
* \code
|
sl@0
|
48 |
* 0041 LATIN CAPITAL LETTER A
|
sl@0
|
49 |
* 0301 COMBINING ACUTE ACCENT
|
sl@0
|
50 |
* \endcode
|
sl@0
|
51 |
*
|
sl@0
|
52 |
* To a user of your program, however, both of these sequences should be
|
sl@0
|
53 |
* treated as the same "user-level" character "A with acute accent". When you are searching or
|
sl@0
|
54 |
* comparing text, you must ensure that these two sequences are treated
|
sl@0
|
55 |
* equivalently. In addition, you must handle characters with more than one
|
sl@0
|
56 |
* accent. Sometimes the order of a character's combining accents is
|
sl@0
|
57 |
* significant, while in other cases accent sequences in different orders are
|
sl@0
|
58 |
* really equivalent.
|
sl@0
|
59 |
*
|
sl@0
|
60 |
* Similarly, the string "ffi" can be encoded as three separate letters:
|
sl@0
|
61 |
*
|
sl@0
|
62 |
* \code
|
sl@0
|
63 |
* 0066 LATIN SMALL LETTER F
|
sl@0
|
64 |
* 0066 LATIN SMALL LETTER F
|
sl@0
|
65 |
* 0069 LATIN SMALL LETTER I
|
sl@0
|
66 |
* \endcode
|
sl@0
|
67 |
*
|
sl@0
|
68 |
* or as the single character
|
sl@0
|
69 |
*
|
sl@0
|
70 |
* \code
|
sl@0
|
71 |
* FB03 LATIN SMALL LIGATURE FFI
|
sl@0
|
72 |
* \endcode
|
sl@0
|
73 |
*
|
sl@0
|
74 |
* The ffi ligature is not a distinct semantic character, and strictly speaking
|
sl@0
|
75 |
* it shouldn't be in Unicode at all, but it was included for compatibility
|
sl@0
|
76 |
* with existing character sets that already provided it. The Unicode standard
|
sl@0
|
77 |
* identifies such characters by giving them "compatibility" decompositions
|
sl@0
|
78 |
* into the corresponding semantic characters. When sorting and searching, you
|
sl@0
|
79 |
* will often want to use these mappings.
|
sl@0
|
80 |
*
|
sl@0
|
81 |
* <code>unorm_normalize</code> helps solve these problems by transforming text into the
|
sl@0
|
82 |
* canonical composed and decomposed forms as shown in the first example above.
|
sl@0
|
83 |
* In addition, you can have it perform compatibility decompositions so that
|
sl@0
|
84 |
* you can treat compatibility characters the same as their equivalents.
|
sl@0
|
85 |
* Finally, <code>unorm_normalize</code> rearranges accents into the proper canonical
|
sl@0
|
86 |
* order, so that you do not have to worry about accent rearrangement on your
|
sl@0
|
87 |
* own.
|
sl@0
|
88 |
*
|
sl@0
|
89 |
* Form FCD, "Fast C or D", is also designed for collation.
|
sl@0
|
90 |
* It allows working on strings that are not necessarily normalized
|
sl@0
|
91 |
* with an algorithm (like in collation) that works under "canonical closure", i.e., it treats precomposed
|
sl@0
|
92 |
* characters and their decomposed equivalents the same.
|
sl@0
|
93 |
*
|
sl@0
|
94 |
* It is not a normalization form because it does not provide for uniqueness of representation. Multiple strings
|
sl@0
|
95 |
* may be canonically equivalent (their NFDs are identical) and may all conform to FCD without being identical
|
sl@0
|
96 |
* themselves.
|
sl@0
|
97 |
*
|
sl@0
|
98 |
* The form is defined such that the "raw decomposition", the recursive canonical decomposition of each character,
|
sl@0
|
99 |
* results in a string that is canonically ordered. This means that precomposed characters are allowed for as long
|
sl@0
|
100 |
* as their decompositions do not need canonical reordering.
|
sl@0
|
101 |
*
|
sl@0
|
102 |
* Its advantage for a process like collation is that all NFD and most NFC texts - and many unnormalized texts -
|
sl@0
|
103 |
* already conform to FCD and do not need to be normalized (NFD) for such a process. The FCD quick check will
|
sl@0
|
104 |
* return UNORM_YES for most strings in practice.
|
sl@0
|
105 |
*
|
sl@0
|
106 |
* unorm_normalize(UNORM_FCD) may be implemented with UNORM_NFD.
|
sl@0
|
107 |
*
|
sl@0
|
108 |
* For more details on FCD see the collation design document:
|
sl@0
|
109 |
* http://dev.icu-project.org/cgi-bin/viewcvs.cgi/~checkout~/icuhtml/design/collation/ICU_collation_design.htm
|
sl@0
|
110 |
*
|
sl@0
|
111 |
* ICU collation performs either NFD or FCD normalization automatically if normalization
|
sl@0
|
112 |
* is turned on for the collator object.
|
sl@0
|
113 |
* Beyond collation and string search, normalized strings may be useful for string equivalence comparisons,
|
sl@0
|
114 |
* transliteration/transcription, unique representations, etc.
|
sl@0
|
115 |
*
|
sl@0
|
116 |
* The W3C generally recommends exchanging texts in NFC.
|
sl@0
|
117 |
* Note also that most legacy character encodings use only precomposed forms and often do not
|
sl@0
|
118 |
* encode any combining marks by themselves. For conversion to such character encodings the
|
sl@0
|
119 |
* Unicode text needs to be normalized to NFC.
|
sl@0
|
120 |
* For more usage examples, see the Unicode Standard Annex.
|
sl@0
|
121 |
*/
|
sl@0
|
122 |
|
sl@0
|
123 |
/**
 * Constants for normalization modes.
 * The numeric values are part of the stable API; UNORM_MODE_COUNT is not a
 * mode itself but one more than the highest mode value.
 * @stable ICU 2.0
 */
typedef enum {
  /** No decomposition/composition. @stable ICU 2.0 */
  UNORM_NONE = 1,
  /** Canonical decomposition. @stable ICU 2.0 */
  UNORM_NFD = 2,
  /** Compatibility decomposition. @stable ICU 2.0 */
  UNORM_NFKD = 3,
  /** Canonical decomposition followed by canonical composition. @stable ICU 2.0 */
  UNORM_NFC = 4,
  /** Default normalization (same value as UNORM_NFC). @stable ICU 2.0 */
  UNORM_DEFAULT = UNORM_NFC,
  /** Compatibility decomposition followed by canonical composition. @stable ICU 2.0 */
  UNORM_NFKC = 5,
  /** "Fast C or D" form. @stable ICU 2.0 */
  UNORM_FCD = 6,

  /** One more than the highest normalization mode constant. @stable ICU 2.0 */
  UNORM_MODE_COUNT
} UNormalizationMode;
|
sl@0
|
146 |
|
sl@0
|
147 |
/**
 * Constants for options flags for normalization.
 * Use 0 for default options,
 * including normalization according to the Unicode version
 * that is currently supported by ICU (see u_getUnicodeVersion).
 * @stable ICU 2.6
 */
enum {
    /**
     * Options bit set value to select Unicode 3.2 normalization
     * (except NormalizationCorrections).
     * At most one Unicode version can be selected at a time.
     * @stable ICU 2.6
     */
    UNORM_UNICODE_3_2 = 0x20
};
|
sl@0
|
163 |
|
sl@0
|
164 |
/**
 * Lowest-order bit number of unorm_compare() options bits corresponding to
 * normalization options bits.
 *
 * The options parameter for unorm_compare() uses most bits for
 * itself and for various comparison and folding flags.
 * The most significant bits, however, are shifted down and passed on
 * to the normalization implementation.
 * (That is, from unorm_compare(..., options, ...),
 * options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
 * internal normalization functions.)
 *
 * @see unorm_compare
 * @stable ICU 2.6
 */
#define UNORM_COMPARE_NORM_OPTIONS_SHIFT 20
|
sl@0
|
180 |
|
sl@0
|
181 |
/**
|
sl@0
|
182 |
* Normalize a string.
|
sl@0
|
183 |
* The string will be normalized according the specified normalization mode
|
sl@0
|
184 |
* and options.
|
sl@0
|
185 |
*
|
sl@0
|
186 |
* @param source The string to normalize.
|
sl@0
|
187 |
* @param sourceLength The length of source, or -1 if NUL-terminated.
|
sl@0
|
188 |
* @param mode The normalization mode; one of UNORM_NONE,
|
sl@0
|
189 |
* UNORM_NFD, UNORM_NFC, UNORM_NFKC, UNORM_NFKD, UNORM_DEFAULT.
|
sl@0
|
190 |
* @param options The normalization options, ORed together (0 for no options).
|
sl@0
|
191 |
* @param result A pointer to a buffer to receive the result string.
|
sl@0
|
192 |
* The result string is NUL-terminated if possible.
|
sl@0
|
193 |
* @param resultLength The maximum size of result.
|
sl@0
|
194 |
* @param status A pointer to a UErrorCode to receive any errors.
|
sl@0
|
195 |
* @return The total buffer size needed; if greater than resultLength,
|
sl@0
|
196 |
* the output was truncated, and the error code is set to U_BUFFER_OVERFLOW_ERROR.
|
sl@0
|
197 |
* @stable ICU 2.0
|
sl@0
|
198 |
*/
|
sl@0
|
199 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
200 |
unorm_normalize(const UChar *source, int32_t sourceLength,
|
sl@0
|
201 |
UNormalizationMode mode, int32_t options,
|
sl@0
|
202 |
UChar *result, int32_t resultLength,
|
sl@0
|
203 |
UErrorCode *status);
|
sl@0
|
204 |
#endif
|
sl@0
|
205 |
/**
 * Result values for unorm_quickCheck().
 * For details see Unicode Technical Report 15.
 * @stable ICU 2.0
 */
typedef enum UNormalizationCheckResult {
  /**
   * Indicates that string is not in the normalized format.
   * @stable ICU 2.0
   */
  UNORM_NO,
  /**
   * Indicates that string is in the normalized format.
   * @stable ICU 2.0
   */
  UNORM_YES,
  /**
   * Indicates that it cannot be determined whether the string is in the
   * normalized format without further thorough checks.
   * @stable ICU 2.0
   */
  UNORM_MAYBE
} UNormalizationCheckResult;
|
sl@0
|
225 |
#if !UCONFIG_NO_NORMALIZATION
|
sl@0
|
226 |
/**
|
sl@0
|
227 |
* Performing quick check on a string, to quickly determine if the string is
|
sl@0
|
228 |
* in a particular normalization format.
|
sl@0
|
229 |
* Three types of result can be returned UNORM_YES, UNORM_NO or
|
sl@0
|
230 |
* UNORM_MAYBE. Result UNORM_YES indicates that the argument
|
sl@0
|
231 |
* string is in the desired normalized format, UNORM_NO determines that
|
sl@0
|
232 |
* argument string is not in the desired normalized format. A
|
sl@0
|
233 |
* UNORM_MAYBE result indicates that a more thorough check is required,
|
sl@0
|
234 |
* the user may have to put the string in its normalized form and compare the
|
sl@0
|
235 |
* results.
|
sl@0
|
236 |
*
|
sl@0
|
237 |
* @param source string for determining if it is in a normalized format
|
sl@0
|
238 |
* @param sourcelength length of source to test, or -1 if NUL-terminated
|
sl@0
|
239 |
* @param mode which normalization form to test for
|
sl@0
|
240 |
* @param status a pointer to a UErrorCode to receive any errors
|
sl@0
|
241 |
* @return UNORM_YES, UNORM_NO or UNORM_MAYBE
|
sl@0
|
242 |
*
|
sl@0
|
243 |
* @see unorm_isNormalized
|
sl@0
|
244 |
* @stable ICU 2.0
|
sl@0
|
245 |
*/
|
sl@0
|
246 |
U_STABLE UNormalizationCheckResult U_EXPORT2
|
sl@0
|
247 |
unorm_quickCheck(const UChar *source, int32_t sourcelength,
|
sl@0
|
248 |
UNormalizationMode mode,
|
sl@0
|
249 |
UErrorCode *status);
|
sl@0
|
250 |
|
sl@0
|
251 |
/**
|
sl@0
|
252 |
* Performing quick check on a string; same as unorm_quickCheck but
|
sl@0
|
253 |
* takes an extra options parameter like most normalization functions.
|
sl@0
|
254 |
*
|
sl@0
|
255 |
* @param src String that is to be tested if it is in a normalization format.
|
sl@0
|
256 |
* @param srcLength Length of source to test, or -1 if NUL-terminated.
|
sl@0
|
257 |
* @param mode Which normalization form to test for.
|
sl@0
|
258 |
* @param options The normalization options, ORed together (0 for no options).
|
sl@0
|
259 |
* @param pErrorCode ICU error code in/out parameter.
|
sl@0
|
260 |
* Must fulfill U_SUCCESS before the function call.
|
sl@0
|
261 |
* @return UNORM_YES, UNORM_NO or UNORM_MAYBE
|
sl@0
|
262 |
*
|
sl@0
|
263 |
* @see unorm_quickCheck
|
sl@0
|
264 |
* @see unorm_isNormalized
|
sl@0
|
265 |
* @stable ICU 2.6
|
sl@0
|
266 |
*/
|
sl@0
|
267 |
U_STABLE UNormalizationCheckResult U_EXPORT2
|
sl@0
|
268 |
unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
|
sl@0
|
269 |
UNormalizationMode mode, int32_t options,
|
sl@0
|
270 |
UErrorCode *pErrorCode);
|
sl@0
|
271 |
|
sl@0
|
272 |
/**
|
sl@0
|
273 |
* Test if a string is in a given normalization form.
|
sl@0
|
274 |
* This is semantically equivalent to source.equals(normalize(source, mode)) .
|
sl@0
|
275 |
*
|
sl@0
|
276 |
* Unlike unorm_quickCheck(), this function returns a definitive result,
|
sl@0
|
277 |
* never a "maybe".
|
sl@0
|
278 |
* For NFD, NFKD, and FCD, both functions work exactly the same.
|
sl@0
|
279 |
* For NFC and NFKC where quickCheck may return "maybe", this function will
|
sl@0
|
280 |
* perform further tests to arrive at a TRUE/FALSE result.
|
sl@0
|
281 |
*
|
sl@0
|
282 |
* @param src String that is to be tested if it is in a normalization format.
|
sl@0
|
283 |
* @param srcLength Length of source to test, or -1 if NUL-terminated.
|
sl@0
|
284 |
* @param mode Which normalization form to test for.
|
sl@0
|
285 |
* @param pErrorCode ICU error code in/out parameter.
|
sl@0
|
286 |
* Must fulfill U_SUCCESS before the function call.
|
sl@0
|
287 |
* @return Boolean value indicating whether the source string is in the
|
sl@0
|
288 |
* "mode" normalization form.
|
sl@0
|
289 |
*
|
sl@0
|
290 |
* @see unorm_quickCheck
|
sl@0
|
291 |
* @stable ICU 2.2
|
sl@0
|
292 |
*/
|
sl@0
|
293 |
U_STABLE UBool U_EXPORT2
|
sl@0
|
294 |
unorm_isNormalized(const UChar *src, int32_t srcLength,
|
sl@0
|
295 |
UNormalizationMode mode,
|
sl@0
|
296 |
UErrorCode *pErrorCode);
|
sl@0
|
297 |
|
sl@0
|
298 |
/**
|
sl@0
|
299 |
* Test if a string is in a given normalization form; same as unorm_isNormalized but
|
sl@0
|
300 |
* takes an extra options parameter like most normalization functions.
|
sl@0
|
301 |
*
|
sl@0
|
302 |
* @param src String that is to be tested if it is in a normalization format.
|
sl@0
|
303 |
* @param srcLength Length of source to test, or -1 if NUL-terminated.
|
sl@0
|
304 |
* @param mode Which normalization form to test for.
|
sl@0
|
305 |
* @param options The normalization options, ORed together (0 for no options).
|
sl@0
|
306 |
* @param pErrorCode ICU error code in/out parameter.
|
sl@0
|
307 |
* Must fulfill U_SUCCESS before the function call.
|
sl@0
|
308 |
* @return Boolean value indicating whether the source string is in the
|
sl@0
|
309 |
* "mode/options" normalization form.
|
sl@0
|
310 |
*
|
sl@0
|
311 |
* @see unorm_quickCheck
|
sl@0
|
312 |
* @see unorm_isNormalized
|
sl@0
|
313 |
* @stable ICU 2.6
|
sl@0
|
314 |
*/
|
sl@0
|
315 |
U_STABLE UBool U_EXPORT2
|
sl@0
|
316 |
unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
|
sl@0
|
317 |
UNormalizationMode mode, int32_t options,
|
sl@0
|
318 |
UErrorCode *pErrorCode);
|
sl@0
|
319 |
|
sl@0
|
320 |
/**
|
sl@0
|
321 |
* Iterative normalization forward.
|
sl@0
|
322 |
* This function (together with unorm_previous) is somewhat
|
sl@0
|
323 |
* similar to the C++ Normalizer class (see its non-static functions).
|
sl@0
|
324 |
*
|
sl@0
|
325 |
* Iterative normalization is useful when only a small portion of a longer
|
sl@0
|
326 |
* string/text needs to be processed.
|
sl@0
|
327 |
*
|
sl@0
|
328 |
* For example, the likelihood may be high that processing the first 10% of some
|
sl@0
|
329 |
* text will be sufficient to find certain data.
|
sl@0
|
330 |
* Another example: When one wants to concatenate two normalized strings and get a
|
sl@0
|
331 |
* normalized result, it is much more efficient to normalize just a small part of
|
sl@0
|
332 |
* the result around the concatenation place instead of re-normalizing everything.
|
sl@0
|
333 |
*
|
sl@0
|
334 |
* The input text is an instance of the C character iteration API UCharIterator.
|
sl@0
|
335 |
* It may wrap around a simple string, a CharacterIterator, a Replaceable, or any
|
sl@0
|
336 |
* other kind of text object.
|
sl@0
|
337 |
*
|
sl@0
|
338 |
* If a buffer overflow occurs, then the caller needs to reset the iterator to the
|
sl@0
|
339 |
* old index and call the function again with a larger buffer - if the caller cares
|
sl@0
|
340 |
* for the actual output.
|
sl@0
|
341 |
* Regardless of the output buffer, the iterator will always be moved to the next
|
sl@0
|
342 |
* normalization boundary.
|
sl@0
|
343 |
*
|
sl@0
|
344 |
* This function (like unorm_previous) serves two purposes:
|
sl@0
|
345 |
*
|
sl@0
|
346 |
* 1) To find the next boundary so that the normalization of the part of the text
|
sl@0
|
347 |
* from the current position to that boundary does not affect and is not affected
|
sl@0
|
348 |
* by the part of the text beyond that boundary.
|
sl@0
|
349 |
*
|
sl@0
|
350 |
* 2) To normalize the text up to the boundary.
|
sl@0
|
351 |
*
|
sl@0
|
352 |
* The second step is optional, per the doNormalize parameter.
|
sl@0
|
353 |
* It is omitted for operations like string concatenation, where the two adjacent
|
sl@0
|
354 |
* string ends need to be normalized together.
|
sl@0
|
355 |
* In such a case, the output buffer will just contain a copy of the text up to the
|
sl@0
|
356 |
* boundary.
|
sl@0
|
357 |
*
|
sl@0
|
358 |
* pNeededToNormalize is an output-only parameter. Its output value is only defined
|
sl@0
|
359 |
* if normalization was requested (doNormalize) and successful (especially, no
|
sl@0
|
360 |
* buffer overflow).
|
sl@0
|
361 |
* It is useful for operations like a normalizing transliterator, where one would
|
sl@0
|
362 |
* not want to replace a piece of text if it is not modified.
|
sl@0
|
363 |
*
|
sl@0
|
364 |
* If doNormalize==TRUE and pNeededToNormalize!=NULL then *pNeeded... is set TRUE
|
sl@0
|
365 |
* if the normalization was necessary.
|
sl@0
|
366 |
*
|
sl@0
|
367 |
* If doNormalize==FALSE then *pNeededToNormalize will be set to FALSE.
|
sl@0
|
368 |
*
|
sl@0
|
369 |
* If the buffer overflows, then *pNeededToNormalize will be undefined;
|
sl@0
|
370 |
* essentially, whenever U_FAILURE is true (like in buffer overflows), this result
|
sl@0
|
371 |
* will be undefined.
|
sl@0
|
372 |
*
|
sl@0
|
373 |
* @param src The input text in the form of a C character iterator.
|
sl@0
|
374 |
* @param dest The output buffer; can be NULL if destCapacity==0 for pure preflighting.
|
sl@0
|
375 |
* @param destCapacity The number of UChars that fit into dest.
|
sl@0
|
376 |
* @param mode The normalization mode.
|
sl@0
|
377 |
* @param options The normalization options, ORed together (0 for no options).
|
sl@0
|
378 |
* @param doNormalize Indicates if the source text up to the next boundary
|
sl@0
|
379 |
* is to be normalized (TRUE) or just copied (FALSE).
|
sl@0
|
380 |
* @param pNeededToNormalize Output flag indicating if the normalization resulted in
|
sl@0
|
381 |
* different text from the input.
|
sl@0
|
382 |
* Not defined if an error occurs including buffer overflow.
|
sl@0
|
383 |
* Always FALSE if !doNormalize.
|
sl@0
|
384 |
* @param pErrorCode ICU error code in/out parameter.
|
sl@0
|
385 |
* Must fulfill U_SUCCESS before the function call.
|
sl@0
|
386 |
* @return Length of output (number of UChars) when successful or buffer overflow.
|
sl@0
|
387 |
*
|
sl@0
|
388 |
* @see unorm_previous
|
sl@0
|
389 |
* @see unorm_normalize
|
sl@0
|
390 |
*
|
sl@0
|
391 |
* @stable ICU 2.1
|
sl@0
|
392 |
*/
|
sl@0
|
393 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
394 |
unorm_next(UCharIterator *src,
|
sl@0
|
395 |
UChar *dest, int32_t destCapacity,
|
sl@0
|
396 |
UNormalizationMode mode, int32_t options,
|
sl@0
|
397 |
UBool doNormalize, UBool *pNeededToNormalize,
|
sl@0
|
398 |
UErrorCode *pErrorCode);
|
sl@0
|
399 |
|
sl@0
|
400 |
/**
|
sl@0
|
401 |
* Iterative normalization backward.
|
sl@0
|
402 |
* This function (together with unorm_next) is somewhat
|
sl@0
|
403 |
* similar to the C++ Normalizer class (see its non-static functions).
|
sl@0
|
404 |
* For all details see unorm_next.
|
sl@0
|
405 |
*
|
sl@0
|
406 |
* @param src The input text in the form of a C character iterator.
|
sl@0
|
407 |
* @param dest The output buffer; can be NULL if destCapacity==0 for pure preflighting.
|
sl@0
|
408 |
* @param destCapacity The number of UChars that fit into dest.
|
sl@0
|
409 |
* @param mode The normalization mode.
|
sl@0
|
410 |
* @param options The normalization options, ORed together (0 for no options).
|
sl@0
|
411 |
* @param doNormalize Indicates if the source text up to the next boundary
|
sl@0
|
412 |
* is to be normalized (TRUE) or just copied (FALSE).
|
sl@0
|
413 |
* @param pNeededToNormalize Output flag indicating if the normalization resulted in
|
sl@0
|
414 |
* different text from the input.
|
sl@0
|
415 |
* Not defined if an error occurs including buffer overflow.
|
sl@0
|
416 |
* Always FALSE if !doNormalize.
|
sl@0
|
417 |
* @param pErrorCode ICU error code in/out parameter.
|
sl@0
|
418 |
* Must fulfill U_SUCCESS before the function call.
|
sl@0
|
419 |
* @return Length of output (number of UChars) when successful or buffer overflow.
|
sl@0
|
420 |
*
|
sl@0
|
421 |
* @see unorm_next
|
sl@0
|
422 |
* @see unorm_normalize
|
sl@0
|
423 |
*
|
sl@0
|
424 |
* @stable ICU 2.1
|
sl@0
|
425 |
*/
|
sl@0
|
426 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
427 |
unorm_previous(UCharIterator *src,
|
sl@0
|
428 |
UChar *dest, int32_t destCapacity,
|
sl@0
|
429 |
UNormalizationMode mode, int32_t options,
|
sl@0
|
430 |
UBool doNormalize, UBool *pNeededToNormalize,
|
sl@0
|
431 |
UErrorCode *pErrorCode);
|
sl@0
|
432 |
|
sl@0
|
433 |
/**
|
sl@0
|
434 |
* Concatenate normalized strings, making sure that the result is normalized as well.
|
sl@0
|
435 |
*
|
sl@0
|
436 |
* If both the left and the right strings are in
|
sl@0
|
437 |
* the normalization form according to "mode/options",
|
sl@0
|
438 |
* then the result will be
|
sl@0
|
439 |
*
|
sl@0
|
440 |
* \code
|
sl@0
|
441 |
* dest=normalize(left+right, mode, options)
|
sl@0
|
442 |
* \endcode
|
sl@0
|
443 |
*
|
sl@0
|
444 |
* With the input strings already being normalized,
|
sl@0
|
445 |
* this function will use unorm_next() and unorm_previous()
|
sl@0
|
446 |
* to find the adjacent end pieces of the input strings.
|
sl@0
|
447 |
* Only the concatenation of these end pieces will be normalized and
|
sl@0
|
448 |
* then concatenated with the remaining parts of the input strings.
|
sl@0
|
449 |
*
|
sl@0
|
450 |
* It is allowed to have dest==left to avoid copying the entire left string.
|
sl@0
|
451 |
*
|
sl@0
|
452 |
* @param left Left source string, may be same as dest.
|
sl@0
|
453 |
* @param leftLength Length of left source string, or -1 if NUL-terminated.
|
sl@0
|
454 |
* @param right Right source string.
|
sl@0
|
455 |
* @param rightLength Length of right source string, or -1 if NUL-terminated.
|
sl@0
|
456 |
* @param dest The output buffer; can be NULL if destCapacity==0 for pure preflighting.
|
sl@0
|
457 |
* @param destCapacity The number of UChars that fit into dest.
|
sl@0
|
458 |
* @param mode The normalization mode.
|
sl@0
|
459 |
* @param options The normalization options, ORed together (0 for no options).
|
sl@0
|
460 |
* @param pErrorCode ICU error code in/out parameter.
|
sl@0
|
461 |
* Must fulfill U_SUCCESS before the function call.
|
sl@0
|
462 |
* @return Length of output (number of UChars) when successful or buffer overflow.
|
sl@0
|
463 |
*
|
sl@0
|
464 |
* @see unorm_normalize
|
sl@0
|
465 |
* @see unorm_next
|
sl@0
|
466 |
* @see unorm_previous
|
sl@0
|
467 |
*
|
sl@0
|
468 |
* @stable ICU 2.1
|
sl@0
|
469 |
*/
|
sl@0
|
470 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
471 |
unorm_concatenate(const UChar *left, int32_t leftLength,
|
sl@0
|
472 |
const UChar *right, int32_t rightLength,
|
sl@0
|
473 |
UChar *dest, int32_t destCapacity,
|
sl@0
|
474 |
UNormalizationMode mode, int32_t options,
|
sl@0
|
475 |
UErrorCode *pErrorCode);
|
sl@0
|
476 |
|
sl@0
|
477 |
/**
 * Option bit for unorm_compare:
 * Both input strings are assumed to fulfill FCD conditions.
 * @stable ICU 2.2
 */
#define UNORM_INPUT_IS_FCD          0x20000
|
sl@0
|
483 |
|
sl@0
|
484 |
/**
 * Option bit for unorm_compare:
 * Perform case-insensitive comparison.
 * @stable ICU 2.2
 */
#define U_COMPARE_IGNORE_CASE       0x10000
|
sl@0
|
490 |
|
sl@0
|
491 |
#ifndef U_COMPARE_CODE_POINT_ORDER
/* Guarded because the same macro is also defined in unistr.h and ustring.h. */
/**
 * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc:
 * Compare strings in code point order instead of code unit order.
 * @stable ICU 2.2
 */
#define U_COMPARE_CODE_POINT_ORDER  0x8000
#endif
|
sl@0
|
500 |
|
sl@0
|
501 |
/**
|
sl@0
|
502 |
* Compare two strings for canonical equivalence.
|
sl@0
|
503 |
* Further options include case-insensitive comparison and
|
sl@0
|
504 |
* code point order (as opposed to code unit order).
|
sl@0
|
505 |
*
|
sl@0
|
506 |
* Canonical equivalence between two strings is defined as their normalized
|
sl@0
|
507 |
* forms (NFD or NFC) being identical.
|
sl@0
|
508 |
* This function compares strings incrementally instead of normalizing
|
sl@0
|
509 |
* (and optionally case-folding) both strings entirely,
|
sl@0
|
510 |
* improving performance significantly.
|
sl@0
|
511 |
*
|
sl@0
|
512 |
* Bulk normalization is only necessary if the strings do not fulfill the FCD
|
sl@0
|
513 |
* conditions. Only in this case, and only if the strings are relatively long,
|
sl@0
|
514 |
* is memory allocated temporarily.
|
sl@0
|
515 |
* For FCD strings and short non-FCD strings there is no memory allocation.
|
sl@0
|
516 |
*
|
sl@0
|
517 |
* Semantically, this is equivalent to
|
sl@0
|
518 |
* strcmp[CodePointOrder](NFD(foldCase(NFD(s1))), NFD(foldCase(NFD(s2))))
|
sl@0
|
519 |
* where code point order and foldCase are all optional.
|
sl@0
|
520 |
*
|
sl@0
|
521 |
* UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
|
sl@0
|
522 |
* the case folding must be performed first, then the normalization.
|
sl@0
|
523 |
*
|
sl@0
|
524 |
* @param s1 First source string.
|
sl@0
|
525 |
* @param length1 Length of first source string, or -1 if NUL-terminated.
|
sl@0
|
526 |
*
|
sl@0
|
527 |
* @param s2 Second source string.
|
sl@0
|
528 |
* @param length2 Length of second source string, or -1 if NUL-terminated.
|
sl@0
|
529 |
*
|
sl@0
|
530 |
* @param options A bit set of options:
|
sl@0
|
531 |
* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
|
sl@0
|
532 |
* Case-sensitive comparison in code unit order, and the input strings
|
sl@0
|
533 |
* are quick-checked for FCD.
|
sl@0
|
534 |
*
|
sl@0
|
535 |
* - UNORM_INPUT_IS_FCD
|
sl@0
|
536 |
* Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
|
sl@0
|
537 |
* If not set, the function will quickCheck for FCD
|
sl@0
|
538 |
* and normalize if necessary.
|
sl@0
|
539 |
*
|
sl@0
|
540 |
* - U_COMPARE_CODE_POINT_ORDER
|
sl@0
|
541 |
* Set to choose code point order instead of code unit order
|
sl@0
|
542 |
* (see u_strCompare for details).
|
sl@0
|
543 |
*
|
sl@0
|
544 |
* - U_COMPARE_IGNORE_CASE
|
sl@0
|
545 |
* Set to compare strings case-insensitively using case folding,
|
sl@0
|
546 |
* instead of case-sensitively.
|
sl@0
|
547 |
* If set, then the following case folding options are used.
|
sl@0
|
548 |
*
|
sl@0
|
549 |
* - Options as used with case-insensitive comparisons, currently:
|
sl@0
|
550 |
*
|
sl@0
|
551 |
* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
|
sl@0
|
552 |
* (see u_strCaseCompare for details)
|
sl@0
|
553 |
*
|
sl@0
|
554 |
* - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
|
sl@0
|
555 |
*
|
sl@0
|
556 |
* @param pErrorCode ICU error code in/out parameter.
|
sl@0
|
557 |
* Must fulfill U_SUCCESS before the function call.
|
sl@0
|
558 |
* @return <0 or 0 or >0 as usual for string comparisons
|
sl@0
|
559 |
*
|
sl@0
|
560 |
* @see unorm_normalize
|
sl@0
|
561 |
* @see UNORM_FCD
|
sl@0
|
562 |
* @see u_strCompare
|
sl@0
|
563 |
* @see u_strCaseCompare
|
sl@0
|
564 |
*
|
sl@0
|
565 |
* @stable ICU 2.2
|
sl@0
|
566 |
*/
|
sl@0
|
567 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
568 |
unorm_compare(const UChar *s1, int32_t length1,
|
sl@0
|
569 |
const UChar *s2, int32_t length2,
|
sl@0
|
570 |
uint32_t options,
|
sl@0
|
571 |
UErrorCode *pErrorCode);
|
sl@0
|
572 |
|
sl@0
|
573 |
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
sl@0
|
574 |
|
sl@0
|
575 |
#endif
|