sl@0
|
1 |
/*
|
sl@0
|
2 |
*******************************************************************************
|
sl@0
|
3 |
*
|
sl@0
|
4 |
* Copyright (C) 2004-2005, International Business Machines
|
sl@0
|
5 |
* Corporation and others. All Rights Reserved.
|
sl@0
|
6 |
*
|
sl@0
|
7 |
*******************************************************************************
|
sl@0
|
8 |
* file name: ucase.h
|
sl@0
|
9 |
* encoding: US-ASCII
|
sl@0
|
10 |
* tab size: 8 (not used)
|
sl@0
|
11 |
* indentation:4
|
sl@0
|
12 |
*
|
sl@0
|
13 |
* created on: 2004aug30
|
sl@0
|
14 |
* created by: Markus W. Scherer
|
sl@0
|
15 |
*
|
sl@0
|
16 |
* Low-level Unicode character/string case mapping code.
|
sl@0
|
17 |
*/
|
sl@0
|
18 |
|
sl@0
|
19 |
#ifndef __UCASE_H__
|
sl@0
|
20 |
#define __UCASE_H__
|
sl@0
|
21 |
|
sl@0
|
22 |
#include "unicode/utypes.h"
|
sl@0
|
23 |
#include "unicode/uset.h"
|
sl@0
|
24 |
#include "uset_imp.h"
|
sl@0
|
25 |
#include "udataswp.h"
|
sl@0
|
26 |
|
sl@0
|
27 |
U_CDECL_BEGIN
|
sl@0
|
28 |
|
sl@0
|
29 |
/* library API -------------------------------------------------------------- */
|
sl@0
|
30 |
|
sl@0
|
31 |
/**
 * Handle type for case-mapping properties.
 * The struct is only forward-declared in this header, so code that
 * includes it works with UCaseProps through pointers.
 */
struct UCaseProps;
typedef struct UCaseProps UCaseProps;
|
sl@0
|
33 |
|
sl@0
|
34 |
U_CAPI UCaseProps * U_EXPORT2
|
sl@0
|
35 |
ucase_open(UErrorCode *pErrorCode);
|
sl@0
|
36 |
|
sl@0
|
37 |
U_CAPI UCaseProps * U_EXPORT2
|
sl@0
|
38 |
ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode);
|
sl@0
|
39 |
|
sl@0
|
40 |
U_CAPI void U_EXPORT2
|
sl@0
|
41 |
ucase_close(UCaseProps *csp);
|
sl@0
|
42 |
|
sl@0
|
43 |
|
sl@0
|
44 |
U_CAPI const UCaseProps * U_EXPORT2
|
sl@0
|
45 |
ucase_getSingleton(UErrorCode *pErrorCode);
|
sl@0
|
46 |
|
sl@0
|
47 |
/**
|
sl@0
|
48 |
* Get a singleton dummy object, one that works with no real data.
|
sl@0
|
49 |
* This can be used when the real data is not available.
|
sl@0
|
50 |
* Using the dummy can reduce checks for available data after an initial failure.
|
sl@0
|
51 |
*/
|
sl@0
|
52 |
U_CAPI const UCaseProps * U_EXPORT2
|
sl@0
|
53 |
ucase_getDummy(UErrorCode *pErrorCode);
|
sl@0
|
54 |
|
sl@0
|
55 |
|
sl@0
|
56 |
U_CAPI int32_t U_EXPORT2
|
sl@0
|
57 |
ucase_swap(const UDataSwapper *ds,
|
sl@0
|
58 |
const void *inData, int32_t length, void *outData,
|
sl@0
|
59 |
UErrorCode *pErrorCode);
|
sl@0
|
60 |
|
sl@0
|
61 |
U_CAPI void U_EXPORT2
|
sl@0
|
62 |
ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode);
|
sl@0
|
63 |
|
sl@0
|
64 |
/**
|
sl@0
|
65 |
* Requires non-NULL locale ID but otherwise does the equivalent of
|
sl@0
|
66 |
* checking for language codes as if uloc_getLanguage() were called:
|
sl@0
|
67 |
* Accepts both 2- and 3-letter codes and accepts case variants.
|
sl@0
|
68 |
*/
|
sl@0
|
69 |
U_CFUNC int32_t
|
sl@0
|
70 |
ucase_getCaseLocale(const char *locale, int32_t *locCache);
|
sl@0
|
71 |
|
sl@0
|
72 |
/**
 * Bit mask for getting just the options from a string compare options word
 * that are relevant for case-insensitive string comparison.
 * See uchar.h. Also includes _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER.
 * The mask keeps the low 16 bits of the options word.
 * @internal
 */
#define _STRCASECMP_OPTIONS_MASK 0xffff
|
sl@0
|
79 |
|
sl@0
|
80 |
/**
 * Bit mask for getting just the options from a string compare options word
 * that are relevant for case folding (of a single string or code point).
 * See uchar.h.
 * The mask keeps the low 8 bits of the options word.
 * @internal
 */
#define _FOLD_CASE_OPTIONS_MASK 0xff
|
sl@0
|
87 |
|
sl@0
|
88 |
/* single-code point functions */
|
sl@0
|
89 |
|
sl@0
|
90 |
U_CAPI UChar32 U_EXPORT2
|
sl@0
|
91 |
ucase_tolower(const UCaseProps *csp, UChar32 c);
|
sl@0
|
92 |
|
sl@0
|
93 |
U_CAPI UChar32 U_EXPORT2
|
sl@0
|
94 |
ucase_toupper(const UCaseProps *csp, UChar32 c);
|
sl@0
|
95 |
|
sl@0
|
96 |
U_CAPI UChar32 U_EXPORT2
|
sl@0
|
97 |
ucase_totitle(const UCaseProps *csp, UChar32 c);
|
sl@0
|
98 |
|
sl@0
|
99 |
U_CAPI UChar32 U_EXPORT2
|
sl@0
|
100 |
ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options);
|
sl@0
|
101 |
|
sl@0
|
102 |
/**
|
sl@0
|
103 |
* Adds all simple case mappings and the full case folding for c to sa,
|
sl@0
|
104 |
* and also adds special case closure mappings.
|
sl@0
|
105 |
* c itself is not added.
|
sl@0
|
106 |
* For example, the mappings
|
sl@0
|
107 |
* - for s include long s
|
sl@0
|
108 |
* - for sharp s include ss
|
sl@0
|
109 |
* - for k include the Kelvin sign
|
sl@0
|
110 |
*/
|
sl@0
|
111 |
U_CAPI void U_EXPORT2
|
sl@0
|
112 |
ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa);
|
sl@0
|
113 |
|
sl@0
|
114 |
/**
|
sl@0
|
115 |
* Maps the string to single code points and adds the associated case closure
|
sl@0
|
116 |
* mappings.
|
sl@0
|
117 |
* The string is mapped to code points if it is their full case folding string.
|
sl@0
|
118 |
* In other words, this performs a reverse full case folding and then
|
sl@0
|
119 |
* adds the case closure items of the resulting code points.
|
sl@0
|
120 |
* If the string is found and its closure applied, then
|
sl@0
|
121 |
* the string itself is added as well as part of its code points' closure.
|
sl@0
|
122 |
* It must be length>=0.
|
sl@0
|
123 |
*
|
sl@0
|
124 |
* @return TRUE if the string was found
|
sl@0
|
125 |
*/
|
sl@0
|
126 |
U_CAPI UBool U_EXPORT2
|
sl@0
|
127 |
ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa);
|
sl@0
|
128 |
|
sl@0
|
129 |
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
|
sl@0
|
130 |
U_CAPI int32_t U_EXPORT2
|
sl@0
|
131 |
ucase_getType(const UCaseProps *csp, UChar32 c);
|
sl@0
|
132 |
|
sl@0
|
133 |
/** @return same as ucase_getType(), or <0 if c is case-ignorable */
|
sl@0
|
134 |
U_CAPI int32_t U_EXPORT2
|
sl@0
|
135 |
ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c);
|
sl@0
|
136 |
|
sl@0
|
137 |
U_CAPI UBool U_EXPORT2
|
sl@0
|
138 |
ucase_isSoftDotted(const UCaseProps *csp, UChar32 c);
|
sl@0
|
139 |
|
sl@0
|
140 |
U_CAPI UBool U_EXPORT2
|
sl@0
|
141 |
ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c);
|
sl@0
|
142 |
|
sl@0
|
143 |
/* string case mapping functions */
|
sl@0
|
144 |
|
sl@0
|
145 |
/**
|
sl@0
|
146 |
* Iterator function for string case mappings, which need to look at the
|
sl@0
|
147 |
* context (surrounding text) of a given character for conditional mappings.
|
sl@0
|
148 |
*
|
sl@0
|
149 |
* The iterator only needs to go backward or forward away from the
|
sl@0
|
150 |
* character in question. It does not use any indexes on this interface.
|
sl@0
|
151 |
* It does not support random access or an arbitrary change of
|
sl@0
|
152 |
* iteration direction.
|
sl@0
|
153 |
*
|
sl@0
|
154 |
* The code point being case-mapped itself is never returned by
|
sl@0
|
155 |
* this iterator.
|
sl@0
|
156 |
*
|
sl@0
|
157 |
* @param context A pointer to the iterator's working data.
|
sl@0
|
158 |
* @param dir If <0 then start iterating backward from the character;
|
sl@0
|
159 |
* if >0 then start iterating forward from the character;
|
sl@0
|
160 |
* if 0 then continue iterating in the current direction.
|
sl@0
|
161 |
* @return Next code point, or <0 when the iteration is done.
|
sl@0
|
162 |
*/
|
sl@0
|
163 |
typedef UChar32 U_CALLCONV
|
sl@0
|
164 |
UCaseContextIterator(void *context, int8_t dir);
|
sl@0
|
165 |
|
sl@0
|
166 |
/**
 * Sample struct which may be used by some implementations of
 * UCaseContextIterator.
 *
 * NOTE(review): the meaning of the individual fields is not documented in
 * this header; the per-field notes below are guesses from the names only.
 * Field order and types are unchanged.
 */
struct UCaseContext {
    void *p;            /* presumably the text (or iterator) being mapped */
    int32_t start;      /* presumably the start of the text range */
    int32_t index;      /* presumably the current iteration position */
    int32_t limit;      /* presumably the end of the text range */
    int32_t cpStart;    /* presumably the start of the current code point */
    int32_t cpLimit;    /* presumably the limit of the current code point */
    int8_t dir;         /* presumably the current iteration direction */
    int8_t b1;          /* purpose not stated in this header */
    int8_t b2;          /* purpose not stated in this header */
    int8_t b3;          /* purpose not stated in this header */
};
typedef struct UCaseContext UCaseContext;
|
sl@0
|
178 |
|
sl@0
|
179 |
enum {
    /**
     * For string case mappings, a single character (a code point) is mapped
     * either to itself (in which case in-place mapping functions do nothing),
     * or to another single code point, or to a string.
     * Aside from the string contents, these are indicated with a single int32_t
     * value as follows:
     *
     * Mapping to self: Negative values (~self instead of -self to support U+0000)
     *
     * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH
     *
     * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is
     * returned. Note that the string result may indeed have zero length.
     */
    UCASE_MAX_STRING_LENGTH=0x1f
};
|
sl@0
|
196 |
|
sl@0
|
197 |
/**
|
sl@0
|
198 |
* Get the full lowercase mapping for c.
|
sl@0
|
199 |
*
|
sl@0
|
200 |
* @param csp Case mapping properties.
|
sl@0
|
201 |
* @param c Character to be mapped.
|
sl@0
|
202 |
* @param iter Character iterator, used for context-sensitive mappings.
|
sl@0
|
203 |
* See UCaseContextIterator for details.
|
sl@0
|
204 |
* If iter==NULL then a context-independent result is returned.
|
sl@0
|
205 |
* @param context Pointer to be passed into iter.
|
sl@0
|
206 |
* @param pString If the mapping result is a string, then the pointer is
|
sl@0
|
207 |
* written to *pString.
|
sl@0
|
208 |
* @param locale Locale ID for locale-dependent mappings.
|
sl@0
|
209 |
* @param locCache Initialize to 0; may be used to cache the result of parsing
|
sl@0
|
210 |
* the locale ID for subsequent calls.
|
sl@0
|
211 |
* Can be NULL.
|
sl@0
|
212 |
* @return Output code point or string length, see UCASE_MAX_STRING_LENGTH.
|
sl@0
|
213 |
*
|
sl@0
|
214 |
* @see UCaseContextIterator
|
sl@0
|
215 |
* @see UCASE_MAX_STRING_LENGTH
|
sl@0
|
216 |
* @internal
|
sl@0
|
217 |
*/
|
sl@0
|
218 |
U_CAPI int32_t U_EXPORT2
|
sl@0
|
219 |
ucase_toFullLower(const UCaseProps *csp, UChar32 c,
|
sl@0
|
220 |
UCaseContextIterator *iter, void *context,
|
sl@0
|
221 |
const UChar **pString,
|
sl@0
|
222 |
const char *locale, int32_t *locCache);
|
sl@0
|
223 |
|
sl@0
|
224 |
U_CAPI int32_t U_EXPORT2
|
sl@0
|
225 |
ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
|
sl@0
|
226 |
UCaseContextIterator *iter, void *context,
|
sl@0
|
227 |
const UChar **pString,
|
sl@0
|
228 |
const char *locale, int32_t *locCache);
|
sl@0
|
229 |
|
sl@0
|
230 |
U_CAPI int32_t U_EXPORT2
|
sl@0
|
231 |
ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
|
sl@0
|
232 |
UCaseContextIterator *iter, void *context,
|
sl@0
|
233 |
const UChar **pString,
|
sl@0
|
234 |
const char *locale, int32_t *locCache);
|
sl@0
|
235 |
|
sl@0
|
236 |
U_CAPI int32_t U_EXPORT2
|
sl@0
|
237 |
ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
|
sl@0
|
238 |
const UChar **pString,
|
sl@0
|
239 |
uint32_t options);
|
sl@0
|
240 |
|
sl@0
|
241 |
/* file definitions --------------------------------------------------------- */

/* name and type strings of the case-properties data item */
#define UCASE_DATA_NAME "ucase"
#define UCASE_DATA_TYPE "icu"

/* format "cAsE": the four bytes are the ASCII codes of 'c', 'A', 's'->'S', 'E' */
#define UCASE_FMT_0 0x63
#define UCASE_FMT_1 0x41
#define UCASE_FMT_2 0x53
#define UCASE_FMT_3 0x45
|
sl@0
|
251 |
|
sl@0
|
252 |
/*
 * indexes into indexes[]
 * The first five enumerators take the values 0..4; the gap up to
 * UCASE_IX_MAX_FULL_LENGTH (15) is not assigned here, and UCASE_IX_TOP (16)
 * is one past the last index.
 */
enum {
    UCASE_IX_INDEX_TOP,
    UCASE_IX_LENGTH,
    UCASE_IX_TRIE_SIZE,
    UCASE_IX_EXC_LENGTH,
    UCASE_IX_UNFOLD_LENGTH,

    UCASE_IX_MAX_FULL_LENGTH=15,
    UCASE_IX_TOP=16
};
|
sl@0
|
263 |
|
sl@0
|
264 |
/* definitions for 16-bit case properties word ------------------------------ */
|
sl@0
|
265 |
|
sl@0
|
266 |
/* 2-bit constants for types of cased characters (bits 1..0 of the properties word) */
#define UCASE_TYPE_MASK     3
enum {
    UCASE_NONE,
    UCASE_LOWER,
    UCASE_UPPER,
    UCASE_TITLE
};

/* extract the 2-bit case type (UCASE_NONE..UCASE_TITLE) from a properties word */
#define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
|
sl@0
|
276 |
|
sl@0
|
277 |
/* single-bit flags in the properties word (bits 2 and 3) */
#define UCASE_SENSITIVE     4
#define UCASE_EXCEPTION     8

/* 2-bit dot/accent type in bits 5..4 of the properties word */
#define UCASE_DOT_MASK      0x30
enum {
    UCASE_NO_DOT=0,         /* normal characters with cc=0 */
    UCASE_SOFT_DOTTED=0x10, /* soft-dotted characters with cc=0 */
    UCASE_ABOVE=0x20,       /* "above" accents with cc=230 */
    UCASE_OTHER_ACCENT=0x30 /* other accent character (0<cc!=230) */
};
|
sl@0
|
287 |
|
sl@0
|
288 |
/* no exception: bits 15..6 are a 10-bit signed case mapping delta */
#define UCASE_DELTA_SHIFT   6
#define UCASE_DELTA_MASK    0xffc0
#define UCASE_MAX_DELTA     0x1ff
#define UCASE_MIN_DELTA     (-UCASE_MAX_DELTA-1)

/*
 * Extract the signed delta from a 16-bit properties word.
 * The cast applies to props before the shift, so the sign bit (bit 15) is
 * carried into the result.
 * NOTE(review): right-shifting a negative signed value is
 * implementation-defined in C; this relies on the compiler performing an
 * arithmetic (sign-propagating) shift.
 */
#define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)

/* case-ignorable uses one of the delta bits (the lowest one, bit 6), see gencase/store.c */
#define UCASE_CASE_IGNORABLE 0x40
|
sl@0
|
298 |
|
sl@0
|
299 |
/* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
#define UCASE_EXC_SHIFT     4
#define UCASE_EXC_MASK      0xfff0
#define UCASE_MAX_EXCEPTIONS 0x1000 /* number of values a 12-bit index can take */
|
sl@0
|
303 |
|
sl@0
|
304 |
/* definitions for 16-bit main exceptions word ------------------------------ */
|
sl@0
|
305 |
|
sl@0
|
306 |
/*
 * first 8 bits indicate values in optional slots
 * (the enumerators below are the slot/bit numbers 0..7)
 */
enum {
    UCASE_EXC_LOWER,
    UCASE_EXC_FOLD,
    UCASE_EXC_UPPER,
    UCASE_EXC_TITLE,
    UCASE_EXC_4,            /* reserved */
    UCASE_EXC_5,            /* reserved */
    UCASE_EXC_CLOSURE,
    UCASE_EXC_FULL_MAPPINGS,
    UCASE_EXC_ALL_SLOTS     /* one past the last slot */
};
|
sl@0
|
318 |
|
sl@0
|
319 |
/* each slot is 2 uint16_t instead of 1 */
#define UCASE_EXC_DOUBLE_SLOTS      0x100

/* reserved: exception bits 11..9 */

/* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT */
#define UCASE_EXC_DOT_SHIFT     8

/* normally stored in the main word, but pushed out for larger exception indexes */
#define UCASE_EXC_DOT_MASK      0x3000
enum {
    UCASE_EXC_NO_DOT=0,
    UCASE_EXC_SOFT_DOTTED=0x1000,
    UCASE_EXC_ABOVE=0x2000,         /* "above" accents with cc=230 */
    UCASE_EXC_OTHER_ACCENT=0x3000   /* other character (0<cc!=230) */
};
|
sl@0
|
335 |
|
sl@0
|
336 |
/* complex/conditional mappings (exception word bits 14 and 15) */
#define UCASE_EXC_CONDITIONAL_SPECIAL   0x4000
#define UCASE_EXC_CONDITIONAL_FOLD      0x8000

/*
 * definitions for lengths word for full case mappings
 * (one 4-bit field per mapping kind)
 */
#define UCASE_FULL_LOWER    0xf
#define UCASE_FULL_FOLDING  0xf0
#define UCASE_FULL_UPPER    0xf00
#define UCASE_FULL_TITLE    0xf000

/* maximum lengths */
#define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf)
#define UCASE_CLOSURE_MAX_LENGTH 0xf
|
sl@0
|
349 |
|
sl@0
|
350 |
/*
 * constants for reverse case folding ("unfold") data
 * The enumerators take the values 0, 1, 2.
 * NOTE(review): presumably these are positions of header fields in the
 * unfold data -- confirm in the implementation.
 */
enum {
    UCASE_UNFOLD_ROWS,
    UCASE_UNFOLD_ROW_WIDTH,
    UCASE_UNFOLD_STRING_WIDTH
};
|
sl@0
|
356 |
|
sl@0
|
357 |
U_CDECL_END
|
sl@0
|
358 |
|
sl@0
|
359 |
#endif
|