sl@0
|
1 |
/*
|
sl@0
|
2 |
***************************************************************************
|
sl@0
|
3 |
* Copyright (C) 1999-2005, International Business Machines Corporation
|
sl@0
|
4 |
* and others. All Rights Reserved.
|
sl@0
|
5 |
***************************************************************************
|
sl@0
|
6 |
* Date Name Description
|
sl@0
|
7 |
* 10/20/99 alan Creation.
|
sl@0
|
8 |
***************************************************************************
|
sl@0
|
9 |
*/
|
sl@0
|
10 |
|
sl@0
|
11 |
#ifndef UNICODESET_H
|
sl@0
|
12 |
#define UNICODESET_H
|
sl@0
|
13 |
|
sl@0
|
14 |
#include "unicode/unifilt.h"
|
sl@0
|
15 |
#include "unicode/unistr.h"
|
sl@0
|
16 |
#include "unicode/uset.h"
|
sl@0
|
17 |
|
sl@0
|
18 |
/**
|
sl@0
|
19 |
* \file
|
sl@0
|
20 |
* \brief C++ API: Unicode Set
|
sl@0
|
21 |
*/
|
sl@0
|
22 |
|
sl@0
|
23 |
U_NAMESPACE_BEGIN
|
sl@0
|
24 |
|
sl@0
|
25 |
class ParsePosition;
|
sl@0
|
26 |
class SymbolTable;
|
sl@0
|
27 |
class UVector;
|
sl@0
|
28 |
class RuleCharacterIterator;
|
sl@0
|
29 |
|
sl@0
|
30 |
/**
|
sl@0
|
31 |
* A mutable set of Unicode characters and multicharacter strings. Objects of this class
|
sl@0
|
32 |
* represent <em>character classes</em> used in regular expressions.
|
sl@0
|
33 |
* A character class specifies a subset of Unicode code points. Legal
|
sl@0
|
34 |
* code points are U+0000 to U+10FFFF, inclusive.
|
sl@0
|
35 |
*
|
sl@0
|
36 |
* <p>The UnicodeSet class is not designed to be subclassed.
|
sl@0
|
37 |
*
|
sl@0
|
38 |
* <p><code>UnicodeSet</code> supports two APIs. The first is the
|
sl@0
|
39 |
* <em>operand</em> API that allows the caller to modify the value of
|
sl@0
|
40 |
* a <code>UnicodeSet</code> object. It conforms to Java 2's
|
sl@0
|
41 |
* <code>java.util.Set</code> interface, although
|
sl@0
|
42 |
* <code>UnicodeSet</code> does not actually implement that
|
sl@0
|
43 |
* interface. All methods of <code>Set</code> are supported, with the
|
sl@0
|
44 |
* modification that they take a character range or single character
|
sl@0
|
45 |
* instead of an <code>Object</code>, and they take a
|
sl@0
|
46 |
* <code>UnicodeSet</code> instead of a <code>Collection</code>. The
|
sl@0
|
47 |
* operand API may be thought of in terms of boolean logic: a boolean
|
sl@0
|
48 |
* OR is implemented by <code>add</code>, a boolean AND is implemented
|
sl@0
|
49 |
* by <code>retain</code>, a boolean XOR is implemented by
|
sl@0
|
50 |
* <code>complement</code> taking an argument, and a boolean NOT is
|
sl@0
|
51 |
* implemented by <code>complement</code> with no argument. In terms
|
sl@0
|
52 |
* of traditional set theory function names, <code>add</code> is a
|
sl@0
|
53 |
* union, <code>retain</code> is an intersection, <code>remove</code>
|
sl@0
|
54 |
* is an asymmetric difference, and <code>complement</code> with no
|
sl@0
|
55 |
* argument is a set complement with respect to the superset range
|
sl@0
|
56 |
* <code>MIN_VALUE-MAX_VALUE</code>
|
sl@0
|
57 |
*
|
sl@0
|
58 |
* <p>The second API is the
|
sl@0
|
59 |
* <code>applyPattern()</code>/<code>toPattern()</code> API from the
|
sl@0
|
60 |
* <code>java.text.Format</code>-derived classes. Unlike the
|
sl@0
|
61 |
* methods that add characters, add categories, and control the logic
|
sl@0
|
62 |
* of the set, the method <code>applyPattern()</code> sets all
|
sl@0
|
63 |
* attributes of a <code>UnicodeSet</code> at once, based on a
|
sl@0
|
64 |
* string pattern.
|
sl@0
|
65 |
*
|
sl@0
|
66 |
* <p><b>Pattern syntax</b></p>
|
sl@0
|
67 |
*
|
sl@0
|
68 |
* Patterns are accepted by the constructors and the
|
sl@0
|
69 |
* <code>applyPattern()</code> methods and returned by the
|
sl@0
|
70 |
* <code>toPattern()</code> method. These patterns follow a syntax
|
sl@0
|
71 |
* similar to that employed by version 8 regular expression character
|
sl@0
|
72 |
* classes. Here are some simple examples:
|
sl@0
|
73 |
*
|
sl@0
|
74 |
* \htmlonly<blockquote>\endhtmlonly
|
sl@0
|
75 |
* <table>
|
sl@0
|
76 |
* <tr align="top">
|
sl@0
|
77 |
* <td nowrap valign="top" align="left"><code>[]</code></td>
|
sl@0
|
78 |
* <td valign="top">No characters</td>
|
sl@0
|
79 |
* </tr><tr align="top">
|
sl@0
|
80 |
* <td nowrap valign="top" align="left"><code>[a]</code></td>
|
sl@0
|
81 |
* <td valign="top">The character 'a'</td>
|
sl@0
|
82 |
* </tr><tr align="top">
|
sl@0
|
83 |
* <td nowrap valign="top" align="left"><code>[ae]</code></td>
|
sl@0
|
84 |
* <td valign="top">The characters 'a' and 'e'</td>
|
sl@0
|
85 |
* </tr>
|
sl@0
|
86 |
* <tr>
|
sl@0
|
87 |
* <td nowrap valign="top" align="left"><code>[a-e]</code></td>
|
sl@0
|
88 |
* <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
|
sl@0
|
89 |
* point order</td>
|
sl@0
|
90 |
* </tr>
|
sl@0
|
91 |
* <tr>
|
sl@0
|
92 |
* <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
|
sl@0
|
93 |
* <td valign="top">The character U+4E01</td>
|
sl@0
|
94 |
* </tr>
|
sl@0
|
95 |
* <tr>
|
sl@0
|
96 |
* <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
|
sl@0
|
97 |
* <td valign="top">The character 'a' and the multicharacter strings "ab" and
|
sl@0
|
98 |
* "ac"</td>
|
sl@0
|
99 |
* </tr>
|
sl@0
|
100 |
* <tr>
|
sl@0
|
101 |
* <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
|
sl@0
|
102 |
* <td valign="top">All characters in the general category Uppercase Letter</td>
|
sl@0
|
103 |
* </tr>
|
sl@0
|
104 |
* </table>
|
sl@0
|
105 |
* \htmlonly</blockquote>\endhtmlonly
|
sl@0
|
106 |
*
|
sl@0
|
107 |
* Any character may be preceded by a backslash in order to remove any special
|
sl@0
|
108 |
* meaning. White space characters, as defined by UCharacter.isWhitespace(), are
|
sl@0
|
109 |
* ignored, unless they are escaped.
|
sl@0
|
110 |
*
|
sl@0
|
111 |
* <p>Property patterns specify a set of characters having a certain
|
sl@0
|
112 |
* property as defined by the Unicode standard. Both the POSIX-like
|
sl@0
|
113 |
* "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a
|
sl@0
|
114 |
* complete list of supported property patterns, see the User's Guide
|
sl@0
|
115 |
* for UnicodeSet at
|
sl@0
|
116 |
* <a href="http://icu.sourceforge.net/userguide/unicodeSet.html">
|
sl@0
|
117 |
* http://icu.sourceforge.net/userguide/unicodeSet.html</a>.
|
sl@0
|
118 |
* Actual determination of property data is defined by the underlying
|
sl@0
|
119 |
* Unicode database as implemented by UCharacter.
|
sl@0
|
120 |
*
|
sl@0
|
121 |
* <p>Patterns specify individual characters, ranges of characters, and
|
sl@0
|
122 |
* Unicode property sets. When elements are concatenated, they
|
sl@0
|
123 |
* specify their union. To complement a set, place a '^' immediately
|
sl@0
|
124 |
* after the opening '['. Property patterns are inverted by modifying
|
sl@0
|
125 |
* their delimiters; "[:^foo:]" and "\\P{foo}". In any other location,
|
sl@0
|
126 |
* '^' has no special meaning.
|
sl@0
|
127 |
*
|
sl@0
|
128 |
* <p>Ranges are indicated by placing a '-' between two
|
sl@0
|
129 |
* characters, as in "a-z". This specifies the range of all
|
sl@0
|
130 |
* characters from the left to the right, in Unicode order. If the
|
sl@0
|
131 |
* left character is greater than or equal to the
|
sl@0
|
132 |
* right character it is a syntax error. If a '-' occurs as the first
|
sl@0
|
133 |
* character after the opening '[' or '[^', or if it occurs as the
|
sl@0
|
134 |
* last character before the closing ']', then it is taken as a
|
sl@0
|
135 |
* literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
|
sl@0
|
136 |
* set of three characters, 'a', 'b', and '-'.
|
sl@0
|
137 |
*
|
sl@0
|
138 |
* <p>Sets may be intersected using the '&' operator or the asymmetric
|
sl@0
|
139 |
* set difference may be taken using the '-' operator, for example,
|
sl@0
|
140 |
* "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
|
sl@0
|
141 |
* with values less than 4096. Operators ('&' and '-') have equal
|
sl@0
|
142 |
* precedence and bind left-to-right. Thus
|
sl@0
|
143 |
* "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
|
sl@0
|
144 |
* "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
|
sl@0
|
145 |
* difference; intersection is commutative.
|
sl@0
|
146 |
*
|
sl@0
|
147 |
* <table>
|
sl@0
|
148 |
* <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
|
sl@0
|
149 |
* <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
|
sl@0
|
150 |
* through 'z' and all letters in between, in Unicode order
|
sl@0
|
151 |
* <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
|
sl@0
|
152 |
* all characters but 'a' through 'z',
|
sl@0
|
153 |
* that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
|
sl@0
|
154 |
* <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
|
sl@0
|
155 |
* <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
|
sl@0
|
156 |
* <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
|
sl@0
|
157 |
* <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
|
sl@0
|
158 |
* <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
|
sl@0
|
159 |
* <td>The asymmetric difference of sets specified by <em>pat1</em> and
|
sl@0
|
160 |
* <em>pat2</em>
|
sl@0
|
161 |
* <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
|
sl@0
|
162 |
* <td>The set of characters having the specified
|
sl@0
|
163 |
* Unicode property; in
|
sl@0
|
164 |
* this case, Unicode uppercase letters
|
sl@0
|
165 |
* <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
|
sl@0
|
166 |
* <td>The set of characters <em>not</em> having the given
|
sl@0
|
167 |
* Unicode property
|
sl@0
|
168 |
* </table>
|
sl@0
|
169 |
*
|
sl@0
|
170 |
* <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
|
sl@0
|
171 |
*
|
sl@0
|
172 |
* <p><b>Formal syntax</b></p>
|
sl@0
|
173 |
*
|
sl@0
|
174 |
* \htmlonly<blockquote>\endhtmlonly
|
sl@0
|
175 |
* <table>
|
sl@0
|
176 |
* <tr align="top">
|
sl@0
|
177 |
* <td nowrap valign="top" align="right"><code>pattern := </code></td>
|
sl@0
|
178 |
* <td valign="top"><code>('[' '^'? item* ']') |
|
sl@0
|
179 |
* property</code></td>
|
sl@0
|
180 |
* </tr>
|
sl@0
|
181 |
* <tr align="top">
|
sl@0
|
182 |
* <td nowrap valign="top" align="right"><code>item := </code></td>
|
sl@0
|
183 |
* <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
|
sl@0
|
184 |
* </code></td>
|
sl@0
|
185 |
* </tr>
|
sl@0
|
186 |
* <tr align="top">
|
sl@0
|
187 |
* <td nowrap valign="top" align="right"><code>pattern-expr := </code></td>
|
sl@0
|
188 |
* <td valign="top"><code>pattern | pattern-expr pattern |
|
sl@0
|
189 |
* pattern-expr op pattern<br>
|
sl@0
|
190 |
* </code></td>
|
sl@0
|
191 |
* </tr>
|
sl@0
|
192 |
* <tr align="top">
|
sl@0
|
193 |
* <td nowrap valign="top" align="right"><code>op := </code></td>
|
sl@0
|
194 |
* <td valign="top"><code>'&' | '-'<br>
|
sl@0
|
195 |
* </code></td>
|
sl@0
|
196 |
* </tr>
|
sl@0
|
197 |
* <tr align="top">
|
sl@0
|
198 |
* <td nowrap valign="top" align="right"><code>special := </code></td>
|
sl@0
|
199 |
* <td valign="top"><code>'[' | ']' | '-'<br>
|
sl@0
|
200 |
* </code></td>
|
sl@0
|
201 |
* </tr>
|
sl@0
|
202 |
* <tr align="top">
|
sl@0
|
203 |
* <td nowrap valign="top" align="right"><code>char := </code></td>
|
sl@0
|
204 |
* <td valign="top"><em>any character that is not</em><code> special<br>
|
sl@0
|
205 |
* | ('\' </code><em>any character</em><code>)<br>
|
sl@0
|
206 |
* | ('\\u' hex hex hex hex)<br>
|
sl@0
|
207 |
* </code></td>
|
sl@0
|
208 |
* </tr>
|
sl@0
|
209 |
* <tr align="top">
|
sl@0
|
210 |
* <td nowrap valign="top" align="right"><code>hex := </code></td>
|
sl@0
|
211 |
* <td valign="top"><em>any character for which
|
sl@0
|
212 |
* </em><code>Character.digit(c, 16)</code><em>
|
sl@0
|
213 |
* returns a non-negative result</em></td>
|
sl@0
|
214 |
* </tr>
|
sl@0
|
215 |
* <tr>
|
sl@0
|
216 |
* <td nowrap valign="top" align="right"><code>property := </code></td>
|
sl@0
|
217 |
* <td valign="top"><em>a Unicode property set pattern</em></td>
|
sl@0
|
218 |
* </tr>
|
sl@0
|
219 |
* </table>
|
sl@0
|
220 |
* <br>
|
sl@0
|
221 |
* <table border="1">
|
sl@0
|
222 |
* <tr>
|
sl@0
|
223 |
* <td>Legend: <table>
|
sl@0
|
224 |
* <tr>
|
sl@0
|
225 |
* <td nowrap valign="top"><code>a := b</code></td>
|
sl@0
|
226 |
* <td width="20" valign="top"> </td>
|
sl@0
|
227 |
* <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
|
sl@0
|
228 |
* </tr>
|
sl@0
|
229 |
* <tr>
|
sl@0
|
230 |
* <td nowrap valign="top"><code>a?</code></td>
|
sl@0
|
231 |
* <td valign="top"></td>
|
sl@0
|
232 |
* <td valign="top">zero or one instance of <code>a</code><br>
|
sl@0
|
233 |
* </td>
|
sl@0
|
234 |
* </tr>
|
sl@0
|
235 |
* <tr>
|
sl@0
|
236 |
* <td nowrap valign="top"><code>a*</code></td>
|
sl@0
|
237 |
* <td valign="top"></td>
|
sl@0
|
238 |
* <td valign="top">zero or more instances of <code>a</code><br>
|
sl@0
|
239 |
* </td>
|
sl@0
|
240 |
* </tr>
|
sl@0
|
241 |
* <tr>
|
sl@0
|
242 |
* <td nowrap valign="top"><code>a | b</code></td>
|
sl@0
|
243 |
* <td valign="top"></td>
|
sl@0
|
244 |
* <td valign="top">either <code>a</code> or <code>b</code><br>
|
sl@0
|
245 |
* </td>
|
sl@0
|
246 |
* </tr>
|
sl@0
|
247 |
* <tr>
|
sl@0
|
248 |
* <td nowrap valign="top"><code>'a'</code></td>
|
sl@0
|
249 |
* <td valign="top"></td>
|
sl@0
|
250 |
* <td valign="top">the literal string between the quotes </td>
|
sl@0
|
251 |
* </tr>
|
sl@0
|
252 |
* </table>
|
sl@0
|
253 |
* </td>
|
sl@0
|
254 |
* </tr>
|
sl@0
|
255 |
* </table>
|
sl@0
|
256 |
* \htmlonly</blockquote>\endhtmlonly
|
sl@0
|
257 |
*
|
sl@0
|
258 |
* @author Alan Liu
|
sl@0
|
259 |
* @stable ICU 2.0
|
sl@0
|
260 |
*/
|
sl@0
|
261 |
class U_COMMON_API UnicodeSet : public UnicodeFilter {
|
sl@0
|
262 |
|
sl@0
|
263 |
int32_t len; // length of list used; 0 <= len <= capacity
|
sl@0
|
264 |
int32_t capacity; // capacity of list
|
sl@0
|
265 |
int32_t bufferCapacity; // capacity of buffer
|
sl@0
|
266 |
UChar32* list; // MUST be terminated with HIGH
|
sl@0
|
267 |
UChar32* buffer; // internal buffer, may be NULL
|
sl@0
|
268 |
|
sl@0
|
269 |
UVector* strings; // maintained in sorted order
|
sl@0
|
270 |
|
sl@0
|
271 |
/**
|
sl@0
|
272 |
* The pattern representation of this set. This may not be the
|
sl@0
|
273 |
* most economical pattern. It is the pattern supplied to
|
sl@0
|
274 |
* applyPattern(), with variables substituted and whitespace
|
sl@0
|
275 |
* removed. For sets constructed without applyPattern(), or
|
sl@0
|
276 |
* modified using the non-pattern API, this string will be empty,
|
sl@0
|
277 |
* indicating that toPattern() must generate a pattern
|
sl@0
|
278 |
* representation from the inversion list.
|
sl@0
|
279 |
*/
|
sl@0
|
280 |
UnicodeString pat;
|
sl@0
|
281 |
|
sl@0
|
282 |
public:
|
sl@0
|
283 |
|
sl@0
|
284 |
enum {
|
sl@0
|
285 |
/**
|
sl@0
|
286 |
* Minimum value that can be stored in a UnicodeSet.
|
sl@0
|
287 |
* @stable ICU 2.4
|
sl@0
|
288 |
*/
|
sl@0
|
289 |
MIN_VALUE = 0,
|
sl@0
|
290 |
|
sl@0
|
291 |
/**
|
sl@0
|
292 |
* Maximum value that can be stored in a UnicodeSet.
|
sl@0
|
293 |
* @stable ICU 2.4
|
sl@0
|
294 |
*/
|
sl@0
|
295 |
MAX_VALUE = 0x10ffff
|
sl@0
|
296 |
};
|
sl@0
|
297 |
|
sl@0
|
298 |
//----------------------------------------------------------------
|
sl@0
|
299 |
// Constructors &c
|
sl@0
|
300 |
//----------------------------------------------------------------
|
sl@0
|
301 |
|
sl@0
|
302 |
public:
|
sl@0
|
303 |
|
sl@0
|
304 |
/**
|
sl@0
|
305 |
* Constructs an empty set.
|
sl@0
|
306 |
* @stable ICU 2.0
|
sl@0
|
307 |
*/
|
sl@0
|
308 |
UnicodeSet();
|
sl@0
|
309 |
|
sl@0
|
310 |
/**
|
sl@0
|
311 |
* Constructs a set containing the given range. If <code>end &lt;
|
sl@0
|
312 |
* start</code> then an empty set is created.
|
sl@0
|
313 |
*
|
sl@0
|
314 |
* @param start first character, inclusive, of range
|
sl@0
|
315 |
* @param end last character, inclusive, of range
|
sl@0
|
316 |
* @stable ICU 2.4
|
sl@0
|
317 |
*/
|
sl@0
|
318 |
UnicodeSet(UChar32 start, UChar32 end);
|
sl@0
|
319 |
|
sl@0
|
320 |
/**
|
sl@0
|
321 |
* Constructs a set from the given pattern. See the class
|
sl@0
|
322 |
* description for the syntax of the pattern language.
|
sl@0
|
323 |
* @param pattern a string specifying what characters are in the set
|
sl@0
|
324 |
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
|
sl@0
|
325 |
* contains a syntax error.
|
sl@0
|
326 |
* @stable ICU 2.0
|
sl@0
|
327 |
*/
|
sl@0
|
328 |
UnicodeSet(const UnicodeString& pattern,
|
sl@0
|
329 |
UErrorCode& status);
|
sl@0
|
330 |
|
sl@0
|
331 |
/**
|
sl@0
|
332 |
* Constructs a set from the given pattern. See the class
|
sl@0
|
333 |
* description for the syntax of the pattern language.
|
sl@0
|
334 |
* @param pattern a string specifying what characters are in the set
|
sl@0
|
335 |
* @param options bitmask for options to apply to the pattern.
|
sl@0
|
336 |
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
|
sl@0
|
337 |
* @param symbols a symbol table mapping variable names to values
|
sl@0
|
338 |
* and stand-in characters to UnicodeSets; may be NULL
|
sl@0
|
339 |
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
|
sl@0
|
340 |
* contains a syntax error.
|
sl@0
|
341 |
* @internal
|
sl@0
|
342 |
*/
|
sl@0
|
343 |
UnicodeSet(const UnicodeString& pattern,
|
sl@0
|
344 |
uint32_t options,
|
sl@0
|
345 |
const SymbolTable* symbols,
|
sl@0
|
346 |
UErrorCode& status);
|
sl@0
|
347 |
|
sl@0
|
348 |
/**
|
sl@0
|
349 |
* Constructs a set from the given pattern. See the class description
|
sl@0
|
350 |
* for the syntax of the pattern language.
|
sl@0
|
351 |
* @param pattern a string specifying what characters are in the set
|
sl@0
|
352 |
* @param pos on input, the position in pattern at which to start parsing.
|
sl@0
|
353 |
* On output, the position after the last character parsed.
|
sl@0
|
354 |
* @param options bitmask for options to apply to the pattern.
|
sl@0
|
355 |
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
|
sl@0
|
356 |
* @param symbols a symbol table mapping variable names to values
|
sl@0
|
357 |
* and stand-in characters to UnicodeSets; may be NULL
|
sl@0
|
358 |
* @param status input-output error code
|
sl@0
|
359 |
* @stable ICU 2.8
|
sl@0
|
360 |
*/
|
sl@0
|
361 |
UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
|
sl@0
|
362 |
uint32_t options,
|
sl@0
|
363 |
const SymbolTable* symbols,
|
sl@0
|
364 |
UErrorCode& status);
|
sl@0
|
365 |
|
sl@0
|
366 |
#ifdef U_USE_UNICODESET_DEPRECATES
|
sl@0
|
367 |
/**
|
sl@0
|
368 |
* Obsolete: Constructs a set from the given Unicode character category.
|
sl@0
|
369 |
* @param category an integer indicating the character category as
|
sl@0
|
370 |
* defined in uchar.h.
|
sl@0
|
371 |
* @obsolete ICU 2.6. Use a pattern with the category instead since this API will be removed in that release.
|
sl@0
|
372 |
*/
|
sl@0
|
373 |
UnicodeSet(int8_t category, UErrorCode& status);
|
sl@0
|
374 |
#endif
|
sl@0
|
375 |
|
sl@0
|
376 |
/**
|
sl@0
|
377 |
* Constructs a set that is identical to the given UnicodeSet.
|
sl@0
|
378 |
* @stable ICU 2.0
|
sl@0
|
379 |
*/
|
sl@0
|
380 |
UnicodeSet(const UnicodeSet& o);
|
sl@0
|
381 |
|
sl@0
|
382 |
/**
|
sl@0
|
383 |
* Destructs the set.
|
sl@0
|
384 |
* @stable ICU 2.0
|
sl@0
|
385 |
*/
|
sl@0
|
386 |
virtual ~UnicodeSet();
|
sl@0
|
387 |
|
sl@0
|
388 |
/**
|
sl@0
|
389 |
* Assigns this object to be a copy of another.
|
sl@0
|
390 |
* @stable ICU 2.0
|
sl@0
|
391 |
*/
|
sl@0
|
392 |
UnicodeSet& operator=(const UnicodeSet& o);
|
sl@0
|
393 |
|
sl@0
|
394 |
/**
|
sl@0
|
395 |
* Compares the specified object with this set for equality. Returns
|
sl@0
|
396 |
* <tt>true</tt> if the two sets
|
sl@0
|
397 |
* have the same size, and every member of the specified set is
|
sl@0
|
398 |
* contained in this set (or equivalently, every member of this set is
|
sl@0
|
399 |
* contained in the specified set).
|
sl@0
|
400 |
*
|
sl@0
|
401 |
* @param o set to be compared for equality with this set.
|
sl@0
|
402 |
* @return <tt>true</tt> if the specified set is equal to this set.
|
sl@0
|
403 |
* @stable ICU 2.0
|
sl@0
|
404 |
*/
|
sl@0
|
405 |
virtual UBool operator==(const UnicodeSet& o) const;
|
sl@0
|
406 |
|
sl@0
|
407 |
/**
|
sl@0
|
408 |
* Compares the specified object with this set for inequality. Returns
|
sl@0
|
409 |
* <tt>true</tt> if the specified set is not equal to this set.
|
sl@0
|
410 |
* @stable ICU 2.0
|
sl@0
|
411 |
*/
|
sl@0
|
412 |
UBool operator!=(const UnicodeSet& o) const;
|
sl@0
|
413 |
|
sl@0
|
414 |
/**
|
sl@0
|
415 |
* Returns a copy of this object. All UnicodeFunctor objects have
|
sl@0
|
416 |
* to support cloning in order to allow classes using
|
sl@0
|
417 |
* UnicodeFunctors, such as Transliterator, to implement cloning.
|
sl@0
|
418 |
* @stable ICU 2.0
|
sl@0
|
419 |
*/
|
sl@0
|
420 |
virtual UnicodeFunctor* clone() const;
|
sl@0
|
421 |
|
sl@0
|
422 |
/**
|
sl@0
|
423 |
* Returns the hash code value for this set.
|
sl@0
|
424 |
*
|
sl@0
|
425 |
* @return the hash code value for this set.
|
sl@0
|
426 |
* @see Object#hashCode()
|
sl@0
|
427 |
* @stable ICU 2.0
|
sl@0
|
428 |
*/
|
sl@0
|
429 |
virtual int32_t hashCode(void) const;
|
sl@0
|
430 |
|
sl@0
|
431 |
//----------------------------------------------------------------
|
sl@0
|
432 |
// Public API
|
sl@0
|
433 |
//----------------------------------------------------------------
|
sl@0
|
434 |
|
sl@0
|
435 |
/**
|
sl@0
|
436 |
* Make this object represent the range <code>start - end</code>.
|
sl@0
|
437 |
* If <code>end &lt; start</code> then this object is set to
|
sl@0
|
438 |
* an empty range.
|
sl@0
|
439 |
*
|
sl@0
|
440 |
* @param start first character in the set, inclusive
|
sl@0
|
441 |
* @param end last character in the set, inclusive
|
sl@0
|
442 |
* @stable ICU 2.4
|
sl@0
|
443 |
*/
|
sl@0
|
444 |
UnicodeSet& set(UChar32 start, UChar32 end);
|
sl@0
|
445 |
|
sl@0
|
446 |
/**
|
sl@0
|
447 |
* Return true if the given position, in the given pattern, appears
|
sl@0
|
448 |
* to be the start of a UnicodeSet pattern.
|
sl@0
|
449 |
* @stable ICU 2.4
|
sl@0
|
450 |
*/
|
sl@0
|
451 |
static UBool resemblesPattern(const UnicodeString& pattern,
|
sl@0
|
452 |
int32_t pos);
|
sl@0
|
453 |
|
sl@0
|
454 |
/**
|
sl@0
|
455 |
* Modifies this set to represent the set specified by the given
|
sl@0
|
456 |
* pattern, optionally ignoring white space. See the class
|
sl@0
|
457 |
* description for the syntax of the pattern language.
|
sl@0
|
458 |
* @param pattern a string specifying what characters are in the set
|
sl@0
|
459 |
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
|
sl@0
|
460 |
* contains a syntax error.
|
sl@0
|
461 |
* <em> Empties the set passed before applying the pattern.</em>
|
sl@0
|
462 |
* @return a reference to this
|
sl@0
|
463 |
* @stable ICU 2.0
|
sl@0
|
464 |
*/
|
sl@0
|
465 |
UnicodeSet& applyPattern(const UnicodeString& pattern,
|
sl@0
|
466 |
UErrorCode& status);
|
sl@0
|
467 |
|
sl@0
|
468 |
/**
|
sl@0
|
469 |
* Modifies this set to represent the set specified by the given
|
sl@0
|
470 |
* pattern, optionally ignoring white space. See the class
|
sl@0
|
471 |
* description for the syntax of the pattern language.
|
sl@0
|
472 |
* @param pattern a string specifying what characters are in the set
|
sl@0
|
473 |
* @param options bitmask for options to apply to the pattern.
|
sl@0
|
474 |
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
|
sl@0
|
475 |
* @param symbols a symbol table mapping variable names to
|
sl@0
|
476 |
* values and stand-ins to UnicodeSets; may be NULL
|
sl@0
|
477 |
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
|
sl@0
|
478 |
* contains a syntax error.
|
sl@0
|
479 |
*<em> Empties the set passed before applying the pattern.</em>
|
sl@0
|
480 |
* @return a reference to this
|
sl@0
|
481 |
* @internal
|
sl@0
|
482 |
*/
|
sl@0
|
483 |
UnicodeSet& applyPattern(const UnicodeString& pattern,
|
sl@0
|
484 |
uint32_t options,
|
sl@0
|
485 |
const SymbolTable* symbols,
|
sl@0
|
486 |
UErrorCode& status);
|
sl@0
|
487 |
|
sl@0
|
488 |
/**
|
sl@0
|
489 |
* Parses the given pattern, starting at the given position. The
|
sl@0
|
490 |
* character at pattern.charAt(pos.getIndex()) must be '[', or the
|
sl@0
|
491 |
* parse fails. Parsing continues until the corresponding closing
|
sl@0
|
492 |
* ']'. If a syntax error is encountered between the opening and
|
sl@0
|
493 |
* closing bracket, the parse fails. Upon return from a successful
|
sl@0
|
494 |
* parse, the ParsePosition is updated to point to the character
|
sl@0
|
495 |
* following the closing ']', and a StringBuffer containing a
|
sl@0
|
496 |
* pairs list for the parsed pattern is returned. This method calls
|
sl@0
|
497 |
* itself recursively to parse embedded subpatterns.
|
sl@0
|
498 |
*<em> Empties the set passed before applying the pattern.</em>
|
sl@0
|
499 |
*
|
sl@0
|
500 |
* @param pattern the string containing the pattern to be parsed.
|
sl@0
|
501 |
* The portion of the string from pos.getIndex(), which must be a
|
sl@0
|
502 |
* '[', to the corresponding closing ']', is parsed.
|
sl@0
|
503 |
* @param pos upon entry, the position at which to begin parsing.
|
sl@0
|
504 |
* The character at pattern.charAt(pos.getIndex()) must be a '['.
|
sl@0
|
505 |
* Upon return from a successful parse, pos.getIndex() is either
|
sl@0
|
506 |
* the character after the closing ']' of the parsed pattern, or
|
sl@0
|
507 |
* pattern.length() if the closing ']' is the last character of
|
sl@0
|
508 |
* the pattern string.
|
sl@0
|
509 |
* @param options bitmask for options to apply to the pattern.
|
sl@0
|
510 |
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
|
sl@0
|
511 |
* @param symbols a symbol table mapping variable names to
|
sl@0
|
512 |
* values and stand-ins to UnicodeSets; may be NULL
|
sl@0
|
513 |
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
|
sl@0
|
514 |
* contains a syntax error.
|
sl@0
|
515 |
* @return a reference to this
|
sl@0
|
516 |
* @stable ICU 2.8
|
sl@0
|
517 |
*/
|
sl@0
|
518 |
UnicodeSet& applyPattern(const UnicodeString& pattern,
|
sl@0
|
519 |
ParsePosition& pos,
|
sl@0
|
520 |
uint32_t options,
|
sl@0
|
521 |
const SymbolTable* symbols,
|
sl@0
|
522 |
UErrorCode& status);
|
sl@0
|
523 |
|
sl@0
|
524 |
/**
|
sl@0
|
525 |
* Returns a string representation of this set. If the result of
|
sl@0
|
526 |
* calling this function is passed to a UnicodeSet constructor, it
|
sl@0
|
527 |
* will produce another set that is equal to this one.
|
sl@0
|
528 |
* @param result the string to receive the rules. Previous
|
sl@0
|
529 |
* contents will be deleted.
|
sl@0
|
530 |
* @param escapeUnprintable if TRUE then convert unprintable
|
sl@0
|
531 |
* characters to their hex escape representations, \\uxxxx or
|
sl@0
|
532 |
* \\Uxxxxxxxx. Unprintable characters are those other than
|
sl@0
|
533 |
* U+000A, U+0020..U+007E.
|
sl@0
|
534 |
* @stable ICU 2.0
|
sl@0
|
535 |
*/
|
sl@0
|
536 |
virtual UnicodeString& toPattern(UnicodeString& result,
|
sl@0
|
537 |
UBool escapeUnprintable = FALSE) const;
|
sl@0
|
538 |
|
sl@0
|
539 |
/**
|
sl@0
|
540 |
* Modifies this set to contain those code points which have the given value
|
sl@0
|
541 |
* for the given binary or enumerated property, as returned by
|
sl@0
|
542 |
* u_getIntPropertyValue. Prior contents of this set are lost.
|
sl@0
|
543 |
*
|
sl@0
|
544 |
* @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
|
sl@0
|
545 |
* or UCHAR_INT_START..UCHAR_INT_LIMIT-1
|
sl@0
|
546 |
* or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
|
sl@0
|
547 |
*
|
sl@0
|
548 |
* @param value a value in the range u_getIntPropertyMinValue(prop)..
|
sl@0
|
549 |
* u_getIntPropertyMaxValue(prop), with one exception. If prop is
|
sl@0
|
550 |
* UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
|
sl@0
|
551 |
* rather a mask value produced by U_GET_GC_MASK(). This allows grouped
|
sl@0
|
552 |
* categories such as [:L:] to be represented.
|
sl@0
|
553 |
*
|
sl@0
|
554 |
* @param ec error code input/output parameter
|
sl@0
|
555 |
*
|
sl@0
|
556 |
* @return a reference to this set
|
sl@0
|
557 |
*
|
sl@0
|
558 |
* @stable ICU 2.4
|
sl@0
|
559 |
*/
|
sl@0
|
560 |
UnicodeSet& applyIntPropertyValue(UProperty prop,
|
sl@0
|
561 |
int32_t value,
|
sl@0
|
562 |
UErrorCode& ec);
|
sl@0
|
563 |
|
sl@0
|
564 |
/**
|
sl@0
|
565 |
* Modifies this set to contain those code points which have the
|
sl@0
|
566 |
* given value for the given property. Prior contents of this
|
sl@0
|
567 |
* set are lost.
|
sl@0
|
568 |
*
|
sl@0
|
569 |
* @param prop a property alias, either short or long. The name is matched
|
sl@0
|
570 |
* loosely. See PropertyAliases.txt for names and a description of loose
|
sl@0
|
571 |
* matching. If the value string is empty, then this string is interpreted
|
sl@0
|
572 |
* as either a General_Category value alias, a Script value alias, a binary
|
sl@0
|
573 |
* property alias, or a special ID. Special IDs are matched loosely and
|
sl@0
|
574 |
* correspond to the following sets:
|
sl@0
|
575 |
*
|
sl@0
|
576 |
* "ANY" = [\\u0000-\\U0010FFFF],
|
sl@0
|
577 |
* "ASCII" = [\\u0000-\\u007F],
|
sl@0
|
578 |
* "Assigned" = [:^Cn:].
|
sl@0
|
579 |
*
|
sl@0
|
580 |
* @param value a value alias, either short or long. The name is matched
|
sl@0
|
581 |
* loosely. See PropertyValueAliases.txt for names and a description of
|
sl@0
|
582 |
* loose matching. In addition to aliases listed, numeric values and
|
sl@0
|
583 |
* canonical combining classes may be expressed numerically, e.g., ("nv",
|
sl@0
|
584 |
* "0.5") or ("ccc", "220"). The value string may also be empty.
|
sl@0
|
585 |
*
|
sl@0
|
586 |
* @param ec error code input/output parameter
|
sl@0
|
587 |
*
|
sl@0
|
588 |
* @return a reference to this set
|
sl@0
|
589 |
*
|
sl@0
|
590 |
* @stable ICU 2.4
|
sl@0
|
591 |
*/
|
sl@0
|
592 |
UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
|
sl@0
|
593 |
const UnicodeString& value,
|
sl@0
|
594 |
UErrorCode& ec);
|
sl@0
|
595 |
|
sl@0
|
596 |
/**
|
sl@0
|
597 |
* Returns the number of elements in this set (its cardinality).
|
sl@0
|
598 |
* Note that the elements of a set may include both individual
|
sl@0
|
599 |
* codepoints and strings.
|
sl@0
|
600 |
*
|
sl@0
|
601 |
* @return the number of elements in this set (its cardinality).
|
sl@0
|
602 |
* @stable ICU 2.0
|
sl@0
|
603 |
*/
|
sl@0
|
604 |
virtual int32_t size(void) const;
|
sl@0
|
605 |
|
sl@0
|
606 |
/**
|
sl@0
|
607 |
* Returns <tt>true</tt> if this set contains no elements.
|
sl@0
|
608 |
*
|
sl@0
|
609 |
* @return <tt>true</tt> if this set contains no elements.
|
sl@0
|
610 |
* @stable ICU 2.0
|
sl@0
|
611 |
*/
|
sl@0
|
612 |
virtual UBool isEmpty(void) const;
|
sl@0
|
613 |
|
sl@0
|
614 |
/**
|
sl@0
|
615 |
* Returns true if this set contains the given character.
|
sl@0
|
616 |
* @param c character to be checked for containment
|
sl@0
|
617 |
* @return true if the test condition is met
|
sl@0
|
618 |
* @stable ICU 2.0
|
sl@0
|
619 |
*/
|
sl@0
|
620 |
virtual UBool contains(UChar32 c) const;
|
sl@0
|
621 |
|
sl@0
|
622 |
/**
|
sl@0
|
623 |
* Returns true if this set contains every character
|
sl@0
|
624 |
* of the given range.
|
sl@0
|
625 |
* @param start first character, inclusive, of the range
|
sl@0
|
626 |
* @param end last character, inclusive, of the range
|
sl@0
|
627 |
* @return true if the test condition is met
|
sl@0
|
628 |
* @stable ICU 2.0
|
sl@0
|
629 |
*/
|
sl@0
|
630 |
virtual UBool contains(UChar32 start, UChar32 end) const;
|
sl@0
|
631 |
|
sl@0
|
632 |
/**
|
sl@0
|
633 |
* Returns <tt>true</tt> if this set contains the given
|
sl@0
|
634 |
* multicharacter string.
|
sl@0
|
635 |
* @param s string to be checked for containment
|
sl@0
|
636 |
* @return <tt>true</tt> if this set contains the specified string
|
sl@0
|
637 |
* @stable ICU 2.4
|
sl@0
|
638 |
*/
|
sl@0
|
639 |
UBool contains(const UnicodeString& s) const;
|
sl@0
|
640 |
|
sl@0
|
641 |
/**
|
sl@0
|
642 |
* Returns true if this set contains all the characters and strings
|
sl@0
|
643 |
* of the given set.
|
sl@0
|
644 |
* @param c set to be checked for containment
|
sl@0
|
645 |
* @return true if the test condition is met
|
sl@0
|
646 |
* @stable ICU 2.4
|
sl@0
|
647 |
*/
|
sl@0
|
648 |
virtual UBool containsAll(const UnicodeSet& c) const;
|
sl@0
|
649 |
|
sl@0
|
650 |
/**
|
sl@0
|
651 |
* Returns true if this set contains all the characters
|
sl@0
|
652 |
* of the given string.
|
sl@0
|
653 |
* @param s string containing characters to be checked for containment
|
sl@0
|
654 |
* @return true if the test condition is met
|
sl@0
|
655 |
* @stable ICU 2.4
|
sl@0
|
656 |
*/
|
sl@0
|
657 |
UBool containsAll(const UnicodeString& s) const;
|
sl@0
|
658 |
|
sl@0
|
659 |
/**
|
sl@0
|
660 |
* Returns true if this set contains none of the characters
|
sl@0
|
661 |
* of the given range.
|
sl@0
|
662 |
* @param start first character, inclusive, of the range
|
sl@0
|
663 |
* @param end last character, inclusive, of the range
|
sl@0
|
664 |
* @return true if the test condition is met
|
sl@0
|
665 |
* @stable ICU 2.4
|
sl@0
|
666 |
*/
|
sl@0
|
667 |
UBool containsNone(UChar32 start, UChar32 end) const;
|
sl@0
|
668 |
|
sl@0
|
669 |
/**
|
sl@0
|
670 |
* Returns true if this set contains none of the characters and strings
|
sl@0
|
671 |
* of the given set.
|
sl@0
|
672 |
* @param c set to be checked for containment
|
sl@0
|
673 |
* @return true if the test condition is met
|
sl@0
|
674 |
* @stable ICU 2.4
|
sl@0
|
675 |
*/
|
sl@0
|
676 |
UBool containsNone(const UnicodeSet& c) const;
|
sl@0
|
677 |
|
sl@0
|
678 |
/**
|
sl@0
|
679 |
* Returns true if this set contains none of the characters
|
sl@0
|
680 |
* of the given string.
|
sl@0
|
681 |
* @param s string containing characters to be checked for containment
|
sl@0
|
682 |
* @return true if the test condition is met
|
sl@0
|
683 |
* @stable ICU 2.4
|
sl@0
|
684 |
*/
|
sl@0
|
685 |
UBool containsNone(const UnicodeString& s) const;
|
sl@0
|
686 |
|
sl@0
|
687 |
/**
|
sl@0
|
688 |
* Returns true if this set contains one or more of the characters
|
sl@0
|
689 |
* in the given range.
|
sl@0
|
690 |
* @param start first character, inclusive, of the range
|
sl@0
|
691 |
* @param end last character, inclusive, of the range
|
sl@0
|
692 |
* @return true if the condition is met
|
sl@0
|
693 |
* @stable ICU 2.4
|
sl@0
|
694 |
*/
|
sl@0
|
695 |
inline UBool containsSome(UChar32 start, UChar32 end) const;
|
sl@0
|
696 |
|
sl@0
|
697 |
/**
|
sl@0
|
698 |
* Returns true if this set contains one or more of the characters
|
sl@0
|
699 |
* and strings of the given set.
|
sl@0
|
700 |
* @param s The set to be checked for containment
|
sl@0
|
701 |
* @return true if the condition is met
|
sl@0
|
702 |
* @stable ICU 2.4
|
sl@0
|
703 |
*/
|
sl@0
|
704 |
inline UBool containsSome(const UnicodeSet& s) const;
|
sl@0
|
705 |
|
sl@0
|
706 |
/**
|
sl@0
|
707 |
* Returns true if this set contains one or more of the characters
|
sl@0
|
708 |
* of the given string.
|
sl@0
|
709 |
* @param s string containing characters to be checked for containment
|
sl@0
|
710 |
* @return true if the condition is met
|
sl@0
|
711 |
* @stable ICU 2.4
|
sl@0
|
712 |
*/
|
sl@0
|
713 |
inline UBool containsSome(const UnicodeString& s) const;
|
sl@0
|
714 |
|
sl@0
|
715 |
/**
|
sl@0
|
716 |
* Implement UnicodeMatcher::matches()
|
sl@0
|
717 |
* @stable ICU 2.4
|
sl@0
|
718 |
*/
|
sl@0
|
719 |
virtual UMatchDegree matches(const Replaceable& text,
|
sl@0
|
720 |
int32_t& offset,
|
sl@0
|
721 |
int32_t limit,
|
sl@0
|
722 |
UBool incremental);
|
sl@0
|
723 |
|
sl@0
|
724 |
private:
|
sl@0
|
725 |
/**
|
sl@0
|
726 |
* Returns the longest match for s in text at the given position.
|
sl@0
|
727 |
* If limit > start then match forward from start+1 to limit
|
sl@0
|
728 |
* matching all characters except s.charAt(0). If limit < start,
|
sl@0
|
729 |
* go backward starting from start-1 matching all characters
|
sl@0
|
730 |
* except s.charAt(s.length()-1). This method assumes that the
|
sl@0
|
731 |
* first character, text.charAt(start), matches s, so it does not
|
sl@0
|
732 |
* check it.
|
sl@0
|
733 |
* @param text the text to match
|
sl@0
|
734 |
* @param start the first character to match. In the forward
|
sl@0
|
735 |
* direction, text.charAt(start) is matched against s.charAt(0).
|
sl@0
|
736 |
* In the reverse direction, it is matched against
|
sl@0
|
737 |
* s.charAt(s.length()-1).
|
sl@0
|
738 |
* @param limit the limit offset for matching, either last+1 in
|
sl@0
|
739 |
* the forward direction, or last-1 in the reverse direction,
|
sl@0
|
740 |
* where last is the index of the last character to match.
|
sl@0
|
741 |
* @return If part of s matches up to the limit, return |limit -
|
sl@0
|
742 |
* start|. If all of s matches before reaching the limit, return
|
sl@0
|
743 |
* s.length(). If there is a mismatch between s and text, return
|
sl@0
|
744 |
* 0
|
sl@0
|
745 |
*/
|
sl@0
|
746 |
static int32_t matchRest(const Replaceable& text,
|
sl@0
|
747 |
int32_t start, int32_t limit,
|
sl@0
|
748 |
const UnicodeString& s);
|
sl@0
|
749 |
|
sl@0
|
750 |
/**
|
sl@0
|
751 |
* Returns the smallest value i such that c < list[i]. Caller
|
sl@0
|
752 |
* must ensure that c is a legal value or this method will enter
|
sl@0
|
753 |
* an infinite loop. This method performs a binary search.
|
sl@0
|
754 |
* @param c a character in the range MIN_VALUE..MAX_VALUE
|
sl@0
|
755 |
* inclusive
|
sl@0
|
756 |
* @return the smallest integer i in the range 0..len-1,
|
sl@0
|
757 |
* inclusive, such that c < list[i]
|
sl@0
|
758 |
*/
|
sl@0
|
759 |
int32_t findCodePoint(UChar32 c) const;
|
sl@0
|
760 |
|
sl@0
|
761 |
public:
|
sl@0
|
762 |
|
sl@0
|
763 |
/**
|
sl@0
|
764 |
* Implementation of UnicodeMatcher API. Union the set of all
|
sl@0
|
765 |
* characters that may be matched by this object into the given
|
sl@0
|
766 |
* set.
|
sl@0
|
767 |
* @param toUnionTo the set into which to union the source characters
|
sl@0
|
768 |
* @stable ICU 2.4
|
sl@0
|
769 |
*/
|
sl@0
|
770 |
virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
|
sl@0
|
771 |
|
sl@0
|
772 |
/**
|
sl@0
|
773 |
* Returns the index of the given character within this set, where
|
sl@0
|
774 |
* the set is ordered by ascending code point. If the character
|
sl@0
|
775 |
* is not in this set, return -1. The inverse of this method is
|
sl@0
|
776 |
* <code>charAt()</code>.
|
sl@0
|
777 |
* @return an index from 0..size()-1, or -1
|
sl@0
|
778 |
* @stable ICU 2.4
|
sl@0
|
779 |
*/
|
sl@0
|
780 |
int32_t indexOf(UChar32 c) const;
|
sl@0
|
781 |
|
sl@0
|
782 |
/**
|
sl@0
|
783 |
* Returns the character at the given index within this set, where
|
sl@0
|
784 |
* the set is ordered by ascending code point. If the index is
|
sl@0
|
785 |
* out of range, return (UChar32)-1. The inverse of this method is
|
sl@0
|
786 |
* <code>indexOf()</code>.
|
sl@0
|
787 |
* @param index an index from 0..size()-1
|
sl@0
|
788 |
* @return the character at the given index, or (UChar32)-1.
|
sl@0
|
789 |
* @stable ICU 2.4
|
sl@0
|
790 |
*/
|
sl@0
|
791 |
UChar32 charAt(int32_t index) const;
|
sl@0
|
792 |
|
sl@0
|
793 |
/**
|
sl@0
|
794 |
* Adds the specified range to this set if it is not already
|
sl@0
|
795 |
* present. If this set already contains the specified range,
|
sl@0
|
796 |
* the call leaves this set unchanged. If <code>start > end</code>
|
sl@0
|
797 |
* then an empty range is added, leaving the set unchanged.
|
sl@0
|
798 |
* This is equivalent to a boolean logic OR, or a set UNION.
|
sl@0
|
799 |
*
|
sl@0
|
800 |
* @param start first character, inclusive, of range to be added
|
sl@0
|
801 |
* to this set.
|
sl@0
|
802 |
* @param end last character, inclusive, of range to be added
|
sl@0
|
803 |
* to this set.
|
sl@0
|
804 |
* @stable ICU 2.0
|
sl@0
|
805 |
*/
|
sl@0
|
806 |
virtual UnicodeSet& add(UChar32 start, UChar32 end);
|
sl@0
|
807 |
|
sl@0
|
808 |
/**
|
sl@0
|
809 |
* Adds the specified character to this set if it is not already
|
sl@0
|
810 |
* present. If this set already contains the specified character,
|
sl@0
|
811 |
* the call leaves this set unchanged.
|
sl@0
|
812 |
* @stable ICU 2.0
|
sl@0
|
813 |
*/
|
sl@0
|
814 |
UnicodeSet& add(UChar32 c);
|
sl@0
|
815 |
|
sl@0
|
816 |
/**
|
sl@0
|
817 |
* Adds the specified multicharacter string to this set if it is not already
|
sl@0
|
818 |
* present. If this set already contains the multicharacter string,
|
sl@0
|
819 |
* the call leaves this set unchanged.
|
sl@0
|
820 |
* Thus "ch" => {"ch"}
|
sl@0
|
821 |
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
|
sl@0
|
822 |
* @param s the source string
|
sl@0
|
823 |
* @return this object, for chaining
|
sl@0
|
824 |
* @stable ICU 2.4
|
sl@0
|
825 |
*/
|
sl@0
|
826 |
UnicodeSet& add(const UnicodeString& s);
|
sl@0
|
827 |
|
sl@0
|
828 |
private:
|
sl@0
|
829 |
/**
|
sl@0
|
830 |
* @return a code point IF the string consists of a single one.
|
sl@0
|
831 |
* otherwise returns -1.
|
sl@0
|
832 |
* @param s string to test
|
sl@0
|
833 |
*/
|
sl@0
|
834 |
static int32_t getSingleCP(const UnicodeString& s);
|
sl@0
|
835 |
|
sl@0
|
836 |
void _add(const UnicodeString& s);
|
sl@0
|
837 |
|
sl@0
|
838 |
public:
|
sl@0
|
839 |
/**
|
sl@0
|
840 |
* Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
|
sl@0
|
841 |
* If this set already contains any particular character, it has no effect on that character.
|
sl@0
|
842 |
* @param s the source string
|
sl@0
|
843 |
* @return this object, for chaining
|
sl@0
|
844 |
* @stable ICU 2.4
|
sl@0
|
845 |
*/
|
sl@0
|
846 |
UnicodeSet& addAll(const UnicodeString& s);
|
sl@0
|
847 |
|
sl@0
|
848 |
/**
|
sl@0
|
849 |
* Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
|
sl@0
|
850 |
* If this set already contains any particular character, it has no effect on that character.
|
sl@0
|
851 |
* @param s the source string
|
sl@0
|
852 |
* @return this object, for chaining
|
sl@0
|
853 |
* @stable ICU 2.4
|
sl@0
|
854 |
*/
|
sl@0
|
855 |
UnicodeSet& retainAll(const UnicodeString& s);
|
sl@0
|
856 |
|
sl@0
|
857 |
/**
|
sl@0
|
858 |
* Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
|
sl@0
|
859 |
* Each character of the string is removed from this set if it is present, or added if it is not.
|
sl@0
|
860 |
* @param s the source string
|
sl@0
|
861 |
* @return this object, for chaining
|
sl@0
|
862 |
* @stable ICU 2.4
|
sl@0
|
863 |
*/
|
sl@0
|
864 |
UnicodeSet& complementAll(const UnicodeString& s);
|
sl@0
|
865 |
|
sl@0
|
866 |
/**
|
sl@0
|
867 |
* Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
|
sl@0
|
868 |
* If this set does not contain a particular character, it has no effect on that character.
|
sl@0
|
869 |
* @param s the source string
|
sl@0
|
870 |
* @return this object, for chaining
|
sl@0
|
871 |
* @stable ICU 2.4
|
sl@0
|
872 |
*/
|
sl@0
|
873 |
UnicodeSet& removeAll(const UnicodeString& s);
|
sl@0
|
874 |
|
sl@0
|
875 |
/**
|
sl@0
|
876 |
* Makes a set from a multicharacter string. Thus "ch" => {"ch"}
|
sl@0
|
877 |
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
|
sl@0
|
878 |
* @param s the source string
|
sl@0
|
879 |
* @return a newly created set containing the given string.
|
sl@0
|
880 |
* The caller owns the return object and is responsible for deleting it.
|
sl@0
|
881 |
* @stable ICU 2.4
|
sl@0
|
882 |
*/
|
sl@0
|
883 |
static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
|
sl@0
|
884 |
|
sl@0
|
885 |
|
sl@0
|
886 |
/**
|
sl@0
|
887 |
* Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
|
sl@0
|
888 |
* @param s the source string
|
sl@0
|
889 |
* @return a newly created set containing the given characters
|
sl@0
|
890 |
* The caller owns the return object and is responsible for deleting it.
|
sl@0
|
891 |
* @stable ICU 2.4
|
sl@0
|
892 |
*/
|
sl@0
|
893 |
static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
|
sl@0
|
894 |
|
sl@0
|
895 |
/**
|
sl@0
|
896 |
* Retain only the elements in this set that are contained in the
|
sl@0
|
897 |
* specified range. If <code>start > end</code> then an empty range is
|
sl@0
|
898 |
* retained, leaving the set empty. This is equivalent to
|
sl@0
|
899 |
* a boolean logic AND, or a set INTERSECTION.
|
sl@0
|
900 |
*
|
sl@0
|
901 |
* @param start first character, inclusive, of range to be retained
|
sl@0
|
902 |
* to this set.
|
sl@0
|
903 |
* @param end last character, inclusive, of range to be retained
|
sl@0
|
904 |
* to this set.
|
sl@0
|
905 |
* @stable ICU 2.0
|
sl@0
|
906 |
*/
|
sl@0
|
907 |
virtual UnicodeSet& retain(UChar32 start, UChar32 end);
|
sl@0
|
908 |
|
sl@0
|
909 |
|
sl@0
|
910 |
/**
|
sl@0
|
911 |
* Retain the specified character from this set if it is present.
|
sl@0
|
912 |
* @stable ICU 2.0
|
sl@0
|
913 |
*/
|
sl@0
|
914 |
UnicodeSet& retain(UChar32 c);
|
sl@0
|
915 |
|
sl@0
|
916 |
/**
|
sl@0
|
917 |
* Removes the specified range from this set if it is present.
|
sl@0
|
918 |
* The set will not contain the specified range once the call
|
sl@0
|
919 |
* returns. If <code>start > end</code> then an empty range is
|
sl@0
|
920 |
* removed, leaving the set unchanged.
|
sl@0
|
921 |
*
|
sl@0
|
922 |
* @param start first character, inclusive, of range to be removed
|
sl@0
|
923 |
* from this set.
|
sl@0
|
924 |
* @param end last character, inclusive, of range to be removed
|
sl@0
|
925 |
* from this set.
|
sl@0
|
926 |
* @stable ICU 2.0
|
sl@0
|
927 |
*/
|
sl@0
|
928 |
virtual UnicodeSet& remove(UChar32 start, UChar32 end);
|
sl@0
|
929 |
|
sl@0
|
930 |
/**
|
sl@0
|
931 |
* Removes the specified character from this set if it is present.
|
sl@0
|
932 |
* The set will not contain the specified character once the call
|
sl@0
|
933 |
* returns.
|
sl@0
|
934 |
* @stable ICU 2.0
|
sl@0
|
935 |
*/
|
sl@0
|
936 |
UnicodeSet& remove(UChar32 c);
|
sl@0
|
937 |
|
sl@0
|
938 |
/**
|
sl@0
|
939 |
* Removes the specified string from this set if it is present.
|
sl@0
|
940 |
* The set will not contain the specified string once the call
|
sl@0
|
941 |
* returns.
|
sl@0
|
942 |
* @param s the source string
|
sl@0
|
943 |
* @return this object, for chaining
|
sl@0
|
944 |
* @stable ICU 2.4
|
sl@0
|
945 |
*/
|
sl@0
|
946 |
UnicodeSet& remove(const UnicodeString& s);
|
sl@0
|
947 |
|
sl@0
|
948 |
/**
|
sl@0
|
949 |
* Inverts this set. This operation modifies this set so that
|
sl@0
|
950 |
* its value is its complement. This is equivalent to
|
sl@0
|
951 |
* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
|
sl@0
|
952 |
* @stable ICU 2.0
|
sl@0
|
953 |
*/
|
sl@0
|
954 |
virtual UnicodeSet& complement(void);
|
sl@0
|
955 |
|
sl@0
|
956 |
/**
|
sl@0
|
957 |
* Complements the specified range in this set. Any character in
|
sl@0
|
958 |
* the range will be removed if it is in this set, or will be
|
sl@0
|
959 |
* added if it is not in this set. If <code>start > end</code>
|
sl@0
|
960 |
* then an empty range is complemented, leaving the set unchanged.
|
sl@0
|
961 |
* This is equivalent to a boolean logic XOR.
|
sl@0
|
962 |
*
|
sl@0
|
963 |
* @param start first character, inclusive, of range to be complemented
|
sl@0
|
964 |
* in this set.
|
sl@0
|
965 |
* @param end last character, inclusive, of range to be complemented
|
sl@0
|
966 |
* in this set.
|
sl@0
|
967 |
* @stable ICU 2.0
|
sl@0
|
968 |
*/
|
sl@0
|
969 |
virtual UnicodeSet& complement(UChar32 start, UChar32 end);
|
sl@0
|
970 |
|
sl@0
|
971 |
/**
|
sl@0
|
972 |
* Complements the specified character in this set. The character
|
sl@0
|
973 |
* will be removed if it is in this set, or will be added if it is
|
sl@0
|
974 |
* not in this set.
|
sl@0
|
975 |
* @stable ICU 2.0
|
sl@0
|
976 |
*/
|
sl@0
|
977 |
UnicodeSet& complement(UChar32 c);
|
sl@0
|
978 |
|
sl@0
|
979 |
/**
|
sl@0
|
980 |
* Complement the specified string in this set.
|
sl@0
|
981 |
* The string will be removed if it is in this set, or will be
|
sl@0
|
982 |
* added if it is not in this set.
|
sl@0
|
983 |
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
|
sl@0
|
984 |
* @param s the string to complement
|
sl@0
|
985 |
* @return this object, for chaining
|
sl@0
|
986 |
* @stable ICU 2.4
|
sl@0
|
987 |
*/
|
sl@0
|
988 |
UnicodeSet& complement(const UnicodeString& s);
|
sl@0
|
989 |
|
sl@0
|
990 |
/**
|
sl@0
|
991 |
* Adds all of the elements in the specified set to this set if
|
sl@0
|
992 |
* they're not already present. This operation effectively
|
sl@0
|
993 |
* modifies this set so that its value is the <i>union</i> of the two
|
sl@0
|
994 |
* sets. The behavior of this operation is unspecified if the specified
|
sl@0
|
995 |
* collection is modified while the operation is in progress.
|
sl@0
|
996 |
*
|
sl@0
|
997 |
* @param c set whose elements are to be added to this set.
|
sl@0
|
998 |
* @see #add(UChar32, UChar32)
|
sl@0
|
999 |
* @stable ICU 2.0
|
sl@0
|
1000 |
*/
|
sl@0
|
1001 |
virtual UnicodeSet& addAll(const UnicodeSet& c);
|
sl@0
|
1002 |
|
sl@0
|
1003 |
/**
|
sl@0
|
1004 |
* Retains only the elements in this set that are contained in the
|
sl@0
|
1005 |
* specified set. In other words, removes from this set all of
|
sl@0
|
1006 |
* its elements that are not contained in the specified set. This
|
sl@0
|
1007 |
* operation effectively modifies this set so that its value is
|
sl@0
|
1008 |
* the <i>intersection</i> of the two sets.
|
sl@0
|
1009 |
*
|
sl@0
|
1010 |
* @param c set that defines which elements this set will retain.
|
sl@0
|
1011 |
* @stable ICU 2.0
|
sl@0
|
1012 |
*/
|
sl@0
|
1013 |
virtual UnicodeSet& retainAll(const UnicodeSet& c);
|
sl@0
|
1014 |
|
sl@0
|
1015 |
/**
|
sl@0
|
1016 |
* Removes from this set all of its elements that are contained in the
|
sl@0
|
1017 |
* specified set. This operation effectively modifies this
|
sl@0
|
1018 |
* set so that its value is the <i>asymmetric set difference</i> of
|
sl@0
|
1019 |
* the two sets.
|
sl@0
|
1020 |
*
|
sl@0
|
1021 |
* @param c set that defines which elements will be removed from
|
sl@0
|
1022 |
* this set.
|
sl@0
|
1023 |
* @stable ICU 2.0
|
sl@0
|
1024 |
*/
|
sl@0
|
1025 |
virtual UnicodeSet& removeAll(const UnicodeSet& c);
|
sl@0
|
1026 |
|
sl@0
|
1027 |
/**
|
sl@0
|
1028 |
* Complements in this set all elements contained in the specified
|
sl@0
|
1029 |
* set. Any character in the other set will be removed if it is
|
sl@0
|
1030 |
* in this set, or will be added if it is not in this set.
|
sl@0
|
1031 |
*
|
sl@0
|
1032 |
* @param c set that defines which elements will be xor'ed from
|
sl@0
|
1033 |
* this set.
|
sl@0
|
1034 |
* @stable ICU 2.4
|
sl@0
|
1035 |
*/
|
sl@0
|
1036 |
virtual UnicodeSet& complementAll(const UnicodeSet& c);
|
sl@0
|
1037 |
|
sl@0
|
1038 |
/**
|
sl@0
|
1039 |
* Removes all of the elements from this set. This set will be
|
sl@0
|
1040 |
* empty after this call returns.
|
sl@0
|
1041 |
* @stable ICU 2.0
|
sl@0
|
1042 |
*/
|
sl@0
|
1043 |
virtual UnicodeSet& clear(void);
|
sl@0
|
1044 |
|
sl@0
|
1045 |
/**
|
sl@0
|
1046 |
* Close this set over the given attribute. For the attribute
|
sl@0
|
1047 |
* USET_CASE, the result is to modify this set so that:
|
sl@0
|
1048 |
*
|
sl@0
|
1049 |
* 1. For each character or string 'a' in this set, all strings or
|
sl@0
|
1050 |
* characters 'b' such that foldCase(a) == foldCase(b) are added
|
sl@0
|
1051 |
* to this set.
|
sl@0
|
1052 |
*
|
sl@0
|
1053 |
* 2. For each string 'e' in the resulting set, if e !=
|
sl@0
|
1054 |
* foldCase(e), 'e' will be removed.
|
sl@0
|
1055 |
*
|
sl@0
|
1056 |
* Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
|
sl@0
|
1057 |
*
|
sl@0
|
1058 |
* (Here foldCase(x) refers to the operation u_strFoldCase, and a
|
sl@0
|
1059 |
* == b denotes that the contents are the same, not pointer
|
sl@0
|
1060 |
* comparison.)
|
sl@0
|
1061 |
*
|
sl@0
|
1062 |
* @param attribute bitmask for attributes to close over.
|
sl@0
|
1063 |
* Currently only the USET_CASE bit is supported. Any undefined bits
|
sl@0
|
1064 |
* are ignored.
|
sl@0
|
1065 |
* @return a reference to this set.
|
sl@0
|
1066 |
* @internal
|
sl@0
|
1067 |
*/
|
sl@0
|
1068 |
UnicodeSet& closeOver(int32_t attribute);
|
sl@0
|
1069 |
|
sl@0
|
1070 |
/**
|
sl@0
|
1071 |
* Iteration method that returns the number of ranges contained in
|
sl@0
|
1072 |
* this set.
|
sl@0
|
1073 |
* @see #getRangeStart
|
sl@0
|
1074 |
* @see #getRangeEnd
|
sl@0
|
1075 |
* @stable ICU 2.4
|
sl@0
|
1076 |
*/
|
sl@0
|
1077 |
virtual int32_t getRangeCount(void) const;
|
sl@0
|
1078 |
|
sl@0
|
1079 |
/**
|
sl@0
|
1080 |
* Iteration method that returns the first character in the
|
sl@0
|
1081 |
* specified range of this set.
|
sl@0
|
1082 |
* @see #getRangeCount
|
sl@0
|
1083 |
* @see #getRangeEnd
|
sl@0
|
1084 |
* @stable ICU 2.4
|
sl@0
|
1085 |
*/
|
sl@0
|
1086 |
virtual UChar32 getRangeStart(int32_t index) const;
|
sl@0
|
1087 |
|
sl@0
|
1088 |
/**
|
sl@0
|
1089 |
* Iteration method that returns the last character in the
|
sl@0
|
1090 |
* specified range of this set.
|
sl@0
|
1091 |
* @see #getRangeStart
|
sl@0
|
1092 |
* @see #getRangeCount
|
sl@0
|
1093 |
* @stable ICU 2.4
|
sl@0
|
1094 |
*/
|
sl@0
|
1095 |
virtual UChar32 getRangeEnd(int32_t index) const;
|
sl@0
|
1096 |
|
sl@0
|
1097 |
/**
|
sl@0
|
1098 |
* Serializes this set into an array of 16-bit integers. Serialization
|
sl@0
|
1099 |
* (currently) only records the characters in the set; multicharacter
|
sl@0
|
1100 |
* strings are ignored.
|
sl@0
|
1101 |
*
|
sl@0
|
1102 |
* The array has the following format (each line is one 16-bit
|
sl@0
|
1103 |
* integer):
|
sl@0
|
1104 |
*
|
sl@0
|
1105 |
* length = (n+2*m) | (m!=0?0x8000:0)
|
sl@0
|
1106 |
* bmpLength = n; present if m!=0
|
sl@0
|
1107 |
* bmp[0]
|
sl@0
|
1108 |
* bmp[1]
|
sl@0
|
1109 |
* ...
|
sl@0
|
1110 |
* bmp[n-1]
|
sl@0
|
1111 |
* supp-high[0]
|
sl@0
|
1112 |
* supp-low[0]
|
sl@0
|
1113 |
* supp-high[1]
|
sl@0
|
1114 |
* supp-low[1]
|
sl@0
|
1115 |
* ...
|
sl@0
|
1116 |
* supp-high[m-1]
|
sl@0
|
1117 |
* supp-low[m-1]
|
sl@0
|
1118 |
*
|
sl@0
|
1119 |
* The array starts with a header. After the header are n bmp
|
sl@0
|
1120 |
* code points, then m supplementary code points. Either n or m
|
sl@0
|
1121 |
* or both may be zero. n+2*m is always <= 0x7FFF.
|
sl@0
|
1122 |
*
|
sl@0
|
1123 |
* If there are no supplementary characters (if m==0) then the
|
sl@0
|
1124 |
* header is one 16-bit integer, 'length', with value n.
|
sl@0
|
1125 |
*
|
sl@0
|
1126 |
* If there are supplementary characters (if m!=0) then the header
|
sl@0
|
1127 |
* is two 16-bit integers. The first, 'length', has value
|
sl@0
|
1128 |
* (n+2*m)|0x8000. The second, 'bmpLength', has value n.
|
sl@0
|
1129 |
*
|
sl@0
|
1130 |
* After the header the code points are stored in ascending order.
|
sl@0
|
1131 |
* Supplementary code points are stored as most significant 16
|
sl@0
|
1132 |
* bits followed by least significant 16 bits.
|
sl@0
|
1133 |
*
|
sl@0
|
1134 |
* @param dest pointer to buffer of destCapacity 16-bit integers.
|
sl@0
|
1135 |
* May be NULL only if destCapacity is zero.
|
sl@0
|
1136 |
* @param destCapacity size of dest, or zero. Must not be negative.
|
sl@0
|
1137 |
* @param ec error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR
|
sl@0
|
1138 |
* if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if
|
sl@0
|
1139 |
* n+2*m+(m!=0?2:1) > destCapacity.
|
sl@0
|
1140 |
* @return the total length of the serialized format, including
|
sl@0
|
1141 |
* the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
|
sl@0
|
1142 |
* than U_BUFFER_OVERFLOW_ERROR.
|
sl@0
|
1143 |
* @stable ICU 2.4
|
sl@0
|
1144 |
*/
|
sl@0
|
1145 |
int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
|
sl@0
|
1146 |
|
sl@0
|
1147 |
/**
|
sl@0
|
1148 |
* Reallocate this object's internal structures to take up the least
|
sl@0
|
1149 |
* possible space, without changing this object's value.
|
sl@0
|
1150 |
* @stable ICU 2.4
|
sl@0
|
1151 |
*/
|
sl@0
|
1152 |
virtual UnicodeSet& compact();
|
sl@0
|
1153 |
|
sl@0
|
1154 |
/**
|
sl@0
|
1155 |
* Return the class ID for this class. This is useful only for
|
sl@0
|
1156 |
* comparing to a return value from getDynamicClassID(). For example:
|
sl@0
|
1157 |
* <pre>
|
sl@0
|
1158 |
* . Base* polymorphic_pointer = createPolymorphicObject();
|
sl@0
|
1159 |
* . if (polymorphic_pointer->getDynamicClassID() ==
|
sl@0
|
1160 |
* . Derived::getStaticClassID()) ...
|
sl@0
|
1161 |
* </pre>
|
sl@0
|
1162 |
* @return The class ID for all objects of this class.
|
sl@0
|
1163 |
* @stable ICU 2.0
|
sl@0
|
1164 |
*/
|
sl@0
|
1165 |
static UClassID U_EXPORT2 getStaticClassID(void);
|
sl@0
|
1166 |
|
sl@0
|
1167 |
/**
|
sl@0
|
1168 |
* Implement UnicodeFunctor API.
|
sl@0
|
1169 |
*
|
sl@0
|
1170 |
* @return The class ID for this object. All objects of a given
|
sl@0
|
1171 |
* class have the same class ID. Objects of other classes have
|
sl@0
|
1172 |
* different class IDs.
|
sl@0
|
1173 |
* @stable ICU 2.4
|
sl@0
|
1174 |
*/
|
sl@0
|
1175 |
virtual UClassID getDynamicClassID(void) const;
|
sl@0
|
1176 |
|
sl@0
|
1177 |
private:
|
sl@0
|
1178 |
|
sl@0
|
1179 |
// Private API for the USet API
|
sl@0
|
1180 |
|
sl@0
|
1181 |
friend class USetAccess;
|
sl@0
|
1182 |
|
sl@0
|
1183 |
int32_t getStringCount() const;
|
sl@0
|
1184 |
|
sl@0
|
1185 |
const UnicodeString* getString(int32_t index) const;
|
sl@0
|
1186 |
|
sl@0
|
1187 |
//----------------------------------------------------------------
|
sl@0
|
1188 |
// RuleBasedTransliterator support
|
sl@0
|
1189 |
//----------------------------------------------------------------
|
sl@0
|
1190 |
|
sl@0
|
1191 |
private:
|
sl@0
|
1192 |
|
sl@0
|
1193 |
/**
|
sl@0
|
1194 |
* Returns <tt>true</tt> if this set contains any character whose low byte
|
sl@0
|
1195 |
* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
|
sl@0
|
1196 |
* indexing.
|
sl@0
|
1197 |
*/
|
sl@0
|
1198 |
virtual UBool matchesIndexValue(uint8_t v) const;
|
sl@0
|
1199 |
|
sl@0
|
1200 |
private:
|
sl@0
|
1201 |
|
sl@0
|
1202 |
//----------------------------------------------------------------
|
sl@0
|
1203 |
// Implementation: Pattern parsing
|
sl@0
|
1204 |
//----------------------------------------------------------------
|
sl@0
|
1205 |
|
sl@0
|
1206 |
void applyPattern(RuleCharacterIterator& chars,
|
sl@0
|
1207 |
const SymbolTable* symbols,
|
sl@0
|
1208 |
UnicodeString& rebuiltPat,
|
sl@0
|
1209 |
uint32_t options,
|
sl@0
|
1210 |
UErrorCode& ec);
|
sl@0
|
1211 |
|
sl@0
|
1212 |
//----------------------------------------------------------------
|
sl@0
|
1213 |
// Implementation: Utility methods
|
sl@0
|
1214 |
//----------------------------------------------------------------
|
sl@0
|
1215 |
|
sl@0
|
1216 |
void ensureCapacity(int32_t newLen);
|
sl@0
|
1217 |
|
sl@0
|
1218 |
void ensureBufferCapacity(int32_t newLen);
|
sl@0
|
1219 |
|
sl@0
|
1220 |
void swapBuffers(void);
|
sl@0
|
1221 |
|
sl@0
|
1222 |
UBool allocateStrings();
|
sl@0
|
1223 |
|
sl@0
|
1224 |
UnicodeString& _toPattern(UnicodeString& result,
|
sl@0
|
1225 |
UBool escapeUnprintable) const;
|
sl@0
|
1226 |
|
sl@0
|
1227 |
UnicodeString& _generatePattern(UnicodeString& result,
|
sl@0
|
1228 |
UBool escapeUnprintable) const;
|
sl@0
|
1229 |
|
sl@0
|
1230 |
static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
|
sl@0
|
1231 |
|
sl@0
|
1232 |
static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
|
sl@0
|
1233 |
|
sl@0
|
1234 |
//----------------------------------------------------------------
|
sl@0
|
1235 |
// Implementation: Fundamental operators
|
sl@0
|
1236 |
//----------------------------------------------------------------
|
sl@0
|
1237 |
|
sl@0
|
1238 |
void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
|
sl@0
|
1239 |
|
sl@0
|
1240 |
void add(const UChar32* other, int32_t otherLen, int8_t polarity);
|
sl@0
|
1241 |
|
sl@0
|
1242 |
void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
|
sl@0
|
1243 |
|
sl@0
|
1244 |
/**
|
sl@0
|
1245 |
* Return true if the given position, in the given pattern, appears
|
sl@0
|
1246 |
* to be the start of a property set pattern [:foo:], \\p{foo}, or
|
sl@0
|
1247 |
* \\P{foo}, or \\N{name}.
|
sl@0
|
1248 |
*/
|
sl@0
|
1249 |
static UBool resemblesPropertyPattern(const UnicodeString& pattern,
|
sl@0
|
1250 |
int32_t pos);
|
sl@0
|
1251 |
|
sl@0
|
1252 |
static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
|
sl@0
|
1253 |
int32_t iterOpts);
|
sl@0
|
1254 |
|
sl@0
|
1255 |
/**
|
sl@0
|
1256 |
* Parse the given property pattern at the given parse position
|
sl@0
|
1257 |
* and set this UnicodeSet to the result.
|
sl@0
|
1258 |
*
|
sl@0
|
1259 |
* The original design document is out of date, but still useful.
|
sl@0
|
1260 |
* Ignore the property and value names:
|
sl@0
|
1261 |
* http://dev.icu-project.org/cgi-bin/viewcvs.cgi/~checkout~/icuhtml/design/unicodeset_properties.html
|
sl@0
|
1262 |
*
|
sl@0
|
1263 |
* Recognized syntax:
|
sl@0
|
1264 |
*
|
sl@0
|
1265 |
* [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
|
sl@0
|
1266 |
* \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P"
|
sl@0
|
1267 |
* \\N{name} - white space not allowed within "\\N"
|
sl@0
|
1268 |
*
|
sl@0
|
1269 |
* Other than the above restrictions, white space is ignored. Case
|
sl@0
|
1270 |
* is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading
|
sl@0
|
1271 |
* and trailing space is deleted, and internal runs of whitespace
|
sl@0
|
1272 |
* are collapsed to a single space.
|
sl@0
|
1273 |
*
|
sl@0
|
1274 |
* We support binary properties, enumerated properties, and the
|
sl@0
|
1275 |
* following non-enumerated properties:
|
sl@0
|
1276 |
*
|
sl@0
|
1277 |
* Numeric_Value
|
sl@0
|
1278 |
* Name
|
sl@0
|
1279 |
* Unicode_1_Name
|
sl@0
|
1280 |
*
|
sl@0
|
1281 |
* @param pattern the pattern string
|
sl@0
|
1282 |
* @param ppos on entry, the position at which to begin parsing.
|
sl@0
|
1283 |
* This should be one of the locations marked '^':
|
sl@0
|
1284 |
*
|
sl@0
|
1285 |
* [:blah:] \\p{blah} \\P{blah} \\N{name}
|
sl@0
|
1286 |
* ^       %^        %^        %^        %
|
sl@0
|
1287 |
*
|
sl@0
|
1288 |
* On return, the position after the last character parsed, that is,
|
sl@0
|
1289 |
* the locations marked '%'. If the parse fails, ppos is returned
|
sl@0
|
1290 |
* unchanged.
|
sl@0
|
1291 |
* @return a reference to this.
|
sl@0
|
1292 |
*/
|
sl@0
|
1293 |
UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
|
sl@0
|
1294 |
ParsePosition& ppos,
|
sl@0
|
1295 |
UErrorCode &ec);
|
sl@0
|
1296 |
|
sl@0
|
1297 |
/**
 * Overload of applyPropertyPattern that takes a RuleCharacterIterator
 * instead of a pattern string plus parse position, and returns nothing.
 *
 * NOTE(review): the contract is not visible in this header. Presumably it
 * parses one property pattern from the iterator and records the parsed
 * text in rebuiltPat -- confirm against the implementation.
 *
 * @param chars      iterator supplying the pattern characters
 * @param rebuiltPat mutable string argument (presumably receives the
 *                   re-generated pattern text; confirm)
 * @param ec         error code argument (presumably in/out status; confirm)
 */
void applyPropertyPattern(RuleCharacterIterator& chars,
|
sl@0
|
1298 |
UnicodeString& rebuiltPat,
|
sl@0
|
1299 |
UErrorCode& ec);
|
sl@0
|
1300 |
|
sl@0
|
1301 |
/**
|
sl@0
|
1302 |
* A filter that returns TRUE if the given code point should be
|
sl@0
|
1303 |
* included in the UnicodeSet being constructed.
*
* @param codePoint the code point being tested
* @param context   untyped caller-supplied pointer handed to the filter
*                  (presumably the context argument given to
*                  applyFilter() -- confirm)
* @return TRUE if codePoint should be included
|
sl@0
|
1304 |
*/
|
sl@0
|
1305 |
typedef UBool (*Filter)(UChar32 codePoint, void* context);
|
sl@0
|
1306 |
|
sl@0
|
1307 |
/**
|
sl@0
|
1308 |
* Given a filter, set this UnicodeSet to the code points
|
sl@0
|
1309 |
* contained by that filter. The filter MUST be
|
sl@0
|
1310 |
* property-conformant. That is, if it returns value v for one
|
sl@0
|
1311 |
* code point, then it must return v for all affiliated code
|
sl@0
|
1312 |
* points, as defined by the inclusions list. See
|
sl@0
|
1313 |
* getInclusions().
|
sl@0
|
1314 |
* src is a UPropertySource value.
*
* @param filter  the Filter callback deciding which code points belong
*                to this set
* @param context untyped pointer argument (presumably forwarded to each
*                filter call as its context parameter -- confirm)
* @param src     a UPropertySource value (presumably selects which
*                inclusions list is consulted -- confirm)
* @param status  error code argument (presumably in/out status; confirm)
|
sl@0
|
1315 |
*/
|
sl@0
|
1316 |
void applyFilter(Filter filter,
|
sl@0
|
1317 |
void* context,
|
sl@0
|
1318 |
int32_t src,
|
sl@0
|
1319 |
UErrorCode &status);
|
sl@0
|
1320 |
|
sl@0
|
1321 |
/**
|
sl@0
|
1322 |
* Return a cached copy of the inclusions list for the property source.
*
* @param src       identifies the property source (presumably a
*                  UPropertySource value, as for applyFilter() -- confirm)
* @param errorCode error code argument (presumably in/out status; confirm)
* @return pointer to the cached, read-only inclusions set
*         NOTE(review): being cached, it is presumably not owned by the
*         caller and must not be deleted -- confirm.
|
sl@0
|
1323 |
*/
|
sl@0
|
1324 |
static const UnicodeSet* getInclusions(int32_t src, UErrorCode &errorCode);
|
sl@0
|
1325 |
|
sl@0
|
1326 |
friend class UnicodeSetIterator;
|
sl@0
|
1327 |
};
|
sl@0
|
1328 |
|
sl@0
|
1329 |
/**
 * Inequality test, defined as the logical negation of operator==.
 * @param o the set to compare against
 * @return the negation of whatever operator==(o) returns
 */
inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
    const UBool isEqual = this->operator==(o);
    return !isEqual;
}
|
sl@0
|
1332 |
|
sl@0
|
1333 |
/**
 * Range form of containsSome, defined as the logical negation of
 * containsNone(start, end).
 * @param start first code point of the range
 * @param end   last code point of the range
 * @return the negation of containsNone(start, end)
 */
inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
    const UBool none = containsNone(start, end);
    return !none;
}
|
sl@0
|
1336 |
|
sl@0
|
1337 |
/**
 * Set form of containsSome, defined as the logical negation of
 * containsNone(s).
 * @param s the set to test against
 * @return the negation of containsNone(s)
 */
inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
    const UBool none = containsNone(s);
    return !none;
}
|
sl@0
|
1340 |
|
sl@0
|
1341 |
/**
 * String form of containsSome, defined as the logical negation of
 * containsNone(s).
 * @param s the string to test against
 * @return the negation of containsNone(s)
 */
inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
    const UBool none = containsNone(s);
    return !none;
}
|
sl@0
|
1344 |
|
sl@0
|
1345 |
U_NAMESPACE_END
|
sl@0
|
1346 |
|
sl@0
|
1347 |
#endif
|