1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/uniset.h Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,1347 @@
1.4 +/*
1.5 +***************************************************************************
1.6 +* Copyright (C) 1999-2005, International Business Machines Corporation
1.7 +* and others. All Rights Reserved.
1.8 +***************************************************************************
1.9 +* Date Name Description
1.10 +* 10/20/99 alan Creation.
1.11 +***************************************************************************
1.12 +*/
1.13 +
1.14 +#ifndef UNICODESET_H
1.15 +#define UNICODESET_H
1.16 +
1.17 +#include "unicode/unifilt.h"
1.18 +#include "unicode/unistr.h"
1.19 +#include "unicode/uset.h"
1.20 +
1.21 +/**
1.22 + * \file
1.23 + * \brief C++ API: Unicode Set
1.24 + */
1.25 +
1.26 +U_NAMESPACE_BEGIN
1.27 +
1.28 +class ParsePosition;
1.29 +class SymbolTable;
1.30 +class UVector;
1.31 +class RuleCharacterIterator;
1.32 +
1.33 +/**
1.34 + * A mutable set of Unicode characters and multicharacter strings. Objects of this class
1.35 + * represent <em>character classes</em> used in regular expressions.
1.36 + * A character class specifies a subset of Unicode code points. Legal
1.37 + * code points are U+0000 to U+10FFFF, inclusive.
1.38 + *
1.39 + * <p>The UnicodeSet class is not designed to be subclassed.
1.40 + *
1.41 + * <p><code>UnicodeSet</code> supports two APIs. The first is the
1.42 + * <em>operand</em> API that allows the caller to modify the value of
1.43 + * a <code>UnicodeSet</code> object. It conforms to Java 2's
1.44 + * <code>java.util.Set</code> interface, although
1.45 + * <code>UnicodeSet</code> does not actually implement that
1.46 + * interface. All methods of <code>Set</code> are supported, with the
1.47 + * modification that they take a character range or single character
1.48 + * instead of an <code>Object</code>, and they take a
1.49 + * <code>UnicodeSet</code> instead of a <code>Collection</code>. The
1.50 + * operand API may be thought of in terms of boolean logic: a boolean
1.51 + * OR is implemented by <code>add</code>, a boolean AND is implemented
1.52 + * by <code>retain</code>, a boolean XOR is implemented by
1.53 + * <code>complement</code> taking an argument, and a boolean NOT is
1.54 + * implemented by <code>complement</code> with no argument. In terms
1.55 + * of traditional set theory function names, <code>add</code> is a
1.56 + * union, <code>retain</code> is an intersection, <code>remove</code>
1.57 + * is an asymmetric difference, and <code>complement</code> with no
1.58 + * argument is a set complement with respect to the superset range
1.59 + * <code>MIN_VALUE-MAX_VALUE</code>.
1.60 + *
1.61 + * <p>The second API is the
1.62 + * <code>applyPattern()</code>/<code>toPattern()</code> API from the
1.63 + * <code>java.text.Format</code>-derived classes. Unlike the
1.64 + * methods that add characters, add categories, and control the logic
1.65 + * of the set, the method <code>applyPattern()</code> sets all
1.66 + * attributes of a <code>UnicodeSet</code> at once, based on a
1.67 + * string pattern.
1.68 + *
1.69 + * <p><b>Pattern syntax</b></p>
1.70 + *
1.71 + * Patterns are accepted by the constructors and the
1.72 + * <code>applyPattern()</code> methods and returned by the
1.73 + * <code>toPattern()</code> method. These patterns follow a syntax
1.74 + * similar to that employed by version 8 regular expression character
1.75 + * classes. Here are some simple examples:
1.76 + *
1.77 + * \htmlonly<blockquote>\endhtmlonly
1.78 + * <table>
1.79 + * <tr align="top">
1.80 + * <td nowrap valign="top" align="left"><code>[]</code></td>
1.81 + * <td valign="top">No characters</td>
1.82 + * </tr><tr align="top">
1.83 + * <td nowrap valign="top" align="left"><code>[a]</code></td>
1.84 + * <td valign="top">The character 'a'</td>
1.85 + * </tr><tr align="top">
1.86 + * <td nowrap valign="top" align="left"><code>[ae]</code></td>
1.87 + * <td valign="top">The characters 'a' and 'e'</td>
1.88 + * </tr>
1.89 + * <tr>
1.90 + * <td nowrap valign="top" align="left"><code>[a-e]</code></td>
1.91 + * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
1.92 + * point order</td>
1.93 + * </tr>
1.94 + * <tr>
1.95 + * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
1.96 + * <td valign="top">The character U+4E01</td>
1.97 + * </tr>
1.98 + * <tr>
1.99 + * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
1.100 + * <td valign="top">The character 'a' and the multicharacter strings "ab" and
1.101 + * "ac"</td>
1.102 + * </tr>
1.103 + * <tr>
1.104 + * <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
1.105 + * <td valign="top">All characters in the general category Uppercase Letter</td>
1.106 + * </tr>
1.107 + * </table>
1.108 + * \htmlonly</blockquote>\endhtmlonly
1.109 + *
1.110 + * Any character may be preceded by a backslash in order to remove any special
1.111 + * meaning. White space characters, as defined by UCharacter.isWhitespace(), are
1.112 + * ignored, unless they are escaped.
1.113 + *
1.114 + * <p>Property patterns specify a set of characters having a certain
1.115 + * property as defined by the Unicode standard. Both the POSIX-like
1.116 + * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a
1.117 + * complete list of supported property patterns, see the User's Guide
1.118 + * for UnicodeSet at
1.119 + * <a href="http://icu.sourceforge.net/userguide/unicodeSet.html">
1.120 + * http://icu.sourceforge.net/userguide/unicodeSet.html</a>.
1.121 + * Actual determination of property data is defined by the underlying
1.122 + * Unicode database as implemented by UCharacter.
1.123 + *
1.124 + * <p>Patterns specify individual characters, ranges of characters, and
1.125 + * Unicode property sets. When elements are concatenated, they
1.126 + * specify their union. To complement a set, place a '^' immediately
1.127 + * after the opening '['. Property patterns are inverted by modifying
1.128 + * their delimiters; "[:^foo:]" and "\\P{foo}". In any other location,
1.129 + * '^' has no special meaning.
1.130 + *
1.131 + * <p>Ranges are indicated by placing a '-' between two
1.132 + * characters, as in "a-z". This specifies the range of all
1.133 + * characters from the left to the right, in Unicode order. If the
1.134 + * left character is greater than or equal to the
1.135 + * right character it is a syntax error. If a '-' occurs as the first
1.136 + * character after the opening '[' or '[^', or if it occurs as the
1.137 + * last character before the closing ']', then it is taken as a
1.138 + * literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
1.139 + * set of three characters, 'a', 'b', and '-'.
1.140 + *
1.141 + * <p>Sets may be intersected using the '&' operator or the asymmetric
1.142 + * set difference may be taken using the '-' operator, for example,
1.143 + * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
1.144 + * with values less than 4096. Operators ('&' and '-') have equal
1.145 + * precedence and bind left-to-right. Thus
1.146 + * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
1.147 + * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
1.148 + * difference; intersection is commutative.
1.149 + *
1.150 + * <table>
1.151 + * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
1.152 + * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
1.153 + * through 'z' and all letters in between, in Unicode order
1.154 + * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
1.155 + * all characters but 'a' through 'z',
1.156 + * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
1.157 + * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
1.158 + * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
1.159 + * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
1.160 + * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
1.161 + * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
1.162 + * <td>The asymmetric difference of sets specified by <em>pat1</em> and
1.163 + * <em>pat2</em>
1.164 + * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
1.165 + * <td>The set of characters having the specified
1.166 + * Unicode property; in
1.167 + * this case, Unicode uppercase letters
1.168 + * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
1.169 + * <td>The set of characters <em>not</em> having the given
1.170 + * Unicode property
1.171 + * </table>
1.172 + *
1.173 + * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
1.174 + *
1.175 + * <p><b>Formal syntax</b></p>
1.176 + *
1.177 + * \htmlonly<blockquote>\endhtmlonly
1.178 + * <table>
1.179 + * <tr align="top">
1.180 + * <td nowrap valign="top" align="right"><code>pattern := </code></td>
1.181 + * <td valign="top"><code>('[' '^'? item* ']') |
1.182 + * property</code></td>
1.183 + * </tr>
1.184 + * <tr align="top">
1.185 + * <td nowrap valign="top" align="right"><code>item := </code></td>
1.186 + * <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
1.187 + * </code></td>
1.188 + * </tr>
1.189 + * <tr align="top">
1.190 + * <td nowrap valign="top" align="right"><code>pattern-expr := </code></td>
1.191 + * <td valign="top"><code>pattern | pattern-expr pattern |
1.192 + * pattern-expr op pattern<br>
1.193 + * </code></td>
1.194 + * </tr>
1.195 + * <tr align="top">
1.196 + * <td nowrap valign="top" align="right"><code>op := </code></td>
1.197 + * <td valign="top"><code>'&' | '-'<br>
1.198 + * </code></td>
1.199 + * </tr>
1.200 + * <tr align="top">
1.201 + * <td nowrap valign="top" align="right"><code>special := </code></td>
1.202 + * <td valign="top"><code>'[' | ']' | '-'<br>
1.203 + * </code></td>
1.204 + * </tr>
1.205 + * <tr align="top">
1.206 + * <td nowrap valign="top" align="right"><code>char := </code></td>
1.207 + * <td valign="top"><em>any character that is not</em><code> special<br>
1.208 + * | ('\' </code><em>any character</em><code>)<br>
1.209 + * | ('\\u' hex hex hex hex)<br>
1.210 + * </code></td>
1.211 + * </tr>
1.212 + * <tr align="top">
1.213 + * <td nowrap valign="top" align="right"><code>hex := </code></td>
1.214 + * <td valign="top"><em>any character for which
1.215 + * </em><code>Character.digit(c, 16)</code><em>
1.216 + * returns a non-negative result</em></td>
1.217 + * </tr>
1.218 + * <tr>
1.219 + * <td nowrap valign="top" align="right"><code>property := </code></td>
1.220 + * <td valign="top"><em>a Unicode property set pattern</em></td>
1.221 + * </tr>
1.222 + * </table>
1.223 + * <br>
1.224 + * <table border="1">
1.225 + * <tr>
1.226 + * <td>Legend: <table>
1.227 + * <tr>
1.228 + * <td nowrap valign="top"><code>a := b</code></td>
1.229 + * <td width="20" valign="top"> </td>
1.230 + * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
1.231 + * </tr>
1.232 + * <tr>
1.233 + * <td nowrap valign="top"><code>a?</code></td>
1.234 + * <td valign="top"></td>
1.235 + * <td valign="top">zero or one instance of <code>a</code><br>
1.236 + * </td>
1.237 + * </tr>
1.238 + * <tr>
1.239 + * <td nowrap valign="top"><code>a*</code></td>
1.240 + * <td valign="top"></td>
1.241 + * <td valign="top">zero or more instances of <code>a</code><br>
1.242 + * </td>
1.243 + * </tr>
1.244 + * <tr>
1.245 + * <td nowrap valign="top"><code>a | b</code></td>
1.246 + * <td valign="top"></td>
1.247 + * <td valign="top">either <code>a</code> or <code>b</code><br>
1.248 + * </td>
1.249 + * </tr>
1.250 + * <tr>
1.251 + * <td nowrap valign="top"><code>'a'</code></td>
1.252 + * <td valign="top"></td>
1.253 + * <td valign="top">the literal string between the quotes </td>
1.254 + * </tr>
1.255 + * </table>
1.256 + * </td>
1.257 + * </tr>
1.258 + * </table>
1.259 + * \htmlonly</blockquote>\endhtmlonly
1.260 + *
1.261 + * @author Alan Liu
1.262 + * @stable ICU 2.0
1.263 + */
1.264 +class U_COMMON_API UnicodeSet : public UnicodeFilter {
1.265 +
1.266 + int32_t len; // length of list used; 0 <= len <= capacity
1.267 + int32_t capacity; // capacity of list
1.268 + int32_t bufferCapacity; // capacity of buffer
1.269 + UChar32* list; // MUST be terminated with HIGH
1.270 + UChar32* buffer; // internal buffer, may be NULL
1.271 +
1.272 + UVector* strings; // maintained in sorted order
1.273 +
1.274 + /**
1.275 + * The pattern representation of this set. This may not be the
1.276 + * most economical pattern. It is the pattern supplied to
1.277 + * applyPattern(), with variables substituted and whitespace
1.278 + * removed. For sets constructed without applyPattern(), or
1.279 + * modified using the non-pattern API, this string will be empty,
1.280 + * indicating that toPattern() must generate a pattern
1.281 + * representation from the inversion list.
1.282 + */
1.283 + UnicodeString pat;
1.284 +
1.285 +public:
1.286 +
1.287 + enum {
1.288 + /**
1.289 + * Minimum value that can be stored in a UnicodeSet.
1.290 + * @stable ICU 2.4
1.291 + */
1.292 + MIN_VALUE = 0,
1.293 +
1.294 + /**
1.295 + * Maximum value that can be stored in a UnicodeSet.
1.296 + * @stable ICU 2.4
1.297 + */
1.298 + MAX_VALUE = 0x10ffff
1.299 + };
1.300 +
1.301 + //----------------------------------------------------------------
1.302 + // Constructors &c
1.303 + //----------------------------------------------------------------
1.304 +
1.305 +public:
1.306 +
1.307 + /**
1.308 + * Constructs an empty set.
1.309 + * @stable ICU 2.0
1.310 + */
1.311 + UnicodeSet();
1.312 +
1.313 + /**
1.314 + * Constructs a set containing the given range. If <code>start >
1.315 + * end</code> then an empty set is created.
1.316 + *
1.317 + * @param start first character, inclusive, of range
1.318 + * @param end last character, inclusive, of range
1.319 + * @stable ICU 2.4
1.320 + */
1.321 + UnicodeSet(UChar32 start, UChar32 end);
1.322 +
1.323 + /**
1.324 + * Constructs a set from the given pattern. See the class
1.325 + * description for the syntax of the pattern language.
1.326 + * @param pattern a string specifying what characters are in the set
1.327 + * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
1.328 + * contains a syntax error.
1.329 + * @stable ICU 2.0
1.330 + */
1.331 + UnicodeSet(const UnicodeString& pattern,
1.332 + UErrorCode& status);
1.333 +
1.334 + /**
1.335 + * Constructs a set from the given pattern. See the class
1.336 + * description for the syntax of the pattern language.
1.337 + * @param pattern a string specifying what characters are in the set
1.338 + * @param options bitmask for options to apply to the pattern.
1.339 + * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
1.340 + * @param symbols a symbol table mapping variable names to values
1.341 + * and stand-in characters to UnicodeSets; may be NULL
1.342 + * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
1.343 + * contains a syntax error.
1.344 + * @internal
1.345 + */
1.346 + UnicodeSet(const UnicodeString& pattern,
1.347 + uint32_t options,
1.348 + const SymbolTable* symbols,
1.349 + UErrorCode& status);
1.350 +
1.351 + /**
1.352 + * Constructs a set from the given pattern. See the class description
1.353 + * for the syntax of the pattern language.
1.354 + * @param pattern a string specifying what characters are in the set
1.355 + * @param pos on input, the position in pattern at which to start parsing.
1.356 + * On output, the position after the last character parsed.
1.357 + * @param options bitmask for options to apply to the pattern.
1.358 + * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
1.359 + * @param symbols a symbol table mapping variable names to values
1.360 + * and stand-in characters to UnicodeSets; may be NULL
1.361 + * @param status input-output error code
1.362 + * @stable ICU 2.8
1.363 + */
1.364 + UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
1.365 + uint32_t options,
1.366 + const SymbolTable* symbols,
1.367 + UErrorCode& status);
1.368 +
1.369 +#ifdef U_USE_UNICODESET_DEPRECATES
1.370 + /**
1.371 + * Obsolete: Constructs a set from the given Unicode character category.
1.372 + * @param category an integer indicating the character category as
1.373 + * defined in uchar.h.
1.374 + * @obsolete ICU 2.6. Use a pattern with the category instead since this API will be removed in that release.
1.375 + */
1.376 + UnicodeSet(int8_t category, UErrorCode& status);
1.377 +#endif
1.378 +
1.379 + /**
1.380 + * Constructs a set that is identical to the given UnicodeSet.
1.381 + * @stable ICU 2.0
1.382 + */
1.383 + UnicodeSet(const UnicodeSet& o);
1.384 +
1.385 + /**
1.386 + * Destructs the set.
1.387 + * @stable ICU 2.0
1.388 + */
1.389 + virtual ~UnicodeSet();
1.390 +
1.391 + /**
1.392 + * Assigns this object to be a copy of another.
1.393 + * @stable ICU 2.0
1.394 + */
1.395 + UnicodeSet& operator=(const UnicodeSet& o);
1.396 +
1.397 + /**
1.398 + * Compares the specified object with this set for equality. Returns
1.399 + * <tt>true</tt> if the two sets
1.400 + * have the same size, and every member of the specified set is
1.401 + * contained in this set (or equivalently, every member of this set is
1.402 + * contained in the specified set).
1.403 + *
1.404 + * @param o set to be compared for equality with this set.
1.405 + * @return <tt>true</tt> if the specified set is equal to this set.
1.406 + * @stable ICU 2.0
1.407 + */
1.408 + virtual UBool operator==(const UnicodeSet& o) const;
1.409 +
1.410 + /**
1.411 + * Compares the specified object with this set for inequality. Returns
1.412 + * <tt>true</tt> if the specified set is not equal to this set.
1.413 + * @stable ICU 2.0
1.414 + */
1.415 + UBool operator!=(const UnicodeSet& o) const;
1.416 +
1.417 + /**
1.418 + * Returns a copy of this object. All UnicodeFunctor objects have
1.419 + * to support cloning in order to allow classes using
1.420 + * UnicodeFunctors, such as Transliterator, to implement cloning.
1.421 + * @stable ICU 2.0
1.422 + */
1.423 + virtual UnicodeFunctor* clone() const;
1.424 +
1.425 + /**
1.426 + * Returns the hash code value for this set.
1.427 + *
1.428 + * @return the hash code value for this set.
1.429 + * @see Object#hashCode()
1.430 + * @stable ICU 2.0
1.431 + */
1.432 + virtual int32_t hashCode(void) const;
1.433 +
1.434 + //----------------------------------------------------------------
1.435 + // Public API
1.436 + //----------------------------------------------------------------
1.437 +
1.438 + /**
1.439 + * Make this object represent the range <code>start - end</code>.
1.440 + * If <code>start > end</code> then this object is set to
1.441 + * an empty range.
1.442 + *
1.443 + * @param start first character in the set, inclusive
1.444 + * @param end last character in the set, inclusive
1.445 + * @stable ICU 2.4
1.446 + */
1.447 + UnicodeSet& set(UChar32 start, UChar32 end);
1.448 +
1.449 + /**
1.450 + * Return true if the given position, in the given pattern, appears
1.451 + * to be the start of a UnicodeSet pattern.
1.452 + * @stable ICU 2.4
1.453 + */
1.454 + static UBool resemblesPattern(const UnicodeString& pattern,
1.455 + int32_t pos);
1.456 +
1.457 + /**
1.458 + * Modifies this set to represent the set specified by the given
1.459 + * pattern, optionally ignoring white space. See the class
1.460 + * description for the syntax of the pattern language.
1.461 + * @param pattern a string specifying what characters are in the set
1.462 + * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
1.463 + * contains a syntax error.
1.464 + * <em> Empties the set passed before applying the pattern.</em>
1.465 + * @return a reference to this
1.466 + * @stable ICU 2.0
1.467 + */
1.468 + UnicodeSet& applyPattern(const UnicodeString& pattern,
1.469 + UErrorCode& status);
1.470 +
1.471 + /**
1.472 + * Modifies this set to represent the set specified by the given
1.473 + * pattern, optionally ignoring white space. See the class
1.474 + * description for the syntax of the pattern language.
1.475 + * @param pattern a string specifying what characters are in the set
1.476 + * @param options bitmask for options to apply to the pattern.
1.477 + * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
1.478 + * @param symbols a symbol table mapping variable names to
1.479 + * values and stand-ins to UnicodeSets; may be NULL
1.480 + * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
1.481 + * contains a syntax error.
1.482 + *<em> Empties the set passed before applying the pattern.</em>
1.483 + * @return a reference to this
1.484 + * @internal
1.485 + */
1.486 + UnicodeSet& applyPattern(const UnicodeString& pattern,
1.487 + uint32_t options,
1.488 + const SymbolTable* symbols,
1.489 + UErrorCode& status);
1.490 +
1.491 + /**
1.492 + * Parses the given pattern, starting at the given position. The
1.493 + * character at pattern.charAt(pos.getIndex()) must be '[', or the
1.494 + * parse fails. Parsing continues until the corresponding closing
1.495 + * ']'. If a syntax error is encountered between the opening and
1.496 + * closing bracket, the parse fails. Upon return from a successful
1.497 + * parse, the ParsePosition is updated to point to the character
1.498 + * following the closing ']', and a StringBuffer containing a
1.499 + * pairs list for the parsed pattern is returned. This method calls
1.500 + * itself recursively to parse embedded subpatterns.
1.501 + *<em> Empties the set passed before applying the pattern.</em>
1.502 + *
1.503 + * @param pattern the string containing the pattern to be parsed.
1.504 + * The portion of the string from pos.getIndex(), which must be a
1.505 + * '[', to the corresponding closing ']', is parsed.
1.506 + * @param pos upon entry, the position at which to begin parsing.
1.507 + * The character at pattern.charAt(pos.getIndex()) must be a '['.
1.508 + * Upon return from a successful parse, pos.getIndex() is either
1.509 + * the character after the closing ']' of the parsed pattern, or
1.510 + * pattern.length() if the closing ']' is the last character of
1.511 + * the pattern string.
1.512 + * @param options bitmask for options to apply to the pattern.
1.513 + * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
1.514 + * @param symbols a symbol table mapping variable names to
1.515 + * values and stand-ins to UnicodeSets; may be NULL
1.516 + * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
1.517 + * contains a syntax error.
1.518 + * @return a reference to this
1.519 + * @stable ICU 2.8
1.520 + */
1.521 + UnicodeSet& applyPattern(const UnicodeString& pattern,
1.522 + ParsePosition& pos,
1.523 + uint32_t options,
1.524 + const SymbolTable* symbols,
1.525 + UErrorCode& status);
1.526 +
1.527 + /**
1.528 + * Returns a string representation of this set. If the result of
1.529 + * calling this function is passed to a UnicodeSet constructor, it
1.530 + * will produce another set that is equal to this one.
1.531 + * @param result the string to receive the rules. Previous
1.532 + * contents will be deleted.
1.533 + * @param escapeUnprintable if TRUE then convert unprintable
1.534 + * character to their hex escape representations, \\uxxxx or
1.535 + * \\Uxxxxxxxx. Unprintable characters are those other than
1.536 + * U+000A, U+0020..U+007E.
1.537 + * @stable ICU 2.0
1.538 + */
1.539 + virtual UnicodeString& toPattern(UnicodeString& result,
1.540 + UBool escapeUnprintable = FALSE) const;
1.541 +
1.542 + /**
1.543 + * Modifies this set to contain those code points which have the given value
1.544 + * for the given binary or enumerated property, as returned by
1.545 + * u_getIntPropertyValue. Prior contents of this set are lost.
1.546 + *
1.547 + * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
1.548 + * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
1.549 + * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
1.550 + *
1.551 + * @param value a value in the range u_getIntPropertyMinValue(prop)..
1.552 + * u_getIntPropertyMaxValue(prop), with one exception. If prop is
1.553 + * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
1.554 + * rather a mask value produced by U_GET_GC_MASK(). This allows grouped
1.555 + * categories such as [:L:] to be represented.
1.556 + *
1.557 + * @param ec error code input/output parameter
1.558 + *
1.559 + * @return a reference to this set
1.560 + *
1.561 + * @stable ICU 2.4
1.562 + */
1.563 + UnicodeSet& applyIntPropertyValue(UProperty prop,
1.564 + int32_t value,
1.565 + UErrorCode& ec);
1.566 +
1.567 + /**
1.568 + * Modifies this set to contain those code points which have the
1.569 + * given value for the given property. Prior contents of this
1.570 + * set are lost.
1.571 + *
1.572 + * @param prop a property alias, either short or long. The name is matched
1.573 + * loosely. See PropertyAliases.txt for names and a description of loose
1.574 + * matching. If the value string is empty, then this string is interpreted
1.575 + * as either a General_Category value alias, a Script value alias, a binary
1.576 + * property alias, or a special ID. Special IDs are matched loosely and
1.577 + * correspond to the following sets:
1.578 + *
1.579 + * "ANY" = [\\u0000-\\U0010FFFF],
1.580 + * "ASCII" = [\\u0000-\\u007F],
1.581 + * "Assigned" = [:^Cn:].
1.582 + *
1.583 + * @param value a value alias, either short or long. The name is matched
1.584 + * loosely. See PropertyValueAliases.txt for names and a description of
1.585 + * loose matching. In addition to aliases listed, numeric values and
1.586 + * canonical combining classes may be expressed numerically, e.g., ("nv",
1.587 + * "0.5") or ("ccc", "220"). The value string may also be empty.
1.588 + *
1.589 + * @param ec error code input/output parameter
1.590 + *
1.591 + * @return a reference to this set
1.592 + *
1.593 + * @stable ICU 2.4
1.594 + */
1.595 + UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
1.596 + const UnicodeString& value,
1.597 + UErrorCode& ec);
1.598 +
1.599 + /**
1.600 + * Returns the number of elements in this set (its cardinality).
1.601 + * Note that the elements of a set may include both individual
1.602 + * codepoints and strings.
1.603 + *
1.604 + * @return the number of elements in this set (its cardinality).
1.605 + * @stable ICU 2.0
1.606 + */
1.607 + virtual int32_t size(void) const;
1.608 +
1.609 + /**
1.610 + * Returns <tt>true</tt> if this set contains no elements.
1.611 + *
1.612 + * @return <tt>true</tt> if this set contains no elements.
1.613 + * @stable ICU 2.0
1.614 + */
1.615 + virtual UBool isEmpty(void) const;
1.616 +
1.617 + /**
1.618 + * Returns true if this set contains the given character.
1.619 + * @param c character to be checked for containment
1.620 + * @return true if the test condition is met
1.621 + * @stable ICU 2.0
1.622 + */
1.623 + virtual UBool contains(UChar32 c) const;
1.624 +
1.625 + /**
1.626 + * Returns true if this set contains every character
1.627 + * of the given range.
1.628 + * @param start first character, inclusive, of the range
1.629 + * @param end last character, inclusive, of the range
1.630 + * @return true if the test condition is met
1.631 + * @stable ICU 2.0
1.632 + */
1.633 + virtual UBool contains(UChar32 start, UChar32 end) const;
1.634 +
1.635 + /**
1.636 + * Returns <tt>true</tt> if this set contains the given
1.637 + * multicharacter string.
1.638 + * @param s string to be checked for containment
1.639 + * @return <tt>true</tt> if this set contains the specified string
1.640 + * @stable ICU 2.4
1.641 + */
1.642 + UBool contains(const UnicodeString& s) const;
1.643 +
1.644 + /**
1.645 + * Returns true if this set contains all the characters and strings
1.646 + * of the given set.
1.647 + * @param c set to be checked for containment
1.648 + * @return true if the test condition is met
1.649 + * @stable ICU 2.4
1.650 + */
1.651 + virtual UBool containsAll(const UnicodeSet& c) const;
1.652 +
1.653 + /**
1.654 + * Returns true if this set contains all the characters
1.655 + * of the given string.
1.656 + * @param s string containing characters to be checked for containment
1.657 + * @return true if the test condition is met
1.658 + * @stable ICU 2.4
1.659 + */
1.660 + UBool containsAll(const UnicodeString& s) const;
1.661 +
1.662 + /**
1.663 + * Returns true if this set contains none of the characters
1.664 + * of the given range.
1.665 + * @param start first character, inclusive, of the range
1.666 + * @param end last character, inclusive, of the range
1.667 + * @return true if the test condition is met
1.668 + * @stable ICU 2.4
1.669 + */
1.670 + UBool containsNone(UChar32 start, UChar32 end) const;
1.671 +
1.672 + /**
1.673 + * Returns true if this set contains none of the characters and strings
1.674 + * of the given set.
1.675 + * @param c set to be checked for containment
1.676 + * @return true if the test condition is met
1.677 + * @stable ICU 2.4
1.678 + */
1.679 + UBool containsNone(const UnicodeSet& c) const;
1.680 +
1.681 + /**
1.682 + * Returns true if this set contains none of the characters
1.683 + * of the given string.
1.684 + * @param s string containing characters to be checked for containment
1.685 + * @return true if the test condition is met
1.686 + * @stable ICU 2.4
1.687 + */
1.688 + UBool containsNone(const UnicodeString& s) const;
1.689 +
1.690 + /**
1.691 + * Returns true if this set contains one or more of the characters
1.692 + * in the given range.
1.693 + * @param start first character, inclusive, of the range
1.694 + * @param end last character, inclusive, of the range
1.695 + * @return true if the condition is met
1.696 + * @stable ICU 2.4
1.697 + */
1.698 + inline UBool containsSome(UChar32 start, UChar32 end) const;
1.699 +
1.700 + /**
1.701 + * Returns true if this set contains one or more of the characters
1.702 + * and strings of the given set.
1.703 + * @param s The set to be checked for containment
1.704 + * @return true if the condition is met
1.705 + * @stable ICU 2.4
1.706 + */
1.707 + inline UBool containsSome(const UnicodeSet& s) const;
1.708 +
1.709 + /**
1.710 + * Returns true if this set contains one or more of the characters
1.711 + * of the given string.
1.712 + * @param s string containing characters to be checked for containment
1.713 + * @return true if the condition is met
1.714 + * @stable ICU 2.4
1.715 + */
1.716 + inline UBool containsSome(const UnicodeString& s) const;
1.717 +
1.718 + /**
1.719 + * Implement UnicodeMatcher::matches()
1.720 + * @stable ICU 2.4
1.721 + */
1.722 + virtual UMatchDegree matches(const Replaceable& text,
1.723 + int32_t& offset,
1.724 + int32_t limit,
1.725 + UBool incremental);
1.726 +
1.727 +private:
1.728 + /**
1.729 + * Returns the longest match for s in text at the given position.
1.730 + * If limit > start then match forward from start+1 to limit
1.731 + * matching all characters except s.charAt(0). If limit < start,
1.732 + * go backward starting from start-1 matching all characters
1.733 + * except s.charAt(s.length()-1). This method assumes that the
1.734 + * first character, text.charAt(start), matches s, so it does not
1.735 + * check it.
1.736 + * @param text the text to match
1.737 + * @param start the first character to match. In the forward
1.738 + * direction, text.charAt(start) is matched against s.charAt(0).
1.739 + * In the reverse direction, it is matched against
1.740 + * s.charAt(s.length()-1).
1.741 + * @param limit the limit offset for matching, either last+1 in
1.742 + * the forward direction, or last-1 in the reverse direction,
1.743 + * where last is the index of the last character to match.
1.744 + * @return If part of s matches up to the limit, return |limit -
1.745 + * start|. If all of s matches before reaching the limit, return
1.746 + * s.length(). If there is a mismatch between s and text, return
1.747 + * 0
1.748 + */
1.749 + static int32_t matchRest(const Replaceable& text,
1.750 + int32_t start, int32_t limit,
1.751 + const UnicodeString& s);
1.752 +
1.753 + /**
1.754 + * Returns the smallest value i such that c < list[i]. Caller
1.755 + * must ensure that c is a legal value or this method will enter
1.756 + * an infinite loop. This method performs a binary search.
1.757 + * @param c a character in the range MIN_VALUE..MAX_VALUE
1.758 + * inclusive
1.759 + * @return the smallest integer i in the range 0..len-1,
1.760 + * inclusive, such that c < list[i]
1.761 + */
1.762 + int32_t findCodePoint(UChar32 c) const;
1.763 +
1.764 +public:
1.765 +
1.766 + /**
1.767 + * Implementation of UnicodeMatcher API. Union the set of all
1.768 + * characters that may be matched by this object into the given
1.769 + * set.
1.770 + * @param toUnionTo the set into which to union the source characters
1.771 + * @stable ICU 2.4
1.772 + */
1.773 + virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
1.774 +
1.775 + /**
1.776 + * Returns the index of the given character within this set, where
1.777 + * the set is ordered by ascending code point. If the character
1.778 + * is not in this set, return -1. The inverse of this method is
1.779 + * <code>charAt()</code>.
1.780 + * @return an index from 0..size()-1, or -1
1.781 + * @stable ICU 2.4
1.782 + */
1.783 + int32_t indexOf(UChar32 c) const;
1.784 +
1.785 + /**
1.786 + * Returns the character at the given index within this set, where
1.787 + * the set is ordered by ascending code point. If the index is
1.788 + * out of range, return (UChar32)-1. The inverse of this method is
1.789 + * <code>indexOf()</code>.
1.790 + * @param index an index from 0..size()-1
1.791 + * @return the character at the given index, or (UChar32)-1.
1.792 + * @stable ICU 2.4
1.793 + */
1.794 + UChar32 charAt(int32_t index) const;
1.795 +
1.796 + /**
1.797 + * Adds the specified range to this set if it is not already
1.798 + * present. If this set already contains the specified range,
1.799 + * the call leaves this set unchanged. If <code>start > end</code>
1.800 + * then an empty range is added, leaving the set unchanged.
1.801 + * This is equivalent to a boolean logic OR, or a set UNION.
1.802 + *
1.803 + * @param start first character, inclusive, of range to be added
1.804 + * to this set.
1.805 + * @param end last character, inclusive, of range to be added
1.806 + * to this set.
1.807 + * @stable ICU 2.0
1.808 + */
1.809 + virtual UnicodeSet& add(UChar32 start, UChar32 end);
1.810 +
1.811 + /**
1.812 + * Adds the specified character to this set if it is not already
1.813 + * present. If this set already contains the specified character,
1.814 + * the call leaves this set unchanged.
1.815 + * @stable ICU 2.0
1.816 + */
1.817 + UnicodeSet& add(UChar32 c);
1.818 +
1.819 + /**
1.820 + * Adds the specified multicharacter to this set if it is not already
1.821 + * present. If this set already contains the multicharacter,
1.822 + * the call leaves this set unchanged.
1.823 + * Thus "ch" => {"ch"}
1.824 + * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1.825 + * @param s the source string
1.826 + * @return this object, for chaining
1.827 + * @stable ICU 2.4
1.828 + */
1.829 + UnicodeSet& add(const UnicodeString& s);
1.830 +
1.831 + private:
1.832 + /**
1.833 + * @return a code point IF the string consists of a single one.
1.834 + * otherwise returns -1.
1.835 + * @param s the string to test
1.836 + */
1.837 + static int32_t getSingleCP(const UnicodeString& s);
1.838 +
1.839 + void _add(const UnicodeString& s);
1.840 +
1.841 + public:
1.842 + /**
1.843 + * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1.844 + * If this set already contains any particular character, it has no effect on that character.
1.845 + * @param s the source string
1.846 + * @return this object, for chaining
1.847 + * @stable ICU 2.4
1.848 + */
1.849 + UnicodeSet& addAll(const UnicodeString& s);
1.850 +
1.851 + /**
1.852 + * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1.853 + * If this set already contains any particular character, it has no effect on that character.
1.854 + * @param s the source string
1.855 + * @return this object, for chaining
1.856 + * @stable ICU 2.4
1.857 + */
1.858 + UnicodeSet& retainAll(const UnicodeString& s);
1.859 +
1.860 + /**
1.861 + * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1.862 + * A character already in this set is removed; a character not in this set is added.
1.863 + * @param s the source string
1.864 + * @return this object, for chaining
1.865 + * @stable ICU 2.4
1.866 + */
1.867 + UnicodeSet& complementAll(const UnicodeString& s);
1.868 +
1.869 + /**
1.870 + * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1.871 + * If this set does not contain a particular character, it has no effect on that character.
1.872 + * @param s the source string
1.873 + * @return this object, for chaining
1.874 + * @stable ICU 2.4
1.875 + */
1.876 + UnicodeSet& removeAll(const UnicodeString& s);
1.877 +
1.878 + /**
1.879 + * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1.880 + * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1.881 + * @param s the source string
1.882 + * @return a newly created set containing the given string.
1.883 + * The caller owns the return object and is responsible for deleting it.
1.884 + * @stable ICU 2.4
1.885 + */
1.886 + static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1.887 +
1.888 +
1.889 + /**
1.890 + * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1.891 + * @param s the source string
1.892 + * @return a newly created set containing the given characters
1.893 + * The caller owns the return object and is responsible for deleting it.
1.894 + * @stable ICU 2.4
1.895 + */
1.896 + static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1.897 +
1.898 + /**
1.899 + * Retain only the elements in this set that are contained in the
1.900 + * specified range. If <code>start > end</code> then an empty range is
1.901 + * retained, leaving the set empty. This is equivalent to
1.902 + * a boolean logic AND, or a set INTERSECTION.
1.903 + *
1.904 + * @param start first character, inclusive, of range to be retained
1.905 + * in this set.
1.906 + * @param end last character, inclusive, of range to be retained
1.907 + * in this set.
1.908 + * @stable ICU 2.0
1.909 + */
1.910 + virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1.911 +
1.912 +
1.913 + /**
1.914 + * Retain the specified character from this set if it is present.
1.915 + * @stable ICU 2.0
1.916 + */
1.917 + UnicodeSet& retain(UChar32 c);
1.918 +
1.919 + /**
1.920 + * Removes the specified range from this set if it is present.
1.921 + * The set will not contain the specified range once the call
1.922 + * returns. If <code>start > end</code> then an empty range is
1.923 + * removed, leaving the set unchanged.
1.924 + *
1.925 + * @param start first character, inclusive, of range to be removed
1.926 + * from this set.
1.927 + * @param end last character, inclusive, of range to be removed
1.928 + * from this set.
1.929 + * @stable ICU 2.0
1.930 + */
1.931 + virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1.932 +
1.933 + /**
1.934 + * Removes the specified character from this set if it is present.
1.935 + * The set will not contain the specified character once the call
1.936 + * returns.
1.937 + * @stable ICU 2.0
1.938 + */
1.939 + UnicodeSet& remove(UChar32 c);
1.940 +
1.941 + /**
1.942 + * Removes the specified string from this set if it is present.
1.943 + * The set will not contain the specified string once the call
1.944 + * returns.
1.945 + * @param s the source string
1.946 + * @return this object, for chaining
1.947 + * @stable ICU 2.4
1.948 + */
1.949 + UnicodeSet& remove(const UnicodeString& s);
1.950 +
1.951 + /**
1.952 + * Inverts this set. This operation modifies this set so that
1.953 + * its value is its complement. This is equivalent to
1.954 + * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1.955 + * @stable ICU 2.0
1.956 + */
1.957 + virtual UnicodeSet& complement(void);
1.958 +
1.959 + /**
1.960 + * Complements the specified range in this set. Any character in
1.961 + * the range will be removed if it is in this set, or will be
1.962 + * added if it is not in this set. If <code>start > end</code>
1.963 + * then an empty range is complemented, leaving the set unchanged.
1.964 + * This is equivalent to a boolean logic XOR.
1.965 + *
1.966 + * @param start first character, inclusive, of range to be complemented
1.967 + * in this set.
1.968 + * @param end last character, inclusive, of range to be complemented
1.969 + * in this set.
1.970 + * @stable ICU 2.0
1.971 + */
1.972 + virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1.973 +
1.974 + /**
1.975 + * Complements the specified character in this set. The character
1.976 + * will be removed if it is in this set, or will be added if it is
1.977 + * not in this set.
1.978 + * @stable ICU 2.0
1.979 + */
1.980 + UnicodeSet& complement(UChar32 c);
1.981 +
1.982 + /**
1.983 + * Complement the specified string in this set.
1.984 + * The string will be removed if it is in this set, or will be
1.985 + * added if it is not in this set.
1.986 + * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1.987 + * @param s the string to complement
1.988 + * @return this object, for chaining
1.989 + * @stable ICU 2.4
1.990 + */
1.991 + UnicodeSet& complement(const UnicodeString& s);
1.992 +
1.993 + /**
1.994 + * Adds all of the elements in the specified set to this set if
1.995 + * they're not already present. This operation effectively
1.996 + * modifies this set so that its value is the <i>union</i> of the two
1.997 + * sets. The behavior of this operation is unspecified if the specified
1.998 + * collection is modified while the operation is in progress.
1.999 + *
1.1000 + * @param c set whose elements are to be added to this set.
1.1001 + * @see #add(UChar32, UChar32)
1.1002 + * @stable ICU 2.0
1.1003 + */
1.1004 + virtual UnicodeSet& addAll(const UnicodeSet& c);
1.1005 +
1.1006 + /**
1.1007 + * Retains only the elements in this set that are contained in the
1.1008 + * specified set. In other words, removes from this set all of
1.1009 + * its elements that are not contained in the specified set. This
1.1010 + * operation effectively modifies this set so that its value is
1.1011 + * the <i>intersection</i> of the two sets.
1.1012 + *
1.1013 + * @param c set that defines which elements this set will retain.
1.1014 + * @stable ICU 2.0
1.1015 + */
1.1016 + virtual UnicodeSet& retainAll(const UnicodeSet& c);
1.1017 +
1.1018 + /**
1.1019 + * Removes from this set all of its elements that are contained in the
1.1020 + * specified set. This operation effectively modifies this
1.1021 + * set so that its value is the <i>asymmetric set difference</i> of
1.1022 + * the two sets.
1.1023 + *
1.1024 + * @param c set that defines which elements will be removed from
1.1025 + * this set.
1.1026 + * @stable ICU 2.0
1.1027 + */
1.1028 + virtual UnicodeSet& removeAll(const UnicodeSet& c);
1.1029 +
1.1030 + /**
1.1031 + * Complements in this set all elements contained in the specified
1.1032 + * set. Any character in the other set will be removed if it is
1.1033 + * in this set, or will be added if it is not in this set.
1.1034 + *
1.1035 + * @param c set that defines which elements will be xor'ed from
1.1036 + * this set.
1.1037 + * @stable ICU 2.4
1.1038 + */
1.1039 + virtual UnicodeSet& complementAll(const UnicodeSet& c);
1.1040 +
1.1041 + /**
1.1042 + * Removes all of the elements from this set. This set will be
1.1043 + * empty after this call returns.
1.1044 + * @stable ICU 2.0
1.1045 + */
1.1046 + virtual UnicodeSet& clear(void);
1.1047 +
1.1048 + /**
1.1049 + * Close this set over the given attribute. For the attribute
1.1050 + * USET_CASE, the result is to modify this set so that:
1.1051 + *
1.1052 + * 1. For each character or string 'a' in this set, all strings or
1.1053 + * characters 'b' such that foldCase(a) == foldCase(b) are added
1.1054 + * to this set.
1.1055 + *
1.1056 + * 2. For each string 'e' in the resulting set, if e !=
1.1057 + * foldCase(e), 'e' will be removed.
1.1058 + *
1.1059 + * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
1.1060 + *
1.1061 + * (Here foldCase(x) refers to the operation u_strFoldCase, and a
1.1062 + * == b denotes that the contents are the same, not pointer
1.1063 + * comparison.)
1.1064 + *
1.1065 + * @param attribute bitmask for attributes to close over.
1.1066 + * Currently only the USET_CASE bit is supported. Any undefined bits
1.1067 + * are ignored.
1.1068 + * @return a reference to this set.
1.1069 + * @internal
1.1070 + */
1.1071 + UnicodeSet& closeOver(int32_t attribute);
1.1072 +
1.1073 + /**
1.1074 + * Iteration method that returns the number of ranges contained in
1.1075 + * this set.
1.1076 + * @see #getRangeStart
1.1077 + * @see #getRangeEnd
1.1078 + * @stable ICU 2.4
1.1079 + */
1.1080 + virtual int32_t getRangeCount(void) const;
1.1081 +
1.1082 + /**
1.1083 + * Iteration method that returns the first character in the
1.1084 + * specified range of this set.
1.1085 + * @see #getRangeCount
1.1086 + * @see #getRangeEnd
1.1087 + * @stable ICU 2.4
1.1088 + */
1.1089 + virtual UChar32 getRangeStart(int32_t index) const;
1.1090 +
1.1091 + /**
1.1092 + * Iteration method that returns the last character in the
1.1093 + * specified range of this set.
1.1094 + * @see #getRangeStart
1.1095 + * @see #getRangeCount
1.1096 + * @stable ICU 2.4
1.1097 + */
1.1098 + virtual UChar32 getRangeEnd(int32_t index) const;
1.1099 +
1.1100 + /**
1.1101 + * Serializes this set into an array of 16-bit integers. Serialization
1.1102 + * (currently) only records the characters in the set; multicharacter
1.1103 + * strings are ignored.
1.1104 + *
1.1105 + * The array has following format (each line is one 16-bit
1.1106 + * integer):
1.1107 + *
1.1108 + * length = (n+2*m) | (m!=0?0x8000:0)
1.1109 + * bmpLength = n; present if m!=0
1.1110 + * bmp[0]
1.1111 + * bmp[1]
1.1112 + * ...
1.1113 + * bmp[n-1]
1.1114 + * supp-high[0]
1.1115 + * supp-low[0]
1.1116 + * supp-high[1]
1.1117 + * supp-low[1]
1.1118 + * ...
1.1119 + * supp-high[m-1]
1.1120 + * supp-low[m-1]
1.1121 + *
1.1122 + * The array starts with a header. After the header are n bmp
1.1123 + * code points, then m supplementary code points. Either n or m
1.1124 + * or both may be zero. n+2*m is always <= 0x7FFF.
1.1125 + *
1.1126 + * If there are no supplementary characters (if m==0) then the
1.1127 + * header is one 16-bit integer, 'length', with value n.
1.1128 + *
1.1129 + * If there are supplementary characters (if m!=0) then the header
1.1130 + * is two 16-bit integers. The first, 'length', has value
1.1131 + * (n+2*m)|0x8000. The second, 'bmpLength', has value n.
1.1132 + *
1.1133 + * After the header the code points are stored in ascending order.
1.1134 + * Supplementary code points are stored as most significant 16
1.1135 + * bits followed by least significant 16 bits.
1.1136 + *
1.1137 + * @param dest pointer to buffer of destCapacity 16-bit integers.
1.1138 + * May be NULL only if destCapacity is zero.
1.1139 + * @param destCapacity size of dest, or zero. Must not be negative.
1.1140 + * @param ec error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR
1.1141 + * if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if
1.1142 + * n+2*m+(m!=0?2:1) > destCapacity.
1.1143 + * @return the total length of the serialized format, including
1.1144 + * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
1.1145 + * than U_BUFFER_OVERFLOW_ERROR.
1.1146 + * @stable ICU 2.4
1.1147 + */
1.1148 + int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1.1149 +
1.1150 + /**
1.1151 + * Reallocate this object's internal structures to take up the least
1.1152 + * possible space, without changing this object's value.
1.1153 + * @stable ICU 2.4
1.1154 + */
1.1155 + virtual UnicodeSet& compact();
1.1156 +
1.1157 + /**
1.1158 + * Return the class ID for this class. This is useful only for
1.1159 + * comparing to a return value from getDynamicClassID(). For example:
1.1160 + * <pre>
1.1161 + * . Base* polymorphic_pointer = createPolymorphicObject();
1.1162 + * . if (polymorphic_pointer->getDynamicClassID() ==
1.1163 + * . Derived::getStaticClassID()) ...
1.1164 + * </pre>
1.1165 + * @return The class ID for all objects of this class.
1.1166 + * @stable ICU 2.0
1.1167 + */
1.1168 + static UClassID U_EXPORT2 getStaticClassID(void);
1.1169 +
1.1170 + /**
1.1171 + * Implement UnicodeFunctor API.
1.1172 + *
1.1173 + * @return The class ID for this object. All objects of a given
1.1174 + * class have the same class ID. Objects of other classes have
1.1175 + * different class IDs.
1.1176 + * @stable ICU 2.4
1.1177 + */
1.1178 + virtual UClassID getDynamicClassID(void) const;
1.1179 +
1.1180 +private:
1.1181 +
1.1182 + // Private API for the USet API
1.1183 +
1.1184 + friend class USetAccess;
1.1185 +
1.1186 + int32_t getStringCount() const;
1.1187 +
1.1188 + const UnicodeString* getString(int32_t index) const;
1.1189 +
1.1190 + //----------------------------------------------------------------
1.1191 + // RuleBasedTransliterator support
1.1192 + //----------------------------------------------------------------
1.1193 +
1.1194 +private:
1.1195 +
1.1196 + /**
1.1197 + * Returns <tt>true</tt> if this set contains any character whose low byte
1.1198 + * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
1.1199 + * indexing.
1.1200 + */
1.1201 + virtual UBool matchesIndexValue(uint8_t v) const;
1.1202 +
1.1203 +private:
1.1204 +
1.1205 + //----------------------------------------------------------------
1.1206 + // Implementation: Pattern parsing
1.1207 + //----------------------------------------------------------------
1.1208 +
1.1209 + void applyPattern(RuleCharacterIterator& chars,
1.1210 + const SymbolTable* symbols,
1.1211 + UnicodeString& rebuiltPat,
1.1212 + uint32_t options,
1.1213 + UErrorCode& ec);
1.1214 +
1.1215 + //----------------------------------------------------------------
1.1216 + // Implementation: Utility methods
1.1217 + //----------------------------------------------------------------
1.1218 +
1.1219 + void ensureCapacity(int32_t newLen);
1.1220 +
1.1221 + void ensureBufferCapacity(int32_t newLen);
1.1222 +
1.1223 + void swapBuffers(void);
1.1224 +
1.1225 + UBool allocateStrings();
1.1226 +
1.1227 + UnicodeString& _toPattern(UnicodeString& result,
1.1228 + UBool escapeUnprintable) const;
1.1229 +
1.1230 + UnicodeString& _generatePattern(UnicodeString& result,
1.1231 + UBool escapeUnprintable) const;
1.1232 +
1.1233 + static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1.1234 +
1.1235 + static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1.1236 +
1.1237 + //----------------------------------------------------------------
1.1238 + // Implementation: Fundamental operators
1.1239 + //----------------------------------------------------------------
1.1240 +
1.1241 + void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1.1242 +
1.1243 + void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1.1244 +
1.1245 + void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1.1246 +
1.1247 + /**
1.1248 + * Return true if the given position, in the given pattern, appears
1.1249 + * to be the start of a property set pattern [:foo:], \\p{foo}, or
1.1250 + * \\P{foo}, or \\N{name}.
1.1251 + */
1.1252 + static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1.1253 + int32_t pos);
1.1254 +
1.1255 + static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1.1256 + int32_t iterOpts);
1.1257 +
1.1258 + /**
1.1259 + * Parse the given property pattern at the given parse position
1.1260 + * and set this UnicodeSet to the result.
1.1261 + *
1.1262 + * The original design document is out of date, but still useful.
1.1263 + * Ignore the property and value names:
1.1264 + * http://dev.icu-project.org/cgi-bin/viewcvs.cgi/~checkout~/icuhtml/design/unicodeset_properties.html
1.1265 + *
1.1266 + * Recognized syntax:
1.1267 + *
1.1268 + * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
1.1269 + * \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P"
1.1270 + * \\N{name} - white space not allowed within "\\N"
1.1271 + *
1.1272 + * Other than the above restrictions, white space is ignored. Case
1.1273 + * is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading
1.1274 + * and trailing space is deleted, and internal runs of whitespace
1.1275 + * are collapsed to a single space.
1.1276 + *
1.1277 + * We support binary properties, enumerated properties, and the
1.1278 + * following non-enumerated properties:
1.1279 + *
1.1280 + * Numeric_Value
1.1281 + * Name
1.1282 + * Unicode_1_Name
1.1283 + *
1.1284 + * @param pattern the pattern string
1.1285 + * @param ppos on entry, the position at which to begin parsing.
1.1286 + * This should be one of the locations marked '^':
1.1287 + *
1.1288 + * [:blah:] \\p{blah} \\P{blah} \\N{name}
1.1289 + * ^ % ^ % ^ % ^ %
1.1290 + *
1.1291 + * On return, the position after the last character parsed, that is,
1.1292 + * the locations marked '%'. If the parse fails, ppos is returned
1.1293 + * unchanged.
1.1294 + * @return a reference to this.
1.1295 + */
1.1296 + UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1.1297 + ParsePosition& ppos,
1.1298 + UErrorCode &ec);
1.1299 +
1.1300 + void applyPropertyPattern(RuleCharacterIterator& chars,
1.1301 + UnicodeString& rebuiltPat,
1.1302 + UErrorCode& ec);
1.1303 +
1.1304 + /**
1.1305 + * A filter that returns TRUE if the given code point should be
1.1306 + * included in the UnicodeSet being constructed.
1.1307 + */
1.1308 + typedef UBool (*Filter)(UChar32 codePoint, void* context);
1.1309 +
1.1310 + /**
1.1311 + * Given a filter, set this UnicodeSet to the code points
1.1312 + * contained by that filter. The filter MUST be
1.1313 + * property-conformant. That is, if it returns value v for one
1.1314 + * code point, then it must return v for all affiliated code
1.1315 + * points, as defined by the inclusions list. See
1.1316 + * getInclusions().
1.1317 + * src is a UPropertySource value.
1.1318 + */
1.1319 + void applyFilter(Filter filter,
1.1320 + void* context,
1.1321 + int32_t src,
1.1322 + UErrorCode &status);
1.1323 +
1.1324 + /**
1.1325 + * Return a cached copy of the inclusions list for the property source.
1.1326 + */
1.1327 + static const UnicodeSet* getInclusions(int32_t src, UErrorCode &errorCode);
1.1328 +
1.1329 + friend class UnicodeSetIterator;
1.1330 +};
1.1331 +
1.1332 +inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
1.1333 + return !operator==(o); // inequality is defined as the negation of operator==(o)
1.1334 +}
1.1335 +
1.1336 +inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1.1337 + return !containsNone(start, end); // range overload: true exactly when containsNone(start, end) is false
1.1338 +}
1.1339 +
1.1340 +inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1.1341 + return !containsNone(s); // set overload: true exactly when containsNone(s) is false
1.1342 +}
1.1343 +
1.1344 +inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1.1345 + return !containsNone(s); // string overload: true exactly when containsNone(s) is false
1.1346 +}
1.1347 +
1.1348 +U_NAMESPACE_END
1.1349 +
1.1350 +#endif