os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/uniset.h
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/uniset.h	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,1347 @@
     1.4 +/*
     1.5 +***************************************************************************
     1.6 +* Copyright (C) 1999-2005, International Business Machines Corporation
     1.7 +* and others. All Rights Reserved.
     1.8 +***************************************************************************
     1.9 +*   Date        Name        Description
    1.10 +*   10/20/99    alan        Creation.
    1.11 +***************************************************************************
    1.12 +*/
    1.13 +
    1.14 +#ifndef UNICODESET_H
    1.15 +#define UNICODESET_H
    1.16 +
    1.17 +#include "unicode/unifilt.h"
    1.18 +#include "unicode/unistr.h"
    1.19 +#include "unicode/uset.h"
    1.20 +
    1.21 +/**
    1.22 + * \file 
    1.23 + * \brief C++ API: Unicode Set
    1.24 + */
    1.25 + 
    1.26 +U_NAMESPACE_BEGIN
    1.27 +
    1.28 +class ParsePosition;
    1.29 +class SymbolTable;
    1.30 +class UVector;
    1.31 +class RuleCharacterIterator;
    1.32 +
    1.33 +/**
    1.34 + * A mutable set of Unicode characters and multicharacter strings.  Objects of this class
    1.35 + * represent <em>character classes</em> used in regular expressions.
    1.36 + * A character class specifies a subset of Unicode code points.  Legal
    1.37 + * code points are U+0000 to U+10FFFF, inclusive.
    1.38 + *
    1.39 + * <p>The UnicodeSet class is not designed to be subclassed.
    1.40 + *
    1.41 + * <p><code>UnicodeSet</code> supports two APIs. The first is the
    1.42 + * <em>operand</em> API that allows the caller to modify the value of
    1.43 + * a <code>UnicodeSet</code> object. It conforms to Java 2's
    1.44 + * <code>java.util.Set</code> interface, although
    1.45 + * <code>UnicodeSet</code> does not actually implement that
    1.46 + * interface. All methods of <code>Set</code> are supported, with the
    1.47 + * modification that they take a character range or single character
    1.48 + * instead of an <code>Object</code>, and they take a
    1.49 + * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
    1.50 + * operand API may be thought of in terms of boolean logic: a boolean
    1.51 + * OR is implemented by <code>add</code>, a boolean AND is implemented
    1.52 + * by <code>retain</code>, a boolean XOR is implemented by
    1.53 + * <code>complement</code> taking an argument, and a boolean NOT is
    1.54 + * implemented by <code>complement</code> with no argument.  In terms
    1.55 + * of traditional set theory function names, <code>add</code> is a
    1.56 + * union, <code>retain</code> is an intersection, <code>remove</code>
    1.57 + * is an asymmetric difference, and <code>complement</code> with no
    1.58 + * argument is a set complement with respect to the superset range
    1.59 + * <code>MIN_VALUE-MAX_VALUE</code>.
    1.60 + *
    1.61 + * <p>The second API is the
    1.62 + * <code>applyPattern()</code>/<code>toPattern()</code> API from the
    1.63 + * <code>java.text.Format</code>-derived classes.  Unlike the
    1.64 + * methods that add characters, add categories, and control the logic
    1.65 + * of the set, the method <code>applyPattern()</code> sets all
    1.66 + * attributes of a <code>UnicodeSet</code> at once, based on a
    1.67 + * string pattern.
    1.68 + *
    1.69 + * <p><b>Pattern syntax</b></p>
    1.70 + *
    1.71 + * Patterns are accepted by the constructors and the
    1.72 + * <code>applyPattern()</code> methods and returned by the
    1.73 + * <code>toPattern()</code> method.  These patterns follow a syntax
    1.74 + * similar to that employed by version 8 regular expression character
    1.75 + * classes.  Here are some simple examples:
    1.76 + *
    1.77 + * \htmlonly<blockquote>\endhtmlonly
    1.78 + *   <table>
    1.79 + *     <tr align="top">
    1.80 + *       <td nowrap valign="top" align="left"><code>[]</code></td>
    1.81 + *       <td valign="top">No characters</td>
    1.82 + *     </tr><tr align="top">
    1.83 + *       <td nowrap valign="top" align="left"><code>[a]</code></td>
    1.84 + *       <td valign="top">The character 'a'</td>
    1.85 + *     </tr><tr align="top">
    1.86 + *       <td nowrap valign="top" align="left"><code>[ae]</code></td>
    1.87 + *       <td valign="top">The characters 'a' and 'e'</td>
    1.88 + *     </tr>
    1.89 + *     <tr>
    1.90 + *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>
    1.91 + *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
    1.92 + *       point order</td>
    1.93 + *     </tr>
    1.94 + *     <tr>
    1.95 + *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
    1.96 + *       <td valign="top">The character U+4E01</td>
    1.97 + *     </tr>
    1.98 + *     <tr>
    1.99 + *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
   1.100 + *       <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
   1.101 + *       &quot;ac&quot;</td>
   1.102 + *     </tr>
   1.103 + *     <tr>
   1.104 + *       <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
   1.105 + *       <td valign="top">All characters in the general category Uppercase Letter</td>
   1.106 + *     </tr>
   1.107 + *   </table>
   1.108 + * \htmlonly</blockquote>\endhtmlonly
   1.109 + *
   1.110 + * Any character may be preceded by a backslash in order to remove any special
   1.111 + * meaning.  White space characters, as defined by UCharacter.isWhitespace(), are
   1.112 + * ignored, unless they are escaped.
   1.113 + *
   1.114 + * <p>Property patterns specify a set of characters having a certain
   1.115 + * property as defined by the Unicode standard.  Both the POSIX-like
   1.116 + * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized.  For a
   1.117 + * complete list of supported property patterns, see the User's Guide
   1.118 + * for UnicodeSet at
   1.119 + * <a href="http://icu.sourceforge.net/userguide/unicodeSet.html">
   1.120 + * http://icu.sourceforge.net/userguide/unicodeSet.html</a>.
   1.121 + * Actual determination of property data is defined by the underlying
   1.122 + * Unicode database as implemented by UCharacter.
   1.123 + *
   1.124 + * <p>Patterns specify individual characters, ranges of characters, and
   1.125 + * Unicode property sets.  When elements are concatenated, they
   1.126 + * specify their union.  To complement a set, place a '^' immediately
   1.127 + * after the opening '['.  Property patterns are inverted by modifying
   1.128 + * their delimiters; "[:^foo:]" and "\\P{foo}".  In any other location,
   1.129 + * '^' has no special meaning.
   1.130 + *
   1.131 + * <p>Ranges are indicated by placing a '-' between two
   1.132 + * characters, as in "a-z".  This specifies the range of all
   1.133 + * characters from the left to the right, in Unicode order.  If the
   1.134 + * left character is greater than or equal to the
   1.135 + * right character it is a syntax error.  If a '-' occurs as the first
   1.136 + * character after the opening '[' or '[^', or if it occurs as the
   1.137 + * last character before the closing ']', then it is taken as a
   1.138 + * literal.  Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
   1.139 + * set of three characters, 'a', 'b', and '-'.
   1.140 + *
   1.141 + * <p>Sets may be intersected using the '&' operator or the asymmetric
   1.142 + * set difference may be taken using the '-' operator, for example,
   1.143 + * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
   1.144 + * with values less than 4096.  Operators ('&' and '-') have equal
   1.145 + * precedence and bind left-to-right.  Thus
   1.146 + * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
   1.147 + * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
   1.148 + * difference; intersection is commutative.
   1.149 + *
   1.150 + * <table>
   1.151 + * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
   1.152 + * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
   1.153 + * through 'z' and all letters in between, in Unicode order
   1.154 + * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
   1.155 + * all characters but 'a' through 'z',
   1.156 + * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
   1.157 + * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
   1.158 + * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
   1.159 + * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
   1.160 + * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
   1.161 + * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
   1.162 + * <td>The asymmetric difference of sets specified by <em>pat1</em> and
   1.163 + * <em>pat2</em>
   1.164 + * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
   1.165 + * <td>The set of characters having the specified
   1.166 + * Unicode property; in
   1.167 + * this case, Unicode uppercase letters
   1.168 + * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
   1.169 + * <td>The set of characters <em>not</em> having the given
   1.170 + * Unicode property
   1.171 + * </table>
   1.172 + *
   1.173 + * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
   1.174 + *
   1.175 + * <p><b>Formal syntax</b></p>
   1.176 + *
   1.177 + * \htmlonly<blockquote>\endhtmlonly
   1.178 + *   <table>
   1.179 + *     <tr align="top">
   1.180 + *       <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
   1.181 + *       <td valign="top"><code>('[' '^'? item* ']') |
   1.182 + *       property</code></td>
   1.183 + *     </tr>
   1.184 + *     <tr align="top">
   1.185 + *       <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>
   1.186 + *       <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
   1.187 + *       </code></td>
   1.188 + *     </tr>
   1.189 + *     <tr align="top">
   1.190 + *       <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>
   1.191 + *       <td valign="top"><code>pattern | pattern-expr pattern |
   1.192 + *       pattern-expr op pattern<br>
   1.193 + *       </code></td>
   1.194 + *     </tr>
   1.195 + *     <tr align="top">
   1.196 + *       <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>
   1.197 + *       <td valign="top"><code>'&amp;' | '-'<br>
   1.198 + *       </code></td>
   1.199 + *     </tr>
   1.200 + *     <tr align="top">
   1.201 + *       <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
   1.202 + *       <td valign="top"><code>'[' | ']' | '-'<br>
   1.203 + *       </code></td>
   1.204 + *     </tr>
   1.205 + *     <tr align="top">
   1.206 + *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
   1.207 + *       <td valign="top"><em>any character that is not</em><code> special<br>
   1.208 + *       | ('\' </code><em>any character</em><code>)<br>
   1.209 + *       | ('\\u' hex hex hex hex)<br>
   1.210 + *       </code></td>
   1.211 + *     </tr>
   1.212 + *     <tr align="top">
   1.213 + *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
   1.214 + *       <td valign="top"><em>any character for which
   1.215 + *       </em><code>Character.digit(c, 16)</code><em>
   1.216 + *       returns a non-negative result</em></td>
   1.217 + *     </tr>
   1.218 + *     <tr>
   1.219 + *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
   1.220 + *       <td valign="top"><em>a Unicode property set pattern</em></td>
   1.221 + *     </tr>
   1.222 + *   </table>
   1.223 + *   <br>
   1.224 + *   <table border="1">
   1.225 + *     <tr>
   1.226 + *       <td>Legend: <table>
   1.227 + *         <tr>
   1.228 + *           <td nowrap valign="top"><code>a := b</code></td>
   1.229 + *           <td width="20" valign="top">&nbsp; </td>
   1.230 + *           <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
   1.231 + *         </tr>
   1.232 + *         <tr>
   1.233 + *           <td nowrap valign="top"><code>a?</code></td>
   1.234 + *           <td valign="top"></td>
   1.235 + *           <td valign="top">zero or one instance of <code>a</code><br>
   1.236 + *           </td>
   1.237 + *         </tr>
   1.238 + *         <tr>
   1.239 + *           <td nowrap valign="top"><code>a*</code></td>
   1.240 + *           <td valign="top"></td>
   1.241 + *           <td valign="top">zero or more instances of <code>a</code><br>
   1.242 + *           </td>
   1.243 + *         </tr>
   1.244 + *         <tr>
   1.245 + *           <td nowrap valign="top"><code>a | b</code></td>
   1.246 + *           <td valign="top"></td>
   1.247 + *           <td valign="top">either <code>a</code> or <code>b</code><br>
   1.248 + *           </td>
   1.249 + *         </tr>
   1.250 + *         <tr>
   1.251 + *           <td nowrap valign="top"><code>'a'</code></td>
   1.252 + *           <td valign="top"></td>
   1.253 + *           <td valign="top">the literal string between the quotes </td>
   1.254 + *         </tr>
   1.255 + *       </table>
   1.256 + *       </td>
   1.257 + *     </tr>
   1.258 + *   </table>
   1.259 + * \htmlonly</blockquote>\endhtmlonly
   1.260 + *
   1.261 + * @author Alan Liu
   1.262 + * @stable ICU 2.0
   1.263 + */
   1.264 +class U_COMMON_API UnicodeSet : public UnicodeFilter {
   1.265 +
   1.266 +    int32_t len; // length of list used; 0 <= len <= capacity
   1.267 +    int32_t capacity; // capacity of list
   1.268 +    int32_t bufferCapacity; // capacity of buffer
   1.269 +    UChar32* list; // MUST be terminated with HIGH
   1.270 +    UChar32* buffer; // internal buffer, may be NULL
   1.271 +
   1.272 +    UVector* strings; // maintained in sorted order
   1.273 +
   1.274 +    /**
   1.275 +     * The pattern representation of this set.  This may not be the
   1.276 +     * most economical pattern.  It is the pattern supplied to
   1.277 +     * applyPattern(), with variables substituted and whitespace
   1.278 +     * removed.  For sets constructed without applyPattern(), or
   1.279 +     * modified using the non-pattern API, this string will be empty,
   1.280 +     * indicating that toPattern() must generate a pattern
   1.281 +     * representation from the inversion list.
   1.282 +     */
   1.283 +    UnicodeString pat;
   1.284 +
   1.285 +public:
   1.286 +
   1.287 +    enum {
   1.288 +        /**
   1.289 +         * Minimum value that can be stored in a UnicodeSet.
   1.290 +         * @stable ICU 2.4
   1.291 +         */
   1.292 +        MIN_VALUE = 0,
   1.293 +
   1.294 +        /**
   1.295 +         * Maximum value that can be stored in a UnicodeSet.
   1.296 +         * @stable ICU 2.4
   1.297 +         */
   1.298 +        MAX_VALUE = 0x10ffff
   1.299 +    };
   1.300 +
   1.301 +    //----------------------------------------------------------------
   1.302 +    // Constructors &c
   1.303 +    //----------------------------------------------------------------
   1.304 +
   1.305 +public:
   1.306 +
   1.307 +    /**
   1.308 +     * Constructs an empty set.
   1.309 +     * @stable ICU 2.0
   1.310 +     */
   1.311 +    UnicodeSet();
   1.312 +
   1.313 +    /**
   1.314 +     * Constructs a set containing the given range. If <code>start >
   1.315 +     * end</code> then an empty set is created.
   1.316 +     *
   1.317 +     * @param start first character, inclusive, of range
   1.318 +     * @param end last character, inclusive, of range
   1.319 +     * @stable ICU 2.4
   1.320 +     */
   1.321 +    UnicodeSet(UChar32 start, UChar32 end);
   1.322 +
   1.323 +    /**
   1.324 +     * Constructs a set from the given pattern.  See the class
   1.325 +     * description for the syntax of the pattern language.
   1.326 +     * @param pattern a string specifying what characters are in the set
   1.327 +     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   1.328 +     * contains a syntax error.
   1.329 +     * @stable ICU 2.0
   1.330 +     */
   1.331 +    UnicodeSet(const UnicodeString& pattern,
   1.332 +               UErrorCode& status);
   1.333 +
   1.334 +    /**
   1.335 +     * Constructs a set from the given pattern.  See the class
   1.336 +     * description for the syntax of the pattern language.
   1.337 +     * @param pattern a string specifying what characters are in the set
   1.338 +     * @param options bitmask for options to apply to the pattern.
   1.339 +     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   1.340 +     * @param symbols a symbol table mapping variable names to values
   1.341 +     * and stand-in characters to UnicodeSets; may be NULL
   1.342 +     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   1.343 +     * contains a syntax error.
   1.344 +     * @internal
   1.345 +     */
   1.346 +    UnicodeSet(const UnicodeString& pattern,
   1.347 +               uint32_t options,
   1.348 +               const SymbolTable* symbols,
   1.349 +               UErrorCode& status);
   1.350 +
   1.351 +    /**
   1.352 +     * Constructs a set from the given pattern.  See the class description
   1.353 +     * for the syntax of the pattern language.
   1.354 +     * @param pattern a string specifying what characters are in the set
   1.355 +     * @param pos on input, the position in pattern at which to start parsing.
   1.356 +     * On output, the position after the last character parsed.
   1.357 +     * @param options bitmask for options to apply to the pattern.
   1.358 +     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   1.359 +     * @param symbols a symbol table mapping variable names to values
   1.360 +     * and stand-in characters to UnicodeSets; may be NULL
   1.361 +     * @param status input-output error code
   1.362 +     * @stable ICU 2.8
   1.363 +     */
   1.364 +    UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
   1.365 +               uint32_t options,
   1.366 +               const SymbolTable* symbols,
   1.367 +               UErrorCode& status);
   1.368 +
   1.369 +#ifdef U_USE_UNICODESET_DEPRECATES
   1.370 +    /**
   1.371 +     * Obsolete: Constructs a set from the given Unicode character category.
   1.372 +     * @param category an integer indicating the character category as
   1.373 +     * defined in uchar.h.
   1.374 +     * @obsolete ICU 2.6. Use a pattern with the category instead since this API will be removed in that release.
   1.375 +     */
   1.376 +    UnicodeSet(int8_t category, UErrorCode& status);
   1.377 +#endif
   1.378 +
   1.379 +    /**
   1.380 +     * Constructs a set that is identical to the given UnicodeSet.
   1.381 +     * @stable ICU 2.0
   1.382 +     */
   1.383 +    UnicodeSet(const UnicodeSet& o);
   1.384 +
   1.385 +    /**
   1.386 +     * Destructs the set.
   1.387 +     * @stable ICU 2.0
   1.388 +     */
   1.389 +    virtual ~UnicodeSet();
   1.390 +
   1.391 +    /**
   1.392 +     * Assigns this object to be a copy of another.
   1.393 +     * @stable ICU 2.0
   1.394 +     */
   1.395 +    UnicodeSet& operator=(const UnicodeSet& o);
   1.396 +
   1.397 +    /**
   1.398 +     * Compares the specified object with this set for equality.  Returns
   1.399 +     * <tt>true</tt> if the two sets
   1.400 +     * have the same size, and every member of the specified set is
   1.401 +     * contained in this set (or equivalently, every member of this set is
   1.402 +     * contained in the specified set).
   1.403 +     *
   1.404 +     * @param o set to be compared for equality with this set.
   1.405 +     * @return <tt>true</tt> if the specified set is equal to this set.
   1.406 +     * @stable ICU 2.0
   1.407 +     */
   1.408 +    virtual UBool operator==(const UnicodeSet& o) const;
   1.409 +
   1.410 +    /**
   1.411 +     * Compares the specified object with this set for inequality.  Returns
   1.412 +     * <tt>true</tt> if the specified set is not equal to this set.
   1.413 +     * @stable ICU 2.0
   1.414 +     */
   1.415 +    UBool operator!=(const UnicodeSet& o) const;
   1.416 +
   1.417 +    /**
   1.418 +     * Returns a copy of this object.  All UnicodeFunctor objects have
   1.419 +     * to support cloning in order to allow classes using
   1.420 +     * UnicodeFunctors, such as Transliterator, to implement cloning.
   1.421 +     * @stable ICU 2.0
   1.422 +     */
   1.423 +    virtual UnicodeFunctor* clone() const;
   1.424 +
   1.425 +    /**
   1.426 +     * Returns the hash code value for this set.
   1.427 +     *
   1.428 +     * @return the hash code value for this set.
   1.429 +     * @see Object#hashCode()
   1.430 +     * @stable ICU 2.0
   1.431 +     */
   1.432 +    virtual int32_t hashCode(void) const;
   1.433 +
   1.434 +    //----------------------------------------------------------------
   1.435 +    // Public API
   1.436 +    //----------------------------------------------------------------
   1.437 +
   1.438 +    /**
   1.439 +     * Make this object represent the range <code>start - end</code>.
   1.440 +     * If <code>start > end</code> then this object is set to
   1.441 +     * an empty range.
   1.442 +     *
   1.443 +     * @param start first character in the set, inclusive
   1.444 +     * @param end last character in the set, inclusive
   1.445 +     * @stable ICU 2.4
   1.446 +     */
   1.447 +    UnicodeSet& set(UChar32 start, UChar32 end);
   1.448 +
   1.449 +    /**
   1.450 +     * Return true if the given position, in the given pattern, appears
   1.451 +     * to be the start of a UnicodeSet pattern.
   1.452 +     * @stable ICU 2.4
   1.453 +     */
   1.454 +    static UBool resemblesPattern(const UnicodeString& pattern,
   1.455 +                                  int32_t pos);
   1.456 +
   1.457 +    /**
   1.458 +     * Modifies this set to represent the set specified by the given
   1.459 +     * pattern, optionally ignoring white space.  See the class
   1.460 +     * description for the syntax of the pattern language.
   1.461 +     * @param pattern a string specifying what characters are in the set
   1.462 +     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   1.463 +     * contains a syntax error.
   1.464 +     * <em> Empties the set passed before applying the pattern.</em>
   1.465 +     * @return a reference to this
   1.466 +     * @stable ICU 2.0
   1.467 +     */
   1.468 +    UnicodeSet& applyPattern(const UnicodeString& pattern,
   1.469 +                             UErrorCode& status);
   1.470 +
   1.471 +    /**
   1.472 +     * Modifies this set to represent the set specified by the given
   1.473 +     * pattern, optionally ignoring white space.  See the class
   1.474 +     * description for the syntax of the pattern language.
   1.475 +     * @param pattern a string specifying what characters are in the set
   1.476 +     * @param options bitmask for options to apply to the pattern.
   1.477 +     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   1.478 +     * @param symbols a symbol table mapping variable names to
   1.479 +     * values and stand-ins to UnicodeSets; may be NULL
   1.480 +     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   1.481 +     * contains a syntax error.
   1.482 +     *<em> Empties the set passed before applying the pattern.</em>
   1.483 +     * @return a reference to this
   1.484 +     * @internal
   1.485 +     */
   1.486 +    UnicodeSet& applyPattern(const UnicodeString& pattern,
   1.487 +                             uint32_t options,
   1.488 +                             const SymbolTable* symbols,
   1.489 +                             UErrorCode& status);
   1.490 +
   1.491 +    /**
   1.492 +     * Parses the given pattern, starting at the given position.  The
   1.493 +     * character at pattern.charAt(pos.getIndex()) must be '[', or the
   1.494 +     * parse fails.  Parsing continues until the corresponding closing
   1.495 +     * ']'.  If a syntax error is encountered between the opening and
   1.496 +     * closing bracket, the parse fails.  Upon return from a successful
   1.497 +     * parse, the ParsePosition is updated to point to the character
   1.498 +     * following the closing ']', and this set has been modified to
   1.499 +     * represent the parsed pattern.  This method calls
   1.500 +     * itself recursively to parse embedded subpatterns.
   1.501 +     *<em> Empties the set passed before applying the pattern.</em>
   1.502 +     *
   1.503 +     * @param pattern the string containing the pattern to be parsed.
   1.504 +     * The portion of the string from pos.getIndex(), which must be a
   1.505 +     * '[', to the corresponding closing ']', is parsed.
   1.506 +     * @param pos upon entry, the position at which to begin parsing.
   1.507 +     * The character at pattern.charAt(pos.getIndex()) must be a '['.
   1.508 +     * Upon return from a successful parse, pos.getIndex() is either
   1.509 +     * the character after the closing ']' of the parsed pattern, or
   1.510 +     * pattern.length() if the closing ']' is the last character of
   1.511 +     * the pattern string.
   1.512 +     * @param options bitmask for options to apply to the pattern.
   1.513 +     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   1.514 +     * @param symbols a symbol table mapping variable names to
   1.515 +     * values and stand-ins to UnicodeSets; may be NULL
   1.516 +     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   1.517 +     * contains a syntax error.
   1.518 +     * @return a reference to this
   1.519 +     * @stable ICU 2.8
   1.520 +     */
   1.521 +    UnicodeSet& applyPattern(const UnicodeString& pattern,
   1.522 +                             ParsePosition& pos,
   1.523 +                             uint32_t options,
   1.524 +                             const SymbolTable* symbols,
   1.525 +                             UErrorCode& status);
   1.526 +
   1.527 +    /**
   1.528 +     * Returns a string representation of this set.  If the result of
   1.529 +     * calling this function is passed to a UnicodeSet constructor, it
   1.530 +     * will produce another set that is equal to this one.
   1.531 +     * @param result the string to receive the rules.  Previous
   1.532 +     * contents will be deleted.
   1.533 +     * @param escapeUnprintable if TRUE then convert unprintable
   1.534 +     * character to their hex escape representations, \\uxxxx or
   1.535 +     * \\Uxxxxxxxx.  Unprintable characters are those other than
   1.536 +     * U+000A, U+0020..U+007E.
   1.537 +     * @stable ICU 2.0
   1.538 +     */
   1.539 +    virtual UnicodeString& toPattern(UnicodeString& result,
   1.540 +                             UBool escapeUnprintable = FALSE) const;
   1.541 +
   1.542 +    /**
   1.543 +     * Modifies this set to contain those code points which have the given value
   1.544 +     * for the given binary or enumerated property, as returned by
   1.545 +     * u_getIntPropertyValue.  Prior contents of this set are lost.
   1.546 +     *
   1.547 +     * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
   1.548 +     * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
   1.549 +     * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
   1.550 +     *
   1.551 +     * @param value a value in the range u_getIntPropertyMinValue(prop)..
   1.552 +     * u_getIntPropertyMaxValue(prop), with one exception.  If prop is
   1.553 +     * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
   1.554 +     * rather a mask value produced by U_GET_GC_MASK().  This allows grouped
   1.555 +     * categories such as [:L:] to be represented.
   1.556 +     *
   1.557 +     * @param ec error code input/output parameter
   1.558 +     *
   1.559 +     * @return a reference to this set
   1.560 +     *
   1.561 +     * @stable ICU 2.4
   1.562 +     */
   1.563 +    UnicodeSet& applyIntPropertyValue(UProperty prop,
   1.564 +                                      int32_t value,
   1.565 +                                      UErrorCode& ec);
   1.566 +
   1.567 +    /**
   1.568 +     * Modifies this set to contain those code points which have the
   1.569 +     * given value for the given property.  Prior contents of this
   1.570 +     * set are lost.
   1.571 +     *
   1.572 +     * @param prop a property alias, either short or long.  The name is matched
   1.573 +     * loosely.  See PropertyAliases.txt for names and a description of loose
   1.574 +     * matching.  If the value string is empty, then this string is interpreted
   1.575 +     * as either a General_Category value alias, a Script value alias, a binary
   1.576 +     * property alias, or a special ID.  Special IDs are matched loosely and
   1.577 +     * correspond to the following sets:
   1.578 +     *
   1.579 +     * "ANY" = [\\u0000-\\U0010FFFF],
   1.580 +     * "ASCII" = [\\u0000-\\u007F],
   1.581 +     * "Assigned" = [:^Cn:].
   1.582 +     *
   1.583 +     * @param value a value alias, either short or long.  The name is matched
   1.584 +     * loosely.  See PropertyValueAliases.txt for names and a description of
   1.585 +     * loose matching.  In addition to aliases listed, numeric values and
   1.586 +     * canonical combining classes may be expressed numerically, e.g., ("nv",
   1.587 +     * "0.5") or ("ccc", "220").  The value string may also be empty.
   1.588 +     *
   1.589 +     * @param ec error code input/output parameter
   1.590 +     *
   1.591 +     * @return a reference to this set
   1.592 +     *
   1.593 +     * @stable ICU 2.4
   1.594 +     */
   1.595 +    UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
   1.596 +                                   const UnicodeString& value,
   1.597 +                                   UErrorCode& ec);
   1.598 +
   1.599 +    /**
   1.600 +     * Returns the number of elements in this set (its cardinality).
   1.601 +     * Note that the elements of a set may include both individual
   1.602 +     * codepoints and strings.
   1.603 +     *
   1.604 +     * @return the number of elements in this set (its cardinality).
   1.605 +     * @stable ICU 2.0
   1.606 +     */
   1.607 +    virtual int32_t size(void) const;
   1.608 +
   1.609 +    /**
   1.610 +     * Returns <tt>true</tt> if this set contains no elements.
   1.611 +     *
   1.612 +     * @return <tt>true</tt> if this set contains no elements.
   1.613 +     * @stable ICU 2.0
   1.614 +     */
   1.615 +    virtual UBool isEmpty(void) const;
   1.616 +
   1.617 +    /**
   1.618 +     * Returns true if this set contains the given character.
   1.619 +     * @param c character to be checked for containment
   1.620 +     * @return true if the test condition is met
   1.621 +     * @stable ICU 2.0
   1.622 +     */
   1.623 +    virtual UBool contains(UChar32 c) const;
   1.624 +
   1.625 +    /**
   1.626 +     * Returns true if this set contains every character
   1.627 +     * of the given range.
   1.628 +     * @param start first character, inclusive, of the range
   1.629 +     * @param end last character, inclusive, of the range
   1.630 +     * @return true if the test condition is met
   1.631 +     * @stable ICU 2.0
   1.632 +     */
   1.633 +    virtual UBool contains(UChar32 start, UChar32 end) const;
   1.634 +
   1.635 +    /**
   1.636 +     * Returns <tt>true</tt> if this set contains the given
   1.637 +     * multicharacter string.
   1.638 +     * @param s string to be checked for containment
   1.639 +     * @return <tt>true</tt> if this set contains the specified string
   1.640 +     * @stable ICU 2.4
   1.641 +     */
   1.642 +    UBool contains(const UnicodeString& s) const;
   1.643 +
   1.644 +    /**
   1.645 +     * Returns true if this set contains all the characters and strings
   1.646 +     * of the given set.
   1.647 +     * @param c set to be checked for containment
   1.648 +     * @return true if the test condition is met
   1.649 +     * @stable ICU 2.4
   1.650 +     */
   1.651 +    virtual UBool containsAll(const UnicodeSet& c) const;
   1.652 +
   1.653 +    /**
   1.654 +     * Returns true if this set contains all the characters
   1.655 +     * of the given string.
   1.656 +     * @param s string containing characters to be checked for containment
   1.657 +     * @return true if the test condition is met
   1.658 +     * @stable ICU 2.4
   1.659 +     */
   1.660 +    UBool containsAll(const UnicodeString& s) const;
   1.661 +
   1.662 +    /**
   1.663 +     * Returns true if this set contains none of the characters
   1.664 +     * of the given range.
   1.665 +     * @param start first character, inclusive, of the range
   1.666 +     * @param end last character, inclusive, of the range
   1.667 +     * @return true if the test condition is met
   1.668 +     * @stable ICU 2.4
   1.669 +     */
   1.670 +    UBool containsNone(UChar32 start, UChar32 end) const;
   1.671 +
   1.672 +    /**
   1.673 +     * Returns true if this set contains none of the characters and strings
   1.674 +     * of the given set.
   1.675 +     * @param c set to be checked for containment
   1.676 +     * @return true if the test condition is met
   1.677 +     * @stable ICU 2.4
   1.678 +     */
   1.679 +    UBool containsNone(const UnicodeSet& c) const;
   1.680 +
   1.681 +    /**
   1.682 +     * Returns true if this set contains none of the characters
   1.683 +     * of the given string.
   1.684 +     * @param s string containing characters to be checked for containment
   1.685 +     * @return true if the test condition is met
   1.686 +     * @stable ICU 2.4
   1.687 +     */
   1.688 +    UBool containsNone(const UnicodeString& s) const;
   1.689 +
   1.690 +    /**
   1.691 +     * Returns true if this set contains one or more of the characters
   1.692 +     * in the given range.
   1.693 +     * @param start first character, inclusive, of the range
   1.694 +     * @param end last character, inclusive, of the range
   1.695 +     * @return true if the condition is met
   1.696 +     * @stable ICU 2.4
   1.697 +     */
   1.698 +    inline UBool containsSome(UChar32 start, UChar32 end) const;
   1.699 +
   1.700 +    /**
   1.701 +     * Returns true if this set contains one or more of the characters
   1.702 +     * and strings of the given set.
   1.703 +     * @param s The set to be checked for containment
   1.704 +     * @return true if the condition is met
   1.705 +     * @stable ICU 2.4
   1.706 +     */
   1.707 +    inline UBool containsSome(const UnicodeSet& s) const;
   1.708 +
   1.709 +    /**
   1.710 +     * Returns true if this set contains one or more of the characters
   1.711 +     * of the given string.
   1.712 +     * @param s string containing characters to be checked for containment
   1.713 +     * @return true if the condition is met
   1.714 +     * @stable ICU 2.4
   1.715 +     */
   1.716 +    inline UBool containsSome(const UnicodeString& s) const;
   1.717 +
   1.718 +    /**
   1.719 +     * Implement UnicodeMatcher::matches()
   1.720 +     * @stable ICU 2.4
   1.721 +     */
   1.722 +    virtual UMatchDegree matches(const Replaceable& text,
   1.723 +                         int32_t& offset,
   1.724 +                         int32_t limit,
   1.725 +                         UBool incremental);
   1.726 +
   1.727 +private:
   1.728 +    /**
   1.729 +     * Returns the longest match for s in text at the given position.
   1.730 +     * If limit > start then match forward from start+1 to limit
   1.731 +     * matching all characters except s.charAt(0).  If limit < start,
   1.732 +     * go backward starting from start-1 matching all characters
   1.733 +     * except s.charAt(s.length()-1).  This method assumes that the
   1.734 +     * first character, text.charAt(start), matches s, so it does not
   1.735 +     * check it.
   1.736 +     * @param text the text to match
   1.737 +     * @param start the first character to match.  In the forward
   1.738 +     * direction, text.charAt(start) is matched against s.charAt(0).
   1.739 +     * In the reverse direction, it is matched against
   1.740 +     * s.charAt(s.length()-1).
   1.741 +     * @param limit the limit offset for matching, either last+1 in
   1.742 +     * the forward direction, or last-1 in the reverse direction,
   1.743 +     * where last is the index of the last character to match.
   1.744 +     * @return If part of s matches up to the limit, return |limit -
   1.745 +     * start|.  If all of s matches before reaching the limit, return
   1.746 +     * s.length().  If there is a mismatch between s and text, return
   1.747 +     * 0
   1.748 +     */
   1.749 +    static int32_t matchRest(const Replaceable& text,
   1.750 +                             int32_t start, int32_t limit,
   1.751 +                             const UnicodeString& s);
   1.752 +
   1.753 +    /**
   1.754 +     * Returns the smallest value i such that c < list[i].  Caller
   1.755 +     * must ensure that c is a legal value or this method will enter
   1.756 +     * an infinite loop.  This method performs a binary search.
   1.757 +     * @param c a character in the range MIN_VALUE..MAX_VALUE
   1.758 +     * inclusive
   1.759 +     * @return the smallest integer i in the range 0..len-1,
   1.760 +     * inclusive, such that c < list[i]
   1.761 +     */
   1.762 +    int32_t findCodePoint(UChar32 c) const;
   1.763 +
   1.764 +public:
   1.765 +
   1.766 +    /**
   1.767 +     * Implementation of UnicodeMatcher API.  Union the set of all
   1.768 +     * characters that may be matched by this object into the given
   1.769 +     * set.
   1.770 +     * @param toUnionTo the set into which to union the source characters
   1.771 +     * @stable ICU 2.4
   1.772 +     */
   1.773 +    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
   1.774 +
   1.775 +    /**
   1.776 +     * Returns the index of the given character within this set, where
   1.777 +     * the set is ordered by ascending code point.  If the character
   1.778 +     * is not in this set, return -1.  The inverse of this method is
   1.779 +     * <code>charAt()</code>.
   1.780 +     * @return an index from 0..size()-1, or -1
   1.781 +     * @stable ICU 2.4
   1.782 +     */
   1.783 +    int32_t indexOf(UChar32 c) const;
   1.784 +
   1.785 +    /**
   1.786 +     * Returns the character at the given index within this set, where
   1.787 +     * the set is ordered by ascending code point.  If the index is
   1.788 +     * out of range, return (UChar32)-1.  The inverse of this method is
   1.789 +     * <code>indexOf()</code>.
   1.790 +     * @param index an index from 0..size()-1
   1.791 +     * @return the character at the given index, or (UChar32)-1.
   1.792 +     * @stable ICU 2.4
   1.793 +     */
   1.794 +    UChar32 charAt(int32_t index) const;
   1.795 +
   1.796 +    /**
   1.797 +     * Adds the specified range to this set if it is not already
   1.798 +     * present.  If this set already contains the specified range,
   1.799 +     * the call leaves this set unchanged.  If <code>start > end</code>
   1.800 +     * then an empty range is added, leaving the set unchanged.
   1.801 +     * This is equivalent to a boolean logic OR, or a set UNION.
   1.802 +     *
   1.803 +     * @param start first character, inclusive, of range to be added
   1.804 +     * to this set.
   1.805 +     * @param end last character, inclusive, of range to be added
   1.806 +     * to this set.
   1.807 +     * @stable ICU 2.0
   1.808 +     */
   1.809 +    virtual UnicodeSet& add(UChar32 start, UChar32 end);
   1.810 +
   1.811 +    /**
   1.812 +     * Adds the specified character to this set if it is not already
   1.813 +     * present.  If this set already contains the specified character,
   1.814 +     * the call leaves this set unchanged.
   1.815 +     * @stable ICU 2.0
   1.816 +     */
   1.817 +    UnicodeSet& add(UChar32 c);
   1.818 +
   1.819 +    /**
   1.820 +     * Adds the specified multicharacter string to this set if it is not
   1.821 +     * already present.  If this set already contains the string,
   1.822 +     * the call leaves this set unchanged.
   1.823 +     * Thus "ch" => {"ch"}
   1.824 +     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   1.825 +     * @param s the source string
   1.826 +     * @return this object, for chaining
   1.827 +     * @stable ICU 2.4
   1.828 +     */
   1.829 +    UnicodeSet& add(const UnicodeString& s);
   1.830 +
   1.831 + private:
   1.832 +    /**
   1.833 +     * @return the code point if the string consists of a single one;
   1.834 +     * otherwise returns -1.
   1.835 +     * @param s the string to test
   1.836 +     */
   1.837 +    static int32_t getSingleCP(const UnicodeString& s);
   1.838 +
   1.839 +    void _add(const UnicodeString& s);
   1.840 +
   1.841 + public:
   1.842 +    /**
   1.843 +     * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
   1.844 +     * If this set already contains any particular character, it has no effect on that character.
   1.845 +     * @param s the source string
   1.846 +     * @return this object, for chaining
   1.847 +     * @stable ICU 2.4
   1.848 +     */
   1.849 +    UnicodeSet& addAll(const UnicodeString& s);
   1.850 +
   1.851 +    /**
   1.852 +     * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1.853 +     * NOTE(review): "If this set already [contains] any particular character, it has no effect on that character." duplicates the addAll() text; confirm it applies here.
   1.854 +     * @param s the source string
   1.855 +     * @return this object, for chaining
   1.856 +     * @stable ICU 2.4
   1.857 +     */
   1.858 +    UnicodeSet& retainAll(const UnicodeString& s);
   1.859 +
   1.860 +    /**
   1.861 +     * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1.862 +     * NOTE(review): "If this set already [contains] any particular character, it has no effect on that character." duplicates the addAll() text; confirm it applies here.
   1.863 +     * @param s the source string
   1.864 +     * @return this object, for chaining
   1.865 +     * @stable ICU 2.4
   1.866 +     */
   1.867 +    UnicodeSet& complementAll(const UnicodeString& s);
   1.868 +
   1.869 +    /**
   1.870 +     * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1.871 +     * NOTE(review): "If this set already [contains] any particular character, it has no effect on that character." duplicates the addAll() text; confirm it applies here.
   1.872 +     * @param s the source string
   1.873 +     * @return this object, for chaining
   1.874 +     * @stable ICU 2.4
   1.875 +     */
   1.876 +    UnicodeSet& removeAll(const UnicodeString& s);
   1.877 +
   1.878 +    /**
   1.879 +     * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
   1.880 +     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   1.881 +     * @param s the source string
   1.882 +     * @return a newly created set containing the given string.
   1.883 +     * The caller owns the return object and is responsible for deleting it.
   1.884 +     * @stable ICU 2.4
   1.885 +     */
   1.886 +    static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
   1.887 +
   1.888 +
   1.889 +    /**
   1.890 +     * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
   1.891 +     * @param s the source string
   1.892 +     * @return a newly created set containing the given characters
   1.893 +     * The caller owns the return object and is responsible for deleting it.
   1.894 +     * @stable ICU 2.4
   1.895 +     */
   1.896 +    static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
   1.897 +
   1.898 +    /**
   1.899 +     * Retain only the elements in this set that are contained in the
   1.900 +     * specified range.  If <code>start > end</code> then an empty range is
   1.901 +     * retained, leaving the set empty.  This is equivalent to
   1.902 +     * a boolean logic AND, or a set INTERSECTION.
   1.903 +     *
   1.904 +     * @param start first character, inclusive, of range to be retained
   1.905 +     * in this set.
   1.906 +     * @param end last character, inclusive, of range to be retained
   1.907 +     * in this set.
   1.908 +     * @stable ICU 2.0
   1.909 +     */
   1.910 +    virtual UnicodeSet& retain(UChar32 start, UChar32 end);
   1.911 +
   1.912 +
   1.913 +    /**
   1.914 +     * Retain the specified character from this set if it is present.
   1.915 +     * @stable ICU 2.0
   1.916 +     */
   1.917 +    UnicodeSet& retain(UChar32 c);
   1.918 +
   1.919 +    /**
   1.920 +     * Removes the specified range from this set if it is present.
   1.921 +     * The set will not contain the specified range once the call
   1.922 +     * returns.  If <code>start > end</code> then an empty range is
   1.923 +     * removed, leaving the set unchanged.
   1.924 +     *
   1.925 +     * @param start first character, inclusive, of range to be removed
   1.926 +     * from this set.
   1.927 +     * @param end last character, inclusive, of range to be removed
   1.928 +     * from this set.
   1.929 +     * @stable ICU 2.0
   1.930 +     */
   1.931 +    virtual UnicodeSet& remove(UChar32 start, UChar32 end);
   1.932 +
   1.933 +    /**
   1.934 +     * Removes the specified character from this set if it is present.
   1.935 +     * The set will not contain the specified character once the call
   1.936 +     * returns.
   1.937 +     * @stable ICU 2.0
   1.938 +     */
   1.939 +    UnicodeSet& remove(UChar32 c);
   1.940 +
   1.941 +    /**
   1.942 +     * Removes the specified string from this set if it is present.
   1.943 +     * The set will not contain the specified string once the call
   1.944 +     * returns.
   1.945 +     * @param s the source string
   1.946 +     * @return this object, for chaining
   1.947 +     * @stable ICU 2.4
   1.948 +     */
   1.949 +    UnicodeSet& remove(const UnicodeString& s);
   1.950 +
   1.951 +    /**
   1.952 +     * Inverts this set.  This operation modifies this set so that
   1.953 +     * its value is its complement.  This is equivalent to
   1.954 +     * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
   1.955 +     * @stable ICU 2.0
   1.956 +     */
   1.957 +    virtual UnicodeSet& complement(void);
   1.958 +
   1.959 +    /**
   1.960 +     * Complements the specified range in this set.  Any character in
   1.961 +     * the range will be removed if it is in this set, or will be
   1.962 +     * added if it is not in this set.  If <code>start > end</code>
   1.963 +     * then an empty range is complemented, leaving the set unchanged.
   1.964 +     * This is equivalent to a boolean logic XOR.
   1.965 +     *
   1.966 +     * @param start first character, inclusive, of range to be complemented
   1.967 +     * in this set.
   1.968 +     * @param end last character, inclusive, of range to be complemented
   1.969 +     * in this set.
   1.970 +     * @stable ICU 2.0
   1.971 +     */
   1.972 +    virtual UnicodeSet& complement(UChar32 start, UChar32 end);
   1.973 +
   1.974 +    /**
   1.975 +     * Complements the specified character in this set.  The character
   1.976 +     * will be removed if it is in this set, or will be added if it is
   1.977 +     * not in this set.
   1.978 +     * @stable ICU 2.0
   1.979 +     */
   1.980 +    UnicodeSet& complement(UChar32 c);
   1.981 +
   1.982 +    /**
   1.983 +     * Complement the specified string in this set.
   1.984 +     * NOTE(review): "The set will not contain the specified string once the
   1.985 +     * call returns." reads as copied from remove(); confirm for an absent string.
   1.986 +     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   1.987 +     * @param s the string to complement
   1.988 +     * @return this object, for chaining
   1.989 +     * @stable ICU 2.4
   1.990 +     */
   1.991 +    UnicodeSet& complement(const UnicodeString& s);
   1.992 +
   1.993 +    /**
   1.994 +     * Adds all of the elements in the specified set to this set if
   1.995 +     * they're not already present.  This operation effectively
   1.996 +     * modifies this set so that its value is the <i>union</i> of the two
   1.997 +     * sets.  The behavior of this operation is unspecified if the specified
   1.998 +     * collection is modified while the operation is in progress.
   1.999 +     *
  1.1000 +     * @param c set whose elements are to be added to this set.
  1.1001 +     * @see #add(UChar32, UChar32)
  1.1002 +     * @stable ICU 2.0
  1.1003 +     */
  1.1004 +    virtual UnicodeSet& addAll(const UnicodeSet& c);
  1.1005 +
  1.1006 +    /**
  1.1007 +     * Retains only the elements in this set that are contained in the
  1.1008 +     * specified set.  In other words, removes from this set all of
  1.1009 +     * its elements that are not contained in the specified set.  This
  1.1010 +     * operation effectively modifies this set so that its value is
  1.1011 +     * the <i>intersection</i> of the two sets.
  1.1012 +     *
  1.1013 +     * @param c set that defines which elements this set will retain.
  1.1014 +     * @stable ICU 2.0
  1.1015 +     */
  1.1016 +    virtual UnicodeSet& retainAll(const UnicodeSet& c);
  1.1017 +
  1.1018 +    /**
  1.1019 +     * Removes from this set all of its elements that are contained in the
  1.1020 +     * specified set.  This operation effectively modifies this
  1.1021 +     * set so that its value is the <i>asymmetric set difference</i> of
  1.1022 +     * the two sets.
  1.1023 +     *
  1.1024 +     * @param c set that defines which elements will be removed from
  1.1025 +     *          this set.
  1.1026 +     * @stable ICU 2.0
  1.1027 +     */
  1.1028 +    virtual UnicodeSet& removeAll(const UnicodeSet& c);
  1.1029 +
  1.1030 +    /**
  1.1031 +     * Complements in this set all elements contained in the specified
  1.1032 +     * set.  Any character in the other set will be removed if it is
  1.1033 +     * in this set, or will be added if it is not in this set.
  1.1034 +     *
  1.1035 +     * @param c set that defines which elements will be xor'ed from
  1.1036 +     *          this set.
  1.1037 +     * @stable ICU 2.4
  1.1038 +     */
  1.1039 +    virtual UnicodeSet& complementAll(const UnicodeSet& c);
  1.1040 +
  1.1041 +    /**
  1.1042 +     * Removes all of the elements from this set.  This set will be
  1.1043 +     * empty after this call returns.
  1.1044 +     * @stable ICU 2.0
  1.1045 +     */
  1.1046 +    virtual UnicodeSet& clear(void);
  1.1047 +
  1.1048 +    /**
  1.1049 +     * Close this set over the given attribute.  For the attribute
  1.1050 +     * USET_CASE, the result is to modify this set so that:
  1.1051 +     *
  1.1052 +     * 1. For each character or string 'a' in this set, all strings or
  1.1053 +     * characters 'b' such that foldCase(a) == foldCase(b) are added
  1.1054 +     * to this set.
  1.1055 +     *
  1.1056 +     * 2. For each string 'e' in the resulting set, if e !=
  1.1057 +     * foldCase(e), 'e' will be removed.
  1.1058 +     *
  1.1059 +     * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
  1.1060 +     *
  1.1061 +     * (Here foldCase(x) refers to the operation u_strFoldCase, and a
  1.1062 +     * == b denotes that the contents are the same, not pointer
  1.1063 +     * comparison.)
  1.1064 +     *
  1.1065 +     * @param attribute bitmask for attributes to close over.
  1.1066 +     * Currently only the USET_CASE bit is supported.  Any undefined bits
  1.1067 +     * are ignored.
  1.1068 +     * @return a reference to this set.
  1.1069 +     * @internal
  1.1070 +     */
  1.1071 +    UnicodeSet& closeOver(int32_t attribute);
  1.1072 +
  1.1073 +    /**
  1.1074 +     * Iteration method that returns the number of ranges contained in
  1.1075 +     * this set.
  1.1076 +     * @see #getRangeStart
  1.1077 +     * @see #getRangeEnd
  1.1078 +     * @stable ICU 2.4
  1.1079 +     */
  1.1080 +    virtual int32_t getRangeCount(void) const;
  1.1081 +
  1.1082 +    /**
  1.1083 +     * Iteration method that returns the first character in the
  1.1084 +     * specified range of this set.
  1.1085 +     * @see #getRangeCount
  1.1086 +     * @see #getRangeEnd
  1.1087 +     * @stable ICU 2.4
  1.1088 +     */
  1.1089 +    virtual UChar32 getRangeStart(int32_t index) const;
  1.1090 +
  1.1091 +    /**
  1.1092 +     * Iteration method that returns the last character in the
  1.1093 +     * specified range of this set.
  1.1094 +     * @see #getRangeStart
  1.1095 +     * @see #getRangeCount
  1.1096 +     * @stable ICU 2.4
  1.1097 +     */
  1.1098 +    virtual UChar32 getRangeEnd(int32_t index) const;
  1.1099 +
  1.1100 +    /**
  1.1101 +     * Serializes this set into an array of 16-bit integers.  Serialization
  1.1102 +     * (currently) only records the characters in the set; multicharacter
  1.1103 +     * strings are ignored.
  1.1104 +     *
  1.1105 +     * The array has following format (each line is one 16-bit
  1.1106 +     * integer):
  1.1107 +     *
  1.1108 +     *  length     = (n+2*m) | (m!=0?0x8000:0)
  1.1109 +     *  bmpLength  = n; present if m!=0
  1.1110 +     *  bmp[0]
  1.1111 +     *  bmp[1]
  1.1112 +     *  ...
  1.1113 +     *  bmp[n-1]
  1.1114 +     *  supp-high[0]
  1.1115 +     *  supp-low[0]
  1.1116 +     *  supp-high[1]
  1.1117 +     *  supp-low[1]
  1.1118 +     *  ...
  1.1119 +     *  supp-high[m-1]
  1.1120 +     *  supp-low[m-1]
  1.1121 +     *
  1.1122 +     * The array starts with a header.  After the header are n bmp
  1.1123 +     * code points, then m supplementary code points.  Either n or m
  1.1124 +     * or both may be zero.  n+2*m is always <= 0x7FFF.
  1.1125 +     *
  1.1126 +     * If there are no supplementary characters (if m==0) then the
  1.1127 +     * header is one 16-bit integer, 'length', with value n.
  1.1128 +     *
  1.1129 +     * If there are supplementary characters (if m!=0) then the header
  1.1130 +     * is two 16-bit integers.  The first, 'length', has value
  1.1131 +     * (n+2*m)|0x8000.  The second, 'bmpLength', has value n.
  1.1132 +     *
  1.1133 +     * After the header the code points are stored in ascending order.
  1.1134 +     * Supplementary code points are stored as most significant 16
  1.1135 +     * bits followed by least significant 16 bits.
  1.1136 +     *
  1.1137 +     * @param dest pointer to buffer of destCapacity 16-bit integers.
  1.1138 +     * May be NULL only if destCapacity is zero.
  1.1139 +     * @param destCapacity size of dest, or zero.  Must not be negative.
  1.1140 +     * @param ec error code.  Will be set to U_INDEX_OUTOFBOUNDS_ERROR
  1.1141 +     * if n+2*m > 0x7FFF.  Will be set to U_BUFFER_OVERFLOW_ERROR if
  1.1142 +     * n+2*m+(m!=0?2:1) > destCapacity.
  1.1143 +     * @return the total length of the serialized format, including
  1.1144 +     * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
  1.1145 +     * than U_BUFFER_OVERFLOW_ERROR.
  1.1146 +     * @stable ICU 2.4
  1.1147 +     */
  1.1148 +    int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
  1.1149 +
  1.1150 +    /**
  1.1151 +     * Reallocate this object's internal structures to take up the least
  1.1152 +     * possible space, without changing this object's value.
  1.1153 +     * @stable ICU 2.4
  1.1154 +     */
  1.1155 +    virtual UnicodeSet& compact();
  1.1156 +
  1.1157 +    /**
  1.1158 +     * Return the class ID for this class.  This is useful only for
  1.1159 +     * comparing to a return value from getDynamicClassID().  For example:
  1.1160 +     * <pre>
  1.1161 +     * .      Base* polymorphic_pointer = createPolymorphicObject();
  1.1162 +     * .      if (polymorphic_pointer->getDynamicClassID() ==
  1.1163 +     * .          Derived::getStaticClassID()) ...
  1.1164 +     * </pre>
  1.1165 +     * @return          The class ID for all objects of this class.
  1.1166 +     * @stable ICU 2.0
  1.1167 +     */
  1.1168 +    static UClassID U_EXPORT2 getStaticClassID(void);
  1.1169 +
  1.1170 +    /**
  1.1171 +     * Implement UnicodeFunctor API.
  1.1172 +     *
  1.1173 +     * @return The class ID for this object. All objects of a given
  1.1174 +     * class have the same class ID.  Objects of other classes have
  1.1175 +     * different class IDs.
  1.1176 +     * @stable ICU 2.4
  1.1177 +     */
  1.1178 +    virtual UClassID getDynamicClassID(void) const;
  1.1179 +
  1.1180 +private:
  1.1181 +
  1.1182 +    // Private API for the USet API
  1.1183 +
  1.1184 +    friend class USetAccess;
  1.1185 +
  1.1186 +    int32_t getStringCount() const;
  1.1187 +
  1.1188 +    const UnicodeString* getString(int32_t index) const;
  1.1189 +
  1.1190 +    //----------------------------------------------------------------
  1.1191 +    // RuleBasedTransliterator support
  1.1192 +    //----------------------------------------------------------------
  1.1193 +
  1.1194 +private:
  1.1195 +
  1.1196 +    /**
  1.1197 +     * Returns <tt>true</tt> if this set contains any character whose low byte
  1.1198 +     * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
  1.1199 +     * indexing.
  1.1200 +     */
  1.1201 +    virtual UBool matchesIndexValue(uint8_t v) const;
  1.1202 +
  1.1203 +private:
  1.1204 +
  1.1205 +    //----------------------------------------------------------------
  1.1206 +    // Implementation: Pattern parsing
  1.1207 +    //----------------------------------------------------------------
  1.1208 +
  1.1209 +    void applyPattern(RuleCharacterIterator& chars,
  1.1210 +                      const SymbolTable* symbols,
  1.1211 +                      UnicodeString& rebuiltPat,
  1.1212 +                      uint32_t options,
  1.1213 +                      UErrorCode& ec);
  1.1214 +
  1.1215 +    //----------------------------------------------------------------
  1.1216 +    // Implementation: Utility methods
  1.1217 +    //----------------------------------------------------------------
  1.1218 +
  1.1219 +    void ensureCapacity(int32_t newLen);
  1.1220 +
  1.1221 +    void ensureBufferCapacity(int32_t newLen);
  1.1222 +
  1.1223 +    void swapBuffers(void);
  1.1224 +
  1.1225 +    UBool allocateStrings();
  1.1226 +
  1.1227 +    UnicodeString& _toPattern(UnicodeString& result,
  1.1228 +                              UBool escapeUnprintable) const;
  1.1229 +
  1.1230 +    UnicodeString& _generatePattern(UnicodeString& result,
  1.1231 +                                    UBool escapeUnprintable) const;
  1.1232 +
  1.1233 +    static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
  1.1234 +
  1.1235 +    static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
  1.1236 +
  1.1237 +    //----------------------------------------------------------------
  1.1238 +    // Implementation: Fundamental operators
  1.1239 +    //----------------------------------------------------------------
  1.1240 +
  1.1241 +    void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
  1.1242 +
  1.1243 +    void add(const UChar32* other, int32_t otherLen, int8_t polarity);
  1.1244 +
  1.1245 +    void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
  1.1246 +
  1.1247 +    /**
  1.1248 +     * Return true if the given position, in the given pattern, appears
  1.1249 +     * to be the start of a property set pattern [:foo:], \\p{foo}, or
  1.1250 +     * \\P{foo}, or \\N{name}.
  1.1251 +     */
  1.1252 +    static UBool resemblesPropertyPattern(const UnicodeString& pattern,
  1.1253 +                                          int32_t pos);
  1.1254 +
  1.1255 +    static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
  1.1256 +                                          int32_t iterOpts);
  1.1257 +
  1.1258 +    /**
  1.1259 +     * Parse the given property pattern at the given parse position
  1.1260 +     * and set this UnicodeSet to the result.
  1.1261 +     *
  1.1262 +     * The original design document is out of date, but still useful.
  1.1263 +     * Ignore the property and value names:
  1.1264 +     * http://dev.icu-project.org/cgi-bin/viewcvs.cgi/~checkout~/icuhtml/design/unicodeset_properties.html
  1.1265 +     *
  1.1266 +     * Recognized syntax:
  1.1267 +     *
  1.1268 +     * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
  1.1269 +     * \\p{foo} \\P{foo}  - white space not allowed within "\\p" or "\\P"
  1.1270 +     * \\N{name}         - white space not allowed within "\\N"
  1.1271 +     *
  1.1272 +     * Other than the above restrictions, white space is ignored.  Case
  1.1273 +     * is ignored except in "\\p" and "\\P" and "\\N".  In 'name' leading
  1.1274 +     * and trailing space is deleted, and internal runs of whitespace
  1.1275 +     * are collapsed to a single space.
  1.1276 +     *
  1.1277 +     * We support binary properties, enumerated properties, and the
  1.1278 +     * following non-enumerated properties:
  1.1279 +     *
  1.1280 +     *  Numeric_Value
  1.1281 +     *  Name
  1.1282 +     *  Unicode_1_Name
  1.1283 +     *
  1.1284 +     * @param pattern the pattern string
  1.1285 +     * @param ppos on entry, the position at which to begin parsing.
  1.1286 +     * This should be one of the locations marked '^':
  1.1287 +     *
  1.1288 +     *   [:blah:]     \\p{blah}     \\P{blah}     \\N{name}
  1.1289 +     *   ^       %    ^       %    ^       %    ^       %
  1.1290 +     *
  1.1291 +     * On return, the position after the last character parsed, that is,
  1.1292 +     * the locations marked '%'.  If the parse fails, ppos is returned
  1.1293 +     * unchanged.
  1.1294 +     * @return a reference to this.
  1.1295 +     */
  1.1296 +    UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
  1.1297 +                                     ParsePosition& ppos,
  1.1298 +                                     UErrorCode &ec);
  1.1299 +
  1.1300 +    void applyPropertyPattern(RuleCharacterIterator& chars,
  1.1301 +                              UnicodeString& rebuiltPat,
  1.1302 +                              UErrorCode& ec);
  1.1303 +
  1.1304 +    /**
  1.1305 +     * A filter that returns TRUE if the given code point should be
  1.1306 +     * included in the UnicodeSet being constructed.
  1.1307 +     */
  1.1308 +    typedef UBool (*Filter)(UChar32 codePoint, void* context);
  1.1309 +
  1.1310 +    /**
  1.1311 +     * Given a filter, set this UnicodeSet to the code points
  1.1312 +     * contained by that filter.  The filter MUST be
  1.1313 +     * property-conformant.  That is, if it returns value v for one
  1.1314 +     * code point, then it must return v for all affiliated code
  1.1315 +     * points, as defined by the inclusions list.  See
  1.1316 +     * getInclusions().
  1.1317 +     * src is a UPropertySource value.
  1.1318 +     */
  1.1319 +    void applyFilter(Filter filter,
  1.1320 +                     void* context,
  1.1321 +                     int32_t src,
  1.1322 +                     UErrorCode &status);
  1.1323 +
  1.1324 +    /**
  1.1325 +     * Return a cached copy of the inclusions list for the property source.
  1.1326 +     */
  1.1327 +    static const UnicodeSet* getInclusions(int32_t src, UErrorCode &errorCode);
  1.1328 +
  1.1329 +    friend class UnicodeSetIterator;
  1.1330 +};
  1.1331 +
  1.1332 +inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
  1.1333 +    return !(this->operator==(o)); // inequality is the negation of operator==
  1.1334 +}
  1.1335 +
  1.1336 +inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
  1.1337 +    return !(this->containsNone(start, end)); // "some" is the negation of "none" for the range
  1.1338 +}
  1.1339 +
  1.1340 +inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
  1.1341 +    return !(this->containsNone(s)); // "some" is the negation of "none" for the set
  1.1342 +}
  1.1343 +
  1.1344 +inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
  1.1345 +    return !(this->containsNone(s)); // "some" is the negation of "none" for the string
  1.1346 +}
  1.1347 +
  1.1348 +U_NAMESPACE_END
  1.1349 +
  1.1350 +#endif