os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/uniset.h
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 ***************************************************************************
     3 * Copyright (C) 1999-2005, International Business Machines Corporation
     4 * and others. All Rights Reserved.
     5 ***************************************************************************
     6 *   Date        Name        Description
     7 *   10/20/99    alan        Creation.
     8 ***************************************************************************
     9 */
    10 
    11 #ifndef UNICODESET_H
    12 #define UNICODESET_H
    13 
    14 #include "unicode/unifilt.h"
    15 #include "unicode/unistr.h"
    16 #include "unicode/uset.h"
    17 
    18 /**
    19  * \file 
    20  * \brief C++ API: Unicode Set
    21  */
    22  
    23 U_NAMESPACE_BEGIN
    24 
    25 class ParsePosition;
    26 class SymbolTable;
    27 class UVector;
    28 class RuleCharacterIterator;
    29 
    30 /**
    31  * A mutable set of Unicode characters and multicharacter strings.  Objects of this class
    32  * represent <em>character classes</em> used in regular expressions.
    33  * A character class specifies a subset of Unicode code points.  Legal
    34  * code points are U+0000 to U+10FFFF, inclusive.
    35  *
    36  * <p>The UnicodeSet class is not designed to be subclassed.
    37  *
    38  * <p><code>UnicodeSet</code> supports two APIs. The first is the
    39  * <em>operand</em> API that allows the caller to modify the value of
    40  * a <code>UnicodeSet</code> object. It conforms to Java 2's
    41  * <code>java.util.Set</code> interface, although
    42  * <code>UnicodeSet</code> does not actually implement that
    43  * interface. All methods of <code>Set</code> are supported, with the
    44  * modification that they take a character range or single character
    45  * instead of an <code>Object</code>, and they take a
    46  * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
    47  * operand API may be thought of in terms of boolean logic: a boolean
    48  * OR is implemented by <code>add</code>, a boolean AND is implemented
    49  * by <code>retain</code>, a boolean XOR is implemented by
    50  * <code>complement</code> taking an argument, and a boolean NOT is
    51  * implemented by <code>complement</code> with no argument.  In terms
    52  * of traditional set theory function names, <code>add</code> is a
    53  * union, <code>retain</code> is an intersection, <code>remove</code>
    54  * is an asymmetric difference, and <code>complement</code> with no
    55  * argument is a set complement with respect to the superset range
    56  * <code>MIN_VALUE-MAX_VALUE</code>.
    57  *
    58  * <p>The second API is the
    59  * <code>applyPattern()</code>/<code>toPattern()</code> API from the
    60  * <code>java.text.Format</code>-derived classes.  Unlike the
    61  * methods that add characters, add categories, and control the logic
    62  * of the set, the method <code>applyPattern()</code> sets all
    63  * attributes of a <code>UnicodeSet</code> at once, based on a
    64  * string pattern.
    65  *
    66  * <p><b>Pattern syntax</b></p>
    67  *
    68  * Patterns are accepted by the constructors and the
    69  * <code>applyPattern()</code> methods and returned by the
    70  * <code>toPattern()</code> method.  These patterns follow a syntax
    71  * similar to that employed by version 8 regular expression character
    72  * classes.  Here are some simple examples:
    73  *
    74  * \htmlonly<blockquote>\endhtmlonly
    75  *   <table>
    76  *     <tr align="top">
    77  *       <td nowrap valign="top" align="left"><code>[]</code></td>
    78  *       <td valign="top">No characters</td>
    79  *     </tr><tr align="top">
    80  *       <td nowrap valign="top" align="left"><code>[a]</code></td>
    81  *       <td valign="top">The character 'a'</td>
    82  *     </tr><tr align="top">
    83  *       <td nowrap valign="top" align="left"><code>[ae]</code></td>
    84  *       <td valign="top">The characters 'a' and 'e'</td>
    85  *     </tr>
    86  *     <tr>
    87  *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>
    88  *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
    89  *       point order</td>
    90  *     </tr>
    91  *     <tr>
    92  *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
    93  *       <td valign="top">The character U+4E01</td>
    94  *     </tr>
    95  *     <tr>
    96  *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
    97  *       <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
    98  *       &quot;ac&quot;</td>
    99  *     </tr>
   100  *     <tr>
   101  *       <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
   102  *       <td valign="top">All characters in the general category Uppercase Letter</td>
   103  *     </tr>
   104  *   </table>
   105  * \htmlonly</blockquote>\endhtmlonly
   106  *
   107  * Any character may be preceded by a backslash in order to remove any special
   108  * meaning.  White space characters, as defined by UCharacter.isWhitespace(), are
   109  * ignored, unless they are escaped.
   110  *
   111  * <p>Property patterns specify a set of characters having a certain
   112  * property as defined by the Unicode standard.  Both the POSIX-like
   113  * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized.  For a
   114  * complete list of supported property patterns, see the User's Guide
   115  * for UnicodeSet at
   116  * <a href="http://icu.sourceforge.net/userguide/unicodeSet.html">
   117  * http://icu.sourceforge.net/userguide/unicodeSet.html</a>.
   118  * Actual determination of property data is defined by the underlying
   119  * Unicode database as implemented by UCharacter.
   120  *
   121  * <p>Patterns specify individual characters, ranges of characters, and
   122  * Unicode property sets.  When elements are concatenated, they
   123  * specify their union.  To complement a set, place a '^' immediately
   124  * after the opening '['.  Property patterns are inverted by modifying
   125  * their delimiters; "[:^foo:]" and "\\P{foo}".  In any other location,
   126  * '^' has no special meaning.
   127  *
   128  * <p>Ranges are indicated by placing a '-' between two
   129  * characters, as in "a-z".  This specifies the range of all
   130  * characters from the left to the right, in Unicode order.  If the
   131  * left character is greater than or equal to the
   132  * right character it is a syntax error.  If a '-' occurs as the first
   133  * character after the opening '[' or '[^', or if it occurs as the
   134  * last character before the closing ']', then it is taken as a
   135  * literal.  Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
   136  * set of three characters, 'a', 'b', and '-'.
   137  *
   138  * <p>Sets may be intersected using the '&' operator or the asymmetric
   139  * set difference may be taken using the '-' operator, for example,
   140  * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
   141  * with values less than 4096.  Operators ('&' and '-') have equal
   142  * precedence and bind left-to-right.  Thus
   143  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
   144  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
   145  * difference; intersection is commutative.
   146  *
   147  * <table>
   148  * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
   149  * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
   150  * through 'z' and all letters in between, in Unicode order
   151  * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
   152  * all characters but 'a' through 'z',
   153  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
   154  * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
   155  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
   156  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
   157  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
   158  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
   159  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
   160  * <em>pat2</em>
   161  * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
   162  * <td>The set of characters having the specified
   163  * Unicode property; in
   164  * this case, Unicode uppercase letters
   165  * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
   166  * <td>The set of characters <em>not</em> having the given
   167  * Unicode property
   168  * </table>
   169  *
   170  * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
   171  *
   172  * <p><b>Formal syntax</b></p>
   173  *
   174  * \htmlonly<blockquote>\endhtmlonly
   175  *   <table>
   176  *     <tr align="top">
   177  *       <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
   178  *       <td valign="top"><code>('[' '^'? item* ']') |
   179  *       property</code></td>
   180  *     </tr>
   181  *     <tr align="top">
   182  *       <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>
   183  *       <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
   184  *       </code></td>
   185  *     </tr>
   186  *     <tr align="top">
   187  *       <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>
   188  *       <td valign="top"><code>pattern | pattern-expr pattern |
   189  *       pattern-expr op pattern<br>
   190  *       </code></td>
   191  *     </tr>
   192  *     <tr align="top">
   193  *       <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>
   194  *       <td valign="top"><code>'&amp;' | '-'<br>
   195  *       </code></td>
   196  *     </tr>
   197  *     <tr align="top">
   198  *       <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
   199  *       <td valign="top"><code>'[' | ']' | '-'<br>
   200  *       </code></td>
   201  *     </tr>
   202  *     <tr align="top">
   203  *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
   204  *       <td valign="top"><em>any character that is not</em><code> special<br>
   205  *       | ('\' </code><em>any character</em><code>)<br>
   206  *       | ('\\u' hex hex hex hex)<br>
   207  *       </code></td>
   208  *     </tr>
   209  *     <tr align="top">
   210  *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
   211  *       <td valign="top"><em>any character for which
   212  *       </em><code>Character.digit(c, 16)</code><em>
   213  *       returns a non-negative result</em></td>
   214  *     </tr>
   215  *     <tr>
   216  *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
   217  *       <td valign="top"><em>a Unicode property set pattern</em></td>
   218  *     </tr>
   219  *   </table>
   220  *   <br>
   221  *   <table border="1">
   222  *     <tr>
   223  *       <td>Legend: <table>
   224  *         <tr>
   225  *           <td nowrap valign="top"><code>a := b</code></td>
   226  *           <td width="20" valign="top">&nbsp; </td>
   227  *           <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
   228  *         </tr>
   229  *         <tr>
   230  *           <td nowrap valign="top"><code>a?</code></td>
   231  *           <td valign="top"></td>
   232  *           <td valign="top">zero or one instance of <code>a</code><br>
   233  *           </td>
   234  *         </tr>
   235  *         <tr>
   236  *           <td nowrap valign="top"><code>a*</code></td>
   237  *           <td valign="top"></td>
   238  *           <td valign="top">zero or more instances of <code>a</code><br>
   239  *           </td>
   240  *         </tr>
   241  *         <tr>
   242  *           <td nowrap valign="top"><code>a | b</code></td>
   243  *           <td valign="top"></td>
   244  *           <td valign="top">either <code>a</code> or <code>b</code><br>
   245  *           </td>
   246  *         </tr>
   247  *         <tr>
   248  *           <td nowrap valign="top"><code>'a'</code></td>
   249  *           <td valign="top"></td>
   250  *           <td valign="top">the literal string between the quotes </td>
   251  *         </tr>
   252  *       </table>
   253  *       </td>
   254  *     </tr>
   255  *   </table>
   256  * \htmlonly</blockquote>\endhtmlonly
   257  *
   258  * @author Alan Liu
   259  * @stable ICU 2.0
   260  */
   261 class U_COMMON_API UnicodeSet : public UnicodeFilter {
   262 
   263     int32_t len; // length of list used; 0 <= len <= capacity
   264     int32_t capacity; // capacity of list
   265     int32_t bufferCapacity; // capacity of buffer
   266     UChar32* list; // MUST be terminated with HIGH
   267     UChar32* buffer; // internal buffer, may be NULL
   268 
   269     UVector* strings; // maintained in sorted order
   270 
   271     /**
   272      * The pattern representation of this set.  This may not be the
   273      * most economical pattern.  It is the pattern supplied to
   274      * applyPattern(), with variables substituted and whitespace
   275      * removed.  For sets constructed without applyPattern(), or
   276      * modified using the non-pattern API, this string will be empty,
   277      * indicating that toPattern() must generate a pattern
   278      * representation from the inversion list.
   279      */
   280     UnicodeString pat;
   281 
   282 public:
   283 
   284     enum {
   285         /**
   286          * Minimum value that can be stored in a UnicodeSet.
   287          * @stable ICU 2.4
   288          */
   289         MIN_VALUE = 0,
   290 
   291         /**
   292          * Maximum value that can be stored in a UnicodeSet.
   293          * @stable ICU 2.4
   294          */
   295         MAX_VALUE = 0x10ffff
   296     };
   297 
   298     //----------------------------------------------------------------
   299     // Constructors &c
   300     //----------------------------------------------------------------
   301 
   302 public:
   303 
   304     /**
   305      * Constructs an empty set.
   306      * @stable ICU 2.0
   307      */
   308     UnicodeSet();
   309 
   310     /**
   311      * Constructs a set containing the given range. If <code>start >
   312      * end</code> then an empty set is created.
   313      *
   314      * @param start first character, inclusive, of range
   315      * @param end last character, inclusive, of range
   316      * @stable ICU 2.4
   317      */
   318     UnicodeSet(UChar32 start, UChar32 end);
   319 
   320     /**
   321      * Constructs a set from the given pattern.  See the class
   322      * description for the syntax of the pattern language.
   323      * @param pattern a string specifying what characters are in the set
   324      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   325      * contains a syntax error.
   326      * @stable ICU 2.0
   327      */
   328     UnicodeSet(const UnicodeString& pattern,
   329                UErrorCode& status);
   330 
   331     /**
   332      * Constructs a set from the given pattern.  See the class
   333      * description for the syntax of the pattern language.
   334      * @param pattern a string specifying what characters are in the set
   335      * @param options bitmask for options to apply to the pattern.
   336      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   337      * @param symbols a symbol table mapping variable names to values
   338      * and stand-in characters to UnicodeSets; may be NULL
   339      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   340      * contains a syntax error.
   341      * @internal
   342      */
   343     UnicodeSet(const UnicodeString& pattern,
   344                uint32_t options,
   345                const SymbolTable* symbols,
   346                UErrorCode& status);
   347 
   348     /**
   349      * Constructs a set from the given pattern.  See the class description
   350      * for the syntax of the pattern language.
   351      * @param pattern a string specifying what characters are in the set
   352      * @param pos on input, the position in pattern at which to start parsing.
   353      * On output, the position after the last character parsed.
   354      * @param options bitmask for options to apply to the pattern.
   355      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   356      * @param symbols a symbol table mapping variable names to values
   357      * and stand-in characters to UnicodeSets; may be NULL
   358      * @param status input-output error code
   359      * @stable ICU 2.8
   360      */
   361     UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
   362                uint32_t options,
   363                const SymbolTable* symbols,
   364                UErrorCode& status);
   365 
   366 #ifdef U_USE_UNICODESET_DEPRECATES
   367     /**
   368      * Obsolete: Constructs a set from the given Unicode character category.
   369      * @param category an integer indicating the character category as
   370      * defined in uchar.h.
   371      * @obsolete ICU 2.6. Use a pattern with the category instead since this API will be removed in that release.
   372      */
   373     UnicodeSet(int8_t category, UErrorCode& status);
   374 #endif
   375 
   376     /**
   377      * Constructs a set that is identical to the given UnicodeSet.
   378      * @stable ICU 2.0
   379      */
   380     UnicodeSet(const UnicodeSet& o);
   381 
   382     /**
   383      * Destructs the set.
   384      * @stable ICU 2.0
   385      */
   386     virtual ~UnicodeSet();
   387 
   388     /**
   389      * Assigns this object to be a copy of another.
   390      * @stable ICU 2.0
   391      */
   392     UnicodeSet& operator=(const UnicodeSet& o);
   393 
   394     /**
   395      * Compares the specified object with this set for equality.  Returns
   396      * <tt>true</tt> if the two sets
   397      * have the same size, and every member of the specified set is
   398      * contained in this set (or equivalently, every member of this set is
   399      * contained in the specified set).
   400      *
   401      * @param o set to be compared for equality with this set.
   402      * @return <tt>true</tt> if the specified set is equal to this set.
   403      * @stable ICU 2.0
   404      */
   405     virtual UBool operator==(const UnicodeSet& o) const;
   406 
   407     /**
   408      * Compares the specified object with this set for inequality.  Returns
   409      * <tt>true</tt> if the specified set is not equal to this set.
   410      * @stable ICU 2.0
   411      */
   412     UBool operator!=(const UnicodeSet& o) const;
   413 
   414     /**
   415      * Returns a copy of this object.  All UnicodeFunctor objects have
   416      * to support cloning in order to allow classes using
   417      * UnicodeFunctors, such as Transliterator, to implement cloning.
   418      * @stable ICU 2.0
   419      */
   420     virtual UnicodeFunctor* clone() const;
   421 
   422     /**
   423      * Returns the hash code value for this set.
   424      *
   425      * @return the hash code value for this set.
   426      * @see Object#hashCode()
   427      * @stable ICU 2.0
   428      */
   429     virtual int32_t hashCode(void) const;
   430 
   431     //----------------------------------------------------------------
   432     // Public API
   433     //----------------------------------------------------------------
   434 
   435     /**
   436      * Make this object represent the range <code>start - end</code>.
   437      * If <code>start > end</code> then this object is set to an
   438      * empty range.
   439      *
   440      * @param start first character in the set, inclusive
   441      * @param end last character in the set, inclusive
   442      * @stable ICU 2.4
   443      */
   444     UnicodeSet& set(UChar32 start, UChar32 end);
   445 
   446     /**
   447      * Return true if the given position, in the given pattern, appears
   448      * to be the start of a UnicodeSet pattern.
   449      * @stable ICU 2.4
   450      */
   451     static UBool resemblesPattern(const UnicodeString& pattern,
   452                                   int32_t pos);
   453 
   454     /**
   455      * Modifies this set to represent the set specified by the given
   456      * pattern, optionally ignoring white space.  See the class
   457      * description for the syntax of the pattern language.
   458      * @param pattern a string specifying what characters are in the set
   459      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   460      * contains a syntax error.
   461      * <em> Empties the set passed before applying the pattern.</em>
   462      * @return a reference to this
   463      * @stable ICU 2.0
   464      */
   465     UnicodeSet& applyPattern(const UnicodeString& pattern,
   466                              UErrorCode& status);
   467 
   468     /**
   469      * Modifies this set to represent the set specified by the given
   470      * pattern, optionally ignoring white space.  See the class
   471      * description for the syntax of the pattern language.
   472      * @param pattern a string specifying what characters are in the set
   473      * @param options bitmask for options to apply to the pattern.
   474      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   475      * @param symbols a symbol table mapping variable names to
   476      * values and stand-ins to UnicodeSets; may be NULL
   477      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   478      * contains a syntax error.
   479      *<em> Empties the set passed before applying the pattern.</em>
   480      * @return a reference to this
   481      * @internal
   482      */
   483     UnicodeSet& applyPattern(const UnicodeString& pattern,
   484                              uint32_t options,
   485                              const SymbolTable* symbols,
   486                              UErrorCode& status);
   487 
   488     /**
   489      * Parses the given pattern, starting at the given position.  The
   490      * character at pattern.charAt(pos.getIndex()) must be '[', or the
   491      * parse fails.  Parsing continues until the corresponding closing
   492      * ']'.  If a syntax error is encountered between the opening and
   493      * closing bracket, the parse fails.  Upon return from a successful
   494      * parse, the ParsePosition is updated to point to the character
   495      * following the closing ']', and this set represents the parsed
   496      * pattern; a reference to this set is returned.  This method calls
   497      * itself recursively to parse embedded subpatterns.
   498      *<em> Empties the set passed before applying the pattern.</em>
   499      *
   500      * @param pattern the string containing the pattern to be parsed.
   501      * The portion of the string from pos.getIndex(), which must be a
   502      * '[', to the corresponding closing ']', is parsed.
   503      * @param pos upon entry, the position at which to begin parsing.
   504      * The character at pattern.charAt(pos.getIndex()) must be a '['.
   505      * Upon return from a successful parse, pos.getIndex() is either
   506      * the character after the closing ']' of the parsed pattern, or
   507      * pattern.length() if the closing ']' is the last character of
   508      * the pattern string.
   509      * @param options bitmask for options to apply to the pattern.
   510      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   511      * @param symbols a symbol table mapping variable names to
   512      * values and stand-ins to UnicodeSets; may be NULL
   513      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   514      * contains a syntax error.
   515      * @return a reference to this
   516      * @stable ICU 2.8
   517      */
   518     UnicodeSet& applyPattern(const UnicodeString& pattern,
   519                              ParsePosition& pos,
   520                              uint32_t options,
   521                              const SymbolTable* symbols,
   522                              UErrorCode& status);
   523 
   524     /**
   525      * Returns a string representation of this set.  If the result of
   526      * calling this function is passed to a UnicodeSet constructor, it
   527      * will produce another set that is equal to this one.
   528      * @param result the string to receive the rules.  Previous
   529      * contents will be deleted.
   530      * @param escapeUnprintable if TRUE then convert unprintable
   531      * characters to their hex escape representations, \\uxxxx or
   532      * \\Uxxxxxxxx.  Unprintable characters are those other than
   533      * U+000A, U+0020..U+007E.
   534      * @stable ICU 2.0
   535      */
   536     virtual UnicodeString& toPattern(UnicodeString& result,
   537                              UBool escapeUnprintable = FALSE) const;
   538 
   539     /**
   540      * Modifies this set to contain those code points which have the given value
   541      * for the given binary or enumerated property, as returned by
   542      * u_getIntPropertyValue.  Prior contents of this set are lost.
   543      *
   544      * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
   545      * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
   546      * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
   547      *
   548      * @param value a value in the range u_getIntPropertyMinValue(prop)..
   549      * u_getIntPropertyMaxValue(prop), with one exception.  If prop is
   550      * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
   551      * rather a mask value produced by U_GET_GC_MASK().  This allows grouped
   552      * categories such as [:L:] to be represented.
   553      *
   554      * @param ec error code input/output parameter
   555      *
   556      * @return a reference to this set
   557      *
   558      * @stable ICU 2.4
   559      */
   560     UnicodeSet& applyIntPropertyValue(UProperty prop,
   561                                       int32_t value,
   562                                       UErrorCode& ec);
   563 
   564     /**
   565      * Modifies this set to contain those code points which have the
   566      * given value for the given property.  Prior contents of this
   567      * set are lost.
   568      *
   569      * @param prop a property alias, either short or long.  The name is matched
   570      * loosely.  See PropertyAliases.txt for names and a description of loose
   571      * matching.  If the value string is empty, then this string is interpreted
   572      * as either a General_Category value alias, a Script value alias, a binary
   573      * property alias, or a special ID.  Special IDs are matched loosely and
   574      * correspond to the following sets:
   575      *
   576      * "ANY" = [\\u0000-\\U0010FFFF],
   577      * "ASCII" = [\\u0000-\\u007F],
   578      * "Assigned" = [:^Cn:].
   579      *
   580      * @param value a value alias, either short or long.  The name is matched
   581      * loosely.  See PropertyValueAliases.txt for names and a description of
   582      * loose matching.  In addition to aliases listed, numeric values and
   583      * canonical combining classes may be expressed numerically, e.g., ("nv",
   584      * "0.5") or ("ccc", "220").  The value string may also be empty.
   585      *
   586      * @param ec error code input/output parameter
   587      *
   588      * @return a reference to this set
   589      *
   590      * @stable ICU 2.4
   591      */
   592     UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
   593                                    const UnicodeString& value,
   594                                    UErrorCode& ec);
   595 
   596     /**
   597      * Returns the number of elements in this set (its cardinality).
   598      * Note that the elements of a set may include both individual
   599      * codepoints and strings.
   600      *
   601      * @return the number of elements in this set (its cardinality).
   602      * @stable ICU 2.0
   603      */
   604     virtual int32_t size(void) const;
   605 
   606     /**
   607      * Returns <tt>true</tt> if this set contains no elements.
   608      *
   609      * @return <tt>true</tt> if this set contains no elements.
   610      * @stable ICU 2.0
   611      */
   612     virtual UBool isEmpty(void) const;
   613 
   614     /**
   615      * Returns true if this set contains the given character.
   616      * @param c character to be checked for containment
   617      * @return true if the test condition is met
   618      * @stable ICU 2.0
   619      */
   620     virtual UBool contains(UChar32 c) const;
   621 
   622     /**
   623      * Returns true if this set contains every character
   624      * of the given range.
   625      * @param start first character, inclusive, of the range
   626      * @param end last character, inclusive, of the range
   627      * @return true if the test condition is met
   628      * @stable ICU 2.0
   629      */
   630     virtual UBool contains(UChar32 start, UChar32 end) const;
   631 
   632     /**
   633      * Returns <tt>true</tt> if this set contains the given
   634      * multicharacter string.
   635      * @param s string to be checked for containment
   636      * @return <tt>true</tt> if this set contains the specified string
   637      * @stable ICU 2.4
   638      */
   639     UBool contains(const UnicodeString& s) const;
   640 
   641     /**
   642      * Returns true if this set contains all the characters and strings
   643      * of the given set.
   644      * @param c set to be checked for containment
   645      * @return true if the test condition is met
   646      * @stable ICU 2.4
   647      */
   648     virtual UBool containsAll(const UnicodeSet& c) const;
   649 
   650     /**
   651      * Returns true if this set contains all the characters
   652      * of the given string.
   653      * @param s string containing characters to be checked for containment
   654      * @return true if the test condition is met
   655      * @stable ICU 2.4
   656      */
   657     UBool containsAll(const UnicodeString& s) const;
   658 
   659     /**
   660      * Returns true if this set contains none of the characters
   661      * of the given range.
   662      * @param start first character, inclusive, of the range
   663      * @param end last character, inclusive, of the range
   664      * @return true if the test condition is met
   665      * @stable ICU 2.4
   666      */
   667     UBool containsNone(UChar32 start, UChar32 end) const;
   668 
   669     /**
   670      * Returns true if this set contains none of the characters and strings
   671      * of the given set.
   672      * @param c set to be checked for containment
   673      * @return true if the test condition is met
   674      * @stable ICU 2.4
   675      */
   676     UBool containsNone(const UnicodeSet& c) const;
   677 
   678     /**
   679      * Returns true if this set contains none of the characters
   680      * of the given string.
   681      * @param s string containing characters to be checked for containment
   682      * @return true if the test condition is met
   683      * @stable ICU 2.4
   684      */
   685     UBool containsNone(const UnicodeString& s) const;
   686 
   687     /**
   688      * Returns true if this set contains one or more of the characters
   689      * in the given range.
   690      * @param start first character, inclusive, of the range
   691      * @param end last character, inclusive, of the range
   692      * @return true if the condition is met
   693      * @stable ICU 2.4
   694      */
   695     inline UBool containsSome(UChar32 start, UChar32 end) const;
   696 
   697     /**
   698      * Returns true if this set contains one or more of the characters
   699      * and strings of the given set.
   700      * @param s The set to be checked for containment
   701      * @return true if the condition is met
   702      * @stable ICU 2.4
   703      */
   704     inline UBool containsSome(const UnicodeSet& s) const;
   705 
   706     /**
   707      * Returns true if this set contains one or more of the characters
   708      * of the given string.
   709      * @param s string containing characters to be checked for containment
   710      * @return true if the condition is met
   711      * @stable ICU 2.4
   712      */
   713     inline UBool containsSome(const UnicodeString& s) const;
   714 
   715     /**
   716      * Implement UnicodeMatcher::matches()
   717      * @stable ICU 2.4
   718      */
   719     virtual UMatchDegree matches(const Replaceable& text,
   720                          int32_t& offset,
   721                          int32_t limit,
   722                          UBool incremental);
   723 
   724 private:
   725     /**
   726      * Returns the longest match for s in text at the given position.
   727      * If limit > start then match forward from start+1 to limit
   728      * matching all characters except s.charAt(0).  If limit < start,
   729      * go backward starting from start-1 matching all characters
   730      * except s.charAt(s.length()-1).  This method assumes that the
   731      * first character, text.charAt(start), matches s, so it does not
   732      * check it.
   733      * @param text the text to match
   734      * @param start the first character to match.  In the forward
   735      * direction, text.charAt(start) is matched against s.charAt(0).
   736      * In the reverse direction, it is matched against
   737      * s.charAt(s.length()-1).
   738      * @param limit the limit offset for matching, either last+1 in
   739      * the forward direction, or last-1 in the reverse direction,
   740      * where last is the index of the last character to match.
   741      * @return If part of s matches up to the limit, return |limit -
   742      * start|.  If all of s matches before reaching the limit, return
   743      * s.length().  If there is a mismatch between s and text, return
   744      * 0
   745      */
   746     static int32_t matchRest(const Replaceable& text,
   747                              int32_t start, int32_t limit,
   748                              const UnicodeString& s);
   749 
   750     /**
   751      * Returns the smallest value i such that c < list[i].  Caller
   752      * must ensure that c is a legal value or this method will enter
   753      * an infinite loop.  This method performs a binary search.
   754      * @param c a character in the range MIN_VALUE..MAX_VALUE
   755      * inclusive
   756      * @return the smallest integer i in the range 0..len-1,
   757      * inclusive, such that c < list[i]
   758      */
   759     int32_t findCodePoint(UChar32 c) const;
   760 
   761 public:
   762 
   763     /**
   764      * Implementation of UnicodeMatcher API.  Union the set of all
   765      * characters that may be matched by this object into the given
   766      * set.
   767      * @param toUnionTo the set into which to union the source characters
   768      * @stable ICU 2.4
   769      */
   770     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
   771 
   772     /**
   773      * Returns the index of the given character within this set, where
   774      * the set is ordered by ascending code point.  If the character
   775      * is not in this set, return -1.  The inverse of this method is
   776      * <code>charAt()</code>.
   777      * @return an index from 0..size()-1, or -1
   778      * @stable ICU 2.4
   779      */
   780     int32_t indexOf(UChar32 c) const;
   781 
   782     /**
   783      * Returns the character at the given index within this set, where
   784      * the set is ordered by ascending code point.  If the index is
   785      * out of range, return (UChar32)-1.  The inverse of this method is
   786      * <code>indexOf()</code>.
   787      * @param index an index from 0..size()-1
   788      * @return the character at the given index, or (UChar32)-1.
   789      * @stable ICU 2.4
   790      */
   791     UChar32 charAt(int32_t index) const;
   792 
   793     /**
   794      * Adds the specified range to this set if it is not already
   795      * present.  If this set already contains the specified range,
   796      * the call leaves this set unchanged.  If <code>start > end</code>
   797      * then an empty range is added, leaving the set unchanged.
   798      * This is equivalent to a boolean logic OR, or a set UNION.
   799      *
   800      * @param start first character, inclusive, of range to be added
   801      * to this set.
   802      * @param end last character, inclusive, of range to be added
   803      * to this set.
   804      * @stable ICU 2.0
   805      */
   806     virtual UnicodeSet& add(UChar32 start, UChar32 end);
   807 
   808     /**
   809      * Adds the specified character to this set if it is not already
   810      * present.  If this set already contains the specified character,
   811      * the call leaves this set unchanged.
   812      * @stable ICU 2.0
   813      */
   814     UnicodeSet& add(UChar32 c);
   815 
   816     /**
   817      * Adds the specified multicharacter string to this set if it is not already
   818      * present.  If this set already contains the multicharacter string,
   819      * the call leaves this set unchanged.
   820      * Thus "ch" => {"ch"}
   821      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   822      * @param s the source string
   823      * @return this object, for chaining
   824      * @stable ICU 2.4
   825      */
   826     UnicodeSet& add(const UnicodeString& s);
   827 
   828  private:
   829     /**
   830      * @return a code point IF the string consists of a single one.
   831      * otherwise returns -1.
   832      * @param s the string to test
   833      */
   834     static int32_t getSingleCP(const UnicodeString& s);
   835 
   836     void _add(const UnicodeString& s);
   837 
   838  public:
   839     /**
   840      * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
   841      * If this set already contains any particular character, it has no effect on that character.
   842      * @param s the source string
   843      * @return this object, for chaining
   844      * @stable ICU 2.4
   845      */
   846     UnicodeSet& addAll(const UnicodeString& s);
   847 
   848     /**
   849      * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
   850      * If this set already contains any particular character, it has no effect on that character.
   851      * @param s the source string
   852      * @return this object, for chaining
   853      * @stable ICU 2.4
   854      */
   855     UnicodeSet& retainAll(const UnicodeString& s);
   856 
   857     /**
   858      * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
   859      * If this set already contains any particular character, that character is removed; otherwise it is added.
   860      * @param s the source string
   861      * @return this object, for chaining
   862      * @stable ICU 2.4
   863      */
   864     UnicodeSet& complementAll(const UnicodeString& s);
   865 
   866     /**
   867      * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
   868      * If this set does not contain any particular character, it has no effect on that character.
   869      * @param s the source string
   870      * @return this object, for chaining
   871      * @stable ICU 2.4
   872      */
   873     UnicodeSet& removeAll(const UnicodeString& s);
   874 
   875     /**
   876      * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
   877      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   878      * @param s the source string
   879      * @return a newly created set containing the given string.
   880      * The caller owns the return object and is responsible for deleting it.
   881      * @stable ICU 2.4
   882      */
   883     static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
   884 
   885 
   886     /**
   887      * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
   888      * @param s the source string
   889      * @return a newly created set containing the given characters
   890      * The caller owns the return object and is responsible for deleting it.
   891      * @stable ICU 2.4
   892      */
   893     static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
   894 
   895     /**
   896      * Retain only the elements in this set that are contained in the
   897      * specified range.  If <code>start > end</code> then an empty range is
   898      * retained, leaving the set empty.  This is equivalent to
   899      * a boolean logic AND, or a set INTERSECTION.
   900      *
   901      * @param start first character, inclusive, of range to be retained
   902      * in this set.
   903      * @param end last character, inclusive, of range to be retained
   904      * in this set.
   905      * @stable ICU 2.0
   906      */
   907     virtual UnicodeSet& retain(UChar32 start, UChar32 end);
   908 
   909 
   910     /**
   911      * Retain the specified character from this set if it is present.
   912      * @stable ICU 2.0
   913      */
   914     UnicodeSet& retain(UChar32 c);
   915 
   916     /**
   917      * Removes the specified range from this set if it is present.
   918      * The set will not contain the specified range once the call
   919      * returns.  If <code>start > end</code> then an empty range is
   920      * removed, leaving the set unchanged.
   921      *
   922      * @param start first character, inclusive, of range to be removed
   923      * from this set.
   924      * @param end last character, inclusive, of range to be removed
   925      * from this set.
   926      * @stable ICU 2.0
   927      */
   928     virtual UnicodeSet& remove(UChar32 start, UChar32 end);
   929 
   930     /**
   931      * Removes the specified character from this set if it is present.
   932      * The set will not contain the specified character once the call
   933      * returns.
   934      * @stable ICU 2.0
   935      */
   936     UnicodeSet& remove(UChar32 c);
   937 
   938     /**
   939      * Removes the specified string from this set if it is present.
   940      * The set will not contain the specified string once the call
   941      * returns.
   942      * @param s the source string
   943      * @return this object, for chaining
   944      * @stable ICU 2.4
   945      */
   946     UnicodeSet& remove(const UnicodeString& s);
   947 
   948     /**
   949      * Inverts this set.  This operation modifies this set so that
   950      * its value is its complement.  This is equivalent to
   951      * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
   952      * @stable ICU 2.0
   953      */
   954     virtual UnicodeSet& complement(void);
   955 
   956     /**
   957      * Complements the specified range in this set.  Any character in
   958      * the range will be removed if it is in this set, or will be
   959      * added if it is not in this set.  If <code>start > end</code>
   960      * then an empty range is complemented, leaving the set unchanged.
   961      * This is equivalent to a boolean logic XOR.
   962      *
   963      * @param start first character, inclusive, of range to be complemented
   964      * in this set.
   965      * @param end last character, inclusive, of range to be complemented
   966      * in this set.
   967      * @stable ICU 2.0
   968      */
   969     virtual UnicodeSet& complement(UChar32 start, UChar32 end);
   970 
   971     /**
   972      * Complements the specified character in this set.  The character
   973      * will be removed if it is in this set, or will be added if it is
   974      * not in this set.
   975      * @stable ICU 2.0
   976      */
   977     UnicodeSet& complement(UChar32 c);
   978 
   979     /**
   980      * Complement the specified string in this set.
   981      * The string will be removed if it is in this set, or will be
   982      * added if it is not in this set.
   983      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   984      * @param s the string to complement
   985      * @return this object, for chaining
   986      * @stable ICU 2.4
   987      */
   988     UnicodeSet& complement(const UnicodeString& s);
   989 
   990     /**
   991      * Adds all of the elements in the specified set to this set if
   992      * they're not already present.  This operation effectively
   993      * modifies this set so that its value is the <i>union</i> of the two
   994      * sets.  The behavior of this operation is unspecified if the specified
   995      * collection is modified while the operation is in progress.
   996      *
   997      * @param c set whose elements are to be added to this set.
   998      * @see #add(UChar32, UChar32)
   999      * @stable ICU 2.0
  1000      */
  1001     virtual UnicodeSet& addAll(const UnicodeSet& c);
  1002 
  1003     /**
  1004      * Retains only the elements in this set that are contained in the
  1005      * specified set.  In other words, removes from this set all of
  1006      * its elements that are not contained in the specified set.  This
  1007      * operation effectively modifies this set so that its value is
  1008      * the <i>intersection</i> of the two sets.
  1009      *
  1010      * @param c set that defines which elements this set will retain.
  1011      * @stable ICU 2.0
  1012      */
  1013     virtual UnicodeSet& retainAll(const UnicodeSet& c);
  1014 
  1015     /**
  1016      * Removes from this set all of its elements that are contained in the
  1017      * specified set.  This operation effectively modifies this
  1018      * set so that its value is the <i>asymmetric set difference</i> of
  1019      * the two sets.
  1020      *
  1021      * @param c set that defines which elements will be removed from
  1022      *          this set.
  1023      * @stable ICU 2.0
  1024      */
  1025     virtual UnicodeSet& removeAll(const UnicodeSet& c);
  1026 
  1027     /**
  1028      * Complements in this set all elements contained in the specified
  1029      * set.  Any character in the other set will be removed if it is
  1030      * in this set, or will be added if it is not in this set.
  1031      *
  1032      * @param c set that defines which elements will be xor'ed from
  1033      *          this set.
  1034      * @stable ICU 2.4
  1035      */
  1036     virtual UnicodeSet& complementAll(const UnicodeSet& c);
  1037 
  1038     /**
  1039      * Removes all of the elements from this set.  This set will be
  1040      * empty after this call returns.
  1041      * @stable ICU 2.0
  1042      */
  1043     virtual UnicodeSet& clear(void);
  1044 
  1045     /**
  1046      * Close this set over the given attribute.  For the attribute
  1047      * USET_CASE, the result is to modify this set so that:
  1048      *
  1049      * 1. For each character or string 'a' in this set, all strings or
  1050      * characters 'b' such that foldCase(a) == foldCase(b) are added
  1051      * to this set.
  1052      *
  1053      * 2. For each string 'e' in the resulting set, if e !=
  1054      * foldCase(e), 'e' will be removed.
  1055      *
  1056      * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
  1057      *
  1058      * (Here foldCase(x) refers to the operation u_strFoldCase, and a
  1059      * == b denotes that the contents are the same, not pointer
  1060      * comparison.)
  1061      *
  1062      * @param attribute bitmask for attributes to close over.
  1063      * Currently only the USET_CASE bit is supported.  Any undefined bits
  1064      * are ignored.
  1065      * @return a reference to this set.
  1066      * @internal
  1067      */
  1068     UnicodeSet& closeOver(int32_t attribute);
  1069 
  1070     /**
  1071      * Iteration method that returns the number of ranges contained in
  1072      * this set.
  1073      * @see #getRangeStart
  1074      * @see #getRangeEnd
  1075      * @stable ICU 2.4
  1076      */
  1077     virtual int32_t getRangeCount(void) const;
  1078 
  1079     /**
  1080      * Iteration method that returns the first character in the
  1081      * specified range of this set.
  1082      * @see #getRangeCount
  1083      * @see #getRangeEnd
  1084      * @stable ICU 2.4
  1085      */
  1086     virtual UChar32 getRangeStart(int32_t index) const;
  1087 
  1088     /**
  1089      * Iteration method that returns the last character in the
  1090      * specified range of this set.
  1091      * @see #getRangeStart
  1092      * @see #getRangeCount
  1093      * @stable ICU 2.4
  1094      */
  1095     virtual UChar32 getRangeEnd(int32_t index) const;
  1096 
  1097     /**
  1098      * Serializes this set into an array of 16-bit integers.  Serialization
  1099      * (currently) only records the characters in the set; multicharacter
  1100      * strings are ignored.
  1101      *
  1102      * The array has the following format (each line is one 16-bit
  1103      * integer):
  1104      *
  1105      *  length     = (n+2*m) | (m!=0?0x8000:0)
  1106      *  bmpLength  = n; present if m!=0
  1107      *  bmp[0]
  1108      *  bmp[1]
  1109      *  ...
  1110      *  bmp[n-1]
  1111      *  supp-high[0]
  1112      *  supp-low[0]
  1113      *  supp-high[1]
  1114      *  supp-low[1]
  1115      *  ...
  1116      *  supp-high[m-1]
  1117      *  supp-low[m-1]
  1118      *
  1119      * The array starts with a header.  After the header are n bmp
  1120      * code points, then m supplementary code points.  Either n or m
  1121      * or both may be zero.  n+2*m is always <= 0x7FFF.
  1122      *
  1123      * If there are no supplementary characters (if m==0) then the
  1124      * header is one 16-bit integer, 'length', with value n.
  1125      *
  1126      * If there are supplementary characters (if m!=0) then the header
  1127      * is two 16-bit integers.  The first, 'length', has value
  1128      * (n+2*m)|0x8000.  The second, 'bmpLength', has value n.
  1129      *
  1130      * After the header the code points are stored in ascending order.
  1131      * Supplementary code points are stored as most significant 16
  1132      * bits followed by least significant 16 bits.
  1133      *
  1134      * @param dest pointer to buffer of destCapacity 16-bit integers.
  1135      * May be NULL only if destCapacity is zero.
  1136      * @param destCapacity size of dest, or zero.  Must not be negative.
  1137      * @param ec error code.  Will be set to U_INDEX_OUTOFBOUNDS_ERROR
  1138      * if n+2*m > 0x7FFF.  Will be set to U_BUFFER_OVERFLOW_ERROR if
  1139      * n+2*m+(m!=0?2:1) > destCapacity.
  1140      * @return the total length of the serialized format, including
  1141      * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
  1142      * than U_BUFFER_OVERFLOW_ERROR.
  1143      * @stable ICU 2.4
  1144      */
  1145     int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
  1146 
  1147     /**
  1148      * Reallocate this object's internal structures to take up the least
  1149      * possible space, without changing this object's value.
  1150      * @stable ICU 2.4
  1151      */
  1152     virtual UnicodeSet& compact();
  1153 
  1154     /**
  1155      * Return the class ID for this class.  This is useful only for
  1156      * comparing to a return value from getDynamicClassID().  For example:
  1157      * <pre>
  1158      * .      Base* polymorphic_pointer = createPolymorphicObject();
  1159      * .      if (polymorphic_pointer->getDynamicClassID() ==
  1160      * .          Derived::getStaticClassID()) ...
  1161      * </pre>
  1162      * @return          The class ID for all objects of this class.
  1163      * @stable ICU 2.0
  1164      */
  1165     static UClassID U_EXPORT2 getStaticClassID(void);
  1166 
  1167     /**
  1168      * Implement UnicodeFunctor API.
  1169      *
  1170      * @return The class ID for this object. All objects of a given
  1171      * class have the same class ID.  Objects of other classes have
  1172      * different class IDs.
  1173      * @stable ICU 2.4
  1174      */
  1175     virtual UClassID getDynamicClassID(void) const;
  1176 
  1177 private:
  1178 
  1179     // Private API for the USet API
  1180 
  1181     friend class USetAccess;
  1182 
  1183     int32_t getStringCount() const;
  1184 
  1185     const UnicodeString* getString(int32_t index) const;
  1186 
  1187     //----------------------------------------------------------------
  1188     // RuleBasedTransliterator support
  1189     //----------------------------------------------------------------
  1190 
  1191 private:
  1192 
  1193     /**
  1194      * Returns <tt>true</tt> if this set contains any character whose low byte
  1195      * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
  1196      * indexing.
  1197      */
  1198     virtual UBool matchesIndexValue(uint8_t v) const;
  1199 
  1200 private:
  1201 
  1202     //----------------------------------------------------------------
  1203     // Implementation: Pattern parsing
  1204     //----------------------------------------------------------------
  1205 
  1206     void applyPattern(RuleCharacterIterator& chars,
  1207                       const SymbolTable* symbols,
  1208                       UnicodeString& rebuiltPat,
  1209                       uint32_t options,
  1210                       UErrorCode& ec);
  1211 
  1212     //----------------------------------------------------------------
  1213     // Implementation: Utility methods
  1214     //----------------------------------------------------------------
  1215 
  1216     void ensureCapacity(int32_t newLen);
  1217 
  1218     void ensureBufferCapacity(int32_t newLen);
  1219 
  1220     void swapBuffers(void);
  1221 
  1222     UBool allocateStrings();
  1223 
  1224     UnicodeString& _toPattern(UnicodeString& result,
  1225                               UBool escapeUnprintable) const;
  1226 
  1227     UnicodeString& _generatePattern(UnicodeString& result,
  1228                                     UBool escapeUnprintable) const;
  1229 
  1230     static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
  1231 
  1232     static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
  1233 
  1234     //----------------------------------------------------------------
  1235     // Implementation: Fundamental operators
  1236     //----------------------------------------------------------------
  1237 
  1238     void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
  1239 
  1240     void add(const UChar32* other, int32_t otherLen, int8_t polarity);
  1241 
  1242     void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
  1243 
  1244     /**
  1245      * Return true if the given position, in the given pattern, appears
  1246      * to be the start of a property set pattern [:foo:], \\p{foo}, or
  1247      * \\P{foo}, or \\N{name}.
  1248      */
  1249     static UBool resemblesPropertyPattern(const UnicodeString& pattern,
  1250                                           int32_t pos);
  1251 
  1252     static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
  1253                                           int32_t iterOpts);
  1254 
  1255     /**
  1256      * Parse the given property pattern at the given parse position
  1257      * and set this UnicodeSet to the result.
  1258      *
  1259      * The original design document is out of date, but still useful.
  1260      * Ignore the property and value names:
  1261      * http://dev.icu-project.org/cgi-bin/viewcvs.cgi/~checkout~/icuhtml/design/unicodeset_properties.html
  1262      *
  1263      * Recognized syntax:
  1264      *
  1265      * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
  1266      * \\p{foo} \\P{foo}  - white space not allowed within "\\p" or "\\P"
  1267      * \\N{name}         - white space not allowed within "\\N"
  1268      *
  1269      * Other than the above restrictions, white space is ignored.  Case
  1270      * is ignored except in "\\p" and "\\P" and "\\N".  In 'name' leading
  1271      * and trailing space is deleted, and internal runs of whitespace
  1272      * are collapsed to a single space.
  1273      *
  1274      * We support binary properties, enumerated properties, and the
  1275      * following non-enumerated properties:
  1276      *
  1277      *  Numeric_Value
  1278      *  Name
  1279      *  Unicode_1_Name
  1280      *
  1281      * @param pattern the pattern string
  1282      * @param ppos on entry, the position at which to begin parsing.
  1283      * This should be one of the locations marked '^':
  1284      *
  1285      *   [:blah:]     \\p{blah}     \\P{blah}     \\N{name}
  1286      *   ^       %    ^       %    ^       %    ^       %
  1287      *
  1288      * On return, the position after the last character parsed, that is,
  1289      * the locations marked '%'.  If the parse fails, ppos is returned
  1290      * unchanged.
  1291      * @return a reference to this.
  1292      */
  1293     UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
  1294                                      ParsePosition& ppos,
  1295                                      UErrorCode &ec);
  1296 
  1297     void applyPropertyPattern(RuleCharacterIterator& chars,
  1298                               UnicodeString& rebuiltPat,
  1299                               UErrorCode& ec);
  1300 
  1301     /**
  1302      * A filter that returns TRUE if the given code point should be
  1303      * included in the UnicodeSet being constructed.
  1304      */
  1305     typedef UBool (*Filter)(UChar32 codePoint, void* context);
  1306 
  1307     /**
  1308      * Given a filter, set this UnicodeSet to the code points
  1309      * contained by that filter.  The filter MUST be
  1310      * property-conformant.  That is, if it returns value v for one
  1311      * code point, then it must return v for all affiliated code
  1312      * points, as defined by the inclusions list.  See
  1313      * getInclusions().
  1314      * src is a UPropertySource value.
  1315      */
  1316     void applyFilter(Filter filter,
  1317                      void* context,
  1318                      int32_t src,
  1319                      UErrorCode &status);
  1320 
  1321     /**
  1322      * Return a cached copy of the inclusions list for the property source.
  1323      */
  1324     static const UnicodeSet* getInclusions(int32_t src, UErrorCode &errorCode);
  1325 
  1326     friend class UnicodeSetIterator;
  1327 };
  1328 
  1329 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
  1330     return !this->operator==(o);  // unequal exactly when operator== reports no match
  1331 }
  1332 
  1333 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
  1334     return !this->containsNone(start, end);  // "some" is the negation of "none" for the range
  1335 }
  1336 
  1337 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
  1338     return !this->containsNone(s);  // "some" is the negation of "none" for the given set
  1339 }
  1340 
  1341 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
  1342     return !this->containsNone(s);  // "some" is the negation of "none" for the string's characters
  1343 }
  1344 
  1345 U_NAMESPACE_END
  1346 
  1347 #endif