Symaptic: os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/uniset.h@260cb5ec6c19

     1 /*

     2 ***************************************************************************

     3 * Copyright (C) 1999-2005, International Business Machines Corporation

     4 * and others. All Rights Reserved.

     5 ***************************************************************************

     6 *   Date        Name        Description

     7 *   10/20/99    alan        Creation.

     8 ***************************************************************************

     9 */

    11 #ifndef UNICODESET_H

    12 #define UNICODESET_H

    14 #include "unicode/unifilt.h"

    15 #include "unicode/unistr.h"

    16 #include "unicode/uset.h"

    18 /**

    19  * \file

    20  * \brief C++ API: Unicode Set

    21  */

    23 U_NAMESPACE_BEGIN

    25 class ParsePosition;

    26 class SymbolTable;

    27 class UVector;

    28 class RuleCharacterIterator;

    30 /**

    31  * A mutable set of Unicode characters and multicharacter strings.  Objects of this class

    32  * represent <em>character classes</em> used in regular expressions.

    33  * A character class specifies a subset of Unicode code points.  Legal

    34  * code points are U+0000 to U+10FFFF, inclusive.

    35  *

    36  * <p>The UnicodeSet class is not designed to be subclassed.

    37  *

    38  * <p><code>UnicodeSet</code> supports two APIs. The first is the

    39  * <em>operand</em> API that allows the caller to modify the value of

    40  * a <code>UnicodeSet</code> object. It conforms to Java 2's

    41  * <code>java.util.Set</code> interface, although

    42  * <code>UnicodeSet</code> does not actually implement that

    43  * interface. All methods of <code>Set</code> are supported, with the

    44  * modification that they take a character range or single character

    45  * instead of an <code>Object</code>, and they take a

    46  * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The

    47  * operand API may be thought of in terms of boolean logic: a boolean

    48  * OR is implemented by <code>add</code>, a boolean AND is implemented

    49  * by <code>retain</code>, a boolean XOR is implemented by

    50  * <code>complement</code> taking an argument, and a boolean NOT is

    51  * implemented by <code>complement</code> with no argument.  In terms

    52  * of traditional set theory function names, <code>add</code> is a

    53  * union, <code>retain</code> is an intersection, <code>remove</code>

    54  * is an asymmetric difference, and <code>complement</code> with no

    55  * argument is a set complement with respect to the superset range

    56  * <code>MIN_VALUE-MAX_VALUE</code>

    57  *

    58  * <p>The second API is the

    59  * <code>applyPattern()</code>/<code>toPattern()</code> API from the

    60  * <code>java.text.Format</code>-derived classes.  Unlike the

    61  * methods that add characters, add categories, and control the logic

    62  * of the set, the method <code>applyPattern()</code> sets all

    63  * attributes of a <code>UnicodeSet</code> at once, based on a

    64  * string pattern.

    65  *

    66  * <p><b>Pattern syntax</b></p>

    67  *

    68  * Patterns are accepted by the constructors and the

    69  * <code>applyPattern()</code> methods and returned by the

    70  * <code>toPattern()</code> method.  These patterns follow a syntax

    71  * similar to that employed by version 8 regular expression character

    72  * classes.  Here are some simple examples:

    73  *

    74  * \htmlonly<blockquote>\endhtmlonly

    75  *   <table>

    76  *     <tr align="top">

    77  *       <td nowrap valign="top" align="left"><code>[]</code></td>

    78  *       <td valign="top">No characters</td>

    79  *     </tr><tr align="top">

    80  *       <td nowrap valign="top" align="left"><code>[a]</code></td>

    81  *       <td valign="top">The character 'a'</td>

    82  *     </tr><tr align="top">

    83  *       <td nowrap valign="top" align="left"><code>[ae]</code></td>

    84  *       <td valign="top">The characters 'a' and 'e'</td>

    85  *     </tr>

    86  *     <tr>

    87  *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>

    88  *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code

    89  *       point order</td>

    90  *     </tr>

    91  *     <tr>

    92  *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>

    93  *       <td valign="top">The character U+4E01</td>

    94  *     </tr>

    95  *     <tr>

    96  *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>

    97  *       <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and

    98  *       &quot;ac&quot;</td>

    99  *     </tr>

   100  *     <tr>

   101  *       <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>

   102  *       <td valign="top">All characters in the general category Uppercase Letter</td>

   103  *     </tr>

   104  *   </table>

   105  * \htmlonly</blockquote>\endhtmlonly

   106  *

   107  * Any character may be preceded by a backslash in order to remove any special

   108  * meaning.  White space characters, as defined by UCharacter.isWhitespace(), are

   109  * ignored, unless they are escaped.

   110  *

   111  * <p>Property patterns specify a set of characters having a certain

   112  * property as defined by the Unicode standard.  Both the POSIX-like

   113  * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized.  For a

   114  * complete list of supported property patterns, see the User's Guide

   115  * for UnicodeSet at

   116  * <a href="http://icu.sourceforge.net/userguide/unicodeSet.html">

   117  * http://icu.sourceforge.net/userguide/unicodeSet.html</a>.

   118  * Actual determination of property data is defined by the underlying

   119  * Unicode database as implemented by UCharacter.

   120  *

   121  * <p>Patterns specify individual characters, ranges of characters, and

   122  * Unicode property sets.  When elements are concatenated, they

   123  * specify their union.  To complement a set, place a '^' immediately

   124  * after the opening '['.  Property patterns are inverted by modifying

   125  * their delimiters; "[:^foo:]" and "\\P{foo}".  In any other location,

   126  * '^' has no special meaning.

   127  *

   128  * <p>Ranges are indicated by placing a '-' between two

   129  * characters, as in "a-z".  This specifies the range of all

   130  * characters from the left to the right, in Unicode order.  If the

   131  * left character is greater than or equal to the

   132  * right character it is a syntax error.  If a '-' occurs as the first

   133  * character after the opening '[' or '[^', or if it occurs as the

   134  * last character before the closing ']', then it is taken as a

   135  * literal.  Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same

   136  * set of three characters, 'a', 'b', and '-'.

   137  *

   138  * <p>Sets may be intersected using the '&' operator or the asymmetric

   139  * set difference may be taken using the '-' operator, for example,

   140  * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters

   141  * with values less than 4096.  Operators ('&' and '-') have equal

   142  * precedence and bind left-to-right.  Thus

   143  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to

   144  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for

   145  * difference; intersection is commutative.

   146  *

   147  * <table>

   148  * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'

   149  * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'

   150  * through 'z' and all letters in between, in Unicode order

   151  * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing

   152  * all characters but 'a' through 'z',

   153  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF

   154  * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>

   155  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>

   156  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>

   157  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>

   158  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>

   159  * <td>The asymmetric difference of sets specified by <em>pat1</em> and

   160  * <em>pat2</em>

   161  * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>

   162  * <td>The set of characters having the specified

   163  * Unicode property; in

   164  * this case, Unicode uppercase letters

   165  * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>

   166  * <td>The set of characters <em>not</em> having the given

   167  * Unicode property

   168  * </table>

   169  *

   170  * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>

   171  *

   172  * <p><b>Formal syntax</b></p>

   173  *

   174  * \htmlonly<blockquote>\endhtmlonly

   175  *   <table>

   176  *     <tr align="top">

   177  *       <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>

   178  *       <td valign="top"><code>('[' '^'? item* ']') |

   179  *       property</code></td>

   180  *     </tr>

   181  *     <tr align="top">

   182  *       <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>

   183  *       <td valign="top"><code>char | (char '-' char) | pattern-expr<br>

   184  *       </code></td>

   185  *     </tr>

   186  *     <tr align="top">

   187  *       <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>

   188  *       <td valign="top"><code>pattern | pattern-expr pattern |

   189  *       pattern-expr op pattern<br>

   190  *       </code></td>

   191  *     </tr>

   192  *     <tr align="top">

   193  *       <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>

   194  *       <td valign="top"><code>'&amp;' | '-'<br>

   195  *       </code></td>

   196  *     </tr>

   197  *     <tr align="top">

   198  *       <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>

   199  *       <td valign="top"><code>'[' | ']' | '-'<br>

   200  *       </code></td>

   201  *     </tr>

   202  *     <tr align="top">

   203  *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>

   204  *       <td valign="top"><em>any character that is not</em><code> special<br>

   205  *       | ('\' </code><em>any character</em><code>)<br>

   206  *       | ('\\u' hex hex hex hex)<br>

   207  *       </code></td>

   208  *     </tr>

   209  *     <tr align="top">

   210  *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>

   211  *       <td valign="top"><em>any character for which

   212  *       </em><code>Character.digit(c, 16)</code><em>

   213  *       returns a non-negative result</em></td>

   214  *     </tr>

   215  *     <tr>

   216  *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>

   217  *       <td valign="top"><em>a Unicode property set pattern</em></td>

   218  *     </tr>

   219  *   </table>

   220  *   <br>

   221  *   <table border="1">

   222  *     <tr>

   223  *       <td>Legend: <table>

   224  *         <tr>

   225  *           <td nowrap valign="top"><code>a := b</code></td>

   226  *           <td width="20" valign="top">&nbsp; </td>

   227  *           <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>

   228  *         </tr>

   229  *         <tr>

   230  *           <td nowrap valign="top"><code>a?</code></td>

   231  *           <td valign="top"></td>

   232  *           <td valign="top">zero or one instance of <code>a</code><br>

   233  *           </td>

   234  *         </tr>

   235  *         <tr>

   236  *           <td nowrap valign="top"><code>a*</code></td>

   237  *           <td valign="top"></td>

   238  *           <td valign="top">zero or more instances of <code>a</code><br>

   239  *           </td>

   240  *         </tr>

   241  *         <tr>

   242  *           <td nowrap valign="top"><code>a | b</code></td>

   243  *           <td valign="top"></td>

   244  *           <td valign="top">either <code>a</code> or <code>b</code><br>

   245  *           </td>

   246  *         </tr>

   247  *         <tr>

   248  *           <td nowrap valign="top"><code>'a'</code></td>

   249  *           <td valign="top"></td>

   250  *           <td valign="top">the literal string between the quotes </td>

   251  *         </tr>

   252  *       </table>

   253  *       </td>

   254  *     </tr>

   255  *   </table>

   256  * \htmlonly</blockquote>\endhtmlonly

   257  *

   258  * @author Alan Liu

   259  * @stable ICU 2.0

   260  */

   261 class U_COMMON_API UnicodeSet : public UnicodeFilter {

   263     int32_t len; // length of list used; 0 <= len <= capacity

   264     int32_t capacity; // capacity of list

   265     int32_t bufferCapacity; // capacity of buffer

   266     UChar32* list; // MUST be terminated with HIGH

   267     UChar32* buffer; // internal buffer, may be NULL

   269     UVector* strings; // maintained in sorted order

   271     /**

   272      * The pattern representation of this set.  This may not be the

   273      * most economical pattern.  It is the pattern supplied to

   274      * applyPattern(), with variables substituted and whitespace

   275      * removed.  For sets constructed without applyPattern(), or

   276      * modified using the non-pattern API, this string will be empty,

   277      * indicating that toPattern() must generate a pattern

   278      * representation from the inversion list.

   279      */

   280     UnicodeString pat;

   282 public:

   284     enum {

   285         /**

   286          * Minimum value that can be stored in a UnicodeSet.

   287          * @stable ICU 2.4

   288          */

   289         MIN_VALUE = 0,

   291         /**

   292          * Maximum value that can be stored in a UnicodeSet.

   293          * @stable ICU 2.4

   294          */

   295         MAX_VALUE = 0x10ffff

   296     };

   298     //----------------------------------------------------------------

   299     // Constructors &c

   300     //----------------------------------------------------------------

   302 public:

   304     /**

   305      * Constructs an empty set.

   306      * @stable ICU 2.0

   307      */

   308     UnicodeSet();

   310     /**

   311      * Constructs a set containing the given range. If <code>start >

   312      * end</code> then an empty set is created.

   313      *

   314      * @param start first character, inclusive, of range

   315      * @param end last character, inclusive, of range

   316      * @stable ICU 2.4

   317      */

   318     UnicodeSet(UChar32 start, UChar32 end);

   320     /**

   321      * Constructs a set from the given pattern.  See the class

   322      * description for the syntax of the pattern language.

   323      * @param pattern a string specifying what characters are in the set

   324      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern

   325      * contains a syntax error.

   326      * @stable ICU 2.0

   327      */

   328     UnicodeSet(const UnicodeString& pattern,

   329                UErrorCode& status);

   331     /**

   332      * Constructs a set from the given pattern.  See the class

   333      * description for the syntax of the pattern language.

   334      * @param pattern a string specifying what characters are in the set

   335      * @param options bitmask for options to apply to the pattern.

   336      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.

   337      * @param symbols a symbol table mapping variable names to values

   338      * and stand-in characters to UnicodeSets; may be NULL

   339      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern

   340      * contains a syntax error.

   341      * @internal

   342      */

   343     UnicodeSet(const UnicodeString& pattern,

   344                uint32_t options,

   345                const SymbolTable* symbols,

   346                UErrorCode& status);

   348     /**

   349      * Constructs a set from the given pattern.  See the class description

   350      * for the syntax of the pattern language.

   351      * @param pattern a string specifying what characters are in the set

   352      * @param pos on input, the position in pattern at which to start parsing.

   353      * On output, the position after the last character parsed.

   354      * @param options bitmask for options to apply to the pattern.

   355      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.

   356      * @param symbols a symbol table mapping variable names to values

   357      * and stand-in characters to UnicodeSets; may be NULL

   358      * @param status input-output error code

   359      * @stable ICU 2.8

   360      */

   361     UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,

   362                uint32_t options,

   363                const SymbolTable* symbols,

   364                UErrorCode& status);

   366 #ifdef U_USE_UNICODESET_DEPRECATES

   367     /**

   368      * Obsolete: Constructs a set from the given Unicode character category.

   369      * @param category an integer indicating the character category as

   370      * defined in uchar.h.

   371      * @obsolete ICU 2.6. Use a pattern with the category instead since this API will be removed in that release.

   372      */

   373     UnicodeSet(int8_t category, UErrorCode& status);

   374 #endif

   376     /**

   377      * Constructs a set that is identical to the given UnicodeSet.

   378      * @stable ICU 2.0

   379      */

   380     UnicodeSet(const UnicodeSet& o);

   382     /**

   383      * Destructs the set.

   384      * @stable ICU 2.0

   385      */

   386     virtual ~UnicodeSet();

   388     /**

   389      * Assigns this object to be a copy of another.

   390      * @stable ICU 2.0

   391      */

   392     UnicodeSet& operator=(const UnicodeSet& o);

   394     /**

   395      * Compares the specified object with this set for equality.  Returns

   396      * <tt>true</tt> if the two sets

   397      * have the same size, and every member of the specified set is

   398      * contained in this set (or equivalently, every member of this set is

   399      * contained in the specified set).

   400      *

   401      * @param o set to be compared for equality with this set.

   402      * @return <tt>true</tt> if the specified set is equal to this set.

   403      * @stable ICU 2.0

   404      */

   405     virtual UBool operator==(const UnicodeSet& o) const;

   407     /**

   408      * Compares the specified object with this set for inequality.  Returns

   409      * <tt>true</tt> if the specified set is not equal to this set.

   410      * @stable ICU 2.0

   411      */

   412     UBool operator!=(const UnicodeSet& o) const;

   414     /**

   415      * Returns a copy of this object.  All UnicodeFunctor objects have

   416      * to support cloning in order to allow classes using

   417      * UnicodeFunctors, such as Transliterator, to implement cloning.

   418      * @stable ICU 2.0

   419      */

   420     virtual UnicodeFunctor* clone() const;

   422     /**

   423      * Returns the hash code value for this set.

   424      *

   425      * @return the hash code value for this set.

   426      * @see Object#hashCode()

   427      * @stable ICU 2.0

   428      */

   429     virtual int32_t hashCode(void) const;

   431     //----------------------------------------------------------------

   432     // Public API

   433     //----------------------------------------------------------------

   435     /**

   436      * Make this object represent the range <code>start - end</code>.

   437      * If <code>start > end</code> then this object is set to an

   438      * empty range.

   439      *

   440      * @param start first character in the set, inclusive

   441      * @param end last character in the set, inclusive

   442      * @stable ICU 2.4

   443      */

   444     UnicodeSet& set(UChar32 start, UChar32 end);

   446     /**

   447      * Return true if the given position, in the given pattern, appears

   448      * to be the start of a UnicodeSet pattern.

   449      * @stable ICU 2.4

   450      */

   451     static UBool resemblesPattern(const UnicodeString& pattern,

   452                                   int32_t pos);

   454     /**

   455      * Modifies this set to represent the set specified by the given

   456      * pattern, optionally ignoring white space.  See the class

   457      * description for the syntax of the pattern language.

   458      * @param pattern a string specifying what characters are in the set

   459      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern

   460      * contains a syntax error.

   461      * <em> Empties the set passed before applying the pattern.</em>

   462      * @return a reference to this

   463      * @stable ICU 2.0

   464      */

   465     UnicodeSet& applyPattern(const UnicodeString& pattern,

   466                              UErrorCode& status);

   468     /**

   469      * Modifies this set to represent the set specified by the given

   470      * pattern, optionally ignoring white space.  See the class

   471      * description for the syntax of the pattern language.

   472      * @param pattern a string specifying what characters are in the set

   473      * @param options bitmask for options to apply to the pattern.

   474      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.

   475      * @param symbols a symbol table mapping variable names to

   476      * values and stand-ins to UnicodeSets; may be NULL

   477      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern

   478      * contains a syntax error.

   479      *<em> Empties the set passed before applying the pattern.</em>

   480      * @return a reference to this

   481      * @internal

   482      */

   483     UnicodeSet& applyPattern(const UnicodeString& pattern,

   484                              uint32_t options,

   485                              const SymbolTable* symbols,

   486                              UErrorCode& status);

   488     /**

   489      * Parses the given pattern, starting at the given position.  The

   490      * character at pattern.charAt(pos.getIndex()) must be '[', or the

   491      * parse fails.  Parsing continues until the corresponding closing

   492      * ']'.  If a syntax error is encountered between the opening and

   493      * closing bracket, the parse fails.  Upon return from a successful

   494      * parse, the ParsePosition is updated to point to the character

   495      * following the closing ']', and a reference to this set is

   496      * returned.  This method calls

   497      * itself recursively to parse embedded subpatterns.

   498      *<em> Empties the set passed before applying the pattern.</em>

   499      *

   500      * @param pattern the string containing the pattern to be parsed.

   501      * The portion of the string from pos.getIndex(), which must be a

   502      * '[', to the corresponding closing ']', is parsed.

   503      * @param pos upon entry, the position at which to begin parsing.

   504      * The character at pattern.charAt(pos.getIndex()) must be a '['.

   505      * Upon return from a successful parse, pos.getIndex() is either

   506      * the character after the closing ']' of the parsed pattern, or

   507      * pattern.length() if the closing ']' is the last character of

   508      * the pattern string.

   509      * @param options bitmask for options to apply to the pattern.

   510      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.

   511      * @param symbols a symbol table mapping variable names to

   512      * values and stand-ins to UnicodeSets; may be NULL

   513      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern

   514      * contains a syntax error.

   515      * @return a reference to this

   516      * @stable ICU 2.8

   517      */

   518     UnicodeSet& applyPattern(const UnicodeString& pattern,

   519                              ParsePosition& pos,

   520                              uint32_t options,

   521                              const SymbolTable* symbols,

   522                              UErrorCode& status);

   524     /**

   525      * Returns a string representation of this set.  If the result of

   526      * calling this function is passed to a UnicodeSet constructor, it

   527      * will produce another set that is equal to this one.

   528      * @param result the string to receive the rules.  Previous

   529      * contents will be deleted.

   530      * @param escapeUnprintable if TRUE then convert unprintable

   531      * character to their hex escape representations, \\uxxxx or

   532      * \\Uxxxxxxxx.  Unprintable characters are those other than

   533      * U+000A, U+0020..U+007E.

   534      * @stable ICU 2.0

   535      */

   536     virtual UnicodeString& toPattern(UnicodeString& result,

   537                              UBool escapeUnprintable = FALSE) const;

   539     /**

   540      * Modifies this set to contain those code points which have the given value

   541      * for the given binary or enumerated property, as returned by

   542      * u_getIntPropertyValue.  Prior contents of this set are lost.

   543      *

   544      * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1

   545      * or UCHAR_INT_START..UCHAR_INT_LIMIT-1

   546      * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.

   547      *

   548      * @param value a value in the range u_getIntPropertyMinValue(prop)..

   549      * u_getIntPropertyMaxValue(prop), with one exception.  If prop is

   550      * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but

   551      * rather a mask value produced by U_GET_GC_MASK().  This allows grouped

   552      * categories such as [:L:] to be represented.

   553      *

   554      * @param ec error code input/output parameter

   555      *

   556      * @return a reference to this set

   557      *

   558      * @stable ICU 2.4

   559      */

   560     UnicodeSet& applyIntPropertyValue(UProperty prop,

   561                                       int32_t value,

   562                                       UErrorCode& ec);

   564     /**

   565      * Modifies this set to contain those code points which have the

   566      * given value for the given property.  Prior contents of this

   567      * set are lost.

   568      *

   569      * @param prop a property alias, either short or long.  The name is matched

   570      * loosely.  See PropertyAliases.txt for names and a description of loose

   571      * matching.  If the value string is empty, then this string is interpreted

   572      * as either a General_Category value alias, a Script value alias, a binary

   573      * property alias, or a special ID.  Special IDs are matched loosely and

   574      * correspond to the following sets:

   575      *

   576      * "ANY" = [\\u0000-\\U0010FFFF],

   577      * "ASCII" = [\\u0000-\\u007F],

   578      * "Assigned" = [:^Cn:].

   579      *

   580      * @param value a value alias, either short or long.  The name is matched

   581      * loosely.  See PropertyValueAliases.txt for names and a description of

   582      * loose matching.  In addition to aliases listed, numeric values and

   583      * canonical combining classes may be expressed numerically, e.g., ("nv",

   584      * "0.5") or ("ccc", "220").  The value string may also be empty.

   585      *

   586      * @param ec error code input/output parameter

   587      *

   588      * @return a reference to this set

   589      *

   590      * @stable ICU 2.4

   591      */

   592     UnicodeSet& applyPropertyAlias(const UnicodeString& prop,

   593                                    const UnicodeString& value,

   594                                    UErrorCode& ec);

   596     /**

   597      * Returns the number of elements in this set (its cardinality).

   598      * Note that the elements of a set may include both individual

   599      * codepoints and strings.

   600      *

   601      * @return the number of elements in this set (its cardinality).

   602      * @stable ICU 2.0

   603      */

   604     virtual int32_t size(void) const;

   606     /**

   607      * Returns <tt>true</tt> if this set contains no elements.

   608      *

   609      * @return <tt>true</tt> if this set contains no elements.

   610      * @stable ICU 2.0

   611      */

   612     virtual UBool isEmpty(void) const;

   614     /**

   615      * Returns true if this set contains the given character.

   616      * @param c character to be checked for containment

   617      * @return true if the test condition is met

   618      * @stable ICU 2.0

   619      */

   620     virtual UBool contains(UChar32 c) const;

   622     /**

   623      * Returns true if this set contains every character

   624      * of the given range.

   625      * @param start first character, inclusive, of the range

   626      * @param end last character, inclusive, of the range

   627      * @return true if the test condition is met

   628      * @stable ICU 2.0

   629      */

   630     virtual UBool contains(UChar32 start, UChar32 end) const;

   632     /**

   633      * Returns <tt>true</tt> if this set contains the given

   634      * multicharacter string.

   635      * @param s string to be checked for containment

   636      * @return <tt>true</tt> if this set contains the specified string

   637      * @stable ICU 2.4

   638      */

   639     UBool contains(const UnicodeString& s) const;

   641     /**

   642      * Returns true if this set contains all the characters and strings

   643      * of the given set.

   644      * @param c set to be checked for containment

   645      * @return true if the test condition is met

   646      * @stable ICU 2.4

   647      */

   648     virtual UBool containsAll(const UnicodeSet& c) const;

   650     /**

   651      * Returns true if this set contains all the characters

   652      * of the given string.

   653      * @param s string containing characters to be checked for containment

   654      * @return true if the test condition is met

   655      * @stable ICU 2.4

   656      */

   657     UBool containsAll(const UnicodeString& s) const;

   659     /**

   660      * Returns true if this set contains none of the characters

   661      * of the given range.

   662      * @param start first character, inclusive, of the range

   663      * @param end last character, inclusive, of the range

   664      * @return true if the test condition is met

   665      * @stable ICU 2.4

   666      */

   667     UBool containsNone(UChar32 start, UChar32 end) const;

   669     /**

   670      * Returns true if this set contains none of the characters and strings

   671      * of the given set.

   672      * @param c set to be checked for containment

   673      * @return true if the test condition is met

   674      * @stable ICU 2.4

   675      */

   676     UBool containsNone(const UnicodeSet& c) const;

   678     /**

   679      * Returns true if this set contains none of the characters

   680      * of the given string.

   681      * @param s string containing characters to be checked for containment

   682      * @return true if the test condition is met

   683      * @stable ICU 2.4

   684      */

   685     UBool containsNone(const UnicodeString& s) const;

   687     /**

   688      * Returns true if this set contains one or more of the characters

   689      * in the given range.

   690      * @param start first character, inclusive, of the range

   691      * @param end last character, inclusive, of the range

   692      * @return true if the condition is met

   693      * @stable ICU 2.4

   694      */

   695     inline UBool containsSome(UChar32 start, UChar32 end) const;

   697     /**

   698      * Returns true if this set contains one or more of the characters

   699      * and strings of the given set.

   700      * @param s The set to be checked for containment

   701      * @return true if the condition is met

   702      * @stable ICU 2.4

   703      */

   704     inline UBool containsSome(const UnicodeSet& s) const;

   706     /**

   707      * Returns true if this set contains one or more of the characters

   708      * of the given string.

   709      * @param s string containing characters to be checked for containment

   710      * @return true if the condition is met

   711      * @stable ICU 2.4

   712      */

   713     inline UBool containsSome(const UnicodeString& s) const;

   715     /**

   716      * Implement UnicodeMatcher::matches()

   717      * @stable ICU 2.4

   718      */

   719     virtual UMatchDegree matches(const Replaceable& text,

   720                          int32_t& offset,

   721                          int32_t limit,

   722                          UBool incremental);

   724 private:

   725     /**

   726      * Returns the longest match for s in text at the given position.

   727      * If limit > start then match forward from start+1 to limit

   728      * matching all characters except s.charAt(0).  If limit < start,

   729      * go backward starting from start-1 matching all characters

   730      * except s.charAt(s.length()-1).  This method assumes that the

   731      * first character, text.charAt(start), matches s, so it does not

   732      * check it.

   733      * @param text the text to match

   734      * @param start the first character to match.  In the forward

   735      * direction, text.charAt(start) is matched against s.charAt(0).

   736      * In the reverse direction, it is matched against

   737      * s.charAt(s.length()-1).

   738      * @param limit the limit offset for matching, either last+1 in

   739      * the forward direction, or last-1 in the reverse direction,

   740      * where last is the index of the last character to match.

   741      * @return If part of s matches up to the limit, return |limit -

   742      * start|.  If all of s matches before reaching the limit, return

   743      * s.length().  If there is a mismatch between s and text, return

   744      * 0

   745      */

   746     static int32_t matchRest(const Replaceable& text,

   747                              int32_t start, int32_t limit,

   748                              const UnicodeString& s);

   750     /**

   751      * Returns the smallest value i such that c < list[i].  Caller

   752      * must ensure that c is a legal value or this method will enter

   753      * an infinite loop.  This method performs a binary search.

   754      * @param c a character in the range MIN_VALUE..MAX_VALUE

   755      * inclusive

   756      * @return the smallest integer i in the range 0..len-1,

   757      * inclusive, such that c < list[i]

   758      */

   759     int32_t findCodePoint(UChar32 c) const;

   761 public:

   763     /**

   764      * Implementation of UnicodeMatcher API.  Union the set of all

   765      * characters that may be matched by this object into the given

   766      * set.

   767      * @param toUnionTo the set into which to union the source characters

   768      * @stable ICU 2.4

   769      */

   770     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;

   772     /**

   773      * Returns the index of the given character within this set, where

   774      * the set is ordered by ascending code point.  If the character

   775      * is not in this set, return -1.  The inverse of this method is

   776      * <code>charAt()</code>.

   777      * @return an index from 0..size()-1, or -1

   778      * @stable ICU 2.4

   779      */

   780     int32_t indexOf(UChar32 c) const;

   782     /**

   783      * Returns the character at the given index within this set, where

   784      * the set is ordered by ascending code point.  If the index is

   785      * out of range, return (UChar32)-1.  The inverse of this method is

   786      * <code>indexOf()</code>.

   787      * @param index an index from 0..size()-1

   788      * @return the character at the given index, or (UChar32)-1.

   789      * @stable ICU 2.4

   790      */

   791     UChar32 charAt(int32_t index) const;

   793     /**

   794      * Adds the specified range to this set if it is not already

   795      * present.  If this set already contains the specified range,

   796      * the call leaves this set unchanged.  If <code>start > end</code>

   797      * then an empty range is added, leaving the set unchanged.

   798      * This is equivalent to a boolean logic OR, or a set UNION.

   799      *

   800      * @param start first character, inclusive, of range to be added

   801      * to this set.

   802      * @param end last character, inclusive, of range to be added

   803      * to this set.

   804      * @stable ICU 2.0

   805      */

   806     virtual UnicodeSet& add(UChar32 start, UChar32 end);

   808     /**

   809      * Adds the specified character to this set if it is not already

   810      * present.  If this set already contains the specified character,

   811      * the call leaves this set unchanged.

   812      * @stable ICU 2.0

   813      */

   814     UnicodeSet& add(UChar32 c);

   816     /**

   817      * Adds the specified multicharacter string to this set if it is not already

   818      * present.  If this set already contains the multicharacter string,

   819      * the call leaves this set unchanged.

   820      * Thus "ch" => {"ch"}

   821      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>

   822      * @param s the source string

   823      * @return this object, for chaining

   824      * @stable ICU 2.4

   825      */

   826     UnicodeSet& add(const UnicodeString& s);

   828  private:

   829     /**

   830      * @return a code point IF the string consists of a single one.

   831      * otherwise returns -1.

   832      * @param s the string to test

   833      */

   834     static int32_t getSingleCP(const UnicodeString& s);

   836     void _add(const UnicodeString& s);

   838  public:

   839     /**

   840      * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}

   841      * If this set already contains any particular character, it has no effect on that character.

   842      * @param s the source string

   843      * @return this object, for chaining

   844      * @stable ICU 2.4

   845      */

   846     UnicodeSet& addAll(const UnicodeString& s);

   848     /**

   849      * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}

   850      * If this set already contains any particular character, it has no effect on that character.

   851      * @param s the source string

   852      * @return this object, for chaining

   853      * @stable ICU 2.4

   854      */

   855     UnicodeSet& retainAll(const UnicodeString& s);

   857     /**

   858      * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}

   859      * Each character is removed if this set already contains it, and added otherwise.

   860      * @param s the source string

   861      * @return this object, for chaining

   862      * @stable ICU 2.4

   863      */

   864     UnicodeSet& complementAll(const UnicodeString& s);

   866     /**

   867      * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}

   868      * If this set does not contain a particular character, it has no effect on that character.

   869      * @param s the source string

   870      * @return this object, for chaining

   871      * @stable ICU 2.4

   872      */

   873     UnicodeSet& removeAll(const UnicodeString& s);

   875     /**

   876      * Makes a set from a multicharacter string. Thus "ch" => {"ch"}

   877      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>

   878      * @param s the source string

   879      * @return a newly created set containing the given string.

   880      * The caller owns the return object and is responsible for deleting it.

   881      * @stable ICU 2.4

   882      */

   883     static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);

   886     /**

   887      * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}

   888      * @param s the source string

   889      * @return a newly created set containing the given characters

   890      * The caller owns the return object and is responsible for deleting it.

   891      * @stable ICU 2.4

   892      */

   893     static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);

   895     /**

   896      * Retain only the elements in this set that are contained in the

   897      * specified range.  If <code>start > end</code> then an empty range is

   898      * retained, leaving the set empty.  This is equivalent to

   899      * a boolean logic AND, or a set INTERSECTION.

   900      *

   901      * @param start first character, inclusive, of range to be retained

   902      * to this set.

   903      * @param end last character, inclusive, of range to be retained

   904      * to this set.

   905      * @stable ICU 2.0

   906      */

   907     virtual UnicodeSet& retain(UChar32 start, UChar32 end);

   910     /**

   911      * Retain the specified character from this set if it is present.

   912      * @stable ICU 2.0

   913      */

   914     UnicodeSet& retain(UChar32 c);

   916     /**

   917      * Removes the specified range from this set if it is present.

   918      * The set will not contain the specified range once the call

   919      * returns.  If <code>start > end</code> then an empty range is

   920      * removed, leaving the set unchanged.

   921      *

   922      * @param start first character, inclusive, of range to be removed

   923      * from this set.

   924      * @param end last character, inclusive, of range to be removed

   925      * from this set.

   926      * @stable ICU 2.0

   927      */

   928     virtual UnicodeSet& remove(UChar32 start, UChar32 end);

   930     /**

   931      * Removes the specified character from this set if it is present.

   932      * The set will not contain the specified character once the call

   933      * returns.

   934      * @stable ICU 2.0

   935      */

   936     UnicodeSet& remove(UChar32 c);

   938     /**

   939      * Removes the specified string from this set if it is present.

   940      * The set will not contain the specified string once the call

   941      * returns.

   942      * @param s the source string

   943      * @return this object, for chaining

   944      * @stable ICU 2.4

   945      */

   946     UnicodeSet& remove(const UnicodeString& s);

   948     /**

   949      * Inverts this set.  This operation modifies this set so that

   950      * its value is its complement.  This is equivalent to

   951      * <code>complement(MIN_VALUE, MAX_VALUE)</code>.

   952      * @stable ICU 2.0

   953      */

   954     virtual UnicodeSet& complement(void);

   956     /**

   957      * Complements the specified range in this set.  Any character in

   958      * the range will be removed if it is in this set, or will be

   959      * added if it is not in this set.  If <code>start > end</code>

   960      * then an empty range is complemented, leaving the set unchanged.

   961      * This is equivalent to a boolean logic XOR.

   962      *

   963      * @param start first character, inclusive, of range to be complemented

   964      * in this set.

   965      * @param end last character, inclusive, of range to be complemented

   966      * in this set.

   967      * @stable ICU 2.0

   968      */

   969     virtual UnicodeSet& complement(UChar32 start, UChar32 end);

   971     /**

   972      * Complements the specified character in this set.  The character

   973      * will be removed if it is in this set, or will be added if it is

   974      * not in this set.

   975      * @stable ICU 2.0

   976      */

   977     UnicodeSet& complement(UChar32 c);

   979     /**

   980      * Complement the specified string in this set.

   981      * The string will be removed if it is in this set, or will be added

   982      * if it is not in this set.

   983      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>

   984      * @param s the string to complement

   985      * @return this object, for chaining

   986      * @stable ICU 2.4

   987      */

   988     UnicodeSet& complement(const UnicodeString& s);

   990     /**

   991      * Adds all of the elements in the specified set to this set if

   992      * they're not already present.  This operation effectively

   993      * modifies this set so that its value is the <i>union</i> of the two

   994      * sets.  The behavior of this operation is unspecified if the specified

   995      * collection is modified while the operation is in progress.

   996      *

   997      * @param c set whose elements are to be added to this set.

   998      * @see #add(UChar32, UChar32)

   999      * @stable ICU 2.0

  1000      */

  1001     virtual UnicodeSet& addAll(const UnicodeSet& c);

  1003     /**

  1004      * Retains only the elements in this set that are contained in the

  1005      * specified set.  In other words, removes from this set all of

  1006      * its elements that are not contained in the specified set.  This

  1007      * operation effectively modifies this set so that its value is

  1008      * the <i>intersection</i> of the two sets.

  1009      *

  1010      * @param c set that defines which elements this set will retain.

  1011      * @stable ICU 2.0

  1012      */

  1013     virtual UnicodeSet& retainAll(const UnicodeSet& c);

  1015     /**

  1016      * Removes from this set all of its elements that are contained in the

  1017      * specified set.  This operation effectively modifies this

  1018      * set so that its value is the <i>asymmetric set difference</i> of

  1019      * the two sets.

  1020      *

  1021      * @param c set that defines which elements will be removed from

  1022      *          this set.

  1023      * @stable ICU 2.0

  1024      */

  1025     virtual UnicodeSet& removeAll(const UnicodeSet& c);

  1027     /**

  1028      * Complements in this set all elements contained in the specified

  1029      * set.  Any character in the other set will be removed if it is

  1030      * in this set, or will be added if it is not in this set.

  1031      *

  1032      * @param c set that defines which elements will be xor'ed from

  1033      *          this set.

  1034      * @stable ICU 2.4

  1035      */

  1036     virtual UnicodeSet& complementAll(const UnicodeSet& c);

  1038     /**

  1039      * Removes all of the elements from this set.  This set will be

  1040      * empty after this call returns.

  1041      * @stable ICU 2.0

  1042      */

  1043     virtual UnicodeSet& clear(void);

  1045     /**

  1046      * Close this set over the given attribute.  For the attribute

  1047      * USET_CASE, the result is to modify this set so that:

  1048      *

  1049      * 1. For each character or string 'a' in this set, all strings or

  1050      * characters 'b' such that foldCase(a) == foldCase(b) are added

  1051      * to this set.

  1052      *

  1053      * 2. For each string 'e' in the resulting set, if e !=

  1054      * foldCase(e), 'e' will be removed.

  1055      *

  1056      * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]

  1057      *

  1058      * (Here foldCase(x) refers to the operation u_strFoldCase, and a

  1059      * == b denotes that the contents are the same, not pointer

  1060      * comparison.)

  1061      *

  1062      * @param attribute bitmask for attributes to close over.

  1063      * Currently only the USET_CASE bit is supported.  Any undefined bits

  1064      * are ignored.

  1065      * @return a reference to this set.

  1066      * @internal

  1067      */

  1068     UnicodeSet& closeOver(int32_t attribute);

  1070     /**

  1071      * Iteration method that returns the number of ranges contained in

  1072      * this set.

  1073      * @see #getRangeStart

  1074      * @see #getRangeEnd

  1075      * @stable ICU 2.4

  1076      */

  1077     virtual int32_t getRangeCount(void) const;

  1079     /**

  1080      * Iteration method that returns the first character in the

  1081      * specified range of this set.

  1082      * @see #getRangeCount

  1083      * @see #getRangeEnd

  1084      * @stable ICU 2.4

  1085      */

  1086     virtual UChar32 getRangeStart(int32_t index) const;

  1088     /**

  1089      * Iteration method that returns the last character in the

  1090      * specified range of this set.

  1091      * @see #getRangeStart

  1092      * @see #getRangeCount

  1093      * @stable ICU 2.4

  1094      */

  1095     virtual UChar32 getRangeEnd(int32_t index) const;

  1097     /**

  1098      * Serializes this set into an array of 16-bit integers.  Serialization

  1099      * (currently) only records the characters in the set; multicharacter

  1100      * strings are ignored.

  1101      *

  1102      * The array has the following format (each line is one 16-bit

  1103      * integer):

  1104      *

  1105      *  length     = (n+2*m) | (m!=0?0x8000:0)

  1106      *  bmpLength  = n; present if m!=0

  1107      *  bmp[0]

  1108      *  bmp[1]

  1109      *  ...

  1110      *  bmp[n-1]

  1111      *  supp-high[0]

  1112      *  supp-low[0]

  1113      *  supp-high[1]

  1114      *  supp-low[1]

  1115      *  ...

  1116      *  supp-high[m-1]

  1117      *  supp-low[m-1]

  1118      *

  1119      * The array starts with a header.  After the header are n bmp

  1120      * code points, then m supplementary code points.  Either n or m

  1121      * or both may be zero.  n+2*m is always <= 0x7FFF.

  1122      *

  1123      * If there are no supplementary characters (if m==0) then the

  1124      * header is one 16-bit integer, 'length', with value n.

  1125      *

  1126      * If there are supplementary characters (if m!=0) then the header

  1127      * is two 16-bit integers.  The first, 'length', has value

  1128      * (n+2*m)|0x8000.  The second, 'bmpLength', has value n.

  1129      *

  1130      * After the header the code points are stored in ascending order.

  1131      * Supplementary code points are stored as most significant 16

  1132      * bits followed by least significant 16 bits.

  1133      *

  1134      * @param dest pointer to buffer of destCapacity 16-bit integers.

  1135      * May be NULL only if destCapacity is zero.

  1136      * @param destCapacity size of dest, or zero.  Must not be negative.

  1137      * @param ec error code.  Will be set to U_INDEX_OUTOFBOUNDS_ERROR

  1138      * if n+2*m > 0x7FFF.  Will be set to U_BUFFER_OVERFLOW_ERROR if

  1139      * n+2*m+(m!=0?2:1) > destCapacity.

  1140      * @return the total length of the serialized format, including

  1141      * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other

  1142      * than U_BUFFER_OVERFLOW_ERROR.

  1143      * @stable ICU 2.4

  1144      */

  1145     int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;

  1147     /**

  1148      * Reallocate this object's internal structures to take up the least

  1149      * possible space, without changing this object's value.

  1150      * @stable ICU 2.4

  1151      */

  1152     virtual UnicodeSet& compact();

  1154     /**

  1155      * Return the class ID for this class.  This is useful only for

  1156      * comparing to a return value from getDynamicClassID().  For example:

  1157      * <pre>

  1158      * .      Base* polymorphic_pointer = createPolymorphicObject();

  1159      * .      if (polymorphic_pointer->getDynamicClassID() ==

  1160      * .          Derived::getStaticClassID()) ...

  1161      * </pre>

  1162      * @return          The class ID for all objects of this class.

  1163      * @stable ICU 2.0

  1164      */

  1165     static UClassID U_EXPORT2 getStaticClassID(void);

  1167     /**

  1168      * Implement UnicodeFunctor API.

  1169      *

  1170      * @return The class ID for this object. All objects of a given

  1171      * class have the same class ID.  Objects of other classes have

  1172      * different class IDs.

  1173      * @stable ICU 2.4

  1174      */

  1175     virtual UClassID getDynamicClassID(void) const;

  1177 private:

  1179     // Private API for the USet API

  1181     friend class USetAccess;

  1183     int32_t getStringCount() const;

  1185     const UnicodeString* getString(int32_t index) const;

  1187     //----------------------------------------------------------------

  1188     // RuleBasedTransliterator support

  1189     //----------------------------------------------------------------

  1191 private:

  1193     /**

  1194      * Returns <tt>true</tt> if this set contains any character whose low byte

  1195      * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for

  1196      * indexing.

  1197      */

  1198     virtual UBool matchesIndexValue(uint8_t v) const;

  1200 private:

  1202     //----------------------------------------------------------------

  1203     // Implementation: Pattern parsing

  1204     //----------------------------------------------------------------

  1206     void applyPattern(RuleCharacterIterator& chars,

  1207                       const SymbolTable* symbols,

  1208                       UnicodeString& rebuiltPat,

  1209                       uint32_t options,

  1210                       UErrorCode& ec);

  1212     //----------------------------------------------------------------

  1213     // Implementation: Utility methods

  1214     //----------------------------------------------------------------

  1216     void ensureCapacity(int32_t newLen);

  1218     void ensureBufferCapacity(int32_t newLen);

  1220     void swapBuffers(void);

  1222     UBool allocateStrings();

  1224     UnicodeString& _toPattern(UnicodeString& result,

  1225                               UBool escapeUnprintable) const;

  1227     UnicodeString& _generatePattern(UnicodeString& result,

  1228                                     UBool escapeUnprintable) const;

  1230     static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);

  1232     static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);

  1234     //----------------------------------------------------------------

  1235     // Implementation: Fundamental operators

  1236     //----------------------------------------------------------------

  1238     void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);

  1240     void add(const UChar32* other, int32_t otherLen, int8_t polarity);

  1242     void retain(const UChar32* other, int32_t otherLen, int8_t polarity);

  1244     /**

  1245      * Return true if the given position, in the given pattern, appears

  1246      * to be the start of a property set pattern [:foo:], \\p{foo}, or

  1247      * \\P{foo}, or \\N{name}.

  1248      */

  1249     static UBool resemblesPropertyPattern(const UnicodeString& pattern,

  1250                                           int32_t pos);

  1252     static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,

  1253                                           int32_t iterOpts);

  1255     /**

  1256      * Parse the given property pattern at the given parse position

  1257      * and set this UnicodeSet to the result.

  1258      *

  1259      * The original design document is out of date, but still useful.

  1260      * Ignore the property and value names:

  1261      * http://dev.icu-project.org/cgi-bin/viewcvs.cgi/~checkout~/icuhtml/design/unicodeset_properties.html

  1262      *

  1263      * Recognized syntax:

  1264      *

  1265      * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"

  1266      * \\p{foo} \\P{foo}  - white space not allowed within "\\p" or "\\P"

  1267      * \\N{name}         - white space not allowed within "\\N"

  1268      *

  1269      * Other than the above restrictions, white space is ignored.  Case

  1270      * is ignored except in "\\p" and "\\P" and "\\N".  In 'name' leading

  1271      * and trailing space is deleted, and internal runs of whitespace

  1272      * are collapsed to a single space.

  1273      *

  1274      * We support binary properties, enumerated properties, and the

  1275      * following non-enumerated properties:

  1276      *

  1277      *  Numeric_Value

  1278      *  Name

  1279      *  Unicode_1_Name

  1280      *

  1281      * @param pattern the pattern string

  1282      * @param ppos on entry, the position at which to begin parsing.

  1283      * This should be one of the locations marked '^':

  1284      *

  1285      *   [:blah:]     \\p{blah}     \\P{blah}     \\N{name}

  1286      *   ^       %    ^       %    ^       %    ^       %

  1287      *

  1288      * On return, the position after the last character parsed, that is,

  1289      * the locations marked '%'.  If the parse fails, ppos is returned

  1290      * unchanged.

  1291      * @return a reference to this.

  1292      */

  1293     UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,

  1294                                      ParsePosition& ppos,

  1295                                      UErrorCode &ec);

  1297     void applyPropertyPattern(RuleCharacterIterator& chars,

  1298                               UnicodeString& rebuiltPat,

  1299                               UErrorCode& ec);

  1301     /**

  1302      * A filter that returns TRUE if the given code point should be

  1303      * included in the UnicodeSet being constructed.

  1304      */

  1305     typedef UBool (*Filter)(UChar32 codePoint, void* context);

  1307     /**

  1308      * Given a filter, set this UnicodeSet to the code points

  1309      * contained by that filter.  The filter MUST be

  1310      * property-conformant.  That is, if it returns value v for one

  1311      * code point, then it must return v for all affiliated code

  1312      * points, as defined by the inclusions list.  See

  1313      * getInclusions().

  1314      * src is a UPropertySource value.

  1315      */

  1316     void applyFilter(Filter filter,

  1317                      void* context,

  1318                      int32_t src,

  1319                      UErrorCode &status);

  1321     /**

  1322      * Return a cached copy of the inclusions list for the property source.

  1323      */

  1324     static const UnicodeSet* getInclusions(int32_t src, UErrorCode &errorCode);

  1326     friend class UnicodeSetIterator;

  1327 };

  1329 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {  // inequality is the exact negation of operator==

  1330     return !operator==(o);  // call operator== on this object with o and invert the result

  1331 }

  1333 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {  // range overload: "some" is implemented as "not none"

  1334     return !containsNone(start, end);  // invert the containsNone(start, end) result for the same range

  1335 }

  1337 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {  // set overload: "some" is implemented as "not none"

  1338     return !containsNone(s);  // invert the result of containsNone for the same set argument

  1339 }

  1341 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {  // string overload: "some" is implemented as "not none"

  1342     return !containsNone(s);  // invert the result of containsNone for the same string argument

  1343 }

  1345 U_NAMESPACE_END

  1347 #endif

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--