Symaptic: os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/uniset.h@260cb5ec6c19 (annotated)

sl@0	1	/*
sl@0	2	***************************************************************************
sl@0	3	* Copyright (C) 1999-2005, International Business Machines Corporation
sl@0	4	* and others. All Rights Reserved.
sl@0	5	***************************************************************************
sl@0	6	* Date Name Description
sl@0	7	* 10/20/99 alan Creation.
sl@0	8	***************************************************************************
sl@0	9	*/
sl@0	10
sl@0	11	#ifndef UNICODESET_H
sl@0	12	#define UNICODESET_H
sl@0	13
sl@0	14	#include "unicode/unifilt.h"
sl@0	15	#include "unicode/unistr.h"
sl@0	16	#include "unicode/uset.h"
sl@0	17
sl@0	18	/**
sl@0	19	* \file
sl@0	20	* \brief C++ API: Unicode Set
sl@0	21	*/
sl@0	22
sl@0	23	U_NAMESPACE_BEGIN
sl@0	24
sl@0	25	class ParsePosition;
sl@0	26	class SymbolTable;
sl@0	27	class UVector;
sl@0	28	class RuleCharacterIterator;
sl@0	29
sl@0	30	/**
sl@0	31	* A mutable set of Unicode characters and multicharacter strings. Objects of this class
sl@0	32	* represent <em>character classes</em> used in regular expressions.
sl@0	33	* A character class specifies a subset of Unicode code points. Legal
sl@0	34	* code points are U+0000 to U+10FFFF, inclusive.
sl@0	35	*
sl@0	36	* <p>The UnicodeSet class is not designed to be subclassed.
sl@0	37	*
sl@0	38	* <p><code>UnicodeSet</code> supports two APIs. The first is the
sl@0	39	* <em>operand</em> API that allows the caller to modify the value of
sl@0	40	* a <code>UnicodeSet</code> object. It conforms to Java 2's
sl@0	41	* <code>java.util.Set</code> interface, although
sl@0	42	* <code>UnicodeSet</code> does not actually implement that
sl@0	43	* interface. All methods of <code>Set</code> are supported, with the
sl@0	44	* modification that they take a character range or single character
sl@0	45	* instead of an <code>Object</code>, and they take a
sl@0	46	* <code>UnicodeSet</code> instead of a <code>Collection</code>. The
sl@0	47	* operand API may be thought of in terms of boolean logic: a boolean
sl@0	48	* OR is implemented by <code>add</code>, a boolean AND is implemented
sl@0	49	* by <code>retain</code>, a boolean XOR is implemented by
sl@0	50	* <code>complement</code> taking an argument, and a boolean NOT is
sl@0	51	* implemented by <code>complement</code> with no argument. In terms
sl@0	52	* of traditional set theory function names, <code>add</code> is a
sl@0	53	* union, <code>retain</code> is an intersection, <code>remove</code>
sl@0	54	* is an asymmetric difference, and <code>complement</code> with no
sl@0	55	* argument is a set complement with respect to the superset range
sl@0	56	* <code>MIN_VALUE-MAX_VALUE</code>
sl@0	57	*
sl@0	58	* <p>The second API is the
sl@0	59	* <code>applyPattern()</code>/<code>toPattern()</code> API from the
sl@0	60	* <code>java.text.Format</code>-derived classes. Unlike the
sl@0	61	* methods that add characters, add categories, and control the logic
sl@0	62	* of the set, the method <code>applyPattern()</code> sets all
sl@0	63	* attributes of a <code>UnicodeSet</code> at once, based on a
sl@0	64	* string pattern.
sl@0	65	*
sl@0	66	* <p><b>Pattern syntax</b></p>
sl@0	67	*
sl@0	68	* Patterns are accepted by the constructors and the
sl@0	69	* <code>applyPattern()</code> methods and returned by the
sl@0	70	* <code>toPattern()</code> method. These patterns follow a syntax
sl@0	71	* similar to that employed by version 8 regular expression character
sl@0	72	* classes. Here are some simple examples:
sl@0	73	*
sl@0	74	* \htmlonly<blockquote>\endhtmlonly
sl@0	75	* <table>
sl@0	76	* <tr align="top">
sl@0	77	* <td nowrap valign="top" align="left"><code>[]</code></td>
sl@0	78	* <td valign="top">No characters</td>
sl@0	79	* </tr><tr align="top">
sl@0	80	* <td nowrap valign="top" align="left"><code>[a]</code></td>
sl@0	81	* <td valign="top">The character 'a'</td>
sl@0	82	* </tr><tr align="top">
sl@0	83	* <td nowrap valign="top" align="left"><code>[ae]</code></td>
sl@0	84	* <td valign="top">The characters 'a' and 'e'</td>
sl@0	85	* </tr>
sl@0	86	* <tr>
sl@0	87	* <td nowrap valign="top" align="left"><code>[a-e]</code></td>
sl@0	88	* <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
sl@0	89	* point order</td>
sl@0	90	* </tr>
sl@0	91	* <tr>
sl@0	92	* <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
sl@0	93	* <td valign="top">The character U+4E01</td>
sl@0	94	* </tr>
sl@0	95	* <tr>
sl@0	96	* <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
sl@0	97	* <td valign="top">The character 'a' and the multicharacter strings "ab" and
sl@0	98	* "ac"</td>
sl@0	99	* </tr>
sl@0	100	* <tr>
sl@0	101	* <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
sl@0	102	* <td valign="top">All characters in the general category Uppercase Letter</td>
sl@0	103	* </tr>
sl@0	104	* </table>
sl@0	105	* \htmlonly</blockquote>\endhtmlonly
sl@0	106	*
sl@0	107	* Any character may be preceded by a backslash in order to remove any special
sl@0	108	* meaning. White space characters, as defined by UCharacter.isWhitespace(), are
sl@0	109	* ignored, unless they are escaped.
sl@0	110	*
sl@0	111	* <p>Property patterns specify a set of characters having a certain
sl@0	112	* property as defined by the Unicode standard. Both the POSIX-like
sl@0	113	* "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a
sl@0	114	* complete list of supported property patterns, see the User's Guide
sl@0	115	* for UnicodeSet at
sl@0	116	* <a href="http://icu.sourceforge.net/userguide/unicodeSet.html">
sl@0	117	* http://icu.sourceforge.net/userguide/unicodeSet.html</a>.
sl@0	118	* Actual determination of property data is defined by the underlying
sl@0	119	* Unicode database as implemented by UCharacter.
sl@0	120	*
sl@0	121	* <p>Patterns specify individual characters, ranges of characters, and
sl@0	122	* Unicode property sets. When elements are concatenated, they
sl@0	123	* specify their union. To complement a set, place a '^' immediately
sl@0	124	* after the opening '['. Property patterns are inverted by modifying
sl@0	125	* their delimiters; "[:^foo:]" and "\\P{foo}". In any other location,
sl@0	126	* '^' has no special meaning.
sl@0	127	*
sl@0	128	* <p>Ranges are indicated by placing a '-' between two
sl@0	129	* characters, as in "a-z". This specifies the range of all
sl@0	130	* characters from the left to the right, in Unicode order. If the
sl@0	131	* left character is greater than or equal to the
sl@0	132	* right character it is a syntax error. If a '-' occurs as the first
sl@0	133	* character after the opening '[' or '[^', or if it occurs as the
sl@0	134	* last character before the closing ']', then it is taken as a
sl@0	135	* literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
sl@0	136	* set of three characters, 'a', 'b', and '-'.
sl@0	137	*
sl@0	138	* <p>Sets may be intersected using the '&' operator or the asymmetric
sl@0	139	* set difference may be taken using the '-' operator, for example,
sl@0	140	* "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
sl@0	141	* with values less than 4096. Operators ('&' and '-') have equal
sl@0	142	* precedence and bind left-to-right. Thus
sl@0	143	* "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
sl@0	144	* "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
sl@0	145	* difference; intersection is commutative.
sl@0	146	*
sl@0	147	* <table>
sl@0	148	* <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
sl@0	149	* <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
sl@0	150	* through 'z' and all letters in between, in Unicode order
sl@0	151	* <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
sl@0	152	* all characters but 'a' through 'z',
sl@0	153	* that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
sl@0	154	* <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
sl@0	155	* <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
sl@0	156	* <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
sl@0	157	* <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
sl@0	158	* <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
sl@0	159	* <td>The asymmetric difference of sets specified by <em>pat1</em> and
sl@0	160	* <em>pat2</em>
sl@0	161	* <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
sl@0	162	* <td>The set of characters having the specified
sl@0	163	* Unicode property; in
sl@0	164	* this case, Unicode uppercase letters
sl@0	165	* <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
sl@0	166	* <td>The set of characters <em>not</em> having the given
sl@0	167	* Unicode property
sl@0	168	* </table>
sl@0	169	*
sl@0	170	* <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
sl@0	171	*
sl@0	172	* <p><b>Formal syntax</b></p>
sl@0	173	*
sl@0	174	* \htmlonly<blockquote>\endhtmlonly
sl@0	175	* <table>
sl@0	176	* <tr align="top">
sl@0	177	* <td nowrap valign="top" align="right"><code>pattern :=  </code></td>
sl@0	178	* <td valign="top"><code>('[' '^'? item* ']') \|
sl@0	179	* property</code></td>
sl@0	180	* </tr>
sl@0	181	* <tr align="top">
sl@0	182	* <td nowrap valign="top" align="right"><code>item :=  </code></td>
sl@0	183	* <td valign="top"><code>char \| (char '-' char) \| pattern-expr<br>
sl@0	184	* </code></td>
sl@0	185	* </tr>
sl@0	186	* <tr align="top">
sl@0	187	* <td nowrap valign="top" align="right"><code>pattern-expr :=  </code></td>
sl@0	188	* <td valign="top"><code>pattern \| pattern-expr pattern \|
sl@0	189	* pattern-expr op pattern<br>
sl@0	190	* </code></td>
sl@0	191	* </tr>
sl@0	192	* <tr align="top">
sl@0	193	* <td nowrap valign="top" align="right"><code>op :=  </code></td>
sl@0	194	* <td valign="top"><code>'&' \| '-'<br>
sl@0	195	* </code></td>
sl@0	196	* </tr>
sl@0	197	* <tr align="top">
sl@0	198	* <td nowrap valign="top" align="right"><code>special :=  </code></td>
sl@0	199	* <td valign="top"><code>'[' \| ']' \| '-'<br>
sl@0	200	* </code></td>
sl@0	201	* </tr>
sl@0	202	* <tr align="top">
sl@0	203	* <td nowrap valign="top" align="right"><code>char :=  </code></td>
sl@0	204	* <td valign="top"><em>any character that is not</em><code> special<br>
sl@0	205	* \| ('\' </code><em>any character</em><code>)<br>
sl@0	206	* \| ('\\u' hex hex hex hex)<br>
sl@0	207	* </code></td>
sl@0	208	* </tr>
sl@0	209	* <tr align="top">
sl@0	210	* <td nowrap valign="top" align="right"><code>hex :=  </code></td>
sl@0	211	* <td valign="top"><em>any character for which
sl@0	212	* </em><code>Character.digit(c, 16)</code><em>
sl@0	213	* returns a non-negative result</em></td>
sl@0	214	* </tr>
sl@0	215	* <tr>
sl@0	216	* <td nowrap valign="top" align="right"><code>property :=  </code></td>
sl@0	217	* <td valign="top"><em>a Unicode property set pattern</em></td>
sl@0	218	* </tr>
sl@0	219	* </table>
sl@0	220	* <br>
sl@0	221	* <table border="1">
sl@0	222	* <tr>
sl@0	223	* <td>Legend: <table>
sl@0	224	* <tr>
sl@0	225	* <td nowrap valign="top"><code>a := b</code></td>
sl@0	226	* <td width="20" valign="top">  </td>
sl@0	227	* <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
sl@0	228	* </tr>
sl@0	229	* <tr>
sl@0	230	* <td nowrap valign="top"><code>a?</code></td>
sl@0	231	* <td valign="top"></td>
sl@0	232	* <td valign="top">zero or one instance of <code>a</code><br>
sl@0	233	* </td>
sl@0	234	* </tr>
sl@0	235	* <tr>
sl@0	236	* <td nowrap valign="top"><code>a*</code></td>
sl@0	237	* <td valign="top"></td>
sl@0	238	* <td valign="top">zero or more instances of <code>a</code><br>
sl@0	239	* </td>
sl@0	240	* </tr>
sl@0	241	* <tr>
sl@0	242	* <td nowrap valign="top"><code>a \| b</code></td>
sl@0	243	* <td valign="top"></td>
sl@0	244	* <td valign="top">either <code>a</code> or <code>b</code><br>
sl@0	245	* </td>
sl@0	246	* </tr>
sl@0	247	* <tr>
sl@0	248	* <td nowrap valign="top"><code>'a'</code></td>
sl@0	249	* <td valign="top"></td>
sl@0	250	* <td valign="top">the literal string between the quotes </td>
sl@0	251	* </tr>
sl@0	252	* </table>
sl@0	253	* </td>
sl@0	254	* </tr>
sl@0	255	* </table>
sl@0	256	* \htmlonly</blockquote>\endhtmlonly
sl@0	257	*
sl@0	258	* @author Alan Liu
sl@0	259	* @stable ICU 2.0
sl@0	260	*/
sl@0	261	class U_COMMON_API UnicodeSet : public UnicodeFilter {
sl@0	262
sl@0	263	int32_t len; // length of list used; 0 <= len <= capacity
sl@0	264	int32_t capacity; // capacity of list
sl@0	265	int32_t bufferCapacity; // capacity of buffer
sl@0	266	UChar32* list; // MUST be terminated with HIGH
sl@0	267	UChar32* buffer; // internal buffer, may be NULL
sl@0	268
sl@0	269	UVector* strings; // maintained in sorted order
sl@0	270
sl@0	271	/**
sl@0	272	* The pattern representation of this set. This may not be the
sl@0	273	* most economical pattern. It is the pattern supplied to
sl@0	274	* applyPattern(), with variables substituted and whitespace
sl@0	275	* removed. For sets constructed without applyPattern(), or
sl@0	276	* modified using the non-pattern API, this string will be empty,
sl@0	277	* indicating that toPattern() must generate a pattern
sl@0	278	* representation from the inversion list.
sl@0	279	*/
sl@0	280	UnicodeString pat;
sl@0	281
sl@0	282	public:
sl@0	283
sl@0	284	enum {
sl@0	285	/**
sl@0	286	* Minimum value that can be stored in a UnicodeSet.
sl@0	287	* @stable ICU 2.4
sl@0	288	*/
sl@0	289	MIN_VALUE = 0,
sl@0	290
sl@0	291	/**
sl@0	292	* Maximum value that can be stored in a UnicodeSet.
sl@0	293	* @stable ICU 2.4
sl@0	294	*/
sl@0	295	MAX_VALUE = 0x10ffff
sl@0	296	};
sl@0	297
sl@0	298	//----------------------------------------------------------------
sl@0	299	// Constructors &c
sl@0	300	//----------------------------------------------------------------
sl@0	301
sl@0	302	public:
sl@0	303
sl@0	304	/**
sl@0	305	* Constructs an empty set.
sl@0	306	* @stable ICU 2.0
sl@0	307	*/
sl@0	308	UnicodeSet();
sl@0	309
sl@0	310	/**
sl@0	311	* Constructs a set containing the given range. If <code>start >
sl@0	312	* end</code> then an empty set is created.
sl@0	313	*
sl@0	314	* @param start first character, inclusive, of range
sl@0	315	* @param end last character, inclusive, of range
sl@0	316	* @stable ICU 2.4
sl@0	317	*/
sl@0	318	UnicodeSet(UChar32 start, UChar32 end);
sl@0	319
sl@0	320	/**
sl@0	321	* Constructs a set from the given pattern. See the class
sl@0	322	* description for the syntax of the pattern language.
sl@0	323	* @param pattern a string specifying what characters are in the set
sl@0	324	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
sl@0	325	* contains a syntax error.
sl@0	326	* @stable ICU 2.0
sl@0	327	*/
sl@0	328	UnicodeSet(const UnicodeString& pattern,
sl@0	329	UErrorCode& status);
sl@0	330
sl@0	331	/**
sl@0	332	* Constructs a set from the given pattern. See the class
sl@0	333	* description for the syntax of the pattern language.
sl@0	334	* @param pattern a string specifying what characters are in the set
sl@0	335	* @param options bitmask for options to apply to the pattern.
sl@0	336	* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
sl@0	337	* @param symbols a symbol table mapping variable names to values
sl@0	338	* and stand-in characters to UnicodeSets; may be NULL
sl@0	339	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
sl@0	340	* contains a syntax error.
sl@0	341	* @internal
sl@0	342	*/
sl@0	343	UnicodeSet(const UnicodeString& pattern,
sl@0	344	uint32_t options,
sl@0	345	const SymbolTable* symbols,
sl@0	346	UErrorCode& status);
sl@0	347
sl@0	348	/**
sl@0	349	* Constructs a set from the given pattern. See the class description
sl@0	350	* for the syntax of the pattern language.
sl@0	351	* @param pattern a string specifying what characters are in the set
sl@0	352	* @param pos on input, the position in pattern at which to start parsing.
sl@0	353	* On output, the position after the last character parsed.
sl@0	354	* @param options bitmask for options to apply to the pattern.
sl@0	355	* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
sl@0	356	* @param symbols a symbol table mapping variable names to values
sl@0	357	* and stand-in characters to UnicodeSets; may be NULL
sl@0	358	* @param status input-output error code
sl@0	359	* @stable ICU 2.8
sl@0	360	*/
sl@0	361	UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
sl@0	362	uint32_t options,
sl@0	363	const SymbolTable* symbols,
sl@0	364	UErrorCode& status);
sl@0	365
sl@0	366	#ifdef U_USE_UNICODESET_DEPRECATES
sl@0	367	/**
sl@0	368	* Obsolete: Constructs a set from the given Unicode character category.
sl@0	369	* @param category an integer indicating the character category as
sl@0	370	* defined in uchar.h.
sl@0	371	* @obsolete ICU 2.6. Use a pattern with the category instead since this API will be removed in that release.
sl@0	372	*/
sl@0	373	UnicodeSet(int8_t category, UErrorCode& status);
sl@0	374	#endif
sl@0	375
sl@0	376	/**
sl@0	377	* Constructs a set that is identical to the given UnicodeSet.
sl@0	378	* @stable ICU 2.0
sl@0	379	*/
sl@0	380	UnicodeSet(const UnicodeSet& o);
sl@0	381
sl@0	382	/**
sl@0	383	* Destructs the set.
sl@0	384	* @stable ICU 2.0
sl@0	385	*/
sl@0	386	virtual ~UnicodeSet();
sl@0	387
sl@0	388	/**
sl@0	389	* Assigns this object to be a copy of another.
sl@0	390	* @stable ICU 2.0
sl@0	391	*/
sl@0	392	UnicodeSet& operator=(const UnicodeSet& o);
sl@0	393
sl@0	394	/**
sl@0	395	* Compares the specified object with this set for equality. Returns
sl@0	396	* <tt>true</tt> if the two sets
sl@0	397	* have the same size, and every member of the specified set is
sl@0	398	* contained in this set (or equivalently, every member of this set is
sl@0	399	* contained in the specified set).
sl@0	400	*
sl@0	401	* @param o set to be compared for equality with this set.
sl@0	402	* @return <tt>true</tt> if the specified set is equal to this set.
sl@0	403	* @stable ICU 2.0
sl@0	404	*/
sl@0	405	virtual UBool operator==(const UnicodeSet& o) const;
sl@0	406
sl@0	407	/**
sl@0	408	* Compares the specified object with this set for inequality. Returns
sl@0	409	* <tt>true</tt> if the specified set is not equal to this set.
sl@0	410	* @stable ICU 2.0
sl@0	411	*/
sl@0	412	UBool operator!=(const UnicodeSet& o) const;
sl@0	413
sl@0	414	/**
sl@0	415	* Returns a copy of this object. All UnicodeFunctor objects have
sl@0	416	* to support cloning in order to allow classes using
sl@0	417	* UnicodeFunctors, such as Transliterator, to implement cloning.
sl@0	418	* @stable ICU 2.0
sl@0	419	*/
sl@0	420	virtual UnicodeFunctor* clone() const;
sl@0	421
sl@0	422	/**
sl@0	423	* Returns the hash code value for this set.
sl@0	424	*
sl@0	425	* @return the hash code value for this set.
sl@0	426	* @see Object#hashCode()
sl@0	427	* @stable ICU 2.0
sl@0	428	*/
sl@0	429	virtual int32_t hashCode(void) const;
sl@0	430
sl@0	431	//----------------------------------------------------------------
sl@0	432	// Public API
sl@0	433	//----------------------------------------------------------------
sl@0	434
sl@0	435	/**
sl@0	436	* Make this object represent the range <code>start - end</code>.
sl@0	437	* If <code>start > end</code> then this object is set to an
sl@0	438	* empty range.
sl@0	439	*
sl@0	440	* @param start first character in the set, inclusive
sl@0	441	* @param end last character in the set, inclusive
sl@0	442	* @stable ICU 2.4
sl@0	443	*/
sl@0	444	UnicodeSet& set(UChar32 start, UChar32 end);
sl@0	445
sl@0	446	/**
sl@0	447	* Return true if the given position, in the given pattern, appears
sl@0	448	* to be the start of a UnicodeSet pattern.
sl@0	449	* @stable ICU 2.4
sl@0	450	*/
sl@0	451	static UBool resemblesPattern(const UnicodeString& pattern,
sl@0	452	int32_t pos);
sl@0	453
sl@0	454	/**
sl@0	455	* Modifies this set to represent the set specified by the given
sl@0	456	* pattern, optionally ignoring white space. See the class
sl@0	457	* description for the syntax of the pattern language.
sl@0	458	* @param pattern a string specifying what characters are in the set
sl@0	459	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
sl@0	460	* contains a syntax error.
sl@0	461	* <em> Empties the set passed before applying the pattern.</em>
sl@0	462	* @return a reference to this
sl@0	463	* @stable ICU 2.0
sl@0	464	*/
sl@0	465	UnicodeSet& applyPattern(const UnicodeString& pattern,
sl@0	466	UErrorCode& status);
sl@0	467
sl@0	468	/**
sl@0	469	* Modifies this set to represent the set specified by the given
sl@0	470	* pattern, optionally ignoring white space. See the class
sl@0	471	* description for the syntax of the pattern language.
sl@0	472	* @param pattern a string specifying what characters are in the set
sl@0	473	* @param options bitmask for options to apply to the pattern.
sl@0	474	* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
sl@0	475	* @param symbols a symbol table mapping variable names to
sl@0	476	* values and stand-ins to UnicodeSets; may be NULL
sl@0	477	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
sl@0	478	* contains a syntax error.
sl@0	479	*<em> Empties the set passed before applying the pattern.</em>
sl@0	480	* @return a reference to this
sl@0	481	* @internal
sl@0	482	*/
sl@0	483	UnicodeSet& applyPattern(const UnicodeString& pattern,
sl@0	484	uint32_t options,
sl@0	485	const SymbolTable* symbols,
sl@0	486	UErrorCode& status);
sl@0	487
sl@0	488	/**
sl@0	489	* Parses the given pattern, starting at the given position. The
sl@0	490	* character at pattern.charAt(pos.getIndex()) must be '[', or the
sl@0	491	* parse fails. Parsing continues until the corresponding closing
sl@0	492	* ']'. If a syntax error is encountered between the opening and
sl@0	493	* closing bracket, the parse fails. Upon return from a successful
sl@0	494	* parse, the ParsePosition is updated to point to the character
sl@0	495	* following the closing ']', and a StringBuffer containing a
sl@0	496	* pairs list for the parsed pattern is returned. This method calls
sl@0	497	* itself recursively to parse embedded subpatterns.
sl@0	498	*<em> Empties the set passed before applying the pattern.</em>
sl@0	499	*
sl@0	500	* @param pattern the string containing the pattern to be parsed.
sl@0	501	* The portion of the string from pos.getIndex(), which must be a
sl@0	502	* '[', to the corresponding closing ']', is parsed.
sl@0	503	* @param pos upon entry, the position at which to begin parsing.
sl@0	504	* The character at pattern.charAt(pos.getIndex()) must be a '['.
sl@0	505	* Upon return from a successful parse, pos.getIndex() is either
sl@0	506	* the character after the closing ']' of the parsed pattern, or
sl@0	507	* pattern.length() if the closing ']' is the last character of
sl@0	508	* the pattern string.
sl@0	509	* @param options bitmask for options to apply to the pattern.
sl@0	510	* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
sl@0	511	* @param symbols a symbol table mapping variable names to
sl@0	512	* values and stand-ins to UnicodeSets; may be NULL
sl@0	513	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
sl@0	514	* contains a syntax error.
sl@0	515	* @return a reference to this
sl@0	516	* @stable ICU 2.8
sl@0	517	*/
sl@0	518	UnicodeSet& applyPattern(const UnicodeString& pattern,
sl@0	519	ParsePosition& pos,
sl@0	520	uint32_t options,
sl@0	521	const SymbolTable* symbols,
sl@0	522	UErrorCode& status);
sl@0	523
sl@0	524	/**
sl@0	525	* Returns a string representation of this set. If the result of
sl@0	526	* calling this function is passed to a UnicodeSet constructor, it
sl@0	527	* will produce another set that is equal to this one.
sl@0	528	* @param result the string to receive the rules. Previous
sl@0	529	* contents will be deleted.
sl@0	530	* @param escapeUnprintable if TRUE then convert unprintable
sl@0	531	* characters to their hex escape representations, \\uxxxx or
sl@0	532	* \\Uxxxxxxxx. Unprintable characters are those other than
sl@0	533	* U+000A, U+0020..U+007E.
sl@0	534	* @stable ICU 2.0
sl@0	535	*/
sl@0	536	virtual UnicodeString& toPattern(UnicodeString& result,
sl@0	537	UBool escapeUnprintable = FALSE) const;
sl@0	538
sl@0	539	/**
sl@0	540	* Modifies this set to contain those code points which have the given value
sl@0	541	* for the given binary or enumerated property, as returned by
sl@0	542	* u_getIntPropertyValue. Prior contents of this set are lost.
sl@0	543	*
sl@0	544	* @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
sl@0	545	* or UCHAR_INT_START..UCHAR_INT_LIMIT-1
sl@0	546	* or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
sl@0	547	*
sl@0	548	* @param value a value in the range u_getIntPropertyMinValue(prop)..
sl@0	549	* u_getIntPropertyMaxValue(prop), with one exception. If prop is
sl@0	550	* UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
sl@0	551	* rather a mask value produced by U_GET_GC_MASK(). This allows grouped
sl@0	552	* categories such as [:L:] to be represented.
sl@0	553	*
sl@0	554	* @param ec error code input/output parameter
sl@0	555	*
sl@0	556	* @return a reference to this set
sl@0	557	*
sl@0	558	* @stable ICU 2.4
sl@0	559	*/
sl@0	560	UnicodeSet& applyIntPropertyValue(UProperty prop,
sl@0	561	int32_t value,
sl@0	562	UErrorCode& ec);
sl@0	563
sl@0	564	/**
sl@0	565	* Modifies this set to contain those code points which have the
sl@0	566	* given value for the given property. Prior contents of this
sl@0	567	* set are lost.
sl@0	568	*
sl@0	569	* @param prop a property alias, either short or long. The name is matched
sl@0	570	* loosely. See PropertyAliases.txt for names and a description of loose
sl@0	571	* matching. If the value string is empty, then this string is interpreted
sl@0	572	* as either a General_Category value alias, a Script value alias, a binary
sl@0	573	* property alias, or a special ID. Special IDs are matched loosely and
sl@0	574	* correspond to the following sets:
sl@0	575	*
sl@0	576	* "ANY" = [\\u0000-\\U0010FFFF],
sl@0	577	* "ASCII" = [\\u0000-\\u007F],
sl@0	578	* "Assigned" = [:^Cn:].
sl@0	579	*
sl@0	580	* @param value a value alias, either short or long. The name is matched
sl@0	581	* loosely. See PropertyValueAliases.txt for names and a description of
sl@0	582	* loose matching. In addition to aliases listed, numeric values and
sl@0	583	* canonical combining classes may be expressed numerically, e.g., ("nv",
sl@0	584	* "0.5") or ("ccc", "220"). The value string may also be empty.
sl@0	585	*
sl@0	586	* @param ec error code input/output parameter
sl@0	587	*
sl@0	588	* @return a reference to this set
sl@0	589	*
sl@0	590	* @stable ICU 2.4
sl@0	591	*/
sl@0	592	UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
sl@0	593	const UnicodeString& value,
sl@0	594	UErrorCode& ec);
sl@0	595
sl@0	596	/**
sl@0	597	* Returns the number of elements in this set (its cardinality).
sl@0	598	* Note that the elements of a set may include both individual
sl@0	599	* codepoints and strings.
sl@0	600	*
sl@0	601	* @return the number of elements in this set (its cardinality).
sl@0	602	* @stable ICU 2.0
sl@0	603	*/
sl@0	604	virtual int32_t size(void) const;
sl@0	605
sl@0	606	/**
sl@0	607	* Returns <tt>true</tt> if this set contains no elements.
sl@0	608	*
sl@0	609	* @return <tt>true</tt> if this set contains no elements.
sl@0	610	* @stable ICU 2.0
sl@0	611	*/
sl@0	612	virtual UBool isEmpty(void) const;
sl@0	613
sl@0	614	/**
sl@0	615	* Returns true if this set contains the given character.
sl@0	616	* @param c character to be checked for containment
sl@0	617	* @return true if the test condition is met
sl@0	618	* @stable ICU 2.0
sl@0	619	*/
sl@0	620	virtual UBool contains(UChar32 c) const;
sl@0	621
sl@0	622	/**
sl@0	623	* Returns true if this set contains every character
sl@0	624	* of the given range.
sl@0	625	* @param start first character, inclusive, of the range
sl@0	626	* @param end last character, inclusive, of the range
sl@0	627	* @return true if the test condition is met
sl@0	628	* @stable ICU 2.0
sl@0	629	*/
sl@0	630	virtual UBool contains(UChar32 start, UChar32 end) const;
sl@0	631
sl@0	632	/**
sl@0	633	* Returns <tt>true</tt> if this set contains the given
sl@0	634	* multicharacter string.
sl@0	635	* @param s string to be checked for containment
sl@0	636	* @return <tt>true</tt> if this set contains the specified string
sl@0	637	* @stable ICU 2.4
sl@0	638	*/
sl@0	639	UBool contains(const UnicodeString& s) const;
sl@0	640
sl@0	641	/**
sl@0	642	* Returns true if this set contains all the characters and strings
sl@0	643	* of the given set.
sl@0	644	* @param c set to be checked for containment
sl@0	645	* @return true if the test condition is met
sl@0	646	* @stable ICU 2.4
sl@0	647	*/
sl@0	648	virtual UBool containsAll(const UnicodeSet& c) const;
sl@0	649
sl@0	650	/**
sl@0	651	* Returns true if this set contains all the characters
sl@0	652	* of the given string.
sl@0	653	* @param s string containing characters to be checked for containment
sl@0	654	* @return true if the test condition is met
sl@0	655	* @stable ICU 2.4
sl@0	656	*/
sl@0	657	UBool containsAll(const UnicodeString& s) const;
sl@0	658
sl@0	659	/**
sl@0	660	* Returns true if this set contains none of the characters
sl@0	661	* of the given range.
sl@0	662	* @param start first character, inclusive, of the range
sl@0	663	* @param end last character, inclusive, of the range
sl@0	664	* @return true if the test condition is met
sl@0	665	* @stable ICU 2.4
sl@0	666	*/
sl@0	667	UBool containsNone(UChar32 start, UChar32 end) const;
sl@0	668
sl@0	669	/**
sl@0	670	* Returns true if this set contains none of the characters and strings
sl@0	671	* of the given set.
sl@0	672	* @param c set to be checked for containment
sl@0	673	* @return true if the test condition is met
sl@0	674	* @stable ICU 2.4
sl@0	675	*/
sl@0	676	UBool containsNone(const UnicodeSet& c) const;
sl@0	677
sl@0	678	/**
sl@0	679	* Returns true if this set contains none of the characters
sl@0	680	* of the given string.
sl@0	681	* @param s string containing characters to be checked for containment
sl@0	682	* @return true if the test condition is met
sl@0	683	* @stable ICU 2.4
sl@0	684	*/
sl@0	685	UBool containsNone(const UnicodeString& s) const;
sl@0	686
sl@0	687	/**
sl@0	688	* Returns true if this set contains one or more of the characters
sl@0	689	* in the given range.
sl@0	690	* @param start first character, inclusive, of the range
sl@0	691	* @param end last character, inclusive, of the range
sl@0	692	* @return true if the condition is met
sl@0	693	* @stable ICU 2.4
sl@0	694	*/
sl@0	695	inline UBool containsSome(UChar32 start, UChar32 end) const;
sl@0	696
sl@0	697	/**
sl@0	698	* Returns true if this set contains one or more of the characters
sl@0	699	* and strings of the given set.
sl@0	700	* @param s The set to be checked for containment
sl@0	701	* @return true if the condition is met
sl@0	702	* @stable ICU 2.4
sl@0	703	*/
sl@0	704	inline UBool containsSome(const UnicodeSet& s) const;
sl@0	705
sl@0	706	/**
sl@0	707	* Returns true if this set contains one or more of the characters
sl@0	708	* of the given string.
sl@0	709	* @param s string containing characters to be checked for containment
sl@0	710	* @return true if the condition is met
sl@0	711	* @stable ICU 2.4
sl@0	712	*/
sl@0	713	inline UBool containsSome(const UnicodeString& s) const;
sl@0	714
sl@0	715	/**
sl@0	716	* Implement UnicodeMatcher::matches()
sl@0	717	* @stable ICU 2.4
sl@0	718	*/
sl@0	719	virtual UMatchDegree matches(const Replaceable& text,
sl@0	720	int32_t& offset,
sl@0	721	int32_t limit,
sl@0	722	UBool incremental);
sl@0	723
sl@0	724	private:
sl@0	725	/**
sl@0	726	* Returns the longest match for s in text at the given position.
sl@0	727	* If limit > start then match forward from start+1 to limit
sl@0	728	* matching all characters except s.charAt(0). If limit < start,
sl@0	729	* go backward starting from start-1 matching all characters
sl@0	730	* except s.charAt(s.length()-1). This method assumes that the
sl@0	731	* first character, text.charAt(start), matches s, so it does not
sl@0	732	* check it.
sl@0	733	* @param text the text to match
sl@0	734	* @param start the first character to match. In the forward
sl@0	735	* direction, text.charAt(start) is matched against s.charAt(0).
sl@0	736	* In the reverse direction, it is matched against
sl@0	737	* s.charAt(s.length()-1).
sl@0	738	* @param limit the limit offset for matching, either last+1 in
sl@0	739	* the forward direction, or last-1 in the reverse direction,
sl@0	740	* where last is the index of the last character to match.
sl@0	741	* @return If part of s matches up to the limit, return |limit -
sl@0	742	* start|. If all of s matches before reaching the limit, return
sl@0	743	* s.length(). If there is a mismatch between s and text, return
sl@0	744	* 0
sl@0	745	*/
sl@0	746	static int32_t matchRest(const Replaceable& text,
sl@0	747	int32_t start, int32_t limit,
sl@0	748	const UnicodeString& s);
sl@0	749
sl@0	750	/**
sl@0	751	* Returns the smallest value i such that c < list[i]. Caller
sl@0	752	* must ensure that c is a legal value or this method will enter
sl@0	753	* an infinite loop. This method performs a binary search.
sl@0	754	* @param c a character in the range MIN_VALUE..MAX_VALUE
sl@0	755	* inclusive
sl@0	756	* @return the smallest integer i in the range 0..len-1,
sl@0	757	* inclusive, such that c < list[i]
sl@0	758	*/
sl@0	759	int32_t findCodePoint(UChar32 c) const;
sl@0	760
sl@0	761	public:
sl@0	762
sl@0	763	/**
sl@0	764	* Implementation of UnicodeMatcher API. Union the set of all
sl@0	765	* characters that may be matched by this object into the given
sl@0	766	* set.
sl@0	767	* @param toUnionTo the set into which to union the source characters
sl@0	768	* @stable ICU 2.4
sl@0	769	*/
sl@0	770	virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
sl@0	771
sl@0	772	/**
sl@0	773	* Returns the index of the given character within this set, where
sl@0	774	* the set is ordered by ascending code point. If the character
sl@0	775	* is not in this set, return -1. The inverse of this method is
sl@0	776	* <code>charAt()</code>.
sl@0	777	* @return an index from 0..size()-1, or -1
sl@0	778	* @stable ICU 2.4
sl@0	779	*/
sl@0	780	int32_t indexOf(UChar32 c) const;
sl@0	781
sl@0	782	/**
sl@0	783	* Returns the character at the given index within this set, where
sl@0	784	* the set is ordered by ascending code point. If the index is
sl@0	785	* out of range, return (UChar32)-1. The inverse of this method is
sl@0	786	* <code>indexOf()</code>.
sl@0	787	* @param index an index from 0..size()-1
sl@0	788	* @return the character at the given index, or (UChar32)-1.
sl@0	789	* @stable ICU 2.4
sl@0	790	*/
sl@0	791	UChar32 charAt(int32_t index) const;
sl@0	792
sl@0	793	/**
sl@0	794	* Adds the specified range to this set if it is not already
sl@0	795	* present. If this set already contains the specified range,
sl@0	796	* the call leaves this set unchanged. If <code>start > end</code>
sl@0	797	* then an empty range is added, leaving the set unchanged.
sl@0	798	* This is equivalent to a boolean logic OR, or a set UNION.
sl@0	799	*
sl@0	800	* @param start first character, inclusive, of range to be added
sl@0	801	* to this set.
sl@0	802	* @param end last character, inclusive, of range to be added
sl@0	803	* to this set.
sl@0	804	* @stable ICU 2.0
sl@0	805	*/
sl@0	806	virtual UnicodeSet& add(UChar32 start, UChar32 end);
sl@0	807
sl@0	808	/**
sl@0	809	* Adds the specified character to this set if it is not already
sl@0	810	* present. If this set already contains the specified character,
sl@0	811	* the call leaves this set unchanged.
sl@0	812	* @stable ICU 2.0
sl@0	813	*/
sl@0	814	UnicodeSet& add(UChar32 c);
sl@0	815
sl@0	816	/**
sl@0	817	* Adds the specified multicharacter string to this set if it is not already
sl@0	818	* present. If this set already contains the multicharacter string,
sl@0	819	* the call leaves this set unchanged.
sl@0	820	* Thus "ch" => {"ch"}
sl@0	821	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
sl@0	822	* @param s the source string
sl@0	823	* @return this object, for chaining
sl@0	824	* @stable ICU 2.4
sl@0	825	*/
sl@0	826	UnicodeSet& add(const UnicodeString& s);
sl@0	827
sl@0	828	private:
sl@0	829	/**
sl@0	830	* @return a code point IF the string consists of a single one.
sl@0	831	* otherwise returns -1.
sl@0	832	* @param s string to test
sl@0	833	*/
sl@0	834	static int32_t getSingleCP(const UnicodeString& s);
sl@0	835
sl@0	836	void _add(const UnicodeString& s);
sl@0	837
sl@0	838	public:
sl@0	839	/**
sl@0	840	* Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
sl@0	841	* If this set already contains any particular character, it has no effect on that character.
sl@0	842	* @param s the source string
sl@0	843	* @return this object, for chaining
sl@0	844	* @stable ICU 2.4
sl@0	845	*/
sl@0	846	UnicodeSet& addAll(const UnicodeString& s);
sl@0	847
sl@0	848	/**
sl@0	849	* Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
sl@0	850	* If this set already contains any particular character, it has no effect on that character.
sl@0	851	* @param s the source string
sl@0	852	* @return this object, for chaining
sl@0	853	* @stable ICU 2.4
sl@0	854	*/
sl@0	855	UnicodeSet& retainAll(const UnicodeString& s);
sl@0	856
sl@0	857	/**
sl@0	858	* Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
sl@0	859	* A character that this set already contains is removed; a character it does not contain is added.
sl@0	860	* @param s the source string
sl@0	861	* @return this object, for chaining
sl@0	862	* @stable ICU 2.4
sl@0	863	*/
sl@0	864	UnicodeSet& complementAll(const UnicodeString& s);
sl@0	865
sl@0	866	/**
sl@0	867	* Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
sl@0	868	* If this set does not contain any particular character, it has no effect on that character.
sl@0	869	* @param s the source string
sl@0	870	* @return this object, for chaining
sl@0	871	* @stable ICU 2.4
sl@0	872	*/
sl@0	873	UnicodeSet& removeAll(const UnicodeString& s);
sl@0	874
sl@0	875	/**
sl@0	876	* Makes a set from a multicharacter string. Thus "ch" => {"ch"}
sl@0	877	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
sl@0	878	* @param s the source string
sl@0	879	* @return a newly created set containing the given string.
sl@0	880	* The caller owns the return object and is responsible for deleting it.
sl@0	881	* @stable ICU 2.4
sl@0	882	*/
sl@0	883	static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
sl@0	884
sl@0	885
sl@0	886	/**
sl@0	887	* Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
sl@0	888	* @param s the source string
sl@0	889	* @return a newly created set containing the given characters
sl@0	890	* The caller owns the return object and is responsible for deleting it.
sl@0	891	* @stable ICU 2.4
sl@0	892	*/
sl@0	893	static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
sl@0	894
sl@0	895	/**
sl@0	896	* Retain only the elements in this set that are contained in the
sl@0	897	* specified range. If <code>start > end</code> then an empty range is
sl@0	898	* retained, leaving the set empty. This is equivalent to
sl@0	899	* a boolean logic AND, or a set INTERSECTION.
sl@0	900	*
sl@0	901	* @param start first character, inclusive, of range to be retained
sl@0	902	* in this set.
sl@0	903	* @param end last character, inclusive, of range to be retained
sl@0	904	* in this set.
sl@0	905	* @stable ICU 2.0
sl@0	906	*/
sl@0	907	virtual UnicodeSet& retain(UChar32 start, UChar32 end);
sl@0	908
sl@0	909
sl@0	910	/**
sl@0	911	* Retain the specified character from this set if it is present.
sl@0	912	* @stable ICU 2.0
sl@0	913	*/
sl@0	914	UnicodeSet& retain(UChar32 c);
sl@0	915
sl@0	916	/**
sl@0	917	* Removes the specified range from this set if it is present.
sl@0	918	* The set will not contain the specified range once the call
sl@0	919	* returns. If <code>start > end</code> then an empty range is
sl@0	920	* removed, leaving the set unchanged.
sl@0	921	*
sl@0	922	* @param start first character, inclusive, of range to be removed
sl@0	923	* from this set.
sl@0	924	* @param end last character, inclusive, of range to be removed
sl@0	925	* from this set.
sl@0	926	* @stable ICU 2.0
sl@0	927	*/
sl@0	928	virtual UnicodeSet& remove(UChar32 start, UChar32 end);
sl@0	929
sl@0	930	/**
sl@0	931	* Removes the specified character from this set if it is present.
sl@0	932	* The set will not contain the specified character once the call
sl@0	933	* returns.
sl@0	934	* @stable ICU 2.0
sl@0	935	*/
sl@0	936	UnicodeSet& remove(UChar32 c);
sl@0	937
sl@0	938	/**
sl@0	939	* Removes the specified string from this set if it is present.
sl@0	940	* The set will not contain the specified string once the call
sl@0	941	* returns.
sl@0	942	* @param s the source string
sl@0	943	* @return this object, for chaining
sl@0	944	* @stable ICU 2.4
sl@0	945	*/
sl@0	946	UnicodeSet& remove(const UnicodeString& s);
sl@0	947
sl@0	948	/**
sl@0	949	* Inverts this set. This operation modifies this set so that
sl@0	950	* its value is its complement. This is equivalent to
sl@0	951	* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
sl@0	952	* @stable ICU 2.0
sl@0	953	*/
sl@0	954	virtual UnicodeSet& complement(void);
sl@0	955
sl@0	956	/**
sl@0	957	* Complements the specified range in this set. Any character in
sl@0	958	* the range will be removed if it is in this set, or will be
sl@0	959	* added if it is not in this set. If <code>start > end</code>
sl@0	960	* then an empty range is complemented, leaving the set unchanged.
sl@0	961	* This is equivalent to a boolean logic XOR.
sl@0	962	*
sl@0	963	* @param start first character, inclusive, of range to be complemented
sl@0	964	* in this set.
sl@0	965	* @param end last character, inclusive, of range to be complemented
sl@0	966	* in this set.
sl@0	967	* @stable ICU 2.0
sl@0	968	*/
sl@0	969	virtual UnicodeSet& complement(UChar32 start, UChar32 end);
sl@0	970
sl@0	971	/**
sl@0	972	* Complements the specified character in this set. The character
sl@0	973	* will be removed if it is in this set, or will be added if it is
sl@0	974	* not in this set.
sl@0	975	* @stable ICU 2.0
sl@0	976	*/
sl@0	977	UnicodeSet& complement(UChar32 c);
sl@0	978
sl@0	979	/**
sl@0	980	* Complement the specified string in this set.
sl@0	981	* The string will be removed if it is in this set, or will be
sl@0	982	* added if it is not in this set.
sl@0	983	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
sl@0	984	* @param s the string to complement
sl@0	985	* @return this object, for chaining
sl@0	986	* @stable ICU 2.4
sl@0	987	*/
sl@0	988	UnicodeSet& complement(const UnicodeString& s);
sl@0	989
sl@0	990	/**
sl@0	991	* Adds all of the elements in the specified set to this set if
sl@0	992	* they're not already present. This operation effectively
sl@0	993	* modifies this set so that its value is the <i>union</i> of the two
sl@0	994	* sets. The behavior of this operation is unspecified if the specified
sl@0	995	* collection is modified while the operation is in progress.
sl@0	996	*
sl@0	997	* @param c set whose elements are to be added to this set.
sl@0	998	* @see #add(UChar32, UChar32)
sl@0	999	* @stable ICU 2.0
sl@0	1000	*/
sl@0	1001	virtual UnicodeSet& addAll(const UnicodeSet& c);
sl@0	1002
sl@0	1003	/**
sl@0	1004	* Retains only the elements in this set that are contained in the
sl@0	1005	* specified set. In other words, removes from this set all of
sl@0	1006	* its elements that are not contained in the specified set. This
sl@0	1007	* operation effectively modifies this set so that its value is
sl@0	1008	* the <i>intersection</i> of the two sets.
sl@0	1009	*
sl@0	1010	* @param c set that defines which elements this set will retain.
sl@0	1011	* @stable ICU 2.0
sl@0	1012	*/
sl@0	1013	virtual UnicodeSet& retainAll(const UnicodeSet& c);
sl@0	1014
sl@0	1015	/**
sl@0	1016	* Removes from this set all of its elements that are contained in the
sl@0	1017	* specified set. This operation effectively modifies this
sl@0	1018	* set so that its value is the <i>asymmetric set difference</i> of
sl@0	1019	* the two sets.
sl@0	1020	*
sl@0	1021	* @param c set that defines which elements will be removed from
sl@0	1022	* this set.
sl@0	1023	* @stable ICU 2.0
sl@0	1024	*/
sl@0	1025	virtual UnicodeSet& removeAll(const UnicodeSet& c);
sl@0	1026
sl@0	1027	/**
sl@0	1028	* Complements in this set all elements contained in the specified
sl@0	1029	* set. Any character in the other set will be removed if it is
sl@0	1030	* in this set, or will be added if it is not in this set.
sl@0	1031	*
sl@0	1032	* @param c set that defines which elements will be xor'ed from
sl@0	1033	* this set.
sl@0	1034	* @stable ICU 2.4
sl@0	1035	*/
sl@0	1036	virtual UnicodeSet& complementAll(const UnicodeSet& c);
sl@0	1037
sl@0	1038	/**
sl@0	1039	* Removes all of the elements from this set. This set will be
sl@0	1040	* empty after this call returns.
sl@0	1041	* @stable ICU 2.0
sl@0	1042	*/
sl@0	1043	virtual UnicodeSet& clear(void);
sl@0	1044
sl@0	1045	/**
sl@0	1046	* Close this set over the given attribute. For the attribute
sl@0	1047	* USET_CASE, the result is to modify this set so that:
sl@0	1048	*
sl@0	1049	* 1. For each character or string 'a' in this set, all strings or
sl@0	1050	* characters 'b' such that foldCase(a) == foldCase(b) are added
sl@0	1051	* to this set.
sl@0	1052	*
sl@0	1053	* 2. For each string 'e' in the resulting set, if e !=
sl@0	1054	* foldCase(e), 'e' will be removed.
sl@0	1055	*
sl@0	1056	* Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
sl@0	1057	*
sl@0	1058	* (Here foldCase(x) refers to the operation u_strFoldCase, and a
sl@0	1059	* == b denotes that the contents are the same, not pointer
sl@0	1060	* comparison.)
sl@0	1061	*
sl@0	1062	* @param attribute bitmask for attributes to close over.
sl@0	1063	* Currently only the USET_CASE bit is supported. Any undefined bits
sl@0	1064	* are ignored.
sl@0	1065	* @return a reference to this set.
sl@0	1066	* @internal
sl@0	1067	*/
sl@0	1068	UnicodeSet& closeOver(int32_t attribute);
sl@0	1069
sl@0	1070	/**
sl@0	1071	* Iteration method that returns the number of ranges contained in
sl@0	1072	* this set.
sl@0	1073	* @see #getRangeStart
sl@0	1074	* @see #getRangeEnd
sl@0	1075	* @stable ICU 2.4
sl@0	1076	*/
sl@0	1077	virtual int32_t getRangeCount(void) const;
sl@0	1078
sl@0	1079	/**
sl@0	1080	* Iteration method that returns the first character in the
sl@0	1081	* specified range of this set.
sl@0	1082	* @see #getRangeCount
sl@0	1083	* @see #getRangeEnd
sl@0	1084	* @stable ICU 2.4
sl@0	1085	*/
sl@0	1086	virtual UChar32 getRangeStart(int32_t index) const;
sl@0	1087
sl@0	1088	/**
sl@0	1089	* Iteration method that returns the last character in the
sl@0	1090	* specified range of this set.
sl@0	1091	* @see #getRangeStart
sl@0	1092	* @see #getRangeCount
sl@0	1093	* @stable ICU 2.4
sl@0	1094	*/
sl@0	1095	virtual UChar32 getRangeEnd(int32_t index) const;
sl@0	1096
sl@0	1097	/**
sl@0	1098	* Serializes this set into an array of 16-bit integers. Serialization
sl@0	1099	* (currently) only records the characters in the set; multicharacter
sl@0	1100	* strings are ignored.
sl@0	1101	*
sl@0	1102	* The array has following format (each line is one 16-bit
sl@0	1103	* integer):
sl@0	1104	*
sl@0	1105	* length = (n+2*m) | (m!=0?0x8000:0)
sl@0	1106	* bmpLength = n; present if m!=0
sl@0	1107	* bmp[0]
sl@0	1108	* bmp[1]
sl@0	1109	* ...
sl@0	1110	* bmp[n-1]
sl@0	1111	* supp-high[0]
sl@0	1112	* supp-low[0]
sl@0	1113	* supp-high[1]
sl@0	1114	* supp-low[1]
sl@0	1115	* ...
sl@0	1116	* supp-high[m-1]
sl@0	1117	* supp-low[m-1]
sl@0	1118	*
sl@0	1119	* The array starts with a header. After the header are n bmp
sl@0	1120	* code points, then m supplementary code points. Either n or m
sl@0	1121	* or both may be zero. n+2*m is always <= 0x7FFF.
sl@0	1122	*
sl@0	1123	* If there are no supplementary characters (if m==0) then the
sl@0	1124	* header is one 16-bit integer, 'length', with value n.
sl@0	1125	*
sl@0	1126	* If there are supplementary characters (if m!=0) then the header
sl@0	1127	* is two 16-bit integers. The first, 'length', has value
sl@0	1128	* (n+2*m)|0x8000. The second, 'bmpLength', has value n.
sl@0	1129	*
sl@0	1130	* After the header the code points are stored in ascending order.
sl@0	1131	* Supplementary code points are stored as most significant 16
sl@0	1132	* bits followed by least significant 16 bits.
sl@0	1133	*
sl@0	1134	* @param dest pointer to buffer of destCapacity 16-bit integers.
sl@0	1135	* May be NULL only if destCapacity is zero.
sl@0	1136	* @param destCapacity size of dest, or zero. Must not be negative.
sl@0	1137	* @param ec error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR
sl@0	1138	* if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if
sl@0	1139	* n+2*m+(m!=0?2:1) > destCapacity.
sl@0	1140	* @return the total length of the serialized format, including
sl@0	1141	* the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
sl@0	1142	* than U_BUFFER_OVERFLOW_ERROR.
sl@0	1143	* @stable ICU 2.4
sl@0	1144	*/
sl@0	1145	int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
sl@0	1146
sl@0	1147	/**
sl@0	1148	* Reallocate this object's internal structures to take up the least
sl@0	1149	* possible space, without changing this object's value.
sl@0	1150	* @stable ICU 2.4
sl@0	1151	*/
sl@0	1152	virtual UnicodeSet& compact();
sl@0	1153
sl@0	1154	/**
sl@0	1155	* Return the class ID for this class. This is useful only for
sl@0	1156	* comparing to a return value from getDynamicClassID(). For example:
sl@0	1157	* <pre>
sl@0	1158	* . Base* polymorphic_pointer = createPolymorphicObject();
sl@0	1159	* . if (polymorphic_pointer->getDynamicClassID() ==
sl@0	1160	* . Derived::getStaticClassID()) ...
sl@0	1161	* </pre>
sl@0	1162	* @return The class ID for all objects of this class.
sl@0	1163	* @stable ICU 2.0
sl@0	1164	*/
sl@0	1165	static UClassID U_EXPORT2 getStaticClassID(void);
sl@0	1166
sl@0	1167	/**
sl@0	1168	* Implement UnicodeFunctor API.
sl@0	1169	*
sl@0	1170	* @return The class ID for this object. All objects of a given
sl@0	1171	* class have the same class ID. Objects of other classes have
sl@0	1172	* different class IDs.
sl@0	1173	* @stable ICU 2.4
sl@0	1174	*/
sl@0	1175	virtual UClassID getDynamicClassID(void) const;
sl@0	1176
sl@0	1177	private:
sl@0	1178
sl@0	1179	// Private API for the USet API
sl@0	1180
sl@0	1181	friend class USetAccess;
sl@0	1182
sl@0	1183	int32_t getStringCount() const;
sl@0	1184
sl@0	1185	const UnicodeString* getString(int32_t index) const;
sl@0	1186
sl@0	1187	//----------------------------------------------------------------
sl@0	1188	// RuleBasedTransliterator support
sl@0	1189	//----------------------------------------------------------------
sl@0	1190
sl@0	1191	private:
sl@0	1192
sl@0	1193	/**
sl@0	1194	* Returns <tt>true</tt> if this set contains any character whose low byte
sl@0	1195	* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
sl@0	1196	* indexing.
sl@0	1197	*/
sl@0	1198	virtual UBool matchesIndexValue(uint8_t v) const;
sl@0	1199
sl@0	1200	private:
sl@0	1201
sl@0	1202	//----------------------------------------------------------------
sl@0	1203	// Implementation: Pattern parsing
sl@0	1204	//----------------------------------------------------------------
sl@0	1205
sl@0	1206	void applyPattern(RuleCharacterIterator& chars,
sl@0	1207	const SymbolTable* symbols,
sl@0	1208	UnicodeString& rebuiltPat,
sl@0	1209	uint32_t options,
sl@0	1210	UErrorCode& ec);
sl@0	1211
sl@0	1212	//----------------------------------------------------------------
sl@0	1213	// Implementation: Utility methods
sl@0	1214	//----------------------------------------------------------------
sl@0	1215
sl@0	1216	void ensureCapacity(int32_t newLen);
sl@0	1217
sl@0	1218	void ensureBufferCapacity(int32_t newLen);
sl@0	1219
sl@0	1220	void swapBuffers(void);
sl@0	1221
sl@0	1222	UBool allocateStrings();
sl@0	1223
sl@0	1224	UnicodeString& _toPattern(UnicodeString& result,
sl@0	1225	UBool escapeUnprintable) const;
sl@0	1226
sl@0	1227	UnicodeString& _generatePattern(UnicodeString& result,
sl@0	1228	UBool escapeUnprintable) const;
sl@0	1229
sl@0	1230	static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
sl@0	1231
sl@0	1232	static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
sl@0	1233
sl@0	1234	//----------------------------------------------------------------
sl@0	1235	// Implementation: Fundamental operators
sl@0	1236	//----------------------------------------------------------------
sl@0	1237
sl@0	1238	void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
sl@0	1239
sl@0	1240	void add(const UChar32* other, int32_t otherLen, int8_t polarity);
sl@0	1241
sl@0	1242	void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
sl@0	1243
sl@0	1244	/**
sl@0	1245	* Return true if the given position, in the given pattern, appears
sl@0	1246	* to be the start of a property set pattern [:foo:], \\p{foo}, or
sl@0	1247	* \\P{foo}, or \\N{name}.
sl@0	1248	*/
sl@0	1249	static UBool resemblesPropertyPattern(const UnicodeString& pattern,
sl@0	1250	int32_t pos);
sl@0	1251
sl@0	1252	static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
sl@0	1253	int32_t iterOpts);
sl@0	1254
sl@0	1255	/**
sl@0	1256	* Parse the given property pattern at the given parse position
sl@0	1257	* and set this UnicodeSet to the result.
sl@0	1258	*
sl@0	1259	* The original design document is out of date, but still useful.
sl@0	1260	* Ignore the property and value names:
sl@0	1261	* http://dev.icu-project.org/cgi-bin/viewcvs.cgi/~checkout~/icuhtml/design/unicodeset_properties.html
sl@0	1262	*
sl@0	1263	* Recognized syntax:
sl@0	1264	*
sl@0	1265	* [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
sl@0	1266	* \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P"
sl@0	1267	* \\N{name} - white space not allowed within "\\N"
sl@0	1268	*
sl@0	1269	* Other than the above restrictions, white space is ignored. Case
sl@0	1270	* is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading
sl@0	1271	* and trailing space is deleted, and internal runs of whitespace
sl@0	1272	* are collapsed to a single space.
sl@0	1273	*
sl@0	1274	* We support binary properties, enumerated properties, and the
sl@0	1275	* following non-enumerated properties:
sl@0	1276	*
sl@0	1277	* Numeric_Value
sl@0	1278	* Name
sl@0	1279	* Unicode_1_Name
sl@0	1280	*
sl@0	1281	* @param pattern the pattern string
sl@0	1282	* @param ppos on entry, the position at which to begin parsing.
sl@0	1283	* This should be one of the locations marked '^':
sl@0	1284	*
sl@0	1285	* [:blah:] \\p{blah} \\P{blah} \\N{name}
sl@0	1286	* ^ % ^ % ^ % ^ %
sl@0	1287	*
sl@0	1288	* On return, the position after the last character parsed, that is,
sl@0	1289	* the locations marked '%'. If the parse fails, ppos is returned
sl@0	1290	* unchanged.
sl@0	1291	* @return a reference to this.
sl@0	1292	*/
sl@0	1293	UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
sl@0	1294	ParsePosition& ppos,
sl@0	1295	UErrorCode &ec);
sl@0	1296
sl@0	1297	void applyPropertyPattern(RuleCharacterIterator& chars,
sl@0	1298	UnicodeString& rebuiltPat,
sl@0	1299	UErrorCode& ec);
sl@0	1300
sl@0	1301	/**
sl@0	1302	* Pointer to a filter function that returns TRUE if the given code point
sl@0	1303	* should be included in the UnicodeSet being constructed. The context argument is an untyped pointer (presumably the one handed to applyFilter()).
sl@0	1304	*/
sl@0	1305	typedef UBool (*Filter)(UChar32 codePoint, void* context);
sl@0	1306
sl@0	1307	/**
sl@0	1308	* Given a filter, set this UnicodeSet to the code points
sl@0	1309	* contained by that filter. The filter MUST be
sl@0	1310	* property-conformant. That is, if it returns value v for one
sl@0	1311	* code point, then it must return v for all affiliated code
sl@0	1312	* points, as defined by the inclusions list. See
sl@0	1313	* getInclusions().
sl@0	1314	* src is a UPropertySource value.
sl@0	1315	*/
sl@0	1316	void applyFilter(Filter filter,
sl@0	1317	void* context,
sl@0	1318	int32_t src,
sl@0	1319	UErrorCode &status);
sl@0	1320
sl@0	1321	/**
sl@0	1322	* Return a cached copy of the inclusions list for the property source.
sl@0	1323	*/
sl@0	1324	static const UnicodeSet* getInclusions(int32_t src, UErrorCode &errorCode);
sl@0	1325
sl@0	1326	friend class UnicodeSetIterator;
sl@0	1327	};
sl@0	1328
sl@0	1329	inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
sl@0	1330	return !operator==(o); // inequality is the logical negation of operator==
sl@0	1331	}
sl@0	1332
sl@0	1333	inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
sl@0	1334	return !containsNone(start, end); // true exactly when containsNone() is false for the range start..end
sl@0	1335	}
sl@0	1336
sl@0	1337	inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
sl@0	1338	return !containsNone(s); // true exactly when containsNone() is false for the set s
sl@0	1339	}
sl@0	1340
sl@0	1341	inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
sl@0	1342	return !containsNone(s); // true exactly when containsNone() is false for the string s
sl@0	1343	}
sl@0	1344
sl@0	1345	U_NAMESPACE_END
sl@0	1346
sl@0	1347	#endif

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--