sl@0
|
1 |
/*
|
sl@0
|
2 |
*******************************************************************************
|
sl@0
|
3 |
* Copyright (C) 1996-2005, International Business Machines Corporation and *
|
sl@0
|
4 |
* others. All Rights Reserved. *
|
sl@0
|
5 |
*******************************************************************************
|
sl@0
|
6 |
*/
|
sl@0
|
7 |
|
sl@0
|
8 |
#ifndef CANITER_H
|
sl@0
|
9 |
#define CANITER_H
|
sl@0
|
10 |
|
sl@0
|
11 |
#include "unicode/utypes.h"
|
sl@0
|
12 |
|
sl@0
|
13 |
#if !UCONFIG_NO_NORMALIZATION
|
sl@0
|
14 |
|
sl@0
|
15 |
#include "unicode/uobject.h"
|
sl@0
|
16 |
#include "unicode/unistr.h"
|
sl@0
|
17 |
|
sl@0
|
18 |
/**
|
sl@0
|
19 |
* \file
|
sl@0
|
20 |
* \brief C++ API: Canonical Iterator
|
sl@0
|
21 |
*/
|
sl@0
|
22 |
|
sl@0
|
23 |
/** Should permutation skip characters with combining class zero?
|
sl@0
|
24 |
 * Should be either TRUE or FALSE. This is a compile-time option.
|
sl@0
|
25 |
* @stable ICU 2.4
|
sl@0
|
26 |
*/
|
sl@0
|
27 |
#ifndef CANITER_SKIP_ZEROES
|
sl@0
|
28 |
#define CANITER_SKIP_ZEROES TRUE
|
sl@0
|
29 |
#endif
|
sl@0
|
30 |
|
sl@0
|
31 |
U_NAMESPACE_BEGIN
|
sl@0
|
32 |
|
sl@0
|
33 |
class Hashtable;
|
sl@0
|
34 |
|
sl@0
|
35 |
/**
|
sl@0
|
36 |
* This class allows one to iterate through all the strings that are canonically equivalent to a given
|
sl@0
|
37 |
* string. For example, here are some sample results:
|
sl@0
|
38 |
Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
|
sl@0
|
39 |
1: \\u0041\\u030A\\u0064\\u0307\\u0327
|
sl@0
|
40 |
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
|
sl@0
|
41 |
2: \\u0041\\u030A\\u0064\\u0327\\u0307
|
sl@0
|
42 |
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
|
sl@0
|
43 |
3: \\u0041\\u030A\\u1E0B\\u0327
|
sl@0
|
44 |
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
|
sl@0
|
45 |
4: \\u0041\\u030A\\u1E11\\u0307
|
sl@0
|
46 |
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
|
sl@0
|
47 |
5: \\u00C5\\u0064\\u0307\\u0327
|
sl@0
|
48 |
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
|
sl@0
|
49 |
6: \\u00C5\\u0064\\u0327\\u0307
|
sl@0
|
50 |
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
|
sl@0
|
51 |
7: \\u00C5\\u1E0B\\u0327
|
sl@0
|
52 |
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
|
sl@0
|
53 |
8: \\u00C5\\u1E11\\u0307
|
sl@0
|
54 |
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
|
sl@0
|
55 |
9: \\u212B\\u0064\\u0307\\u0327
|
sl@0
|
56 |
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
|
sl@0
|
57 |
10: \\u212B\\u0064\\u0327\\u0307
|
sl@0
|
58 |
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
|
sl@0
|
59 |
11: \\u212B\\u1E0B\\u0327
|
sl@0
|
60 |
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
|
sl@0
|
61 |
12: \\u212B\\u1E11\\u0307
|
sl@0
|
62 |
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
|
sl@0
|
63 |
*<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
|
sl@0
|
64 |
* since it has not been optimized for that situation.
|
sl@0
|
65 |
* Note, CanonicalIterator is not intended to be subclassed.
|
sl@0
|
66 |
* @author M. Davis
|
sl@0
|
67 |
* @author C++ port by V. Weinstein
|
sl@0
|
68 |
* @stable ICU 2.4
|
sl@0
|
69 |
*/
|
sl@0
|
70 |
class U_COMMON_API CanonicalIterator : public UObject {
public:
    /**
     * Construct a CanonicalIterator object.
     * @param source    string to get results for
     * @param status    Fill-in parameter which receives the status of this operation.
     * @stable ICU 2.4
     */
    CanonicalIterator(const UnicodeString &source, UErrorCode &status);

    /**
     * Destructor.
     * Cleans pieces.
     * @stable ICU 2.4
     */
    virtual ~CanonicalIterator();

    /**
     * Gets the NFD form of the current source we are iterating over.
     * @return the source. NOTE: it is the NFD form of the source.
     * @stable ICU 2.4
     */
    UnicodeString getSource();

    /**
     * Resets the iterator so that one can start again from the beginning.
     * @stable ICU 2.4
     */
    void reset();

    /**
     * Get the next canonically equivalent string.
     * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
     * @return the next string that is canonically equivalent. A bogus string is returned when
     * the iteration is done.
     * @stable ICU 2.4
     */
    UnicodeString next();

    /**
     * Set a new source for this iterator. Allows object reuse.
     * @param newSource     the source string to iterate against. This allows the same iterator to be used
     *                      while changing the source string, saving object creation.
     * @param status        Fill-in parameter which receives the status of this operation.
     * @stable ICU 2.4
     */
    void setSource(const UnicodeString &newSource, UErrorCode &status);

    /**
     * Dumb recursive implementation of permutation.
     * TODO: optimize
     * @param source     the string to find permutations for
     * @param skipZeros  determine if skip zeros
     * @param result     the results in a set.
     * @param status     Fill-in parameter which receives the status of this operation.
     * @internal
     */
    static void U_EXPORT2 permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status);

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @stable ICU 2.2
     */
    static UClassID U_EXPORT2 getStaticClassID();

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @stable ICU 2.2
     */
    virtual UClassID getDynamicClassID() const;

private:
    // ===================== PRIVATES ==============================
    // private default constructor
    CanonicalIterator();


    /**
     * Copy constructor. Private for now.
     * @internal
     */
    CanonicalIterator(const CanonicalIterator& other);

    /**
     * Assignment operator. Private for now.
     * @internal
     */
    CanonicalIterator& operator=(const CanonicalIterator& other);

    // fields
    UnicodeString source;
    UBool done;

    // 2 dimensional array holds the pieces of the string with
    // their different canonically equivalent representations
    UnicodeString **pieces;
    int32_t pieces_length;
    int32_t *pieces_lengths;

    // current is used in iterating to combine pieces
    int32_t *current;
    int32_t current_length;

    // transient fields
    UnicodeString buffer;

    // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
    // (Java original: private String[] getEquivalents(String segment))
    UnicodeString *getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status);

    // (Java original: Set getEquivalents2(String segment);)
    Hashtable *getEquivalents2(const UChar *segment, int32_t segLen, UErrorCode &status);
    //Hashtable *getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status);

    // (Java original: Set extract(int comp, String segment, int segmentPos, StringBuffer buffer);)
    /**
     * See if the decomposition of cp2 is at segment starting at segmentPos
     * (with canonical rearrangement!)
     * If so, take the remainder, and return the equivalents
     *
     * NOTE(review): "cp2" is not a parameter of this declaration; presumably it
     * refers to the code point passed as comp -- confirm against the implementation.
     */
    Hashtable *extract(UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);
    //Hashtable *extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);

    void cleanPieces();

};
|
sl@0
|
196 |
|
sl@0
|
197 |
U_NAMESPACE_END
|
sl@0
|
198 |
|
sl@0
|
199 |
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
sl@0
|
200 |
|
sl@0
|
201 |
#endif
|