sl@0
|
1 |
/*
|
sl@0
|
2 |
**********************************************************************
|
sl@0
|
3 |
* Copyright (c) 2003-2005, International Business Machines
|
sl@0
|
4 |
* Corporation and others. All Rights Reserved.
|
sl@0
|
5 |
**********************************************************************
|
sl@0
|
6 |
* Author: Alan Liu
|
sl@0
|
7 |
* Created: September 24 2003
|
sl@0
|
8 |
* Since: ICU 2.8
|
sl@0
|
9 |
**********************************************************************
|
sl@0
|
10 |
*/
|
sl@0
|
11 |
#ifndef _RULEITER_H_
|
sl@0
|
12 |
#define _RULEITER_H_
|
sl@0
|
13 |
|
sl@0
|
14 |
#include "unicode/utypes.h"
|
sl@0
|
15 |
|
sl@0
|
16 |
U_NAMESPACE_BEGIN
|
sl@0
|
17 |
|
sl@0
|
18 |
class UnicodeString;
|
sl@0
|
19 |
class ParsePosition;
|
sl@0
|
20 |
class SymbolTable;
|
sl@0
|
21 |
|
sl@0
|
22 |
/**
|
sl@0
|
23 |
* An iterator that returns 32-bit code points. This class is deliberately
|
sl@0
|
24 |
* <em>not</em> related to any of the ICU character iterator classes
|
sl@0
|
25 |
* in order to minimize complexity.
|
sl@0
|
26 |
* @author Alan Liu
|
sl@0
|
27 |
* @since ICU 2.8
|
sl@0
|
28 |
*/
|
sl@0
|
29 |
class U_COMMON_API RuleCharacterIterator {
|
sl@0
|
30 |
|
sl@0
|
31 |
// TODO: Ideas for later. (Do not implement if not needed, lest the
|
sl@0
|
32 |
// code coverage numbers go down due to unused methods.)
|
sl@0
|
33 |
// 1. Add a copy constructor, operator==() method.
|
sl@0
|
34 |
// 2. Rather than return DONE, throw an exception if the end
|
sl@0
|
35 |
// is reached -- this is an alternate usage model, probably not useful.
|
sl@0
|
36 |
|
sl@0
|
37 |
private:
|
sl@0
|
38 |
/**
|
sl@0
|
39 |
* Text being iterated.
|
sl@0
|
40 |
*/
|
sl@0
|
41 |
const UnicodeString& text;
|
sl@0
|
42 |
|
sl@0
|
43 |
/**
|
sl@0
|
44 |
* Position of iterator.
|
sl@0
|
45 |
*/
|
sl@0
|
46 |
ParsePosition& pos;
|
sl@0
|
47 |
|
sl@0
|
48 |
/**
|
sl@0
|
49 |
* Symbol table used to parse and dereference variables. May be 0.
|
sl@0
|
50 |
*/
|
sl@0
|
51 |
const SymbolTable* sym;
|
sl@0
|
52 |
|
sl@0
|
53 |
/**
|
sl@0
|
54 |
* Current variable expansion, or 0 if none.
|
sl@0
|
55 |
*/
|
sl@0
|
56 |
const UnicodeString* buf;
|
sl@0
|
57 |
|
sl@0
|
58 |
/**
|
sl@0
|
59 |
* Position within buf. Meaningless if buf == 0.
|
sl@0
|
60 |
*/
|
sl@0
|
61 |
int32_t bufPos;
|
sl@0
|
62 |
|
sl@0
|
63 |
public:
|
sl@0
|
64 |
/**
|
sl@0
|
65 |
* Value returned when there are no more characters to iterate.
|
sl@0
|
66 |
*/
|
sl@0
|
67 |
enum { DONE = -1 };
|
sl@0
|
68 |
|
sl@0
|
69 |
/**
|
sl@0
|
70 |
* Bitmask option to enable parsing of variable names. If (options &
|
sl@0
|
71 |
* PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
|
sl@0
|
72 |
* its value. Variables are parsed using the SymbolTable API.
|
sl@0
|
73 |
*/
|
sl@0
|
74 |
enum { PARSE_VARIABLES = 1 };
|
sl@0
|
75 |
|
sl@0
|
76 |
/**
|
sl@0
|
77 |
* Bitmask option to enable parsing of escape sequences. If (options &
|
sl@0
|
78 |
* PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
|
sl@0
|
79 |
* to its value. Escapes are parsed using Utility.unescapeAt().
|
sl@0
|
80 |
*/
|
sl@0
|
81 |
enum { PARSE_ESCAPES = 2 };
|
sl@0
|
82 |
|
sl@0
|
83 |
/**
|
sl@0
|
84 |
* Bitmask option to enable skipping of whitespace. If (options &
|
sl@0
|
85 |
* SKIP_WHITESPACE) != 0, then whitespace characters will be silently
|
sl@0
|
86 |
* skipped, as if they were not present in the input. Whitespace
|
sl@0
|
87 |
* characters are defined by UCharacterProperty.isRuleWhiteSpace().
|
sl@0
|
88 |
*/
|
sl@0
|
89 |
enum { SKIP_WHITESPACE = 4 };
|
sl@0
|
90 |
|
sl@0
|
91 |
/**
|
sl@0
|
92 |
* Constructs an iterator over the given text, starting at the given
|
sl@0
|
93 |
* position.
|
sl@0
|
94 |
* @param text the text to be iterated
|
sl@0
|
95 |
* @param sym the symbol table, or null if there is none. If sym is null,
|
sl@0
|
96 |
* then variables will not be deferenced, even if the PARSE_VARIABLES
|
sl@0
|
97 |
* option is set.
|
sl@0
|
98 |
* @param pos upon input, the index of the next character to return. If a
|
sl@0
|
99 |
* variable has been dereferenced, then pos will <em>not</em> increment as
|
sl@0
|
100 |
* characters of the variable value are iterated.
|
sl@0
|
101 |
*/
|
sl@0
|
102 |
RuleCharacterIterator(const UnicodeString& text, const SymbolTable* sym,
|
sl@0
|
103 |
ParsePosition& pos);
|
sl@0
|
104 |
|
sl@0
|
105 |
/**
|
sl@0
|
106 |
* Returns true if this iterator has no more characters to return.
|
sl@0
|
107 |
*/
|
sl@0
|
108 |
UBool atEnd() const;
|
sl@0
|
109 |
|
sl@0
|
110 |
/**
|
sl@0
|
111 |
* Returns the next character using the given options, or DONE if there
|
sl@0
|
112 |
* are no more characters, and advance the position to the next
|
sl@0
|
113 |
* character.
|
sl@0
|
114 |
* @param options one or more of the following options, bitwise-OR-ed
|
sl@0
|
115 |
* together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
|
sl@0
|
116 |
* @param isEscaped output parameter set to TRUE if the character
|
sl@0
|
117 |
* was escaped
|
sl@0
|
118 |
* @param ec input-output error code. An error will only be set by
|
sl@0
|
119 |
* this routing if options includes PARSE_VARIABLES and an unknown
|
sl@0
|
120 |
* variable name is seen, or if options includes PARSE_ESCAPES and
|
sl@0
|
121 |
* an invalid escape sequence is seen.
|
sl@0
|
122 |
* @return the current 32-bit code point, or DONE
|
sl@0
|
123 |
*/
|
sl@0
|
124 |
UChar32 next(int32_t options, UBool& isEscaped, UErrorCode& ec);
|
sl@0
|
125 |
|
sl@0
|
126 |
/**
|
sl@0
|
127 |
* Returns true if this iterator is currently within a variable expansion.
|
sl@0
|
128 |
*/
|
sl@0
|
129 |
inline UBool inVariable() const;
|
sl@0
|
130 |
|
sl@0
|
131 |
/**
|
sl@0
|
132 |
* An opaque object representing the position of a RuleCharacterIterator.
|
sl@0
|
133 |
*/
|
sl@0
|
134 |
struct Pos {
|
sl@0
|
135 |
private:
|
sl@0
|
136 |
const UnicodeString* buf;
|
sl@0
|
137 |
int32_t pos;
|
sl@0
|
138 |
int32_t bufPos;
|
sl@0
|
139 |
friend class RuleCharacterIterator;
|
sl@0
|
140 |
};
|
sl@0
|
141 |
|
sl@0
|
142 |
/**
|
sl@0
|
143 |
* Sets an object which, when later passed to setPos(), will
|
sl@0
|
144 |
* restore this iterator's position. Usage idiom:
|
sl@0
|
145 |
*
|
sl@0
|
146 |
* RuleCharacterIterator iterator = ...;
|
sl@0
|
147 |
* RuleCharacterIterator::Pos pos;
|
sl@0
|
148 |
* iterator.getPos(pos);
|
sl@0
|
149 |
* for (;;) {
|
sl@0
|
150 |
* iterator.getPos(pos);
|
sl@0
|
151 |
* int c = iterator.next(...);
|
sl@0
|
152 |
* ...
|
sl@0
|
153 |
* }
|
sl@0
|
154 |
* iterator.setPos(pos);
|
sl@0
|
155 |
*
|
sl@0
|
156 |
* @param p a position object to be set to this iterator's
|
sl@0
|
157 |
* current position.
|
sl@0
|
158 |
*/
|
sl@0
|
159 |
void getPos(Pos& p) const;
|
sl@0
|
160 |
|
sl@0
|
161 |
/**
|
sl@0
|
162 |
* Restores this iterator to the position it had when getPos()
|
sl@0
|
163 |
* set the given object.
|
sl@0
|
164 |
* @param p a position object previously set by getPos()
|
sl@0
|
165 |
*/
|
sl@0
|
166 |
void setPos(const Pos& p);
|
sl@0
|
167 |
|
sl@0
|
168 |
/**
|
sl@0
|
169 |
* Skips ahead past any ignored characters, as indicated by the given
|
sl@0
|
170 |
* options. This is useful in conjunction with the lookahead() method.
|
sl@0
|
171 |
*
|
sl@0
|
172 |
* Currently, this only has an effect for SKIP_WHITESPACE.
|
sl@0
|
173 |
* @param options one or more of the following options, bitwise-OR-ed
|
sl@0
|
174 |
* together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
|
sl@0
|
175 |
*/
|
sl@0
|
176 |
void skipIgnored(int32_t options);
|
sl@0
|
177 |
|
sl@0
|
178 |
/**
|
sl@0
|
179 |
* Returns a string containing the remainder of the characters to be
|
sl@0
|
180 |
* returned by this iterator, without any option processing. If the
|
sl@0
|
181 |
* iterator is currently within a variable expansion, this will only
|
sl@0
|
182 |
* extend to the end of the variable expansion. This method is provided
|
sl@0
|
183 |
* so that iterators may interoperate with string-based APIs. The typical
|
sl@0
|
184 |
* sequence of calls is to call skipIgnored(), then call lookahead(), then
|
sl@0
|
185 |
* parse the string returned by lookahead(), then call jumpahead() to
|
sl@0
|
186 |
* resynchronize the iterator.
|
sl@0
|
187 |
* @param result a string to receive the characters to be returned
|
sl@0
|
188 |
* by future calls to next()
|
sl@0
|
189 |
* @return a reference to result
|
sl@0
|
190 |
*/
|
sl@0
|
191 |
UnicodeString& lookahead(UnicodeString& result) const;
|
sl@0
|
192 |
|
sl@0
|
193 |
/**
|
sl@0
|
194 |
* Advances the position by the given number of 16-bit code units.
|
sl@0
|
195 |
* This is useful in conjunction with the lookahead() method.
|
sl@0
|
196 |
* @param count the number of 16-bit code units to jump over
|
sl@0
|
197 |
*/
|
sl@0
|
198 |
void jumpahead(int32_t count);
|
sl@0
|
199 |
|
sl@0
|
200 |
/**
|
sl@0
|
201 |
* Returns a string representation of this object, consisting of the
|
sl@0
|
202 |
* characters being iterated, with a '|' marking the current position.
|
sl@0
|
203 |
* Position within an expanded variable is <em>not</em> indicated.
|
sl@0
|
204 |
* @param result output parameter to receive a string
|
sl@0
|
205 |
* representation of this object
|
sl@0
|
206 |
*/
|
sl@0
|
207 |
// UnicodeString& toString(UnicodeString& result) const;
|
sl@0
|
208 |
|
sl@0
|
209 |
private:
|
sl@0
|
210 |
/**
|
sl@0
|
211 |
* Returns the current 32-bit code point without parsing escapes, parsing
|
sl@0
|
212 |
* variables, or skipping whitespace.
|
sl@0
|
213 |
* @return the current 32-bit code point
|
sl@0
|
214 |
*/
|
sl@0
|
215 |
UChar32 _current() const;
|
sl@0
|
216 |
|
sl@0
|
217 |
/**
|
sl@0
|
218 |
* Advances the position by the given amount.
|
sl@0
|
219 |
* @param count the number of 16-bit code units to advance past
|
sl@0
|
220 |
*/
|
sl@0
|
221 |
void _advance(int32_t count);
|
sl@0
|
222 |
};
|
sl@0
|
223 |
|
sl@0
|
224 |
/**
 * Reports whether a variable expansion is in progress, which is the case
 * exactly when the expansion buffer pointer (buf) is non-null.
 */
inline UBool RuleCharacterIterator::inVariable() const {
    const UBool expanding = (0 != buf);
    return expanding;
}
|
sl@0
|
227 |
|
sl@0
|
228 |
U_NAMESPACE_END
|
sl@0
|
229 |
|
sl@0
|
230 |
#endif // _RULEITER_H_
|
sl@0
|
231 |
//eof
|