sl@0
|
1 |
/*
|
sl@0
|
2 |
***************************************************************************
|
sl@0
|
3 |
* Copyright (C) 1999-2005 International Business Machines Corporation *
|
sl@0
|
4 |
* and others. All rights reserved. *
|
sl@0
|
5 |
***************************************************************************
|
sl@0
|
6 |
|
sl@0
|
7 |
**********************************************************************
|
sl@0
|
8 |
* Date Name Description
|
sl@0
|
9 |
* 10/22/99 alan Creation.
|
sl@0
|
10 |
* 11/11/99 rgillam Complete port from Java.
|
sl@0
|
11 |
**********************************************************************
|
sl@0
|
12 |
*/
|
sl@0
|
13 |
|
sl@0
|
14 |
#ifndef RBBI_H
|
sl@0
|
15 |
#define RBBI_H
|
sl@0
|
16 |
|
sl@0
|
17 |
#include "unicode/utypes.h"
|
sl@0
|
18 |
|
sl@0
|
19 |
/**
|
sl@0
|
20 |
* \file
|
sl@0
|
21 |
* \brief C++ API: Rule Based Break Iterator
|
sl@0
|
22 |
*/
|
sl@0
|
23 |
|
sl@0
|
24 |
#if !UCONFIG_NO_BREAK_ITERATION
|
sl@0
|
25 |
|
sl@0
|
26 |
#include "unicode/brkiter.h"
|
sl@0
|
27 |
#include "unicode/udata.h"
|
sl@0
|
28 |
#include "unicode/parseerr.h"
|
sl@0
|
29 |
|
sl@0
|
30 |
|
sl@0
|
31 |
struct UTrie;
|
sl@0
|
32 |
|
sl@0
|
33 |
U_NAMESPACE_BEGIN
|
sl@0
|
34 |
|
sl@0
|
35 |
/** @internal */
|
sl@0
|
36 |
struct RBBIDataHeader;
|
sl@0
|
37 |
class RuleBasedBreakIteratorTables;
|
sl@0
|
38 |
class BreakIterator;
|
sl@0
|
39 |
class RBBIDataWrapper;
|
sl@0
|
40 |
struct RBBIStateTable;
|
sl@0
|
41 |
|
sl@0
|
42 |
|
sl@0
|
43 |
|
sl@0
|
44 |
/**
|
sl@0
|
45 |
*
|
sl@0
|
46 |
* A subclass of BreakIterator whose behavior is specified using a list of rules.
|
sl@0
|
47 |
* <p>Instances of this class are most commonly created by the factory methods of
|
sl@0
|
48 |
* BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,
|
sl@0
|
49 |
* and then used via the abstract API in class BreakIterator</p>
|
sl@0
|
50 |
*
|
sl@0
|
51 |
* <p>See the ICU User Guide for information on Break Iterator Rules.</p>
|
sl@0
|
52 |
*
|
sl@0
|
53 |
* <p>This class is not intended to be subclassed. (Class DictionaryBasedBreakIterator
|
sl@0
|
54 |
* is a subclass, but that relationship is effectively internal to the ICU
|
sl@0
|
55 |
* implementation. The subclassing interface to RuleBasedBreakIterator is
|
sl@0
|
56 |
* not part of the ICU API, and may not remain stable.)</p>
|
sl@0
|
57 |
*
|
sl@0
|
58 |
*/
|
sl@0
|
59 |
class U_COMMON_API RuleBasedBreakIterator : public BreakIterator {
|
sl@0
|
60 |
|
sl@0
|
61 |
protected:
|
sl@0
|
62 |
/**
|
sl@0
|
63 |
* The character iterator through which this BreakIterator accesses the text
|
sl@0
|
64 |
* @internal
|
sl@0
|
65 |
*/
|
sl@0
|
66 |
CharacterIterator* fText;
|
sl@0
|
67 |
|
sl@0
|
68 |
/**
|
sl@0
|
69 |
* The rule data for this BreakIterator instance
|
sl@0
|
70 |
* @internal
|
sl@0
|
71 |
*/
|
sl@0
|
72 |
RBBIDataWrapper *fData;
|
sl@0
|
73 |
|
sl@0
|
74 |
/** Index of the Rule {tag} values for the most recent match.
|
sl@0
|
75 |
* @internal
|
sl@0
|
76 |
*/
|
sl@0
|
77 |
int32_t fLastRuleStatusIndex;
|
sl@0
|
78 |
|
sl@0
|
79 |
/**
|
sl@0
|
80 |
* Rule tag value valid flag.
|
sl@0
|
81 |
* Some iterator operations don't intrinsically set the correct tag value.
|
sl@0
|
82 |
* This flag lets us lazily compute the value if we are ever asked for it.
|
sl@0
|
83 |
* @internal
|
sl@0
|
84 |
*/
|
sl@0
|
85 |
UBool fLastStatusIndexValid;
|
sl@0
|
86 |
|
sl@0
|
87 |
/**
|
sl@0
|
88 |
* Counter for the number of characters encountered with the "dictionary"
|
sl@0
|
89 |
* flag set. Normal RBBI iterators don't use it, although the code
|
sl@0
|
90 |
* for updating it is live. Dictionary Based break iterators (a subclass
|
sl@0
|
91 |
* of us) access this field directly.
|
sl@0
|
92 |
* @internal
|
sl@0
|
93 |
*/
|
sl@0
|
94 |
uint32_t fDictionaryCharCount;
|
sl@0
|
95 |
|
sl@0
|
96 |
/**
|
sl@0
|
97 |
* Debugging flag. Trace operation of state machine when true.
|
sl@0
|
98 |
* @internal
|
sl@0
|
99 |
*/
|
sl@0
|
100 |
static UBool fTrace;
|
sl@0
|
101 |
|
sl@0
|
102 |
|
sl@0
|
103 |
protected:
|
sl@0
|
104 |
//=======================================================================
|
sl@0
|
105 |
// constructors
|
sl@0
|
106 |
//=======================================================================
|
sl@0
|
107 |
|
sl@0
|
108 |
/**
|
sl@0
|
109 |
* Constructor from a flattened set of RBBI data in malloced memory.
|
sl@0
|
110 |
* RuleBasedBreakIterators built from a custom set of rules
|
sl@0
|
111 |
* are created via this constructor; the rules are compiled
|
sl@0
|
112 |
* into memory, then the break iterator is constructed here.
|
sl@0
|
113 |
*
|
sl@0
|
114 |
* The break iterator adopts the memory, and will
|
sl@0
|
115 |
* free it when done.
|
sl@0
|
116 |
* @internal
|
sl@0
|
117 |
*/
|
sl@0
|
118 |
RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
|
sl@0
|
119 |
|
sl@0
|
120 |
/** @internal */
|
sl@0
|
121 |
friend class RBBIRuleBuilder;
|
sl@0
|
122 |
/** @internal */
|
sl@0
|
123 |
friend class BreakIterator;
|
sl@0
|
124 |
|
sl@0
|
125 |
|
sl@0
|
126 |
|
sl@0
|
127 |
public:
|
sl@0
|
128 |
|
sl@0
|
129 |
/** Default constructor. Creates an empty shell of an iterator, with no
|
sl@0
|
130 |
* rules or text to iterate over. Object can subsequently be assigned to.
|
sl@0
|
131 |
* @stable ICU 2.2
|
sl@0
|
132 |
*/
|
sl@0
|
133 |
RuleBasedBreakIterator();
|
sl@0
|
134 |
|
sl@0
|
135 |
/**
|
sl@0
|
136 |
* Copy constructor. Will produce a break iterator with the same behavior,
|
sl@0
|
137 |
* and which iterates over the same text, as the one passed in.
|
sl@0
|
138 |
* @param that The RuleBasedBreakIterator passed to be copied
|
sl@0
|
139 |
* @stable ICU 2.0
|
sl@0
|
140 |
*/
|
sl@0
|
141 |
RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
|
sl@0
|
142 |
|
sl@0
|
143 |
/**
|
sl@0
|
144 |
* Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
|
sl@0
|
145 |
* @param rules The break rules to be used.
|
sl@0
|
146 |
* @param parseError In the event of a syntax error in the rules, provides the location
|
sl@0
|
147 |
* within the rules of the problem.
|
sl@0
|
148 |
* @param status Information on any errors encountered.
|
sl@0
|
149 |
* @stable ICU 2.2
|
sl@0
|
150 |
*/
|
sl@0
|
151 |
RuleBasedBreakIterator( const UnicodeString &rules,
|
sl@0
|
152 |
UParseError &parseError,
|
sl@0
|
153 |
UErrorCode &status);
|
sl@0
|
154 |
|
sl@0
|
155 |
|
sl@0
|
156 |
/**
|
sl@0
|
157 |
* This constructor uses the udata interface to create a BreakIterator
|
sl@0
|
158 |
* whose internal tables live in a memory-mapped file. "image" is an
|
sl@0
|
159 |
* ICU UDataMemory handle for the pre-compiled break iterator tables.
|
sl@0
|
160 |
* @param image handle to the memory image for the break iterator data.
|
sl@0
|
161 |
* Ownership of the UDataMemory handle passes to the Break Iterator,
|
sl@0
|
162 |
* which will be responsible for closing it when it is no longer needed.
|
sl@0
|
163 |
* @param status Information on any errors encountered.
|
sl@0
|
164 |
* @see udata_open
|
sl@0
|
165 |
* @see #getBinaryRules
|
sl@0
|
166 |
* @stable ICU 2.8
|
sl@0
|
167 |
*/
|
sl@0
|
168 |
RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);
|
sl@0
|
169 |
|
sl@0
|
170 |
/**
|
sl@0
|
171 |
* Destructor
|
sl@0
|
172 |
* @stable ICU 2.0
|
sl@0
|
173 |
*/
|
sl@0
|
174 |
virtual ~RuleBasedBreakIterator();
|
sl@0
|
175 |
|
sl@0
|
176 |
/**
|
sl@0
|
177 |
* Assignment operator. Sets this iterator to have the same behavior,
|
sl@0
|
178 |
* and iterate over the same text, as the one passed in.
|
sl@0
|
179 |
* @param that The RuleBasedBreakIterator passed in
|
sl@0
|
180 |
* @return the newly created RuleBasedBreakIterator
|
sl@0
|
181 |
* @stable ICU 2.0
|
sl@0
|
182 |
*/
|
sl@0
|
183 |
RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
|
sl@0
|
184 |
|
sl@0
|
185 |
/**
|
sl@0
|
186 |
* Equality operator. Returns TRUE if both BreakIterators are of the
|
sl@0
|
187 |
* same class, have the same behavior, and iterate over the same text.
|
sl@0
|
188 |
* @param that The BreakIterator to be compared for equality
|
sl@0
|
189 |
* @return TRUE if both BreakIterators are of the
|
sl@0
|
190 |
* same class, have the same behavior, and iterate over the same text.
|
sl@0
|
191 |
* @stable ICU 2.0
|
sl@0
|
192 |
*/
|
sl@0
|
193 |
virtual UBool operator==(const BreakIterator& that) const;
|
sl@0
|
194 |
|
sl@0
|
195 |
/**
|
sl@0
|
196 |
* Not-equal operator. If operator== returns TRUE, this returns FALSE,
|
sl@0
|
197 |
* and vice versa.
|
sl@0
|
198 |
* @param that The BreakIterator to be compared for inequality
|
sl@0
|
199 |
* @return TRUE if both BreakIterators are not same.
|
sl@0
|
200 |
* @stable ICU 2.0
|
sl@0
|
201 |
*/
|
sl@0
|
202 |
UBool operator!=(const BreakIterator& that) const;
|
sl@0
|
203 |
|
sl@0
|
204 |
/**
|
sl@0
|
205 |
* Returns a newly-constructed RuleBasedBreakIterator with the same
|
sl@0
|
206 |
* behavior, and iterating over the same text, as this one.
|
sl@0
|
207 |
* Differs from the copy constructor in that it is polymorphic, and
|
sl@0
|
208 |
* will correctly clone (copy) a derived class.
|
sl@0
|
209 |
* clone() is thread safe. Multiple threads may simultaneously
|
sl@0
|
210 |
* clone the same source break iterator.
|
sl@0
|
211 |
* @return a newly-constructed RuleBasedBreakIterator
|
sl@0
|
212 |
* @stable ICU 2.0
|
sl@0
|
213 |
*/
|
sl@0
|
214 |
virtual BreakIterator* clone() const;
|
sl@0
|
215 |
|
sl@0
|
216 |
/**
|
sl@0
|
217 |
* Compute a hash code for this BreakIterator
|
sl@0
|
218 |
* @return A hash code
|
sl@0
|
219 |
* @stable ICU 2.0
|
sl@0
|
220 |
*/
|
sl@0
|
221 |
virtual int32_t hashCode(void) const;
|
sl@0
|
222 |
|
sl@0
|
223 |
/**
|
sl@0
|
224 |
* Returns the description used to create this iterator
|
sl@0
|
225 |
* @return the description used to create this iterator
|
sl@0
|
226 |
* @stable ICU 2.0
|
sl@0
|
227 |
*/
|
sl@0
|
228 |
virtual const UnicodeString& getRules(void) const;
|
sl@0
|
229 |
|
sl@0
|
230 |
//=======================================================================
|
sl@0
|
231 |
// BreakIterator overrides
|
sl@0
|
232 |
//=======================================================================
|
sl@0
|
233 |
|
sl@0
|
234 |
/**
|
sl@0
|
235 |
* Return a CharacterIterator over the text being analyzed. This version
|
sl@0
|
236 |
* of this method returns the actual CharacterIterator we're using internally.
|
sl@0
|
237 |
* Changing the state of this iterator can have undefined consequences. If
|
sl@0
|
238 |
* you need to change it, clone it first.
|
sl@0
|
239 |
* @return An iterator over the text being analyzed.
|
sl@0
|
240 |
* @stable ICU 2.0
|
sl@0
|
241 |
*/
|
sl@0
|
242 |
virtual const CharacterIterator& getText(void) const;
|
sl@0
|
243 |
|
sl@0
|
244 |
|
sl@0
|
245 |
/**
|
sl@0
|
246 |
* Get a UText for the text being analyzed.
|
sl@0
|
247 |
* The returned UText is a shallow clone of the UText used internally
|
sl@0
|
248 |
* by the break iterator implementation. It can safely be used to
|
sl@0
|
249 |
* access the text without impacting any break iterator operations,
|
sl@0
|
250 |
* but the underlying text itself must not be altered.
|
sl@0
|
251 |
*
|
sl@0
|
252 |
* @param fillIn A UText to be filled in. If NULL, a new UText will be
|
sl@0
|
253 |
* allocated to hold the result.
|
sl@0
|
254 |
* @param status receives any error codes.
|
sl@0
|
255 |
* @return The current UText for this break iterator. If an input
|
sl@0
|
256 |
* UText was provided, it will always be returned.
|
sl@0
|
257 |
* @draft ICU 3.4
|
sl@0
|
258 |
*/
|
sl@0
|
259 |
virtual UText *getUText(UText *fillIn, UErrorCode &status) const;
|
sl@0
|
260 |
|
sl@0
|
261 |
/**
|
sl@0
|
262 |
* Set the iterator to analyze a new piece of text. This function resets
|
sl@0
|
263 |
* the current iteration position to the beginning of the text.
|
sl@0
|
264 |
* @param newText An iterator over the text to analyze. The BreakIterator
|
sl@0
|
265 |
* takes ownership of the character iterator. The caller MUST NOT delete it!
|
sl@0
|
266 |
* @stable ICU 2.0
|
sl@0
|
267 |
*/
|
sl@0
|
268 |
virtual void adoptText(CharacterIterator* newText);
|
sl@0
|
269 |
|
sl@0
|
270 |
/**
|
sl@0
|
271 |
* Set the iterator to analyze a new piece of text. This function resets
|
sl@0
|
272 |
* the current iteration position to the beginning of the text.
|
sl@0
|
273 |
* @param newText The text to analyze.
|
sl@0
|
274 |
* @stable ICU 2.0
|
sl@0
|
275 |
*/
|
sl@0
|
276 |
virtual void setText(const UnicodeString& newText);
|
sl@0
|
277 |
|
sl@0
|
278 |
/**
|
sl@0
|
279 |
* Reset the break iterator to operate over the text represented by
|
sl@0
|
280 |
* the UText. The iterator position is reset to the start.
|
sl@0
|
281 |
*
|
sl@0
|
282 |
* This function makes a shallow clone of the supplied UText. This means
|
sl@0
|
283 |
* that the caller is free to immediately close or otherwise reuse the
|
sl@0
|
284 |
* Utext that was passed as a parameter, but that the underlying text itself
|
sl@0
|
285 |
* must not be altered while being referenced by the break iterator.
|
sl@0
|
286 |
*
|
sl@0
|
287 |
* @param text The UText used to change the text.
|
sl@0
|
288 |
* @param status Receives any error codes.
|
sl@0
|
289 |
* @draft ICU 3.4
|
sl@0
|
290 |
*/
|
sl@0
|
291 |
virtual void setText(UText *text, UErrorCode &status);
|
sl@0
|
292 |
|
sl@0
|
293 |
/**
|
sl@0
|
294 |
* Sets the current iteration position to the beginning of the text.
|
sl@0
|
295 |
* (i.e., the CharacterIterator's starting offset).
|
sl@0
|
296 |
* @return The offset of the beginning of the text.
|
sl@0
|
297 |
* @stable ICU 2.0
|
sl@0
|
298 |
*/
|
sl@0
|
299 |
virtual int32_t first(void);
|
sl@0
|
300 |
|
sl@0
|
301 |
/**
|
sl@0
|
302 |
* Sets the current iteration position to the end of the text.
|
sl@0
|
303 |
* (i.e., the CharacterIterator's ending offset).
|
sl@0
|
304 |
* @return The text's past-the-end offset.
|
sl@0
|
305 |
* @stable ICU 2.0
|
sl@0
|
306 |
*/
|
sl@0
|
307 |
virtual int32_t last(void);
|
sl@0
|
308 |
|
sl@0
|
309 |
/**
|
sl@0
|
310 |
* Advances the iterator either forward or backward the specified number of steps.
|
sl@0
|
311 |
* Negative values move backward, and positive values move forward. This is
|
sl@0
|
312 |
* equivalent to repeatedly calling next() or previous().
|
sl@0
|
313 |
* @param n The number of steps to move. The sign indicates the direction
|
sl@0
|
314 |
* (negative is backwards, and positive is forwards).
|
sl@0
|
315 |
* @return The character offset of the boundary position n boundaries away from
|
sl@0
|
316 |
* the current one.
|
sl@0
|
317 |
* @stable ICU 2.0
|
sl@0
|
318 |
*/
|
sl@0
|
319 |
virtual int32_t next(int32_t n);
|
sl@0
|
320 |
|
sl@0
|
321 |
/**
|
sl@0
|
322 |
* Advances the iterator to the next boundary position.
|
sl@0
|
323 |
* @return The position of the first boundary after this one.
|
sl@0
|
324 |
* @stable ICU 2.0
|
sl@0
|
325 |
*/
|
sl@0
|
326 |
virtual int32_t next(void);
|
sl@0
|
327 |
|
sl@0
|
328 |
/**
|
sl@0
|
329 |
* Moves the iterator backwards, to the last boundary preceding this one.
|
sl@0
|
330 |
* @return The position of the last boundary position preceding this one.
|
sl@0
|
331 |
* @stable ICU 2.0
|
sl@0
|
332 |
*/
|
sl@0
|
333 |
virtual int32_t previous(void);
|
sl@0
|
334 |
|
sl@0
|
335 |
/**
|
sl@0
|
336 |
* Sets the iterator to refer to the first boundary position following
|
sl@0
|
337 |
* the specified position.
|
sl@0
|
338 |
* @param offset The position from which to begin searching for a break position.
|
sl@0
|
339 |
* @return The position of the first break after the specified offset.
|
sl@0
|
340 |
* @stable ICU 2.0
|
sl@0
|
341 |
*/
|
sl@0
|
342 |
virtual int32_t following(int32_t offset);
|
sl@0
|
343 |
|
sl@0
|
344 |
/**
|
sl@0
|
345 |
* Sets the iterator to refer to the last boundary position before the
|
sl@0
|
346 |
* specified position.
|
sl@0
|
347 |
* @param offset The position to begin searching for a break from.
|
sl@0
|
348 |
* @return The position of the last boundary before the starting position.
|
sl@0
|
349 |
* @stable ICU 2.0
|
sl@0
|
350 |
*/
|
sl@0
|
351 |
virtual int32_t preceding(int32_t offset);
|
sl@0
|
352 |
|
sl@0
|
353 |
/**
|
sl@0
|
354 |
* Returns true if the specified position is a boundary position. As a side
|
sl@0
|
355 |
* effect, leaves the iterator pointing to the first boundary position at
|
sl@0
|
356 |
* or after "offset".
|
sl@0
|
357 |
* @param offset the offset to check.
|
sl@0
|
358 |
* @return True if "offset" is a boundary position.
|
sl@0
|
359 |
* @stable ICU 2.0
|
sl@0
|
360 |
*/
|
sl@0
|
361 |
virtual UBool isBoundary(int32_t offset);
|
sl@0
|
362 |
|
sl@0
|
363 |
/**
|
sl@0
|
364 |
* Returns the current iteration position.
|
sl@0
|
365 |
* @return The current iteration position.
|
sl@0
|
366 |
* @stable ICU 2.0
|
sl@0
|
367 |
*/
|
sl@0
|
368 |
virtual int32_t current(void) const;
|
sl@0
|
369 |
|
sl@0
|
370 |
|
sl@0
|
371 |
/**
|
sl@0
|
372 |
* Return the status tag from the break rule that determined the most recently
|
sl@0
|
373 |
* returned break position. For break rules that do not specify a
|
sl@0
|
374 |
* status, a default value of 0 is returned. If more than one break rule
|
sl@0
|
375 |
* would cause a boundary to be located at some position in the text,
|
sl@0
|
376 |
* the numerically largest of the applicable status values is returned.
|
sl@0
|
377 |
* <p>
|
sl@0
|
378 |
* Of the standard types of ICU break iterators, only word break and
|
sl@0
|
379 |
* line break provide status values. The values are defined in
|
sl@0
|
380 |
* the header file ubrk.h. For Word breaks, the status allows distinguishing between words
|
sl@0
|
381 |
* that contain alphabetic letters, "words" that appear to be numbers,
|
sl@0
|
382 |
* punctuation and spaces, words containing ideographic characters, and
|
sl@0
|
383 |
* more. For Line Break, the status distinguishes between hard (mandatory) breaks
|
sl@0
|
384 |
* and soft (potential) break positions.
|
sl@0
|
385 |
* <p>
|
sl@0
|
386 |
* <code>getRuleStatus()</code> can be called after obtaining a boundary
|
sl@0
|
387 |
* position from <code>next()</code>, <code>previous()</code>, or
|
sl@0
|
388 |
* any other break iterator function that returns a boundary position.
|
sl@0
|
389 |
* <p>
|
sl@0
|
390 |
* When creating custom break rules, one is free to define whatever
|
sl@0
|
391 |
* status values may be convenient for the application.
|
sl@0
|
392 |
* <p>
|
sl@0
|
393 |
* Note: this function is not thread safe. It should not have been
|
sl@0
|
394 |
* declared const, and the const remains only for compatibility
|
sl@0
|
395 |
* reasons. (The function is logically const, but not bit-wise const).
|
sl@0
|
396 |
* <p>
|
sl@0
|
397 |
* @return the status from the break rule that determined the most recently
|
sl@0
|
398 |
* returned break position.
|
sl@0
|
399 |
*
|
sl@0
|
400 |
* @see UWordBreak
|
sl@0
|
401 |
* @stable ICU 2.2
|
sl@0
|
402 |
*/
|
sl@0
|
403 |
virtual int32_t getRuleStatus() const;
|
sl@0
|
404 |
|
sl@0
|
405 |
/**
|
sl@0
|
406 |
* Get the status (tag) values from the break rule(s) that determined the most
|
sl@0
|
407 |
* recently returned break position.
|
sl@0
|
408 |
* <p>
|
sl@0
|
409 |
* The returned status value(s) are stored into an array provided by the caller.
|
sl@0
|
410 |
* The values are stored in sorted (ascending) order.
|
sl@0
|
411 |
* If the capacity of the output array is insufficient to hold the data,
|
sl@0
|
412 |
* the output will be truncated to the available length, and a
|
sl@0
|
413 |
* U_BUFFER_OVERFLOW_ERROR will be signaled.
|
sl@0
|
414 |
*
|
sl@0
|
415 |
* @param fillInVec an array to be filled in with the status values.
|
sl@0
|
416 |
* @param capacity the length of the supplied vector. A length of zero causes
|
sl@0
|
417 |
* the function to return the number of status values, in the
|
sl@0
|
418 |
* normal way, without attempting to store any values.
|
sl@0
|
419 |
* @param status receives error codes.
|
sl@0
|
420 |
* @return The number of rule status values from rules that determined
|
sl@0
|
421 |
* the most recent boundary returned by the break iterator.
|
sl@0
|
422 |
* In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
|
sl@0
|
423 |
* is the total number of status values that were available,
|
sl@0
|
424 |
* not the reduced number that were actually returned.
|
sl@0
|
425 |
* @see getRuleStatus
|
sl@0
|
426 |
* @draft ICU 3.0
|
sl@0
|
427 |
*/
|
sl@0
|
428 |
virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
|
sl@0
|
429 |
|
sl@0
|
430 |
/**
|
sl@0
|
431 |
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
|
sl@0
|
432 |
* This method is to implement a simple version of RTTI, since not all
|
sl@0
|
433 |
* C++ compilers support genuine RTTI. Polymorphic operator==() and
|
sl@0
|
434 |
* clone() methods call this method.
|
sl@0
|
435 |
*
|
sl@0
|
436 |
* @return The class ID for this object. All objects of a
|
sl@0
|
437 |
* given class have the same class ID. Objects of
|
sl@0
|
438 |
* other classes have different class IDs.
|
sl@0
|
439 |
* @stable ICU 2.0
|
sl@0
|
440 |
*/
|
sl@0
|
441 |
virtual UClassID getDynamicClassID(void) const;
|
sl@0
|
442 |
|
sl@0
|
443 |
/**
|
sl@0
|
444 |
* Returns the class ID for this class. This is useful only for
|
sl@0
|
445 |
* comparing to a return value from getDynamicClassID(). For example:
|
sl@0
|
446 |
*
|
sl@0
|
447 |
* Base* polymorphic_pointer = createPolymorphicObject();
|
sl@0
|
448 |
* if (polymorphic_pointer->getDynamicClassID() ==
|
sl@0
|
449 |
* Derived::getStaticClassID()) ...
|
sl@0
|
450 |
*
|
sl@0
|
451 |
* @return The class ID for all objects of this class.
|
sl@0
|
452 |
* @stable ICU 2.0
|
sl@0
|
453 |
*/
|
sl@0
|
454 |
static UClassID U_EXPORT2 getStaticClassID(void);
|
sl@0
|
455 |
|
sl@0
|
456 |
/**
|
sl@0
|
457 |
* Create a clone (copy) of this break iterator in memory provided
|
sl@0
|
458 |
* by the caller. The idea is to increase performance by avoiding
|
sl@0
|
459 |
* a storage allocation. Use of this function is NOT RECOMMENDED.
|
sl@0
|
460 |
* Performance gains are minimal, and correct buffer management is
|
sl@0
|
461 |
* tricky. Use clone() instead.
|
sl@0
|
462 |
*
|
sl@0
|
463 |
* @param stackBuffer The pointer to the memory into which the cloned object
|
sl@0
|
464 |
* should be placed. If NULL, allocate heap memory
|
sl@0
|
465 |
* for the cloned object.
|
sl@0
|
466 |
* @param BufferSize The size of the buffer. If zero, return the required
|
sl@0
|
467 |
* buffer size, but do not clone the object. If the
|
sl@0
|
468 |
* size was too small (but not zero), allocate heap
|
sl@0
|
469 |
* storage for the cloned object.
|
sl@0
|
470 |
*
|
sl@0
|
471 |
* @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
|
sl@0
|
472 |
* returned if the provided buffer was too small, and
|
sl@0
|
473 |
* the clone was therefore put on the heap.
|
sl@0
|
474 |
*
|
sl@0
|
475 |
* @return Pointer to the clone object. This may differ from the stackBuffer
|
sl@0
|
476 |
* address if the byte alignment of the stack buffer was not suitable
|
sl@0
|
477 |
* or if the stackBuffer was too small to hold the clone.
|
sl@0
|
478 |
* @stable ICU 2.0
|
sl@0
|
479 |
*/
|
sl@0
|
480 |
virtual BreakIterator * createBufferClone(void *stackBuffer,
|
sl@0
|
481 |
int32_t &BufferSize,
|
sl@0
|
482 |
UErrorCode &status);
|
sl@0
|
483 |
|
sl@0
|
484 |
|
sl@0
|
485 |
/**
|
sl@0
|
486 |
* Return the binary form of compiled break rules,
|
sl@0
|
487 |
* which can then be used to create a new break iterator at some
|
sl@0
|
488 |
* time in the future. Creating a break iterator from pre-compiled rules
|
sl@0
|
489 |
* is much faster than building one from the source form of the
|
sl@0
|
490 |
* break rules.
|
sl@0
|
491 |
*
|
sl@0
|
492 |
* The binary data can only be used with the same version of ICU
|
sl@0
|
493 |
* and on the same platform type (processor endian-ness)
|
sl@0
|
494 |
*
|
sl@0
|
495 |
* @param length Returns the length of the binary data. (Out parameter.)
|
sl@0
|
496 |
*
|
sl@0
|
497 |
* @return A pointer to the binary (compiled) rule data. The storage
|
sl@0
|
498 |
* belongs to the RuleBasedBreakIterator object, not the
|
sl@0
|
499 |
* caller, and must not be modified or deleted.
|
sl@0
|
500 |
* @internal
|
sl@0
|
501 |
*/
|
sl@0
|
502 |
virtual const uint8_t *getBinaryRules(uint32_t &length);
|
sl@0
|
503 |
|
sl@0
|
504 |
|
sl@0
|
505 |
protected:
|
sl@0
|
506 |
//=======================================================================
|
sl@0
|
507 |
// implementation
|
sl@0
|
508 |
//=======================================================================
|
sl@0
|
509 |
/**
|
sl@0
|
510 |
* This method is the actual implementation of the next() method. All iteration
|
sl@0
|
511 |
* vectors through here. This method initializes the state machine to state 1
|
sl@0
|
512 |
* and advances through the text character by character until we reach the end
|
sl@0
|
513 |
* of the text or the state machine transitions to state 0. We update our return
|
sl@0
|
514 |
* value every time the state machine passes through a possible end state.
|
sl@0
|
515 |
* @internal
|
sl@0
|
516 |
*/
|
sl@0
|
517 |
virtual int32_t handleNext(void);
|
sl@0
|
518 |
|
sl@0
|
519 |
/**
|
sl@0
|
520 |
* This method backs the iterator back up to a "safe position" in the text.
|
sl@0
|
521 |
* This is a position that we know, without any context, must be a break position.
|
sl@0
|
522 |
* The various calling methods then iterate forward from this safe position to
|
sl@0
|
523 |
* the appropriate position to return. (For more information, see the description
|
sl@0
|
524 |
* of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
|
sl@0
|
525 |
* @internal
|
sl@0
|
526 |
*/
|
sl@0
|
527 |
virtual int32_t handlePrevious(void);
|
sl@0
|
528 |
|
sl@0
|
529 |
/**
|
sl@0
|
530 |
* Dumps caches and performs other actions associated with a complete change
|
sl@0
|
531 |
* in text or iteration position. This function is a no-op in RuleBasedBreakIterator,
|
sl@0
|
532 |
* but subclasses can and do override it.
|
sl@0
|
533 |
* @internal
|
sl@0
|
534 |
*/
|
sl@0
|
535 |
virtual void reset(void);
|
sl@0
|
536 |
|
sl@0
|
537 |
/**
|
sl@0
|
538 |
* Return true if the category lookup for this char
|
sl@0
|
539 |
* indicates that it is in the set of dictionary lookup chars.
|
sl@0
|
540 |
* This function is intended for use by dictionary based break iterators.
|
sl@0
|
541 |
* @return true if the category lookup for this char
|
sl@0
|
542 |
* indicates that it is in the set of dictionary lookup chars.
|
sl@0
|
543 |
* @internal
|
sl@0
|
544 |
*/
|
sl@0
|
545 |
virtual UBool isDictionaryChar(UChar32);
|
sl@0
|
546 |
|
sl@0
|
547 |
/**
|
sl@0
|
548 |
* Common initialization function, used by constructors and bufferClone.
|
sl@0
|
549 |
* (Also used by DictionaryBasedBreakIterator::createBufferClone().)
|
sl@0
|
550 |
* @internal
|
sl@0
|
551 |
*/
|
sl@0
|
552 |
void init();
|
sl@0
|
553 |
|
sl@0
|
554 |
private:
|
sl@0
|
555 |
|
sl@0
|
556 |
/**
|
sl@0
|
557 |
* This method backs the iterator back up to a "safe position" in the text.
|
sl@0
|
558 |
* This is a position that we know, without any context, must be a break position.
|
sl@0
|
559 |
* The various calling methods then iterate forward from this safe position to
|
sl@0
|
560 |
* the appropriate position to return. (For more information, see the description
|
sl@0
|
561 |
* of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
|
sl@0
|
562 |
* @param statetable state table used for moving backwards
|
sl@0
|
563 |
* @internal
|
sl@0
|
564 |
*/
|
sl@0
|
565 |
int32_t handlePrevious(const RBBIStateTable *statetable);
|
sl@0
|
566 |
|
sl@0
|
567 |
/**
|
sl@0
|
568 |
* This method is the actual implementation of the next() method. All iteration
|
sl@0
|
569 |
* vectors through here. This method initializes the state machine to state 1
|
sl@0
|
570 |
* and advances through the text character by character until we reach the end
|
sl@0
|
571 |
* of the text or the state machine transitions to state 0. We update our return
|
sl@0
|
572 |
* value every time the state machine passes through a possible end state.
|
sl@0
|
573 |
* @param statetable state table used for moving forwards
|
sl@0
|
574 |
* @internal
|
sl@0
|
575 |
*/
|
sl@0
|
576 |
int32_t handleNext(const RBBIStateTable *statetable);
|
sl@0
|
577 |
|
sl@0
|
578 |
/**
|
sl@0
|
579 |
* @internal
|
sl@0
|
580 |
*/
|
sl@0
|
581 |
void makeRuleStatusValid();
|
sl@0
|
582 |
|
sl@0
|
583 |
};
|
sl@0
|
584 |
|
sl@0
|
585 |
//------------------------------------------------------------------------------
|
sl@0
|
586 |
//
|
sl@0
|
587 |
// Inline Functions Definitions ...
|
sl@0
|
588 |
//
|
sl@0
|
589 |
//------------------------------------------------------------------------------
|
sl@0
|
590 |
|
sl@0
|
591 |
/**
 * Inequality test, defined purely as the logical negation of operator==.
 * @param that The BreakIterator to compare against.
 * @return The negation of whatever operator==(that) reports.
 */
inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
    // Delegate to the (virtual) equality operator, then invert its answer.
    const UBool isSame = this->operator==(that);
    return !isSame;
}
|
sl@0
|
594 |
|
sl@0
|
595 |
U_NAMESPACE_END
|
sl@0
|
596 |
|
sl@0
|
597 |
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
sl@0
|
598 |
|
sl@0
|
599 |
#endif
|