sl@0
|
1 |
/*
|
sl@0
|
2 |
* Copyright (C) 1996-2005, International Business Machines Corporation and others. All Rights Reserved.
|
sl@0
|
3 |
*****************************************************************************************
|
sl@0
|
4 |
*/
|
sl@0
|
5 |
|
sl@0
|
6 |
#ifndef UBRK_H
|
sl@0
|
7 |
#define UBRK_H
|
sl@0
|
8 |
|
sl@0
|
9 |
#include "unicode/utypes.h"
|
sl@0
|
10 |
#include "unicode/uloc.h"
|
sl@0
|
11 |
#include "unicode/utext.h"
|
sl@0
|
12 |
|
sl@0
|
13 |
/**
|
sl@0
|
14 |
* A text-break iterator.
|
sl@0
|
15 |
* For usage in C programs.
|
sl@0
|
16 |
*/
|
sl@0
|
17 |
#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
|
sl@0
|
18 |
# define UBRK_TYPEDEF_UBREAK_ITERATOR
|
sl@0
|
19 |
/**
|
sl@0
|
20 |
* Opaque type representing an ICU Break iterator object.
|
sl@0
|
21 |
* @stable ICU 2.0
|
sl@0
|
22 |
*/
|
sl@0
|
23 |
typedef void UBreakIterator;
|
sl@0
|
24 |
#endif
|
sl@0
|
25 |
|
sl@0
|
26 |
#if !UCONFIG_NO_BREAK_ITERATION
|
sl@0
|
27 |
|
sl@0
|
28 |
#include "unicode/parseerr.h"
|
sl@0
|
29 |
|
sl@0
|
30 |
/**
|
sl@0
|
31 |
* \file
|
sl@0
|
32 |
* \brief C API: BreakIterator
|
sl@0
|
33 |
*
|
sl@0
|
34 |
* <h2> BreakIterator C API </h2>
|
sl@0
|
35 |
*
|
sl@0
|
36 |
* The BreakIterator C API defines methods for finding the location
|
sl@0
|
37 |
* of boundaries in text. Instances of UBreakIterator maintain a
|
sl@0
|
38 |
* current position and scan over text returning the index of characters
|
sl@0
|
39 |
* where boundaries occur.
|
sl@0
|
40 |
* <P>
|
sl@0
|
41 |
* Line boundary analysis determines where a text string can be broken
|
sl@0
|
42 |
* when line-wrapping. The mechanism correctly handles punctuation and
|
sl@0
|
43 |
* hyphenated words.
|
sl@0
|
44 |
* <P>
|
sl@0
|
45 |
* Sentence boundary analysis allows selection with correct
|
sl@0
|
46 |
* interpretation of periods within numbers and abbreviations, and
|
sl@0
|
47 |
* trailing punctuation marks such as quotation marks and parentheses.
|
sl@0
|
48 |
* <P>
|
sl@0
|
49 |
* Word boundary analysis is used by search and replace functions, as
|
sl@0
|
50 |
* well as within text editing applications that allow the user to
|
sl@0
|
51 |
* select words with a double click. Word selection provides correct
|
sl@0
|
52 |
* interpretation of punctuation marks within and following
|
sl@0
|
53 |
* words. Characters that are not part of a word, such as symbols or
|
sl@0
|
54 |
* punctuation marks, have word-breaks on both sides.
|
sl@0
|
55 |
* <P>
|
sl@0
|
56 |
* Character boundary analysis allows users to interact with
|
sl@0
|
57 |
* characters as they expect to, for example, when moving the cursor
|
sl@0
|
58 |
* through a text string. Character boundary analysis provides correct
|
sl@0
|
59 |
* navigation through character strings, regardless of how the
|
sl@0
|
60 |
* character is stored. For example, an accented character might be
|
sl@0
|
61 |
* stored as a base character and a diacritical mark. What users
|
sl@0
|
62 |
* consider to be a character can differ between languages.
|
sl@0
|
63 |
* <P>
|
sl@0
|
64 |
* Title boundary analysis locates all positions,
|
sl@0
|
65 |
* typically starts of words, that should be set to Title Case
|
sl@0
|
66 |
* when title casing the text.
|
sl@0
|
67 |
* <P>
|
sl@0
|
68 |
*
|
sl@0
|
69 |
* This is the interface for all text boundaries.
|
sl@0
|
70 |
* <P>
|
sl@0
|
71 |
* Examples:
|
sl@0
|
72 |
* <P>
|
sl@0
|
73 |
* Helper function to output text
|
sl@0
|
74 |
* <pre>
|
sl@0
|
75 |
* \code
|
sl@0
|
76 |
* void printTextRange(UChar* str, int32_t start, int32_t end ) {
|
sl@0
|
77 |
* UChar* result;
|
sl@0
|
78 |
* UChar* temp;
|
sl@0
|
79 |
* char* res;
|
sl@0
|
80 |
* temp=(UChar*)malloc(sizeof(UChar) * ((u_strlen(str)-start)+1));
|
sl@0
|
81 |
* result=(UChar*)malloc(sizeof(UChar) * ((end-start)+1));
|
sl@0
|
82 |
* u_strcpy(temp, &str[start]);
|
sl@0
|
83 |
* u_strncpy(result, temp, end-start);
|
sl@0
|
84 |
* res=(char*)malloc(sizeof(char) * (u_strlen(result)+1));
|
sl@0
|
85 |
* u_austrcpy(res, result);
|
sl@0
|
86 |
* printf("%s\n", res);
|
sl@0
|
87 |
* }
|
sl@0
|
88 |
* \endcode
|
sl@0
|
89 |
* </pre>
|
sl@0
|
90 |
* Print each element in order:
|
sl@0
|
91 |
* <pre>
|
sl@0
|
92 |
* \code
|
sl@0
|
93 |
* void printEachForward( UBreakIterator* boundary, UChar* str) {
|
sl@0
|
94 |
* int32_t end;
|
sl@0
|
95 |
* int32_t start = ubrk_first(boundary);
|
sl@0
|
96 |
* for (end = ubrk_next(boundary); end != UBRK_DONE; start = end, end = ubrk_next(boundary)) {
|
sl@0
|
97 |
* printTextRange(str, start, end );
|
sl@0
|
98 |
* }
|
sl@0
|
99 |
* }
|
sl@0
|
100 |
* \endcode
|
sl@0
|
101 |
* </pre>
|
sl@0
|
102 |
* Print each element in reverse order:
|
sl@0
|
103 |
* <pre>
|
sl@0
|
104 |
* \code
|
sl@0
|
105 |
* void printEachBackward( UBreakIterator* boundary, UChar* str) {
|
sl@0
|
106 |
* int32_t start;
|
sl@0
|
107 |
* int32_t end = ubrk_last(boundary);
|
sl@0
|
108 |
* for (start = ubrk_previous(boundary); start != UBRK_DONE; end = start, start =ubrk_previous(boundary)) {
|
sl@0
|
109 |
* printTextRange( str, start, end );
|
sl@0
|
110 |
* }
|
sl@0
|
111 |
* }
|
sl@0
|
112 |
* \endcode
|
sl@0
|
113 |
* </pre>
|
sl@0
|
114 |
* Print first element
|
sl@0
|
115 |
* <pre>
|
sl@0
|
116 |
* \code
|
sl@0
|
117 |
* void printFirst(UBreakIterator* boundary, UChar* str) {
|
sl@0
|
118 |
* int32_t end;
|
sl@0
|
119 |
* int32_t start = ubrk_first(boundary);
|
sl@0
|
120 |
* end = ubrk_next(boundary);
|
sl@0
|
121 |
* printTextRange( str, start, end );
|
sl@0
|
122 |
* }
|
sl@0
|
123 |
* \endcode
|
sl@0
|
124 |
* </pre>
|
sl@0
|
125 |
* Print last element
|
sl@0
|
126 |
* <pre>
|
sl@0
|
127 |
* \code
|
sl@0
|
128 |
* void printLast(UBreakIterator* boundary, UChar* str) {
|
sl@0
|
129 |
* int32_t start;
|
sl@0
|
130 |
* int32_t end = ubrk_last(boundary);
|
sl@0
|
131 |
* start = ubrk_previous(boundary);
|
sl@0
|
132 |
* printTextRange(str, start, end );
|
sl@0
|
133 |
* }
|
sl@0
|
134 |
* \endcode
|
sl@0
|
135 |
* </pre>
|
sl@0
|
136 |
* Print the element at a specified position
|
sl@0
|
137 |
* <pre>
|
sl@0
|
138 |
* \code
|
sl@0
|
139 |
* void printAt(UBreakIterator* boundary, int32_t pos , UChar* str) {
|
sl@0
|
140 |
* int32_t start;
|
sl@0
|
141 |
* int32_t end = ubrk_following(boundary, pos);
|
sl@0
|
142 |
* start = ubrk_previous(boundary);
|
sl@0
|
143 |
* printTextRange(str, start, end );
|
sl@0
|
144 |
* }
|
sl@0
|
145 |
* \endcode
|
sl@0
|
146 |
* </pre>
|
sl@0
|
147 |
* Creating and using text boundaries
|
sl@0
|
148 |
* <pre>
|
sl@0
|
149 |
* \code
|
sl@0
|
150 |
* void BreakIterator_Example( void ) {
|
sl@0
|
151 |
* UBreakIterator* boundary;
* UErrorCode status = U_ZERO_ERROR;
|
sl@0
|
152 |
* UChar *stringToExamine;
|
sl@0
|
153 |
* stringToExamine=(UChar*)malloc(sizeof(UChar) * (strlen("Aaa bbb ccc. Ddd eee fff.")+1) );
|
sl@0
|
154 |
* u_uastrcpy(stringToExamine, "Aaa bbb ccc. Ddd eee fff.");
|
sl@0
|
155 |
* printf("Examining: Aaa bbb ccc. Ddd eee fff.\n");
|
sl@0
|
156 |
*
|
sl@0
|
157 |
* //print each sentence in forward and reverse order
|
sl@0
|
158 |
* boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
|
sl@0
|
159 |
* printf("----- forward: -----------\n");
|
sl@0
|
160 |
* printEachForward(boundary, stringToExamine);
|
sl@0
|
161 |
* printf("----- backward: ----------\n");
|
sl@0
|
162 |
* printEachBackward(boundary, stringToExamine);
|
sl@0
|
163 |
* ubrk_close(boundary);
|
sl@0
|
164 |
*
|
sl@0
|
165 |
* //print each word in order
|
sl@0
|
166 |
* boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
|
sl@0
|
167 |
* printf("----- forward: -----------\n");
|
sl@0
|
168 |
* printEachForward(boundary, stringToExamine);
|
sl@0
|
169 |
* printf("----- backward: ----------\n");
|
sl@0
|
170 |
* printEachBackward(boundary, stringToExamine);
|
sl@0
|
171 |
* //print first element
|
sl@0
|
172 |
* printf("----- first: -------------\n");
|
sl@0
|
173 |
* printFirst(boundary, stringToExamine);
|
sl@0
|
174 |
* //print last element
|
sl@0
|
175 |
* printf("----- last: --------------\n");
|
sl@0
|
176 |
* printLast(boundary, stringToExamine);
|
sl@0
|
177 |
* //print word at charpos 10
|
sl@0
|
178 |
* printf("----- at pos 10: ---------\n");
|
sl@0
|
179 |
* printAt(boundary, 10 , stringToExamine);
|
sl@0
|
180 |
*
|
sl@0
|
181 |
* ubrk_close(boundary);
|
sl@0
|
182 |
* }
|
sl@0
|
183 |
* \endcode
|
sl@0
|
184 |
* </pre>
|
sl@0
|
185 |
*/
|
sl@0
|
186 |
|
sl@0
|
187 |
/** The possible types of text boundaries. @stable ICU 2.0 */
|
sl@0
|
188 |
typedef enum UBreakIteratorType {
|
sl@0
|
189 |
/** Character breaks; first enumerator, so its implicit value is 0. @stable ICU 2.0 */
|
sl@0
|
190 |
UBRK_CHARACTER,
|
sl@0
|
191 |
/** Word breaks; implicit value 1. @stable ICU 2.0 */
|
sl@0
|
192 |
UBRK_WORD,
|
sl@0
|
193 |
/** Line breaks; implicit value 2. @stable ICU 2.0 */
|
sl@0
|
194 |
UBRK_LINE,
|
sl@0
|
195 |
/** Sentence breaks; implicit value 3. @stable ICU 2.0 */
|
sl@0
|
196 |
UBRK_SENTENCE,
|
sl@0
|
197 |
|
sl@0
|
198 |
#ifndef U_HIDE_DEPRECATED_API
|
sl@0
|
199 |
/**
|
sl@0
|
200 |
* Title Case breaks
|
sl@0
|
201 |
* The iterator created using this type locates title boundaries as described for
|
sl@0
|
202 |
* Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
|
sl@0
|
203 |
* please use Word Boundary iterator.
|
sl@0
|
204 |
* Declared only when U_HIDE_DEPRECATED_API is not defined; implicit value 4.
|
sl@0
|
205 |
* @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
|
sl@0
|
206 |
*/
|
sl@0
|
207 |
UBRK_TITLE
|
sl@0
|
208 |
#endif /* U_HIDE_DEPRECATED_API */
|
sl@0
|
209 |
|
sl@0
|
210 |
} UBreakIteratorType;
|
sl@0
|
211 |
|
sl@0
|
212 |
/** Value indicating all text boundaries have been returned.
|
sl@0
|
213 |
* @stable ICU 2.0
|
sl@0
|
214 |
*/
|
sl@0
|
215 |
#define UBRK_DONE ((int32_t) -1)
|
sl@0
|
216 |
|
sl@0
|
217 |
|
sl@0
|
218 |
/**
|
sl@0
|
219 |
* Enum constants for the word break tags returned by
|
sl@0
|
220 |
* ubrk_getRuleStatus(). A range of values is defined for each category of
|
sl@0
|
221 |
* word, to allow for further subdivisions of a category in future releases.
|
sl@0
|
222 |
* Applications should check for tag values falling within the range, rather
|
sl@0
|
223 |
* than for single individual values.
|
sl@0
|
224 |
* @stable ICU 2.2
|
sl@0
|
225 |
*/
|
sl@0
|
226 |
typedef enum UWordBreak {
|
sl@0
|
227 |
/** Tag value for "words" that do not fit into any of the other categories.
|
sl@0
|
228 |
* Includes spaces and most punctuation. Lower limit of the range. */
|
sl@0
|
229 |
UBRK_WORD_NONE = 0,
|
sl@0
|
230 |
/** Upper bound for tags for uncategorized words. Same value as UBRK_WORD_NUMBER, so treat the range as half-open. */
|
sl@0
|
231 |
UBRK_WORD_NONE_LIMIT = 100,
|
sl@0
|
232 |
/** Tag value for words that appear to be numbers, lower limit. */
|
sl@0
|
233 |
UBRK_WORD_NUMBER = 100,
|
sl@0
|
234 |
/** Tag value for words that appear to be numbers, upper limit. Same value as UBRK_WORD_LETTER. */
|
sl@0
|
235 |
UBRK_WORD_NUMBER_LIMIT = 200,
|
sl@0
|
236 |
/** Tag value for words that contain letters, excluding
|
sl@0
|
237 |
* hiragana, katakana or ideographic characters, lower limit. */
|
sl@0
|
238 |
UBRK_WORD_LETTER = 200,
|
sl@0
|
239 |
/** Tag value for words containing letters, upper limit. Same value as UBRK_WORD_KANA. */
|
sl@0
|
240 |
UBRK_WORD_LETTER_LIMIT = 300,
|
sl@0
|
241 |
/** Tag value for words containing kana characters, lower limit */
|
sl@0
|
242 |
UBRK_WORD_KANA = 300,
|
sl@0
|
243 |
/** Tag value for words containing kana characters, upper limit. Same value as UBRK_WORD_IDEO. */
|
sl@0
|
244 |
UBRK_WORD_KANA_LIMIT = 400,
|
sl@0
|
245 |
/** Tag value for words containing ideographic characters, lower limit */
|
sl@0
|
246 |
UBRK_WORD_IDEO = 400,
|
sl@0
|
247 |
/** Tag value for words containing ideographic characters, upper limit (presumably exclusive, like the other limits) */
|
sl@0
|
248 |
UBRK_WORD_IDEO_LIMIT = 500
|
sl@0
|
249 |
} UWordBreak;
|
sl@0
|
250 |
|
sl@0
|
251 |
/**
|
sl@0
|
252 |
* Enum constants for the line break tags returned by ubrk_getRuleStatus().
|
sl@0
|
253 |
* A range of values is defined for each category of
|
sl@0
|
254 |
* line break, to allow for further subdivisions of a category in future releases.
|
sl@0
|
255 |
* Applications should check for tag values falling within the range, rather
|
sl@0
|
256 |
* than for single individual values.
|
sl@0
|
257 |
* @stable ICU 2.8
|
sl@0
|
258 |
*/
|
sl@0
|
259 |
typedef enum ULineBreakTag {
|
sl@0
|
260 |
/** Tag value for soft line breaks, positions at which a line break
|
sl@0
|
261 |
* is acceptable but not required. Lower limit of the soft range. */
|
sl@0
|
262 |
UBRK_LINE_SOFT = 0,
|
sl@0
|
263 |
/** Upper bound for soft line breaks. Same value as UBRK_LINE_HARD, so treat the range as half-open. */
|
sl@0
|
264 |
UBRK_LINE_SOFT_LIMIT = 100,
|
sl@0
|
265 |
/** Tag value for a hard, or mandatory line break. Lower limit of the hard range. */
|
sl@0
|
266 |
UBRK_LINE_HARD = 100,
|
sl@0
|
267 |
/** Upper bound for hard line breaks (presumably exclusive, like the soft limit). */
|
sl@0
|
268 |
UBRK_LINE_HARD_LIMIT = 200
|
sl@0
|
269 |
} ULineBreakTag;
|
sl@0
|
270 |
|
sl@0
|
271 |
|
sl@0
|
272 |
|
sl@0
|
273 |
/**
|
sl@0
|
274 |
* Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
|
sl@0
|
275 |
* A range of values is defined for each category of
|
sl@0
|
276 |
* sentence, to allow for further subdivisions of a category in future releases.
|
sl@0
|
277 |
* Applications should check for tag values falling within the range, rather
|
sl@0
|
278 |
* than for single individual values.
|
sl@0
|
279 |
* @stable ICU 2.8
|
sl@0
|
280 |
*/
|
sl@0
|
281 |
typedef enum USentenceBreakTag {
|
sl@0
|
282 |
/** Tag value for sentences ending with a sentence terminator
|
sl@0
|
283 |
* ('.', '?', '!', etc.) character, possibly followed by a
|
sl@0
|
284 |
* hard separator (CR, LF, PS, etc.)
|
sl@0
|
285 |
*/
|
sl@0
|
286 |
UBRK_SENTENCE_TERM = 0,
|
sl@0
|
287 |
/** Upper bound for tags for sentences ended by sentence terminators. Same value as UBRK_SENTENCE_SEP, so treat the range as half-open. */
|
sl@0
|
288 |
UBRK_SENTENCE_TERM_LIMIT = 100,
|
sl@0
|
289 |
/** Tag value for sentences that do not contain an ending
|
sl@0
|
290 |
* sentence terminator ('.', '?', '!', etc.) character, but
|
sl@0
|
291 |
* are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
|
sl@0
|
292 |
*/
|
sl@0
|
293 |
UBRK_SENTENCE_SEP = 100,
|
sl@0
|
294 |
/** Upper bound for tags for sentences ended by a separator (presumably exclusive, like the terminator limit). */
|
sl@0
|
295 |
UBRK_SENTENCE_SEP_LIMIT = 200
|
sl@0
|
296 |
/* UBRK_SENTENCE_SEP_LIMIT is the last sentence break tag; no enumerator follows. */
|
sl@0
|
297 |
} USentenceBreakTag;
|
sl@0
|
298 |
|
sl@0
|
299 |
|
sl@0
|
300 |
/**
|
sl@0
|
301 |
* Open a new UBreakIterator for locating text boundaries for a specified locale.
|
sl@0
|
302 |
* A UBreakIterator may be used for detecting character, line, word,
|
sl@0
|
303 |
* and sentence breaks in text.
|
sl@0
|
304 |
* @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
|
sl@0
|
305 |
* UBRK_LINE, UBRK_SENTENCE
|
sl@0
|
306 |
* @param locale The locale specifying the text-breaking conventions.
|
sl@0
|
307 |
* @param text The text to be iterated over.
|
sl@0
|
308 |
* @param textLength The number of characters in text, or -1 if null-terminated.
|
sl@0
|
309 |
* @param status A UErrorCode to receive any errors.
|
sl@0
|
310 |
* @return A UBreakIterator for the specified locale.
|
sl@0
|
311 |
* @see ubrk_openRules
|
sl@0
|
312 |
* @stable ICU 2.0
|
sl@0
|
313 |
*/
|
sl@0
|
314 |
U_STABLE UBreakIterator* U_EXPORT2
|
sl@0
|
315 |
ubrk_open(UBreakIteratorType type,
|
sl@0
|
316 |
const char *locale,
|
sl@0
|
317 |
const UChar *text,
|
sl@0
|
318 |
int32_t textLength,
|
sl@0
|
319 |
UErrorCode *status);
|
sl@0
|
320 |
|
sl@0
|
321 |
/**
|
sl@0
|
322 |
* Open a new UBreakIterator for locating text boundaries using specified breaking rules.
|
sl@0
|
323 |
* The rule syntax is ... (TBD)
|
sl@0
|
324 |
* @param rules A set of rules specifying the text breaking conventions.
|
sl@0
|
325 |
* @param rulesLength The number of characters in rules, or -1 if null-terminated.
|
sl@0
|
326 |
* @param text The text to be iterated over. May be null, in which case ubrk_setText() is
|
sl@0
|
327 |
* used to specify the text to be iterated.
|
sl@0
|
328 |
* @param textLength The number of characters in text, or -1 if null-terminated.
|
sl@0
|
329 |
* @param parseErr Receives position and context information for any syntax errors
|
sl@0
|
330 |
* detected while parsing the rules.
|
sl@0
|
331 |
* @param status A UErrorCode to receive any errors.
|
sl@0
|
332 |
* @return A UBreakIterator for the specified rules.
|
sl@0
|
333 |
* @see ubrk_open
|
sl@0
|
334 |
* @stable ICU 2.2
|
sl@0
|
335 |
*/
|
sl@0
|
336 |
U_STABLE UBreakIterator* U_EXPORT2
|
sl@0
|
337 |
ubrk_openRules(const UChar *rules,
|
sl@0
|
338 |
int32_t rulesLength,
|
sl@0
|
339 |
const UChar *text,
|
sl@0
|
340 |
int32_t textLength,
|
sl@0
|
341 |
UParseError *parseErr,
|
sl@0
|
342 |
UErrorCode *status);
|
sl@0
|
343 |
|
sl@0
|
344 |
/**
|
sl@0
|
345 |
* Thread safe cloning operation
|
sl@0
|
346 |
* @param bi iterator to be cloned
|
sl@0
|
347 |
* @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
|
sl@0
|
348 |
* If buffer is not large enough, new memory will be allocated.
|
sl@0
|
349 |
* Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.
|
sl@0
|
350 |
* @param pBufferSize pointer to size of allocated space.
|
sl@0
|
351 |
* If *pBufferSize == 0, a sufficient size for use in cloning will
|
sl@0
|
352 |
* be returned ('pre-flighting')
|
sl@0
|
353 |
* If *pBufferSize is not enough for a stack-based safe clone,
|
sl@0
|
354 |
* new memory will be allocated.
|
sl@0
|
355 |
* @param status to indicate whether the operation went on smoothly or there were errors
|
sl@0
|
356 |
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
|
sl@0
|
357 |
* @return pointer to the new clone
|
sl@0
|
358 |
* @stable ICU 2.0
|
sl@0
|
359 |
*/
|
sl@0
|
360 |
U_STABLE UBreakIterator * U_EXPORT2
|
sl@0
|
361 |
ubrk_safeClone(
|
sl@0
|
362 |
const UBreakIterator *bi,
|
sl@0
|
363 |
void *stackBuffer,
|
sl@0
|
364 |
int32_t *pBufferSize,
|
sl@0
|
365 |
UErrorCode *status);
|
sl@0
|
366 |
|
sl@0
|
367 |
/**
|
sl@0
|
368 |
* A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
|
sl@0
|
369 |
* @stable ICU 2.0
|
sl@0
|
370 |
*/
|
sl@0
|
371 |
#define U_BRK_SAFECLONE_BUFFERSIZE 512
|
sl@0
|
372 |
|
sl@0
|
373 |
/**
|
sl@0
|
374 |
* Close a UBreakIterator.
|
sl@0
|
375 |
* Once closed, a UBreakIterator may no longer be used.
|
sl@0
|
376 |
* @param bi The break iterator to close.
|
sl@0
|
377 |
* @stable ICU 2.0
|
sl@0
|
378 |
*/
|
sl@0
|
379 |
U_STABLE void U_EXPORT2
|
sl@0
|
380 |
ubrk_close(UBreakIterator *bi);
|
sl@0
|
381 |
|
sl@0
|
382 |
/**
|
sl@0
|
383 |
* Sets an existing iterator to point to a new piece of text
|
sl@0
|
384 |
* @param bi The iterator to use
|
sl@0
|
385 |
* @param text The text to be set
|
sl@0
|
386 |
* @param textLength The length of the text
|
sl@0
|
387 |
* @param status The error code
|
sl@0
|
388 |
* @stable ICU 2.0
|
sl@0
|
389 |
*/
|
sl@0
|
390 |
U_STABLE void U_EXPORT2
|
sl@0
|
391 |
ubrk_setText(UBreakIterator* bi,
|
sl@0
|
392 |
const UChar* text,
|
sl@0
|
393 |
int32_t textLength,
|
sl@0
|
394 |
UErrorCode* status);
|
sl@0
|
395 |
|
sl@0
|
396 |
|
sl@0
|
397 |
/**
|
sl@0
|
398 |
* Sets an existing iterator to point to a new piece of text
|
sl@0
|
399 |
* @param bi The iterator to use
|
sl@0
|
400 |
* @param text The text to be set
|
sl@0
|
401 |
* @param status The error code
|
sl@0
|
402 |
* @draft ICU 3.4
|
sl@0
|
403 |
*/
|
sl@0
|
404 |
U_DRAFT void U_EXPORT2
|
sl@0
|
405 |
ubrk_setUText(UBreakIterator* bi,
|
sl@0
|
406 |
UText* text,
|
sl@0
|
407 |
UErrorCode* status);
|
sl@0
|
408 |
|
sl@0
|
409 |
|
sl@0
|
410 |
|
sl@0
|
411 |
/**
|
sl@0
|
412 |
* Determine the most recently-returned text boundary.
|
sl@0
|
413 |
*
|
sl@0
|
414 |
* @param bi The break iterator to use.
|
sl@0
|
415 |
* @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
|
sl@0
|
416 |
* \ref ubrk_first, or \ref ubrk_last.
|
sl@0
|
417 |
* @stable ICU 2.0
|
sl@0
|
418 |
*/
|
sl@0
|
419 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
420 |
ubrk_current(const UBreakIterator *bi);
|
sl@0
|
421 |
|
sl@0
|
422 |
/**
|
sl@0
|
423 |
* Determine the text boundary following the current text boundary.
|
sl@0
|
424 |
*
|
sl@0
|
425 |
* @param bi The break iterator to use.
|
sl@0
|
426 |
* @return The character index of the next text boundary, or UBRK_DONE
|
sl@0
|
427 |
* if all text boundaries have been returned.
|
sl@0
|
428 |
* @see ubrk_previous
|
sl@0
|
429 |
* @stable ICU 2.0
|
sl@0
|
430 |
*/
|
sl@0
|
431 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
432 |
ubrk_next(UBreakIterator *bi);
|
sl@0
|
433 |
|
sl@0
|
434 |
/**
|
sl@0
|
435 |
* Determine the text boundary preceding the current text boundary.
|
sl@0
|
436 |
*
|
sl@0
|
437 |
* @param bi The break iterator to use.
|
sl@0
|
438 |
* @return The character index of the preceding text boundary, or UBRK_DONE
|
sl@0
|
439 |
* if all text boundaries have been returned.
|
sl@0
|
440 |
* @see ubrk_next
|
sl@0
|
441 |
* @stable ICU 2.0
|
sl@0
|
442 |
*/
|
sl@0
|
443 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
444 |
ubrk_previous(UBreakIterator *bi);
|
sl@0
|
445 |
|
sl@0
|
446 |
/**
|
sl@0
|
447 |
* Determine the index of the first character in the text being scanned.
|
sl@0
|
448 |
* This is not always the same as index 0 of the text.
|
sl@0
|
449 |
* @param bi The break iterator to use.
|
sl@0
|
450 |
* @return The character index of the first character in the text being scanned.
|
sl@0
|
451 |
* @see ubrk_last
|
sl@0
|
452 |
* @stable ICU 2.0
|
sl@0
|
453 |
*/
|
sl@0
|
454 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
455 |
ubrk_first(UBreakIterator *bi);
|
sl@0
|
456 |
|
sl@0
|
457 |
/**
|
sl@0
|
458 |
* Determine the index immediately <EM>beyond</EM> the last character in the text being
|
sl@0
|
459 |
* scanned.
|
sl@0
|
460 |
* This is not the same as the last character.
|
sl@0
|
461 |
* @param bi The break iterator to use.
|
sl@0
|
462 |
* @return The character offset immediately <EM>beyond</EM> the last character in the
|
sl@0
|
463 |
* text being scanned.
|
sl@0
|
464 |
* @see ubrk_first
|
sl@0
|
465 |
* @stable ICU 2.0
|
sl@0
|
466 |
*/
|
sl@0
|
467 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
468 |
ubrk_last(UBreakIterator *bi);
|
sl@0
|
469 |
|
sl@0
|
470 |
/**
|
sl@0
|
471 |
* Determine the text boundary preceding the specified offset.
|
sl@0
|
472 |
* The value returned is always smaller than offset, or UBRK_DONE.
|
sl@0
|
473 |
* @param bi The break iterator to use.
|
sl@0
|
474 |
* @param offset The offset to begin scanning.
|
sl@0
|
475 |
* @return The text boundary preceding offset, or UBRK_DONE.
|
sl@0
|
476 |
* @see ubrk_following
|
sl@0
|
477 |
* @stable ICU 2.0
|
sl@0
|
478 |
*/
|
sl@0
|
479 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
480 |
ubrk_preceding(UBreakIterator *bi,
|
sl@0
|
481 |
int32_t offset);
|
sl@0
|
482 |
|
sl@0
|
483 |
/**
|
sl@0
|
484 |
* Determine the text boundary following the specified offset.
|
sl@0
|
485 |
* The value returned is always greater than offset, or UBRK_DONE.
|
sl@0
|
486 |
* @param bi The break iterator to use.
|
sl@0
|
487 |
* @param offset The offset to begin scanning.
|
sl@0
|
488 |
* @return The text boundary following offset, or UBRK_DONE.
|
sl@0
|
489 |
* @see ubrk_preceding
|
sl@0
|
490 |
* @stable ICU 2.0
|
sl@0
|
491 |
*/
|
sl@0
|
492 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
493 |
ubrk_following(UBreakIterator *bi,
|
sl@0
|
494 |
int32_t offset);
|
sl@0
|
495 |
|
sl@0
|
496 |
/**
|
sl@0
|
497 |
* Get a locale for which text breaking information is available.
|
sl@0
|
498 |
* A UBreakIterator in a locale returned by this function will perform the correct
|
sl@0
|
499 |
* text breaking for the locale.
|
sl@0
|
500 |
* @param index The index of the desired locale.
|
sl@0
|
501 |
* @return A locale for which text breaking information is available, or 0 if none.
|
sl@0
|
502 |
* @see ubrk_countAvailable
|
sl@0
|
503 |
* @stable ICU 2.0
|
sl@0
|
504 |
*/
|
sl@0
|
505 |
U_STABLE const char* U_EXPORT2
|
sl@0
|
506 |
ubrk_getAvailable(int32_t index);
|
sl@0
|
507 |
|
sl@0
|
508 |
/**
|
sl@0
|
509 |
* Determine how many locales have text breaking information available.
|
sl@0
|
510 |
* This function is most useful for determining the loop ending condition for
|
sl@0
|
511 |
* calls to \ref ubrk_getAvailable.
|
sl@0
|
512 |
* @return The number of locales for which text breaking information is available.
|
sl@0
|
513 |
* @see ubrk_getAvailable
|
sl@0
|
514 |
* @stable ICU 2.0
|
sl@0
|
515 |
*/
|
sl@0
|
516 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
517 |
ubrk_countAvailable(void);
|
sl@0
|
518 |
|
sl@0
|
519 |
|
sl@0
|
520 |
/**
|
sl@0
|
521 |
* Returns true if the specified position is a boundary position. As a side
|
sl@0
|
522 |
* effect, leaves the iterator pointing to the first boundary position at
|
sl@0
|
523 |
* or after "offset".
|
sl@0
|
524 |
* @param bi The break iterator to use.
|
sl@0
|
525 |
* @param offset the offset to check.
|
sl@0
|
526 |
* @return True if "offset" is a boundary position.
|
sl@0
|
527 |
* @stable ICU 2.0
|
sl@0
|
528 |
*/
|
sl@0
|
529 |
U_STABLE UBool U_EXPORT2
|
sl@0
|
530 |
ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
|
sl@0
|
531 |
|
sl@0
|
532 |
/**
|
sl@0
|
533 |
* Return the status from the break rule that determined the most recently
|
sl@0
|
534 |
* returned break position. The values appear in the rule source
|
sl@0
|
535 |
* within brackets, {123}, for example. For rules that do not specify a
|
sl@0
|
536 |
* status, a default value of 0 is returned.
|
sl@0
|
537 |
* <p>
|
sl@0
|
538 |
* For word break iterators, the possible values are defined in enum UWordBreak.
|
sl@0
|
539 |
* @stable ICU 2.2
|
sl@0
|
540 |
*/
|
sl@0
|
541 |
U_STABLE int32_t U_EXPORT2
|
sl@0
|
542 |
ubrk_getRuleStatus(UBreakIterator *bi);
|
sl@0
|
543 |
|
sl@0
|
544 |
/**
|
sl@0
|
545 |
* Get the statuses from the break rules that determined the most recently
|
sl@0
|
546 |
* returned break position. The values appear in the rule source
|
sl@0
|
547 |
* within brackets, {123}, for example. The default status value for rules
|
sl@0
|
548 |
* that do not explicitly provide one is zero.
|
sl@0
|
549 |
* <p>
|
sl@0
|
550 |
* For word break iterators, the possible values are defined in enum UWordBreak.
|
sl@0
|
551 |
* @param bi The break iterator to use
|
sl@0
|
552 |
* @param fillInVec an array to be filled in with the status values.
|
sl@0
|
553 |
* @param capacity the length of the supplied vector. A length of zero causes
|
sl@0
|
554 |
* the function to return the number of status values, in the
|
sl@0
|
555 |
* normal way, without attempting to store any values.
|
sl@0
|
556 |
* @param status receives error codes.
|
sl@0
|
557 |
* @return The number of rule status values from rules that determined
|
sl@0
|
558 |
* the most recent boundary returned by the break iterator.
|
sl@0
|
559 |
* @draft ICU 3.0
|
sl@0
|
560 |
*/
|
sl@0
|
561 |
U_DRAFT int32_t U_EXPORT2
|
sl@0
|
562 |
ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
|
sl@0
|
563 |
|
sl@0
|
564 |
/**
|
sl@0
|
565 |
* Return the locale of the break iterator. You can choose between the valid and
|
sl@0
|
566 |
* the actual locale.
|
sl@0
|
567 |
* @param bi break iterator
|
sl@0
|
568 |
* @param type locale type (valid or actual)
|
sl@0
|
569 |
* @param status error code
|
sl@0
|
570 |
* @return locale string
|
sl@0
|
571 |
* @draft ICU 2.8 likely to change after ICU 3.0, based on feedback
|
sl@0
|
572 |
*/
|
sl@0
|
573 |
U_DRAFT const char* U_EXPORT2
|
sl@0
|
574 |
ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
|
sl@0
|
575 |
|
sl@0
|
576 |
|
sl@0
|
577 |
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
sl@0
|
578 |
|
sl@0
|
579 |
#endif
|