Update contrib.
2 ********************************************************************************
3 * Copyright (C) 1997-2005, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ********************************************************************************
9 * Modification History:
11 * Date Name Description
12 * 02/18/97 aliu Added typedef for TextCount. Made DONE const.
13 * 05/07/97 aliu Fixed DLL declaration.
14 * 07/09/97 jfitz Renamed BreakIterator and interface synced with JDK
15 * 08/11/98 helena Sync-up JDK1.2.
16 * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
17 ********************************************************************************
23 #include "unicode/utypes.h"
27 * \brief C++ API: Break Iterator.
30 #if UCONFIG_NO_BREAK_ITERATION
35 * Allow the declaration of APIs with pointers to BreakIterator
36 * even when break iteration is removed from the build.
44 #include "unicode/uobject.h"
45 #include "unicode/unistr.h"
46 #include "unicode/chariter.h"
47 #include "unicode/locid.h"
48 #include "unicode/ubrk.h"
49 #include "unicode/strenum.h"
50 #include "unicode/utext.h"
54 #if !UCONFIG_NO_SERVICE
56 * Opaque type returned by registerInstance.
59 typedef const void* URegistryKey;
63 * The BreakIterator class implements methods for finding the location
64 * of boundaries in text. BreakIterator is an abstract base class.
65 * Instances of BreakIterator maintain a current position and scan over
66 * text returning the index of characters where boundaries occur.
68 * Line boundary analysis determines where a text string can be broken
69 * when line-wrapping. The mechanism correctly handles punctuation and
72 * Sentence boundary analysis allows selection with correct
73 * interpretation of periods within numbers and abbreviations, and
74 * trailing punctuation marks such as quotation marks and parentheses.
76 * Word boundary analysis is used by search and replace functions, as
77 * well as within text editing applications that allow the user to
78 * select words with a double click. Word selection provides correct
79 * interpretation of punctuation marks within and following
80 * words. Characters that are not part of a word, such as symbols or
81 * punctuation marks, have word-breaks on both sides.
83 * Character boundary analysis allows users to interact with
84 * characters as they expect to, for example, when moving the cursor
85 * through a text string. Character boundary analysis provides correct
86 * navigation through character strings, regardless of how the
87 * character is stored. For example, an accented character might be
88 * stored as a base character and a diacritical mark. What users
89 * consider to be a character can differ between languages.
91 * This is the interface for all text boundaries.
95 * Helper function to output text
98 * void printTextRange( BreakIterator& iterator, int32_t start, int32_t end )
100 * UnicodeString textBuffer, temp;
101 * CharacterIterator *strIter = iterator.createText();
102 * strIter->getText(temp);
103 * cout << " " << start << " " << end << " |"
104 * << temp.extractBetween(start, end, textBuffer)
110 * Print each element in order:
113 * void printEachForward( BreakIterator& boundary)
115 * int32_t start = boundary.first();
116 * for (int32_t end = boundary.next();
117 * end != BreakIterator::DONE;
118 * start = end, end = boundary.next())
120 * printTextRange( boundary, start, end );
125 * Print each element in reverse order:
128 * void printEachBackward( BreakIterator& boundary)
130 * int32_t end = boundary.last();
131 * for (int32_t start = boundary.previous();
132 * start != BreakIterator::DONE;
133 * end = start, start = boundary.previous())
135 * printTextRange( boundary, start, end );
140 * Print first element
143 * void printFirst(BreakIterator& boundary)
145 * int32_t start = boundary.first();
146 * int32_t end = boundary.next();
147 * printTextRange( boundary, start, end );
154 * void printLast(BreakIterator& boundary)
156 * int32_t end = boundary.last();
157 * int32_t start = boundary.previous();
158 * printTextRange( boundary, start, end );
162 * Print the element at a specified position
165 * void printAt(BreakIterator &boundary, int32_t pos )
167 * int32_t end = boundary.following(pos);
168 * int32_t start = boundary.previous();
169 * printTextRange( boundary, start, end );
173 * Creating and using text boundaries
176 * void BreakIterator_Example( void )
178 * BreakIterator* boundary;
179 * UnicodeString stringToExamine("Aaa bbb ccc. Ddd eee fff.");
180 * cout << "Examining: " << stringToExamine << endl;
182 * //print each sentence in forward and reverse order
183 * boundary = BreakIterator::createSentenceInstance( Locale::US );
184 * boundary->setText(stringToExamine);
185 * cout << "----- forward: -----------" << endl;
186 * printEachForward(*boundary);
187 * cout << "----- backward: ----------" << endl;
188 * printEachBackward(*boundary);
191 * //print each word in order
192 * boundary = BreakIterator::createWordInstance();
193 * boundary->setText(stringToExamine);
194 * cout << "----- forward: -----------" << endl;
195 * printEachForward(*boundary);
196 * //print first element
197 * cout << "----- first: -------------" << endl;
198 * printFirst(*boundary);
199 * //print last element
200 * cout << "----- last: --------------" << endl;
201 * printLast(*boundary);
202 * //print word at charpos 10
203 * cout << "----- at pos 10: ---------" << endl;
204 * printAt(*boundary, 10 );
211 class U_COMMON_API BreakIterator : public UObject {
217 virtual ~BreakIterator();
220 * Return true if another object is semantically equal to this
221 * one. The other object should be an instance of the same subclass of
222 * BreakIterator. Objects of different subclasses are considered
225 * Return true if this BreakIterator is at the same position in the
226 * same text, and is the same class and type (word, line, etc.) of
227 * BreakIterator, as the argument. Text is considered the same if
228 * it contains the same characters, it need not be the same
229 * object, and styles are not considered.
232 virtual UBool operator==(const BreakIterator&) const = 0;
235 * Returns the complement of the result of operator==
236 * @param rhs The BreakIterator to be compared for inequality
237 * @return the complement of the result of operator==
240 UBool operator!=(const BreakIterator& rhs) const { return !operator==(rhs); }
243 * Return a polymorphic copy of this object. This is an abstract
244 * method which subclasses implement.
247 virtual BreakIterator* clone(void) const = 0;
250 * Return a polymorphic class ID for this object. Different subclasses
251 * will return distinct unequal values.
254 virtual UClassID getDynamicClassID(void) const = 0;
257 * Return a CharacterIterator over the text being analyzed.
258 * Changing the state of the returned iterator can have undefined consequences
259 * on the operation of the break iterator. If you need to change it, clone it first.
262 virtual const CharacterIterator& getText(void) const = 0;
266 * Get a UText for the text being analyzed.
267 * The returned UText is a shallow clone of the UText used internally
268 * by the break iterator implementation. It can safely be used to
269 * access the text without impacting any break iterator operations,
270 * but the underlying text itself must not be altered.
272 * @param fillIn A UText to be filled in. If NULL, a new UText will be
273 * allocated to hold the result.
274 * @param status receives any error codes.
275 * @return The current UText for this break iterator. If an input
276 * UText was provided, it will always be returned.
279 virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0;
282 * Change the text over which this operates. The text boundary is
283 * reset to the start.
284 * @param text The UnicodeString used to change the text.
287 virtual void setText(const UnicodeString &text) = 0;
290 * Reset the break iterator to operate over the text represented by
291 * the UText. The iterator position is reset to the start.
293 * This function makes a shallow clone of the supplied UText. This means
294 * that the caller is free to immediately close or otherwise reuse the
295 * UText that was passed as a parameter, but that the underlying text itself
296 * must not be altered while being referenced by the break iterator.
298 * @param text The UText used to change the text.
299 * @param status receives any error codes.
302 virtual void setText(UText *text, UErrorCode &status) = 0;
305 * Change the text over which this operates. The text boundary is
306 * reset to the start.
307 * @param it The CharacterIterator used to change the text.
310 virtual void adoptText(CharacterIterator* it) = 0;
314 * DONE is returned by previous() and next() after all valid
315 * boundaries have been returned.
322 * Return the index of the first character in the text being scanned.
325 virtual int32_t first(void) = 0;
328 * Return the index immediately BEYOND the last character in the text being scanned.
331 virtual int32_t last(void) = 0;
334 * Return the boundary preceding the current boundary.
335 * @return The character index of the previous text boundary or DONE if all
336 * boundaries have been returned.
339 virtual int32_t previous(void) = 0;
342 * Return the boundary following the current boundary.
343 * @return The character index of the next text boundary or DONE if all
344 * boundaries have been returned.
347 virtual int32_t next(void) = 0;
350 * Return character index of the current iterator position within the text.
351 * @return The boundary most recently returned.
354 virtual int32_t current(void) const = 0;
357 * Return the first boundary following the specified offset.
358 * The value returned is always greater than the offset or
359 * the value BreakIterator::DONE
360 * @param offset the offset to begin scanning.
361 * @return The first boundary after the specified offset.
364 virtual int32_t following(int32_t offset) = 0;
367 * Return the first boundary preceding the specified offset.
368 * The value returned is always smaller than the offset or
369 * the value BreakIterator::DONE
370 * @param offset the offset to begin scanning.
371 * @return The first boundary before the specified offset.
374 virtual int32_t preceding(int32_t offset) = 0;
377 * Return true if the specified position is a boundary position.
378 * As a side effect, the current position of the iterator is set
379 * to the first boundary position at or following the specified offset.
380 * @param offset the offset to check.
381 * @return True if "offset" is a boundary position.
384 virtual UBool isBoundary(int32_t offset) = 0;
387 * Return the nth boundary from the current boundary
388 * @param n which boundary to return. A value of 0
389 * does nothing. Negative values move to previous boundaries
390 * and positive values move to later boundaries.
391 * @return The index of the nth boundary from the current position, or
392 * DONE if there are fewer than |n| boundaries in the specified direction.
395 virtual int32_t next(int32_t n) = 0;
398 * Create BreakIterator for word-breaks using the given locale.
399 * Returns an instance of a BreakIterator implementing word breaks.
400 * WordBreak is useful for word selection (e.g. double click)
401 * @param where the locale.
402 * @param status the error code
403 * @return A BreakIterator for word-breaks. The UErrorCode& status
404 * parameter is used to return status information to the user.
405 * To check whether the construction succeeded or not, you should check
406 * the value of U_SUCCESS(status). If you wish more detailed information, you
407 * can check for informational error results which still indicate success.
408 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
409 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
410 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
411 * used; neither the requested locale nor any of its fall back locales
413 * The caller owns the returned object and is responsible for deleting it.
416 static BreakIterator* U_EXPORT2
417 createWordInstance(const Locale& where, UErrorCode& status);
420 * Create BreakIterator for line-breaks using specified locale.
421 * Returns an instance of a BreakIterator implementing line breaks. Line
422 * breaks are logically possible line breaks, actual line breaks are
423 * usually determined based on display width.
424 * LineBreak is useful for word wrapping text.
425 * @param where the locale.
426 * @param status The error code.
427 * @return A BreakIterator for line-breaks. The UErrorCode& status
428 * parameter is used to return status information to the user.
429 * To check whether the construction succeeded or not, you should check
430 * the value of U_SUCCESS(status). If you wish more detailed information, you
431 * can check for informational error results which still indicate success.
432 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
433 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
434 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
435 * used; neither the requested locale nor any of its fall back locales
437 * The caller owns the returned object and is responsible for deleting it.
440 static BreakIterator* U_EXPORT2
441 createLineInstance(const Locale& where, UErrorCode& status);
444 * Create BreakIterator for character-breaks using specified locale
445 * Returns an instance of a BreakIterator implementing character breaks.
446 * Character breaks are boundaries of combining character sequences.
447 * @param where the locale.
448 * @param status The error code.
449 * @return A BreakIterator for character-breaks. The UErrorCode& status
450 * parameter is used to return status information to the user.
451 * To check whether the construction succeeded or not, you should check
452 * the value of U_SUCCESS(status). If you wish more detailed information, you
453 * can check for informational error results which still indicate success.
454 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
455 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
456 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
457 * used; neither the requested locale nor any of its fall back locales
459 * The caller owns the returned object and is responsible for deleting it.
462 static BreakIterator* U_EXPORT2
463 createCharacterInstance(const Locale& where, UErrorCode& status);
466 * Create BreakIterator for sentence-breaks using specified locale
467 * Returns an instance of a BreakIterator implementing sentence breaks.
468 * @param where the locale.
469 * @param status The error code.
470 * @return A BreakIterator for sentence-breaks. The UErrorCode& status
471 * parameter is used to return status information to the user.
472 * To check whether the construction succeeded or not, you should check
473 * the value of U_SUCCESS(status). If you wish more detailed information, you
474 * can check for informational error results which still indicate success.
475 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
476 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
477 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
478 * used; neither the requested locale nor any of its fall back locales
480 * The caller owns the returned object and is responsible for deleting it.
483 static BreakIterator* U_EXPORT2
484 createSentenceInstance(const Locale& where, UErrorCode& status);
487 * Create BreakIterator for title-casing breaks using the specified locale
488 * Returns an instance of a BreakIterator implementing title breaks.
489 * The iterator returned locates title boundaries as described for
490 * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
491 * please use the word boundary iterator. {@link #createWordInstance}
493 * @param where the locale.
494 * @param status The error code.
495 * @return A BreakIterator for title-breaks. The UErrorCode& status
496 * parameter is used to return status information to the user.
497 * To check whether the construction succeeded or not, you should check
498 * the value of U_SUCCESS(status). If you wish more detailed information, you
499 * can check for informational error results which still indicate success.
500 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
501 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
502 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
503 * used; neither the requested locale nor any of its fall back locales
505 * The caller owns the returned object and is responsible for deleting it.
508 static BreakIterator* U_EXPORT2
509 createTitleInstance(const Locale& where, UErrorCode& status);
512 * Get the set of Locales for which TextBoundaries are installed.
513 * <p><b>Note:</b> this will not return locales added through the register
514 * call. To see the registered locales too, use the getAvailableLocales
515 * function that returns a StringEnumeration object </p>
516 * @param count the output parameter of number of elements in the locale list
517 * @return available locales
520 static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count);
523 * Get name of the object for the desired Locale, in the desired language.
524 * @param objectLocale must be from getAvailableLocales.
525 * @param displayLocale specifies the desired locale for output.
526 * @param name the fill-in parameter of the return value
528 * @return user-displayable name
531 static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
532 const Locale& displayLocale,
533 UnicodeString& name);
536 * Get name of the object for the desired Locale, in the language of the
538 * @param objectLocale must be from getMatchingLocales
539 * @param name the fill-in parameter of the return value
540 * @return user-displayable name
543 static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
544 UnicodeString& name);
547 * Thread safe client-buffer-based cloning operation
548 * Do NOT call delete on a safeclone, since 'new' is not used to create it.
549 * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
550 * If buffer is not large enough, new memory will be allocated.
551 * @param BufferSize reference to size of allocated space.
552 * If BufferSize == 0, a sufficient size for use in cloning will
553 * be returned ('pre-flighting')
554 * If BufferSize is not enough for a stack-based safe clone,
555 * new memory will be allocated.
556 * @param status to indicate whether the operation went on smoothly or there were errors
557 * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
559 * @return pointer to the new clone
563 virtual BreakIterator * createBufferClone(void *stackBuffer,
565 UErrorCode &status) = 0;
568 * Determine whether the BreakIterator was created in user memory by
569 * createBufferClone(), and thus should not be deleted. Such objects
570 * must be closed by an explicit call to the destructor (not delete).
573 inline UBool isBufferClone(void);
575 #if !UCONFIG_NO_SERVICE
577 * Register a new break iterator of the indicated kind, to use in the given locale.
578 * The break iterator will be adopted. Clones of the iterator will be returned
579 * if a request for a break iterator of the given kind matches or falls back to
581 * @param toAdopt the BreakIterator instance to be adopted
582 * @param locale the Locale for which this instance is to be registered
583 * @param kind the type of iterator for which this instance is to be registered
584 * @param status the in/out status code, no special meanings are assigned
585 * @return a registry key that can be used to unregister this instance
588 static URegistryKey U_EXPORT2 registerInstance(BreakIterator* toAdopt,
589 const Locale& locale,
590 UBreakIteratorType kind,
594 * Unregister a previously-registered BreakIterator using the key returned from the
595 * register call. Key becomes invalid after a successful call and should not be used again.
596 * The BreakIterator corresponding to the key will be deleted.
597 * @param key the registry key returned by a previous call to registerInstance
598 * @param status the in/out status code, no special meanings are assigned
599 * @return TRUE if the iterator for the key was successfully unregistered
602 static UBool U_EXPORT2 unregister(URegistryKey key, UErrorCode& status);
605 * Return a StringEnumeration over the locales available at the time of the call,
606 * including registered locales.
607 * @return a StringEnumeration over the locales available at the time of the call
610 static StringEnumeration* U_EXPORT2 getAvailableLocales(void);
614 * Returns the locale for this break iterator. Two flavors are available: valid and
616 * @draft ICU 2.8 likely to change after ICU 3.0, based on feedback
618 Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
620 /** Get the locale for this break iterator object. You can choose between valid and actual locale.
621 * @param type type of the locale we're looking for (valid or actual)
622 * @param status error code for the operation
626 const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
629 static BreakIterator* buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode& status);
630 static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status);
631 static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
633 friend class ICUBreakIteratorFactory;
634 friend class ICUBreakIteratorService;
642 BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) {}
647 char actualLocale[ULOC_FULLNAME_CAPACITY];
648 char validLocale[ULOC_FULLNAME_CAPACITY];
651 * The assignment operator has no real implementation.
652 * It's provided to make the compiler happy. Do not call.
654 BreakIterator& operator=(const BreakIterator&);
657 inline UBool BreakIterator::isBufferClone()
664 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */