os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/rbbi.h
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 ***************************************************************************
     3 *   Copyright (C) 1999-2005 International Business Machines Corporation   *
     4 *   and others. All rights reserved.                                      *
     5 ***************************************************************************
     6 
     7 **********************************************************************
     8 *   Date        Name        Description
     9 *   10/22/99    alan        Creation.
    10 *   11/11/99    rgillam     Complete port from Java.
    11 **********************************************************************
    12 */
    13 
    14 #ifndef RBBI_H
    15 #define RBBI_H
    16 
    17 #include "unicode/utypes.h"
    18 
    19 /**
    20  * \file
    21  * \brief C++ API: Rule Based Break Iterator
    22  */
    23 
    24 #if !UCONFIG_NO_BREAK_ITERATION
    25 
    26 #include "unicode/brkiter.h"
    27 #include "unicode/udata.h"
    28 #include "unicode/parseerr.h"
    29 
    30 
    31 struct UTrie;   // forward declaration at global scope; not otherwise referenced in this header
    32 
    33 U_NAMESPACE_BEGIN   // matched by U_NAMESPACE_END at the end of this header
    34 
    35 /** Forward declarations of implementation types used by RuleBasedBreakIterator.  @internal */
    36 struct RBBIDataHeader;                 // parameter type of the protected data-adopting constructor
    37 class  RuleBasedBreakIteratorTables;   // not otherwise referenced in this header
    38 class  BreakIterator;                  // base class of RuleBasedBreakIterator
    39 class  RBBIDataWrapper;                // type pointed to by the fData member
    40 struct RBBIStateTable;                 // parameter type of the private handleNext/handlePrevious overloads
    41 
    42 
    43 
    44 /**
    45  *
    46  * A subclass of BreakIterator whose behavior is specified using a list of rules.
    47  * <p>Instances of this class are most commonly created by the factory methods of
    48  *  BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,
    49  *  and then used via the abstract API in class BreakIterator.</p>
    50  *
    51  * <p>See the ICU User Guide for information on Break Iterator Rules.</p>
    52  *
    53  * <p>This class is not intended to be subclassed.  (Class DictionaryBasedBreakIterator
    54  *    is a subclass, but that relationship is effectively internal to the ICU
    55  *    implementation.  The subclassing interface to RuleBasedBreakIterator is
    56  *    not part of the ICU API, and may not remain stable.)</p>
    57  *
    58  */
    59 class U_COMMON_API RuleBasedBreakIterator : public BreakIterator {
    60 
    61 protected:
    62     /**
    63      * The character iterator through which this BreakIterator accesses the text
    64      * @internal
    65      */
    66     CharacterIterator*  fText;
    67 
    68     /**
    69      * The rule data for this BreakIterator instance
    70      * @internal
    71      */
    72     RBBIDataWrapper    *fData;
    73 
    74     /** Index of the Rule {tag} values for the most recent match.
    75      *  @internal
    76     */
    77     int32_t             fLastRuleStatusIndex;
    78 
    79     /**
    80      * Rule tag value valid flag.
    81      * Some iterator operations don't intrinsically set the correct tag value.
    82      * This flag lets us lazily compute the value if we are ever asked for it.
    83      * @internal
    84      */
    85     UBool               fLastStatusIndexValid;
    86 
    87     /**
    88      * Counter for the number of characters encountered with the "dictionary"
    89      *   flag set.  Normal RBBI iterators don't use it, although the code
    90      *   for updating it is live.  Dictionary Based break iterators (a subclass
    91      *   of us) access this field directly.
    92      * @internal
    93      */
    94     uint32_t           fDictionaryCharCount;
    95 
    96     /**
    97      * Debugging flag.  Trace operation of state machine when true.
    98      * @internal
    99      */
   100     static UBool        fTrace;
   101 
   102 
   103 protected:
   104     //=======================================================================
   105     // constructors
   106     //=======================================================================
   107 
   108     /**
   109      * Constructor from a flattened set of RBBI data in malloced memory.
   110      *             RuleBasedBreakIterators built from a custom set of rules
   111      *             are created via this constructor; the rules are compiled
   112      *             into memory, then the break iterator is constructed here.
   113      *
   114      *             The break iterator adopts the memory, and will
   115      *             free it when done.
   116      * @internal
   117      */
   118     RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
   119 
   120     /** @internal */
   121     friend class RBBIRuleBuilder;
   122     /** @internal */
   123     friend class BreakIterator;
   124 
   125 
   126 
   127 public:
   128 
   129     /** Default constructor.  Creates an empty shell of an iterator, with no
   130      *  rules or text to iterate over.   Object can subsequently be assigned to.
   131      *  @stable ICU 2.2
   132      */
   133     RuleBasedBreakIterator();
   134 
   135     /**
   136      * Copy constructor.  Will produce a break iterator with the same behavior,
   137      * and which iterates over the same text, as the one passed in.
   138      * @param that The RuleBasedBreakIterator passed to be copied
   139      * @stable ICU 2.0
   140      */
   141     RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
   142 
   143     /**
   144      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
   145      * @param rules The break rules to be used.
   146      * @param parseError  In the event of a syntax error in the rules, provides the location
   147      *                    within the rules of the problem.
   148      * @param status Information on any errors encountered.
   149      * @stable ICU 2.2
   150      */
   151     RuleBasedBreakIterator( const UnicodeString    &rules,
   152                              UParseError           &parseError,
   153                              UErrorCode            &status);
   154 
   155 
   156     /**
   157      * This constructor uses the udata interface to create a BreakIterator
   158      * whose internal tables live in a memory-mapped file.  "image" is an
   159      * ICU UDataMemory handle for the pre-compiled break iterator tables.
   160      * @param image handle to the memory image for the break iterator data.
   161      *        Ownership of the UDataMemory handle passes to the Break Iterator,
   162      *        which will be responsible for closing it when it is no longer needed.
   163      * @param status Information on any errors encountered.
   164      * @see udata_open
   165      * @see #getBinaryRules
   166      * @stable ICU 2.8
   167      */
   168     RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);
   169 
   170     /**
   171      * Destructor
   172      *  @stable ICU 2.0
   173      */
   174     virtual ~RuleBasedBreakIterator();
   175 
   176     /**
   177      * Assignment operator.  Sets this iterator to have the same behavior,
   178      * and iterate over the same text, as the one passed in.
   179      * @param that The RuleBasedBreakIterator passed in
   180      * @return a reference to a RuleBasedBreakIterator; presumably this object -- confirm in the implementation
   181      *  @stable ICU 2.0
   182      */
   183     RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
   184 
   185     /**
   186      * Equality operator.  Returns TRUE if both BreakIterators are of the
   187      * same class, have the same behavior, and iterate over the same text.
   188      * @param that The BreakIterator to be compared for equality
   189      * @return TRUE if both BreakIterators are of the
   190      * same class, have the same behavior, and iterate over the same text.
   191      *  @stable ICU 2.0
   192      */
   193     virtual UBool operator==(const BreakIterator& that) const;
   194 
   195     /**
   196      * Not-equal operator.  If operator== returns TRUE, this returns FALSE,
   197      * and vice versa.
   198      * @param that The BreakIterator to be compared for inequality
   199      * @return TRUE if the two BreakIterators are not equal.
   200      *  @stable ICU 2.0
   201      */
   202     UBool operator!=(const BreakIterator& that) const;
   203 
   204     /**
   205      * Returns a newly-constructed RuleBasedBreakIterator with the same
   206      * behavior, and iterating over the same text, as this one.
   207      * Differs from the copy constructor in that it is polymorphic, and
   208      * will correctly clone (copy) a derived class.
   209      * clone() is thread safe.  Multiple threads may simultaneously
   210      * clone the same source break iterator.
   211      * @return a newly-constructed RuleBasedBreakIterator
   212      * @stable ICU 2.0
   213      */
   214     virtual BreakIterator* clone() const;
   215 
   216     /**
   217      * Compute a hash code for this BreakIterator
   218      * @return A hash code
   219      *  @stable ICU 2.0
   220      */
   221     virtual int32_t hashCode(void) const;
   222 
   223     /**
   224      * Returns the description used to create this iterator
   225      * @return the description used to create this iterator
   226      *  @stable ICU 2.0
   227      */
   228     virtual const UnicodeString& getRules(void) const;
   229 
   230     //=======================================================================
   231     // BreakIterator overrides
   232     //=======================================================================
   233 
   234     /**
   235      * Return a CharacterIterator over the text being analyzed.  This version
   236      * of this method returns the actual CharacterIterator we're using internally.
   237      * Changing the state of this iterator can have undefined consequences.  If
   238      * you need to change it, clone it first.
   239      * @return An iterator over the text being analyzed.
   240      *  @stable ICU 2.0
   241      */
   242     virtual const CharacterIterator& getText(void) const;
   243 
   244 
   245     /**
   246       *  Get a UText for the text being analyzed.
   247       *  The returned UText is a shallow clone of the UText used internally
   248       *  by the break iterator implementation.  It can safely be used to
   249       *  access the text without impacting any break iterator operations,
   250       *  but the underlying text itself must not be altered.
   251       *
   252       * @param fillIn A UText to be filled in.  If NULL, a new UText will be
   253       *           allocated to hold the result.
   254       * @param status receives any error codes.
   255       * @return   The current UText for this break iterator.  If an input
   256       *           UText was provided, it will always be returned.
   257       * @draft ICU 3.4
   258       */
   259      virtual UText *getUText(UText *fillIn, UErrorCode &status) const;
   260 
   261     /**
   262      * Set the iterator to analyze a new piece of text.  This function resets
   263      * the current iteration position to the beginning of the text.
   264      * @param newText An iterator over the text to analyze.  The BreakIterator
   265      * takes ownership of the character iterator.  The caller MUST NOT delete it!
   266      *  @stable ICU 2.0
   267      */
   268     virtual void adoptText(CharacterIterator* newText);
   269 
   270     /**
   271      * Set the iterator to analyze a new piece of text.  This function resets
   272      * the current iteration position to the beginning of the text.
   273      * @param newText The text to analyze.
   274      *  @stable ICU 2.0
   275      */
   276     virtual void setText(const UnicodeString& newText);
   277 
   278     /**
   279      * Reset the break iterator to operate over the text represented by
   280      * the UText.  The iterator position is reset to the start.
   281      *
   282      * This function makes a shallow clone of the supplied UText.  This means
   283      * that the caller is free to immediately close or otherwise reuse the
   284      * UText that was passed as a parameter, but that the underlying text itself
   285      * must not be altered while being referenced by the break iterator.
   286      *
   287      * @param text    The UText used to change the text.
   288      * @param status  Receives any error codes.
   289      * @draft ICU 3.4
   290      */
   291     virtual void  setText(UText *text, UErrorCode &status);
   292 
   293     /**
   294      * Sets the current iteration position to the beginning of the text.
   295      * (i.e., the CharacterIterator's starting offset).
   296      * @return The offset of the beginning of the text.
   297      *  @stable ICU 2.0
   298      */
   299     virtual int32_t first(void);
   300 
   301     /**
   302      * Sets the current iteration position to the end of the text.
   303      * (i.e., the CharacterIterator's ending offset).
   304      * @return The text's past-the-end offset.
   305      *  @stable ICU 2.0
   306      */
   307     virtual int32_t last(void);
   308 
   309     /**
   310      * Advances the iterator either forward or backward the specified number of steps.
   311      * Negative values move backward, and positive values move forward.  This is
   312      * equivalent to repeatedly calling next() or previous().
   313      * @param n The number of steps to move.  The sign indicates the direction
   314      * (negative is backwards, and positive is forwards).
   315      * @return The character offset of the boundary position n boundaries away from
   316      * the current one.
   317      *  @stable ICU 2.0
   318      */
   319     virtual int32_t next(int32_t n);
   320 
   321     /**
   322      * Advances the iterator to the next boundary position.
   323      * @return The position of the first boundary after this one.
   324      *  @stable ICU 2.0
   325      */
   326     virtual int32_t next(void);
   327 
   328     /**
   329      * Moves the iterator backwards, to the last boundary preceding this one.
   330      * @return The position of the last boundary position preceding this one.
   331      *  @stable ICU 2.0
   332      */
   333     virtual int32_t previous(void);
   334 
   335     /**
   336      * Sets the iterator to refer to the first boundary position following
   337      * the specified position.
   338      * @param offset The position from which to begin searching for a break position.
   339      * @return The position of the first break after the specified offset.
   340      *  @stable ICU 2.0
   341      */
   342     virtual int32_t following(int32_t offset);
   343 
   344     /**
   345      * Sets the iterator to refer to the last boundary position before the
   346      * specified position.
   347      * @param offset The position to begin searching for a break from.
   348      * @return The position of the last boundary before the starting position.
   349      *  @stable ICU 2.0
   350      */
   351     virtual int32_t preceding(int32_t offset);
   352 
   353     /**
   354      * Returns true if the specified position is a boundary position.  As a side
   355      * effect, leaves the iterator pointing to the first boundary position at
   356      * or after "offset".
   357      * @param offset the offset to check.
   358      * @return True if "offset" is a boundary position.
   359      *  @stable ICU 2.0
   360      */
   361     virtual UBool isBoundary(int32_t offset);
   362 
   363     /**
   364      * Returns the current iteration position.
   365      * @return The current iteration position.
   366      * @stable ICU 2.0
   367      */
   368     virtual int32_t current(void) const;
   369 
   370 
   371     /**
   372      * Return the status tag from the break rule that determined the most recently
   373      * returned break position.  For break rules that do not specify a
   374      * status, a default value of 0 is returned.  If more than one break rule
   375      * would cause a boundary to be located at some position in the text,
   376      * the numerically largest of the applicable status values is returned.
   377      * <p>
   378      * Of the standard types of ICU break iterators, only word break and
   379      * line break provide status values.  The values are defined in
   380      * the header file ubrk.h.  For Word breaks, the status allows distinguishing between words
   381      * that contain alphabetic letters, "words" that appear to be numbers,
   382      * punctuation and spaces, words containing ideographic characters, and
   383      * more.  For Line Break, the status distinguishes between hard (mandatory) breaks
   384      * and soft (potential) break positions.
   385      * <p>
   386      * <code>getRuleStatus()</code> can be called after obtaining a boundary
   387      * position from <code>next()</code>, <code>previous()</code>, or
   388      * any other break iterator function that returns a boundary position.
   389      * <p>
   390      * When creating custom break rules, one is free to define whatever
   391      * status values may be convenient for the application.
   392      * <p>
   393      * Note: this function is not thread safe.  It should not have been
   394      *       declared const, and the const remains only for compatibility
   395      *       reasons.  (The function is logically const, but not bit-wise const).
   396      * <p>
   397      * @return the status from the break rule that determined the most recently
   398      * returned break position.
   399      *
   400      * @see UWordBreak
   401      * @stable ICU 2.2
   402      */
   403     virtual int32_t getRuleStatus() const;
   404 
   405    /**
   406     * Get the status (tag) values from the break rule(s) that determined the most
   407     * recently returned break position.
   408     * <p>
   409     * The returned status value(s) are stored into an array provided by the caller.
   410     * The values are stored in sorted (ascending) order.
   411     * If the capacity of the output array is insufficient to hold the data,
   412     *  the output will be truncated to the available length, and a
   413     *  U_BUFFER_OVERFLOW_ERROR will be signaled.
   414     *
   415     * @param fillInVec an array to be filled in with the status values.
   416     * @param capacity  the length of the supplied vector.  A length of zero causes
   417     *                  the function to return the number of status values, in the
   418     *                  normal way, without attempting to store any values.
   419     * @param status    receives error codes.
   420     * @return          The number of rule status values from rules that determined
   421     *                  the most recent boundary returned by the break iterator.
   422     *                  In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
   423     *                  is the total number of status values that were available,
   424     *                  not the reduced number that were actually returned.
   425     * @see getRuleStatus
   426     * @draft ICU 3.0
   427     */
   428     virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
   429 
   430     /**
   431      * Returns a unique class ID POLYMORPHICALLY.  Pure virtual override.
   432      * This method is to implement a simple version of RTTI, since not all
   433      * C++ compilers support genuine RTTI.  Polymorphic operator==() and
   434      * clone() methods call this method.
   435      *
   436      * @return          The class ID for this object. All objects of a
   437      *                  given class have the same class ID.  Objects of
   438      *                  other classes have different class IDs.
   439      * @stable ICU 2.0
   440      */
   441     virtual UClassID getDynamicClassID(void) const;
   442 
   443     /**
   444      * Returns the class ID for this class.  This is useful only for
   445      * comparing to a return value from getDynamicClassID().  For example:
   446      *
   447      *      Base* polymorphic_pointer = createPolymorphicObject();
   448      *      if (polymorphic_pointer->getDynamicClassID() ==
   449      *          Derived::getStaticClassID()) ...
   450      *
   451      * @return          The class ID for all objects of this class.
   452      * @stable ICU 2.0
   453      */
   454     static UClassID U_EXPORT2 getStaticClassID(void);
   455 
   456     /*
   457      * Create a clone (copy) of this break iterator in memory provided
   458      *  by the caller.  The idea is to increase performance by avoiding
   459      *  a storage allocation.  Use of this function is NOT RECOMMENDED.
   460      *  Performance gains are minimal, and correct buffer management is
   461      *  tricky.  Use clone() instead.
   462      *
   463      * @param stackBuffer  The pointer to the memory into which the cloned object
   464      *                     should be placed.  If NULL,  allocate heap memory
   465      *                     for the cloned object.
   466      * @param BufferSize   The size of the buffer.  If zero, return the required
   467      *                     buffer size, but do not clone the object.  If the
   468      *                     size was too small (but not zero), allocate heap
   469      *                     storage for the cloned object.
   470      *
   471      * @param status       Error status.  U_SAFECLONE_ALLOCATED_WARNING will be
   472      *                     returned if the provided buffer was too small, and
   473      *                     the clone was therefore put on the heap.
   474      *
   475      * @return  Pointer to the clone object.  This may differ from the stackBuffer
   476      *          address if the byte alignment of the stack buffer was not suitable
   477      *          or if the stackBuffer was too small to hold the clone.
   478      * @stable ICU 2.0
   479      */
   480     virtual BreakIterator *  createBufferClone(void *stackBuffer,
   481                                                int32_t &BufferSize,
   482                                                UErrorCode &status);
   483 
   484 
   485     /**
   486      * Return the binary form of compiled break rules,
   487      * which can then be used to create a new break iterator at some
   488      * time in the future.  Creating a break iterator from pre-compiled rules
   489      * is much faster than building one from the source form of the
   490      * break rules.
   491      *
   492      * The binary data can only be used with the same version of ICU
   493      *  and on the same platform type (processor endian-ness)
   494      *
   495      * @param length Returns the length of the binary data.  (Out parameter.)
   496      *
   497      * @return   A pointer to the binary (compiled) rule data.  The storage
   498      *           belongs to the RuleBasedBreakIterator object, not the
   499      *           caller, and must not be modified or deleted.
   500      * @internal
   501      */
   502     virtual const uint8_t *getBinaryRules(uint32_t &length);
   503 
   504 
   505 protected:
   506     //=======================================================================
   507     // implementation
   508     //=======================================================================
   509     /**
   510      * This method is the actual implementation of the next() method.  All iteration
   511      * vectors through here.  This method initializes the state machine to state 1
   512      * and advances through the text character by character until we reach the end
   513      * of the text or the state machine transitions to state 0.  We update our return
   514      * value every time the state machine passes through a possible end state.
   515      * @internal
   516      */
   517     virtual int32_t handleNext(void);
   518 
   519     /**
   520      * This method backs the iterator back up to a "safe position" in the text.
   521      * This is a position that we know, without any context, must be a break position.
   522      * The various calling methods then iterate forward from this safe position to
   523      * the appropriate position to return.  (For more information, see the description
   524      * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
   525      * @internal
   526      */
   527     virtual int32_t handlePrevious(void);
   528 
   529     /**
   530      * Dumps caches and performs other actions associated with a complete change
   531      * in text or iteration position.  This function is a no-op in RuleBasedBreakIterator,
   532      * but subclasses can and do override it.
   533      * @internal
   534      */
   535     virtual void reset(void);
   536 
   537     /**
   538       * Return true if the category lookup for this char
   539       * indicates that it is in the set of dictionary lookup chars.
   540       * This function is intended for use by dictionary based break iterators.
   541       * @return true if the category lookup for this char
   542       * indicates that it is in the set of dictionary lookup chars.
   543       * @internal
   544       */
   545     virtual UBool isDictionaryChar(UChar32);
   546 
   547     /**
   548       * Common initialization function, used by constructors and bufferClone.
   549       *   (Also used by DictionaryBasedBreakIterator::createBufferClone().)
   550       * @internal
   551       */
   552     void init();
   553 
   554 private:
   555 
   556     /**
   557      * This method backs the iterator back up to a "safe position" in the text.
   558      * This is a position that we know, without any context, must be a break position.
   559      * The various calling methods then iterate forward from this safe position to
   560      * the appropriate position to return.  (For more information, see the description
   561      * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
   562      * @param statetable state table used for moving backwards
   563      * @internal
   564      */
   565     int32_t handlePrevious(const RBBIStateTable *statetable);
   566 
   567     /**
   568      * This method is the actual implementation of the next() method.  All iteration
   569      * vectors through here.  This method initializes the state machine to state 1
   570      * and advances through the text character by character until we reach the end
   571      * of the text or the state machine transitions to state 0.  We update our return
   572      * value every time the state machine passes through a possible end state.
   573      * @param statetable state table used for moving forwards
   574      * @internal
   575      */
   576     int32_t handleNext(const RBBIStateTable *statetable);
   577 
   578     /**
   579      *  @internal
   580      */
   581     void makeRuleStatusValid();
   582 
   583 };
   584 
   585 //------------------------------------------------------------------------------
   586 //
   587 //   Inline Functions Definitions ...
   588 //
   589 //------------------------------------------------------------------------------
   590 
   591 inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
   592     return !(this->operator==(that));   // inequality is defined purely as the negation of the virtual operator==
   593 }
   594 
   595 U_NAMESPACE_END
   596 
   597 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   598 
   599 #endif