os/textandloc/fontservices/textshaperplugin/IcuSource/common/brkdict.h
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 **********************************************************************
     3 *   Copyright (C) 1999-2004 IBM and others. All rights reserved.
     4 **********************************************************************
     5 *   Date        Name        Description
     6 *   12/1/99     rtg         Ported from Java
     7 *   01/13/2000  helena      Added UErrorCode to ctors.
     8 **********************************************************************
     9 */
    10 
    11 #ifndef BRKDICT_H
    12 #define BRKDICT_H
    13 
    14 #include "unicode/utypes.h"
    15 #include "unicode/uobject.h"
    16 #include "ucmp8.h"
    17 
    18 U_NAMESPACE_BEGIN
    19 
    20 /**
    21  * This is the class that represents the list of known words used by
    22  * DictionaryBasedBreakIterator.  The conceptual data structure used
    23  * here is a trie: there is a node hanging off the root node for every
    24  * letter that can start a word.  Each of these nodes has a node hanging
    25  * off of it for every letter that can be the second letter of a word
    26  * if this node is the first letter, and so on.  The trie is represented
    27  * as a two-dimensional array that can be treated as a table of state
    28  * transitions.  Indexes are used to compress this array, taking
    29  * advantage of the fact that this array will always be very sparse.
    30  */
    31 class BreakDictionary : public UMemory {
    32     //=================================================================================
    33     // data members
    34     //=================================================================================
    35 private:
    36 
    37     /**
    38      * Maps from characters to column numbers.  The main use of this is to
    39      * avoid making room in the array for empty columns.
    40      */
    41     CompactByteArray* columnMap;
    42 
    43     /**
    44      * The number of actual columns in the table
    45      */
    46     int32_t numCols;
    47 
    48     /**
    49      * Columns are organized into groups of 32.  This says how many
    50      * column groups.  (We could calculate this, but we store the
    51      * value to avoid having to repeatedly calculate it.)
    52      */
    53     int32_t numColGroups;
    54 
    55     /**
    56      * The actual compressed state table.  Each conceptual row represents
    57      * a state, and the cells in it contain the row numbers of the states
    58      * to transition to for each possible letter.  0 is used to indicate
    59      * an illegal combination of letters (i.e., the error state).  The
    60      * table is compressed by eliminating all the unpopulated (i.e., zero)
    61      * cells.  Multiple conceptual rows can then be doubled up in a single
    62      * physical row by sliding them up and possibly shifting them to one
    63      * side or the other so the populated cells don't collide.  Indexes
    64      * are used to identify unpopulated cells and to locate populated cells.
    65      */
    66     int16_t* table;
    67 
    68     /**
    69      * This index maps logical row numbers to physical row numbers
    70      */
    71     int16_t* rowIndex;
    72 
    73     /**
    74      * A bitmap is used to tell which cells in the comceptual table are
    75      * populated.  This array contains all the unique bit combinations
    76      * in that bitmap.  If the table is more than 32 columns wide,
    77      * successive entries in this array are used for a single row.
    78      */
    79     int32_t* rowIndexFlags;
    80 
    81     /**
    82      * This index maps from a logical row number into the bitmap table above.
    83      * (This keeps us from storing duplicate bitmap combinations.)  Since there
    84      * are a lot of rows with only one populated cell, instead of wasting space
    85      * in the bitmap table, we just store a negative number in this index for
    86      * rows with one populated cell.  The absolute value of that number is
    87      * the column number of the populated cell.
    88      */
    89     int16_t* rowIndexFlagsIndex;
    90 
    91     /**
    92      * For each logical row, this index contains a constant that is added to
    93      * the logical column number to get the physical column number
    94      */
    95     int8_t* rowIndexShifts;
    96 
    97     //=================================================================================
    98     // deserialization
    99     //=================================================================================
   100 
   101 public:
   102     /**
   103      * Constructor.  Creates the BreakDictionary by using readDictionaryFile() to
   104      * load the dictionary tables from the disk.
   105      * @param dictionaryFilename The name of the dictionary file
   106      * @param status for errors if it occurs
   107      */
   108     BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
   109 
   110     /**
   111      * Destructor.
   112      */
   113     ~BreakDictionary();
   114 
   115     /**
   116      * Reads the dictionary file on the disk and constructs the appropriate in-memory
   117      * representation.
   118      * @param in The given memory stream
   119      */
   120     void readDictionaryFile(const uint8_t * in);
   121 
   122     //=================================================================================
   123     // access to the words
   124     //=================================================================================
   125 
   126     /**
   127      * Uses the column map to map the character to a column number, then
   128      * passes the row and column number to the other version of at()
   129      * @param row The current state
   130      * @param ch The character whose column we're interested in
   131      * @return The new state to transition to
   132      */
   133     int16_t at(int32_t row, UChar ch) const;
   134 
   135     /**
   136      * Returns the value in the cell with the specified (logical) row and
   137      * column numbers.  In DictionaryBasedBreakIterator, the row number is
   138      * a state number, the column number is an input, and the return value
   139      * is the row number of the new state to transition to.  (0 is the
   140      * "error" state, and -1 is the "end of word" state in a dictionary)
   141      * @param row The row number of the current state
   142      * @param col The column number of the input character (0 means "not a
   143      * dictionary character")
   144      * @return The row number of the new state to transition to
   145      */
   146     int16_t at(int32_t row, int32_t col) const;
   147 
   148 private:
   149     /**
   150      * Given (logical) row and column numbers, returns true if the
   151      * cell in that position is populated
   152      * @param row The LOGICAL row number of the cell
   153      * @param col The PHYSICAL row number of the cell
   154      * @return true if the cell in that position is populated
   155      */
   156     UBool cellIsPopulated(int32_t row, int32_t col) const;
   157 
   158     /**
   159      * Implementation of at() when we know the specified cell is populated.
   160      * @param row The PHYSICAL row number of the cell
   161      * @param col The PHYSICAL column number of the cell
   162      * @return The value stored in the cell
   163      */
   164     int16_t internalAt(int32_t row, int32_t col) const;
   165 
   166     // the following methods are never meant to be called and so are not defined
   167     // (if you don't declare them, you get default implementations)
   168     BreakDictionary(const BreakDictionary& that);
   169     BreakDictionary& operator=(const BreakDictionary& that);
   170 };
   171 
   172 U_NAMESPACE_END
   173 
   174 #endif