Symaptic: os/textandloc/fontservices/textshaperplugin/IcuSource/common/brkdict.h@260cb5ec6c19

     1 /*

     2 **********************************************************************

     3 *   Copyright (C) 1999-2004 IBM and others. All rights reserved.

     4 **********************************************************************

     5 *   Date        Name        Description

     6 *   12/1/99     rtg         Ported from Java

     7 *   01/13/2000  helena      Added UErrorCode to ctors.

     8 **********************************************************************

     9 */

    11 #ifndef BRKDICT_H

    12 #define BRKDICT_H

    14 #include "unicode/utypes.h"

    15 #include "unicode/uobject.h"

    16 #include "ucmp8.h"

    18 U_NAMESPACE_BEGIN

    20 /**

    21  * This is the class that represents the list of known words used by

    22  * DictionaryBasedBreakIterator.  The conceptual data structure used

    23  * here is a trie: there is a node hanging off the root node for every

    24  * letter that can start a word.  Each of these nodes has a node hanging

    25  * off of it for every letter that can be the second letter of a word

    26  * if this node is the first letter, and so on.  The trie is represented

    27  * as a two-dimensional array that can be treated as a table of state

    28  * transitions.  Indexes are used to compress this array, taking

    29  * advantage of the fact that this array will always be very sparse.

    30  */

    31 class BreakDictionary : public UMemory {

    32     //=================================================================================

    33     // data members

    34     //=================================================================================

    35 private:

    37     /**

    38      * Maps from characters to column numbers.  The main use of this is to

    39      * avoid making room in the array for empty columns.

    40      */

    41     CompactByteArray* columnMap;

    43     /**

    44      * The number of actual columns in the table

    45      */

    46     int32_t numCols;

    48     /**

    49      * Columns are organized into groups of 32.  This says how many

    50      * column groups.  (We could calculate this, but we store the

    51      * value to avoid having to repeatedly calculate it.)

    52      */

    53     int32_t numColGroups;

    55     /**

    56      * The actual compressed state table.  Each conceptual row represents

    57      * a state, and the cells in it contain the row numbers of the states

    58      * to transition to for each possible letter.  0 is used to indicate

    59      * an illegal combination of letters (i.e., the error state).  The

    60      * table is compressed by eliminating all the unpopulated (i.e., zero)

    61      * cells.  Multiple conceptual rows can then be doubled up in a single

    62      * physical row by sliding them up and possibly shifting them to one

    63      * side or the other so the populated cells don't collide.  Indexes

    64      * are used to identify unpopulated cells and to locate populated cells.

    65      */

    66     int16_t* table;

    68     /**

    69      * This index maps logical row numbers to physical row numbers

    70      */

    71     int16_t* rowIndex;

    73     /**

    74      * A bitmap is used to tell which cells in the comceptual table are

    75      * populated.  This array contains all the unique bit combinations

    76      * in that bitmap.  If the table is more than 32 columns wide,

    77      * successive entries in this array are used for a single row.

    78      */

    79     int32_t* rowIndexFlags;

    81     /**

    82      * This index maps from a logical row number into the bitmap table above.

    83      * (This keeps us from storing duplicate bitmap combinations.)  Since there

    84      * are a lot of rows with only one populated cell, instead of wasting space

    85      * in the bitmap table, we just store a negative number in this index for

    86      * rows with one populated cell.  The absolute value of that number is

    87      * the column number of the populated cell.

    88      */

    89     int16_t* rowIndexFlagsIndex;

    91     /**

    92      * For each logical row, this index contains a constant that is added to

    93      * the logical column number to get the physical column number

    94      */

    95     int8_t* rowIndexShifts;

    97     //=================================================================================

    98     // deserialization

    99     //=================================================================================

   101 public:

   102     /**

   103      * Constructor.  Creates the BreakDictionary by using readDictionaryFile() to

   104      * load the dictionary tables from the disk.

   105      * @param dictionaryFilename The name of the dictionary file

   106      * @param status for errors if it occurs

   107      */

   108     BreakDictionary(const char* dictionaryFilename, UErrorCode& status);

   110     /**

   111      * Destructor.

   112      */

   113     ~BreakDictionary();

   115     /**

   116      * Reads the dictionary file on the disk and constructs the appropriate in-memory

   117      * representation.

   118      * @param in The given memory stream

   119      */

   120     void readDictionaryFile(const uint8_t * in);

   122     //=================================================================================

   123     // access to the words

   124     //=================================================================================

   126     /**

   127      * Uses the column map to map the character to a column number, then

   128      * passes the row and column number to the other version of at()

   129      * @param row The current state

   130      * @param ch The character whose column we're interested in

   131      * @return The new state to transition to

   132      */

   133     int16_t at(int32_t row, UChar ch) const;

   135     /**

   136      * Returns the value in the cell with the specified (logical) row and

   137      * column numbers.  In DictionaryBasedBreakIterator, the row number is

   138      * a state number, the column number is an input, and the return value

   139      * is the row number of the new state to transition to.  (0 is the

   140      * "error" state, and -1 is the "end of word" state in a dictionary)

   141      * @param row The row number of the current state

   142      * @param col The column number of the input character (0 means "not a

   143      * dictionary character")

   144      * @return The row number of the new state to transition to

   145      */

   146     int16_t at(int32_t row, int32_t col) const;

   148 private:

   149     /**

   150      * Given (logical) row and column numbers, returns true if the

   151      * cell in that position is populated

   152      * @param row The LOGICAL row number of the cell

   153      * @param col The PHYSICAL row number of the cell

   154      * @return true if the cell in that position is populated

   155      */

   156     UBool cellIsPopulated(int32_t row, int32_t col) const;

   158     /**

   159      * Implementation of at() when we know the specified cell is populated.

   160      * @param row The PHYSICAL row number of the cell

   161      * @param col The PHYSICAL column number of the cell

   162      * @return The value stored in the cell

   163      */

   164     int16_t internalAt(int32_t row, int32_t col) const;

   166     // the following methods are never meant to be called and so are not defined

   167     // (if you don't declare them, you get default implementations)

   168     BreakDictionary(const BreakDictionary& that);

   169     BreakDictionary& operator=(const BreakDictionary& that);

   170 };

   172 U_NAMESPACE_END

   174 #endif

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--