os/textandloc/fontservices/textshaperplugin/IcuSource/common/brkdict.h
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
/*
sl@0
     2
**********************************************************************
sl@0
     3
*   Copyright (C) 1999-2004 IBM and others. All rights reserved.
sl@0
     4
**********************************************************************
sl@0
     5
*   Date        Name        Description
sl@0
     6
*   12/1/99     rtg         Ported from Java
sl@0
     7
*   01/13/2000  helena      Added UErrorCode to ctors.
sl@0
     8
**********************************************************************
sl@0
     9
*/
sl@0
    10
sl@0
    11
#ifndef BRKDICT_H
sl@0
    12
#define BRKDICT_H
sl@0
    13
sl@0
    14
#include "unicode/utypes.h"
sl@0
    15
#include "unicode/uobject.h"
sl@0
    16
#include "ucmp8.h"
sl@0
    17
sl@0
    18
U_NAMESPACE_BEGIN
sl@0
    19
sl@0
    20
/**
sl@0
    21
 * This is the class that represents the list of known words used by
sl@0
    22
 * DictionaryBasedBreakIterator.  The conceptual data structure used
sl@0
    23
 * here is a trie: there is a node hanging off the root node for every
sl@0
    24
 * letter that can start a word.  Each of these nodes has a node hanging
sl@0
    25
 * off of it for every letter that can be the second letter of a word
sl@0
    26
 * if this node is the first letter, and so on.  The trie is represented
sl@0
    27
 * as a two-dimensional array that can be treated as a table of state
sl@0
    28
 * transitions.  Indexes are used to compress this array, taking
sl@0
    29
 * advantage of the fact that this array will always be very sparse.
sl@0
    30
 */
sl@0
    31
class BreakDictionary : public UMemory {
sl@0
    32
    //=================================================================================
sl@0
    33
    // data members
sl@0
    34
    //=================================================================================
sl@0
    35
private:
sl@0
    36
sl@0
    37
    /**
sl@0
    38
     * Maps from characters to column numbers.  The main use of this is to
sl@0
    39
     * avoid making room in the array for empty columns.
sl@0
    40
     */
sl@0
    41
    CompactByteArray* columnMap;
sl@0
    42
sl@0
    43
    /**
sl@0
    44
     * The number of actual columns in the table
sl@0
    45
     */
sl@0
    46
    int32_t numCols;
sl@0
    47
sl@0
    48
    /**
sl@0
    49
     * Columns are organized into groups of 32.  This says how many
sl@0
    50
     * column groups.  (We could calculate this, but we store the
sl@0
    51
     * value to avoid having to repeatedly calculate it.)
sl@0
    52
     */
sl@0
    53
    int32_t numColGroups;
sl@0
    54
sl@0
    55
    /**
sl@0
    56
     * The actual compressed state table.  Each conceptual row represents
sl@0
    57
     * a state, and the cells in it contain the row numbers of the states
sl@0
    58
     * to transition to for each possible letter.  0 is used to indicate
sl@0
    59
     * an illegal combination of letters (i.e., the error state).  The
sl@0
    60
     * table is compressed by eliminating all the unpopulated (i.e., zero)
sl@0
    61
     * cells.  Multiple conceptual rows can then be doubled up in a single
sl@0
    62
     * physical row by sliding them up and possibly shifting them to one
sl@0
    63
     * side or the other so the populated cells don't collide.  Indexes
sl@0
    64
     * are used to identify unpopulated cells and to locate populated cells.
sl@0
    65
     */
sl@0
    66
    int16_t* table;
sl@0
    67
sl@0
    68
    /**
sl@0
    69
     * This index maps logical row numbers to physical row numbers
sl@0
    70
     */
sl@0
    71
    int16_t* rowIndex;
sl@0
    72
sl@0
    73
    /**
sl@0
    74
     * A bitmap is used to tell which cells in the comceptual table are
sl@0
    75
     * populated.  This array contains all the unique bit combinations
sl@0
    76
     * in that bitmap.  If the table is more than 32 columns wide,
sl@0
    77
     * successive entries in this array are used for a single row.
sl@0
    78
     */
sl@0
    79
    int32_t* rowIndexFlags;
sl@0
    80
sl@0
    81
    /**
sl@0
    82
     * This index maps from a logical row number into the bitmap table above.
sl@0
    83
     * (This keeps us from storing duplicate bitmap combinations.)  Since there
sl@0
    84
     * are a lot of rows with only one populated cell, instead of wasting space
sl@0
    85
     * in the bitmap table, we just store a negative number in this index for
sl@0
    86
     * rows with one populated cell.  The absolute value of that number is
sl@0
    87
     * the column number of the populated cell.
sl@0
    88
     */
sl@0
    89
    int16_t* rowIndexFlagsIndex;
sl@0
    90
sl@0
    91
    /**
sl@0
    92
     * For each logical row, this index contains a constant that is added to
sl@0
    93
     * the logical column number to get the physical column number
sl@0
    94
     */
sl@0
    95
    int8_t* rowIndexShifts;
sl@0
    96
sl@0
    97
    //=================================================================================
sl@0
    98
    // deserialization
sl@0
    99
    //=================================================================================
sl@0
   100
sl@0
   101
public:
sl@0
   102
    /**
sl@0
   103
     * Constructor.  Creates the BreakDictionary by using readDictionaryFile() to
sl@0
   104
     * load the dictionary tables from the disk.
sl@0
   105
     * @param dictionaryFilename The name of the dictionary file
sl@0
   106
     * @param status for errors if it occurs
sl@0
   107
     */
sl@0
   108
    BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
sl@0
   109
sl@0
   110
    /**
sl@0
   111
     * Destructor.
sl@0
   112
     */
sl@0
   113
    ~BreakDictionary();
sl@0
   114
sl@0
   115
    /**
sl@0
   116
     * Reads the dictionary file on the disk and constructs the appropriate in-memory
sl@0
   117
     * representation.
sl@0
   118
     * @param in The given memory stream
sl@0
   119
     */
sl@0
   120
    void readDictionaryFile(const uint8_t * in);
sl@0
   121
sl@0
   122
    //=================================================================================
sl@0
   123
    // access to the words
sl@0
   124
    //=================================================================================
sl@0
   125
sl@0
   126
    /**
sl@0
   127
     * Uses the column map to map the character to a column number, then
sl@0
   128
     * passes the row and column number to the other version of at()
sl@0
   129
     * @param row The current state
sl@0
   130
     * @param ch The character whose column we're interested in
sl@0
   131
     * @return The new state to transition to
sl@0
   132
     */
sl@0
   133
    int16_t at(int32_t row, UChar ch) const;
sl@0
   134
sl@0
   135
    /**
sl@0
   136
     * Returns the value in the cell with the specified (logical) row and
sl@0
   137
     * column numbers.  In DictionaryBasedBreakIterator, the row number is
sl@0
   138
     * a state number, the column number is an input, and the return value
sl@0
   139
     * is the row number of the new state to transition to.  (0 is the
sl@0
   140
     * "error" state, and -1 is the "end of word" state in a dictionary)
sl@0
   141
     * @param row The row number of the current state
sl@0
   142
     * @param col The column number of the input character (0 means "not a
sl@0
   143
     * dictionary character")
sl@0
   144
     * @return The row number of the new state to transition to
sl@0
   145
     */
sl@0
   146
    int16_t at(int32_t row, int32_t col) const;
sl@0
   147
sl@0
   148
private:
sl@0
   149
    /**
sl@0
   150
     * Given (logical) row and column numbers, returns true if the
sl@0
   151
     * cell in that position is populated
sl@0
   152
     * @param row The LOGICAL row number of the cell
sl@0
   153
     * @param col The PHYSICAL row number of the cell
sl@0
   154
     * @return true if the cell in that position is populated
sl@0
   155
     */
sl@0
   156
    UBool cellIsPopulated(int32_t row, int32_t col) const;
sl@0
   157
sl@0
   158
    /**
sl@0
   159
     * Implementation of at() when we know the specified cell is populated.
sl@0
   160
     * @param row The PHYSICAL row number of the cell
sl@0
   161
     * @param col The PHYSICAL column number of the cell
sl@0
   162
     * @return The value stored in the cell
sl@0
   163
     */
sl@0
   164
    int16_t internalAt(int32_t row, int32_t col) const;
sl@0
   165
sl@0
   166
    // the following methods are never meant to be called and so are not defined
sl@0
   167
    // (if you don't declare them, you get default implementations)
sl@0
   168
    BreakDictionary(const BreakDictionary& that);
sl@0
   169
    BreakDictionary& operator=(const BreakDictionary& that);
sl@0
   170
};
sl@0
   171
sl@0
   172
U_NAMESPACE_END
sl@0
   173
sl@0
   174
#endif