Update contrib.
2 **********************************************************************
3 * Copyright (C) 1999-2004 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 12/1/99 rtg Ported from Java
7 * 01/13/2000 helena Added UErrorCode to ctors.
8 **********************************************************************
14 #include "unicode/utypes.h"
15 #include "unicode/uobject.h"
21 * This is the class that represents the list of known words used by
22 * DictionaryBasedBreakIterator. The conceptual data structure used
23 * here is a trie: there is a node hanging off the root node for every
24 * letter that can start a word. Each of these nodes has a node hanging
25 * off of it for every letter that can be the second letter of a word
26 * if this node is the first letter, and so on. The trie is represented
27 * as a two-dimensional array that can be treated as a table of state
28 * transitions. Indexes are used to compress this array, taking
29 * advantage of the fact that this array will always be very sparse.
31 class BreakDictionary : public UMemory {
32 //=================================================================================
34 //=================================================================================
38 * Maps from characters to column numbers. The main use of this is to
39 * avoid making room in the array for empty columns.
41 CompactByteArray* columnMap;
44 * The number of actual columns in the table
49 * Columns are organized into groups of 32. This says how many
50 * column groups there are. (We could calculate this, but we store the
51 * value to avoid having to repeatedly calculate it.)
56 * The actual compressed state table. Each conceptual row represents
57 * a state, and the cells in it contain the row numbers of the states
58 * to transition to for each possible letter. 0 is used to indicate
59 * an illegal combination of letters (i.e., the error state). The
60 * table is compressed by eliminating all the unpopulated (i.e., zero)
61 * cells. Multiple conceptual rows can then be doubled up in a single
62 * physical row by sliding them up and possibly shifting them to one
63 * side or the other so the populated cells don't collide. Indexes
64 * are used to identify unpopulated cells and to locate populated cells.
69 * This index maps logical row numbers to physical row numbers
74 * A bitmap is used to tell which cells in the conceptual table are
75 * populated. This array contains all the unique bit combinations
76 * in that bitmap. If the table is more than 32 columns wide,
77 * successive entries in this array are used for a single row.
79 int32_t* rowIndexFlags;
82 * This index maps from a logical row number into the bitmap table above.
83 * (This keeps us from storing duplicate bitmap combinations.) Since there
84 * are a lot of rows with only one populated cell, instead of wasting space
85 * in the bitmap table, we just store a negative number in this index for
86 * rows with one populated cell. The absolute value of that number is
87 * the column number of the populated cell.
89 int16_t* rowIndexFlagsIndex;
92 * For each logical row, this index contains a constant that is added to
93 * the logical column number to get the physical column number
95 int8_t* rowIndexShifts;
97 //=================================================================================
99 //=================================================================================
103 * Constructor. Creates the BreakDictionary by using readDictionaryFile() to
104 * load the dictionary tables from the disk.
105 * @param dictionaryFilename The name of the dictionary file
106 * @param status Receives the error code if an error occurs
108 BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
116 * Reads the dictionary file on the disk and constructs the appropriate in-memory representation.
118 * @param in The given memory stream
120 void readDictionaryFile(const uint8_t * in);
122 //=================================================================================
123 // access to the words
124 //=================================================================================
127 * Uses the column map to map the character to a column number, then
128 * passes the row and column number to the other version of at()
129 * @param row The current state
130 * @param ch The character whose column we're interested in
131 * @return The new state to transition to
133 int16_t at(int32_t row, UChar ch) const;
136 * Returns the value in the cell with the specified (logical) row and
137 * column numbers. In DictionaryBasedBreakIterator, the row number is
138 * a state number, the column number is an input, and the return value
139 * is the row number of the new state to transition to. (0 is the
140 * "error" state, and -1 is the "end of word" state in a dictionary)
141 * @param row The row number of the current state
142 * @param col The column number of the input character (0 means "not a
143 * dictionary character")
144 * @return The row number of the new state to transition to
146 int16_t at(int32_t row, int32_t col) const;
150 * Given (logical) row and column numbers, returns true if the
151 * cell in that position is populated
152 * @param row The LOGICAL row number of the cell
153 * @param col The LOGICAL column number of the cell
154 * @return true if the cell in that position is populated
156 UBool cellIsPopulated(int32_t row, int32_t col) const;
159 * Implementation of at() when we know the specified cell is populated.
160 * @param row The PHYSICAL row number of the cell
161 * @param col The PHYSICAL column number of the cell
162 * @return The value stored in the cell
164 int16_t internalAt(int32_t row, int32_t col) const;
166 // the following methods are never meant to be called and so are not defined
167 // (if you don't declare them, you get default implementations)
168 BreakDictionary(const BreakDictionary& that);
169 BreakDictionary& operator=(const BreakDictionary& that);