sl@0
|
1 |
/*
|
sl@0
|
2 |
**********************************************************************
|
sl@0
|
3 |
* Copyright (C) 1999-2004 IBM and others. All rights reserved.
|
sl@0
|
4 |
**********************************************************************
|
sl@0
|
5 |
* Date Name Description
|
sl@0
|
6 |
* 12/1/99 rtg Ported from Java
|
sl@0
|
7 |
* 01/13/2000 helena Added UErrorCode to ctors.
|
sl@0
|
8 |
**********************************************************************
|
sl@0
|
9 |
*/
|
sl@0
|
10 |
|
sl@0
|
11 |
#ifndef BRKDICT_H
|
sl@0
|
12 |
#define BRKDICT_H
|
sl@0
|
13 |
|
sl@0
|
14 |
#include "unicode/utypes.h"
|
sl@0
|
15 |
#include "unicode/uobject.h"
|
sl@0
|
16 |
#include "ucmp8.h"
|
sl@0
|
17 |
|
sl@0
|
18 |
U_NAMESPACE_BEGIN
|
sl@0
|
19 |
|
sl@0
|
20 |
/**
|
sl@0
|
21 |
* This is the class that represents the list of known words used by
|
sl@0
|
22 |
* DictionaryBasedBreakIterator. The conceptual data structure used
|
sl@0
|
23 |
* here is a trie: there is a node hanging off the root node for every
|
sl@0
|
24 |
* letter that can start a word. Each of these nodes has a node hanging
|
sl@0
|
25 |
* off of it for every letter that can be the second letter of a word
|
sl@0
|
26 |
* if this node is the first letter, and so on. The trie is represented
|
sl@0
|
27 |
* as a two-dimensional array that can be treated as a table of state
|
sl@0
|
28 |
* transitions. Indexes are used to compress this array, taking
|
sl@0
|
29 |
* advantage of the fact that this array will always be very sparse.
|
sl@0
|
30 |
*/
|
sl@0
|
31 |
class BreakDictionary : public UMemory {
|
sl@0
|
32 |
//=================================================================================
|
sl@0
|
33 |
// data members
|
sl@0
|
34 |
//=================================================================================
|
sl@0
|
35 |
private:
|
sl@0
|
36 |
|
sl@0
|
37 |
/**
|
sl@0
|
38 |
* Maps from characters to column numbers. The main use of this is to
|
sl@0
|
39 |
* avoid making room in the array for empty columns.
|
sl@0
|
40 |
*/
|
sl@0
|
41 |
CompactByteArray* columnMap;
|
sl@0
|
42 |
|
sl@0
|
43 |
/**
|
sl@0
|
44 |
* The number of actual columns in the table
|
sl@0
|
45 |
*/
|
sl@0
|
46 |
int32_t numCols;
|
sl@0
|
47 |
|
sl@0
|
48 |
/**
|
sl@0
|
49 |
* Columns are organized into groups of 32. This says how many
|
sl@0
|
50 |
* column groups. (We could calculate this, but we store the
|
sl@0
|
51 |
* value to avoid having to repeatedly calculate it.)
|
sl@0
|
52 |
*/
|
sl@0
|
53 |
int32_t numColGroups;
|
sl@0
|
54 |
|
sl@0
|
55 |
/**
|
sl@0
|
56 |
* The actual compressed state table. Each conceptual row represents
|
sl@0
|
57 |
* a state, and the cells in it contain the row numbers of the states
|
sl@0
|
58 |
* to transition to for each possible letter. 0 is used to indicate
|
sl@0
|
59 |
* an illegal combination of letters (i.e., the error state). The
|
sl@0
|
60 |
* table is compressed by eliminating all the unpopulated (i.e., zero)
|
sl@0
|
61 |
* cells. Multiple conceptual rows can then be doubled up in a single
|
sl@0
|
62 |
* physical row by sliding them up and possibly shifting them to one
|
sl@0
|
63 |
* side or the other so the populated cells don't collide. Indexes
|
sl@0
|
64 |
* are used to identify unpopulated cells and to locate populated cells.
|
sl@0
|
65 |
*/
|
sl@0
|
66 |
int16_t* table;
|
sl@0
|
67 |
|
sl@0
|
68 |
/**
|
sl@0
|
69 |
* This index maps logical row numbers to physical row numbers
|
sl@0
|
70 |
*/
|
sl@0
|
71 |
int16_t* rowIndex;
|
sl@0
|
72 |
|
sl@0
|
73 |
/**
|
sl@0
|
74 |
* A bitmap is used to tell which cells in the comceptual table are
|
sl@0
|
75 |
* populated. This array contains all the unique bit combinations
|
sl@0
|
76 |
* in that bitmap. If the table is more than 32 columns wide,
|
sl@0
|
77 |
* successive entries in this array are used for a single row.
|
sl@0
|
78 |
*/
|
sl@0
|
79 |
int32_t* rowIndexFlags;
|
sl@0
|
80 |
|
sl@0
|
81 |
/**
|
sl@0
|
82 |
* This index maps from a logical row number into the bitmap table above.
|
sl@0
|
83 |
* (This keeps us from storing duplicate bitmap combinations.) Since there
|
sl@0
|
84 |
* are a lot of rows with only one populated cell, instead of wasting space
|
sl@0
|
85 |
* in the bitmap table, we just store a negative number in this index for
|
sl@0
|
86 |
* rows with one populated cell. The absolute value of that number is
|
sl@0
|
87 |
* the column number of the populated cell.
|
sl@0
|
88 |
*/
|
sl@0
|
89 |
int16_t* rowIndexFlagsIndex;
|
sl@0
|
90 |
|
sl@0
|
91 |
/**
|
sl@0
|
92 |
* For each logical row, this index contains a constant that is added to
|
sl@0
|
93 |
* the logical column number to get the physical column number
|
sl@0
|
94 |
*/
|
sl@0
|
95 |
int8_t* rowIndexShifts;
|
sl@0
|
96 |
|
sl@0
|
97 |
//=================================================================================
|
sl@0
|
98 |
// deserialization
|
sl@0
|
99 |
//=================================================================================
|
sl@0
|
100 |
|
sl@0
|
101 |
public:
|
sl@0
|
102 |
/**
|
sl@0
|
103 |
* Constructor. Creates the BreakDictionary by using readDictionaryFile() to
|
sl@0
|
104 |
* load the dictionary tables from the disk.
|
sl@0
|
105 |
* @param dictionaryFilename The name of the dictionary file
|
sl@0
|
106 |
* @param status for errors if it occurs
|
sl@0
|
107 |
*/
|
sl@0
|
108 |
BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
|
sl@0
|
109 |
|
sl@0
|
110 |
/**
|
sl@0
|
111 |
* Destructor.
|
sl@0
|
112 |
*/
|
sl@0
|
113 |
~BreakDictionary();
|
sl@0
|
114 |
|
sl@0
|
115 |
/**
|
sl@0
|
116 |
* Reads the dictionary file on the disk and constructs the appropriate in-memory
|
sl@0
|
117 |
* representation.
|
sl@0
|
118 |
* @param in The given memory stream
|
sl@0
|
119 |
*/
|
sl@0
|
120 |
void readDictionaryFile(const uint8_t * in);
|
sl@0
|
121 |
|
sl@0
|
122 |
//=================================================================================
|
sl@0
|
123 |
// access to the words
|
sl@0
|
124 |
//=================================================================================
|
sl@0
|
125 |
|
sl@0
|
126 |
/**
|
sl@0
|
127 |
* Uses the column map to map the character to a column number, then
|
sl@0
|
128 |
* passes the row and column number to the other version of at()
|
sl@0
|
129 |
* @param row The current state
|
sl@0
|
130 |
* @param ch The character whose column we're interested in
|
sl@0
|
131 |
* @return The new state to transition to
|
sl@0
|
132 |
*/
|
sl@0
|
133 |
int16_t at(int32_t row, UChar ch) const;
|
sl@0
|
134 |
|
sl@0
|
135 |
/**
|
sl@0
|
136 |
* Returns the value in the cell with the specified (logical) row and
|
sl@0
|
137 |
* column numbers. In DictionaryBasedBreakIterator, the row number is
|
sl@0
|
138 |
* a state number, the column number is an input, and the return value
|
sl@0
|
139 |
* is the row number of the new state to transition to. (0 is the
|
sl@0
|
140 |
* "error" state, and -1 is the "end of word" state in a dictionary)
|
sl@0
|
141 |
* @param row The row number of the current state
|
sl@0
|
142 |
* @param col The column number of the input character (0 means "not a
|
sl@0
|
143 |
* dictionary character")
|
sl@0
|
144 |
* @return The row number of the new state to transition to
|
sl@0
|
145 |
*/
|
sl@0
|
146 |
int16_t at(int32_t row, int32_t col) const;
|
sl@0
|
147 |
|
sl@0
|
148 |
private:
|
sl@0
|
149 |
/**
|
sl@0
|
150 |
* Given (logical) row and column numbers, returns true if the
|
sl@0
|
151 |
* cell in that position is populated
|
sl@0
|
152 |
* @param row The LOGICAL row number of the cell
|
sl@0
|
153 |
* @param col The PHYSICAL row number of the cell
|
sl@0
|
154 |
* @return true if the cell in that position is populated
|
sl@0
|
155 |
*/
|
sl@0
|
156 |
UBool cellIsPopulated(int32_t row, int32_t col) const;
|
sl@0
|
157 |
|
sl@0
|
158 |
/**
|
sl@0
|
159 |
* Implementation of at() when we know the specified cell is populated.
|
sl@0
|
160 |
* @param row The PHYSICAL row number of the cell
|
sl@0
|
161 |
* @param col The PHYSICAL column number of the cell
|
sl@0
|
162 |
* @return The value stored in the cell
|
sl@0
|
163 |
*/
|
sl@0
|
164 |
int16_t internalAt(int32_t row, int32_t col) const;
|
sl@0
|
165 |
|
sl@0
|
166 |
// the following methods are never meant to be called and so are not defined
|
sl@0
|
167 |
// (if you don't declare them, you get default implementations)
|
sl@0
|
168 |
BreakDictionary(const BreakDictionary& that);
|
sl@0
|
169 |
BreakDictionary& operator=(const BreakDictionary& that);
|
sl@0
|
170 |
};
|
sl@0
|
171 |
|
sl@0
|
172 |
U_NAMESPACE_END
|
sl@0
|
173 |
|
sl@0
|
174 |
#endif
|