sl@0: /* sl@0: ********************************************************************** sl@0: * Copyright (C) 1999-2004 IBM and others. All rights reserved. sl@0: ********************************************************************** sl@0: * Date Name Description sl@0: * 12/1/99 rtg Ported from Java sl@0: * 01/13/2000 helena Added UErrorCode to ctors. sl@0: ********************************************************************** sl@0: */ sl@0: sl@0: #ifndef BRKDICT_H sl@0: #define BRKDICT_H sl@0: sl@0: #include "unicode/utypes.h" sl@0: #include "unicode/uobject.h" sl@0: #include "ucmp8.h" sl@0: sl@0: U_NAMESPACE_BEGIN sl@0: sl@0: /** sl@0: * This is the class that represents the list of known words used by sl@0: * DictionaryBasedBreakIterator. The conceptual data structure used sl@0: * here is a trie: there is a node hanging off the root node for every sl@0: * letter that can start a word. Each of these nodes has a node hanging sl@0: * off of it for every letter that can be the second letter of a word sl@0: * if this node is the first letter, and so on. The trie is represented sl@0: * as a two-dimensional array that can be treated as a table of state sl@0: * transitions. Indexes are used to compress this array, taking sl@0: * advantage of the fact that this array will always be very sparse. sl@0: */ sl@0: class BreakDictionary : public UMemory { sl@0: //================================================================================= sl@0: // data members sl@0: //================================================================================= sl@0: private: sl@0: sl@0: /** sl@0: * Maps from characters to column numbers. The main use of this is to sl@0: * avoid making room in the array for empty columns. sl@0: */ sl@0: CompactByteArray* columnMap; sl@0: sl@0: /** sl@0: * The number of actual columns in the table sl@0: */ sl@0: int32_t numCols; sl@0: sl@0: /** sl@0: * Columns are organized into groups of 32. This says how many sl@0: * column groups. (We could calculate this, but we store the sl@0: * value to avoid having to repeatedly calculate it.) sl@0: */ sl@0: int32_t numColGroups; sl@0: sl@0: /** sl@0: * The actual compressed state table. Each conceptual row represents sl@0: * a state, and the cells in it contain the row numbers of the states sl@0: * to transition to for each possible letter. 0 is used to indicate sl@0: * an illegal combination of letters (i.e., the error state). The sl@0: * table is compressed by eliminating all the unpopulated (i.e., zero) sl@0: * cells. Multiple conceptual rows can then be doubled up in a single sl@0: * physical row by sliding them up and possibly shifting them to one sl@0: * side or the other so the populated cells don't collide. Indexes sl@0: * are used to identify unpopulated cells and to locate populated cells. sl@0: */ sl@0: int16_t* table; sl@0: sl@0: /** sl@0: * This index maps logical row numbers to physical row numbers sl@0: */ sl@0: int16_t* rowIndex; sl@0: sl@0: /** sl@0: * A bitmap is used to tell which cells in the comceptual table are sl@0: * populated. This array contains all the unique bit combinations sl@0: * in that bitmap. If the table is more than 32 columns wide, sl@0: * successive entries in this array are used for a single row. sl@0: */ sl@0: int32_t* rowIndexFlags; sl@0: sl@0: /** sl@0: * This index maps from a logical row number into the bitmap table above. sl@0: * (This keeps us from storing duplicate bitmap combinations.) Since there sl@0: * are a lot of rows with only one populated cell, instead of wasting space sl@0: * in the bitmap table, we just store a negative number in this index for sl@0: * rows with one populated cell. The absolute value of that number is sl@0: * the column number of the populated cell. sl@0: */ sl@0: int16_t* rowIndexFlagsIndex; sl@0: sl@0: /** sl@0: * For each logical row, this index contains a constant that is added to sl@0: * the logical column number to get the physical column number sl@0: */ sl@0: int8_t* rowIndexShifts; sl@0: sl@0: //================================================================================= sl@0: // deserialization sl@0: //================================================================================= sl@0: sl@0: public: sl@0: /** sl@0: * Constructor. Creates the BreakDictionary by using readDictionaryFile() to sl@0: * load the dictionary tables from the disk. sl@0: * @param dictionaryFilename The name of the dictionary file sl@0: * @param status for errors if it occurs sl@0: */ sl@0: BreakDictionary(const char* dictionaryFilename, UErrorCode& status); sl@0: sl@0: /** sl@0: * Destructor. sl@0: */ sl@0: ~BreakDictionary(); sl@0: sl@0: /** sl@0: * Reads the dictionary file on the disk and constructs the appropriate in-memory sl@0: * representation. sl@0: * @param in The given memory stream sl@0: */ sl@0: void readDictionaryFile(const uint8_t * in); sl@0: sl@0: //================================================================================= sl@0: // access to the words sl@0: //================================================================================= sl@0: sl@0: /** sl@0: * Uses the column map to map the character to a column number, then sl@0: * passes the row and column number to the other version of at() sl@0: * @param row The current state sl@0: * @param ch The character whose column we're interested in sl@0: * @return The new state to transition to sl@0: */ sl@0: int16_t at(int32_t row, UChar ch) const; sl@0: sl@0: /** sl@0: * Returns the value in the cell with the specified (logical) row and sl@0: * column numbers. In DictionaryBasedBreakIterator, the row number is sl@0: * a state number, the column number is an input, and the return value sl@0: * is the row number of the new state to transition to. (0 is the sl@0: * "error" state, and -1 is the "end of word" state in a dictionary) sl@0: * @param row The row number of the current state sl@0: * @param col The column number of the input character (0 means "not a sl@0: * dictionary character") sl@0: * @return The row number of the new state to transition to sl@0: */ sl@0: int16_t at(int32_t row, int32_t col) const; sl@0: sl@0: private: sl@0: /** sl@0: * Given (logical) row and column numbers, returns true if the sl@0: * cell in that position is populated sl@0: * @param row The LOGICAL row number of the cell sl@0: * @param col The PHYSICAL row number of the cell sl@0: * @return true if the cell in that position is populated sl@0: */ sl@0: UBool cellIsPopulated(int32_t row, int32_t col) const; sl@0: sl@0: /** sl@0: * Implementation of at() when we know the specified cell is populated. sl@0: * @param row The PHYSICAL row number of the cell sl@0: * @param col The PHYSICAL column number of the cell sl@0: * @return The value stored in the cell sl@0: */ sl@0: int16_t internalAt(int32_t row, int32_t col) const; sl@0: sl@0: // the following methods are never meant to be called and so are not defined sl@0: // (if you don't declare them, you get default implementations) sl@0: BreakDictionary(const BreakDictionary& that); sl@0: BreakDictionary& operator=(const BreakDictionary& that); sl@0: }; sl@0: sl@0: U_NAMESPACE_END sl@0: sl@0: #endif