sl@0: /*
sl@0: **********************************************************************
sl@0: *   Copyright (C) 1999-2004 IBM and others. All rights reserved.
sl@0: **********************************************************************
sl@0: *   Date        Name        Description
sl@0: *   12/1/99     rtg         Ported from Java
sl@0: *   01/13/2000  helena      Added UErrorCode to ctors.
sl@0: **********************************************************************
sl@0: */
sl@0: 
sl@0: #ifndef BRKDICT_H
sl@0: #define BRKDICT_H
sl@0: 
sl@0: #include "unicode/utypes.h"
sl@0: #include "unicode/uobject.h"
sl@0: #include "ucmp8.h"
sl@0: 
sl@0: U_NAMESPACE_BEGIN
sl@0: 
sl@0: /**
sl@0:  * This is the class that represents the list of known words used by
sl@0:  * DictionaryBasedBreakIterator.  The conceptual data structure used
sl@0:  * here is a trie: there is a node hanging off the root node for every
sl@0:  * letter that can start a word.  Each of these nodes has a node hanging
sl@0:  * off of it for every letter that can be the second letter of a word
sl@0:  * if this node is the first letter, and so on.  The trie is represented
sl@0:  * as a two-dimensional array that can be treated as a table of state
sl@0:  * transitions.  Indexes are used to compress this array, taking
sl@0:  * advantage of the fact that this array will always be very sparse.
sl@0:  */
sl@0: class BreakDictionary : public UMemory {
sl@0:     //=================================================================================
sl@0:     // data members
sl@0:     //=================================================================================
sl@0: private:
sl@0: 
sl@0:     /**
sl@0:      * Maps from characters to column numbers.  The main use of this is to
sl@0:      * avoid making room in the array for empty columns.
sl@0:      */
sl@0:     CompactByteArray* columnMap;
sl@0: 
sl@0:     /**
sl@0:      * The number of actual columns in the table
sl@0:      */
sl@0:     int32_t numCols;
sl@0: 
sl@0:     /**
sl@0:      * Columns are organized into groups of 32.  This says how many
sl@0:      * column groups.  (We could calculate this, but we store the
sl@0:      * value to avoid having to repeatedly calculate it.)
sl@0:      */
sl@0:     int32_t numColGroups;
sl@0: 
sl@0:     /**
sl@0:      * The actual compressed state table.  Each conceptual row represents
sl@0:      * a state, and the cells in it contain the row numbers of the states
sl@0:      * to transition to for each possible letter.  0 is used to indicate
sl@0:      * an illegal combination of letters (i.e., the error state).  The
sl@0:      * table is compressed by eliminating all the unpopulated (i.e., zero)
sl@0:      * cells.  Multiple conceptual rows can then be doubled up in a single
sl@0:      * physical row by sliding them up and possibly shifting them to one
sl@0:      * side or the other so the populated cells don't collide.  Indexes
sl@0:      * are used to identify unpopulated cells and to locate populated cells.
sl@0:      */
sl@0:     int16_t* table;
sl@0: 
sl@0:     /**
sl@0:      * This index maps logical row numbers to physical row numbers
sl@0:      */
sl@0:     int16_t* rowIndex;
sl@0: 
sl@0:     /**
sl@0:      * A bitmap is used to tell which cells in the comceptual table are
sl@0:      * populated.  This array contains all the unique bit combinations
sl@0:      * in that bitmap.  If the table is more than 32 columns wide,
sl@0:      * successive entries in this array are used for a single row.
sl@0:      */
sl@0:     int32_t* rowIndexFlags;
sl@0: 
sl@0:     /**
sl@0:      * This index maps from a logical row number into the bitmap table above.
sl@0:      * (This keeps us from storing duplicate bitmap combinations.)  Since there
sl@0:      * are a lot of rows with only one populated cell, instead of wasting space
sl@0:      * in the bitmap table, we just store a negative number in this index for
sl@0:      * rows with one populated cell.  The absolute value of that number is
sl@0:      * the column number of the populated cell.
sl@0:      */
sl@0:     int16_t* rowIndexFlagsIndex;
sl@0: 
sl@0:     /**
sl@0:      * For each logical row, this index contains a constant that is added to
sl@0:      * the logical column number to get the physical column number
sl@0:      */
sl@0:     int8_t* rowIndexShifts;
sl@0: 
sl@0:     //=================================================================================
sl@0:     // deserialization
sl@0:     //=================================================================================
sl@0: 
sl@0: public:
sl@0:     /**
sl@0:      * Constructor.  Creates the BreakDictionary by using readDictionaryFile() to
sl@0:      * load the dictionary tables from the disk.
sl@0:      * @param dictionaryFilename The name of the dictionary file
sl@0:      * @param status for errors if it occurs
sl@0:      */
sl@0:     BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
sl@0: 
sl@0:     /**
sl@0:      * Destructor.
sl@0:      */
sl@0:     ~BreakDictionary();
sl@0: 
sl@0:     /**
sl@0:      * Reads the dictionary file on the disk and constructs the appropriate in-memory
sl@0:      * representation.
sl@0:      * @param in The given memory stream
sl@0:      */
sl@0:     void readDictionaryFile(const uint8_t * in);
sl@0: 
sl@0:     //=================================================================================
sl@0:     // access to the words
sl@0:     //=================================================================================
sl@0: 
sl@0:     /**
sl@0:      * Uses the column map to map the character to a column number, then
sl@0:      * passes the row and column number to the other version of at()
sl@0:      * @param row The current state
sl@0:      * @param ch The character whose column we're interested in
sl@0:      * @return The new state to transition to
sl@0:      */
sl@0:     int16_t at(int32_t row, UChar ch) const;
sl@0: 
sl@0:     /**
sl@0:      * Returns the value in the cell with the specified (logical) row and
sl@0:      * column numbers.  In DictionaryBasedBreakIterator, the row number is
sl@0:      * a state number, the column number is an input, and the return value
sl@0:      * is the row number of the new state to transition to.  (0 is the
sl@0:      * "error" state, and -1 is the "end of word" state in a dictionary)
sl@0:      * @param row The row number of the current state
sl@0:      * @param col The column number of the input character (0 means "not a
sl@0:      * dictionary character")
sl@0:      * @return The row number of the new state to transition to
sl@0:      */
sl@0:     int16_t at(int32_t row, int32_t col) const;
sl@0: 
sl@0: private:
sl@0:     /**
sl@0:      * Given (logical) row and column numbers, returns true if the
sl@0:      * cell in that position is populated
sl@0:      * @param row The LOGICAL row number of the cell
sl@0:      * @param col The PHYSICAL row number of the cell
sl@0:      * @return true if the cell in that position is populated
sl@0:      */
sl@0:     UBool cellIsPopulated(int32_t row, int32_t col) const;
sl@0: 
sl@0:     /**
sl@0:      * Implementation of at() when we know the specified cell is populated.
sl@0:      * @param row The PHYSICAL row number of the cell
sl@0:      * @param col The PHYSICAL column number of the cell
sl@0:      * @return The value stored in the cell
sl@0:      */
sl@0:     int16_t internalAt(int32_t row, int32_t col) const;
sl@0: 
sl@0:     // the following methods are never meant to be called and so are not defined
sl@0:     // (if you don't declare them, you get default implementations)
sl@0:     BreakDictionary(const BreakDictionary& that);
sl@0:     BreakDictionary& operator=(const BreakDictionary& that);
sl@0: };
sl@0: 
sl@0: U_NAMESPACE_END
sl@0: 
sl@0: #endif