sl@0: // sl@0: // rbbisetb.h sl@0: /* sl@0: ********************************************************************** sl@0: * Copyright (c) 2001-2005, International Business Machines sl@0: * Corporation and others. All Rights Reserved. sl@0: ********************************************************************** sl@0: */ sl@0: sl@0: #ifndef RBBISETB_H sl@0: #define RBBISETB_H sl@0: sl@0: #include "unicode/utypes.h" sl@0: #include "unicode/uobject.h" sl@0: #include "rbbirb.h" sl@0: #include "uvector.h" sl@0: sl@0: struct UNewTrie; sl@0: sl@0: U_NAMESPACE_BEGIN sl@0: sl@0: // sl@0: // RBBISetBuilder Derives the character categories used by the runtime RBBI engine sl@0: // from the Unicode Sets appearing in the source RBBI rules, and sl@0: // creates the TRIE table used to map from Unicode to the sl@0: // character categories. sl@0: // sl@0: sl@0: sl@0: // sl@0: // RangeDescriptor sl@0: // sl@0: // Each of the non-overlapping character ranges gets one of these descriptors. sl@0: // All of them are strung together in a linked list, which is kept in order sl@0: // (by character) sl@0: // sl@0: class RangeDescriptor : public UMemory { sl@0: public: sl@0: UChar32 fStartChar; // Start of range, unicode 32 bit value. sl@0: UChar32 fEndChar; // End of range, unicode 32 bit value. sl@0: int32_t fNum; // runtime-mapped input value for this range. sl@0: UVector *fIncludesSets; // vector of the the original sl@0: // Unicode sets that include this range. sl@0: // (Contains ptrs to uset nodes) sl@0: RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. sl@0: sl@0: RangeDescriptor(UErrorCode &status); sl@0: RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); sl@0: ~RangeDescriptor(); sl@0: void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with sl@0: // where appearing in the second (higher) part. sl@0: void setDictionaryFlag(); // Check whether this range appears as part of sl@0: // the Unicode set named "dictionary" sl@0: sl@0: private: sl@0: RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class sl@0: RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class sl@0: }; sl@0: sl@0: sl@0: // sl@0: // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. sl@0: // sl@0: // Starting with the rules parse tree from the scanner, sl@0: // sl@0: // - Enumerate the set of UnicodeSets that are referenced sl@0: // by the RBBI rules. sl@0: // - compute a derived set of non-overlapping UnicodeSets sl@0: // that will correspond to columns in the state table for sl@0: // the RBBI execution engine. sl@0: // - construct the trie table that maps input characters sl@0: // to set numbers in the non-overlapping set of sets. sl@0: // sl@0: sl@0: sl@0: class RBBISetBuilder : public UMemory { sl@0: public: sl@0: RBBISetBuilder(RBBIRuleBuilder *rb); sl@0: ~RBBISetBuilder(); sl@0: sl@0: void build(); sl@0: void addValToSets(UVector *sets, uint32_t val); sl@0: void addValToSet (RBBINode *usetNode, uint32_t val); sl@0: int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the sl@0: // runtime state machine, which are the same as sl@0: // columns in the DFA state table sl@0: int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. sl@0: void serializeTrie(uint8_t *where); // write out the serialized Trie. sl@0: UChar32 getFirstChar(int32_t val) const; sl@0: #ifdef RBBI_DEBUG sl@0: void printSets(); sl@0: void printRanges(); sl@0: void printRangeGroups(); sl@0: #else sl@0: #define printSets() sl@0: #define printRanges() sl@0: #define printRangeGroups() sl@0: #endif sl@0: sl@0: private: sl@0: void numberSets(); sl@0: sl@0: RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. sl@0: UErrorCode *fStatus; sl@0: sl@0: RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors sl@0: sl@0: UNewTrie *fTrie; // The mapping TRIE that is the end result of processing sl@0: uint32_t fTrieSize; // the Unicode Sets. sl@0: sl@0: // Groups correspond to character categories - sl@0: // groups of ranges that are in the same original UnicodeSets. sl@0: // fGroupCount is the index of the last used group. sl@0: // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. sl@0: // State table column 0 is not used. Column 1 is for end-of-input. sl@0: // column 2 is for group 0. Funny counting. sl@0: int32_t fGroupCount; sl@0: sl@0: RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class sl@0: RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class sl@0: }; sl@0: sl@0: sl@0: sl@0: U_NAMESPACE_END sl@0: #endif