sl@0: //
sl@0: //  rbbisetb.h
sl@0: /*
sl@0: **********************************************************************
sl@0: *   Copyright (c) 2001-2005, International Business Machines
sl@0: *   Corporation and others.  All Rights Reserved.
sl@0: **********************************************************************
sl@0: */
sl@0: 
sl@0: #ifndef RBBISETB_H
sl@0: #define RBBISETB_H
sl@0: 
sl@0: #include "unicode/utypes.h"
sl@0: #include "unicode/uobject.h"
sl@0: #include "rbbirb.h"
sl@0: #include "uvector.h"
sl@0: 
sl@0: struct  UNewTrie;
sl@0: 
sl@0: U_NAMESPACE_BEGIN
sl@0: 
sl@0: //
sl@0: //  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
sl@0: //                   from the Unicode Sets appearing in the source RBBI rules, and
sl@0: //                   creates the TRIE table used to map from Unicode characters to
sl@0: //                   the character categories.
sl@0: //
sl@0: 
sl@0: 
sl@0: //
sl@0: //  RangeDescriptor
sl@0: //
sl@0: //     Describes one of the non-overlapping character ranges. Each such range gets
sl@0: //     exactly one descriptor; the descriptors are chained through fNext into a
sl@0: //     linked list that is kept in order (by character).
sl@0: //
sl@0: class RangeDescriptor : public UMemory {
sl@0: public:
sl@0:     UChar32            fStartChar;      // Start of range, Unicode 32-bit value.
sl@0:     UChar32            fEndChar;        // End of range, Unicode 32-bit value.
sl@0:     int32_t            fNum;            // Runtime-mapped input value for this range.
sl@0:     UVector           *fIncludesSets;   // Vector of the original
sl@0:                                         //   Unicode sets that include this range.
sl@0:                                         //   (Contains pointers to uset nodes.)
sl@0:     RangeDescriptor   *fNext;           // Next RangeDescriptor in the linked list.
sl@0: 
sl@0:     RangeDescriptor(UErrorCode &status);   // NOTE(review): presumably reports failures via status - confirm in the .cpp
sl@0:     RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);   // Status-taking copy form; the plain copy constructor below is private.
sl@0:     ~RangeDescriptor();                 // NOTE(review): ownership of fIncludesSets is not visible in this header - confirm
sl@0:     void split(UChar32 where, UErrorCode &status);   // Split this range in two at "where", with
sl@0:                                         //   "where" appearing in the second (higher) part.
sl@0:     void setDictionaryFlag();           // Check whether this range appears as part of
sl@0:                                         //   the Unicode set named "dictionary".
sl@0: 
sl@0: private:
sl@0:     RangeDescriptor(const RangeDescriptor &other); // Declared private to forbid copying of this class.
sl@0:     RangeDescriptor &operator=(const RangeDescriptor &other); // Declared private to forbid assignment of this class.
sl@0: };
sl@0: 
sl@0: 
sl@0: //
sl@0: //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
sl@0: //
sl@0: //      Starting with the rules parse tree from the scanner, the builder will
sl@0: //
sl@0: //                   -  enumerate the set of UnicodeSets that are referenced
sl@0: //                      by the RBBI rules.
sl@0: //                   -  compute a derived set of non-overlapping UnicodeSets
sl@0: //                      that will correspond to columns in the state table for
sl@0: //                      the RBBI execution engine.
sl@0: //                   -  construct the trie table that maps input characters
sl@0: //                      to set numbers in the non-overlapping set of sets.
sl@0: //
sl@0: 
sl@0: 
sl@0: class RBBISetBuilder : public UMemory {
sl@0: public:
sl@0:     RBBISetBuilder(RBBIRuleBuilder *rb);     // rb: the rule builder; see fRB below.
sl@0:     ~RBBISetBuilder();
sl@0: 
sl@0:     void     build();                        // NOTE(review): presumably performs the processing steps listed above - confirm
sl@0:     void     addValToSets(UVector *sets,      uint32_t val);   // NOTE(review): presumably applies addValToSet to each element of sets - confirm
sl@0:     void     addValToSet (RBBINode *usetNode, uint32_t val);   // NOTE(review): presumably associates val with one uset node - confirm
sl@0:     int32_t  getNumCharCategories() const;   // Char categories are the same as the input symbol set of the
sl@0:                                    //    runtime state machine, which is the same as the
sl@0:                                    //    columns in the DFA state table.
sl@0:     int32_t  getTrieSize() /*const*/;        // Size in bytes of the serialized Trie.
sl@0:     void     serializeTrie(uint8_t *where);  // Write out the serialized Trie at "where".
sl@0:     UChar32  getFirstChar(int32_t  val) const;   // NOTE(review): presumably the first character mapped to category val - confirm
sl@0: #ifdef RBBI_DEBUG
sl@0:     void     printSets();                    // Debug-only dump functions, declared only when RBBI_DEBUG is defined.
sl@0:     void     printRanges();
sl@0:     void     printRangeGroups();
sl@0: #else   // Without RBBI_DEBUG the print calls expand to nothing. Note: macros are not scoped to this class.
sl@0:     #define printSets()
sl@0:     #define printRanges()
sl@0:     #define printRangeGroups()
sl@0: #endif
sl@0: 
sl@0: private:
sl@0:     void           numberSets();
sl@0: 
sl@0:     RBBIRuleBuilder       *fRB;             // The RBBI Rule Compiler that owns us.
sl@0:     UErrorCode            *fStatus;         // NOTE(review): presumably the error code shared with fRB - confirm
sl@0: 
sl@0:     RangeDescriptor       *fRangeList;      // Head of the linked list of RangeDescriptors.
sl@0: 
sl@0:     UNewTrie              *fTrie;           // The mapping TRIE that is the end result of processing the Unicode Sets.
sl@0:     uint32_t              fTrieSize;        // NOTE(review): presumably the serialized size reported by getTrieSize() - confirm
sl@0: 
sl@0:     // Groups correspond to character categories -
sl@0:     //       groups of ranges that are in the same original UnicodeSets.
sl@0:     //       fGroupCount is the index of the last used group.
sl@0:     //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
sl@0:     //       State table column 0 is not used.  Column 1 is for end-of-input.
sl@0:     //       Column 2 is for group 0.  Funny counting.
sl@0:     int32_t               fGroupCount;
sl@0: 
sl@0:     RBBISetBuilder(const RBBISetBuilder &other); // Declared private to forbid copying of this class.
sl@0:     RBBISetBuilder &operator=(const RBBISetBuilder &other); // Declared private to forbid assignment of this class.
sl@0: };
sl@0: 
sl@0: 
sl@0: 
sl@0: U_NAMESPACE_END
sl@0: #endif