sl@0: // sl@0: // rbbiscan.h sl@0: // sl@0: // Copyright (C) 2002-2003, International Business Machines Corporation and others. sl@0: // All Rights Reserved. sl@0: // sl@0: // This file contains declarations for class RBBIRuleScanner sl@0: // sl@0: sl@0: sl@0: #ifndef RBBISCAN_H sl@0: #define RBBISCAN_H sl@0: sl@0: #include "unicode/utypes.h" sl@0: #include "unicode/uobject.h" sl@0: #include "unicode/rbbi.h" sl@0: #include "unicode/uniset.h" sl@0: #include "unicode/parseerr.h" sl@0: #include "uhash.h" sl@0: #include "uvector.h" sl@0: #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that sl@0: // looks up references to $variables within a set. sl@0: #include "rbbinode.h" sl@0: //#include "rbbitblb.h" sl@0: sl@0: sl@0: sl@0: U_NAMESPACE_BEGIN sl@0: sl@0: class RBBIRuleBuilder; sl@0: class RBBISymbolTable; sl@0: sl@0: sl@0: //-------------------------------------------------------------------------------- sl@0: // sl@0: // class RBBIRuleScanner does the lowest level, character-at-a-time sl@0: // scanning of break iterator rules. sl@0: // sl@0: // The output of the scanner is parse trees for sl@0: // the rule expressions and a list of all Unicode Sets sl@0: // encountered. sl@0: // sl@0: //-------------------------------------------------------------------------------- sl@0: static const int kStackSize = 100; // The size of the state stack for sl@0: // rules parsing. Corresponds roughly sl@0: // to the depth of parentheses nesting sl@0: // that is allowed in the rules. sl@0: sl@0: enum EParseAction {dummy01, dummy02}; // Placeholder enum for the specifier for sl@0: // actions that are specified in the sl@0: // rule parsing state table. sl@0: sl@0: class RBBIRuleScanner : public UMemory { sl@0: public: sl@0: sl@0: struct RBBIRuleChar { sl@0: UChar32 fChar; sl@0: UBool fEscaped; sl@0: }; sl@0: sl@0: RBBIRuleScanner(RBBIRuleBuilder *rb); sl@0: sl@0: sl@0: virtual ~RBBIRuleScanner(); sl@0: sl@0: void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. sl@0: // Return false if at end. sl@0: sl@0: UBool push(const RBBIRuleChar &c); // Push (unget) one character. sl@0: // Only a single character may be pushed. sl@0: sl@0: void parse(); // Parse the rules, generating two parse sl@0: // trees, one each for the forward and sl@0: // reverse rules, sl@0: // and a list of UnicodeSets encountered. sl@0: sl@0: /** sl@0: * Return a rules string without unnecessary sl@0: * characters. sl@0: */ sl@0: static UnicodeString stripRules(const UnicodeString &rules); sl@0: private: sl@0: sl@0: UBool doParseActions(EParseAction a); sl@0: void error(UErrorCode e); // error reporting convenience function. sl@0: void fixOpStack(RBBINode::OpPrecedence p); sl@0: // a character. sl@0: void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); sl@0: sl@0: UChar32 nextCharLL(); sl@0: #ifdef RBBI_DEBUG sl@0: void printNodeStack(const char *title); sl@0: #endif sl@0: RBBINode *pushNewNode(RBBINode::NodeType t); sl@0: void scanSet(); sl@0: sl@0: sl@0: RBBIRuleBuilder *fRB; // The rule builder that we are part of. sl@0: sl@0: int32_t fScanIndex; // Index of current character being processed sl@0: // in the rule input string. sl@0: int32_t fNextIndex; // Index of the next character, which sl@0: // is the first character not yet scanned. sl@0: UBool fQuoteMode; // Scan is in a 'quoted region' sl@0: int fLineNum; // Line number in input file. sl@0: int fCharNum; // Char position within the line. sl@0: UChar32 fLastChar; // Previous char, needed to count CR-LF sl@0: // as a single line, not two. sl@0: sl@0: RBBIRuleChar fC; // Current char for parse state machine sl@0: // processing. sl@0: UnicodeString fVarName; // $variableName, valid when we've just sl@0: // scanned one. sl@0: sl@0: RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule sl@0: // parsing. index by p[state][char-class] sl@0: sl@0: uint16_t fStack[kStackSize]; // State stack, holds state pushes sl@0: int fStackPtr; // and pops as specified in the state sl@0: // transition rules. sl@0: sl@0: RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created sl@0: // during the parse of a rule sl@0: int fNodeStackPtr; sl@0: sl@0: sl@0: UBool fReverseRule; // True if the rule currently being scanned sl@0: // is a reverse direction rule (if it sl@0: // starts with a '!') sl@0: sl@0: UBool fLookAheadRule; // True if the rule includes a '/' sl@0: // somewhere within it. sl@0: sl@0: RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of sl@0: // $variable symbols. sl@0: sl@0: UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to sl@0: // the sets created while parsing rules. sl@0: // The key is the string used for creating sl@0: // the set. sl@0: sl@0: UnicodeSet *fRuleSets[10]; // Unicode Sets that are needed during sl@0: // the scanning of RBBI rules. The sl@0: // indicies for these are assigned by the sl@0: // perl script that builds the state tables. sl@0: // See rbbirpt.h. sl@0: sl@0: int32_t fRuleNum; // Counts each rule as it is scanned. sl@0: sl@0: int32_t fOptionStart; // Input index of start of a !!option sl@0: // keyword, while being scanned. sl@0: sl@0: UnicodeSet *gRuleSet_rule_char; sl@0: UnicodeSet *gRuleSet_white_space; sl@0: UnicodeSet *gRuleSet_name_char; sl@0: UnicodeSet *gRuleSet_name_start_char; sl@0: sl@0: RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class sl@0: RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class sl@0: }; sl@0: sl@0: U_NAMESPACE_END sl@0: sl@0: #endif