sl@0
|
1 |
//
|
sl@0
|
2 |
// rbbiscan.h
|
sl@0
|
3 |
//
|
sl@0
|
4 |
// Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
sl@0
|
5 |
// All Rights Reserved.
|
sl@0
|
6 |
//
|
sl@0
|
7 |
// This file contains declarations for class RBBIRuleScanner
|
sl@0
|
8 |
//
|
sl@0
|
9 |
|
sl@0
|
10 |
|
sl@0
|
11 |
#ifndef RBBISCAN_H
|
sl@0
|
12 |
#define RBBISCAN_H
|
sl@0
|
13 |
|
sl@0
|
14 |
#include "unicode/utypes.h"
|
sl@0
|
15 |
#include "unicode/uobject.h"
|
sl@0
|
16 |
#include "unicode/rbbi.h"
|
sl@0
|
17 |
#include "unicode/uniset.h"
|
sl@0
|
18 |
#include "unicode/parseerr.h"
|
sl@0
|
19 |
#include "uhash.h"
|
sl@0
|
20 |
#include "uvector.h"
|
sl@0
|
21 |
#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
|
sl@0
|
22 |
// looks up references to $variables within a set.
|
sl@0
|
23 |
#include "rbbinode.h"
|
sl@0
|
24 |
//#include "rbbitblb.h"
|
sl@0
|
25 |
|
sl@0
|
26 |
|
sl@0
|
27 |
|
sl@0
|
28 |
U_NAMESPACE_BEGIN
|
sl@0
|
29 |
|
sl@0
|
30 |
class RBBIRuleBuilder;
|
sl@0
|
31 |
class RBBISymbolTable;
|
sl@0
|
32 |
|
sl@0
|
33 |
|
sl@0
|
34 |
//--------------------------------------------------------------------------------
|
sl@0
|
35 |
//
|
sl@0
|
36 |
// class RBBIRuleScanner does the lowest level, character-at-a-time
|
sl@0
|
37 |
// scanning of break iterator rules.
|
sl@0
|
38 |
//
|
sl@0
|
39 |
// The output of the scanner is parse trees for
|
sl@0
|
40 |
// the rule expressions and a list of all Unicode Sets
|
sl@0
|
41 |
// encountered.
|
sl@0
|
42 |
//
|
sl@0
|
43 |
//--------------------------------------------------------------------------------
|
sl@0
|
44 |
// Size of the state stack used while parsing rules.
// Corresponds roughly to the depth of parentheses nesting
// that is allowed in the rules.
static const int kStackSize = 100;
|
sl@0
|
48 |
|
sl@0
|
49 |
// Placeholder enum for the specifier for actions that are
// specified in the rule parsing state table.
enum EParseAction {dummy01, dummy02};
|
sl@0
|
52 |
|
sl@0
|
53 |
class RBBIRuleScanner : public UMemory {
|
sl@0
|
54 |
public:
|
sl@0
|
55 |
|
sl@0
|
56 |
struct RBBIRuleChar {
|
sl@0
|
57 |
UChar32 fChar;
|
sl@0
|
58 |
UBool fEscaped;
|
sl@0
|
59 |
};
|
sl@0
|
60 |
|
sl@0
|
61 |
RBBIRuleScanner(RBBIRuleBuilder *rb);
|
sl@0
|
62 |
|
sl@0
|
63 |
|
sl@0
|
64 |
virtual ~RBBIRuleScanner();
|
sl@0
|
65 |
|
sl@0
|
66 |
void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
|
sl@0
|
67 |
// Return false if at end.
|
sl@0
|
68 |
|
sl@0
|
69 |
UBool push(const RBBIRuleChar &c); // Push (unget) one character.
|
sl@0
|
70 |
// Only a single character may be pushed.
|
sl@0
|
71 |
|
sl@0
|
72 |
void parse(); // Parse the rules, generating two parse
|
sl@0
|
73 |
// trees, one each for the forward and
|
sl@0
|
74 |
// reverse rules,
|
sl@0
|
75 |
// and a list of UnicodeSets encountered.
|
sl@0
|
76 |
|
sl@0
|
77 |
/**
|
sl@0
|
78 |
* Return a rules string without unnecessary
|
sl@0
|
79 |
* characters.
|
sl@0
|
80 |
*/
|
sl@0
|
81 |
static UnicodeString stripRules(const UnicodeString &rules);
|
sl@0
|
82 |
private:
|
sl@0
|
83 |
|
sl@0
|
84 |
UBool doParseActions(EParseAction a);
|
sl@0
|
85 |
void error(UErrorCode e); // error reporting convenience function.
|
sl@0
|
86 |
void fixOpStack(RBBINode::OpPrecedence p);
|
sl@0
|
87 |
// a character.
|
sl@0
|
88 |
void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
|
sl@0
|
89 |
|
sl@0
|
90 |
UChar32 nextCharLL();
|
sl@0
|
91 |
#ifdef RBBI_DEBUG
|
sl@0
|
92 |
void printNodeStack(const char *title);
|
sl@0
|
93 |
#endif
|
sl@0
|
94 |
RBBINode *pushNewNode(RBBINode::NodeType t);
|
sl@0
|
95 |
void scanSet();
|
sl@0
|
96 |
|
sl@0
|
97 |
|
sl@0
|
98 |
RBBIRuleBuilder *fRB; // The rule builder that we are part of.
|
sl@0
|
99 |
|
sl@0
|
100 |
int32_t fScanIndex; // Index of current character being processed
|
sl@0
|
101 |
// in the rule input string.
|
sl@0
|
102 |
int32_t fNextIndex; // Index of the next character, which
|
sl@0
|
103 |
// is the first character not yet scanned.
|
sl@0
|
104 |
UBool fQuoteMode; // Scan is in a 'quoted region'
|
sl@0
|
105 |
int fLineNum; // Line number in input file.
|
sl@0
|
106 |
int fCharNum; // Char position within the line.
|
sl@0
|
107 |
UChar32 fLastChar; // Previous char, needed to count CR-LF
|
sl@0
|
108 |
// as a single line, not two.
|
sl@0
|
109 |
|
sl@0
|
110 |
RBBIRuleChar fC; // Current char for parse state machine
|
sl@0
|
111 |
// processing.
|
sl@0
|
112 |
UnicodeString fVarName; // $variableName, valid when we've just
|
sl@0
|
113 |
// scanned one.
|
sl@0
|
114 |
|
sl@0
|
115 |
RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
|
sl@0
|
116 |
// parsing. index by p[state][char-class]
|
sl@0
|
117 |
|
sl@0
|
118 |
uint16_t fStack[kStackSize]; // State stack, holds state pushes
|
sl@0
|
119 |
int fStackPtr; // and pops as specified in the state
|
sl@0
|
120 |
// transition rules.
|
sl@0
|
121 |
|
sl@0
|
122 |
RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
|
sl@0
|
123 |
// during the parse of a rule
|
sl@0
|
124 |
int fNodeStackPtr;
|
sl@0
|
125 |
|
sl@0
|
126 |
|
sl@0
|
127 |
UBool fReverseRule; // True if the rule currently being scanned
|
sl@0
|
128 |
// is a reverse direction rule (if it
|
sl@0
|
129 |
// starts with a '!')
|
sl@0
|
130 |
|
sl@0
|
131 |
UBool fLookAheadRule; // True if the rule includes a '/'
|
sl@0
|
132 |
// somewhere within it.
|
sl@0
|
133 |
|
sl@0
|
134 |
RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
|
sl@0
|
135 |
// $variable symbols.
|
sl@0
|
136 |
|
sl@0
|
137 |
UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to
|
sl@0
|
138 |
// the sets created while parsing rules.
|
sl@0
|
139 |
// The key is the string used for creating
|
sl@0
|
140 |
// the set.
|
sl@0
|
141 |
|
sl@0
|
142 |
UnicodeSet *fRuleSets[10]; // Unicode Sets that are needed during
|
sl@0
|
143 |
// the scanning of RBBI rules. The
|
sl@0
|
144 |
// indicies for these are assigned by the
|
sl@0
|
145 |
// perl script that builds the state tables.
|
sl@0
|
146 |
// See rbbirpt.h.
|
sl@0
|
147 |
|
sl@0
|
148 |
int32_t fRuleNum; // Counts each rule as it is scanned.
|
sl@0
|
149 |
|
sl@0
|
150 |
int32_t fOptionStart; // Input index of start of a !!option
|
sl@0
|
151 |
// keyword, while being scanned.
|
sl@0
|
152 |
|
sl@0
|
153 |
UnicodeSet *gRuleSet_rule_char;
|
sl@0
|
154 |
UnicodeSet *gRuleSet_white_space;
|
sl@0
|
155 |
UnicodeSet *gRuleSet_name_char;
|
sl@0
|
156 |
UnicodeSet *gRuleSet_name_start_char;
|
sl@0
|
157 |
|
sl@0
|
158 |
RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
|
sl@0
|
159 |
RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
|
sl@0
|
160 |
};
|
sl@0
|
161 |
|
sl@0
|
162 |
U_NAMESPACE_END
|
sl@0
|
163 |
|
sl@0
|
164 |
#endif
|