sl@0
|
1 |
//
|
sl@0
|
2 |
// rbbisetb.h
|
sl@0
|
3 |
/*
|
sl@0
|
4 |
**********************************************************************
|
sl@0
|
5 |
* Copyright (c) 2001-2005, International Business Machines
|
sl@0
|
6 |
* Corporation and others. All Rights Reserved.
|
sl@0
|
7 |
**********************************************************************
|
sl@0
|
8 |
*/
|
sl@0
|
9 |
|
sl@0
|
10 |
#ifndef RBBISETB_H
|
sl@0
|
11 |
#define RBBISETB_H
|
sl@0
|
12 |
|
sl@0
|
13 |
#include "unicode/utypes.h"
|
sl@0
|
14 |
#include "unicode/uobject.h"
|
sl@0
|
15 |
#include "rbbirb.h"
|
sl@0
|
16 |
#include "uvector.h"
|
sl@0
|
17 |
|
sl@0
|
18 |
struct UNewTrie;
|
sl@0
|
19 |
|
sl@0
|
20 |
U_NAMESPACE_BEGIN
|
sl@0
|
21 |
|
sl@0
|
22 |
//
|
sl@0
|
23 |
// RBBISetBuilder Derives the character categories used by the runtime RBBI engine
|
sl@0
|
24 |
// from the Unicode Sets appearing in the source RBBI rules, and
|
sl@0
|
25 |
// creates the TRIE table used to map from Unicode to the
|
sl@0
|
26 |
// character categories.
|
sl@0
|
27 |
//
|
sl@0
|
28 |
|
sl@0
|
29 |
|
sl@0
|
30 |
//
|
sl@0
|
31 |
// RangeDescriptor
|
sl@0
|
32 |
//
|
sl@0
|
33 |
// Each of the non-overlapping character ranges gets one of these descriptors.
|
sl@0
|
34 |
// All of them are strung together in a linked list, which is kept in order
|
sl@0
|
35 |
// (by character)
|
sl@0
|
36 |
//
|
sl@0
|
37 |
class RangeDescriptor : public UMemory {
|
sl@0
|
38 |
public:
|
sl@0
|
39 |
UChar32 fStartChar; // Start of range, unicode 32 bit value.
|
sl@0
|
40 |
UChar32 fEndChar; // End of range, unicode 32 bit value.
|
sl@0
|
41 |
int32_t fNum; // runtime-mapped input value for this range.
|
sl@0
|
42 |
UVector *fIncludesSets; // vector of the the original
|
sl@0
|
43 |
// Unicode sets that include this range.
|
sl@0
|
44 |
// (Contains ptrs to uset nodes)
|
sl@0
|
45 |
RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
|
sl@0
|
46 |
|
sl@0
|
47 |
RangeDescriptor(UErrorCode &status);
|
sl@0
|
48 |
RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
|
sl@0
|
49 |
~RangeDescriptor();
|
sl@0
|
50 |
void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with
|
sl@0
|
51 |
// where appearing in the second (higher) part.
|
sl@0
|
52 |
void setDictionaryFlag(); // Check whether this range appears as part of
|
sl@0
|
53 |
// the Unicode set named "dictionary"
|
sl@0
|
54 |
|
sl@0
|
55 |
private:
|
sl@0
|
56 |
RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
|
sl@0
|
57 |
RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
|
sl@0
|
58 |
};
|
sl@0
|
59 |
|
sl@0
|
60 |
|
sl@0
|
61 |
//
|
sl@0
|
62 |
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
|
sl@0
|
63 |
//
|
sl@0
|
64 |
// Starting with the rules parse tree from the scanner,
|
sl@0
|
65 |
//
|
sl@0
|
66 |
// - Enumerate the set of UnicodeSets that are referenced
|
sl@0
|
67 |
// by the RBBI rules.
|
sl@0
|
68 |
// - compute a derived set of non-overlapping UnicodeSets
|
sl@0
|
69 |
// that will correspond to columns in the state table for
|
sl@0
|
70 |
// the RBBI execution engine.
|
sl@0
|
71 |
// - construct the trie table that maps input characters
|
sl@0
|
72 |
// to set numbers in the non-overlapping set of sets.
|
sl@0
|
73 |
//
|
sl@0
|
74 |
|
sl@0
|
75 |
|
sl@0
|
76 |
class RBBISetBuilder : public UMemory {
|
sl@0
|
77 |
public:
|
sl@0
|
78 |
RBBISetBuilder(RBBIRuleBuilder *rb);
|
sl@0
|
79 |
~RBBISetBuilder();
|
sl@0
|
80 |
|
sl@0
|
81 |
void build();
|
sl@0
|
82 |
void addValToSets(UVector *sets, uint32_t val);
|
sl@0
|
83 |
void addValToSet (RBBINode *usetNode, uint32_t val);
|
sl@0
|
84 |
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
|
sl@0
|
85 |
// runtime state machine, which are the same as
|
sl@0
|
86 |
// columns in the DFA state table
|
sl@0
|
87 |
int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
|
sl@0
|
88 |
void serializeTrie(uint8_t *where); // write out the serialized Trie.
|
sl@0
|
89 |
UChar32 getFirstChar(int32_t val) const;
|
sl@0
|
90 |
#ifdef RBBI_DEBUG
|
sl@0
|
91 |
void printSets();
|
sl@0
|
92 |
void printRanges();
|
sl@0
|
93 |
void printRangeGroups();
|
sl@0
|
94 |
#else
|
sl@0
|
95 |
#define printSets()
|
sl@0
|
96 |
#define printRanges()
|
sl@0
|
97 |
#define printRangeGroups()
|
sl@0
|
98 |
#endif
|
sl@0
|
99 |
|
sl@0
|
100 |
private:
|
sl@0
|
101 |
void numberSets();
|
sl@0
|
102 |
|
sl@0
|
103 |
RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
|
sl@0
|
104 |
UErrorCode *fStatus;
|
sl@0
|
105 |
|
sl@0
|
106 |
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
|
sl@0
|
107 |
|
sl@0
|
108 |
UNewTrie *fTrie; // The mapping TRIE that is the end result of processing
|
sl@0
|
109 |
uint32_t fTrieSize; // the Unicode Sets.
|
sl@0
|
110 |
|
sl@0
|
111 |
// Groups correspond to character categories -
|
sl@0
|
112 |
// groups of ranges that are in the same original UnicodeSets.
|
sl@0
|
113 |
// fGroupCount is the index of the last used group.
|
sl@0
|
114 |
// fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
|
sl@0
|
115 |
// State table column 0 is not used. Column 1 is for end-of-input.
|
sl@0
|
116 |
// column 2 is for group 0. Funny counting.
|
sl@0
|
117 |
int32_t fGroupCount;
|
sl@0
|
118 |
|
sl@0
|
119 |
RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
|
sl@0
|
120 |
RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
|
sl@0
|
121 |
};
|
sl@0
|
122 |
|
sl@0
|
123 |
|
sl@0
|
124 |
|
sl@0
|
125 |
U_NAMESPACE_END
|
sl@0
|
126 |
#endif
|