First public contribution.
3 * Copyright (c) 1998-2002
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
13 * LOCATION: see http://www.boost.org for most recent version.
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Declares internal state machine structures.
19 #ifndef BOOST_REGEX_V4_STATES_HPP
20 #define BOOST_REGEX_V4_STATES_HPP
22 #ifdef BOOST_HAS_ABI_HEADERS
23 # include BOOST_ABI_PREFIX
29 /*** mask_type *******************************************************
30 Whenever we have a choice of two alternatives, we use an array of bytes
31 to indicate which of the two alternatives it is possible to take for any
32 given input character. If mask_take is set, then we can take the next
33 state, and if mask_skip is set then we can take the alternative.
34 ***********************************************************************/
40 mask_any = mask_skip | mask_take,
44 /*** helpers **********************************************************
45 These helpers let us use function overload resolution to detect whether
46 we have narrow or wide character strings:
47 ***********************************************************************/
48 struct _narrow_type{};
50 template <class charT> struct is_byte;
51 template<> struct is_byte<char> { typedef _narrow_type width_type; };
52 template<> struct is_byte<unsigned char>{ typedef _narrow_type width_type; };
53 template<> struct is_byte<signed char> { typedef _narrow_type width_type; };
54 template <class charT> struct is_byte { typedef _wide_type width_type; };
56 /*** enum syntax_element_type ******************************************
57 Every record in the state machine falls into one of the following types:
58 ***********************************************************************/
59 enum syntax_element_type
61 // start of a marked sub-expression, or perl-style (?...) extension
62 syntax_element_startmark = 0,
63 // end of a marked sub-expression, or perl-style (?...) extension
64 syntax_element_endmark = syntax_element_startmark + 1,
65 // any sequence of literal characters
66 syntax_element_literal = syntax_element_endmark + 1,
67 // start of line assertion: ^
68 syntax_element_start_line = syntax_element_literal + 1,
69 // end of line assertion: $
70 syntax_element_end_line = syntax_element_start_line + 1,
71 // match any character: .
72 syntax_element_wild = syntax_element_end_line + 1,
73 // end of expression: we have a match when we get here
74 syntax_element_match = syntax_element_wild + 1,
75 // perl style word boundary: \b
76 syntax_element_word_boundary = syntax_element_match + 1,
77 // perl style within word boundary: \B
78 syntax_element_within_word = syntax_element_word_boundary + 1,
79 // start of word assertion: \<
80 syntax_element_word_start = syntax_element_within_word + 1,
81 // end of word assertion: \>
82 syntax_element_word_end = syntax_element_word_start + 1,
83 // start of buffer assertion: \`
84 syntax_element_buffer_start = syntax_element_word_end + 1,
85 // end of buffer assertion: \'
86 syntax_element_buffer_end = syntax_element_buffer_start + 1,
87 // backreference to previously matched sub-expression
88 syntax_element_backref = syntax_element_buffer_end + 1,
89 // either a wide character set [..] or one with multicharacter collating elements:
90 syntax_element_long_set = syntax_element_backref + 1,
91 // narrow character set: [...]
92 syntax_element_set = syntax_element_long_set + 1,
93 // jump to a new state in the machine:
94 syntax_element_jump = syntax_element_set + 1,
95 // choose between two production states:
96 syntax_element_alt = syntax_element_jump + 1,
98 syntax_element_rep = syntax_element_alt + 1,
99 // match a combining character sequence
100 syntax_element_combining = syntax_element_rep + 1,
101 // perl style soft buffer end: \z
102 syntax_element_soft_buffer_end = syntax_element_combining + 1,
103 // perl style continuation: \G
104 syntax_element_restart_continue = syntax_element_soft_buffer_end + 1,
105 // single character repeats:
106 syntax_element_dot_rep = syntax_element_restart_continue + 1,
107 syntax_element_char_rep = syntax_element_dot_rep + 1,
108 syntax_element_short_set_rep = syntax_element_char_rep + 1,
109 syntax_element_long_set_rep = syntax_element_short_set_rep + 1,
110 // a backstep for lookbehind repeats:
111 syntax_element_backstep = syntax_element_long_set_rep + 1,
112 // an assertion that a mark was matched:
113 syntax_element_assert_backref = syntax_element_backstep + 1,
114 syntax_element_toggle_case = syntax_element_assert_backref + 1
117 #ifdef BOOST_REGEX_DEBUG
118 // dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
119 std::ostream& operator<<(std::ostream&, syntax_element_type);
122 struct re_syntax_base;
124 /*** union offset_type ************************************************
125 Points to another state in the machine. During machine construction
126 we use integral offsets, but these are converted to pointers before
127 execution of the machine.
128 ***********************************************************************/
135 /*** struct re_syntax_base ********************************************
136 Base class for all states in the machine.
137 ***********************************************************************/
138 struct re_syntax_base
140 syntax_element_type type; // what kind of state this is
141 offset_type next; // next state in the machine
144 /*** struct re_brace **************************************************
145 A marked parenthesis.
146 ***********************************************************************/
147 struct re_brace : public re_syntax_base
149 // The index to match, can be zero (don't mark the sub-expression)
150 // or negative (for perl style (?...) extensions):
154 /*** struct re_dot **************************************************
156 ***********************************************************************/
160 force_not_newline = 0,
163 test_not_newline = 2,
166 struct re_dot : public re_syntax_base
171 /*** struct re_literal ************************************************
172 A string of literals, following this structure will be an
173 array of characters: charT[length]
174 ***********************************************************************/
175 struct re_literal : public re_syntax_base
180 /*** struct re_case ************************************************
181 Indicates whether we are moving to a case-insensitive block or not
182 ***********************************************************************/
183 struct re_case : public re_syntax_base
188 /*** struct re_set_long ***********************************************
189 A wide character set of characters, following this structure will be
190 an array of type charT:
191 First csingles null-terminated strings
192 Then 2 * cranges null-terminated strings
193 Then cequivalents null-terminated strings
194 ***********************************************************************/
195 template <class mask_type>
196 struct re_set_long : public re_syntax_base
198 unsigned int csingles, cranges, cequivalents;
205 /*** struct re_set ****************************************************
206 A set of narrow characters, matches any character for which _map is non-zero
207 ***********************************************************************/
208 struct re_set : public re_syntax_base
210 unsigned char _map[1 << CHAR_BIT];
213 /*** struct re_jump ***************************************************
214 Jump to a new location in the machine (not next).
215 ***********************************************************************/
216 struct re_jump : public re_syntax_base
218 offset_type alt; // location to jump to
221 /*** struct re_alt ***************************************************
222 Jump to a new location in the machine (possibly next).
223 ***********************************************************************/
224 struct re_alt : public re_jump
226 unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump
227 unsigned int can_be_null; // true if we match a NULL string
230 /*** struct re_repeat *************************************************
231 Repeat a section of the machine
232 ***********************************************************************/
233 struct re_repeat : public re_alt
235 std::size_t min, max; // min and max allowable repeats
236 int id; // Unique identifier for this repeat
237 bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches)
238 bool greedy; // True if this is a greedy repeat
241 /*** enum re_jump_size_type *******************************************
242 Provides compiled size of re_jump structure (allowing for trailing alignment).
243 We provide this so we know how many bytes to insert when constructing the machine
244 (The value of padding_mask is defined in regex_raw_buffer.hpp).
245 ***********************************************************************/
246 enum re_jump_size_type
248 re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
249 re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
250 re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
253 /*** proc re_is_set_member *********************************************
254 Forward declaration: we'll need this one later...
255 ***********************************************************************/
257 template<class charT, class traits>
260 template <class iterator, class charT, class traits_type, class char_classT>
261 iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
263 const re_set_long<char_classT>* set_,
264 const regex_data<charT, traits_type>& e, bool icase);
266 } // namespace re_detail
270 #ifdef BOOST_HAS_ABI_HEADERS
271 # include BOOST_ABI_SUFFIX