sl@0: /* sl@0: * sl@0: * Copyright (c) 1998-2002 sl@0: * John Maddock sl@0: * sl@0: * Use, modification and distribution are subject to the sl@0: * Boost Software License, Version 1.0. (See accompanying file sl@0: * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) sl@0: * sl@0: */ sl@0: sl@0: /* sl@0: * LOCATION: see http://www.boost.org for most recent version. sl@0: * FILE states.cpp sl@0: * VERSION see sl@0: * DESCRIPTION: Declares internal state machine structures. sl@0: */ sl@0: sl@0: #ifndef BOOST_REGEX_V4_STATES_HPP sl@0: #define BOOST_REGEX_V4_STATES_HPP sl@0: sl@0: #ifdef BOOST_HAS_ABI_HEADERS sl@0: # include BOOST_ABI_PREFIX sl@0: #endif sl@0: sl@0: namespace boost{ sl@0: namespace re_detail{ sl@0: sl@0: /*** mask_type ******************************************************* sl@0: Whenever we have a choice of two alternatives, we use an array of bytes sl@0: to indicate which of the two alternatives it is possible to take for any sl@0: given input character. If mask_take is set, then we can take the next sl@0: state, and if mask_skip is set then we can take the alternative. sl@0: ***********************************************************************/ sl@0: enum mask_type sl@0: { sl@0: mask_take = 1, sl@0: mask_skip = 2, sl@0: mask_init = 4, sl@0: mask_any = mask_skip | mask_take, sl@0: mask_all = mask_any sl@0: }; sl@0: sl@0: /*** helpers ********************************************************** sl@0: These helpers let us use function overload resolution to detect whether sl@0: we have narrow or wide character strings: sl@0: ***********************************************************************/ sl@0: struct _narrow_type{}; sl@0: struct _wide_type{}; sl@0: template struct is_byte; sl@0: template<> struct is_byte { typedef _narrow_type width_type; }; sl@0: template<> struct is_byte{ typedef _narrow_type width_type; }; sl@0: template<> struct is_byte { typedef _narrow_type width_type; }; sl@0: template struct is_byte { typedef _wide_type width_type; }; sl@0: sl@0: /*** enum syntax_element_type ****************************************** sl@0: Every record in the state machine falls into one of the following types: sl@0: ***********************************************************************/ sl@0: enum syntax_element_type sl@0: { sl@0: // start of a marked sub-expression, or perl-style (?...) extension sl@0: syntax_element_startmark = 0, sl@0: // end of a marked sub-expression, or perl-style (?...) extension sl@0: syntax_element_endmark = syntax_element_startmark + 1, sl@0: // any sequence of literal characters sl@0: syntax_element_literal = syntax_element_endmark + 1, sl@0: // start of line assertion: ^ sl@0: syntax_element_start_line = syntax_element_literal + 1, sl@0: // end of line assertion $ sl@0: syntax_element_end_line = syntax_element_start_line + 1, sl@0: // match any character: . sl@0: syntax_element_wild = syntax_element_end_line + 1, sl@0: // end of expression: we have a match when we get here sl@0: syntax_element_match = syntax_element_wild + 1, sl@0: // perl style word boundary: \b sl@0: syntax_element_word_boundary = syntax_element_match + 1, sl@0: // perl style within word boundary: \B sl@0: syntax_element_within_word = syntax_element_word_boundary + 1, sl@0: // start of word assertion: \< sl@0: syntax_element_word_start = syntax_element_within_word + 1, sl@0: // end of word assertion: \> sl@0: syntax_element_word_end = syntax_element_word_start + 1, sl@0: // start of buffer assertion: \` sl@0: syntax_element_buffer_start = syntax_element_word_end + 1, sl@0: // end of buffer assertion: \' sl@0: syntax_element_buffer_end = syntax_element_buffer_start + 1, sl@0: // backreference to previously matched sub-expression sl@0: syntax_element_backref = syntax_element_buffer_end + 1, sl@0: // either a wide character set [..] or one with multicharacter collating elements: sl@0: syntax_element_long_set = syntax_element_backref + 1, sl@0: // narrow character set: [...] sl@0: syntax_element_set = syntax_element_long_set + 1, sl@0: // jump to a new state in the machine: sl@0: syntax_element_jump = syntax_element_set + 1, sl@0: // choose between two production states: sl@0: syntax_element_alt = syntax_element_jump + 1, sl@0: // a repeat sl@0: syntax_element_rep = syntax_element_alt + 1, sl@0: // match a combining character sequence sl@0: syntax_element_combining = syntax_element_rep + 1, sl@0: // perl style soft buffer end: \z sl@0: syntax_element_soft_buffer_end = syntax_element_combining + 1, sl@0: // perl style continuation: \G sl@0: syntax_element_restart_continue = syntax_element_soft_buffer_end + 1, sl@0: // single character repeats: sl@0: syntax_element_dot_rep = syntax_element_restart_continue + 1, sl@0: syntax_element_char_rep = syntax_element_dot_rep + 1, sl@0: syntax_element_short_set_rep = syntax_element_char_rep + 1, sl@0: syntax_element_long_set_rep = syntax_element_short_set_rep + 1, sl@0: // a backstep for lookbehind repeats: sl@0: syntax_element_backstep = syntax_element_long_set_rep + 1, sl@0: // an assertion that a mark was matched: sl@0: syntax_element_assert_backref = syntax_element_backstep + 1, sl@0: syntax_element_toggle_case = syntax_element_assert_backref + 1 sl@0: }; sl@0: sl@0: #ifdef BOOST_REGEX_DEBUG sl@0: // dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion sl@0: std::ostream& operator<<(std::ostream&, syntax_element_type); sl@0: #endif sl@0: sl@0: struct re_syntax_base; sl@0: sl@0: /*** union offset_type ************************************************ sl@0: Points to another state in the machine. During machine construction sl@0: we use integral offsets, but these are converted to pointers before sl@0: execution of the machine. sl@0: ***********************************************************************/ sl@0: union offset_type sl@0: { sl@0: re_syntax_base* p; sl@0: std::ptrdiff_t i; sl@0: }; sl@0: sl@0: /*** struct re_syntax_base ******************************************** sl@0: Base class for all states in the machine. sl@0: ***********************************************************************/ sl@0: struct re_syntax_base sl@0: { sl@0: syntax_element_type type; // what kind of state this is sl@0: offset_type next; // next state in the machine sl@0: }; sl@0: sl@0: /*** struct re_brace ************************************************** sl@0: A marked parenthesis. sl@0: ***********************************************************************/ sl@0: struct re_brace : public re_syntax_base sl@0: { sl@0: // The index to match, can be zero (don't mark the sub-expression) sl@0: // or negative (for perl style (?...) extentions): sl@0: int index; sl@0: }; sl@0: sl@0: /*** struct re_dot ************************************************** sl@0: Match anything. sl@0: ***********************************************************************/ sl@0: enum sl@0: { sl@0: dont_care = 1, sl@0: force_not_newline = 0, sl@0: force_newline = 2, sl@0: sl@0: test_not_newline = 2, sl@0: test_newline = 3 sl@0: }; sl@0: struct re_dot : public re_syntax_base sl@0: { sl@0: unsigned char mask; sl@0: }; sl@0: sl@0: /*** struct re_literal ************************************************ sl@0: A string of literals, following this structure will be an sl@0: array of characters: charT[length] sl@0: ***********************************************************************/ sl@0: struct re_literal : public re_syntax_base sl@0: { sl@0: unsigned int length; sl@0: }; sl@0: sl@0: /*** struct re_case ************************************************ sl@0: Indicates whether we are moving to a case insensive block or not sl@0: ***********************************************************************/ sl@0: struct re_case : public re_syntax_base sl@0: { sl@0: bool icase; sl@0: }; sl@0: sl@0: /*** struct re_set_long *********************************************** sl@0: A wide character set of characters, following this structure will be sl@0: an array of type charT: sl@0: First csingles null-terminated strings sl@0: Then 2 * cranges NULL terminated strings sl@0: Then cequivalents NULL terminated strings sl@0: ***********************************************************************/ sl@0: template sl@0: struct re_set_long : public re_syntax_base sl@0: { sl@0: unsigned int csingles, cranges, cequivalents; sl@0: mask_type cclasses; sl@0: mask_type cnclasses; sl@0: bool isnot; sl@0: bool singleton; sl@0: }; sl@0: sl@0: /*** struct re_set **************************************************** sl@0: A set of narrow-characters, matches any of _map which is none-zero sl@0: ***********************************************************************/ sl@0: struct re_set : public re_syntax_base sl@0: { sl@0: unsigned char _map[1 << CHAR_BIT]; sl@0: }; sl@0: sl@0: /*** struct re_jump *************************************************** sl@0: Jump to a new location in the machine (not next). sl@0: ***********************************************************************/ sl@0: struct re_jump : public re_syntax_base sl@0: { sl@0: offset_type alt; // location to jump to sl@0: }; sl@0: sl@0: /*** struct re_alt *************************************************** sl@0: Jump to a new location in the machine (possibly next). sl@0: ***********************************************************************/ sl@0: struct re_alt : public re_jump sl@0: { sl@0: unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump sl@0: unsigned int can_be_null; // true if we match a NULL string sl@0: }; sl@0: sl@0: /*** struct re_repeat ************************************************* sl@0: Repeat a section of the machine sl@0: ***********************************************************************/ sl@0: struct re_repeat : public re_alt sl@0: { sl@0: std::size_t min, max; // min and max allowable repeats sl@0: int id; // Unique identifier for this repeat sl@0: bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches) sl@0: bool greedy; // True if this is a greedy repeat sl@0: }; sl@0: sl@0: /*** enum re_jump_size_type ******************************************* sl@0: Provides compiled size of re_jump structure (allowing for trailing alignment). sl@0: We provide this so we know how manybytes to insert when constructing the machine sl@0: (The value of padding_mask is defined in regex_raw_buffer.hpp). sl@0: ***********************************************************************/ sl@0: enum re_jump_size_type sl@0: { sl@0: re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask), sl@0: re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), sl@0: re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask) sl@0: }; sl@0: sl@0: /*** proc re_is_set_member ********************************************* sl@0: Forward declaration: we'll need this one later... sl@0: ***********************************************************************/ sl@0: sl@0: template sl@0: struct regex_data; sl@0: sl@0: template sl@0: iterator BOOST_REGEX_CALL re_is_set_member(iterator next, sl@0: iterator last, sl@0: const re_set_long* set_, sl@0: const regex_data& e, bool icase); sl@0: sl@0: } // namespace re_detail sl@0: sl@0: } // namespace boost sl@0: sl@0: #ifdef BOOST_HAS_ABI_HEADERS sl@0: # include BOOST_ABI_SUFFIX sl@0: #endif sl@0: sl@0: #endif sl@0: sl@0: