os/ossrv/ossrv_pub/boost_apis/boost/regex/v4/states.hpp
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
     1 /*
     2  *
     3  * Copyright (c) 1998-2002
     4  * John Maddock
     5  *
     6  * Use, modification and distribution are subject to the 
     7  * Boost Software License, Version 1.0. (See accompanying file 
     8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
     9  *
    10  */
    11 
    12  /*
    13   *   LOCATION:    see http://www.boost.org for most recent version.
    14   *   FILE         states.hpp
    15   *   VERSION      see <boost/version.hpp>
    16   *   DESCRIPTION: Declares internal state machine structures.
    17   */
    18 
    19 #ifndef BOOST_REGEX_V4_STATES_HPP
    20 #define BOOST_REGEX_V4_STATES_HPP
    21 
    22 #ifdef BOOST_HAS_ABI_HEADERS
    23 #  include BOOST_ABI_PREFIX
    24 #endif
    25 
    26 namespace boost{
    27 namespace re_detail{
    28 
    29 /*** mask_type *******************************************************
    30 Whenever we have a choice of two alternatives, we use an array of bytes
    31 to indicate which of the two alternatives it is possible to take for any
    32 given input character.  If mask_take is set, then we can take the next 
    33 state, and if mask_skip is set then we can take the alternative.
    34 ***********************************************************************/
    35 enum mask_type
    36 {
    37    mask_take = 1,
    38    mask_skip = 2,
    39    mask_init = 4,
    40    mask_any = mask_skip | mask_take,
    41    mask_all = mask_any
    42 };
    43 
    44 /*** helpers **********************************************************
    45 These helpers let us use function overload resolution to detect whether
    46 we have narrow or wide character strings:
    47 ***********************************************************************/
    48 struct _narrow_type{};
    49 struct _wide_type{};
    50 template <class charT> struct is_byte;
    51 template<>             struct is_byte<char>         { typedef _narrow_type width_type; };
    52 template<>             struct is_byte<unsigned char>{ typedef _narrow_type width_type; };
    53 template<>             struct is_byte<signed char>  { typedef _narrow_type width_type; };
    54 template <class charT> struct is_byte               { typedef _wide_type width_type; };
    55 
    56 /*** enum syntax_element_type ******************************************
    57 Every record in the state machine falls into one of the following types:
    58 ***********************************************************************/
    59 enum syntax_element_type
    60 {
    61    // start of a marked sub-expression, or perl-style (?...) extension
    62    syntax_element_startmark = 0,
    63    // end of a marked sub-expression, or perl-style (?...) extension
    64    syntax_element_endmark = syntax_element_startmark + 1,
    65    // any sequence of literal characters
    66    syntax_element_literal = syntax_element_endmark + 1,
    67    // start of line assertion: ^
    68    syntax_element_start_line = syntax_element_literal + 1,
    69    // end of line assertion $
    70    syntax_element_end_line = syntax_element_start_line + 1,
    71    // match any character: .
    72    syntax_element_wild = syntax_element_end_line + 1,
    73    // end of expression: we have a match when we get here
    74    syntax_element_match = syntax_element_wild + 1,
    75    // perl style word boundary: \b
    76    syntax_element_word_boundary = syntax_element_match + 1,
    77    // perl style within word boundary: \B
    78    syntax_element_within_word = syntax_element_word_boundary + 1,
    79    // start of word assertion: \<
    80    syntax_element_word_start = syntax_element_within_word + 1,
    81    // end of word assertion: \>
    82    syntax_element_word_end = syntax_element_word_start + 1,
    83    // start of buffer assertion: \`
    84    syntax_element_buffer_start = syntax_element_word_end + 1,
    85    // end of buffer assertion: \'
    86    syntax_element_buffer_end = syntax_element_buffer_start + 1,
    87    // backreference to previously matched sub-expression
    88    syntax_element_backref = syntax_element_buffer_end + 1,
    89    // either a wide character set [..] or one with multicharacter collating elements:
    90    syntax_element_long_set = syntax_element_backref + 1,
    91    // narrow character set: [...]
    92    syntax_element_set = syntax_element_long_set + 1,
    93    // jump to a new state in the machine:
    94    syntax_element_jump = syntax_element_set + 1,
    95    // choose between two production states:
    96    syntax_element_alt = syntax_element_jump + 1,
    97    // a repeat
    98    syntax_element_rep = syntax_element_alt + 1,
    99    // match a combining character sequence
   100    syntax_element_combining = syntax_element_rep + 1,
   101    // perl style soft buffer end: \z
   102    syntax_element_soft_buffer_end = syntax_element_combining + 1,
   103    // perl style continuation: \G
   104    syntax_element_restart_continue = syntax_element_soft_buffer_end + 1,
   105    // single character repeats:
   106    syntax_element_dot_rep = syntax_element_restart_continue + 1,
   107    syntax_element_char_rep = syntax_element_dot_rep + 1,
   108    syntax_element_short_set_rep = syntax_element_char_rep + 1,
   109    syntax_element_long_set_rep = syntax_element_short_set_rep + 1,
   110    // a backstep for lookbehind repeats:
   111    syntax_element_backstep = syntax_element_long_set_rep + 1,
   112    // an assertion that a mark was matched:
   113    syntax_element_assert_backref = syntax_element_backstep + 1,
   114    syntax_element_toggle_case = syntax_element_assert_backref + 1
   115 };
   116 
   117 #ifdef BOOST_REGEX_DEBUG
   118 // dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
   119 std::ostream& operator<<(std::ostream&, syntax_element_type);
   120 #endif
   121 
   122 struct re_syntax_base;
   123 
   124 /*** union offset_type ************************************************
   125 Points to another state in the machine.  During machine construction
   126 we use integral offsets, but these are converted to pointers before
   127 execution of the machine.
   128 ***********************************************************************/
   129 union offset_type
   130 {
   131    re_syntax_base*   p;
   132    std::ptrdiff_t    i;
   133 };
   134 
   135 /*** struct re_syntax_base ********************************************
   136 Base class for all states in the machine.
   137 ***********************************************************************/
   138 struct re_syntax_base
   139 {
   140    syntax_element_type   type;         // what kind of state this is
   141    offset_type           next;         // next state in the machine
   142 };
   143 
   144 /*** struct re_brace **************************************************
   145 A marked parenthesis.
   146 ***********************************************************************/
   147 struct re_brace : public re_syntax_base
   148 {
   149    // The index to match, can be zero (don't mark the sub-expression)
   150    // or negative (for perl style (?...) extentions):
   151    int index;
   152 };
   153 
   154 /*** struct re_dot **************************************************
   155 Match anything.
   156 ***********************************************************************/
   157 enum
   158 {
   159    dont_care = 1,
   160    force_not_newline = 0,
   161    force_newline = 2,
   162 
   163    test_not_newline = 2,
   164    test_newline = 3
   165 };
   166 struct re_dot : public re_syntax_base
   167 {
   168    unsigned char mask;
   169 };
   170 
   171 /*** struct re_literal ************************************************
   172 A string of literals, following this structure will be an 
   173 array of characters: charT[length]
   174 ***********************************************************************/
   175 struct re_literal : public re_syntax_base
   176 {
   177    unsigned int length;
   178 };
   179 
   180 /*** struct re_case ************************************************
   181 Indicates whether we are moving to a case insensive block or not
   182 ***********************************************************************/
   183 struct re_case : public re_syntax_base
   184 {
   185    bool icase;
   186 };
   187 
   188 /*** struct re_set_long ***********************************************
   189 A wide character set of characters, following this structure will be
   190 an array of type charT:
   191 First csingles null-terminated strings
   192 Then 2 * cranges NULL terminated strings
   193 Then cequivalents NULL terminated strings
   194 ***********************************************************************/
   195 template <class mask_type>
   196 struct re_set_long : public re_syntax_base
   197 {
   198    unsigned int            csingles, cranges, cequivalents;
   199    mask_type               cclasses;
   200    mask_type               cnclasses;
   201    bool                    isnot;
   202    bool                    singleton;
   203 };
   204 
   205 /*** struct re_set ****************************************************
   206 A set of narrow-characters, matches any of _map which is none-zero
   207 ***********************************************************************/
   208 struct re_set : public re_syntax_base
   209 {
   210    unsigned char _map[1 << CHAR_BIT];
   211 };
   212 
   213 /*** struct re_jump ***************************************************
   214 Jump to a new location in the machine (not next).
   215 ***********************************************************************/
   216 struct re_jump : public re_syntax_base
   217 {
   218    offset_type     alt;                 // location to jump to
   219 };
   220 
   221 /*** struct re_alt ***************************************************
   222 Jump to a new location in the machine (possibly next).
   223 ***********************************************************************/
   224 struct re_alt : public re_jump
   225 {
   226    unsigned char   _map[1 << CHAR_BIT]; // which characters can take the jump
   227    unsigned int    can_be_null;         // true if we match a NULL string
   228 };
   229 
   230 /*** struct re_repeat *************************************************
   231 Repeat a section of the machine
   232 ***********************************************************************/
   233 struct re_repeat : public re_alt
   234 {
   235    std::size_t   min, max;  // min and max allowable repeats
   236    int           id;        // Unique identifier for this repeat
   237    bool          leading;   // True if this repeat is at the start of the machine (lets us optimize some searches)
   238    bool          greedy;    // True if this is a greedy repeat
   239 };
   240 
   241 /*** enum re_jump_size_type *******************************************
   242 Provides compiled size of re_jump structure (allowing for trailing alignment).
   243 We provide this so we know how manybytes to insert when constructing the machine
   244 (The value of padding_mask is defined in regex_raw_buffer.hpp).
   245 ***********************************************************************/
   246 enum re_jump_size_type
   247 {
   248    re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
   249    re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
   250    re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
   251 };
   252 
   253 /*** proc re_is_set_member *********************************************
   254 Forward declaration: we'll need this one later...
   255 ***********************************************************************/
   256 
   257 template<class charT, class traits>
   258 struct regex_data;
   259 
   260 template <class iterator, class charT, class traits_type, class char_classT>
   261 iterator BOOST_REGEX_CALL re_is_set_member(iterator next, 
   262                           iterator last, 
   263                           const re_set_long<char_classT>* set_, 
   264                           const regex_data<charT, traits_type>& e, bool icase);
   265 
   266 } // namespace re_detail
   267 
   268 } // namespace boost
   269 
   270 #ifdef BOOST_HAS_ABI_HEADERS
   271 #  include BOOST_ABI_SUFFIX
   272 #endif
   273 
   274 #endif
   275 
   276