os/ossrv/ossrv_pub/boost_apis/boost/regex/v4/states.hpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
/*
sl@0
     2
 *
sl@0
     3
 * Copyright (c) 1998-2002
sl@0
     4
 * John Maddock
sl@0
     5
 *
sl@0
     6
 * Use, modification and distribution are subject to the 
sl@0
     7
 * Boost Software License, Version 1.0. (See accompanying file 
sl@0
     8
 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
sl@0
     9
 *
sl@0
    10
 */
sl@0
    11
sl@0
    12
 /*
sl@0
    13
  *   LOCATION:    see http://www.boost.org for most recent version.
sl@0
    14
  *   FILE         states.hpp
sl@0
    15
  *   VERSION      see <boost/version.hpp>
sl@0
    16
  *   DESCRIPTION: Declares internal state machine structures.
sl@0
    17
  */
sl@0
    18
sl@0
    19
#ifndef BOOST_REGEX_V4_STATES_HPP
sl@0
    20
#define BOOST_REGEX_V4_STATES_HPP
sl@0
    21
sl@0
    22
#ifdef BOOST_HAS_ABI_HEADERS
sl@0
    23
#  include BOOST_ABI_PREFIX
sl@0
    24
#endif
sl@0
    25
sl@0
    26
namespace boost{
sl@0
    27
namespace re_detail{
sl@0
    28
sl@0
    29
/*** mask_type *******************************************************
sl@0
    30
Whenever we have a choice of two alternatives, we use an array of bytes
sl@0
    31
to indicate which of the two alternatives it is possible to take for any
sl@0
    32
given input character.  If mask_take is set, then we can take the next 
sl@0
    33
state, and if mask_skip is set then we can take the alternative.
sl@0
    34
***********************************************************************/
sl@0
    35
enum mask_type   // bit flags stored per input character in the alternative maps
sl@0
    36
{
sl@0
    37
   mask_take = 1,                      // the next state can be taken
sl@0
    38
   mask_skip = 2,                      // the alternative can be taken
sl@0
    39
   mask_init = 4,                      // NOTE(review): meaning is not shown in this header -- confirm at the use sites
sl@0
    40
   mask_any = mask_skip | mask_take,   // either of the two bits above (value 3)
sl@0
    41
   mask_all = mask_any                 // same value as mask_any; does not include mask_init
sl@0
    42
};
sl@0
    43
sl@0
    44
/*** helpers **********************************************************
sl@0
    45
These helpers let us use function overload resolution to detect whether
sl@0
    46
we have narrow or wide character strings:
sl@0
    47
***********************************************************************/
sl@0
    48
struct _narrow_type{};   // tag type: is_byte<>::width_type for char, unsigned char and signed char
sl@0
    49
struct _wide_type{};     // tag type: is_byte<>::width_type for every other character type
sl@0
    50
template <class charT> struct is_byte;   // primary template: declared here, defined after the specializations
sl@0
    51
template<>             struct is_byte<char>         { typedef _narrow_type width_type; };   // narrow
sl@0
    52
template<>             struct is_byte<unsigned char>{ typedef _narrow_type width_type; };   // narrow
sl@0
    53
template<>             struct is_byte<signed char>  { typedef _narrow_type width_type; };   // narrow
sl@0
    54
template <class charT> struct is_byte               { typedef _wide_type width_type; };     // fallback: anything else is wide
sl@0
    55
sl@0
    56
/*** enum syntax_element_type ******************************************
sl@0
    57
Every record in the state machine falls into one of the following types:
sl@0
    58
***********************************************************************/
sl@0
    59
enum syntax_element_type   // each enumerator is the previous one + 1; resulting values are noted on each line
sl@0
    60
{
sl@0
    61
   // start of a marked sub-expression, or perl-style (?...) extension
sl@0
    62
   syntax_element_startmark = 0,
sl@0
    63
   // end of a marked sub-expression, or perl-style (?...) extension
sl@0
    64
   syntax_element_endmark = syntax_element_startmark + 1,   // value 1
sl@0
    65
   // any sequence of literal characters
sl@0
    66
   syntax_element_literal = syntax_element_endmark + 1,   // value 2
sl@0
    67
   // start of line assertion: ^
sl@0
    68
   syntax_element_start_line = syntax_element_literal + 1,   // value 3
sl@0
    69
   // end of line assertion: $
sl@0
    70
   syntax_element_end_line = syntax_element_start_line + 1,   // value 4
sl@0
    71
   // match any character: .
sl@0
    72
   syntax_element_wild = syntax_element_end_line + 1,   // value 5
sl@0
    73
   // end of expression: we have a match when we get here
sl@0
    74
   syntax_element_match = syntax_element_wild + 1,   // value 6
sl@0
    75
   // perl style word boundary: \b
sl@0
    76
   syntax_element_word_boundary = syntax_element_match + 1,   // value 7
sl@0
    77
   // perl style within word boundary: \B
sl@0
    78
   syntax_element_within_word = syntax_element_word_boundary + 1,   // value 8
sl@0
    79
   // start of word assertion: \<
sl@0
    80
   syntax_element_word_start = syntax_element_within_word + 1,   // value 9
sl@0
    81
   // end of word assertion: \>
sl@0
    82
   syntax_element_word_end = syntax_element_word_start + 1,   // value 10
sl@0
    83
   // start of buffer assertion: \`
sl@0
    84
   syntax_element_buffer_start = syntax_element_word_end + 1,   // value 11
sl@0
    85
   // end of buffer assertion: \'
sl@0
    86
   syntax_element_buffer_end = syntax_element_buffer_start + 1,   // value 12
sl@0
    87
   // backreference to previously matched sub-expression
sl@0
    88
   syntax_element_backref = syntax_element_buffer_end + 1,   // value 13
sl@0
    89
   // either a wide character set [..] or one with multicharacter collating elements:
sl@0
    90
   syntax_element_long_set = syntax_element_backref + 1,   // value 14
sl@0
    91
   // narrow character set: [...]
sl@0
    92
   syntax_element_set = syntax_element_long_set + 1,   // value 15
sl@0
    93
   // jump to a new state in the machine:
sl@0
    94
   syntax_element_jump = syntax_element_set + 1,   // value 16
sl@0
    95
   // choose between two production states:
sl@0
    96
   syntax_element_alt = syntax_element_jump + 1,   // value 17
sl@0
    97
   // a repeat
sl@0
    98
   syntax_element_rep = syntax_element_alt + 1,   // value 18
sl@0
    99
   // match a combining character sequence
sl@0
   100
   syntax_element_combining = syntax_element_rep + 1,   // value 19
sl@0
   101
   // perl style soft buffer end: \z
sl@0
   102
   syntax_element_soft_buffer_end = syntax_element_combining + 1,   // value 20
sl@0
   103
   // perl style continuation: \G
sl@0
   104
   syntax_element_restart_continue = syntax_element_soft_buffer_end + 1,   // value 21
sl@0
   105
   // single character repeats (the next four enumerators):
sl@0
   106
   syntax_element_dot_rep = syntax_element_restart_continue + 1,   // value 22
sl@0
   107
   syntax_element_char_rep = syntax_element_dot_rep + 1,   // value 23
sl@0
   108
   syntax_element_short_set_rep = syntax_element_char_rep + 1,   // value 24
sl@0
   109
   syntax_element_long_set_rep = syntax_element_short_set_rep + 1,   // value 25
sl@0
   110
   // a backstep for lookbehind repeats:
sl@0
   111
   syntax_element_backstep = syntax_element_long_set_rep + 1,   // value 26
sl@0
   112
   // an assertion that a mark was matched:
sl@0
   113
   syntax_element_assert_backref = syntax_element_backstep + 1,   // value 27
sl@0
   114
   syntax_element_toggle_case = syntax_element_assert_backref + 1   // value 28; NOTE(review): presumably the state type used with re_case -- confirm
sl@0
   115
};
sl@0
   116
sl@0
   117
#ifdef BOOST_REGEX_DEBUG
sl@0
   118
// dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
sl@0
   119
std::ostream& operator<<(std::ostream&, syntax_element_type);   // declaration only; the definition is not in this header
sl@0
   120
#endif // BOOST_REGEX_DEBUG
sl@0
   121
sl@0
   122
struct re_syntax_base;   // forward declaration: offset_type below holds a pointer to it
sl@0
   123
sl@0
   124
/*** union offset_type ************************************************
sl@0
   125
Points to another state in the machine.  During machine construction
sl@0
   126
we use integral offsets, but these are converted to pointers before
sl@0
   127
execution of the machine.
sl@0
   128
***********************************************************************/
sl@0
   129
union offset_type
sl@0
   130
{
sl@0
   131
   re_syntax_base*   p;   // pointer form: valid once offsets have been converted, before the machine executes
sl@0
   132
   std::ptrdiff_t    i;   // integral offset form: used while the machine is being constructed
sl@0
   133
};
sl@0
   134
sl@0
   135
/*** struct re_syntax_base ********************************************
sl@0
   136
Base class for all states in the machine.
sl@0
   137
***********************************************************************/
sl@0
   138
struct re_syntax_base   // the other re_* state structs in this header derive from this, directly or indirectly
sl@0
   139
{
sl@0
   140
   syntax_element_type   type;         // what kind of state this is
sl@0
   141
   offset_type           next;         // next state in the machine (an offset_type: integral offset or pointer)
sl@0
   142
};
sl@0
   143
sl@0
   144
/*** struct re_brace **************************************************
sl@0
   145
A marked parenthesis.
sl@0
   146
***********************************************************************/
sl@0
   147
struct re_brace : public re_syntax_base
sl@0
   148
{
sl@0
   149
   // The index to match, can be zero (don't mark the sub-expression)
sl@0
   150
   // or negative (for perl style (?...) extensions):
sl@0
   151
   int index;
sl@0
   152
};
sl@0
   153
sl@0
   154
/*** struct re_dot **************************************************
sl@0
   155
Match anything.
sl@0
   156
***********************************************************************/
sl@0
   157
enum   // unnamed enum; NOTE(review): presumably the values stored in re_dot::mask below -- confirm at the use sites
sl@0
   158
{
sl@0
   159
   dont_care = 1,
sl@0
   160
   force_not_newline = 0,
sl@0
   161
   force_newline = 2,
sl@0
   162
sl@0
   163
   test_not_newline = 2,   // same numeric value as force_newline
sl@0
   164
   test_newline = 3        // equals dont_care | force_newline numerically (1 | 2)
sl@0
   165
};
sl@0
   166
struct re_dot : public re_syntax_base
sl@0
   167
{
sl@0
   168
   unsigned char mask;   // NOTE(review): looks like it holds one of the unnamed enum values above -- confirm
sl@0
   169
};
sl@0
   170
sl@0
   171
/*** struct re_literal ************************************************
sl@0
   172
A string of literals, following this structure will be an 
sl@0
   173
array of characters: charT[length]
sl@0
   174
***********************************************************************/
sl@0
   175
struct re_literal : public re_syntax_base
sl@0
   176
{
sl@0
   177
   unsigned int length;   // number of charT elements in the character array that follows this structure
sl@0
   178
};
sl@0
   179
sl@0
   180
/*** struct re_case ************************************************
sl@0
   181
Indicates whether we are moving to a case insensitive block or not
sl@0
   182
***********************************************************************/
sl@0
   183
struct re_case : public re_syntax_base
sl@0
   184
{
sl@0
   185
   bool icase;   // NOTE(review): presumably true when the block being entered is case insensitive -- confirm
sl@0
   186
};
sl@0
   187
sl@0
   188
/*** struct re_set_long ***********************************************
sl@0
   189
A wide character set of characters, following this structure will be
sl@0
   190
an array of type charT:
sl@0
   191
First csingles null-terminated strings
sl@0
   192
Then 2 * cranges null-terminated strings
sl@0
   193
Then cequivalents null-terminated strings
sl@0
   194
***********************************************************************/
sl@0
   195
template <class mask_type>   // note: inside this template the parameter name hides the enum mask_type declared earlier in this namespace
sl@0
   196
struct re_set_long : public re_syntax_base
sl@0
   197
{
sl@0
   198
   unsigned int            csingles, cranges, cequivalents;   // string counts for the trailing array: csingles, then 2 * cranges, then cequivalents
sl@0
   199
   mask_type               cclasses;    // NOTE(review): presumably the character-class mask that members must match -- confirm
sl@0
   200
   mask_type               cnclasses;   // NOTE(review): presumably the negated character-class mask -- confirm
sl@0
   201
   bool                    isnot;       // NOTE(review): presumably true for a negated set -- confirm
sl@0
   202
   bool                    singleton;   // NOTE(review): meaning is not shown in this header -- confirm at the use sites
sl@0
   203
};
sl@0
   204
sl@0
   205
/*** struct re_set ****************************************************
sl@0
   206
A set of narrow-characters, matches any of _map which is non-zero
sl@0
   207
***********************************************************************/
sl@0
   208
struct re_set : public re_syntax_base
sl@0
   209
{
sl@0
   210
   unsigned char _map[1 << CHAR_BIT];   // 1 << CHAR_BIT entries, i.e. one per possible unsigned char value
sl@0
   211
};
sl@0
   212
sl@0
   213
/*** struct re_jump ***************************************************
sl@0
   214
Jump to a new location in the machine (not next).
sl@0
   215
***********************************************************************/
sl@0
   216
struct re_jump : public re_syntax_base
sl@0
   217
{
sl@0
   218
   offset_type     alt;                 // location to jump to (an offset_type: integral offset or pointer)
sl@0
   219
};
sl@0
   220
sl@0
   221
/*** struct re_alt ***************************************************
sl@0
   222
Jump to a new location in the machine (possibly next).
sl@0
   223
***********************************************************************/
sl@0
   224
struct re_alt : public re_jump   // inherits the jump target `alt` from re_jump
sl@0
   225
{
sl@0
   226
   unsigned char   _map[1 << CHAR_BIT]; // which characters can take the jump
sl@0
   227
   unsigned int    can_be_null;         // true if we match a NULL string; NOTE(review): stored as unsigned int, may hold mask_type bits -- confirm
sl@0
   228
};
sl@0
   229
sl@0
   230
/*** struct re_repeat *************************************************
sl@0
   231
Repeat a section of the machine
sl@0
   232
***********************************************************************/
sl@0
   233
struct re_repeat : public re_alt   // inherits alt, _map and can_be_null via re_alt / re_jump
sl@0
   234
{
sl@0
   235
   std::size_t   min, max;  // min and max allowable repeats
sl@0
   236
   int           id;        // Unique identifier for this repeat
sl@0
   237
   bool          leading;   // True if this repeat is at the start of the machine (lets us optimize some searches)
sl@0
   238
   bool          greedy;    // True if this is a greedy repeat
sl@0
   239
};
sl@0
   240
sl@0
   241
/*** enum re_jump_size_type *******************************************
sl@0
   242
Provides compiled size of re_jump structure (allowing for trailing alignment).
sl@0
   243
We provide this so we know how many bytes to insert when constructing the machine
sl@0
   244
(The value of padding_mask is defined in regex_raw_buffer.hpp).
sl@0
   245
***********************************************************************/
sl@0
   246
enum re_jump_size_type   // each value is (sizeof(T) + padding_mask) & ~padding_mask; a round-up to a multiple of padding_mask + 1, assuming padding_mask is 2^k - 1 (TODO confirm)
sl@0
   247
{
sl@0
   248
   re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),       // padded size of re_jump
sl@0
   249
   re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), // padded size of re_repeat
sl@0
   250
   re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)          // padded size of re_alt
sl@0
   251
};
sl@0
   252
sl@0
   253
/*** proc re_is_set_member *********************************************
sl@0
   254
Forward declaration: we'll need this one later...
sl@0
   255
***********************************************************************/
sl@0
   256
sl@0
   257
template<class charT, class traits>
sl@0
   258
struct regex_data;   // forward declaration only; needed by the re_is_set_member declaration in this header
sl@0
   259
sl@0
   260
template <class iterator, class charT, class traits_type, class char_classT>   // declaration only; no definition in this header
sl@0
   261
iterator BOOST_REGEX_CALL re_is_set_member(iterator next, 
sl@0
   262
                          iterator last, 
sl@0
   263
                          const re_set_long<char_classT>* set_, 
sl@0
   264
                          const regex_data<charT, traits_type>& e, bool icase);
sl@0
   265
sl@0
   266
} // namespace re_detail
sl@0
   267
sl@0
   268
} // namespace boost
sl@0
   269
sl@0
   270
#ifdef BOOST_HAS_ABI_HEADERS
sl@0
   271
#  include BOOST_ABI_SUFFIX
sl@0
   272
#endif
sl@0
   273
sl@0
   274
#endif
sl@0
   275
sl@0
   276