os/ossrv/ossrv_pub/boost_apis/boost/xpressive/regex_compiler.hpp
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
     1 ///////////////////////////////////////////////////////////////////////////////
     2 /// \file regex_compiler.hpp
     3 /// Contains the definition of regex_compiler, a factory for building regex objects
     4 /// from strings.
     5 //
     6 //  Copyright 2004 Eric Niebler. Distributed under the Boost
     7 //  Software License, Version 1.0. (See accompanying file
     8 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
     9 
    10 #ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
    11 #define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
    12 
    13 // MS compatible compilers support #pragma once
    14 #if defined(_MSC_VER) && (_MSC_VER >= 1020)
    15 # pragma once
    16 #endif
    17 
    18 #include <boost/xpressive/basic_regex.hpp>
    19 #include <boost/xpressive/detail/dynamic/parser.hpp>
    20 #include <boost/xpressive/detail/dynamic/parse_charset.hpp>
    21 #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
    22 #include <boost/xpressive/detail/dynamic/parser_traits.hpp>
    23 #include <boost/xpressive/detail/core/linker.hpp>
    24 #include <boost/xpressive/detail/core/optimize.hpp>
    25 
    26 namespace boost { namespace xpressive
    27 {
    28 
    29 ///////////////////////////////////////////////////////////////////////////////
    30 // regex_compiler
    31 //
    32 /// \brief Class template regex_compiler is a factory for building basic_regex objects from a string.
    33 ///
    34 /// Class template regex_compiler is used to construct a basic_regex object from a string. The string
    35 /// should contain a valid regular expression. You can imbue a regex_compiler object with a locale,
    36 /// after which all basic_regex objects created with that regex_compiler object will use that locale.
    37 /// After creating a regex_compiler object, and optionally imbueing it with a locale, you can call the
    38 /// compile() method to construct a basic_regex object, passing it the string representing the regular
    39 /// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex
    40 /// objects compiled from the same string will have different regex_id's.
    41 template<typename BidiIter, typename RegexTraits, typename CompilerTraits>
    42 struct regex_compiler
    43 {
    44     typedef BidiIter iterator_type;
    45     typedef typename iterator_value<BidiIter>::type char_type;
    46     typedef std::basic_string<char_type> string_type;
    47     typedef regex_constants::syntax_option_type flag_type;
    48     typedef RegexTraits traits_type;
    49     typedef typename traits_type::char_class_type char_class_type;
    50     typedef typename traits_type::locale_type locale_type;
    51 
    52     explicit regex_compiler(RegexTraits const &traits = RegexTraits())
    53       : mark_count_(0)
    54       , hidden_mark_count_(0)
    55       , traits_(traits)
    56       , upper_(0)
    57     {
    58         this->upper_ = lookup_classname(this->rxtraits(), "upper");
    59         BOOST_ASSERT(0 != this->upper_);
    60     }
    61 
    62     ///////////////////////////////////////////////////////////////////////////
    63     // imbue
    64     /// Specify the locale to be used by a regex_compiler.
    65     ///
    66     /// \param loc The locale that this regex_compiler should use.
    67     /// \return The previous locale.
    68     locale_type imbue(locale_type loc)
    69     {
    70         locale_type oldloc = this->traits_.imbue(loc);
    71         this->upper_ = lookup_classname(this->rxtraits(), "upper");
    72         BOOST_ASSERT(0 != this->upper_);
    73         return oldloc;
    74     }
    75 
    76     ///////////////////////////////////////////////////////////////////////////
    77     // getloc
    78     /// Get the locale used by a regex_compiler.
    79     ///
    80     /// \param loc The locale that this regex_compiler uses.
    81     locale_type getloc() const
    82     {
    83         return this->traits_.getloc();
    84     }
    85 
    86     ///////////////////////////////////////////////////////////////////////////
    87     // compile
    88     /// Builds a basic_regex object from a std::string.
    89     ///
    90     /// \param  pat A std::string containing the regular expression pattern.
    91     /// \param  flags Optional bitmask that determines how the pat string is interpreted. (See syntax_option_type.)
    92     /// \return A basic_regex object corresponding to the regular expression represented by the string.
    93     /// \pre    The std::string pat contains a valid string-based representation of a regular expression.
    94     /// \throw  regex_error when the string has invalid regular expression syntax.
    95     basic_regex<BidiIter> compile(string_type pat, flag_type flags = regex_constants::ECMAScript)
    96     {
    97         this->reset();
    98         this->traits_.flags(flags);
    99 
   100         string_iterator begin = pat.begin(), end = pat.end();
   101 
   102         // at the top level, a regex is a sequence of alternates
   103         alternates_list alternates;
   104         this->parse_alternates(begin, end, alternates);
   105         detail::ensure(begin == end, regex_constants::error_paren, "mismatched parenthesis");
   106 
   107         // convert the alternates list to the appropriate matcher and terminate the sequence
   108         detail::sequence<BidiIter> seq = detail::alternates_to_matchable(alternates, alternates_factory());
   109         seq += detail::make_dynamic_xpression<BidiIter>(detail::end_matcher());
   110 
   111         // fill in the back-pointers by visiting the regex parse tree
   112         detail::xpression_linker<char_type> linker(this->rxtraits());
   113         seq.first->link(linker);
   114 
   115         // bundle the regex information into a regex_impl object
   116         detail::regex_impl<BidiIter> impl;
   117         impl.xpr_ = seq.first;
   118         impl.traits_.reset(new RegexTraits(this->rxtraits()));
   119         impl.mark_count_ = this->mark_count_;
   120         impl.hidden_mark_count_ = this->hidden_mark_count_;
   121 
   122         // optimization: get the peek chars OR the boyer-moore search string
   123         detail::optimize_regex(impl, this->rxtraits(), detail::is_random<BidiIter>());
   124 
   125         return detail::core_access<BidiIter>::make_regex(impl);
   126     }
   127 
   128 private:
   129 
   130     typedef typename string_type::const_iterator string_iterator;
   131     typedef std::list<detail::sequence<BidiIter> > alternates_list;
   132     typedef detail::escape_value<char_type, char_class_type> escape_value;
   133     typedef detail::alternates_factory_impl<BidiIter, traits_type> alternates_factory;
   134 
   135     ///////////////////////////////////////////////////////////////////////////
   136     // reset
   137     /// INTERNAL ONLY
   138     void reset()
   139     {
   140         this->mark_count_ = 0;
   141         this->hidden_mark_count_ = 0;
   142         this->traits_.flags(regex_constants::ECMAScript);
   143     }
   144 
   145     ///////////////////////////////////////////////////////////////////////////
   146     // regex_traits
   147     /// INTERNAL ONLY
   148     traits_type &rxtraits()
   149     {
   150         return this->traits_.traits();
   151     }
   152 
   153     ///////////////////////////////////////////////////////////////////////////
   154     // regex_traits
   155     /// INTERNAL ONLY
   156     traits_type const &rxtraits() const
   157     {
   158         return this->traits_.traits();
   159     }
   160 
   161     ///////////////////////////////////////////////////////////////////////////
   162     // parse_alternates
   163     /// INTERNAL ONLY
   164     void parse_alternates(string_iterator &begin, string_iterator end, alternates_list &alternates)
   165     {
   166         using namespace regex_constants;
   167         string_iterator old_begin;
   168 
   169         do
   170         {
   171             alternates.push_back(this->parse_sequence(begin, end));
   172             old_begin = begin;
   173         }
   174         while(begin != end && token_alternate == this->traits_.get_token(begin, end));
   175 
   176         begin = old_begin;
   177     }
   178 
   179     ///////////////////////////////////////////////////////////////////////////
   180     // parse_group
   181     /// INTERNAL ONLY
   182     detail::sequence<BidiIter> parse_group(string_iterator &begin, string_iterator end)
   183     {
   184         using namespace regex_constants;
   185         int mark_nbr = 0;
   186         bool keeper = false;
   187         bool lookahead = false;
   188         bool lookbehind = false;
   189         bool negative = false;
   190         std::size_t old_mark_count = this->mark_count_;
   191 
   192         detail::sequence<BidiIter> seq, seq_end;
   193         string_iterator tmp = string_iterator();
   194 
   195         syntax_option_type old_flags = this->traits_.flags();
   196 
   197         switch(this->traits_.get_group_type(begin, end))
   198         {
   199         case token_no_mark:
   200             // Don't process empty groups like (?:) or (?i)
   201             // BUGBUG this doesn't handle the degenerate (?:)+ correctly
   202             if(token_group_end == this->traits_.get_token(tmp = begin, end))
   203             {
   204                 return this->parse_atom(begin = tmp, end);
   205             }
   206             break;
   207 
   208         case token_negative_lookahead:
   209             negative = true; // fall-through
   210         case token_positive_lookahead:
   211             lookahead = true;
   212             seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
   213             break;
   214 
   215         case token_negative_lookbehind:
   216             negative = true; // fall-through
   217         case token_positive_lookbehind:
   218             lookbehind = true;
   219             seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
   220             break;
   221 
   222         case token_independent_sub_expression:
   223             keeper = true;
   224             seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
   225             break;
   226 
   227         case token_comment:
   228             while(detail::ensure(begin != end, error_paren, "mismatched parenthesis"))
   229             {
   230                 switch(this->traits_.get_token(begin, end))
   231                 {
   232                 case token_group_end: return this->parse_atom(begin, end);
   233                 case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence");
   234                 case token_literal: ++begin;
   235                 default:;
   236                 }
   237             }
   238             break;
   239 
   240         default:
   241             mark_nbr = static_cast<int>(++this->mark_count_);
   242             seq = detail::make_dynamic_xpression<BidiIter>(detail::mark_begin_matcher(mark_nbr));
   243             seq_end = detail::make_dynamic_xpression<BidiIter>(detail::mark_end_matcher(mark_nbr));
   244             break;
   245         }
   246 
   247         // alternates
   248         alternates_list alternates;
   249         this->parse_alternates(begin, end, alternates);
   250         detail::ensure
   251         (
   252             begin != end && token_group_end == this->traits_.get_token(begin, end)
   253           , error_paren
   254           , "mismatched parenthesis"
   255         );
   256 
   257         seq += detail::alternates_to_matchable(alternates, alternates_factory());
   258         seq += seq_end;
   259 
   260         typedef shared_ptr<detail::matchable<BidiIter> const> xpr_type;
   261         bool do_save = (this->mark_count_ != old_mark_count);
   262 
   263         if(lookahead)
   264         {
   265             detail::lookahead_matcher<xpr_type> lookahead(seq.first, negative, do_save);
   266             seq = detail::make_dynamic_xpression<BidiIter>(lookahead);
   267         }
   268         else if(lookbehind)
   269         {
   270             detail::lookbehind_matcher<xpr_type> lookbehind(seq.first, negative, do_save);
   271             seq = detail::make_dynamic_xpression<BidiIter>(lookbehind);
   272         }
   273         else if(keeper) // independent sub-expression
   274         {
   275             detail::keeper_matcher<xpr_type> keeper(seq.first, do_save);
   276             seq = detail::make_dynamic_xpression<BidiIter>(keeper);
   277         }
   278 
   279         // restore the modifiers
   280         this->traits_.flags(old_flags);
   281         return seq;
   282     }
   283 
   284     ///////////////////////////////////////////////////////////////////////////
   285     // parse_charset
   286     /// INTERNAL ONLY
   287     detail::sequence<BidiIter> parse_charset(string_iterator &begin, string_iterator end)
   288     {
   289         detail::compound_charset<traits_type> chset;
   290 
   291         // call out to a helper to actually parse the character set
   292         detail::parse_charset(begin, end, chset, this->traits_);
   293 
   294         return detail::make_charset_xpression<BidiIter>
   295         (
   296             chset
   297           , this->rxtraits()
   298           , this->traits_.flags()
   299         );
   300     }
   301 
   302     ///////////////////////////////////////////////////////////////////////////
   303     // parse_atom
   304     /// INTERNAL ONLY
   305     detail::sequence<BidiIter> parse_atom(string_iterator &begin, string_iterator end)
   306     {
   307         using namespace regex_constants;
   308         escape_value esc = { 0, 0, 0, detail::escape_char };
   309         string_iterator old_begin = begin;
   310 
   311         switch(this->traits_.get_token(begin, end))
   312         {
   313         case token_literal:
   314             return detail::make_literal_xpression<BidiIter>
   315             (
   316                 this->parse_literal(begin, end), this->traits_.flags(), this->rxtraits()
   317             );
   318 
   319         case token_any:
   320             return detail::make_any_xpression<BidiIter>(this->traits_.flags(), this->rxtraits());
   321 
   322         case token_assert_begin_sequence:
   323             return detail::make_dynamic_xpression<BidiIter>(detail::assert_bos_matcher());
   324 
   325         case token_assert_end_sequence:
   326             return detail::make_dynamic_xpression<BidiIter>(detail::assert_eos_matcher());
   327 
   328         case token_assert_begin_line:
   329             return detail::make_assert_begin_line<BidiIter>(this->traits_.flags(), this->rxtraits());
   330 
   331         case token_assert_end_line:
   332             return detail::make_assert_end_line<BidiIter>(this->traits_.flags(), this->rxtraits());
   333 
   334         case token_assert_word_boundary:
   335             return detail::make_assert_word<BidiIter>(detail::word_boundary<true>(), this->rxtraits());
   336 
   337         case token_assert_not_word_boundary:
   338             return detail::make_assert_word<BidiIter>(detail::word_boundary<false>(), this->rxtraits());
   339 
   340         case token_assert_word_begin:
   341             return detail::make_assert_word<BidiIter>(detail::word_begin(), this->rxtraits());
   342 
   343         case token_assert_word_end:
   344             return detail::make_assert_word<BidiIter>(detail::word_end(), this->rxtraits());
   345 
   346         case token_escape:
   347             esc = this->parse_escape(begin, end);
   348             switch(esc.type_)
   349             {
   350             case detail::escape_mark:
   351                 return detail::make_backref_xpression<BidiIter>
   352                 (
   353                     esc.mark_nbr_, this->traits_.flags(), this->rxtraits()
   354                 );
   355             case detail::escape_char:
   356                 return detail::make_char_xpression<BidiIter>
   357                 (
   358                     esc.ch_, this->traits_.flags(), this->rxtraits()
   359                 );
   360             case detail::escape_class:
   361                 return detail::make_posix_charset_xpression<BidiIter>
   362                 (
   363                     esc.class_
   364                   , this->rxtraits().isctype(*begin++, this->upper_)
   365                   , this->traits_.flags()
   366                   , this->rxtraits()
   367                 );
   368             }
   369 
   370         case token_group_begin:
   371             return this->parse_group(begin, end);
   372 
   373         case token_charset_begin:
   374             return this->parse_charset(begin, end);
   375 
   376         case token_invalid_quantifier:
   377             throw regex_error(error_badrepeat, "quantifier not expected");
   378 
   379         case token_quote_meta_begin:
   380             return detail::make_literal_xpression<BidiIter>
   381             (
   382                 this->parse_quote_meta(begin, end), this->traits_.flags(), this->rxtraits()
   383             );
   384 
   385         case token_quote_meta_end:
   386             throw regex_error
   387             (
   388                 error_escape
   389               , "found quote-meta end without corresponding quote-meta begin"
   390             );
   391 
   392         case token_end_of_pattern:
   393             break;
   394 
   395         default:
   396             begin = old_begin;
   397             break;
   398         }
   399 
   400         return detail::sequence<BidiIter>();
   401     }
   402 
   403     ///////////////////////////////////////////////////////////////////////////
   404     // parse_quant
   405     /// INTERNAL ONLY
   406     detail::sequence<BidiIter> parse_quant(string_iterator &begin, string_iterator end)
   407     {
   408         BOOST_ASSERT(begin != end);
   409         detail::quant_spec spec = { 0, 0, false };
   410         detail::sequence<BidiIter> seq = this->parse_atom(begin, end);
   411 
   412         // BUGBUG this doesn't handle the degenerate (?:)+ correctly
   413         if(!seq.is_empty() && begin != end && seq.first->is_quantifiable())
   414         {
   415             if(this->traits_.get_quant_spec(begin, end, spec))
   416             {
   417                 BOOST_ASSERT(spec.min_ <= spec.max_);
   418 
   419                 if(0 == spec.max_) // quant {0,0} is degenerate -- matches nothing.
   420                 {
   421                     seq = this->parse_quant(begin, end);
   422                 }
   423                 else
   424                 {
   425                     seq = seq.first->quantify(spec, this->hidden_mark_count_, seq, alternates_factory());
   426                 }
   427             }
   428         }
   429 
   430         return seq;
   431     }
   432 
   433     ///////////////////////////////////////////////////////////////////////////
   434     // parse_sequence
   435     /// INTERNAL ONLY
   436     detail::sequence<BidiIter> parse_sequence(string_iterator &begin, string_iterator end)
   437     {
   438         detail::sequence<BidiIter> seq;
   439 
   440         while(begin != end)
   441         {
   442             detail::sequence<BidiIter> seq_quant = this->parse_quant(begin, end);
   443 
   444             // did we find a quantified atom?
   445             if(seq_quant.is_empty())
   446                 break;
   447 
   448             // chain it to the end of the xpression sequence
   449             seq += seq_quant;
   450         }
   451 
   452         return seq;
   453     }
   454 
   455     ///////////////////////////////////////////////////////////////////////////
   456     // parse_literal
   457     //  scan ahead looking for char literals to be globbed together into a string literal
   458     /// INTERNAL ONLY
   459     string_type parse_literal(string_iterator &begin, string_iterator end)
   460     {
   461         using namespace regex_constants;
   462         BOOST_ASSERT(begin != end);
   463         BOOST_ASSERT(token_literal == this->traits_.get_token(begin, end));
   464         escape_value esc = { 0, 0, 0, detail::escape_char };
   465         string_type literal(1, *begin);
   466 
   467         for(string_iterator prev = begin, tmp = ++begin; begin != end; prev = begin, begin = tmp)
   468         {
   469             detail::quant_spec spec;
   470             if(this->traits_.get_quant_spec(tmp, end, spec))
   471             {
   472                 if(literal.size() != 1)
   473                 {
   474                     begin = prev;
   475                     literal.erase(literal.size() - 1);
   476                 }
   477                 return literal;
   478             }
   479             else switch(this->traits_.get_token(tmp, end))
   480             {
   481             case token_escape:
   482                 esc = this->parse_escape(tmp, end);
   483                 if(detail::escape_char != esc.type_) return literal;
   484                 literal += esc.ch_;
   485                 break;
   486             case token_literal:
   487                 literal += *tmp++;
   488                 break;
   489             default:
   490                 return literal;
   491             }
   492         }
   493 
   494         return literal;
   495     }
   496 
   497     ///////////////////////////////////////////////////////////////////////////
   498     // parse_quote_meta
   499     //  scan ahead looking for char literals to be globbed together into a string literal
   500     /// INTERNAL ONLY
   501     string_type parse_quote_meta(string_iterator &begin, string_iterator end)
   502     {
   503         using namespace regex_constants;
   504         string_iterator old_begin = begin, old_end;
   505         while(end != (old_end = begin))
   506         {
   507             switch(this->traits_.get_token(begin, end))
   508             {
   509             case token_quote_meta_end: return string_type(old_begin, old_end);
   510             case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence");
   511             case token_literal: ++begin;
   512             default:;
   513             }
   514         }
   515         return string_type(old_begin, begin);
   516     }
   517 
   518     ///////////////////////////////////////////////////////////////////////////////
   519     // parse_escape
   520     /// INTERNAL ONLY
   521     escape_value parse_escape(string_iterator &begin, string_iterator end)
   522     {
   523         detail::ensure(begin != end, regex_constants::error_escape, "incomplete escape sequence");
   524 
   525         // first, check to see if this can be a backreference
   526         if(0 < this->rxtraits().value(*begin, 10))
   527         {
   528             // Parse at most 3 decimal digits.
   529             string_iterator tmp = begin;
   530             int mark_nbr = detail::toi(tmp, end, this->rxtraits(), 10, 999);
   531 
   532             // If the resulting number could conceivably be a backref, then it is.
   533             if(10 > mark_nbr || mark_nbr <= static_cast<int>(this->mark_count_))
   534             {
   535                 begin = tmp;
   536                 escape_value esc = {0, mark_nbr, 0, detail::escape_mark};
   537                 return esc;
   538             }
   539         }
   540 
   541         // Not a backreference, defer to the parse_escape helper
   542         return detail::parse_escape(begin, end, this->traits_);
   543     }
   544 
   545     std::size_t mark_count_;
   546     std::size_t hidden_mark_count_;
   547     CompilerTraits traits_;
   548     typename RegexTraits::char_class_type upper_;
   549 };
   550 
   551 }} // namespace boost::xpressive
   552 
   553 #endif