1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/ossrv_pub/boost_apis/boost/xpressive/regex_compiler.hpp Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,553 @@
1.4 +///////////////////////////////////////////////////////////////////////////////
1.5 +/// \file regex_compiler.hpp
1.6 +/// Contains the definition of regex_compiler, a factory for building regex objects
1.7 +/// from strings.
1.8 +//
1.9 +// Copyright 2004 Eric Niebler. Distributed under the Boost
1.10 +// Software License, Version 1.0. (See accompanying file
1.11 +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
1.12 +
1.13 +#ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
1.14 +#define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
1.15 +
1.16 +// MS compatible compilers support #pragma once
1.17 +#if defined(_MSC_VER) && (_MSC_VER >= 1020)
1.18 +# pragma once
1.19 +#endif
1.20 +
1.21 +#include <boost/xpressive/basic_regex.hpp>
1.22 +#include <boost/xpressive/detail/dynamic/parser.hpp>
1.23 +#include <boost/xpressive/detail/dynamic/parse_charset.hpp>
1.24 +#include <boost/xpressive/detail/dynamic/parser_enum.hpp>
1.25 +#include <boost/xpressive/detail/dynamic/parser_traits.hpp>
1.26 +#include <boost/xpressive/detail/core/linker.hpp>
1.27 +#include <boost/xpressive/detail/core/optimize.hpp>
1.28 +
1.29 +namespace boost { namespace xpressive
1.30 +{
1.31 +
1.32 +///////////////////////////////////////////////////////////////////////////////
1.33 +// regex_compiler
1.34 +//
1.35 +/// \brief Class template regex_compiler is a factory for building basic_regex objects from a string.
1.36 +///
1.37 +/// Class template regex_compiler is used to construct a basic_regex object from a string. The string
1.38 +/// should contain a valid regular expression. You can imbue a regex_compiler object with a locale,
1.39 +/// after which all basic_regex objects created with that regex_compiler object will use that locale.
1.40 +/// After creating a regex_compiler object, and optionally imbuing it with a locale, you can call the
1.41 +/// compile() method to construct a basic_regex object, passing it the string representing the regular
1.42 +/// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex
1.43 +/// objects compiled from the same string will have different regex_id's.
1.44 +template<typename BidiIter, typename RegexTraits, typename CompilerTraits>
1.45 +struct regex_compiler
1.46 +{
1.47 + typedef BidiIter iterator_type; // iterator type the produced basic_regex is parameterized on
1.48 + typedef typename iterator_value<BidiIter>::type char_type; // character type obtained from BidiIter via iterator_value
1.49 + typedef std::basic_string<char_type> string_type; // type of the pattern string accepted by compile()
1.50 + typedef regex_constants::syntax_option_type flag_type; // type of the flags accepted by compile()
1.51 + typedef RegexTraits traits_type;
1.52 + typedef typename traits_type::char_class_type char_class_type;
1.53 + typedef typename traits_type::locale_type locale_type; // locale type taken and returned by imbue() / getloc()
1.54 +
1.55 + explicit regex_compiler(RegexTraits const &traits = RegexTraits()) // initializes the compiler traits from \c traits
1.56 + : mark_count_(0)
1.57 + , hidden_mark_count_(0)
1.58 + , traits_(traits)
1.59 + , upper_(0) // placeholder, replaced in the constructor body
1.60 + {
1.61 + this->upper_ = lookup_classname(this->rxtraits(), "upper"); // cache the character class named "upper"
1.62 + BOOST_ASSERT(0 != this->upper_); // the lookup is expected to yield a non-zero class
1.63 + }
1.64 +
1.65 + ///////////////////////////////////////////////////////////////////////////
1.66 + // imbue
1.67 + /// Specify the locale to be used by a regex_compiler.
1.68 + ///
1.69 + /// \param loc The locale that this regex_compiler should use.
1.70 + /// \return The previous locale.
1.71 + locale_type imbue(locale_type loc)
1.72 + {
1.73 + locale_type oldloc = this->traits_.imbue(loc); // keep what the traits hand back so it can be returned
1.74 + this->upper_ = lookup_classname(this->rxtraits(), "upper"); // look up the cached "upper" class again after the locale change
1.75 + BOOST_ASSERT(0 != this->upper_); // same expectation as in the constructor
1.76 + return oldloc;
1.77 + }
1.78 +
1.79 + ///////////////////////////////////////////////////////////////////////////
1.80 + // getloc
1.81 + /// Get the locale used by a regex_compiler.
1.82 + ///
1.83 + /// \return The locale that this regex_compiler uses.
1.84 + locale_type getloc() const
1.85 + {
1.86 + return this->traits_.getloc(); // forwarded to the compiler traits
1.87 + }
1.88 +
1.89 + ///////////////////////////////////////////////////////////////////////////
1.90 + // compile
1.91 + /// Builds a basic_regex object from a std::string.
1.92 + ///
1.93 + /// \param pat A std::string containing the regular expression pattern.
1.94 + /// \param flags Optional bitmask that determines how the pat string is interpreted. (See syntax_option_type.)
1.95 + /// \return A basic_regex object corresponding to the regular expression represented by the string.
1.96 + /// \pre The std::string pat contains a valid string-based representation of a regular expression.
1.97 + /// \throw regex_error when the string has invalid regular expression syntax.
1.98 + basic_regex<BidiIter> compile(string_type pat, flag_type flags = regex_constants::ECMAScript)
1.99 + {
1.100 + this->reset(); // zero the mark counters and restore the default flags before parsing
1.101 + this->traits_.flags(flags); // then apply the flags requested for this pattern
1.102 +
1.103 + string_iterator begin = pat.begin(), end = pat.end();
1.104 +
1.105 + // at the top level, a regex is a sequence of alternates
1.106 + alternates_list alternates;
1.107 + this->parse_alternates(begin, end, alternates);
1.108 + detail::ensure(begin == end, regex_constants::error_paren, "mismatched parenthesis"); // the whole pattern must have been consumed
1.109 +
1.110 + // convert the alternates list to the appropriate matcher and terminate the sequence
1.111 + detail::sequence<BidiIter> seq = detail::alternates_to_matchable(alternates, alternates_factory());
1.112 + seq += detail::make_dynamic_xpression<BidiIter>(detail::end_matcher());
1.113 +
1.114 + // fill in the back-pointers by visiting the regex parse tree
1.115 + detail::xpression_linker<char_type> linker(this->rxtraits());
1.116 + seq.first->link(linker);
1.117 +
1.118 + // bundle the regex information into a regex_impl object
1.119 + detail::regex_impl<BidiIter> impl;
1.120 + impl.xpr_ = seq.first;
1.121 + impl.traits_.reset(new RegexTraits(this->rxtraits())); // the impl gets its own copy of the regex traits
1.122 + impl.mark_count_ = this->mark_count_;
1.123 + impl.hidden_mark_count_ = this->hidden_mark_count_;
1.124 +
1.125 + // optimization: get the peek chars OR the boyer-moore search string
1.126 + detail::optimize_regex(impl, this->rxtraits(), detail::is_random<BidiIter>());
1.127 +
1.128 + return detail::core_access<BidiIter>::make_regex(impl);
1.129 + }
1.130 +
1.131 +private:
1.132 +
1.133 + typedef typename string_type::const_iterator string_iterator; // cursor type used to walk the pattern string
1.134 + typedef std::list<detail::sequence<BidiIter> > alternates_list; // one sequence per alternative, filled by parse_alternates
1.135 + typedef detail::escape_value<char_type, char_class_type> escape_value; // result type of parse_escape
1.136 + typedef detail::alternates_factory_impl<BidiIter, traits_type> alternates_factory; // passed to alternates_to_matchable() and quantify()
1.137 +
1.138 + ///////////////////////////////////////////////////////////////////////////
1.139 + // reset
1.140 + /// INTERNAL ONLY
1.141 + void reset() // puts the per-pattern parser state back to its initial values
1.142 + {
1.143 + this->mark_count_ = 0;
1.144 + this->hidden_mark_count_ = 0;
1.145 + this->traits_.flags(regex_constants::ECMAScript); // same value as the default of compile()'s flags parameter
1.146 + }
1.147 +
1.148 + ///////////////////////////////////////////////////////////////////////////
1.149 + // rxtraits (mutable access to the regex traits held by traits_)
1.150 + /// INTERNAL ONLY
1.151 + traits_type &rxtraits()
1.152 + {
1.153 + return this->traits_.traits();
1.154 + }
1.155 +
1.156 + ///////////////////////////////////////////////////////////////////////////
1.157 + // rxtraits (read-only access to the regex traits held by traits_)
1.158 + /// INTERNAL ONLY
1.159 + traits_type const &rxtraits() const
1.160 + {
1.161 + return this->traits_.traits();
1.162 + }
1.163 +
1.164 + ///////////////////////////////////////////////////////////////////////////
1.165 + // parse_alternates (appends one parsed sequence per alternative to \c alternates)
1.166 + /// INTERNAL ONLY
1.167 + void parse_alternates(string_iterator &begin, string_iterator end, alternates_list &alternates)
1.168 + {
1.169 + using namespace regex_constants;
1.170 + string_iterator old_begin; // position just after the most recently parsed sequence
1.171 +
1.172 + do
1.173 + {
1.174 + alternates.push_back(this->parse_sequence(begin, end));
1.175 + old_begin = begin; // remember where the sequence stopped before the loop test reads a token
1.176 + }
1.177 + while(begin != end && token_alternate == this->traits_.get_token(begin, end)); // keep going while an alternation token follows
1.178 +
1.179 + begin = old_begin; // rewind: the loop test may have moved begin past a token that was not an alternation
1.180 + }
1.181 +
1.182 + ///////////////////////////////////////////////////////////////////////////
1.183 + // parse_group (called by parse_atom after it has read a token_group_begin)
1.184 + /// INTERNAL ONLY
1.185 + detail::sequence<BidiIter> parse_group(string_iterator &begin, string_iterator end)
1.186 + {
1.187 + using namespace regex_constants;
1.188 + int mark_nbr = 0;
1.189 + bool keeper = false;
1.190 + bool lookahead = false;
1.191 + bool lookbehind = false;
1.192 + bool negative = false;
1.193 + std::size_t old_mark_count = this->mark_count_; // compared below to see whether this group added any marks
1.194 +
1.195 + detail::sequence<BidiIter> seq, seq_end;
1.196 + string_iterator tmp = string_iterator();
1.197 +
1.198 + syntax_option_type old_flags = this->traits_.flags(); // saved so the flags can be put back at the end of the group
1.199 +
1.200 + switch(this->traits_.get_group_type(begin, end)) // the group type decides which of the cases below applies
1.201 + {
1.202 + case token_no_mark:
1.203 + // Don't process empty groups like (?:) or (?i)
1.204 + // BUGBUG this doesn't handle the degenerate (?:)+ correctly
1.205 + if(token_group_end == this->traits_.get_token(tmp = begin, end)) // peek at the next token through the copy tmp
1.206 + {
1.207 + return this->parse_atom(begin = tmp, end); // empty group: continue with the next atom; NOTE: old_flags is not restored on this path
1.208 + }
1.209 + break;
1.210 +
1.211 + case token_negative_lookahead:
1.212 + negative = true; // fall-through
1.213 + case token_positive_lookahead:
1.214 + lookahead = true;
1.215 + seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
1.216 + break;
1.217 +
1.218 + case token_negative_lookbehind:
1.219 + negative = true; // fall-through
1.220 + case token_positive_lookbehind:
1.221 + lookbehind = true;
1.222 + seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
1.223 + break;
1.224 +
1.225 + case token_independent_sub_expression:
1.226 + keeper = true;
1.227 + seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
1.228 + break;
1.229 +
1.230 + case token_comment:
1.231 + while(detail::ensure(begin != end, error_paren, "mismatched parenthesis")) // skip the comment text; running out of pattern is an error_paren
1.232 + {
1.233 + switch(this->traits_.get_token(begin, end))
1.234 + {
1.235 + case token_group_end: return this->parse_atom(begin, end); // comment closed: continue with the next atom (old_flags is not restored on this path)
1.236 + case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence"); // fall-through
1.237 + case token_literal: ++begin; // step over one character; fall-through
1.238 + default:;
1.239 + }
1.240 + }
1.241 + break;
1.242 +
1.243 + default:
1.244 + mark_nbr = static_cast<int>(++this->mark_count_); // this group gets the next mark number
1.245 + seq = detail::make_dynamic_xpression<BidiIter>(detail::mark_begin_matcher(mark_nbr));
1.246 + seq_end = detail::make_dynamic_xpression<BidiIter>(detail::mark_end_matcher(mark_nbr));
1.247 + break;
1.248 + }
1.249 +
1.250 + // alternates
1.251 + alternates_list alternates;
1.252 + this->parse_alternates(begin, end, alternates);
1.253 + detail::ensure
1.254 + (
1.255 + begin != end && token_group_end == this->traits_.get_token(begin, end)
1.256 + , error_paren
1.257 + , "mismatched parenthesis"
1.258 + );
1.259 +
1.260 + seq += detail::alternates_to_matchable(alternates, alternates_factory());
1.261 + seq += seq_end;
1.262 +
1.263 + typedef shared_ptr<detail::matchable<BidiIter> const> xpr_type;
1.264 + bool do_save = (this->mark_count_ != old_mark_count); // true when parsing this group increased mark_count_
1.265 +
1.266 + if(lookahead)
1.267 + {
1.268 + detail::lookahead_matcher<xpr_type> lookahead(seq.first, negative, do_save); // NOTE: shadows the bool flag of the same name inside this scope
1.269 + seq = detail::make_dynamic_xpression<BidiIter>(lookahead);
1.270 + }
1.271 + else if(lookbehind)
1.272 + {
1.273 + detail::lookbehind_matcher<xpr_type> lookbehind(seq.first, negative, do_save); // NOTE: shadows the bool flag of the same name inside this scope
1.274 + seq = detail::make_dynamic_xpression<BidiIter>(lookbehind);
1.275 + }
1.276 + else if(keeper) // independent sub-expression
1.277 + {
1.278 + detail::keeper_matcher<xpr_type> keeper(seq.first, do_save); // NOTE: shadows the bool flag of the same name inside this scope
1.279 + seq = detail::make_dynamic_xpression<BidiIter>(keeper);
1.280 + }
1.281 +
1.282 + // restore the modifiers
1.283 + this->traits_.flags(old_flags);
1.284 + return seq;
1.285 + }
1.286 +
1.287 + ///////////////////////////////////////////////////////////////////////////
1.288 + // parse_charset (called by parse_atom after it has read a token_charset_begin)
1.289 + /// INTERNAL ONLY
1.290 + detail::sequence<BidiIter> parse_charset(string_iterator &begin, string_iterator end)
1.291 + {
1.292 + detail::compound_charset<traits_type> chset; // filled in by the helper below
1.293 +
1.294 + // call out to a helper to actually parse the character set
1.295 + detail::parse_charset(begin, end, chset, this->traits_);
1.296 +
1.297 + return detail::make_charset_xpression<BidiIter>
1.298 + (
1.299 + chset
1.300 + , this->rxtraits()
1.301 + , this->traits_.flags()
1.302 + );
1.303 + }
1.304 +
1.305 + ///////////////////////////////////////////////////////////////////////////
1.306 + // parse_atom (reads one token and returns the matching expression, or an empty sequence)
1.307 + /// INTERNAL ONLY
1.308 + detail::sequence<BidiIter> parse_atom(string_iterator &begin, string_iterator end)
1.309 + {
1.310 + using namespace regex_constants;
1.311 + escape_value esc = { 0, 0, 0, detail::escape_char };
1.312 + string_iterator old_begin = begin; // lets the default case below put begin back
1.313 +
1.314 + switch(this->traits_.get_token(begin, end))
1.315 + {
1.316 + case token_literal:
1.317 + return detail::make_literal_xpression<BidiIter>
1.318 + (
1.319 + this->parse_literal(begin, end), this->traits_.flags(), this->rxtraits()
1.320 + );
1.321 +
1.322 + case token_any:
1.323 + return detail::make_any_xpression<BidiIter>(this->traits_.flags(), this->rxtraits());
1.324 +
1.325 + case token_assert_begin_sequence:
1.326 + return detail::make_dynamic_xpression<BidiIter>(detail::assert_bos_matcher());
1.327 +
1.328 + case token_assert_end_sequence:
1.329 + return detail::make_dynamic_xpression<BidiIter>(detail::assert_eos_matcher());
1.330 +
1.331 + case token_assert_begin_line:
1.332 + return detail::make_assert_begin_line<BidiIter>(this->traits_.flags(), this->rxtraits());
1.333 +
1.334 + case token_assert_end_line:
1.335 + return detail::make_assert_end_line<BidiIter>(this->traits_.flags(), this->rxtraits());
1.336 +
1.337 + case token_assert_word_boundary:
1.338 + return detail::make_assert_word<BidiIter>(detail::word_boundary<true>(), this->rxtraits());
1.339 +
1.340 + case token_assert_not_word_boundary:
1.341 + return detail::make_assert_word<BidiIter>(detail::word_boundary<false>(), this->rxtraits());
1.342 +
1.343 + case token_assert_word_begin:
1.344 + return detail::make_assert_word<BidiIter>(detail::word_begin(), this->rxtraits());
1.345 +
1.346 + case token_assert_word_end:
1.347 + return detail::make_assert_word<BidiIter>(detail::word_end(), this->rxtraits());
1.348 +
1.349 + case token_escape:
1.350 + esc = this->parse_escape(begin, end);
1.351 + switch(esc.type_) // the escape's type selects which kind of expression is built
1.352 + {
1.353 + case detail::escape_mark:
1.354 + return detail::make_backref_xpression<BidiIter>
1.355 + (
1.356 + esc.mark_nbr_, this->traits_.flags(), this->rxtraits()
1.357 + );
1.358 + case detail::escape_char:
1.359 + return detail::make_char_xpression<BidiIter>
1.360 + (
1.361 + esc.ch_, this->traits_.flags(), this->rxtraits()
1.362 + );
1.363 + case detail::escape_class:
1.364 + return detail::make_posix_charset_xpression<BidiIter>
1.365 + (
1.366 + esc.class_
1.367 + , this->rxtraits().isctype(*begin++, this->upper_) // consumes one pattern character and tests it against the "upper" class
1.368 + , this->traits_.flags()
1.369 + , this->rxtraits()
1.370 + );
1.371 + } // NOTE(review): an esc.type_ not listed above would fall through into token_group_begin below
1.372 +
1.373 + case token_group_begin:
1.374 + return this->parse_group(begin, end);
1.375 +
1.376 + case token_charset_begin:
1.377 + return this->parse_charset(begin, end);
1.378 +
1.379 + case token_invalid_quantifier:
1.380 + throw regex_error(error_badrepeat, "quantifier not expected");
1.381 +
1.382 + case token_quote_meta_begin:
1.383 + return detail::make_literal_xpression<BidiIter>
1.384 + (
1.385 + this->parse_quote_meta(begin, end), this->traits_.flags(), this->rxtraits()
1.386 + );
1.387 +
1.388 + case token_quote_meta_end:
1.389 + throw regex_error
1.390 + (
1.391 + error_escape
1.392 + , "found quote-meta end without corresponding quote-meta begin"
1.393 + );
1.394 +
1.395 + case token_end_of_pattern:
1.396 + break; // nothing left: the empty sequence below is returned
1.397 +
1.398 + default:
1.399 + begin = old_begin; // not an atom: put begin back so the caller sees the token again
1.400 + break;
1.401 + }
1.402 +
1.403 + return detail::sequence<BidiIter>(); // an empty sequence tells the caller that no atom was parsed
1.404 + }
1.405 +
1.406 + ///////////////////////////////////////////////////////////////////////////
1.407 + // parse_quant (parses one atom together with an optional quantifier that follows it)
1.408 + /// INTERNAL ONLY
1.409 + detail::sequence<BidiIter> parse_quant(string_iterator &begin, string_iterator end)
1.410 + {
1.411 + BOOST_ASSERT(begin != end); // callers must not invoke this at the end of the pattern
1.412 + detail::quant_spec spec = { 0, 0, false };
1.413 + detail::sequence<BidiIter> seq = this->parse_atom(begin, end);
1.414 +
1.415 + // BUGBUG this doesn't handle the degenerate (?:)+ correctly
1.416 + if(!seq.is_empty() && begin != end && seq.first->is_quantifiable())
1.417 + {
1.418 + if(this->traits_.get_quant_spec(begin, end, spec))
1.419 + {
1.420 + BOOST_ASSERT(spec.min_ <= spec.max_);
1.421 +
1.422 + if(0 == spec.max_) // quant {0,0} is degenerate -- matches nothing.
1.423 + {
1.424 + seq = (begin != end) ? this->parse_quant(begin, end) : detail::sequence<BidiIter>(); // drop the atom; at end of pattern return empty instead of recursing into the assert above
1.425 + }
1.426 + else
1.427 + {
1.428 + seq = seq.first->quantify(spec, this->hidden_mark_count_, seq, alternates_factory());
1.429 + }
1.430 + }
1.431 + }
1.432 +
1.433 + return seq;
1.434 + }
1.435 +
1.436 + ///////////////////////////////////////////////////////////////////////////
1.437 + // parse_sequence (chains quantified atoms until parse_quant returns an empty sequence or the pattern ends)
1.438 + /// INTERNAL ONLY
1.439 + detail::sequence<BidiIter> parse_sequence(string_iterator &begin, string_iterator end)
1.440 + {
1.441 + detail::sequence<BidiIter> seq; // stays empty when no atom could be parsed at all
1.442 +
1.443 + while(begin != end)
1.444 + {
1.445 + detail::sequence<BidiIter> seq_quant = this->parse_quant(begin, end);
1.446 +
1.447 + // did we find a quantified atom?
1.448 + if(seq_quant.is_empty())
1.449 + break;
1.450 +
1.451 + // chain it to the end of the xpression sequence
1.452 + seq += seq_quant;
1.453 + }
1.454 +
1.455 + return seq;
1.456 + }
1.457 +
1.458 + ///////////////////////////////////////////////////////////////////////////
1.459 + // parse_literal
1.460 + // scan ahead looking for char literals to be globbed together into a string literal
1.461 + /// INTERNAL ONLY
1.462 + string_type parse_literal(string_iterator &begin, string_iterator end)
1.463 + {
1.464 + using namespace regex_constants;
1.465 + BOOST_ASSERT(begin != end);
1.466 + BOOST_ASSERT(token_literal == this->traits_.get_token(begin, end)); // NOTE(review): get_token receives begin itself here, only when asserts are enabled -- presumably it does not move begin for a literal; confirm
1.467 + escape_value esc = { 0, 0, 0, detail::escape_char };
1.468 + string_type literal(1, *begin); // the literal starts with the character at begin
1.469 +
1.470 + for(string_iterator prev = begin, tmp = ++begin; begin != end; prev = begin, begin = tmp) // tmp scans ahead; begin is only moved up to tmp after a character was appended
1.471 + {
1.472 + detail::quant_spec spec;
1.473 + if(this->traits_.get_quant_spec(tmp, end, spec)) // a quantifier follows the characters gathered so far
1.474 + {
1.475 + if(literal.size() != 1)
1.476 + {
1.477 + begin = prev; // give the last character back so it is parsed again on its own,
1.478 + literal.erase(literal.size() - 1); // and remove it from the string returned here
1.479 + }
1.480 + return literal;
1.481 + }
1.482 + else switch(this->traits_.get_token(tmp, end))
1.483 + {
1.484 + case token_escape:
1.485 + esc = this->parse_escape(tmp, end);
1.486 + if(detail::escape_char != esc.type_) return literal; // only character escapes are appended; begin has not been moved up to tmp
1.487 + literal += esc.ch_;
1.488 + break;
1.489 + case token_literal:
1.490 + literal += *tmp++;
1.491 + break;
1.492 + default:
1.493 + return literal; // any other token ends the literal; begin has not been moved up to tmp
1.494 + }
1.495 + }
1.496 +
1.497 + return literal;
1.498 + }
1.499 +
1.500 + ///////////////////////////////////////////////////////////////////////////
1.501 + // parse_quote_meta
1.502 + // returns the pattern text from begin up to the quote-meta end token, or up to the end of the pattern
1.503 + /// INTERNAL ONLY
1.504 + string_type parse_quote_meta(string_iterator &begin, string_iterator end)
1.505 + {
1.506 + using namespace regex_constants;
1.507 + string_iterator old_begin = begin, old_end; // old_begin: start of the quoted text; old_end: value of begin before the current token is read
1.508 + while(end != (old_end = begin))
1.509 + {
1.510 + switch(this->traits_.get_token(begin, end))
1.511 + {
1.512 + case token_quote_meta_end: return string_type(old_begin, old_end); // text up to where begin stood before this token was read
1.513 + case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence"); // fall-through
1.514 + case token_literal: ++begin; // step over one character; fall-through
1.515 + default:;
1.516 + }
1.517 + }
1.518 + return string_type(old_begin, begin); // no quote-meta end token was found: everything up to begin is returned
1.519 + }
1.520 +
1.521 + ///////////////////////////////////////////////////////////////////////////////
1.522 + // parse_escape (tries a numeric back-reference first, otherwise defers to detail::parse_escape)
1.523 + /// INTERNAL ONLY
1.524 + escape_value parse_escape(string_iterator &begin, string_iterator end)
1.525 + {
1.526 + detail::ensure(begin != end, regex_constants::error_escape, "incomplete escape sequence");
1.527 +
1.528 + // first, check to see if this can be a backreference
1.529 + if(0 < this->rxtraits().value(*begin, 10)) // only when the radix-10 value of the character is greater than zero
1.530 + {
1.531 + // Parse at most 3 decimal digits.
1.532 + string_iterator tmp = begin; // scan on a copy; begin is only moved once the number is accepted
1.533 + int mark_nbr = detail::toi(tmp, end, this->rxtraits(), 10, 999);
1.534 +
1.535 + // If the resulting number could conceivably be a backref, then it is.
1.536 + if(10 > mark_nbr || mark_nbr <= static_cast<int>(this->mark_count_)) // below 10 is always accepted; larger only if no greater than the current mark count
1.537 + {
1.538 + begin = tmp;
1.539 + escape_value esc = {0, mark_nbr, 0, detail::escape_mark};
1.540 + return esc;
1.541 + }
1.542 + }
1.543 +
1.544 + // Not a backreference, defer to the parse_escape helper
1.545 + return detail::parse_escape(begin, end, this->traits_);
1.546 + }
1.547 +
1.548 + std::size_t mark_count_; // incremented by parse_group for each group that gets a mark number; copied into the regex_impl by compile()
1.549 + std::size_t hidden_mark_count_; // handed to quantify() in parse_quant; copied into the regex_impl by compile()
1.550 + CompilerTraits traits_; // supplies tokenizing, flags and locale; wraps the regex traits returned by rxtraits()
1.551 + typename RegexTraits::char_class_type upper_; // cached result of lookup_classname(..., "upper")
1.552 +};
1.553 +
1.554 +}} // namespace boost::xpressive
1.555 +
1.556 +#endif