First public contribution.
1 ///////////////////////////////////////////////////////////////////////////////
2 /// \file regex_compiler.hpp
3 /// Contains the definition of regex_compiler, a factory for building regex objects
6 // Copyright 2004 Eric Niebler. Distributed under the Boost
7 // Software License, Version 1.0. (See accompanying file
8 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
10 #ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
11 #define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
13 // MS compatible compilers support #pragma once
14 #if defined(_MSC_VER) && (_MSC_VER >= 1020)
18 #include <boost/xpressive/basic_regex.hpp>
19 #include <boost/xpressive/detail/dynamic/parser.hpp>
20 #include <boost/xpressive/detail/dynamic/parse_charset.hpp>
21 #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
22 #include <boost/xpressive/detail/dynamic/parser_traits.hpp>
23 #include <boost/xpressive/detail/core/linker.hpp>
24 #include <boost/xpressive/detail/core/optimize.hpp>
26 namespace boost { namespace xpressive
29 ///////////////////////////////////////////////////////////////////////////////
32 /// \brief Class template regex_compiler is a factory for building basic_regex objects from a string.
34 /// Class template regex_compiler is used to construct a basic_regex object from a string. The string
35 /// should contain a valid regular expression. You can imbue a regex_compiler object with a locale,
36 /// after which all basic_regex objects created with that regex_compiler object will use that locale.
37 /// After creating a regex_compiler object, and optionally imbuing it with a locale, you can call the
38 /// compile() method to construct a basic_regex object, passing it the string representing the regular
39 /// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex
40 /// objects compiled from the same string will have different regex_id's.
41 template<typename BidiIter, typename RegexTraits, typename CompilerTraits>
44 typedef BidiIter iterator_type;
45 typedef typename iterator_value<BidiIter>::type char_type;
46 typedef std::basic_string<char_type> string_type;
47 typedef regex_constants::syntax_option_type flag_type;
48 typedef RegexTraits traits_type;
49 typedef typename traits_type::char_class_type char_class_type;
50 typedef typename traits_type::locale_type locale_type;
52 explicit regex_compiler(RegexTraits const &traits = RegexTraits())
54 , hidden_mark_count_(0)
58 this->upper_ = lookup_classname(this->rxtraits(), "upper");
59 BOOST_ASSERT(0 != this->upper_);
62 ///////////////////////////////////////////////////////////////////////////
64 /// Specify the locale to be used by a regex_compiler.
66 /// \param loc The locale that this regex_compiler should use.
67 /// \return The previous locale.
68 locale_type imbue(locale_type loc)
70 locale_type oldloc = this->traits_.imbue(loc);
71 this->upper_ = lookup_classname(this->rxtraits(), "upper");
72 BOOST_ASSERT(0 != this->upper_);
76 ///////////////////////////////////////////////////////////////////////////
78 /// Get the locale used by a regex_compiler.
80 /// \return The locale that this regex_compiler uses.
81 locale_type getloc() const
83 return this->traits_.getloc();
86 ///////////////////////////////////////////////////////////////////////////
88 /// Builds a basic_regex object from a std::string.
90 /// \param pat A std::string containing the regular expression pattern.
91 /// \param flags Optional bitmask that determines how the pat string is interpreted. (See syntax_option_type.)
92 /// \return A basic_regex object corresponding to the regular expression represented by the string.
93 /// \pre The std::string pat contains a valid string-based representation of a regular expression.
94 /// \throw regex_error when the string has invalid regular expression syntax.
95 basic_regex<BidiIter> compile(string_type pat, flag_type flags = regex_constants::ECMAScript)
98 this->traits_.flags(flags);
100 string_iterator begin = pat.begin(), end = pat.end();
102 // at the top level, a regex is a sequence of alternates
103 alternates_list alternates;
104 this->parse_alternates(begin, end, alternates);
105 detail::ensure(begin == end, regex_constants::error_paren, "mismatched parenthesis");
107 // convert the alternates list to the appropriate matcher and terminate the sequence
108 detail::sequence<BidiIter> seq = detail::alternates_to_matchable(alternates, alternates_factory());
109 seq += detail::make_dynamic_xpression<BidiIter>(detail::end_matcher());
111 // fill in the back-pointers by visiting the regex parse tree
112 detail::xpression_linker<char_type> linker(this->rxtraits());
113 seq.first->link(linker);
115 // bundle the regex information into a regex_impl object
116 detail::regex_impl<BidiIter> impl;
117 impl.xpr_ = seq.first;
118 impl.traits_.reset(new RegexTraits(this->rxtraits()));
119 impl.mark_count_ = this->mark_count_;
120 impl.hidden_mark_count_ = this->hidden_mark_count_;
122 // optimization: get the peek chars OR the boyer-moore search string
123 detail::optimize_regex(impl, this->rxtraits(), detail::is_random<BidiIter>());
125 return detail::core_access<BidiIter>::make_regex(impl);
130 typedef typename string_type::const_iterator string_iterator;
131 typedef std::list<detail::sequence<BidiIter> > alternates_list;
132 typedef detail::escape_value<char_type, char_class_type> escape_value;
133 typedef detail::alternates_factory_impl<BidiIter, traits_type> alternates_factory;
135 ///////////////////////////////////////////////////////////////////////////
140 this->mark_count_ = 0;
141 this->hidden_mark_count_ = 0;
142 this->traits_.flags(regex_constants::ECMAScript);
145 ///////////////////////////////////////////////////////////////////////////
148 traits_type &rxtraits()
150 return this->traits_.traits();
153 ///////////////////////////////////////////////////////////////////////////
156 traits_type const &rxtraits() const
158 return this->traits_.traits();
161 ///////////////////////////////////////////////////////////////////////////
164 void parse_alternates(string_iterator &begin, string_iterator end, alternates_list &alternates)
166 using namespace regex_constants;
167 string_iterator old_begin;
171 alternates.push_back(this->parse_sequence(begin, end));
174 while(begin != end && token_alternate == this->traits_.get_token(begin, end));
179 ///////////////////////////////////////////////////////////////////////////
182 detail::sequence<BidiIter> parse_group(string_iterator &begin, string_iterator end)
184 using namespace regex_constants;
187 bool lookahead = false;
188 bool lookbehind = false;
189 bool negative = false;
190 std::size_t old_mark_count = this->mark_count_;
192 detail::sequence<BidiIter> seq, seq_end;
193 string_iterator tmp = string_iterator();
195 syntax_option_type old_flags = this->traits_.flags();
197 switch(this->traits_.get_group_type(begin, end))
200 // Don't process empty groups like (?:) or (?i)
201 // BUGBUG this doesn't handle the degenerate (?:)+ correctly
202 if(token_group_end == this->traits_.get_token(tmp = begin, end))
204 return this->parse_atom(begin = tmp, end);
208 case token_negative_lookahead:
209 negative = true; // fall-through
210 case token_positive_lookahead:
212 seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
215 case token_negative_lookbehind:
216 negative = true; // fall-through
217 case token_positive_lookbehind:
219 seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
222 case token_independent_sub_expression:
224 seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
228 while(detail::ensure(begin != end, error_paren, "mismatched parenthesis"))
230 switch(this->traits_.get_token(begin, end))
232 case token_group_end: return this->parse_atom(begin, end);
233 case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence");
234 case token_literal: ++begin;
241 mark_nbr = static_cast<int>(++this->mark_count_);
242 seq = detail::make_dynamic_xpression<BidiIter>(detail::mark_begin_matcher(mark_nbr));
243 seq_end = detail::make_dynamic_xpression<BidiIter>(detail::mark_end_matcher(mark_nbr));
248 alternates_list alternates;
249 this->parse_alternates(begin, end, alternates);
252 begin != end && token_group_end == this->traits_.get_token(begin, end)
254 , "mismatched parenthesis"
257 seq += detail::alternates_to_matchable(alternates, alternates_factory());
260 typedef shared_ptr<detail::matchable<BidiIter> const> xpr_type;
261 bool do_save = (this->mark_count_ != old_mark_count);
265 detail::lookahead_matcher<xpr_type> lookahead(seq.first, negative, do_save);
266 seq = detail::make_dynamic_xpression<BidiIter>(lookahead);
270 detail::lookbehind_matcher<xpr_type> lookbehind(seq.first, negative, do_save);
271 seq = detail::make_dynamic_xpression<BidiIter>(lookbehind);
273 else if(keeper) // independent sub-expression
275 detail::keeper_matcher<xpr_type> keeper(seq.first, do_save);
276 seq = detail::make_dynamic_xpression<BidiIter>(keeper);
279 // restore the modifiers
280 this->traits_.flags(old_flags);
284 ///////////////////////////////////////////////////////////////////////////
287 detail::sequence<BidiIter> parse_charset(string_iterator &begin, string_iterator end)
289 detail::compound_charset<traits_type> chset;
291 // call out to a helper to actually parse the character set
292 detail::parse_charset(begin, end, chset, this->traits_);
294 return detail::make_charset_xpression<BidiIter>
298 , this->traits_.flags()
302 ///////////////////////////////////////////////////////////////////////////
305 detail::sequence<BidiIter> parse_atom(string_iterator &begin, string_iterator end)
307 using namespace regex_constants;
308 escape_value esc = { 0, 0, 0, detail::escape_char };
309 string_iterator old_begin = begin;
311 switch(this->traits_.get_token(begin, end))
314 return detail::make_literal_xpression<BidiIter>
316 this->parse_literal(begin, end), this->traits_.flags(), this->rxtraits()
320 return detail::make_any_xpression<BidiIter>(this->traits_.flags(), this->rxtraits());
322 case token_assert_begin_sequence:
323 return detail::make_dynamic_xpression<BidiIter>(detail::assert_bos_matcher());
325 case token_assert_end_sequence:
326 return detail::make_dynamic_xpression<BidiIter>(detail::assert_eos_matcher());
328 case token_assert_begin_line:
329 return detail::make_assert_begin_line<BidiIter>(this->traits_.flags(), this->rxtraits());
331 case token_assert_end_line:
332 return detail::make_assert_end_line<BidiIter>(this->traits_.flags(), this->rxtraits());
334 case token_assert_word_boundary:
335 return detail::make_assert_word<BidiIter>(detail::word_boundary<true>(), this->rxtraits());
337 case token_assert_not_word_boundary:
338 return detail::make_assert_word<BidiIter>(detail::word_boundary<false>(), this->rxtraits());
340 case token_assert_word_begin:
341 return detail::make_assert_word<BidiIter>(detail::word_begin(), this->rxtraits());
343 case token_assert_word_end:
344 return detail::make_assert_word<BidiIter>(detail::word_end(), this->rxtraits());
347 esc = this->parse_escape(begin, end);
350 case detail::escape_mark:
351 return detail::make_backref_xpression<BidiIter>
353 esc.mark_nbr_, this->traits_.flags(), this->rxtraits()
355 case detail::escape_char:
356 return detail::make_char_xpression<BidiIter>
358 esc.ch_, this->traits_.flags(), this->rxtraits()
360 case detail::escape_class:
361 return detail::make_posix_charset_xpression<BidiIter>
364 , this->rxtraits().isctype(*begin++, this->upper_)
365 , this->traits_.flags()
370 case token_group_begin:
371 return this->parse_group(begin, end);
373 case token_charset_begin:
374 return this->parse_charset(begin, end);
376 case token_invalid_quantifier:
377 throw regex_error(error_badrepeat, "quantifier not expected");
379 case token_quote_meta_begin:
380 return detail::make_literal_xpression<BidiIter>
382 this->parse_quote_meta(begin, end), this->traits_.flags(), this->rxtraits()
385 case token_quote_meta_end:
389 , "found quote-meta end without corresponding quote-meta begin"
392 case token_end_of_pattern:
400 return detail::sequence<BidiIter>();
403 ///////////////////////////////////////////////////////////////////////////
406 detail::sequence<BidiIter> parse_quant(string_iterator &begin, string_iterator end)
408 BOOST_ASSERT(begin != end);
409 detail::quant_spec spec = { 0, 0, false };
410 detail::sequence<BidiIter> seq = this->parse_atom(begin, end);
412 // BUGBUG this doesn't handle the degenerate (?:)+ correctly
413 if(!seq.is_empty() && begin != end && seq.first->is_quantifiable())
415 if(this->traits_.get_quant_spec(begin, end, spec))
417 BOOST_ASSERT(spec.min_ <= spec.max_);
419 if(0 == spec.max_) // quant {0,0} is degenerate -- matches nothing.
421 seq = this->parse_quant(begin, end);
425 seq = seq.first->quantify(spec, this->hidden_mark_count_, seq, alternates_factory());
433 ///////////////////////////////////////////////////////////////////////////
436 detail::sequence<BidiIter> parse_sequence(string_iterator &begin, string_iterator end)
438 detail::sequence<BidiIter> seq;
442 detail::sequence<BidiIter> seq_quant = this->parse_quant(begin, end);
444 // did we find a quantified atom?
445 if(seq_quant.is_empty())
448 // chain it to the end of the xpression sequence
455 ///////////////////////////////////////////////////////////////////////////
457 // scan ahead looking for char literals to be globbed together into a string literal
459 string_type parse_literal(string_iterator &begin, string_iterator end)
461 using namespace regex_constants;
462 BOOST_ASSERT(begin != end);
463 BOOST_ASSERT(token_literal == this->traits_.get_token(begin, end));
464 escape_value esc = { 0, 0, 0, detail::escape_char };
465 string_type literal(1, *begin);
467 for(string_iterator prev = begin, tmp = ++begin; begin != end; prev = begin, begin = tmp)
469 detail::quant_spec spec;
470 if(this->traits_.get_quant_spec(tmp, end, spec))
472 if(literal.size() != 1)
475 literal.erase(literal.size() - 1);
479 else switch(this->traits_.get_token(tmp, end))
482 esc = this->parse_escape(tmp, end);
483 if(detail::escape_char != esc.type_) return literal;
497 ///////////////////////////////////////////////////////////////////////////
499 // scan ahead to the quote-meta end token (or the end of the pattern) and return the text before it as a string literal
501 string_type parse_quote_meta(string_iterator &begin, string_iterator end)
503 using namespace regex_constants;
504 string_iterator old_begin = begin, old_end;
505 while(end != (old_end = begin))
507 switch(this->traits_.get_token(begin, end))
509 case token_quote_meta_end: return string_type(old_begin, old_end);
510 case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence");
511 case token_literal: ++begin;
515 return string_type(old_begin, begin);
518 ///////////////////////////////////////////////////////////////////////////////
521 escape_value parse_escape(string_iterator &begin, string_iterator end)
523 detail::ensure(begin != end, regex_constants::error_escape, "incomplete escape sequence");
525 // first, check to see if this can be a backreference
526 if(0 < this->rxtraits().value(*begin, 10))
528 // Parse at most 3 decimal digits.
529 string_iterator tmp = begin;
530 int mark_nbr = detail::toi(tmp, end, this->rxtraits(), 10, 999);
532 // If the resulting number could conceivably be a backref, then it is.
533 if(10 > mark_nbr || mark_nbr <= static_cast<int>(this->mark_count_))
536 escape_value esc = {0, mark_nbr, 0, detail::escape_mark};
541 // Not a backreference, defer to the parse_escape helper
542 return detail::parse_escape(begin, end, this->traits_);
545 std::size_t mark_count_;
546 std::size_t hidden_mark_count_;
547 CompilerTraits traits_;
548 typename RegexTraits::char_class_type upper_;
551 }} // namespace boost::xpressive