sl@0: /////////////////////////////////////////////////////////////////////////////// sl@0: /// \file regex_compiler.hpp sl@0: /// Contains the definition of regex_compiler, a factory for building regex objects sl@0: /// from strings. sl@0: // sl@0: // Copyright 2004 Eric Niebler. Distributed under the Boost sl@0: // Software License, Version 1.0. (See accompanying file sl@0: // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) sl@0: sl@0: #ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005 sl@0: #define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005 sl@0: sl@0: // MS compatible compilers support #pragma once sl@0: #if defined(_MSC_VER) && (_MSC_VER >= 1020) sl@0: # pragma once sl@0: #endif sl@0: sl@0: // NOTE(review): the #include directives below show no header names in this copy, and the sl@0: // template argument lists (e.g. after `template` and `static_cast`) are likewise absent; sl@0: // restore them from a pristine copy of this header before building. sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: sl@0: namespace boost { namespace xpressive sl@0: { sl@0: sl@0: /////////////////////////////////////////////////////////////////////////////// sl@0: // regex_compiler sl@0: // sl@0: /// \brief Class template regex_compiler is a factory for building basic_regex objects from a string. sl@0: /// sl@0: /// Class template regex_compiler is used to construct a basic_regex object from a string. The string sl@0: /// should contain a valid regular expression. You can imbue a regex_compiler object with a locale, sl@0: /// after which all basic_regex objects created with that regex_compiler object will use that locale. sl@0: /// After creating a regex_compiler object, and optionally imbuing it with a locale, you can call the sl@0: /// compile() method to construct a basic_regex object, passing it the string representing the regular sl@0: /// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex sl@0: /// objects compiled from the same string will have different regex_id's.
sl@0: template sl@0: struct regex_compiler sl@0: { sl@0: typedef BidiIter iterator_type; sl@0: typedef typename iterator_value::type char_type; sl@0: typedef std::basic_string string_type; sl@0: typedef regex_constants::syntax_option_type flag_type; sl@0: typedef RegexTraits traits_type; sl@0: typedef typename traits_type::char_class_type char_class_type; sl@0: typedef typename traits_type::locale_type locale_type; sl@0: sl@0: explicit regex_compiler(RegexTraits const &traits = RegexTraits()) sl@0: : mark_count_(0) sl@0: , hidden_mark_count_(0) sl@0: , traits_(traits) sl@0: , upper_(0) sl@0: { sl@0: this->upper_ = lookup_classname(this->rxtraits(), "upper"); sl@0: BOOST_ASSERT(0 != this->upper_); sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // imbue sl@0: /// Specify the locale to be used by a regex_compiler. sl@0: /// sl@0: /// \param loc The locale that this regex_compiler should use. sl@0: /// \return The previous locale. sl@0: locale_type imbue(locale_type loc) sl@0: { sl@0: locale_type oldloc = this->traits_.imbue(loc); sl@0: // re-run the "upper" class lookup against the traits now that they have been imbued sl@0: this->upper_ = lookup_classname(this->rxtraits(), "upper"); sl@0: BOOST_ASSERT(0 != this->upper_); sl@0: return oldloc; sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // getloc sl@0: /// Get the locale used by a regex_compiler. sl@0: /// sl@0: /// \return The locale that this regex_compiler uses. sl@0: locale_type getloc() const sl@0: { sl@0: return this->traits_.getloc(); sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // compile sl@0: /// Builds a basic_regex object from a std::string. sl@0: /// sl@0: /// \param pat A std::string containing the regular expression pattern. sl@0: /// \param flags Optional bitmask that determines how the pat string is interpreted. (See syntax_option_type.) sl@0: /// Defaults to regex_constants::ECMAScript.
sl@0: /// \return A basic_regex object corresponding to the regular expression represented by the string. sl@0: /// \pre The std::string pat contains a valid string-based representation of a regular expression. sl@0: /// \throw regex_error when the string has invalid regular expression syntax. sl@0: basic_regex compile(string_type pat, flag_type flags = regex_constants::ECMAScript) sl@0: { sl@0: // zero the mark counters left over from any previous compile() before applying the caller's flags sl@0: this->reset(); sl@0: this->traits_.flags(flags); sl@0: sl@0: string_iterator begin = pat.begin(), end = pat.end(); sl@0: sl@0: // at the top level, a regex is a sequence of alternates sl@0: alternates_list alternates; sl@0: this->parse_alternates(begin, end, alternates); sl@0: // any input left unconsumed at this point is reported as a mismatched parenthesis sl@0: detail::ensure(begin == end, regex_constants::error_paren, "mismatched parenthesis"); sl@0: sl@0: // convert the alternates list to the appropriate matcher and terminate the sequence sl@0: detail::sequence seq = detail::alternates_to_matchable(alternates, alternates_factory()); sl@0: seq += detail::make_dynamic_xpression(detail::end_matcher()); sl@0: sl@0: // fill in the back-pointers by visiting the regex parse tree sl@0: detail::xpression_linker linker(this->rxtraits()); sl@0: seq.first->link(linker); sl@0: sl@0: // bundle the regex information into a regex_impl object sl@0: detail::regex_impl impl; sl@0: impl.xpr_ = seq.first; sl@0: impl.traits_.reset(new RegexTraits(this->rxtraits())); sl@0: impl.mark_count_ = this->mark_count_; sl@0: impl.hidden_mark_count_ = this->hidden_mark_count_; sl@0: sl@0: // optimization: get the peek chars OR the boyer-moore search string sl@0: detail::optimize_regex(impl, this->rxtraits(), detail::is_random()); sl@0: sl@0: return detail::core_access::make_regex(impl); sl@0: } sl@0: sl@0: private: sl@0: sl@0: typedef typename string_type::const_iterator string_iterator; sl@0: typedef std::list > alternates_list; sl@0: typedef detail::escape_value escape_value; sl@0: typedef detail::alternates_factory_impl alternates_factory; sl@0: sl@0:
/////////////////////////////////////////////////////////////////////////// sl@0: // reset sl@0: // zeroes both mark counters and puts the compiler traits back to the ECMAScript flags sl@0: /// INTERNAL ONLY sl@0: void reset() sl@0: { sl@0: this->mark_count_ = 0; sl@0: this->hidden_mark_count_ = 0; sl@0: this->traits_.flags(regex_constants::ECMAScript); sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // rxtraits sl@0: // the regex traits object held inside the compiler traits sl@0: /// INTERNAL ONLY sl@0: traits_type &rxtraits() sl@0: { sl@0: return this->traits_.traits(); sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // rxtraits sl@0: // const overload of the accessor above sl@0: /// INTERNAL ONLY sl@0: traits_type const &rxtraits() const sl@0: { sl@0: return this->traits_.traits(); sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // parse_alternates sl@0: // parses sequences separated by token_alternate, appending each one to the list; on return sl@0: // begin sits just past the last sequence parsed sl@0: /// INTERNAL ONLY sl@0: void parse_alternates(string_iterator &begin, string_iterator end, alternates_list &alternates) sl@0: { sl@0: using namespace regex_constants; sl@0: string_iterator old_begin; sl@0: sl@0: do sl@0: { sl@0: alternates.push_back(this->parse_sequence(begin, end)); sl@0: old_begin = begin; sl@0: } sl@0: while(begin != end && token_alternate == this->traits_.get_token(begin, end)); sl@0: sl@0: // rewind to the position recorded before the loop condition's last get_token call sl@0: begin = old_begin; sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // parse_group sl@0: /// INTERNAL ONLY sl@0: detail::sequence parse_group(string_iterator &begin, string_iterator end) sl@0: { sl@0: using namespace regex_constants; sl@0: int mark_nbr = 0; sl@0: bool keeper = false; sl@0: bool lookahead = false; sl@0: bool lookbehind = false; sl@0: bool negative = false; sl@0: std::size_t old_mark_count = this->mark_count_; sl@0: sl@0: detail::sequence seq, seq_end; sl@0: string_iterator tmp = string_iterator(); sl@0: sl@0: // remember the flags in effect on entry; they are put back just before returning sl@0: syntax_option_type old_flags = this->traits_.flags(); sl@0: sl@0: switch(this->traits_.get_group_type(begin, end)) sl@0: { sl@0: case token_no_mark: sl@0: // Don't
process empty groups like (?:) or (?i) sl@0: // BUGBUG this doesn't handle the degenerate (?:)+ correctly sl@0: if(token_group_end == this->traits_.get_token(tmp = begin, end)) sl@0: { sl@0: return this->parse_atom(begin = tmp, end); sl@0: } sl@0: break; sl@0: sl@0: case token_negative_lookahead: sl@0: negative = true; // fall-through sl@0: case token_positive_lookahead: sl@0: lookahead = true; sl@0: seq_end = detail::make_dynamic_xpression(detail::true_matcher()); sl@0: break; sl@0: sl@0: case token_negative_lookbehind: sl@0: negative = true; // fall-through sl@0: case token_positive_lookbehind: sl@0: lookbehind = true; sl@0: seq_end = detail::make_dynamic_xpression(detail::true_matcher()); sl@0: break; sl@0: sl@0: case token_independent_sub_expression: sl@0: keeper = true; sl@0: seq_end = detail::make_dynamic_xpression(detail::true_matcher()); sl@0: break; sl@0: sl@0: case token_comment: sl@0: // skip over the body of the comment group, then parse whatever atom follows it sl@0: while(detail::ensure(begin != end, error_paren, "mismatched parenthesis")) sl@0: { sl@0: switch(this->traits_.get_token(begin, end)) sl@0: { sl@0: case token_group_end: return this->parse_atom(begin, end); sl@0: case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence"); // fall-through sl@0: case token_literal: ++begin; sl@0: default:; sl@0: } sl@0: } sl@0: break; sl@0: sl@0: default: sl@0: // any other group type takes the next mark number and is bracketed by mark begin/end matchers sl@0: mark_nbr = static_cast(++this->mark_count_); sl@0: seq = detail::make_dynamic_xpression(detail::mark_begin_matcher(mark_nbr)); sl@0: seq_end = detail::make_dynamic_xpression(detail::mark_end_matcher(mark_nbr)); sl@0: break; sl@0: } sl@0: sl@0: // alternates sl@0: alternates_list alternates; sl@0: this->parse_alternates(begin, end, alternates); sl@0: detail::ensure sl@0: ( sl@0: begin != end && token_group_end == this->traits_.get_token(begin, end) sl@0: , error_paren sl@0: , "mismatched parenthesis" sl@0: ); sl@0: sl@0: seq += detail::alternates_to_matchable(alternates, alternates_factory()); sl@0: seq += seq_end; sl@0: sl@0: typedef shared_ptr const> xpr_type; sl@0:
bool do_save = (this->mark_count_ != old_mark_count); sl@0: sl@0: if(lookahead) sl@0: { sl@0: detail::lookahead_matcher lookahead(seq.first, negative, do_save); sl@0: seq = detail::make_dynamic_xpression(lookahead); sl@0: } sl@0: else if(lookbehind) sl@0: { sl@0: detail::lookbehind_matcher lookbehind(seq.first, negative, do_save); sl@0: seq = detail::make_dynamic_xpression(lookbehind); sl@0: } sl@0: else if(keeper) // independent sub-expression sl@0: { sl@0: detail::keeper_matcher keeper(seq.first, do_save); sl@0: seq = detail::make_dynamic_xpression(keeper); sl@0: } sl@0: sl@0: // restore the modifiers sl@0: this->traits_.flags(old_flags); sl@0: return seq; sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // parse_charset sl@0: // builds a charset expression from the set filled in by the detail::parse_charset helper sl@0: /// INTERNAL ONLY sl@0: detail::sequence parse_charset(string_iterator &begin, string_iterator end) sl@0: { sl@0: detail::compound_charset chset; sl@0: sl@0: // call out to a helper to actually parse the character set sl@0: detail::parse_charset(begin, end, chset, this->traits_); sl@0: sl@0: return detail::make_charset_xpression sl@0: ( sl@0: chset sl@0: , this->rxtraits() sl@0: , this->traits_.flags() sl@0: ); sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // parse_atom sl@0: // parses a single atom; yields an empty sequence at end of pattern or, with begin put back sl@0: // where it was, when the next token is not handled by any case below sl@0: /// INTERNAL ONLY sl@0: detail::sequence parse_atom(string_iterator &begin, string_iterator end) sl@0: { sl@0: using namespace regex_constants; sl@0: escape_value esc = { 0, 0, 0, detail::escape_char }; sl@0: string_iterator old_begin = begin; sl@0: sl@0: switch(this->traits_.get_token(begin, end)) sl@0: { sl@0: case token_literal: sl@0: return detail::make_literal_xpression sl@0: ( sl@0: this->parse_literal(begin, end), this->traits_.flags(), this->rxtraits() sl@0: ); sl@0: sl@0: case token_any: sl@0: return detail::make_any_xpression(this->traits_.flags(), this->rxtraits()); sl@0: sl@0: case token_assert_begin_sequence: sl@0: return
detail::make_dynamic_xpression(detail::assert_bos_matcher()); sl@0: sl@0: case token_assert_end_sequence: sl@0: return detail::make_dynamic_xpression(detail::assert_eos_matcher()); sl@0: sl@0: case token_assert_begin_line: sl@0: return detail::make_assert_begin_line(this->traits_.flags(), this->rxtraits()); sl@0: sl@0: case token_assert_end_line: sl@0: return detail::make_assert_end_line(this->traits_.flags(), this->rxtraits()); sl@0: sl@0: case token_assert_word_boundary: sl@0: return detail::make_assert_word(detail::word_boundary(), this->rxtraits()); sl@0: sl@0: case token_assert_not_word_boundary: sl@0: return detail::make_assert_word(detail::word_boundary(), this->rxtraits()); sl@0: sl@0: case token_assert_word_begin: sl@0: return detail::make_assert_word(detail::word_begin(), this->rxtraits()); sl@0: sl@0: case token_assert_word_end: sl@0: return detail::make_assert_word(detail::word_end(), this->rxtraits()); sl@0: sl@0: case token_escape: sl@0: esc = this->parse_escape(begin, end); sl@0: switch(esc.type_) sl@0: { sl@0: case detail::escape_mark: sl@0: return detail::make_backref_xpression sl@0: ( sl@0: esc.mark_nbr_, this->traits_.flags(), this->rxtraits() sl@0: ); sl@0: case detail::escape_char: sl@0: return detail::make_char_xpression sl@0: ( sl@0: esc.ch_, this->traits_.flags(), this->rxtraits() sl@0: ); sl@0: case detail::escape_class: sl@0: return detail::make_posix_charset_xpression sl@0: ( sl@0: esc.class_ sl@0: , this->rxtraits().isctype(*begin++, this->upper_) sl@0: , this->traits_.flags() sl@0: , this->rxtraits() sl@0: ); sl@0: } sl@0: sl@0: case token_group_begin: sl@0: return this->parse_group(begin, end); sl@0: sl@0: case token_charset_begin: sl@0: return this->parse_charset(begin, end); sl@0: sl@0: case token_invalid_quantifier: sl@0: throw regex_error(error_badrepeat, "quantifier not expected"); sl@0: sl@0: case token_quote_meta_begin: sl@0: return detail::make_literal_xpression sl@0: ( sl@0: this->parse_quote_meta(begin, end),
this->traits_.flags(), this->rxtraits() sl@0: ); sl@0: sl@0: case token_quote_meta_end: sl@0: throw regex_error sl@0: ( sl@0: error_escape sl@0: , "found quote-meta end without corresponding quote-meta begin" sl@0: ); sl@0: sl@0: case token_end_of_pattern: sl@0: break; sl@0: sl@0: default: sl@0: begin = old_begin; sl@0: break; sl@0: } sl@0: sl@0: return detail::sequence(); sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // parse_quant sl@0: // parses one atom and, if a quantifier spec follows a quantifiable atom, applies it sl@0: /// INTERNAL ONLY sl@0: detail::sequence parse_quant(string_iterator &begin, string_iterator end) sl@0: { sl@0: BOOST_ASSERT(begin != end); sl@0: detail::quant_spec spec = { 0, 0, false }; sl@0: detail::sequence seq = this->parse_atom(begin, end); sl@0: sl@0: // BUGBUG this doesn't handle the degenerate (?:)+ correctly sl@0: if(!seq.is_empty() && begin != end && seq.first->is_quantifiable()) sl@0: { sl@0: if(this->traits_.get_quant_spec(begin, end, spec)) sl@0: { sl@0: BOOST_ASSERT(spec.min_ <= spec.max_); sl@0: sl@0: if(0 == spec.max_) // quant {0,0} is degenerate -- matches nothing. sl@0: { sl@0: // drop the atom just parsed and return the next quantified atom in its place sl@0: // NOTE(review): if get_quant_spec left begin == end (quantifier was the last thing in sl@0: // the pattern), the BOOST_ASSERT at the top of parse_quant fires on this recursive sl@0: // call -- confirm whether a begin != end guard is wanted here sl@0: seq = this->parse_quant(begin, end); sl@0: } sl@0: else sl@0: { sl@0: seq = seq.first->quantify(spec, this->hidden_mark_count_, seq, alternates_factory()); sl@0: } sl@0: } sl@0: } sl@0: sl@0: return seq; sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // parse_sequence sl@0: // chains quantified atoms together until parse_quant yields an empty sequence or input runs out sl@0: /// INTERNAL ONLY sl@0: detail::sequence parse_sequence(string_iterator &begin, string_iterator end) sl@0: { sl@0: detail::sequence seq; sl@0: sl@0: while(begin != end) sl@0: { sl@0: detail::sequence seq_quant = this->parse_quant(begin, end); sl@0: sl@0: // did we find a quantified atom?
sl@0: if(seq_quant.is_empty()) sl@0: break; sl@0: sl@0: // chain it to the end of the xpression sequence sl@0: seq += seq_quant; sl@0: } sl@0: sl@0: return seq; sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // parse_literal sl@0: // scan ahead looking for char literals to be globbed together into a string literal sl@0: /// INTERNAL ONLY sl@0: string_type parse_literal(string_iterator &begin, string_iterator end) sl@0: { sl@0: using namespace regex_constants; sl@0: BOOST_ASSERT(begin != end); sl@0: BOOST_ASSERT(token_literal == this->traits_.get_token(begin, end)); sl@0: escape_value esc = { 0, 0, 0, detail::escape_char }; sl@0: string_type literal(1, *begin); sl@0: sl@0: for(string_iterator prev = begin, tmp = ++begin; begin != end; prev = begin, begin = tmp) sl@0: { sl@0: detail::quant_spec spec; sl@0: if(this->traits_.get_quant_spec(tmp, end, spec)) sl@0: { sl@0: // a quantifier follows: unless the literal is a single char, give the last char back sl@0: // (rewind begin to it and drop it from the string) so that parsing resumes at that char sl@0: if(literal.size() != 1) sl@0: { sl@0: begin = prev; sl@0: literal.erase(literal.size() - 1); sl@0: } sl@0: return literal; sl@0: } sl@0: else switch(this->traits_.get_token(tmp, end)) sl@0: { sl@0: case token_escape: sl@0: esc = this->parse_escape(tmp, end); sl@0: // only plain character escapes can be folded into the literal sl@0: if(detail::escape_char != esc.type_) return literal; sl@0: literal += esc.ch_; sl@0: break; sl@0: case token_literal: sl@0: literal += *tmp++; sl@0: break; sl@0: default: sl@0: return literal; sl@0: } sl@0: } sl@0: sl@0: return literal; sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////// sl@0: // parse_quote_meta sl@0: // returns the text from begin up to (not including) the next token_quote_meta_end, or up to sl@0: // end if no such token is found sl@0: /// INTERNAL ONLY sl@0: string_type parse_quote_meta(string_iterator &begin, string_iterator end) sl@0: { sl@0: using namespace regex_constants; sl@0: string_iterator old_begin = begin, old_end; sl@0: while(end != (old_end = begin)) sl@0: { sl@0: switch(this->traits_.get_token(begin, end)) sl@0: { sl@0: case token_quote_meta_end: return
string_type(old_begin, old_end); sl@0: case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence"); sl@0: case token_literal: ++begin; sl@0: default:; sl@0: } sl@0: } sl@0: return string_type(old_begin, begin); sl@0: } sl@0: sl@0: /////////////////////////////////////////////////////////////////////////////// sl@0: // parse_escape sl@0: // recognizes numeric backreferences itself and hands every other escape to detail::parse_escape sl@0: /// INTERNAL ONLY sl@0: escape_value parse_escape(string_iterator &begin, string_iterator end) sl@0: { sl@0: detail::ensure(begin != end, regex_constants::error_escape, "incomplete escape sequence"); sl@0: sl@0: // first, check to see if this can be a backreference sl@0: if(0 < this->rxtraits().value(*begin, 10)) sl@0: { sl@0: // Parse at most 3 decimal digits. sl@0: string_iterator tmp = begin; sl@0: int mark_nbr = detail::toi(tmp, end, this->rxtraits(), 10, 999); sl@0: sl@0: // If the resulting number could conceivably be a backref, then it is. sl@0: // (numbers below 10 always qualify; larger ones only if they do not exceed the current mark_count_) sl@0: if(10 > mark_nbr || mark_nbr <= static_cast(this->mark_count_)) sl@0: { sl@0: begin = tmp; sl@0: escape_value esc = {0, mark_nbr, 0, detail::escape_mark}; sl@0: return esc; sl@0: } sl@0: } sl@0: sl@0: // Not a backreference, defer to the parse_escape helper sl@0: return detail::parse_escape(begin, end, this->traits_); sl@0: } sl@0: sl@0: std::size_t mark_count_; sl@0: std::size_t hidden_mark_count_; sl@0: CompilerTraits traits_; sl@0: typename RegexTraits::char_class_type upper_; sl@0: }; sl@0: sl@0: }} // namespace boost::xpressive sl@0: sl@0: #endif