| // parser.hpp |
| // Copyright (c) 2007-2008 Ben Hanson (http://www.benhanson.net/) |
| // |
| // Distributed under the Boost Software License, Version 1.0. (See accompanying |
| // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
| #ifndef BOOST_LEXER_PARSER_HPP |
| #define BOOST_LEXER_PARSER_HPP |
| |
| #include <assert.h> |
| #include "tree/end_node.hpp" |
| #include "tree/iteration_node.hpp" |
| #include "tree/leaf_node.hpp" |
| #include "../runtime_error.hpp" |
| #include "tree/selection_node.hpp" |
| #include "tree/sequence_node.hpp" |
| #include "../size_t.hpp" |
| #include "tokeniser/re_tokeniser.hpp" |
| |
| namespace boost |
| { |
| namespace lexer |
| { |
| namespace detail |
| { |
| template<typename CharT> |
| class basic_parser |
| { |
| public: |
| typedef basic_re_tokeniser<CharT> tokeniser; |
| typedef typename tokeniser::string string; |
| typedef std::map<string, const node *> macro_map; |
| typedef node::node_ptr_vector node_ptr_vector; |
| typedef typename tokeniser::num_token token; |
| |
| /* |
| General principles of regex parsing: |
| - Every regex is a sequence of sub-regexes. |
| - Regexes consist of operands and operators |
| - All operators decompose to sequence, selection ('|') and iteration ('*') |
| - Regex tokens are stored on the stack. |
| - When a complete sequence of regex tokens is on the stack it is processed. |
| |
| Grammar: |
| |
| <REGEX> -> <OREXP> |
| <OREXP> -> <SEQUENCE> | <OREXP>'|'<SEQUENCE> |
| <SEQUENCE> -> <SUB> |
| <SUB> -> <EXPRESSION> | <SUB><EXPRESSION> |
| <EXPRESSION> -> <REPEAT> |
| <REPEAT> -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE> |
| <DUPLICATE> -> '?' | '*' | '+' | '{n[,[m]]}' |
| */ |
| static node *parse (const CharT *start_, const CharT * const end_, |
| const std::size_t id_, const std::size_t dfa_state_, |
| const regex_flags flags_, const std::locale &locale_, |
| node_ptr_vector &node_ptr_vector_, const macro_map ¯omap_, |
| typename tokeniser::token_map &map_, |
| bool &seen_BOL_assertion_, bool &seen_EOL_assertion_) |
| { |
| node *root_ = 0; |
| state state_ (start_, end_, flags_, locale_); |
| token lhs_token_; |
| token rhs_token_; |
| token_stack token_stack_; |
| tree_node_stack tree_node_stack_; |
| char action_ = 0; |
| |
| token_stack_.push (rhs_token_); |
| tokeniser::next (state_, map_, rhs_token_); |
| |
| do |
| { |
| lhs_token_ = token_stack_.top (); |
| action_ = lhs_token_.precedence (rhs_token_._type); |
| |
| switch (action_) |
| { |
| case '<': |
| case '=': |
| token_stack_.push (rhs_token_); |
| tokeniser::next (state_, map_, rhs_token_); |
| break; |
| case '>': |
| reduce (token_stack_, macromap_, node_ptr_vector_, |
| tree_node_stack_); |
| break; |
| default: |
| std::ostringstream ss_; |
| |
| ss_ << "A syntax error occurred: '" << |
| lhs_token_.precedence_string () << |
| "' against '" << rhs_token_.precedence_string () << |
| "' at index " << state_.index () << "."; |
| throw runtime_error (ss_.str ().c_str ()); |
| break; |
| } |
| } while (!token_stack_.empty ()); |
| |
| if (tree_node_stack_.empty ()) |
| { |
| throw runtime_error ("Empty rules are not allowed."); |
| } |
| |
| assert (tree_node_stack_.size () == 1); |
| |
| node *lhs_node_ = tree_node_stack_.top (); |
| |
| tree_node_stack_.pop (); |
| |
| if (id_ == 0) |
| { |
| // Macros have no end state... |
| root_ = lhs_node_; |
| } |
| else |
| { |
| node_ptr_vector_->push_back (0); |
| |
| node *rhs_node_ = new end_node (id_, dfa_state_); |
| |
| node_ptr_vector_->back () = rhs_node_; |
| node_ptr_vector_->push_back (0); |
| node_ptr_vector_->back () = new sequence_node (lhs_node_, rhs_node_); |
| root_ = node_ptr_vector_->back (); |
| } |
| |
| // Done this way as bug in VC++ 6 prevents |= operator working |
| // properly! |
| if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true; |
| |
| if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true; |
| |
| return root_; |
| } |
| |
| private: |
| typedef typename tokeniser::state state; |
| typedef std::stack<token> token_stack; |
| typedef node::node_stack tree_node_stack; |
| |
| static void reduce (token_stack &token_stack_, |
| const macro_map ¯omap_, node_ptr_vector &node_vector_ptr_, |
| tree_node_stack &tree_node_stack_) |
| { |
| typename tokeniser::num_token lhs_; |
| typename tokeniser::num_token rhs_; |
| token_stack handle_; |
| char action_ = 0; |
| |
| do |
| { |
| rhs_ = token_stack_.top (); |
| token_stack_.pop (); |
| handle_.push (rhs_); |
| |
| if (!token_stack_.empty ()) |
| { |
| lhs_ = token_stack_.top (); |
| action_ = lhs_.precedence (rhs_._type); |
| } |
| } while (!token_stack_.empty () && action_ == '='); |
| |
| assert (token_stack_.empty () || action_ == '<'); |
| |
| switch (rhs_._type) |
| { |
| case token::BEGIN: |
| // finished processing so exit |
| break; |
| case token::REGEX: |
| // finished parsing, nothing to do |
| break; |
| case token::OREXP: |
| orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); |
| break; |
| case token::SEQUENCE: |
| token_stack_.push (token::OREXP); |
| break; |
| case token::SUB: |
| sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); |
| break; |
| case token::EXPRESSION: |
| token_stack_.push (token::SUB); |
| break; |
| case token::REPEAT: |
| repeat (handle_, token_stack_); |
| break; |
| case token::CHARSET: |
| charset (handle_, token_stack_, node_vector_ptr_, |
| tree_node_stack_); |
| break; |
| case token::MACRO: |
| macro (handle_, token_stack_, macromap_, node_vector_ptr_, |
| tree_node_stack_); |
| break; |
| case token::OPENPAREN: |
| openparen (handle_, token_stack_); |
| break; |
| case token::OPT: |
| case token::AOPT: |
| optional (rhs_._type == token::OPT, node_vector_ptr_, |
| tree_node_stack_); |
| token_stack_.push (token::DUP); |
| break; |
| case token::ZEROORMORE: |
| case token::AZEROORMORE: |
| zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_, |
| tree_node_stack_); |
| token_stack_.push (token::DUP); |
| break; |
| case token::ONEORMORE: |
| case token::AONEORMORE: |
| one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_, |
| tree_node_stack_); |
| token_stack_.push (token::DUP); |
| break; |
| case token::REPEATN: |
| case token::AREPEATN: |
| repeatn (rhs_._type == token::REPEATN, handle_.top (), |
| node_vector_ptr_, tree_node_stack_); |
| token_stack_.push (token::DUP); |
| break; |
| default: |
| throw runtime_error |
| ("Internal error regex_parser::reduce"); |
| break; |
| } |
| } |
| |
| static void orexp (token_stack &handle_, token_stack &token_stack_, |
| node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) |
| { |
| assert (handle_.top ()._type == token::OREXP && |
| (handle_.size () == 1 || handle_.size () == 3)); |
| |
| if (handle_.size () == 1) |
| { |
| token_stack_.push (token::REGEX); |
| } |
| else |
| { |
| handle_.pop (); |
| assert (handle_.top ()._type == token::OR); |
| handle_.pop (); |
| assert (handle_.top ()._type == token::SEQUENCE); |
| perform_or (node_ptr_vector_, tree_node_stack_); |
| token_stack_.push (token::OREXP); |
| } |
| } |
| |
| static void sub (token_stack &handle_, token_stack &token_stack_, |
| node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) |
| { |
| assert (handle_.top ()._type == token::SUB && |
| handle_.size () == 1 || handle_.size () == 2); |
| |
| if (handle_.size () == 1) |
| { |
| token_stack_.push (token::SEQUENCE); |
| } |
| else |
| { |
| handle_.pop (); |
| assert (handle_.top ()._type == token::EXPRESSION); |
| // perform join |
| sequence (node_ptr_vector_, tree_node_stack_); |
| token_stack_.push (token::SUB); |
| } |
| } |
| |
| static void repeat (token_stack &handle_, token_stack &token_stack_) |
| { |
| assert (handle_.top ()._type == token::REPEAT && |
| handle_.size () >= 1 && handle_.size () <= 3); |
| |
| if (handle_.size () == 1) |
| { |
| token_stack_.push (token::EXPRESSION); |
| } |
| else |
| { |
| handle_.pop (); |
| assert (handle_.top ()._type == token::DUP); |
| token_stack_.push (token::REPEAT); |
| } |
| } |
| |
| static void charset (token_stack &handle_, token_stack &token_stack_, |
| node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) |
| { |
| assert (handle_.top ()._type == token::CHARSET && |
| handle_.size () == 1); |
| // store charset |
| node_ptr_vector_->push_back (0); |
| |
| const size_t id_ = handle_.top ()._id; |
| |
| node_ptr_vector_->back () = new leaf_node (id_, true); |
| tree_node_stack_.push (node_ptr_vector_->back ()); |
| token_stack_.push (token::REPEAT); |
| } |
| |
| static void macro (token_stack &handle_, token_stack &token_stack_, |
| const macro_map ¯omap_, node_ptr_vector &node_ptr_vector_, |
| tree_node_stack &tree_node_stack_) |
| { |
| token &top_ = handle_.top (); |
| |
| assert (top_._type == token::MACRO && handle_.size () == 1); |
| |
| typename macro_map::const_iterator iter_ = |
| macromap_.find (top_._macro); |
| |
| if (iter_ == macromap_.end ()) |
| { |
| const CharT *name_ = top_._macro; |
| std::basic_stringstream<CharT> ss_; |
| std::ostringstream os_; |
| |
| os_ << "Unknown MACRO name '"; |
| |
| while (*name_) |
| { |
| os_ << ss_.narrow (*name_++, ' '); |
| } |
| |
| os_ << "'."; |
| throw runtime_error (os_.str ()); |
| } |
| |
| tree_node_stack_.push (iter_->second->copy (node_ptr_vector_)); |
| token_stack_.push (token::REPEAT); |
| } |
| |
| static void openparen (token_stack &handle_, token_stack &token_stack_) |
| { |
| assert (handle_.top ()._type == token::OPENPAREN && |
| handle_.size () == 3); |
| handle_.pop (); |
| assert (handle_.top ()._type == token::REGEX); |
| handle_.pop (); |
| assert (handle_.top ()._type == token::CLOSEPAREN); |
| token_stack_.push (token::REPEAT); |
| } |
| |
| static void perform_or (node_ptr_vector &node_ptr_vector_, |
| tree_node_stack &tree_node_stack_) |
| { |
| // perform or |
| node *rhs_ = tree_node_stack_.top (); |
| |
| tree_node_stack_.pop (); |
| |
| node *lhs_ = tree_node_stack_.top (); |
| |
| node_ptr_vector_->push_back (0); |
| node_ptr_vector_->back () = new selection_node (lhs_, rhs_); |
| tree_node_stack_.top () = node_ptr_vector_->back (); |
| } |
| |
| static void sequence (node_ptr_vector &node_ptr_vector_, |
| tree_node_stack &tree_node_stack_) |
| { |
| node *rhs_ = tree_node_stack_.top (); |
| |
| tree_node_stack_.pop (); |
| |
| node *lhs_ = tree_node_stack_.top (); |
| |
| node_ptr_vector_->push_back (0); |
| node_ptr_vector_->back () = new sequence_node (lhs_, rhs_); |
| tree_node_stack_.top () = node_ptr_vector_->back (); |
| } |
| |
| static void optional (const bool greedy_, |
| node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) |
| { |
| // perform ? |
| node *lhs_ = tree_node_stack_.top (); |
| // You don't know if lhs_ is a leaf_node, so get firstpos. |
| node::node_vector &firstpos_ = lhs_->firstpos(); |
| |
| for (node::node_vector::iterator iter_ = firstpos_.begin (), |
| end_ = firstpos_.end (); iter_ != end_; ++iter_) |
| { |
| // These are leaf_nodes! |
| (*iter_)->greedy (greedy_); |
| } |
| |
| node_ptr_vector_->push_back (0); |
| |
| node *rhs_ = new leaf_node (null_token, greedy_); |
| |
| node_ptr_vector_->back () = rhs_; |
| node_ptr_vector_->push_back (0); |
| node_ptr_vector_->back () = new selection_node (lhs_, rhs_); |
| tree_node_stack_.top () = node_ptr_vector_->back (); |
| } |
| |
| static void zero_or_more (const bool greedy_, |
| node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) |
| { |
| // perform * |
| node *ptr_ = tree_node_stack_.top (); |
| |
| node_ptr_vector_->push_back (0); |
| node_ptr_vector_->back () = new iteration_node (ptr_, greedy_); |
| tree_node_stack_.top () = node_ptr_vector_->back (); |
| } |
| |
| static void one_or_more (const bool greedy_, |
| node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) |
| { |
| // perform + |
| node *lhs_ = tree_node_stack_.top (); |
| node *copy_ = lhs_->copy (node_ptr_vector_); |
| |
| node_ptr_vector_->push_back (0); |
| |
| node *rhs_ = new iteration_node (copy_, greedy_); |
| |
| node_ptr_vector_->back () = rhs_; |
| node_ptr_vector_->push_back (0); |
| node_ptr_vector_->back () = new sequence_node (lhs_, rhs_); |
| tree_node_stack_.top () = node_ptr_vector_->back (); |
| } |
| |
| static void repeatn (const bool greedy_, const token &token_, |
| node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) |
| { |
| // perform {n[,[m]]} |
| // Semantic checks have already been performed. |
| // {0,} = * |
| // {0,1} = ? |
| // {1,} = + |
| // therefore we do not check for these cases. |
| if (!(token_._min == 1 && !token_._comma)) |
| { |
| const std::size_t top_ = token_._min > 0 ? |
| token_._min : token_._max; |
| |
| if (token_._min == 0) |
| { |
| optional (greedy_, node_ptr_vector_, tree_node_stack_); |
| } |
| |
| node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_); |
| node *curr_ = 0; |
| |
| for (std::size_t i_ = 2; i_ < top_; ++i_) |
| { |
| node *temp_ = prev_->copy (node_ptr_vector_); |
| |
| curr_ = temp_; |
| tree_node_stack_.push (0); |
| tree_node_stack_.top () = prev_; |
| sequence (node_ptr_vector_, tree_node_stack_); |
| prev_ = curr_; |
| } |
| |
| if (token_._comma && token_._min > 0) |
| { |
| if (token_._min > 1) |
| { |
| node *temp_ = prev_->copy (node_ptr_vector_); |
| |
| curr_ = temp_; |
| tree_node_stack_.push (0); |
| tree_node_stack_.top () = prev_; |
| sequence (node_ptr_vector_, tree_node_stack_); |
| prev_ = curr_; |
| } |
| |
| if (token_._comma && token_._max) |
| { |
| tree_node_stack_.push (0); |
| tree_node_stack_.top () = prev_; |
| optional (greedy_, node_ptr_vector_, tree_node_stack_); |
| |
| node *temp_ = tree_node_stack_.top (); |
| |
| tree_node_stack_.pop (); |
| prev_ = temp_; |
| |
| const std::size_t count_ = token_._max - token_._min; |
| |
| for (std::size_t i_ = 1; i_ < count_; ++i_) |
| { |
| node *temp_ = prev_->copy (node_ptr_vector_); |
| |
| curr_ = temp_; |
| tree_node_stack_.push (0); |
| tree_node_stack_.top () = prev_; |
| sequence (node_ptr_vector_, tree_node_stack_); |
| prev_ = curr_; |
| } |
| } |
| else |
| { |
| tree_node_stack_.push (0); |
| tree_node_stack_.top () = prev_; |
| zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_); |
| |
| node *temp_ = tree_node_stack_.top (); |
| |
| prev_ = temp_; |
| tree_node_stack_.pop (); |
| } |
| } |
| |
| tree_node_stack_.push (0); |
| tree_node_stack_.top () = prev_; |
| sequence (node_ptr_vector_, tree_node_stack_); |
| } |
| } |
| }; |
| } |
| } |
| } |
| |
| #endif |