// rules.hpp
// Copyright (c) 2007-2008 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
| #ifndef BOOST_LEXER_RULES_HPP |
| #define BOOST_LEXER_RULES_HPP |
| |
| #include "consts.hpp" |
| #include <deque> |
| #include <locale> |
| #include <map> |
| #include "runtime_error.hpp" |
| #include <set> |
| #include "size_t.hpp" |
| #include <sstream> |
| #include <string> |
| #include <vector> |
| |
| namespace boost |
| { |
| namespace lexer |
| { |
namespace detail
{
    // Supplies the name of the initial lexer state as a string literal of
    // the requested character type. The primary template is only declared,
    // so character types other than char and wchar_t fail to compile when
    // str () is requested.
    template <typename CharT>
    struct initial;

    // Narrow character version.
    template <>
    struct initial<char>
    {
        static const char *str ()
        {
            return "INITIAL";
        }
    };

    // Wide character version.
    template <>
    struct initial<wchar_t>
    {
        static const wchar_t *str ()
        {
            return L"INITIAL";
        }
    };
}
| |
| template<typename CharT> |
| class basic_rules |
| { |
| public: |
| typedef std::vector<std::size_t> id_vector; |
| typedef std::deque<id_vector> id_vector_deque; |
| typedef std::basic_string<CharT> string; |
| typedef std::deque<string> string_deque; |
| typedef std::deque<string_deque> string_deque_deque; |
| typedef std::set<string> string_set; |
| typedef std::pair<string, string> string_pair; |
| typedef std::deque<string_pair> string_pair_deque; |
| typedef std::map<string, std::size_t> string_size_t_map; |
| typedef std::pair<string, std::size_t> string_size_t_pair; |
| |
| basic_rules (const regex_flags flags_ = dot_not_newline) : |
| _flags (flags_) |
| { |
| add_state (initial ()); |
| } |
| |
| void clear () |
| { |
| _statemap.clear (); |
| _macrodeque.clear (); |
| _macroset.clear (); |
| _regexes.clear (); |
| _ids.clear (); |
| _states.clear (); |
| _flags = dot_not_newline; |
| _locale = std::locale (); |
| add_state (initial ()); |
| } |
| |
| void clear (const CharT *state_name_) |
| { |
| std::size_t state_ = state (state_name_); |
| |
| if (state_ != npos) |
| { |
| _regexes[state_].clear (); |
| _ids[state_].clear (); |
| _states[state_].clear (); |
| } |
| } |
| |
| void flags (const regex_flags flags_) |
| { |
| _flags = flags_; |
| } |
| |
| regex_flags flags () const |
| { |
| return _flags; |
| } |
| |
| std::locale imbue (std::locale &locale_) |
| { |
| std::locale loc_ = _locale; |
| |
| _locale = locale_; |
| return loc_; |
| } |
| |
| const std::locale &locale () const |
| { |
| return _locale; |
| } |
| |
| std::size_t state (const CharT *name_) const |
| { |
| std::size_t state_ = npos; |
| typename string_size_t_map::const_iterator iter_ = |
| _statemap.find (name_); |
| |
| if (iter_ != _statemap.end ()) |
| { |
| state_ = iter_->second; |
| } |
| |
| return state_; |
| } |
| |
| void add_state (const CharT *name_) |
| { |
| validate (name_, true); |
| |
| if (_statemap.insert (string_size_t_pair (name_, |
| _statemap.size ())).second) |
| { |
| _regexes.push_back (string_deque ()); |
| _ids.push_back (id_vector ()); |
| _states.push_back (id_vector ()); |
| } |
| } |
| |
| void add_macro (const CharT *name_, const CharT *regex_) |
| { |
| add_macro (name_, string (regex_)); |
| } |
| |
| void add_macro (const CharT *name_, const CharT *regex_start_, |
| const CharT *regex_end_) |
| { |
| add_macro (name_, string (regex_start_, regex_end_)); |
| } |
| |
| void add_macro (const CharT *name_, const string ®ex_) |
| { |
| validate (name_, false); |
| |
| typename string_set::const_iterator iter_ = _macroset.find (name_); |
| |
| if (iter_ == _macroset.end ()) |
| { |
| _macrodeque.push_back (string_pair (name_, regex_)); |
| _macroset.insert (name_); |
| } |
| else |
| { |
| std::basic_stringstream<CharT> ss_; |
| std::ostringstream os_; |
| |
| os_ << "Attempt to redefine MACRO '"; |
| |
| while (*name_) |
| { |
| os_ << ss_.narrow (*name_++, static_cast<CharT> (' ')); |
| } |
| |
| os_ << "'."; |
| throw runtime_error (os_.str ()); |
| } |
| } |
| |
| void add (const CharT *regex_, const std::size_t id_) |
| { |
| add (string (regex_), id_); |
| } |
| |
| void add (const CharT *regex_start_, const CharT *regex_end_, |
| const std::size_t id_) |
| { |
| add (string (regex_start_, regex_end_), id_); |
| } |
| |
| void add (const string ®ex_, const std::size_t id_) |
| { |
| check_for_invalid_id (id_); |
| _regexes[0].push_back (regex_); |
| _ids[0].push_back (id_); |
| _states[0].push_back (0); |
| } |
| |
| void add (const CharT *curr_state_, const CharT *regex_, |
| const CharT *new_state_) |
| { |
| add (curr_state_, string (regex_), new_state_); |
| } |
| |
| void add (const CharT *curr_state_, const CharT *regex_start_, |
| const CharT *regex_end_, const CharT *new_state_) |
| { |
| add (curr_state_, string (regex_start_, regex_end_), new_state_); |
| } |
| |
| void add (const CharT *curr_state_, const string ®ex_, |
| const CharT *new_state_) |
| { |
| add (curr_state_, regex_, 0, new_state_, false); |
| } |
| |
| void add (const CharT *curr_state_, const CharT *regex_, |
| const std::size_t id_, const CharT *new_state_) |
| { |
| add (curr_state_, string (regex_), id_, new_state_); |
| } |
| |
| void add (const CharT *curr_state_, const CharT *regex_start_, |
| const CharT *regex_end_, const std::size_t id_, const CharT *new_state_) |
| { |
| add (curr_state_, string (regex_start_, regex_end_), id_, new_state_); |
| } |
| |
| void add (const CharT *curr_state_, const string ®ex_, |
| const std::size_t id_, const CharT *new_state_) |
| { |
| add (curr_state_, regex_, id_, new_state_, true); |
| } |
| |
| void add (const CharT *curr_state_, const basic_rules &rules_) |
| { |
| const string_deque_deque ®exes_ = rules_.regexes (); |
| const id_vector_deque &ids_ = rules_.ids (); |
| typename string_deque_deque::const_iterator state_regex_iter_ = |
| regexes_.begin (); |
| typename string_deque_deque::const_iterator state_regex_end_ = |
| regexes_.end (); |
| typename id_vector_deque::const_iterator state_id_iter_ = |
| ids_.begin (); |
| typename string_deque::const_iterator regex_iter_; |
| typename string_deque::const_iterator regex_end_; |
| typename id_vector::const_iterator id_iter_; |
| |
| for (; state_regex_iter_ != state_regex_end_; ++state_regex_iter_) |
| { |
| regex_iter_ = state_regex_iter_->begin (); |
| regex_end_ = state_regex_iter_->end (); |
| id_iter_ = state_id_iter_->begin (); |
| |
| for (; regex_iter_ != regex_end_; ++regex_iter_, ++id_iter_) |
| { |
| add (curr_state_, *regex_iter_, *id_iter_, curr_state_); |
| } |
| } |
| } |
| |
| const string_size_t_map &statemap () const |
| { |
| return _statemap; |
| } |
| |
| const string_pair_deque ¯odeque () const |
| { |
| return _macrodeque; |
| } |
| |
| const string_deque_deque ®exes () const |
| { |
| return _regexes; |
| } |
| |
| const id_vector_deque &ids () const |
| { |
| return _ids; |
| } |
| |
| const id_vector_deque &states () const |
| { |
| return _states; |
| } |
| |
| bool empty () const |
| { |
| typename string_deque_deque::const_iterator iter_ = _regexes.begin (); |
| typename string_deque_deque::const_iterator end_ = _regexes.end (); |
| bool empty_ = true; |
| |
| for (; iter_ != end_; ++iter_) |
| { |
| if (!iter_->empty ()) |
| { |
| empty_ = false; |
| break; |
| } |
| } |
| |
| return empty_; |
| } |
| |
| static const CharT *initial () |
| { |
| return detail::initial<CharT>::str (); |
| } |
| |
| private: |
| string_size_t_map _statemap; |
| string_pair_deque _macrodeque; |
| string_set _macroset; |
| string_deque_deque _regexes; |
| id_vector_deque _ids; |
| id_vector_deque _states; |
| regex_flags _flags; |
| std::locale _locale; |
| |
| void add (const CharT *curr_state_, const string ®ex_, |
| const std::size_t id_, const CharT *new_state_, const bool check_) |
| { |
| const bool star_ = *curr_state_ == '*' && *(curr_state_ + 1) == 0; |
| const bool dot_ = *new_state_ == '.' && *(new_state_ + 1) == 0; |
| |
| if (check_) |
| { |
| check_for_invalid_id (id_); |
| } |
| |
| if (!dot_) |
| { |
| validate (new_state_, true); |
| } |
| |
| std::size_t new_ = string::npos; |
| typename string_size_t_map::const_iterator iter_; |
| typename string_size_t_map::const_iterator end_ = _statemap.end (); |
| id_vector states_; |
| |
| if (!dot_) |
| { |
| iter_ = _statemap.find (new_state_); |
| |
| if (iter_ == end_) |
| { |
| std::basic_stringstream<CharT> ss_; |
| std::ostringstream os_; |
| |
| os_ << "Unknown state name '"; |
| |
| while (*new_state_) |
| { |
| os_ << ss_.narrow (*new_state_++, ' '); |
| } |
| |
| os_ << "'."; |
| throw runtime_error (os_.str ()); |
| } |
| |
| new_ = iter_->second; |
| } |
| |
| if (star_) |
| { |
| const std::size_t size_ = _statemap.size (); |
| |
| for (std::size_t i_ = 0; i_ < size_; ++i_) |
| { |
| states_.push_back (i_); |
| } |
| } |
| else |
| { |
| const CharT *start_ = curr_state_; |
| string state_; |
| |
| while (*curr_state_) |
| { |
| while (*curr_state_ && *curr_state_ != ',') |
| { |
| ++curr_state_; |
| } |
| |
| state_.assign (start_, curr_state_); |
| |
| if (*curr_state_) |
| { |
| ++curr_state_; |
| start_ = curr_state_; |
| } |
| |
| validate (state_.c_str (), true); |
| iter_ = _statemap.find (state_.c_str ()); |
| |
| if (iter_ == end_) |
| { |
| std::basic_stringstream<CharT> ss_; |
| std::ostringstream os_; |
| |
| os_ << "Unknown state name '"; |
| |
| while (*curr_state_) |
| { |
| os_ << ss_.narrow (*curr_state_++, ' '); |
| } |
| |
| os_ << "'."; |
| throw runtime_error (os_.str ()); |
| } |
| |
| states_.push_back (iter_->second); |
| } |
| } |
| |
| for (std::size_t i_ = 0, size_ = states_.size (); i_ < size_; ++i_) |
| { |
| const std::size_t curr_ = states_[i_]; |
| |
| _regexes[curr_].push_back (regex_); |
| _ids[curr_].push_back (id_); |
| _states[curr_].push_back (dot_ ? curr_ : new_); |
| } |
| } |
| |
| void validate (const CharT *name_, const bool comma_) const |
| { |
| again: |
| const CharT *start_ = name_; |
| |
| if (*name_ != '_' && !(*name_ >= 'A' && *name_ <= 'Z') && |
| !(*name_ >= 'a' && *name_ <= 'z')) |
| { |
| std::basic_stringstream<CharT> ss_; |
| std::ostringstream os_; |
| |
| os_ << "Invalid name '"; |
| |
| while (*name_) |
| { |
| os_ << ss_.narrow (*name_++, ' '); |
| } |
| |
| os_ << "'."; |
| throw runtime_error (os_.str ()); |
| } |
| else if (*name_) |
| { |
| ++name_; |
| } |
| |
| while (*name_) |
| { |
| if (*name_ == ',' && comma_) |
| { |
| ++name_; |
| goto again; |
| } |
| |
| if (*name_ != '_' && *name_ != '-' && |
| !(*name_ >= 'A' && *name_ <= 'Z') && |
| !(*name_ >= 'a' && *name_ <= 'z') && |
| !(*name_ >= '0' && *name_ <= '9')) |
| { |
| std::basic_stringstream<CharT> ss_; |
| std::ostringstream os_; |
| |
| os_ << "Invalid name '"; |
| |
| while (*name_) |
| { |
| os_ << ss_.narrow (*name_++, ' '); |
| } |
| |
| os_ << "'."; |
| throw runtime_error (os_.str ()); |
| } |
| |
| ++name_; |
| } |
| |
| if (name_ - start_ > static_cast<std::ptrdiff_t>(max_macro_len)) |
| { |
| std::basic_stringstream<CharT> ss_; |
| std::ostringstream os_; |
| |
| os_ << "Name '"; |
| |
| while (*name_) |
| { |
| os_ << ss_.narrow (*name_++, ' '); |
| } |
| |
| os_ << "' too long."; |
| throw runtime_error (os_.str ()); |
| } |
| } |
| |
| void check_for_invalid_id (const std::size_t id_) const |
| { |
| switch (id_) |
| { |
| case 0: |
| throw runtime_error ("id 0 is reserved for EOF."); |
| case npos: |
| throw runtime_error ("id npos is reserved for the " |
| "UNKNOWN token."); |
| default: |
| // OK |
| break; |
| } |
| } |
| }; |
| |
| typedef basic_rules<char> rules; |
| typedef basic_rules<wchar_t> wrules; |
| } |
| } |
| |
| #endif |