| // Copyright 2006 The RE2 Authors. All Rights Reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Helper class for traversing Regexps without recursion. |
| // Clients should declare their own subclasses that override |
| // the PreVisit and PostVisit methods, which are called before |
| // and after visiting the subexpressions. |
| |
| // Not quite the Visitor pattern, because (among other things) |
| // the Visitor pattern is recursive. |
| |
| #ifndef RE2_WALKER_INL_H__ |
| #define RE2_WALKER_INL_H__ |
| |
| #include "re2/regexp.h" |
| |
| namespace re2 { |
| |
| template<typename T> struct WalkState; |
| |
| template<typename T> class Regexp::Walker { |
| public: |
| Walker(); |
| virtual ~Walker(); |
| |
| // Virtual method called before visiting re's children. |
| // PreVisit passes ownership of its return value to its caller. |
| // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg |
| // and passed to the child PreVisits and PostVisits as parent_arg. |
| // At the top-most Regexp, parent_arg is arg passed to walk. |
| // If PreVisit sets *stop to true, the walk does not recurse |
| // into the children. Instead it behaves as though the return |
| // value from PreVisit is the return value from PostVisit. |
| // The default PreVisit returns parent_arg. |
| virtual T PreVisit(Regexp* re, T parent_arg, bool* stop); |
| |
| // Virtual method called after visiting re's children. |
| // The pre_arg is the T that PreVisit returned. |
| // The child_args is a vector of the T that the child PostVisits returned. |
| // PostVisit takes ownership of pre_arg. |
| // PostVisit takes ownership of the Ts |
| // in *child_args, but not the vector itself. |
| // PostVisit passes ownership of its return value |
| // to its caller. |
| // The default PostVisit simply returns pre_arg. |
| virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg, |
| T* child_args, int nchild_args); |
| |
| // Virtual method called to copy a T, |
| // when Walk notices that more than one child is the same re. |
| virtual T Copy(T arg); |
| |
| // Virtual method called to do a "quick visit" of the re, |
| // but not its children. Only called once the visit budget |
| // has been used up and we're trying to abort the walk |
| // as quickly as possible. Should return a value that |
| // makes sense for the parent PostVisits still to be run. |
| // This function is (hopefully) only called by |
| // WalkExponential, but must be implemented by all clients, |
| // just in case. |
| virtual T ShortVisit(Regexp* re, T parent_arg) = 0; |
| |
| // Walks over a regular expression. |
| // Top_arg is passed as parent_arg to PreVisit and PostVisit of re. |
| // Returns the T returned by PostVisit on re. |
| T Walk(Regexp* re, T top_arg); |
| |
| // Like Walk, but doesn't use Copy. This can lead to |
| // exponential runtimes on cross-linked Regexps like the |
| // ones generated by Simplify. To help limit this, |
| // at most max_visits nodes will be visited and then |
| // the walk will be cut off early. |
| // If the walk *is* cut off early, ShortVisit(re) |
| // will be called on regexps that cannot be fully |
| // visited rather than calling PreVisit/PostVisit. |
| T WalkExponential(Regexp* re, T top_arg, int max_visits); |
| |
| // Clears the stack. Should never be necessary, since |
| // Walk always enters and exits with an empty stack. |
| // Logs DFATAL if stack is not already clear. |
| void Reset(); |
| |
| // Returns whether walk was cut off. |
| bool stopped_early() { return stopped_early_; } |
| |
| private: |
| // Walk state for the entire traversal. |
| stack<WalkState<T> >* stack_; |
| bool stopped_early_; |
| int max_visits_; |
| |
| T WalkInternal(Regexp* re, T top_arg, bool use_copy); |
| |
| DISALLOW_EVIL_CONSTRUCTORS(Walker); |
| }; |
| |
| template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re, |
| T parent_arg, |
| bool* stop) { |
| return parent_arg; |
| } |
| |
| template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re, |
| T parent_arg, |
| T pre_arg, |
| T* child_args, |
| int nchild_args) { |
| return pre_arg; |
| } |
| |
| template<typename T> T Regexp::Walker<T>::Copy(T arg) { |
| return arg; |
| } |
| |
| // State about a single level in the traversal. |
| template<typename T> struct WalkState { |
| WalkState<T>(Regexp* re, T parent) |
| : re(re), |
| n(-1), |
| parent_arg(parent), |
| child_args(NULL) { } |
| |
| Regexp* re; // The regexp |
| int n; // The index of the next child to process; -1 means need to PreVisit |
| T parent_arg; // Accumulated arguments. |
| T pre_arg; |
| T child_arg; // One-element buffer for child_args. |
| T* child_args; |
| }; |
| |
| template<typename T> Regexp::Walker<T>::Walker() { |
| stack_ = new stack<WalkState<T> >; |
| stopped_early_ = false; |
| } |
| |
| template<typename T> Regexp::Walker<T>::~Walker() { |
| Reset(); |
| delete stack_; |
| } |
| |
| // Clears the stack. Should never be necessary, since |
| // Walk always enters and exits with an empty stack. |
| // Logs DFATAL if stack is not already clear. |
| template<typename T> void Regexp::Walker<T>::Reset() { |
| if (stack_ && stack_->size() > 0) { |
| LOG(DFATAL) << "Stack not empty."; |
| while (stack_->size() > 0) { |
| delete stack_->top().child_args; |
| stack_->pop(); |
| } |
| } |
| } |
| |
| template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, |
| bool use_copy) { |
| Reset(); |
| |
| if (re == NULL) { |
| LOG(DFATAL) << "Walk NULL"; |
| return top_arg; |
| } |
| |
| stack_->push(WalkState<T>(re, top_arg)); |
| |
| WalkState<T>* s; |
| for (;;) { |
| T t; |
| s = &stack_->top(); |
| Regexp* re = s->re; |
| switch (s->n) { |
| case -1: { |
| if (--max_visits_ < 0) { |
| stopped_early_ = true; |
| t = ShortVisit(re, s->parent_arg); |
| break; |
| } |
| bool stop = false; |
| s->pre_arg = PreVisit(re, s->parent_arg, &stop); |
| if (stop) { |
| t = s->pre_arg; |
| break; |
| } |
| s->n = 0; |
| s->child_args = NULL; |
| if (re->nsub_ == 1) |
| s->child_args = &s->child_arg; |
| else if (re->nsub_ > 1) |
| s->child_args = new T[re->nsub_]; |
| // Fall through. |
| } |
| default: { |
| if (re->nsub_ > 0) { |
| Regexp** sub = re->sub(); |
| if (s->n < re->nsub_) { |
| if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) { |
| s->child_args[s->n] = Copy(s->child_args[s->n - 1]); |
| s->n++; |
| } else { |
| stack_->push(WalkState<T>(sub[s->n], s->pre_arg)); |
| } |
| continue; |
| } |
| } |
| |
| t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n); |
| if (re->nsub_ > 1) |
| delete[] s->child_args; |
| break; |
| } |
| } |
| |
| // We've finished stack_->top(). |
| // Update next guy down. |
| stack_->pop(); |
| if (stack_->size() == 0) |
| return t; |
| s = &stack_->top(); |
| if (s->child_args != NULL) |
| s->child_args[s->n] = t; |
| else |
| s->child_arg = t; |
| s->n++; |
| } |
| } |
| |
| template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) { |
| // Without the exponential walking behavior, |
| // this budget should be more than enough for any |
| // regexp, and yet not enough to get us in trouble |
| // as far as CPU time. |
| max_visits_ = 1000000; |
| return WalkInternal(re, top_arg, true); |
| } |
| |
| template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg, |
| int max_visits) { |
| max_visits_ = max_visits; |
| return WalkInternal(re, top_arg, false); |
| } |
| |
| } // namespace re2 |
| |
| #endif // RE2_WALKER_INL_H__ |