RE2::Set: fix handling of regexps with trailing $,
by forcing all regexps to have trailing $ and checking
for the match only after the $ has been pushed through.
[Exported from internal Google RE2 repository.]
R=rsc
CC=re2-dev
http://codereview.appspot.com/6410043
diff --git a/re2/compile.cc b/re2/compile.cc
index 03036cd..c44efb6 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -732,7 +732,7 @@
Frag f = Match(re->match_id());
// Remember unanchored match to end of string.
if (anchor_ != RE2::ANCHOR_BOTH)
- f = Cat(DotStar(), f);
+ f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f));
return f;
}
diff --git a/re2/dfa.cc b/re2/dfa.cc
index 7d206fb..6503455 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -1006,7 +1006,17 @@
}
bool ismatch = false;
RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_, start_unanchored_);
- swap(q0_, q1_);
+
+ // Most of the time, we build the state from the output of
+ // RunWorkqOnByte, so swap q0_ and q1_ here. However, so that
+ // RE2::Set can tell exactly which match instructions
+ // contributed to the match, don't swap if c is kByteEndText.
+ // The resulting state wouldn't be correct for further processing
+ // of the string, but we're at the end of the text so that's okay.
+ // Leaving q0_ alone preserves the match instructions that led to
+ // the current setting of ismatch.
+ if (c != kByteEndText || kind_ != Prog::kManyMatch)
+ swap(q0_, q1_);
// Save afterflag along with ismatch and isword in new state.
uint flag = afterflag;
@@ -1422,20 +1432,6 @@
}
}
- // Peek in state to see if a match is coming up.
- if (params->matches && kind_ == Prog::kManyMatch) {
- vector<int>* v = params->matches;
- v->clear();
- if (s > SpecialStateMax) {
- for (int i = 0; i < s->ninst_; i++) {
- Prog::Inst* ip = prog_->inst(s->inst_[i]);
- if (ip->opcode() == kInstMatch)
- v->push_back(ip->match_id());
- }
- }
- }
-
-
// Process one more byte to see if it triggers a match.
// (Remember, matches are delayed one byte.)
int lastbyte;
@@ -1480,6 +1476,15 @@
if (s > SpecialStateMax && s->IsMatch()) {
matched = true;
lastmatch = p;
+ if (params->matches && kind_ == Prog::kManyMatch) {
+ vector<int>* v = params->matches;
+ v->clear();
+ for (int i = 0; i < s->ninst_; i++) {
+ Prog::Inst* ip = prog_->inst(s->inst_[i]);
+ if (ip->opcode() == kInstMatch)
+ v->push_back(ip->match_id());
+ }
+ }
if (DebugDFA)
fprintf(stderr, "match @%d! [%s]\n", static_cast<int>(lastmatch - bp),
DumpState(s).c_str());
diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc
index 89aed80..74058a4 100644
--- a/re2/testing/set_test.cc
+++ b/re2/testing/set_test.cc
@@ -69,6 +69,18 @@
CHECK_EQ(v.size(), 0);
}
+TEST(Set, UnanchoredDollar) {
+ RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+ CHECK_EQ(s.Add("foo$", NULL), 0);
+ CHECK_EQ(s.Compile(), true);
+
+ vector<int> v;
+ CHECK_EQ(s.Match("foo", &v), true);
+ CHECK_EQ(v.size(), 1);
+ CHECK_EQ(v[0], 0);
+}
+
TEST(Set, Anchored) {
RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);