Fix auto-anchor bug when .* is inside an assertion.
diff --git a/ChangeLog b/ChangeLog
index 3403d1d..d7568e0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -29,15 +29,7 @@
existing subpattern.
(e) A conditional recursion test such as (?(R)...) misbehaved if there was a
group whose name began with "R".
- (f) The amount of memory needed for a compiled pattern was miscalculated if a
- lookbehind contained more than one toplevel branch and the first branch
- was of length zero.
- (g) In UTF-8 or UTF-16 modes with PCRE2_EXTENDED (/x) set and a non-zero-
- terminated pattern, if a # comment ran on to the end of the pattern, one
- or more code units past the end were being read.
- (h) An unterminated repeat at the end of a non-zero-terminated pattern (e.g.
- "{2,2") could cause reading beyond the pattern.
-
+
One effect of the refactoring is that some error numbers and messages have
changed, and the pattern offset given for compiling errors is not always the
right-most character that has been read. In particular, for a variable-length
@@ -61,6 +53,17 @@
a lookup outside one of the global tables. A similar bug existed for wide
characters in *VERB names.
+ (d) The amount of memory needed for a compiled pattern was miscalculated if a
+ lookbehind contained more than one toplevel branch and the first branch
+ was of length zero.
+
+ (e) In UTF-8 or UTF-16 modes with PCRE2_EXTENDED (/x) set and a non-zero-
+ terminated pattern, if a # comment ran on to the end of the pattern, one
+ or more code units past the end were being read.
+
+ (f) An unterminated repeat at the end of a non-zero-terminated pattern (e.g.
+ "{2,2") could cause reading beyond the pattern.
+
4. Back references are now permitted in lookbehind assertions when there are
no duplicated group numbers (that is, (?| has not been used), and, if the
reference is by name, there is only one group of that name. The referenced
@@ -122,6 +125,10 @@
compiled. A non-installed binary to run the test function locally, called
pcre2fuzzcheck is also compiled.
+18. A pattern with PCRE2_DOTALL (/s) set but not PCRE2_NO_DOTSTAR_ANCHOR, and
+which started with .* inside a positive lookahead, was incorrectly being
+compiled as implicitly anchored.
+
Version 10.22 29-July-2016
--------------------------
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 06be3bf..edb49d0 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -7634,13 +7634,14 @@
the less precise approach
cb points to the compile data block
atomcount atomic group level
+ inassert TRUE if in an assertion
Returns: TRUE or FALSE
*/
static BOOL
is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
- int atomcount)
+ int atomcount, BOOL inassert)
{
do {
PCRE2_SPTR scode = first_significant_code(
@@ -7652,7 +7653,8 @@
if (op == OP_BRA || op == OP_BRAPOS ||
op == OP_SBRA || op == OP_SBRAPOS)
{
- if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
+ if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
+ return FALSE;
}
/* Capturing brackets */
@@ -7662,33 +7664,44 @@
{
int n = GET2(scode, 1+LINK_SIZE);
int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
- if (!is_anchored(scode, new_map, cb, atomcount)) return FALSE;
+ if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
}
- /* Positive forward assertions and conditions */
+ /* Positive forward assertion */
- else if (op == OP_ASSERT || op == OP_COND)
+ else if (op == OP_ASSERT)
{
- if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
+ if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
+ }
+
+ /* Condition */
+
+ else if (op == OP_COND)
+ {
+ if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
+ return FALSE;
}
/* Atomic groups */
else if (op == OP_ONCE || op == OP_ONCE_NC)
{
- if (!is_anchored(scode, bracket_map, cb, atomcount + 1))
+ if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
return FALSE;
}
/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
it isn't in brackets that are or may be referenced or inside an atomic
- group. There is also an option that disables auto-anchoring. */
+ group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
+ because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
+ with the subject "aab", which matches "b", i.e. not at the start of the
+ subject. There is also an option that disables auto-anchoring. */
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
op == OP_TYPEPOSSTAR))
{
if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
- atomcount > 0 || cb->had_pruneorskip ||
+ atomcount > 0 || cb->had_pruneorskip || inassert ||
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
return FALSE;
}
@@ -9423,7 +9436,7 @@
disable this case). */
if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
- is_anchored(codestart, 0, &cb, 0))
+ is_anchored(codestart, 0, &cb, 0, FALSE))
re->overall_options |= PCRE2_ANCHORED;
/* If the pattern is still not anchored and we do not have a first code unit,
diff --git a/testdata/testinput1 b/testdata/testinput1
index bcc42bd..7978e0c 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -5812,4 +5812,7 @@
/(?=.*X)X$/
\ X
+/(?s)(?=.*?)b/
+ aabc
+
# End of testinput1
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 837d1f4..617ca8a 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9285,4 +9285,8 @@
\ X
0: X
+/(?s)(?=.*?)b/
+ aabc
+ 0: b
+
# End of testinput1