pcre2test: avoid printing invalid utf trail in partial match (#237)
When match_invalid_utf is enabled, invalid UTF-8 data can't match
but it was mistakenly getting printed as part of a partial match
eventhough the ovector correctly didn't include it, as shown by:
PCRE2 version 10.34 2019-11-21
re> /(?<=..)X/match_invalid_utf,allvector
data> XX\x80\=ph,ovector=1
Partial match: \x{80}
** ovector[1] is not equal to the subject length: 2 != 3
0: 2 2
Fix the logic to print instead the empty match that was returned
and address a buffer overread when trying to decode UTF-8 that was
missing code units.
Fixes: #235
diff --git a/.gitignore b/.gitignore
index c25f473..7dacbcb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,6 +66,10 @@
testtemp2grep
testtry
testtrygrep
+testSinput
+testbtables
+testsaved1
+testsaved2
m4/libtool.m4
m4/ltoptions.m4
diff --git a/src/pcre2test.c b/src/pcre2test.c
index 4e2dfa8..7834509 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@@ -2913,6 +2913,7 @@
Argument:
utf8bytes a pointer to the byte vector
+ end a pointer to the end of the byte vector
vptr a pointer to an int to receive the value
Returns: > 0 => the number of bytes consumed
@@ -2920,7 +2921,7 @@
*/
static int
-utf82ord(PCRE2_SPTR8 utf8bytes, uint32_t *vptr)
+utf82ord(PCRE2_SPTR8 utf8bytes, PCRE2_SPTR8 end, uint32_t *vptr)
{
uint32_t c = *utf8bytes++;
uint32_t d = c;
@@ -2942,6 +2943,8 @@
for (j = 0; j < i; j++)
{
+ if (utf8bytes >= end) return 0;
+
c = *utf8bytes++;
if ((c & 0xc0) != 0x80) return -(j+1);
s -= 6;
@@ -3052,14 +3055,16 @@
static int pchars8(PCRE2_SPTR8 p, int length, BOOL utf, FILE *f)
{
+PCRE2_SPTR8 end;
uint32_t c = 0;
int yield = 0;
if (length < 0) length = *p++;
+end = p + length;
while (length-- > 0)
{
if (utf)
{
- int rc = utf82ord(p, &c);
+ int rc = utf82ord(p, end, &c);
if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
{
length -= rc - 1;
@@ -3238,7 +3243,8 @@
else while (len > 0)
{
uint32_t c;
- int chlen = utf82ord(p, &c);
+ const uint8_t *end = p + len;
+ int chlen = utf82ord(p, end, &c);
if (chlen <= 0) return -1;
if (!utf && c > 0xffff) return -3;
if (c > 0x10ffff) return -2;
@@ -3329,13 +3335,14 @@
int chlen;
uint32_t c;
uint32_t topbit = 0;
+ const uint8_t *end = p + len;
if (!utf && *p == 0xff && len > 1)
{
topbit = 0x80000000u;
p++;
len--;
}
- chlen = utf82ord(p, &c);
+ chlen = utf82ord(p, end, &c);
if (chlen <= 0) return -1;
if (utf && c > 0x10ffff) return -2;
p += chlen;
@@ -6852,7 +6859,9 @@
uint8_t *q;
uint32_t cc;
int n = 1;
- for (q = p; n > 0 && *q; q += n) n = utf82ord(q, &cc);
+ uint8_t *q_end = p + len;
+
+ for (q = p; n > 0 && *q; q += n) n = utf82ord(q, q_end, &cc);
if (n <= 0)
{
fprintf(outfile, "** Failed: invalid UTF-8 string cannot be used as input "
@@ -8081,7 +8090,7 @@
rubriclength += 15;
PCHARS(backlength, pp, leftchar, ovector[0] - leftchar, utf, outfile);
- PCHARSV(pp, ovector[0], ulen - ovector[0], utf, outfile);
+ PCHARSV(pp, ovector[0], ovector[1] - ovector[0], utf, outfile);
if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used)
fprintf(outfile, " (JIT)");