pcre2test: avoid printing invalid utf trail in partial match (#237)

When match_invalid_utf is enabled, invalid UTF-8 data can't match,
but it was mistakenly getting printed as part of a partial match
even though the ovector correctly didn't include it, as shown by:

  PCRE2 version 10.34 2019-11-21
    re> /(?<=..)X/match_invalid_utf,allvector
  data> XX\x80\=ph,ovector=1
  Partial match: \x{80}
  ** ovector[1] is not equal to the subject length: 2 != 3
   0: 2 2

Fix the logic to print instead the empty match that was returned, and
address a buffer overread when trying to decode UTF-8 that was missing
code units.

Fixes: #235

commit: 15a11d10a8580ee09f7cd433e1fdaccb88106c4f [log] [tgz]
author: Carlo Marcelo Arenas Belón <carenas@gmail.com> Fri Apr 21 01:11:36 2023 -0700
committer: GitHub <noreply@github.com> Fri Apr 21 09:11:36 2023 +0100
tree: 1a573c2f727f7b8ee1df608b6d52b906d4edcb3e
parent: 9bad4654be04c6419c443a1e7469844ece15e8cf [diff]
diff --git a/.gitignore b/.gitignore
index c25f473..7dacbcb 100644
--- a/.gitignore
+++ b/.gitignore

@@ -66,6 +66,10 @@
 testtemp2grep
 testtry
 testtrygrep
+testSinput
+testbtables
+testsaved1
+testsaved2
 
 m4/libtool.m4
 m4/ltoptions.m4

diff --git a/src/pcre2test.c b/src/pcre2test.c
index 4e2dfa8..7834509 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c

@@ -2913,6 +2913,7 @@
 
 Argument:
   utf8bytes   a pointer to the byte vector
+  end         a pointer to the end of the byte vector
   vptr        a pointer to an int to receive the value
 
 Returns:      >  0 => the number of bytes consumed
@@ -2920,7 +2921,7 @@
 */
 
 static int
-utf82ord(PCRE2_SPTR8 utf8bytes, uint32_t *vptr)
+utf82ord(PCRE2_SPTR8 utf8bytes, PCRE2_SPTR8 end, uint32_t *vptr)
 {
 uint32_t c = *utf8bytes++;
 uint32_t d = c;
@@ -2942,6 +2943,8 @@
 
 for (j = 0; j < i; j++)
   {
+  if (utf8bytes >= end) return 0;
+
   c = *utf8bytes++;
   if ((c & 0xc0) != 0x80) return -(j+1);
   s -= 6;
@@ -3052,14 +3055,16 @@
 
 static int pchars8(PCRE2_SPTR8 p, int length, BOOL utf, FILE *f)
 {
+PCRE2_SPTR8 end;
 uint32_t c = 0;
 int yield = 0;
 if (length < 0) length = *p++;
+end = p + length;
 while (length-- > 0)
   {
   if (utf)
     {
-    int rc = utf82ord(p, &c);
+    int rc = utf82ord(p, end, &c);
     if (rc > 0 && rc <= length + 1)   /* Mustn't run over the end */
       {
       length -= rc - 1;
@@ -3238,7 +3243,8 @@
 else while (len > 0)
   {
   uint32_t c;
-  int chlen = utf82ord(p, &c);
+  const uint8_t *end = p + len;
+  int chlen = utf82ord(p, end, &c);
   if (chlen <= 0) return -1;
   if (!utf && c > 0xffff) return -3;
   if (c > 0x10ffff) return -2;
@@ -3329,13 +3335,14 @@
   int chlen;
   uint32_t c;
   uint32_t topbit = 0;
+  const uint8_t *end = p + len;
   if (!utf && *p == 0xff && len > 1)
     {
     topbit = 0x80000000u;
     p++;
     len--;
     }
-  chlen = utf82ord(p, &c);
+  chlen = utf82ord(p, end, &c);
   if (chlen <= 0) return -1;
   if (utf && c > 0x10ffff) return -2;
   p += chlen;
@@ -6852,7 +6859,9 @@
   uint8_t *q;
   uint32_t cc;
   int n = 1;
-  for (q = p; n > 0 && *q; q += n) n = utf82ord(q, &cc);
+  uint8_t *q_end = p + len;
+
+  for (q = p; n > 0 && *q; q += n) n = utf82ord(q, q_end, &cc);
   if (n <= 0)
     {
     fprintf(outfile, "** Failed: invalid UTF-8 string cannot be used as input "
@@ -8081,7 +8090,7 @@
     rubriclength += 15;
 
     PCHARS(backlength, pp, leftchar, ovector[0] - leftchar, utf, outfile);
-    PCHARSV(pp, ovector[0], ulen - ovector[0], utf, outfile);
+    PCHARSV(pp, ovector[0], ovector[1] - ovector[0], utf, outfile);
 
     if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used)
       fprintf(outfile, " (JIT)");
commit	15a11d10a8580ee09f7cd433e1fdaccb88106c4f	[log] [tgz]
author	Carlo Marcelo Arenas Belón <carenas@gmail.com>	Fri Apr 21 01:11:36 2023 -0700
committer	GitHub <noreply@github.com>	Fri Apr 21 09:11:36 2023 +0100
tree	1a573c2f727f7b8ee1df608b6d52b906d4edcb3e
parent	9bad4654be04c6419c443a1e7469844ece15e8cf [diff]