Made PCRE2_UCP the default in UTF mode in pcre2grep, and added new options --case-restrict and --no-ucp.

commit: 8385df8c97b6f8069a48e600c7e4e94cc3e3ebd9 [log] [tgz]
author: Philip Hazel <Philip.Hazel@gmail.com> Wed Feb 08 15:09:18 2023 +0000
committer: Philip Hazel <Philip.Hazel@gmail.com> Wed Feb 08 15:09:18 2023 +0000
tree: 15d65ce0ec185fabcf616c69229797adb59ad81d
parent: 6a97f600d6572d024fc3a1a42f4730b83c380440 [diff]
diff --git a/ChangeLog b/ChangeLog
index 16d0eef..b211e9c 100644
--- a/ChangeLog
+++ b/ChangeLog

@@ -55,6 +55,9 @@
 
 12. Integer overflow testing is now centralized in a new function.
 
+13. Made PCRE2_UCP the default in UTF mode in pcre2grep, and added new options 
+--case-restrict and --no-ucp.
+
 
 Version 10.42 11-December-2022
 ------------------------------

diff --git a/RunGrepTest b/RunGrepTest
index 0a00e82..4383010 100755
--- a/RunGrepTest
+++ b/RunGrepTest

@@ -854,8 +854,6 @@
 (cd $srcdir; $valgrind $vjs $pcre2grep --colour=always -e this -e The -e 'The wo' testdata/grepinputv) >>testtrygrep
 
 
-
-
 # Now compare the results.
 
 $cf $srcdir/testdata/grepoutput testtrygrep
@@ -893,6 +891,14 @@
   (cd $srcdir; $valgrind $vjs $pcre2grep -u -m1 -O '=$x{1d3}$o{744}=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
   echo "RC=$?" >>testtrygrep
 
+  echo "---------------------------- Test U7 ------------------------------" >>testtrygrep
+  (cd $srcdir; $valgrind $vjs $pcre2grep -ui --colour=always 'k+|\babc\b' ./testdata/grepinput8) >>testtrygrep
+  echo "RC=$?" >>testtrygrep
+
+  echo "---------------------------- Test U8 ------------------------------" >>testtrygrep
+  (cd $srcdir; $valgrind $vjs $pcre2grep -UiEP --colour=always 'k+|\babc\b' ./testdata/grepinput8) >>testtrygrep
+  echo "RC=$?" >>testtrygrep
+
   $cf $srcdir/testdata/grepoutput8 testtrygrep
   if [ $? != 0 ] ; then exit 1; fi
 

diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html
index 29ab031..eb688a4 100644
--- a/doc/html/pcre2grep.html
+++ b/doc/html/pcre2grep.html

@@ -21,7 +21,7 @@
 <li><a name="TOC6" href="#SEC6">OPTIONS</a>
 <li><a name="TOC7" href="#SEC7">ENVIRONMENT VARIABLES</a>
 <li><a name="TOC8" href="#SEC8">NEWLINES</a>
-<li><a name="TOC9" href="#SEC9">OPTIONS COMPATIBILITY</a>
+<li><a name="TOC9" href="#SEC9">OPTIONS COMPATIBILITY WITH GNU GREP</a>
 <li><a name="TOC10" href="#SEC10">OPTIONS WITH DATA</a>
 <li><a name="TOC11" href="#SEC11">USING PCRE2'S CALLOUT FACILITY</a>
 <li><a name="TOC12" href="#SEC12">MATCHING ERRORS</a>
@@ -314,6 +314,14 @@
 See <b>--match-limit</b> below.
 </P>
 <P>
+<b>-E</b>, <b>--case-restrict</b>
+When case distinctions are being ignored in Unicode mode, two ASCII letters (K
+and S) will by default match Unicode characters U+212A (Kelvin sign) and U+017F
+(long S) respectively, as well as their lower case ASCII counterparts. When
+this option is set, case equivalences are restricted such that no ASCII
+character matches a non-ASCII character, and vice versa.
+</P>
+<P>
 <b>-e</b> <i>pattern</i>, <b>--regex=</b><i>pattern</i>, <b>--regexp=</b><i>pattern</i>
 Specify a pattern to be matched. This option can be used multiple times in
 order to specify several patterns. It can also be used as a way of specifying a
@@ -449,7 +457,9 @@
 </P>
 <P>
 <b>-i</b>, <b>--ignore-case</b>
-Ignore upper/lower case distinctions during comparisons.
+Ignore upper/lower case distinctions when pattern matching. This applies when
+matching path names for inclusion or exclusion as well as when matching lines
+in files.
 </P>
 <P>
 <b>--include</b>=<i>pattern</i>
@@ -759,6 +769,18 @@
 is an empty string. Separating strings are never coloured.
 </P>
 <P>
+<b>-P</b>, <b>--no-ucp</b>
+Starting from release 10.43, when UTF/Unicode mode is specified with <b>-u</b>
+or <b>-U</b>, the PCRE2_UCP option is used by default. This means that the
+simple class escapes in patterns match more than just ASCII characters. For
+example, \d matches any Unicode decimal digit. The <b>--no-ucp</b> option
+suppresses PCRE2_UCP, thus restricting the class escapes to ASCII characters,
+as was the case in earlier releases. Note that there are now more fine-grained
+option settings within patterns that affect individual escapes. For example,
+when PCRE2_UCP is set, the sequence (?aD) restricts \d to ASCII digits, while
+allowing \w to match Unicode letters and digits.
+</P>
+<P>
 <b>-q</b>, <b>--quiet</b>
 Work quietly, that is, display nothing except error messages. The exit
 status indicates whether or not any matches were found.
@@ -796,11 +818,11 @@
 </P>
 <P>
 <b>-u</b>, <b>--utf</b>
-Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
-with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
-<b>--include</b> options) and all lines that are scanned must be valid strings
-of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
-occurs.
+Operate in UTF/Unicode mode. This option is available only if PCRE2 has been
+compiled with UTF-8 support. All patterns (including those for any
+<b>--exclude</b> and <b>--include</b> options) and all lines that are scanned
+must be valid strings of UTF-8 characters. If an invalid UTF-8 string is
+encountered, an error occurs.
 </P>
 <P>
 <b>-U</b>, <b>--utf-allow-invalid</b>
@@ -883,25 +905,27 @@
 standard output must end with "\r\n". For all other operating systems, and
 for all messages to the standard error stream, "\n" is used.
 </P>
-<br><a name="SEC9" href="#TOC1">OPTIONS COMPATIBILITY</a><br>
+<br><a name="SEC9" href="#TOC1">OPTIONS COMPATIBILITY WITH GNU GREP</a><br>
 <P>
-Many of the short and long forms of <b>pcre2grep</b>'s options are the same
-as in the GNU <b>grep</b> program. Any long option of the form
-<b>--xxx-regexp</b> (GNU terminology) is also available as <b>--xxx-regex</b>
-(PCRE2 terminology). However, the <b>--depth-limit</b>, <b>--file-list</b>,
-<b>--file-offsets</b>, <b>--heap-limit</b>, <b>--include-dir</b>,
-<b>--line-offsets</b>, <b>--locale</b>, <b>--match-limit</b>, <b>-M</b>,
-<b>--multiline</b>, <b>-N</b>, <b>--newline</b>, <b>--om-separator</b>,
-<b>--output</b>, <b>-u</b>, <b>--utf</b>, <b>-U</b>, and <b>--utf-allow-invalid</b>
-options are specific to <b>pcre2grep</b>, as is the use of the
-<b>--only-matching</b> option with a capturing parentheses number.
+Many of the short and long forms of <b>pcre2grep</b>'s options are the same as
+in the GNU <b>grep</b> program. Any long option of the form <b>--xxx-regexp</b>
+(GNU terminology) is also available as <b>--xxx-regex</b> (PCRE2 terminology).
+However, the <b>--case-restrict</b>, <b>--depth-limit</b>, <b>-E</b>,
+<b>--file-list</b>, <b>--file-offsets</b>, <b>--heap-limit</b>,
+<b>--include-dir</b>, <b>--line-offsets</b>, <b>--locale</b>, <b>--match-limit</b>,
+<b>-M</b>, <b>--multiline</b>, <b>-N</b>, <b>--newline</b>, <b>--no-ucp</b>,
+<b>--om-separator</b>, <b>--output</b>, <b>-P</b>, <b>-u</b>, <b>--utf</b>,
+<b>-U</b>, and <b>--utf-allow-invalid</b> options are specific to
+<b>pcre2grep</b>, as is the use of the <b>--only-matching</b> option with a
+capturing parentheses number.
 </P>
 <P>
 Although most of the common options work the same way, a few are different in
 <b>pcre2grep</b>. For example, the <b>--include</b> option's argument is a glob
-for GNU <b>grep</b>, but a regular expression for <b>pcre2grep</b>. If both the
-<b>-c</b> and <b>-l</b> options are given, GNU grep lists only file names,
-without counts, but <b>pcre2grep</b> gives the counts as well.
+for GNU <b>grep</b>, but in <b>pcre2grep</b> it is a regular expression to which
+the <b>-i</b> option applies. If both the <b>-c</b> and <b>-l</b> options are given,
+GNU grep lists only file names, without counts, but <b>pcre2grep</b> gives the
+counts as well.
 </P>
 <br><a name="SEC10" href="#TOC1">OPTIONS WITH DATA</a><br>
 <P>
@@ -1065,9 +1089,9 @@
 </P>
 <br><a name="SEC16" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 21 November 2022
+Last updated: 08 February 2023
 <br>
-Copyright &copy; 1997-2022 University of Cambridge.
+Copyright &copy; 1997-2023 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.

diff --git a/doc/pcre2grep.1 b/doc/pcre2grep.1
index 956633d..80e3c84 100644
--- a/doc/pcre2grep.1
+++ b/doc/pcre2grep.1

@@ -1,4 +1,4 @@
-.TH PCRE2GREP 1 "21 November 2022" "PCRE2 10.41"
+.TH PCRE2GREP 1 "08 February 2023" "PCRE2 10.43"
 .SH NAME
 pcre2grep - a grep with Perl-compatible regular expressions.
 .SH SYNOPSIS
@@ -268,6 +268,13 @@
 \fB--depth-limit\fP=\fInumber\fP
 See \fB--match-limit\fP below.
 .TP
+\fB-E\fP, \fB--case-restrict\fP
+When case distinctions are being ignored in Unicode mode, two ASCII letters (K
+and S) will by default match Unicode characters U+212A (Kelvin sign) and U+017F
+(long S) respectively, as well as their lower case ASCII counterparts. When
+this option is set, case equivalences are restricted such that no ASCII
+character matches a non-ASCII character, and vice versa.
+.TP
 \fB-e\fP \fIpattern\fP, \fB--regex=\fP\fIpattern\fP, \fB--regexp=\fP\fIpattern\fP
 Specify a pattern to be matched. This option can be used multiple times in
 order to specify several patterns. It can also be used as a way of specifying a
@@ -388,7 +395,9 @@
 \fB--binary-files\fP=\fIwithout-match\fP.
 .TP
 \fB-i\fP, \fB--ignore-case\fP
-Ignore upper/lower case distinctions during comparisons.
+Ignore upper/lower case distinctions when pattern matching. This applies when
+matching path names for inclusion or exclusion as well as when matching lines
+in files.
 .TP
 \fB--include\fP=\fIpattern\fP
 If any \fB--include\fP patterns are specified, the only files that are
@@ -660,6 +669,17 @@
 Specify a separating string for multiple occurrences of \fB-o\fP. The default
 is an empty string. Separating strings are never coloured.
 .TP
+\fB-P\fP, \fB--no-ucp\fP
+Starting from release 10.43, when UTF/Unicode mode is specified with \fB-u\fP
+or \fB-U\fP, the PCRE2_UCP option is used by default. This means that the
+simple class escapes in patterns match more than just ASCII characters. For
+example, \ed matches any Unicode decimal digit. The \fB--no-ucp\fP option
+suppresses PCRE2_UCP, thus restricting the class escapes to ASCII characters,
+as was the case in earlier releases. Note that there are now more fine-grained
+option settings within patterns that affect individual escapes. For example,
+when PCRE2_UCP is set, the sequence (?aD) restricts \ed to ASCII digits, while
+allowing \ew to match Unicode letters and digits.
+.TP
 \fB-q\fP, \fB--quiet\fP
 Work quietly, that is, display nothing except error messages. The exit
 status indicates whether or not any matches were found.
@@ -692,11 +712,11 @@
 total would always be zero.
 .TP
 \fB-u\fP, \fB--utf\fP
-Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
-with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
-\fB--include\fP options) and all lines that are scanned must be valid strings
-of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
-occurs.
+Operate in UTF/Unicode mode. This option is available only if PCRE2 has been
+compiled with UTF-8 support. All patterns (including those for any
+\fB--exclude\fP and \fB--include\fP options) and all lines that are scanned
+must be valid strings of UTF-8 characters. If an invalid UTF-8 string is
+encountered, an error occurs.
 .TP
 \fB-U\fP, \fB--utf-allow-invalid\fP
 As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
@@ -778,25 +798,27 @@
 for all messages to the standard error stream, "\en" is used.
 .
 .
-.SH "OPTIONS COMPATIBILITY"
+.SH "OPTIONS COMPATIBILITY WITH GNU GREP"
 .rs
 .sp
-Many of the short and long forms of \fBpcre2grep\fP's options are the same
-as in the GNU \fBgrep\fP program. Any long option of the form
-\fB--xxx-regexp\fP (GNU terminology) is also available as \fB--xxx-regex\fP
-(PCRE2 terminology). However, the \fB--depth-limit\fP, \fB--file-list\fP,
-\fB--file-offsets\fP, \fB--heap-limit\fP, \fB--include-dir\fP,
-\fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, \fB-M\fP,
-\fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--om-separator\fP,
-\fB--output\fP, \fB-u\fP, \fB--utf\fP, \fB-U\fP, and \fB--utf-allow-invalid\fP
-options are specific to \fBpcre2grep\fP, as is the use of the
-\fB--only-matching\fP option with a capturing parentheses number.
+Many of the short and long forms of \fBpcre2grep\fP's options are the same as
+in the GNU \fBgrep\fP program. Any long option of the form \fB--xxx-regexp\fP
+(GNU terminology) is also available as \fB--xxx-regex\fP (PCRE2 terminology).
+However, the \fB--case-restrict\fP, \fB--depth-limit\fP, \fB-E\fP,
+\fB--file-list\fP, \fB--file-offsets\fP, \fB--heap-limit\fP,
+\fB--include-dir\fP, \fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP,
+\fB-M\fP, \fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--no-ucp\fP,
+\fB--om-separator\fP, \fB--output\fP, \fB-P\fP, \fB-u\fP, \fB--utf\fP,
+\fB-U\fP, and \fB--utf-allow-invalid\fP options are specific to
+\fBpcre2grep\fP, as is the use of the \fB--only-matching\fP option with a
+capturing parentheses number.
 .P
 Although most of the common options work the same way, a few are different in
 \fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob
-for GNU \fBgrep\fP, but a regular expression for \fBpcre2grep\fP. If both the
-\fB-c\fP and \fB-l\fP options are given, GNU grep lists only file names,
-without counts, but \fBpcre2grep\fP gives the counts as well.
+for GNU \fBgrep\fP, but in \fBpcre2grep\fP it is a regular expression to which
+the \fB-i\fP option applies. If both the \fB-c\fP and \fB-l\fP options are given,
+GNU grep lists only file names, without counts, but \fBpcre2grep\fP gives the
+counts as well.
 .
 .
 .SH "OPTIONS WITH DATA"
@@ -970,6 +992,6 @@
 .rs
 .sp
 .nf
-Last updated: 21 November 2022
-Copyright (c) 1997-2022 University of Cambridge.
+Last updated: 08 February 2023
+Copyright (c) 1997-2023 University of Cambridge.
 .fi

diff --git a/doc/pcre2grep.txt b/doc/pcre2grep.txt
index adc1d89..602c977 100644
--- a/doc/pcre2grep.txt
+++ b/doc/pcre2grep.txt

@@ -286,6 +286,15 @@
        --depth-limit=number
                  See --match-limit below.
 
+       -E, --case-restrict
+                 When case distinctions are being ignored in Unicode mode, two
+                 ASCII letters (K and S) will by default match Unicode charac-
+                 ters  U+212A  (Kelvin sign) and U+017F (long S) respectively,
+                 as well as their lower case ASCII counterparts. When this op-
+                 tion  is  set,  case equivalences are restricted such that no
+                 ASCII character  matches  a  non-ASCII  character,  and  vice
+                 versa.
+
        -e pattern, --regex=pattern, --regexp=pattern
                  Specify a pattern to be matched. This option can be used mul-
                  tiple times in order to specify several patterns. It can also
@@ -421,7 +430,9 @@
                  files=without-match.
 
        -i, --ignore-case
-                 Ignore upper/lower case distinctions during comparisons.
+                 Ignore upper/lower case distinctions when  pattern  matching.
+                 This applies when matching path names for inclusion or exclu-
+                 sion as well as when matching lines in files.
 
        --include=pattern
                  If any --include patterns are specified, the only files  that
@@ -736,6 +747,19 @@
                  The default is an empty string. Separating strings are  never
                  coloured.
 
+       -P, --no-ucp
+                 Starting  from release 10.43, when UTF/Unicode mode is speci-
+                 fied with -u or -U, the PCRE2_UCP option is used by  default.
+                 This  means  that  the simple class escapes in patterns match
+                 more than just ASCII characters. For example, \d matches  any
+                 Unicode   decimal   digit.  The  --no-ucp  option  suppresses
+                 PCRE2_UCP, thus restricting the class escapes to ASCII  char-
+                 acters,  as was the case in earlier releases. Note that there
+                 are now more fine-grained  option  settings  within  patterns
+                 that  affect  individual escapes. For example, when PCRE2_UCP
+                 is set, the sequence (?aD)  restricts  \d  to  ASCII  digits,
+                 while allowing \w to match Unicode letters and digits.
+
        -q, --quiet
                  Work quietly, that is, display nothing except error messages.
                  The exit status indicates whether or  not  any  matches  were
@@ -771,57 +795,58 @@
                  (list files without matches), because the grand  total  would
                  always be zero.
 
-       -u, --utf Operate in UTF-8 mode. This option is available only if PCRE2
-                 has been compiled with UTF-8 support. All patterns (including
-                 those  for any --exclude and --include options) and all lines
-                 that are scanned must be valid strings of  UTF-8  characters.
-                 If an invalid UTF-8 string is encountered, an error occurs.
+       -u, --utf Operate in UTF/Unicode mode. This option is available only if
+                 PCRE2 has been compiled with UTF-8 support. All patterns (in-
+                 cluding  those  for  any --exclude and --include options) and
+                 all lines that are scanned must be  valid  strings  of  UTF-8
+                 characters. If an invalid UTF-8 string is encountered, an er-
+                 ror occurs.
 
        -U, --utf-allow-invalid
-                 As  --utf,  but in addition subject lines may contain invalid
-                 UTF-8 code unit sequences. These can never form part  of  any
-                 pattern  match.  Patterns  themselves, however, must still be
+                 As --utf, but in addition subject lines may  contain  invalid
+                 UTF-8  code  unit sequences. These can never form part of any
+                 pattern match. Patterns themselves, however,  must  still  be
                  valid UTF-8 strings. This facility allows valid UTF-8 strings
                  to be sought within arbitrary byte sequences in executable or
-                 other binary files. For more details about matching  in  non-
+                 other  binary  files. For more details about matching in non-
                  valid UTF-8 strings, see the pcre2unicode(3) documentation.
 
        -V, --version
-                 Write  the version numbers of pcre2grep and the PCRE2 library
-                 to the standard output and then exit. Anything  else  on  the
+                 Write the version numbers of pcre2grep and the PCRE2  library
+                 to  the  standard  output and then exit. Anything else on the
                  command line is ignored.
 
        -v, --invert-match
-                 Invert  the  sense  of  the match, so that lines which do not
-                 match any of the patterns are the ones that are  found.  When
-                 this  option  is  set,  options  such  as --only-matching and
-                 --output, which specify parts of a match that are to be  out-
+                 Invert the sense of the match, so that  lines  which  do  not
+                 match  any  of the patterns are the ones that are found. When
+                 this option is  set,  options  such  as  --only-matching  and
+                 --output,  which specify parts of a match that are to be out-
                  put, are ignored.
 
        -w, --word-regex, --word-regexp
                  Force the patterns only to match "words". That is, there must
-                 be a word boundary at the  start  and  end  of  each  matched
-                 string.  This is equivalent to having "\b(?:" at the start of
-                 each pattern, and ")\b" at the end. This option applies  only
-                 to  the  patterns  that  are  matched against the contents of
-                 files; it does not apply to patterns specified by any of  the
+                 be  a  word  boundary  at  the  start and end of each matched
+                 string. This is equivalent to having "\b(?:" at the start  of
+                 each  pattern, and ")\b" at the end. This option applies only
+                 to the patterns that are  matched  against  the  contents  of
+                 files;  it does not apply to patterns specified by any of the
                  --include or --exclude options.
 
        -x, --line-regex, --line-regexp
-                 Force  the  patterns to start matching only at the beginnings
-                 of lines, and in  addition,  require  them  to  match  entire
+                 Force the patterns to start matching only at  the  beginnings
+                 of  lines,  and  in  addition,  require  them to match entire
                  lines. In multiline mode the match may be more than one line.
                  This is equivalent to having "^(?:" at the start of each pat-
-                 tern  and  ")$"  at  the end. This option applies only to the
-                 patterns that are matched against the contents of  files;  it
-                 does  not apply to patterns specified by any of the --include
+                 tern and ")$" at the end. This option  applies  only  to  the
+                 patterns  that  are matched against the contents of files; it
+                 does not apply to patterns specified by any of the  --include
                  or --exclude options.
 
        -Z, --null
-                 Terminate files names in the regular output with a zero  byte
-                 (the  NUL  character)  instead of what would normally appear.
-                 This is useful when file  names  contain  unusual  characters
-                 such  as  colons,  hyphens, or even newlines. The option does
+                 Terminate  file  names in the regular output with a zero byte
+                 (the NUL character) instead of what  would  normally  appear.
+                 This  is  useful  when  file names contain unusual characters
+                 such as colons, hyphens, or even newlines.  The  option  does
                  not apply to file names in error messages.
 
 
@@ -835,137 +860,139 @@
 
 NEWLINES
 
-       The  -N  (--newline) option allows pcre2grep to scan files with newline
-       conventions that differ from the default. This option affects only  the
-       way  scanned files are processed. It does not affect the interpretation
-       of files specified by the -f,  --file-list,  --exclude-from,  or  --in-
+       The -N (--newline) option allows pcre2grep to scan files  with  newline
+       conventions  that differ from the default. This option affects only the
+       way scanned files are processed. It does not affect the  interpretation
+       of  files  specified  by  the -f, --file-list, --exclude-from, or --in-
        clude-from options.
 
-       Any  parts  of the scanned input files that are written to the standard
-       output are copied with whatever newline sequences they have in the  in-
-       put.  However,  if  the final line of a file is output, and it does not
-       end with a newline sequence, a newline sequence is added. If  the  new-
-       line  setting  is  CR, LF, CRLF or NUL, that line ending is output; for
+       Any parts of the scanned input files that are written to  the  standard
+       output  are copied with whatever newline sequences they have in the in-
+       put. However, if the final line of a file is output, and  it  does  not
+       end  with  a newline sequence, a newline sequence is added. If the new-
+       line setting is CR, LF, CRLF or NUL, that line ending  is  output;  for
        the other settings (ANYCRLF or ANY) a single NL is used.
 
-       The newline setting does not affect the way in which  pcre2grep  writes
-       newlines  in  informational  messages  to the standard output and error
-       streams.  Under Windows, the standard output is set to  be  binary,  so
-       that  "\r\n" at the ends of output lines that are copied from the input
-       is not converted to "\r\r\n" by the C I/O library. This means that  any
-       messages  written  to the standard output must end with "\r\n". For all
-       other operating systems, and for all messages  to  the  standard  error
+       The  newline  setting does not affect the way in which pcre2grep writes
+       newlines in informational messages to the  standard  output  and  error
+       streams.   Under  Windows,  the standard output is set to be binary, so
+       that "\r\n" at the ends of output lines that are copied from the  input
+       is  not converted to "\r\r\n" by the C I/O library. This means that any
+       messages written to the standard output must end with "\r\n".  For  all
+       other  operating  systems,  and  for all messages to the standard error
        stream, "\n" is used.
 
 
-OPTIONS COMPATIBILITY
+OPTIONS COMPATIBILITY WITH GNU GREP
 
        Many of the short and long forms of pcre2grep's options are the same as
-       in the GNU grep program. Any long option of the form --xxx-regexp  (GNU
-       terminology) is also available as --xxx-regex (PCRE2 terminology). How-
-       ever, the  --depth-limit,  --file-list,  --file-offsets,  --heap-limit,
-       --include-dir,  --line-offsets,  --locale,  --match-limit, -M, --multi-
-       line, -N, --newline,  --om-separator,  --output,  -u,  --utf,  -U,  and
-       --utf-allow-invalid options are specific to pcre2grep, as is the use of
-       the --only-matching option with a capturing parentheses number.
+       in  the GNU grep program. Any long option of the form --xxx-regexp (GNU
+       terminology) is also  available  as  --xxx-regex  (PCRE2  terminology).
+       However,  the  --case-restrict, --depth-limit, -E, --file-list, --file-
+       offsets,   --heap-limit,   --include-dir,   --line-offsets,   --locale,
+       --match-limit,  -M,  --multiline, -N, --newline, --no-ucp, --om-separa-
+       tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid  options  are
+       specific to pcre2grep, as is the use of the --only-matching option with
+       a capturing parentheses number.
 
        Although most of the common options work the same way, a few  are  dif-
        ferent  in pcre2grep. For example, the --include option's argument is a
-       glob for GNU grep, but a regular expression for pcre2grep. If both  the
-       -c  and  -l  options are given, GNU grep lists only file names, without
-       counts, but pcre2grep gives the counts as well.
+       glob for GNU grep, but in pcre2grep it is a regular expression to which
+       the  -i  option  applies.  If both the -c and -l options are given, GNU
+       grep lists only file names, without counts,  but  pcre2grep  gives  the
+       counts as well.
 
 
 OPTIONS WITH DATA
 
        There are four different ways in which an option with data can be spec-
-       ified.   If  a  short  form option is used, the data may follow immedi-
+       ified.  If a short form option is used, the  data  may  follow  immedi-
        ately, or (with one exception) in the next command line item. For exam-
        ple:
 
          -f/some/file
          -f /some/file
 
-       The  exception is the -o option, which may appear with or without data.
-       Because of this, if data is present, it must follow immediately in  the
+       The exception is the -o option, which may appear with or without  data.
+       Because  of this, if data is present, it must follow immediately in the
        same item, for example -o3.
 
-       If  a long form option is used, the data may appear in the same command
-       line item, separated by an equals character, or (with  two  exceptions)
+       If a long form option is used, the data may appear in the same  command
+       line  item,  separated by an equals character, or (with two exceptions)
        it may appear in the next command line item. For example:
 
          --file=/some/file
          --file /some/file
 
-       Note,  however, that if you want to supply a file name beginning with ~
-       as data in a shell command, and have the shell expand ~ to a  home  di-
-       rectory,  you  must separate the file name from the option, because the
+       Note, however, that if you want to supply a file name beginning with  ~
+       as  data  in a shell command, and have the shell expand ~ to a home di-
+       rectory, you must separate the file name from the option,  because  the
        shell does not treat ~ specially unless it is at the start of an item.
 
-       The exceptions to the above are the --colour (or --color)  and  --only-
-       matching  options,  for which the data is optional. If one of these op-
-       tions does have data, it must be given in  the  first  form,  using  an
+       The  exceptions  to the above are the --colour (or --color) and --only-
+       matching options, for which the data is optional. If one of  these  op-
+       tions  does  have  data,  it  must be given in the first form, using an
        equals character. Otherwise pcre2grep will assume that it has no data.
 
 
 USING PCRE2'S CALLOUT FACILITY
 
-       pcre2grep  has,  by  default,  support for calling external programs or
-       scripts or echoing specific strings during matching by  making  use  of
-       PCRE2's  callout  facility.  However, this support can be completely or
-       partially disabled when pcre2grep is built. You can  find  out  whether
-       your  binary has support for callouts by running it with the --help op-
-       tion. If callout support is completely disabled, all callouts  in  pat-
+       pcre2grep has, by default, support for  calling  external  programs  or
+       scripts  or  echoing  specific strings during matching by making use of
+       PCRE2's callout facility. However, this support can  be  completely  or
+       partially  disabled  when  pcre2grep is built. You can find out whether
+       your binary has support for callouts by running it with the --help  op-
+       tion.  If  callout support is completely disabled, all callouts in pat-
        terns are ignored by pcre2grep.  If the facility is partially disabled,
-       calling external programs is not supported, and callouts  that  request
+       calling  external  programs is not supported, and callouts that request
        it are ignored.
 
-       A  callout  in a PCRE2 pattern is of the form (?C<arg>) where the argu-
-       ment is either a number or a quoted string (see the pcre2callout  docu-
-       mentation  for  details).  Numbered  callouts are ignored by pcre2grep;
+       A callout in a PCRE2 pattern is of the form (?C<arg>) where  the  argu-
+       ment  is either a number or a quoted string (see the pcre2callout docu-
+       mentation for details). Numbered callouts  are  ignored  by  pcre2grep;
        only callouts with string arguments are useful.
 
    Echoing a specific string
 
-       Starting the callout string with a pipe character  invokes  an  echoing
+       Starting  the  callout  string with a pipe character invokes an echoing
        facility that avoids calling an external program or script. This facil-
-       ity is always available, provided that  callouts  were  not  completely
-       disabled  when  pcre2grep  was built. The rest of the callout string is
-       processed as a zero-terminated string, which means it should  not  con-
-       tain  any  internal  binary  zeros. It is written to the output, having
-       first been passed through the same escape processing as text  from  the
-       --output  (-O) option (see above). However, $0 cannot be used to insert
-       a matched substring because the match is still  in  progress.  Instead,
-       the  single  character '0' is inserted. Any syntax errors in the string
-       (for example, a dollar not followed by another  character)  causes  the
-       callout  to be ignored. No terminator is added to the output string, so
-       if you want a newline, you must include it explicitly using the  escape
+       ity  is  always  available,  provided that callouts were not completely
+       disabled when pcre2grep was built. The rest of the  callout  string  is
+       processed  as  a zero-terminated string, which means it should not con-
+       tain any internal binary zeros. It is written  to  the  output,  having
+       first  been  passed through the same escape processing as text from the
+       --output (-O) option (see above). However, $0 cannot be used to  insert
+       a  matched  substring  because the match is still in progress. Instead,
+       the single character '0' is inserted. Any syntax errors in  the  string
+       (for  example,  a  dollar not followed by another character) causes the
+       callout to be ignored. No terminator is added to the output string,  so
+       if  you want a newline, you must include it explicitly using the escape
        $n. For example:
 
          pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
 
-       Matching  continues normally after the string is output. If you want to
-       see only the callout output but not any output from  an  actual  match,
+       Matching continues normally after the string is output. If you want  to
+       see  only  the  callout output but not any output from an actual match,
        you should end the pattern with (*FAIL).
 
    Calling external programs or scripts
 
        This facility can be independently disabled when pcre2grep is built. It
-       is supported for Windows, where a call to _spawnvp() is used, for  VMS,
-       where  lib$spawn()  is  used,  and  for any Unix-like environment where
+       is  supported for Windows, where a call to _spawnvp() is used, for VMS,
+       where lib$spawn() is used, and  for  any  Unix-like  environment  where
        fork() and execv() are available.
 
        If the callout string does not start with a pipe (vertical bar) charac-
-       ter,  it  is parsed into a list of substrings separated by pipe charac-
-       ters. The first substring must be an executable name, with the  follow-
+       ter, it is parsed into a list of substrings separated by  pipe  charac-
+       ters.  The first substring must be an executable name, with the follow-
        ing substrings specifying arguments:
 
          executable_name|arg1|arg2|...
 
-       Any  substring  (including  the executable name) may contain escape se-
-       quences started by a dollar character. These are the same  as  for  the
+       Any substring (including the executable name) may  contain  escape  se-
+       quences  started  by  a dollar character. These are the same as for the
        --output (-O) option documented above, except that $0 cannot insert the
-       matched string because the match is still  in  progress.  Instead,  the
+       matched  string  because  the  match is still in progress. Instead, the
        character '0' is inserted. If you need a literal dollar or pipe charac-
        ter in any substring, use $$ or $| respectively. Here is an example:
 
@@ -980,43 +1007,43 @@
            Arg1: [1] [234] [4] Arg2: |1| ()
            12345
 
-       The parameters for the system call that is used to run the  program  or
+       The  parameters  for the system call that is used to run the program or
        script are zero-terminated strings. This means that binary zero charac-
-       ters in the callout argument will cause premature termination of  their
-       substrings,  and  therefore should not be present. Any syntax errors in
-       the string (for example, a dollar not followed  by  another  character)
+       ters  in the callout argument will cause premature termination of their
+       substrings, and therefore should not be present. Any syntax  errors  in
+       the  string  (for  example, a dollar not followed by another character)
        cause the callout to be ignored.  If running the program fails for any
-       reason (including the non-existence of the executable), a local  match-
+       reason  (including the non-existence of the executable), a local match-
        ing failure occurs and the matcher backtracks in the normal way.
 
 
 MATCHING ERRORS
 
-       It  is  possible  to supply a regular expression that takes a very long
-       time to fail to match certain lines.  Such  patterns  normally  involve
-       nested  indefinite repeats, for example: (a+)*\d when matched against a
-       line of a's with no final digit. The PCRE2 matching function has a  re-
-       source  limit  that  causes it to abort in these circumstances. If this
-       happens, pcre2grep outputs an error message and the  line  that  caused
-       the  problem  to  the  standard error stream. If there are more than 20
+       It is possible to supply a regular expression that takes  a  very  long
+       time  to  fail  to  match certain lines. Such patterns normally involve
+       nested indefinite repeats, for example: (a+)*\d when matched against  a
+       line  of a's with no final digit. The PCRE2 matching function has a re-
+       source limit that causes it to abort in these  circumstances.  If  this
+       happens,  pcre2grep  outputs  an error message and the line that caused
+       the problem to the standard error stream. If there  are  more  than  20
        such errors, pcre2grep gives up.
 
-       The --match-limit option of pcre2grep can be used to  set  the  overall
-       resource  limit.  There are also other limits that affect the amount of
-       memory used during matching; see the  discussion  of  --heap-limit  and
+       The  --match-limit  option  of pcre2grep can be used to set the overall
+       resource limit. There are also other limits that affect the  amount  of
+       memory  used  during  matching;  see the discussion of --heap-limit and
        --depth-limit above.
 
 
 DIAGNOSTICS
 
        Exit status is 0 if any matches were found, 1 if no matches were found,
-       and 2 for syntax errors, overlong lines, non-existent  or  inaccessible
-       files  (even if matches were found in other files) or too many matching
+       and  2  for syntax errors, overlong lines, non-existent or inaccessible
+       files (even if matches were found in other files) or too many  matching
        errors. Using the -s option to suppress error messages about inaccessi-
        ble files does not affect the return code.
 
-       When   run  under  VMS,  the  return  code  is  placed  in  the  symbol
-       PCRE2GREP_RC because VMS  does  not  distinguish  between  exit(0)  and
+       When  run  under  VMS,  the  return  code  is  placed  in  the   symbol
+       PCRE2GREP_RC  because  VMS  does  not  distinguish  between exit(0) and
        exit(1).
 
 
@@ -1034,5 +1061,5 @@
 
 REVISION
 
-       Last updated: 21 November 2022
-       Copyright (c) 1997-2022 University of Cambridge.
+       Last updated: 08 February 2023
+       Copyright (c) 1997-2023 University of Cambridge.

diff --git a/src/pcre2grep.c b/src/pcre2grep.c
index e7eb311..83179da 100644
--- a/src/pcre2grep.c
+++ b/src/pcre2grep.c

@@ -13,7 +13,7 @@
 The header can be found in the special z/OS distribution, which is available
 from www.zaconsultants.net or from www.cbttape.org.
 
-           Copyright (c) 1997-2022 University of Cambridge
+           Copyright (c) 1997-2023 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -268,6 +268,7 @@
 static uint32_t capture_max = DEFAULT_CAPTURE_MAX;
 
 static BOOL all_matches = FALSE;
+static BOOL case_restrict = FALSE;
 static BOOL count_only = FALSE;
 static BOOL do_colour = FALSE;
 #ifdef WIN32
@@ -279,6 +280,7 @@
 static BOOL line_buffered = FALSE;
 static BOOL line_offsets = FALSE;
 static BOOL multiline = FALSE;
+static BOOL no_ucp = FALSE;
 static BOOL number = FALSE;
 static BOOL omit_zero_count = FALSE;
 static BOOL resource_error = FALSE;
@@ -437,6 +439,7 @@
   { OP_NODATA,     'c',      NULL,              "count",         "print only a count of matching lines per FILE" },
   { OP_STRING,     'D',      &DEE_option,       "devices=action","how to handle devices, FIFOs, and sockets" },
   { OP_STRING,     'd',      &dee_option,       "directories=action", "how to handle directories" },
+  { OP_NODATA,     'E',      NULL,              "case-restrict", "restrict case matching (no mix ASCII/non-ASCII)" },
   { OP_PATLIST,    'e',      &match_patdata,    "regex(p)=pattern", "specify pattern (may be used more than once)" },
   { OP_NODATA,     'F',      NULL,              "fixed-strings", "patterns are sets of newline-separated strings" },
   { OP_FILELIST,   'f',      &pattern_files_data, "file=path",   "read patterns from file" },
@@ -469,6 +472,7 @@
   { OP_OP_NUMBERS, 'o',      &only_matching_data, "only-matching=n", "show only the part of the line that matched" },
   { OP_STRING,     N_OM_SEPARATOR, &om_separator, "om-separator=text", "set separator for multiple -o output" },
   { OP_U32NUMBER,  N_OM_CAPTURE, &capture_max,  "om-capture=n",  "set capture count for --only-matching" },
+  { OP_NODATA,     'P',      NULL,              "no-ucp",        "do not set PCRE2_UCP in Unicode mode" },
   { OP_NODATA,     'q',      NULL,              "quiet",         "suppress output, just set return code" },
   { OP_NODATA,     'r',      NULL,              "recursive",     "recursively scan sub-directories" },
   { OP_PATLIST,    N_EXCLUDE,&exclude_patdata,  "exclude=pattern","exclude matching files when recursing" },
@@ -479,8 +483,8 @@
   { OP_FILELIST,   N_INCLUDE_FROM,&include_from_data, "include-from=path", "read include list from file" },
   { OP_NODATA,    's',      NULL,              "no-messages",   "suppress error messages" },
   { OP_NODATA,    't',      NULL,              "total-count",   "print total count of matching lines" },
-  { OP_NODATA,    'u',      NULL,              "utf",           "use UTF mode" },
-  { OP_NODATA,    'U',      NULL,              "utf-allow-invalid", "use UTF mode, allow for invalid code units" },
+  { OP_NODATA,    'u',      NULL,              "utf",           "use UTF/Unicode" },
+  { OP_NODATA,    'U',      NULL,              "utf-allow-invalid", "use UTF/Unicode, allow for invalid code units" },
   { OP_NODATA,    'V',      NULL,              "version",       "print version information and exit" },
   { OP_NODATA,    'v',      NULL,              "invert-match",  "select non-matching lines" },
   { OP_NODATA,    'w',      NULL,              "word-regex(p)", "force patterns to match only as words"  },
@@ -3585,6 +3589,7 @@
   case N_ALLABSK: extra_options |= PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK; break;
   case 'a': binary_files = BIN_TEXT; break;
   case 'c': count_only = TRUE; break;
+  case 'E': case_restrict = TRUE; break;
   case 'F': options |= PCRE2_LITERAL; break;
   case 'H': filenames = FN_FORCE; break;
   case 'I': binary_files = BIN_NOMATCH; break;
@@ -3600,12 +3605,14 @@
   if (only_matching == NULL) only_matching = only_matching_last;
   break;
 
+  case 'P': no_ucp = TRUE; break;
   case 'q': quiet = TRUE; break;
   case 'r': dee_action = dee_RECURSE; break;
   case 's': silent = TRUE; break;
   case 't': show_total_count = TRUE; break;
-  case 'u': options |= PCRE2_UTF; utf = TRUE; break;
-  case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break;
+  case 'u': options |= PCRE2_UTF | PCRE2_UCP; utf = TRUE; break;
+  case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF|PCRE2_UCP;
+            utf = TRUE; break;
   case 'v': invert = TRUE; break;
 
   case 'V':
@@ -4221,13 +4228,13 @@
 match_data_toggle = 0;
 
 /* If string (script) callouts are supported, set up the callout processing
-function. */
+function in the match context. */
 
 #ifdef SUPPORT_PCRE2GREP_CALLOUT
 pcre2_set_callout(match_context, pcre2grep_callout, NULL);
 #endif
 
-/* Put limits into the match data block. */
+/* Put limits into the match context. */
 
 if (heap_limit != PCRE2_UNSET) pcre2_set_heap_limit(match_context, heap_limit);
 if (match_limit > 0) pcre2_set_match_limit(match_context, match_limit);
@@ -4353,7 +4360,15 @@
     }
   }
 
-/* Set the extra options */
+/* If no_ucp is set, remove PCRE2_UCP from the compile options. */
+
+if (no_ucp) pcre2_options &= ~PCRE2_UCP;
+
+/* If case_restrict is set, adjust the extra options. */
+
+if (case_restrict) extra_options |= PCRE2_EXTRA_CASELESS_RESTRICT;
+
+/* Set the extra options in the compile context. */
 
 (void)pcre2_set_compile_extra_options(compile_context, extra_options);
 

diff --git a/testdata/grepinput8 b/testdata/grepinput8
index 7779cdc..748c674 100644
--- a/testdata/grepinput8
+++ b/testdata/grepinput8
Binary files differ

diff --git a/testdata/grepoutput b/testdata/grepoutput
index aa53aab..f0f5c44 100644
--- a/testdata/grepoutput
+++ b/testdata/grepoutput
Binary files differ

diff --git a/testdata/grepoutput8 b/testdata/grepoutput8
index 3888d9a..5bfd555 100644
--- a/testdata/grepoutput8
+++ b/testdata/grepoutput8
Binary files differ
commit	8385df8c97b6f8069a48e600c7e4e94cc3e3ebd9	[log] [tgz]
author	Philip Hazel <Philip.Hazel@gmail.com>	Wed Feb 08 15:09:18 2023 +0000
committer	Philip Hazel <Philip.Hazel@gmail.com>	Wed Feb 08 15:09:18 2023 +0000
tree	15d65ce0ec185fabcf616c69229797adb59ad81d
parent	6a97f600d6572d024fc3a1a42f4730b83c380440 [diff]