Made PCRE2_UCP the default in UTF mode in pcre2grep, and added new options --case-restrict and --no-ucp.
diff --git a/ChangeLog b/ChangeLog
index 16d0eef..b211e9c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -55,6 +55,9 @@
12. Integer overflow testing is now centralized in a new function.
+13. Made PCRE2_UCP the default in UTF mode in pcre2grep, and added new options
+--case-restrict and --no-ucp.
+
Version 10.42 11-December-2022
------------------------------
diff --git a/RunGrepTest b/RunGrepTest
index 0a00e82..4383010 100755
--- a/RunGrepTest
+++ b/RunGrepTest
@@ -854,8 +854,6 @@
(cd $srcdir; $valgrind $vjs $pcre2grep --colour=always -e this -e The -e 'The wo' testdata/grepinputv) >>testtrygrep
-
-
# Now compare the results.
$cf $srcdir/testdata/grepoutput testtrygrep
@@ -893,6 +891,14 @@
(cd $srcdir; $valgrind $vjs $pcre2grep -u -m1 -O '=$x{1d3}$o{744}=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
+ echo "---------------------------- Test U7 ------------------------------" >>testtrygrep
+ (cd $srcdir; $valgrind $vjs $pcre2grep -ui --colour=always 'k+|\babc\b' ./testdata/grepinput8) >>testtrygrep
+ echo "RC=$?" >>testtrygrep
+
+ echo "---------------------------- Test U8 ------------------------------" >>testtrygrep
+ (cd $srcdir; $valgrind $vjs $pcre2grep -UiEP --colour=always 'k+|\babc\b' ./testdata/grepinput8) >>testtrygrep
+ echo "RC=$?" >>testtrygrep
+
$cf $srcdir/testdata/grepoutput8 testtrygrep
if [ $? != 0 ] ; then exit 1; fi
diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html
index 29ab031..eb688a4 100644
--- a/doc/html/pcre2grep.html
+++ b/doc/html/pcre2grep.html
@@ -21,7 +21,7 @@
<li><a name="TOC6" href="#SEC6">OPTIONS</a>
<li><a name="TOC7" href="#SEC7">ENVIRONMENT VARIABLES</a>
<li><a name="TOC8" href="#SEC8">NEWLINES</a>
-<li><a name="TOC9" href="#SEC9">OPTIONS COMPATIBILITY</a>
+<li><a name="TOC9" href="#SEC9">OPTIONS COMPATIBILITY WITH GNU GREP</a>
<li><a name="TOC10" href="#SEC10">OPTIONS WITH DATA</a>
<li><a name="TOC11" href="#SEC11">USING PCRE2'S CALLOUT FACILITY</a>
<li><a name="TOC12" href="#SEC12">MATCHING ERRORS</a>
@@ -314,6 +314,14 @@
See <b>--match-limit</b> below.
</P>
<P>
+<b>-E</b>, <b>--case-restrict</b>
+When case distinctions are being ignored in Unicode mode, two ASCII letters (K
+and S) will by default match Unicode characters U+212A (Kelvin sign) and U+017F
+(long S) respectively, as well as their lower case ASCII counterparts. When
+this option is set, case equivalences are restricted such that no ASCII
+character matches a non-ASCII character, and vice versa.
+</P>
+<P>
<b>-e</b> <i>pattern</i>, <b>--regex=</b><i>pattern</i>, <b>--regexp=</b><i>pattern</i>
Specify a pattern to be matched. This option can be used multiple times in
order to specify several patterns. It can also be used as a way of specifying a
@@ -449,7 +457,9 @@
</P>
<P>
<b>-i</b>, <b>--ignore-case</b>
-Ignore upper/lower case distinctions during comparisons.
+Ignore upper/lower case distinctions when pattern matching. This applies when
+matching path names for inclusion or exclusion as well as when matching lines
+in files.
</P>
<P>
<b>--include</b>=<i>pattern</i>
@@ -759,6 +769,18 @@
is an empty string. Separating strings are never coloured.
</P>
<P>
+<b>-P</b>, <b>--no-ucp</b>
+Starting from release 10.43, when UTF/Unicode mode is specified with <b>-u</b>
+or <b>-U</b>, the PCRE2_UCP option is used by default. This means that the
+simple class escapes in patterns match more than just ASCII characters. For
+example, \d matches any Unicode decimal digit. The <b>--no-ucp</b> option
+suppresses PCRE2_UCP, thus restricting the class escapes to ASCII characters,
+as was the case in earlier releases. Note that there are now more fine-grained
+option settings within patterns that affect individual escapes. For example,
+when PCRE2_UCP is set, the sequence (?aD) restricts \d to ASCII digits, while
+allowing \w to match Unicode letters and digits.
+</P>
+<P>
<b>-q</b>, <b>--quiet</b>
Work quietly, that is, display nothing except error messages. The exit
status indicates whether or not any matches were found.
@@ -796,11 +818,11 @@
</P>
<P>
<b>-u</b>, <b>--utf</b>
-Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
-with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
-<b>--include</b> options) and all lines that are scanned must be valid strings
-of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
-occurs.
+Operate in UTF/Unicode mode. This option is available only if PCRE2 has been
+compiled with UTF-8 support. All patterns (including those for any
+<b>--exclude</b> and <b>--include</b> options) and all lines that are scanned
+must be valid strings of UTF-8 characters. If an invalid UTF-8 string is
+encountered, an error occurs.
</P>
<P>
<b>-U</b>, <b>--utf-allow-invalid</b>
@@ -883,25 +905,27 @@
standard output must end with "\r\n". For all other operating systems, and
for all messages to the standard error stream, "\n" is used.
</P>
-<br><a name="SEC9" href="#TOC1">OPTIONS COMPATIBILITY</a><br>
+<br><a name="SEC9" href="#TOC1">OPTIONS COMPATIBILITY WITH GNU GREP</a><br>
<P>
-Many of the short and long forms of <b>pcre2grep</b>'s options are the same
-as in the GNU <b>grep</b> program. Any long option of the form
-<b>--xxx-regexp</b> (GNU terminology) is also available as <b>--xxx-regex</b>
-(PCRE2 terminology). However, the <b>--depth-limit</b>, <b>--file-list</b>,
-<b>--file-offsets</b>, <b>--heap-limit</b>, <b>--include-dir</b>,
-<b>--line-offsets</b>, <b>--locale</b>, <b>--match-limit</b>, <b>-M</b>,
-<b>--multiline</b>, <b>-N</b>, <b>--newline</b>, <b>--om-separator</b>,
-<b>--output</b>, <b>-u</b>, <b>--utf</b>, <b>-U</b>, and <b>--utf-allow-invalid</b>
-options are specific to <b>pcre2grep</b>, as is the use of the
-<b>--only-matching</b> option with a capturing parentheses number.
+Many of the short and long forms of <b>pcre2grep</b>'s options are the same as
+in the GNU <b>grep</b> program. Any long option of the form <b>--xxx-regexp</b>
+(GNU terminology) is also available as <b>--xxx-regex</b> (PCRE2 terminology).
+However, the <b>--case-restrict</b>, <b>--depth-limit</b>, <b>-E</b>,
+<b>--file-list</b>, <b>--file-offsets</b>, <b>--heap-limit</b>,
+<b>--include-dir</b>, <b>--line-offsets</b>, <b>--locale</b>, <b>--match-limit</b>,
+<b>-M</b>, <b>--multiline</b>, <b>-N</b>, <b>--newline</b>, <b>--no-ucp</b>,
+<b>--om-separator</b>, <b>--output</b>, <b>-P</b>, <b>-u</b>, <b>--utf</b>,
+<b>-U</b>, and <b>--utf-allow-invalid</b> options are specific to
+<b>pcre2grep</b>, as is the use of the <b>--only-matching</b> option with a
+capturing parentheses number.
</P>
<P>
Although most of the common options work the same way, a few are different in
<b>pcre2grep</b>. For example, the <b>--include</b> option's argument is a glob
-for GNU <b>grep</b>, but a regular expression for <b>pcre2grep</b>. If both the
-<b>-c</b> and <b>-l</b> options are given, GNU grep lists only file names,
-without counts, but <b>pcre2grep</b> gives the counts as well.
+for GNU <b>grep</b>, but in <b>pcre2grep</b> it is a regular expression to which
+the <b>-i</b> option applies. If both the <b>-c</b> and <b>-l</b> options are given,
+GNU grep lists only file names, without counts, but <b>pcre2grep</b> gives the
+counts as well.
</P>
<br><a name="SEC10" href="#TOC1">OPTIONS WITH DATA</a><br>
<P>
@@ -1065,9 +1089,9 @@
</P>
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 21 November 2022
+Last updated: 08 February 2023
<br>
-Copyright © 1997-2022 University of Cambridge.
+Copyright © 1997-2023 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
diff --git a/doc/pcre2grep.1 b/doc/pcre2grep.1
index 956633d..80e3c84 100644
--- a/doc/pcre2grep.1
+++ b/doc/pcre2grep.1
@@ -1,4 +1,4 @@
-.TH PCRE2GREP 1 "21 November 2022" "PCRE2 10.41"
+.TH PCRE2GREP 1 "08 February 2023" "PCRE2 10.43"
.SH NAME
pcre2grep - a grep with Perl-compatible regular expressions.
.SH SYNOPSIS
@@ -268,6 +268,13 @@
\fB--depth-limit\fP=\fInumber\fP
See \fB--match-limit\fP below.
.TP
+\fB-E\fP, \fB--case-restrict\fP
+When case distinctions are being ignored in Unicode mode, two ASCII letters (K
+and S) will by default match Unicode characters U+212A (Kelvin sign) and U+017F
+(long S) respectively, as well as their lower case ASCII counterparts. When
+this option is set, case equivalences are restricted such that no ASCII
+character matches a non-ASCII character, and vice versa.
+.TP
\fB-e\fP \fIpattern\fP, \fB--regex=\fP\fIpattern\fP, \fB--regexp=\fP\fIpattern\fP
Specify a pattern to be matched. This option can be used multiple times in
order to specify several patterns. It can also be used as a way of specifying a
@@ -388,7 +395,9 @@
\fB--binary-files\fP=\fIwithout-match\fP.
.TP
\fB-i\fP, \fB--ignore-case\fP
-Ignore upper/lower case distinctions during comparisons.
+Ignore upper/lower case distinctions when pattern matching. This applies when
+matching path names for inclusion or exclusion as well as when matching lines
+in files.
.TP
\fB--include\fP=\fIpattern\fP
If any \fB--include\fP patterns are specified, the only files that are
@@ -660,6 +669,17 @@
Specify a separating string for multiple occurrences of \fB-o\fP. The default
is an empty string. Separating strings are never coloured.
.TP
+\fB-P\fP, \fB--no-ucp\fP
+Starting from release 10.43, when UTF/Unicode mode is specified with \fB-u\fP
+or \fB-U\fP, the PCRE2_UCP option is used by default. This means that the
+simple class escapes in patterns match more than just ASCII characters. For
+example, \ed matches any Unicode decimal digit. The \fB--no-ucp\fP option
+suppresses PCRE2_UCP, thus restricting the class escapes to ASCII characters,
+as was the case in earlier releases. Note that there are now more fine-grained
+option settings within patterns that affect individual escapes. For example,
+when PCRE2_UCP is set, the sequence (?aD) restricts \ed to ASCII digits, while
+allowing \ew to match Unicode letters and digits.
+.TP
\fB-q\fP, \fB--quiet\fP
Work quietly, that is, display nothing except error messages. The exit
status indicates whether or not any matches were found.
@@ -692,11 +712,11 @@
total would always be zero.
.TP
\fB-u\fP, \fB--utf\fP
-Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
-with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
-\fB--include\fP options) and all lines that are scanned must be valid strings
-of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
-occurs.
+Operate in UTF/Unicode mode. This option is available only if PCRE2 has been
+compiled with UTF-8 support. All patterns (including those for any
+\fB--exclude\fP and \fB--include\fP options) and all lines that are scanned
+must be valid strings of UTF-8 characters. If an invalid UTF-8 string is
+encountered, an error occurs.
.TP
\fB-U\fP, \fB--utf-allow-invalid\fP
As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
@@ -778,25 +798,27 @@
for all messages to the standard error stream, "\en" is used.
.
.
-.SH "OPTIONS COMPATIBILITY"
+.SH "OPTIONS COMPATIBILITY WITH GNU GREP"
.rs
.sp
-Many of the short and long forms of \fBpcre2grep\fP's options are the same
-as in the GNU \fBgrep\fP program. Any long option of the form
-\fB--xxx-regexp\fP (GNU terminology) is also available as \fB--xxx-regex\fP
-(PCRE2 terminology). However, the \fB--depth-limit\fP, \fB--file-list\fP,
-\fB--file-offsets\fP, \fB--heap-limit\fP, \fB--include-dir\fP,
-\fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, \fB-M\fP,
-\fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--om-separator\fP,
-\fB--output\fP, \fB-u\fP, \fB--utf\fP, \fB-U\fP, and \fB--utf-allow-invalid\fP
-options are specific to \fBpcre2grep\fP, as is the use of the
-\fB--only-matching\fP option with a capturing parentheses number.
+Many of the short and long forms of \fBpcre2grep\fP's options are the same as
+in the GNU \fBgrep\fP program. Any long option of the form \fB--xxx-regexp\fP
+(GNU terminology) is also available as \fB--xxx-regex\fP (PCRE2 terminology).
+However, the \fB--case-restrict\fP, \fB--depth-limit\fP, \fB-E\fP,
+\fB--file-list\fP, \fB--file-offsets\fP, \fB--heap-limit\fP,
+\fB--include-dir\fP, \fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP,
+\fB-M\fP, \fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--no-ucp\fP,
+\fB--om-separator\fP, \fB--output\fP, \fB-P\fP, \fB-u\fP, \fB--utf\fP,
+\fB-U\fP, and \fB--utf-allow-invalid\fP options are specific to
+\fBpcre2grep\fP, as is the use of the \fB--only-matching\fP option with a
+capturing parentheses number.
.P
Although most of the common options work the same way, a few are different in
\fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob
-for GNU \fBgrep\fP, but a regular expression for \fBpcre2grep\fP. If both the
-\fB-c\fP and \fB-l\fP options are given, GNU grep lists only file names,
-without counts, but \fBpcre2grep\fP gives the counts as well.
+for GNU \fBgrep\fP, but in \fBpcre2grep\fP it is a regular expression to which
+the \fB-i\fP option applies. If both the \fB-c\fP and \fB-l\fP options are given,
+GNU grep lists only file names, without counts, but \fBpcre2grep\fP gives the
+counts as well.
.
.
.SH "OPTIONS WITH DATA"
@@ -970,6 +992,6 @@
.rs
.sp
.nf
-Last updated: 21 November 2022
-Copyright (c) 1997-2022 University of Cambridge.
+Last updated: 08 February 2023
+Copyright (c) 1997-2023 University of Cambridge.
.fi
diff --git a/doc/pcre2grep.txt b/doc/pcre2grep.txt
index adc1d89..602c977 100644
--- a/doc/pcre2grep.txt
+++ b/doc/pcre2grep.txt
@@ -286,6 +286,15 @@
--depth-limit=number
See --match-limit below.
+ -E, --case-restrict
+ When case distinctions are being ignored in Unicode mode, two
+ ASCII letters (K and S) will by default match Unicode charac-
+ ters U+212A (Kelvin sign) and U+017F (long S) respectively,
+ as well as their lower case ASCII counterparts. When this op-
+ tion is set, case equivalences are restricted such that no
+ ASCII character matches a non-ASCII character, and vice
+ versa.
+
-e pattern, --regex=pattern, --regexp=pattern
Specify a pattern to be matched. This option can be used mul-
tiple times in order to specify several patterns. It can also
@@ -421,7 +430,9 @@
files=without-match.
-i, --ignore-case
- Ignore upper/lower case distinctions during comparisons.
+ Ignore upper/lower case distinctions when pattern matching.
+ This applies when matching path names for inclusion or exclu-
+ sion as well as when matching lines in files.
--include=pattern
If any --include patterns are specified, the only files that
@@ -736,6 +747,19 @@
The default is an empty string. Separating strings are never
coloured.
+ -P, --no-ucp
+ Starting from release 10.43, when UTF/Unicode mode is speci-
+ fied with -u or -U, the PCRE2_UCP option is used by default.
+ This means that the simple class escapes in patterns match
+ more than just ASCII characters. For example, \d matches any
+ Unicode decimal digit. The --no-ucp option suppresses
+ PCRE2_UCP, thus restricting the class escapes to ASCII char-
+ acters, as was the case in earlier releases. Note that there
+ are now more fine-grained option settings within patterns
+ that affect individual escapes. For example, when PCRE2_UCP
+ is set, the sequence (?aD) restricts \d to ASCII digits,
+ while allowing \w to match Unicode letters and digits.
+
-q, --quiet
Work quietly, that is, display nothing except error messages.
The exit status indicates whether or not any matches were
@@ -771,57 +795,58 @@
(list files without matches), because the grand total would
always be zero.
- -u, --utf Operate in UTF-8 mode. This option is available only if PCRE2
- has been compiled with UTF-8 support. All patterns (including
- those for any --exclude and --include options) and all lines
- that are scanned must be valid strings of UTF-8 characters.
- If an invalid UTF-8 string is encountered, an error occurs.
+ -u, --utf Operate in UTF/Unicode mode. This option is available only if
+ PCRE2 has been compiled with UTF-8 support. All patterns (in-
+ cluding those for any --exclude and --include options) and
+ all lines that are scanned must be valid strings of UTF-8
+ characters. If an invalid UTF-8 string is encountered, an er-
+ ror occurs.
-U, --utf-allow-invalid
- As --utf, but in addition subject lines may contain invalid
- UTF-8 code unit sequences. These can never form part of any
- pattern match. Patterns themselves, however, must still be
+ As --utf, but in addition subject lines may contain invalid
+ UTF-8 code unit sequences. These can never form part of any
+ pattern match. Patterns themselves, however, must still be
valid UTF-8 strings. This facility allows valid UTF-8 strings
to be sought within arbitrary byte sequences in executable or
- other binary files. For more details about matching in non-
+ other binary files. For more details about matching in non-
valid UTF-8 strings, see the pcre2unicode(3) documentation.
-V, --version
- Write the version numbers of pcre2grep and the PCRE2 library
- to the standard output and then exit. Anything else on the
+ Write the version numbers of pcre2grep and the PCRE2 library
+ to the standard output and then exit. Anything else on the
command line is ignored.
-v, --invert-match
- Invert the sense of the match, so that lines which do not
- match any of the patterns are the ones that are found. When
- this option is set, options such as --only-matching and
- --output, which specify parts of a match that are to be out-
+ Invert the sense of the match, so that lines which do not
+ match any of the patterns are the ones that are found. When
+ this option is set, options such as --only-matching and
+ --output, which specify parts of a match that are to be out-
put, are ignored.
-w, --word-regex, --word-regexp
Force the patterns only to match "words". That is, there must
- be a word boundary at the start and end of each matched
- string. This is equivalent to having "\b(?:" at the start of
- each pattern, and ")\b" at the end. This option applies only
- to the patterns that are matched against the contents of
- files; it does not apply to patterns specified by any of the
+ be a word boundary at the start and end of each matched
+ string. This is equivalent to having "\b(?:" at the start of
+ each pattern, and ")\b" at the end. This option applies only
+ to the patterns that are matched against the contents of
+ files; it does not apply to patterns specified by any of the
--include or --exclude options.
-x, --line-regex, --line-regexp
- Force the patterns to start matching only at the beginnings
- of lines, and in addition, require them to match entire
+ Force the patterns to start matching only at the beginnings
+ of lines, and in addition, require them to match entire
lines. In multiline mode the match may be more than one line.
This is equivalent to having "^(?:" at the start of each pat-
- tern and ")$" at the end. This option applies only to the
- patterns that are matched against the contents of files; it
- does not apply to patterns specified by any of the --include
+ tern and ")$" at the end. This option applies only to the
+ patterns that are matched against the contents of files; it
+ does not apply to patterns specified by any of the --include
or --exclude options.
-Z, --null
- Terminate files names in the regular output with a zero byte
- (the NUL character) instead of what would normally appear.
- This is useful when file names contain unusual characters
- such as colons, hyphens, or even newlines. The option does
+ Terminate files names in the regular output with a zero byte
+ (the NUL character) instead of what would normally appear.
+ This is useful when file names contain unusual characters
+ such as colons, hyphens, or even newlines. The option does
not apply to file names in error messages.
@@ -835,137 +860,139 @@
NEWLINES
- The -N (--newline) option allows pcre2grep to scan files with newline
- conventions that differ from the default. This option affects only the
- way scanned files are processed. It does not affect the interpretation
- of files specified by the -f, --file-list, --exclude-from, or --in-
+ The -N (--newline) option allows pcre2grep to scan files with newline
+ conventions that differ from the default. This option affects only the
+ way scanned files are processed. It does not affect the interpretation
+ of files specified by the -f, --file-list, --exclude-from, or --in-
clude-from options.
- Any parts of the scanned input files that are written to the standard
- output are copied with whatever newline sequences they have in the in-
- put. However, if the final line of a file is output, and it does not
- end with a newline sequence, a newline sequence is added. If the new-
- line setting is CR, LF, CRLF or NUL, that line ending is output; for
+ Any parts of the scanned input files that are written to the standard
+ output are copied with whatever newline sequences they have in the in-
+ put. However, if the final line of a file is output, and it does not
+ end with a newline sequence, a newline sequence is added. If the new-
+ line setting is CR, LF, CRLF or NUL, that line ending is output; for
the other settings (ANYCRLF or ANY) a single NL is used.
- The newline setting does not affect the way in which pcre2grep writes
- newlines in informational messages to the standard output and error
- streams. Under Windows, the standard output is set to be binary, so
- that "\r\n" at the ends of output lines that are copied from the input
- is not converted to "\r\r\n" by the C I/O library. This means that any
- messages written to the standard output must end with "\r\n". For all
- other operating systems, and for all messages to the standard error
+ The newline setting does not affect the way in which pcre2grep writes
+ newlines in informational messages to the standard output and error
+ streams. Under Windows, the standard output is set to be binary, so
+ that "\r\n" at the ends of output lines that are copied from the input
+ is not converted to "\r\r\n" by the C I/O library. This means that any
+ messages written to the standard output must end with "\r\n". For all
+ other operating systems, and for all messages to the standard error
stream, "\n" is used.
-OPTIONS COMPATIBILITY
+OPTIONS COMPATIBILITY WITH GNU GREP
Many of the short and long forms of pcre2grep's options are the same as
- in the GNU grep program. Any long option of the form --xxx-regexp (GNU
- terminology) is also available as --xxx-regex (PCRE2 terminology). How-
- ever, the --depth-limit, --file-list, --file-offsets, --heap-limit,
- --include-dir, --line-offsets, --locale, --match-limit, -M, --multi-
- line, -N, --newline, --om-separator, --output, -u, --utf, -U, and
- --utf-allow-invalid options are specific to pcre2grep, as is the use of
- the --only-matching option with a capturing parentheses number.
+ in the GNU grep program. Any long option of the form --xxx-regexp (GNU
+ terminology) is also available as --xxx-regex (PCRE2 terminology).
+ However, the --case-restrict, --depth-limit, -E, --file-list, --file-
+ offsets, --heap-limit, --include-dir, --line-offsets, --locale,
+ --match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa-
+ tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are
+ specific to pcre2grep, as is the use of the --only-matching option with
+ a capturing parentheses number.
Although most of the common options work the same way, a few are dif-
ferent in pcre2grep. For example, the --include option's argument is a
- glob for GNU grep, but a regular expression for pcre2grep. If both the
- -c and -l options are given, GNU grep lists only file names, without
- counts, but pcre2grep gives the counts as well.
+ glob for GNU grep, but in pcre2grep it is a regular expression to which
+ the -i option applies. If both the -c and -l options are given, GNU
+ grep lists only file names, without counts, but pcre2grep gives the
+ counts as well.
OPTIONS WITH DATA
There are four different ways in which an option with data can be spec-
- ified. If a short form option is used, the data may follow immedi-
+ ified. If a short form option is used, the data may follow immedi-
ately, or (with one exception) in the next command line item. For exam-
ple:
-f/some/file
-f /some/file
- The exception is the -o option, which may appear with or without data.
- Because of this, if data is present, it must follow immediately in the
+ The exception is the -o option, which may appear with or without data.
+ Because of this, if data is present, it must follow immediately in the
same item, for example -o3.
- If a long form option is used, the data may appear in the same command
- line item, separated by an equals character, or (with two exceptions)
+ If a long form option is used, the data may appear in the same command
+ line item, separated by an equals character, or (with two exceptions)
it may appear in the next command line item. For example:
--file=/some/file
--file /some/file
- Note, however, that if you want to supply a file name beginning with ~
- as data in a shell command, and have the shell expand ~ to a home di-
- rectory, you must separate the file name from the option, because the
+ Note, however, that if you want to supply a file name beginning with ~
+ as data in a shell command, and have the shell expand ~ to a home di-
+ rectory, you must separate the file name from the option, because the
shell does not treat ~ specially unless it is at the start of an item.
- The exceptions to the above are the --colour (or --color) and --only-
- matching options, for which the data is optional. If one of these op-
- tions does have data, it must be given in the first form, using an
+ The exceptions to the above are the --colour (or --color) and --only-
+ matching options, for which the data is optional. If one of these op-
+ tions does have data, it must be given in the first form, using an
equals character. Otherwise pcre2grep will assume that it has no data.
USING PCRE2'S CALLOUT FACILITY
- pcre2grep has, by default, support for calling external programs or
- scripts or echoing specific strings during matching by making use of
- PCRE2's callout facility. However, this support can be completely or
- partially disabled when pcre2grep is built. You can find out whether
- your binary has support for callouts by running it with the --help op-
- tion. If callout support is completely disabled, all callouts in pat-
+ pcre2grep has, by default, support for calling external programs or
+ scripts or echoing specific strings during matching by making use of
+ PCRE2's callout facility. However, this support can be completely or
+ partially disabled when pcre2grep is built. You can find out whether
+ your binary has support for callouts by running it with the --help op-
+ tion. If callout support is completely disabled, all callouts in pat-
terns are ignored by pcre2grep. If the facility is partially disabled,
- calling external programs is not supported, and callouts that request
+ calling external programs is not supported, and callouts that request
it are ignored.
- A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu-
- ment is either a number or a quoted string (see the pcre2callout docu-
- mentation for details). Numbered callouts are ignored by pcre2grep;
+ A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu-
+ ment is either a number or a quoted string (see the pcre2callout docu-
+ mentation for details). Numbered callouts are ignored by pcre2grep;
only callouts with string arguments are useful.
Echoing a specific string
- Starting the callout string with a pipe character invokes an echoing
+ Starting the callout string with a pipe character invokes an echoing
facility that avoids calling an external program or script. This facil-
- ity is always available, provided that callouts were not completely
- disabled when pcre2grep was built. The rest of the callout string is
- processed as a zero-terminated string, which means it should not con-
- tain any internal binary zeros. It is written to the output, having
- first been passed through the same escape processing as text from the
- --output (-O) option (see above). However, $0 cannot be used to insert
- a matched substring because the match is still in progress. Instead,
- the single character '0' is inserted. Any syntax errors in the string
- (for example, a dollar not followed by another character) causes the
- callout to be ignored. No terminator is added to the output string, so
- if you want a newline, you must include it explicitly using the escape
+ ity is always available, provided that callouts were not completely
+ disabled when pcre2grep was built. The rest of the callout string is
+ processed as a zero-terminated string, which means it should not con-
+ tain any internal binary zeros. It is written to the output, having
+ first been passed through the same escape processing as text from the
+ --output (-O) option (see above). However, $0 cannot be used to insert
+ a matched substring because the match is still in progress. Instead,
+ the single character '0' is inserted. Any syntax errors in the string
+ (for example, a dollar not followed by another character) causes the
+ callout to be ignored. No terminator is added to the output string, so
+ if you want a newline, you must include it explicitly using the escape
$n. For example:
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
- Matching continues normally after the string is output. If you want to
- see only the callout output but not any output from an actual match,
+ Matching continues normally after the string is output. If you want to
+ see only the callout output but not any output from an actual match,
you should end the pattern with (*FAIL).
Calling external programs or scripts
This facility can be independently disabled when pcre2grep is built. It
- is supported for Windows, where a call to _spawnvp() is used, for VMS,
- where lib$spawn() is used, and for any Unix-like environment where
+ is supported for Windows, where a call to _spawnvp() is used, for VMS,
+ where lib$spawn() is used, and for any Unix-like environment where
fork() and execv() are available.
If the callout string does not start with a pipe (vertical bar) charac-
- ter, it is parsed into a list of substrings separated by pipe charac-
- ters. The first substring must be an executable name, with the follow-
+ ter, it is parsed into a list of substrings separated by pipe charac-
+ ters. The first substring must be an executable name, with the follow-
ing substrings specifying arguments:
executable_name|arg1|arg2|...
- Any substring (including the executable name) may contain escape se-
- quences started by a dollar character. These are the same as for the
+ Any substring (including the executable name) may contain escape se-
+ quences started by a dollar character. These are the same as for the
--output (-O) option documented above, except that $0 cannot insert the
- matched string because the match is still in progress. Instead, the
+ matched string because the match is still in progress. Instead, the
character '0' is inserted. If you need a literal dollar or pipe charac-
ter in any substring, use $$ or $| respectively. Here is an example:
@@ -980,43 +1007,43 @@
Arg1: [1] [234] [4] Arg2: |1| ()
12345
- The parameters for the system call that is used to run the program or
+ The parameters for the system call that is used to run the program or
script are zero-terminated strings. This means that binary zero charac-
- ters in the callout argument will cause premature termination of their
- substrings, and therefore should not be present. Any syntax errors in
- the string (for example, a dollar not followed by another character)
+ ters in the callout argument will cause premature termination of their
+ substrings, and therefore should not be present. Any syntax error in
+ the string (for example, a dollar not followed by another character)
causes the callout to be ignored. If running the program fails for any
- reason (including the non-existence of the executable), a local match-
+ reason (including the non-existence of the executable), a local match-
ing failure occurs and the matcher backtracks in the normal way.
MATCHING ERRORS
- It is possible to supply a regular expression that takes a very long
- time to fail to match certain lines. Such patterns normally involve
- nested indefinite repeats, for example: (a+)*\d when matched against a
- line of a's with no final digit. The PCRE2 matching function has a re-
- source limit that causes it to abort in these circumstances. If this
- happens, pcre2grep outputs an error message and the line that caused
- the problem to the standard error stream. If there are more than 20
+ It is possible to supply a regular expression that takes a very long
+ time to fail to match certain lines. Such patterns normally involve
+ nested indefinite repeats, for example: (a+)*\d when matched against a
+ line of a's with no final digit. The PCRE2 matching function has a re-
+ source limit that causes it to abort in these circumstances. If this
+ happens, pcre2grep outputs an error message and the line that caused
+ the problem to the standard error stream. If there are more than 20
such errors, pcre2grep gives up.
- The --match-limit option of pcre2grep can be used to set the overall
- resource limit. There are also other limits that affect the amount of
- memory used during matching; see the discussion of --heap-limit and
+ The --match-limit option of pcre2grep can be used to set the overall
+ resource limit. There are also other limits that affect the amount of
+ memory used during matching; see the discussion of --heap-limit and
--depth-limit above.
DIAGNOSTICS
Exit status is 0 if any matches were found, 1 if no matches were found,
- and 2 for syntax errors, overlong lines, non-existent or inaccessible
- files (even if matches were found in other files) or too many matching
+ and 2 for syntax errors, overlong lines, non-existent or inaccessible
+ files (even if matches were found in other files) or too many matching
errors. Using the -s option to suppress error messages about inaccessi-
ble files does not affect the return code.
- When run under VMS, the return code is placed in the symbol
- PCRE2GREP_RC because VMS does not distinguish between exit(0) and
+ When run under VMS, the return code is placed in the symbol
+ PCRE2GREP_RC because VMS does not distinguish between exit(0) and
exit(1).
@@ -1034,5 +1061,5 @@
REVISION
- Last updated: 21 November 2022
- Copyright (c) 1997-2022 University of Cambridge.
+ Last updated: 08 February 2023
+ Copyright (c) 1997-2023 University of Cambridge.
diff --git a/src/pcre2grep.c b/src/pcre2grep.c
index e7eb311..83179da 100644
--- a/src/pcre2grep.c
+++ b/src/pcre2grep.c
@@ -13,7 +13,7 @@
The header can be found in the special z/OS distribution, which is available
from www.zaconsultants.net or from www.cbttape.org.
- Copyright (c) 1997-2022 University of Cambridge
+ Copyright (c) 1997-2023 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -268,6 +268,7 @@
static uint32_t capture_max = DEFAULT_CAPTURE_MAX;
static BOOL all_matches = FALSE;
+static BOOL case_restrict = FALSE;
static BOOL count_only = FALSE;
static BOOL do_colour = FALSE;
#ifdef WIN32
@@ -279,6 +280,7 @@
static BOOL line_buffered = FALSE;
static BOOL line_offsets = FALSE;
static BOOL multiline = FALSE;
+static BOOL no_ucp = FALSE;
static BOOL number = FALSE;
static BOOL omit_zero_count = FALSE;
static BOOL resource_error = FALSE;
@@ -437,6 +439,7 @@
{ OP_NODATA, 'c', NULL, "count", "print only a count of matching lines per FILE" },
{ OP_STRING, 'D', &DEE_option, "devices=action","how to handle devices, FIFOs, and sockets" },
{ OP_STRING, 'd', &dee_option, "directories=action", "how to handle directories" },
+ { OP_NODATA, 'E', NULL, "case-restrict", "restrict case matching (no mix ASCII/non-ASCII)" },
{ OP_PATLIST, 'e', &match_patdata, "regex(p)=pattern", "specify pattern (may be used more than once)" },
{ OP_NODATA, 'F', NULL, "fixed-strings", "patterns are sets of newline-separated strings" },
{ OP_FILELIST, 'f', &pattern_files_data, "file=path", "read patterns from file" },
@@ -469,6 +472,7 @@
{ OP_OP_NUMBERS, 'o', &only_matching_data, "only-matching=n", "show only the part of the line that matched" },
{ OP_STRING, N_OM_SEPARATOR, &om_separator, "om-separator=text", "set separator for multiple -o output" },
{ OP_U32NUMBER, N_OM_CAPTURE, &capture_max, "om-capture=n", "set capture count for --only-matching" },
+ { OP_NODATA, 'P', NULL, "no-ucp", "do not set PCRE2_UCP in Unicode mode" },
{ OP_NODATA, 'q', NULL, "quiet", "suppress output, just set return code" },
{ OP_NODATA, 'r', NULL, "recursive", "recursively scan sub-directories" },
{ OP_PATLIST, N_EXCLUDE,&exclude_patdata, "exclude=pattern","exclude matching files when recursing" },
@@ -479,8 +483,8 @@
{ OP_FILELIST, N_INCLUDE_FROM,&include_from_data, "include-from=path", "read include list from file" },
{ OP_NODATA, 's', NULL, "no-messages", "suppress error messages" },
{ OP_NODATA, 't', NULL, "total-count", "print total count of matching lines" },
- { OP_NODATA, 'u', NULL, "utf", "use UTF mode" },
- { OP_NODATA, 'U', NULL, "utf-allow-invalid", "use UTF mode, allow for invalid code units" },
+ { OP_NODATA, 'u', NULL, "utf", "use UTF/Unicode" },
+ { OP_NODATA, 'U', NULL, "utf-allow-invalid", "use UTF/Unicode, allow for invalid code units" },
{ OP_NODATA, 'V', NULL, "version", "print version information and exit" },
{ OP_NODATA, 'v', NULL, "invert-match", "select non-matching lines" },
{ OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" },
@@ -3585,6 +3589,7 @@
case N_ALLABSK: extra_options |= PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK; break;
case 'a': binary_files = BIN_TEXT; break;
case 'c': count_only = TRUE; break;
+ case 'E': case_restrict = TRUE; break;
case 'F': options |= PCRE2_LITERAL; break;
case 'H': filenames = FN_FORCE; break;
case 'I': binary_files = BIN_NOMATCH; break;
@@ -3600,12 +3605,14 @@
if (only_matching == NULL) only_matching = only_matching_last;
break;
+ case 'P': no_ucp = TRUE; break;
case 'q': quiet = TRUE; break;
case 'r': dee_action = dee_RECURSE; break;
case 's': silent = TRUE; break;
case 't': show_total_count = TRUE; break;
- case 'u': options |= PCRE2_UTF; utf = TRUE; break;
- case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break;
+ case 'u': options |= PCRE2_UTF | PCRE2_UCP; utf = TRUE; break;
+ case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF|PCRE2_UCP;
+ utf = TRUE; break;
case 'v': invert = TRUE; break;
case 'V':
@@ -4221,13 +4228,13 @@
match_data_toggle = 0;
/* If string (script) callouts are supported, set up the callout processing
-function. */
+function in the match context. */
#ifdef SUPPORT_PCRE2GREP_CALLOUT
pcre2_set_callout(match_context, pcre2grep_callout, NULL);
#endif
-/* Put limits into the match data block. */
+/* Put limits into the match context. */
if (heap_limit != PCRE2_UNSET) pcre2_set_heap_limit(match_context, heap_limit);
if (match_limit > 0) pcre2_set_match_limit(match_context, match_limit);
@@ -4353,7 +4360,15 @@
}
}
-/* Set the extra options */
+/* If no_ucp is set, remove PCRE2_UCP from the compile options. */
+
+if (no_ucp) pcre2_options &= ~PCRE2_UCP;
+
+/* If case_restrict is set, adjust the extra options. */
+
+if (case_restrict) extra_options |= PCRE2_EXTRA_CASELESS_RESTRICT;
+
+/* Set the extra options in the compile context. */
(void)pcre2_set_compile_extra_options(compile_context, extra_options);
diff --git a/testdata/grepinput8 b/testdata/grepinput8
index 7779cdc..748c674 100644
--- a/testdata/grepinput8
+++ b/testdata/grepinput8
Binary files differ
diff --git a/testdata/grepoutput b/testdata/grepoutput
index aa53aab..f0f5c44 100644
--- a/testdata/grepoutput
+++ b/testdata/grepoutput
Binary files differ
diff --git a/testdata/grepoutput8 b/testdata/grepoutput8
index 3888d9a..5bfd555 100644
--- a/testdata/grepoutput8
+++ b/testdata/grepoutput8
Binary files differ