Merge "Upgrade regex to 1.7.0"
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index 51b3cd6..67effe3 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,6 @@
 {
   "git": {
-    "sha1": "f2dc1b788f773a49f1b6633a6302054978344452"
-  }
-}
+    "sha1": "f871a8eb1d725a8aaa56e2ceea57c24fce74b5fd"
+  },
+  "path_in_vcs": ""
+}
\ No newline at end of file
diff --git a/Android.bp b/Android.bp
index 77a8def..93eaee0 100644
--- a/Android.bp
+++ b/Android.bp
@@ -43,7 +43,7 @@
     host_supported: true,
     crate_name: "regex",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.4",
+    cargo_pkg_version: "1.7.0",
     srcs: ["src/lib.rs"],
     edition: "2018",
     features: [
@@ -83,7 +83,7 @@
     host_supported: true,
     crate_name: "regex",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.4",
+    cargo_pkg_version: "1.7.0",
     srcs: ["src/lib.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
@@ -125,7 +125,7 @@
     host_supported: true,
     crate_name: "backtrack",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.4",
+    cargo_pkg_version: "1.7.0",
     srcs: ["tests/test_backtrack.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
@@ -168,7 +168,7 @@
     host_supported: true,
     crate_name: "backtrack_bytes",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.4",
+    cargo_pkg_version: "1.7.0",
     srcs: ["tests/test_backtrack_bytes.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
@@ -211,7 +211,7 @@
     host_supported: true,
     crate_name: "backtrack_utf8bytes",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.4",
+    cargo_pkg_version: "1.7.0",
     srcs: ["tests/test_backtrack_utf8bytes.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
@@ -254,7 +254,7 @@
     host_supported: true,
     crate_name: "crates_regex",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.4",
+    cargo_pkg_version: "1.7.0",
     srcs: ["tests/test_crates_regex.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
@@ -297,7 +297,7 @@
     host_supported: true,
     crate_name: "default",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.4",
+    cargo_pkg_version: "1.7.0",
     srcs: ["tests/test_default.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
@@ -340,7 +340,7 @@
     host_supported: true,
     crate_name: "default_bytes",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.4",
+    cargo_pkg_version: "1.7.0",
     srcs: ["tests/test_default_bytes.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
@@ -383,7 +383,7 @@
     host_supported: true,
     crate_name: "nfa",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.4",
+    cargo_pkg_version: "1.7.0",
     srcs: ["tests/test_nfa.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
@@ -426,7 +426,7 @@
     host_supported: true,
     crate_name: "nfa_bytes",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.4",
+    cargo_pkg_version: "1.7.0",
     srcs: ["tests/test_nfa_bytes.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
@@ -469,7 +469,7 @@
     host_supported: true,
     crate_name: "nfa_utf8bytes",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.4",
+    cargo_pkg_version: "1.7.0",
     srcs: ["tests/test_nfa_utf8bytes.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 71d1963..73e9e66 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,61 @@
+1.7.0 (2022-11-05)
+==================
+This release principally includes an upgrade to Unicode 15.
+
+New features:
+
+* [FEATURE #916](https://github.com/rust-lang/regex/issues/916):
+  Upgrade to Unicode 15.
+
+
+1.6.0 (2022-07-05)
+==================
+This release principally includes an upgrade to Unicode 14.
+
+New features:
+
+* [FEATURE #832](https://github.com/rust-lang/regex/pull/832):
+  Clarify that `Captures::len` includes all groups, not just matching groups.
+* [FEATURE #857](https://github.com/rust-lang/regex/pull/857):
+  Add an `ExactSizeIterator` impl for `SubCaptureMatches`.
+* [FEATURE #861](https://github.com/rust-lang/regex/pull/861):
+  Improve `RegexSet` documentation examples.
+* [FEATURE #877](https://github.com/rust-lang/regex/issues/877):
+  Upgrade to Unicode 14.
+
+Bug fixes:
+
+* [BUG #792](https://github.com/rust-lang/regex/issues/792):
+  Fix error message rendering bug.
+
+
+1.5.6 (2022-05-20)
+==================
+This release includes a few bug fixes, including a fix for a bug that produced
+incorrect matches when a non-greedy `?` operator was used.
+
+* [BUG #680](https://github.com/rust-lang/regex/issues/680):
+  Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class.
+* [BUG #859](https://github.com/rust-lang/regex/issues/859):
+  Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`.
+* [BUG #862](https://github.com/rust-lang/regex/issues/862):
+  Fixes a bug where `ab??` matched `ab` instead of `a` in `ab`.
+
+
+1.5.5 (2022-03-08)
+==================
+This release fixes a security bug in the regex compiler. This bug permits a
+vector for a denial-of-service attack in cases where the regex being compiled
+is untrusted. There are no known problems when the regex itself is trusted,
+including in cases of untrusted haystacks.
+
+* [SECURITY #GHSA-m5pq-gvj9-9vr8](https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8):
+  Fixes a bug in the regex compiler where empty sub-expressions subverted the
+  existing mitigations in place to enforce a size limit on compiled regexes.
+  The Rust Security Response WG published an advisory about this:
+  https://groups.google.com/g/rustlang-security-announcements/c/NcNNL1Jq7Yw
+
+
 1.5.4 (2021-05-06)
 ==================
 This release fixes another compilation failure when building regex. This time,
@@ -669,7 +727,7 @@
 * Empty sub-expressions are now permitted in most places. That is, `()+` is
   now a valid regex.
 * Almost everything in regex-syntax now uses constant stack space, even when
-  performing anaylsis that requires structural induction. This reduces the risk
+  performing analysis that requires structural induction. This reduces the risk
   of a user provided regular expression causing a stack overflow.
 * [FEATURE #174](https://github.com/rust-lang/regex/issues/174):
   The `Ast` type in `regex-syntax` now contains span information.
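The 1.5.5 advisory above is the motivation for both the `c_empty` change in `src/compile.rs` and the new tests in `tests/test_default.rs` later in this change. A minimal sketch of the fixed behavior, mirroring those tests (only the error/ok outcome is asserted, not any particular error message):

```rust
use regex::Regex;

fn main() {
    // A huge repetition of an empty sub-expression used to keep the compiler
    // busy and then succeed; with the size-limit fix it is rejected instead.
    assert!(Regex::new("(?:){4294967295}").is_err());

    // Ordinary patterns are unaffected by the mitigation.
    assert!(Regex::new(r"(?:a|b){2,10}").is_ok());
}
```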
diff --git a/Cargo.lock.saved b/Cargo.lock.saved
new file mode 100644
index 0000000..7da81ca
--- /dev/null
+++ b/Cargo.lock.saved
@@ -0,0 +1,98 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "aho-corasick"
+version = "0.7.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "getrandom"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "libc"
+version = "0.2.94"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
+
+[[package]]
+name = "memchr"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
+
+[[package]]
+name = "quickcheck"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6"
+dependencies = [
+ "rand",
+]
+
+[[package]]
+name = "rand"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7"
+dependencies = [
+ "getrandom",
+]
+
+[[package]]
+name = "regex"
+version = "1.7.0"
+dependencies = [
+ "aho-corasick",
+ "lazy_static",
+ "memchr",
+ "quickcheck",
+ "rand",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.6.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
+
+[[package]]
+name = "wasi"
+version = "0.10.2+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
diff --git a/Cargo.toml b/Cargo.toml
index 260acec..6dd723e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,27 +3,33 @@
 # When uploading crates to the registry Cargo will automatically
 # "normalize" Cargo.toml files for maximal compatibility
 # with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
 #
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
 
 [package]
 edition = "2018"
 name = "regex"
-version = "1.5.4"
+version = "1.7.0"
 authors = ["The Rust Project Developers"]
-exclude = ["/scripts/*", "/.github/*"]
+exclude = [
+    "/scripts/*",
+    "/.github/*",
+]
 autotests = false
-description = "An implementation of regular expressions for Rust. This implementation uses\nfinite automata and guarantees linear time matching on all inputs.\n"
+description = """
+An implementation of regular expressions for Rust. This implementation uses
+finite automata and guarantees linear time matching on all inputs.
+"""
 homepage = "https://github.com/rust-lang/regex"
 documentation = "https://docs.rs/regex"
 readme = "README.md"
 categories = ["text-processing"]
 license = "MIT OR Apache-2.0"
 repository = "https://github.com/rust-lang/regex"
+
 [profile.bench]
 debug = true
 
@@ -72,6 +78,7 @@
 [[test]]
 name = "crates-regex"
 path = "tests/test_crates_regex.rs"
+
 [dependencies.aho-corasick]
 version = "0.7.18"
 optional = true
@@ -81,8 +88,9 @@
 optional = true
 
 [dependencies.regex-syntax]
-version = "0.6.25"
+version = "0.6.27"
 default-features = false
+
 [dev-dependencies.lazy_static]
 version = "1"
 
@@ -92,19 +100,44 @@
 
 [dev-dependencies.rand]
 version = "0.8.3"
-features = ["getrandom", "small_rng"]
+features = [
+    "getrandom",
+    "small_rng",
+]
 default-features = false
 
 [features]
-default = ["std", "perf", "unicode", "regex-syntax/default"]
+default = [
+    "std",
+    "perf",
+    "unicode",
+    "regex-syntax/default",
+]
 pattern = []
-perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"]
+perf = [
+    "perf-cache",
+    "perf-dfa",
+    "perf-inline",
+    "perf-literal",
+]
 perf-cache = []
 perf-dfa = []
 perf-inline = []
-perf-literal = ["aho-corasick", "memchr"]
+perf-literal = [
+    "aho-corasick",
+    "memchr",
+]
 std = []
-unicode = ["unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment", "regex-syntax/unicode"]
+unicode = [
+    "unicode-age",
+    "unicode-bool",
+    "unicode-case",
+    "unicode-gencat",
+    "unicode-perl",
+    "unicode-script",
+    "unicode-segment",
+    "regex-syntax/unicode",
+]
 unicode-age = ["regex-syntax/unicode-age"]
 unicode-bool = ["regex-syntax/unicode-bool"]
 unicode-case = ["regex-syntax/unicode-case"]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 468230b..cafb65e 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,6 +1,6 @@
 [package]
 name = "regex"
-version = "1.5.4"  #:version
+version = "1.7.0"  #:version
 authors = ["The Rust Project Developers"]
 license = "MIT OR Apache-2.0"
 readme = "README.md"
@@ -117,7 +117,7 @@
 # For parsing regular expressions.
 [dependencies.regex-syntax]
 path = "regex-syntax"
-version = "0.6.25"
+version = "0.6.27"
 default-features = false
 
 [dev-dependencies]
diff --git a/METADATA b/METADATA
index 6611aac..e71e513 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update rust/crates/regex
+# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+
 name: "regex"
 description: "An implementation of regular expressions for Rust. This implementation uses finite automata and guarantees linear time matching on all inputs."
 third_party {
@@ -7,13 +11,13 @@
   }
   url {
     type: ARCHIVE
-    value: "https://static.crates.io/crates/regex/regex-1.5.4.crate"
+    value: "https://static.crates.io/crates/regex/regex-1.7.0.crate"
   }
-  version: "1.5.4"
+  version: "1.7.0"
   license_type: NOTICE
   last_upgrade_date {
-    year: 2021
-    month: 5
-    day: 19
+    year: 2022
+    month: 12
+    day: 13
   }
 }
diff --git a/README.md b/README.md
index 86d6996..861417d 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
 by [RE2](https://github.com/google/re2).
 
 [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions)
-[![](https://meritbadge.herokuapp.com/regex)](https://crates.io/crates/regex)
+[![Crates.io](https://img.shields.io/crates/v/regex.svg)](https://crates.io/crates/regex)
 [![Rust](https://img.shields.io/badge/rust-1.41.1%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex)
 
 ### Documentation
@@ -23,12 +23,8 @@
 
 ### Usage
 
-Add this to your `Cargo.toml`:
-
-```toml
-[dependencies]
-regex = "1.5"
-```
+To bring this crate into your repository, either add `regex` to your
+`Cargo.toml`, or run `cargo add regex`.
 
 Here's a simple example that matches a date in YYYY-MM-DD format and prints the
 year, month and day:
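The trailing context above refers to the README's date example, which this diff does not touch. For orientation, it looks roughly like the following sketch (reconstructed, not copied verbatim from the README):

```rust
use regex::Regex;

fn main() {
    // Match a YYYY-MM-DD date and print its named capture groups.
    let re = Regex::new(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})").unwrap();
    let caps = re.captures("2010-03-14").unwrap();
    println!("year: {}, month: {}, day: {}",
             &caps["year"], &caps["month"], &caps["day"]);
}
```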
diff --git a/src/backtrack.rs b/src/backtrack.rs
index a3d25d6..4d83856 100644
--- a/src/backtrack.rs
+++ b/src/backtrack.rs
@@ -93,13 +93,7 @@
         let mut cache = cache.borrow_mut();
         let cache = &mut cache.backtrack;
         let start = input.at(start);
-        let mut b = Bounded {
-            prog: prog,
-            input: input,
-            matches: matches,
-            slots: slots,
-            m: cache,
-        };
+        let mut b = Bounded { prog, input, matches, slots, m: cache };
         b.exec_(start, end)
     }
 
@@ -220,14 +214,14 @@
                         // job is popped and the old capture index is restored.
                         self.m.jobs.push(Job::SaveRestore {
                             slot: inst.slot,
-                            old_pos: old_pos,
+                            old_pos,
                         });
                         self.slots[inst.slot] = Some(at.pos());
                     }
                     ip = inst.goto;
                 }
                 Split(ref inst) => {
-                    self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
+                    self.m.jobs.push(Job::Inst { ip: inst.goto2, at });
                     ip = inst.goto1;
                 }
                 EmptyLook(ref inst) => {
diff --git a/src/compile.rs b/src/compile.rs
index 9a2ed5e..90ca250 100644
--- a/src/compile.rs
+++ b/src/compile.rs
@@ -38,6 +38,16 @@
     suffix_cache: SuffixCache,
     utf8_seqs: Option<Utf8Sequences>,
     byte_classes: ByteClassSet,
+    // This keeps track of extra bytes allocated while compiling the regex
+    // program. Currently, this corresponds to two things. First is the heap
+    // memory allocated by Unicode character classes ('InstRanges'). Second is
+    // a "fake" amount of memory used by empty sub-expressions, so that enough
+    // empty sub-expressions will ultimately trigger the compiler to bail
+    // because of a size limit restriction. (That empty sub-expressions don't
+    // add to heap memory usage is more-or-less an implementation detail.) In
+    // the second case, if we don't bail, then an excessively large repetition
+    // on an empty sub-expression can result in the compiler using a very large
+    // amount of CPU time.
     extra_inst_bytes: usize,
 }
 
@@ -139,7 +149,8 @@
             self.compiled.start = dotstar_patch.entry;
         }
         self.compiled.captures = vec![None];
-        let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
+        let patch =
+            self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
         if self.compiled.needs_dotstar() {
             self.fill(dotstar_patch.hole, patch.entry);
         } else {
@@ -175,7 +186,7 @@
             self.fill_to_next(prev_hole);
             let split = self.push_split_hole();
             let Patch { hole, entry } =
-                self.c_capture(0, expr)?.unwrap_or(self.next_inst());
+                self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
             self.fill_to_next(hole);
             self.compiled.matches.push(self.insts.len());
             self.push_compiled(Inst::Match(i));
@@ -183,7 +194,7 @@
         }
         let i = exprs.len() - 1;
         let Patch { hole, entry } =
-            self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
+            self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
         self.fill(prev_hole, entry);
         self.fill_to_next(hole);
         self.compiled.matches.push(self.insts.len());
@@ -260,7 +271,7 @@
 
         self.check_size()?;
         match *expr.kind() {
-            Empty => Ok(None),
+            Empty => self.c_empty(),
             Literal(hir::Literal::Unicode(c)) => self.c_char(c),
             Literal(hir::Literal::Byte(b)) => {
                 assert!(self.compiled.uses_bytes());
@@ -378,6 +389,19 @@
         }
     }
 
+    fn c_empty(&mut self) -> ResultOrEmpty {
+        // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+        // See: CVE-2022-24713
+        //
+        // Since 'empty' sub-expressions don't increase the size of
+        // the actual compiled object, we "fake" an increase in its
+        // size so that our 'check_size_limit' routine will eventually
+        // stop compilation if there are too many empty sub-expressions
+        // (e.g., via a large repetition).
+        self.extra_inst_bytes += std::mem::size_of::<Inst>();
+        Ok(None)
+    }
+
     fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
         if self.num_exprs > 1 || self.compiled.is_dfa {
             // Don't ever compile Save instructions for regex sets because
@@ -387,11 +411,11 @@
         } else {
             let entry = self.insts.len();
             let hole = self.push_hole(InstHole::Save { slot: first_slot });
-            let patch = self.c(expr)?.unwrap_or(self.next_inst());
+            let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
             self.fill(hole, patch.entry);
             self.fill_to_next(patch.hole);
             let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
-            Ok(Some(Patch { hole: hole, entry: entry }))
+            Ok(Some(Patch { hole, entry }))
         }
     }
 
@@ -425,7 +449,7 @@
                 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
             }
         } else {
-            let hole = self.push_hole(InstHole::Char { c: c });
+            let hole = self.push_hole(InstHole::Char { c });
             Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
         }
     }
@@ -435,7 +459,7 @@
 
         assert!(!ranges.is_empty());
         if self.compiled.uses_bytes() {
-            Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
+            Ok(Some(CompileClass { c: self, ranges }.compile()?))
         } else {
             let ranges: Vec<(char, char)> =
                 ranges.iter().map(|r| (r.start(), r.end())).collect();
@@ -444,9 +468,9 @@
             } else {
                 self.extra_inst_bytes +=
                     ranges.len() * (size_of::<char>() * 2);
-                self.push_hole(InstHole::Ranges { ranges: ranges })
+                self.push_hole(InstHole::Ranges { ranges })
             };
-            Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
+            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
         }
     }
 
@@ -485,8 +509,8 @@
     }
 
     fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
-        let hole = self.push_hole(InstHole::EmptyLook { look: look });
-        Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
+        let hole = self.push_hole(InstHole::EmptyLook { look });
+        Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
     }
 
     fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
@@ -496,7 +520,7 @@
         let mut exprs = exprs.into_iter();
         let Patch { mut hole, entry } = loop {
             match exprs.next() {
-                None => return Ok(None),
+                None => return self.c_empty(),
                 Some(e) => {
                     if let Some(p) = self.c(e)? {
                         break p;
@@ -510,7 +534,7 @@
                 hole = p.hole;
             }
         }
-        Ok(Some(Patch { hole: hole, entry: entry }))
+        Ok(Some(Patch { hole, entry }))
     }
 
     fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
@@ -653,7 +677,7 @@
         // None).
         let patch_concat = self
             .c_concat(iter::repeat(expr).take(min))?
-            .unwrap_or(self.next_inst());
+            .unwrap_or_else(|| self.next_inst());
         if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
             self.fill(patch_concat.hole, patch_rep.entry);
             Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
@@ -677,7 +701,7 @@
         }
         // Same reasoning as in c_repeat_range_min_or_more (we know that min <
         // max at this point).
-        let patch_concat = patch_concat.unwrap_or(self.next_inst());
+        let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
         let initial_entry = patch_concat.entry;
         // It is much simpler to compile, e.g., `a{2,5}` as:
         //
@@ -856,14 +880,14 @@
             }
             MaybeInst::Split1(goto1) => {
                 MaybeInst::Compiled(Inst::Split(InstSplit {
-                    goto1: goto1,
+                    goto1,
                     goto2: goto,
                 }))
             }
             MaybeInst::Split2(goto2) => {
                 MaybeInst::Compiled(Inst::Split(InstSplit {
                     goto1: goto,
-                    goto2: goto2,
+                    goto2,
                 }))
             }
             _ => unreachable!(
@@ -877,9 +901,7 @@
 
     fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
         let filled = match *self {
-            MaybeInst::Split => {
-                Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
-            }
+            MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
             _ => unreachable!(
                 "must be called on Split instruction, \
                  instead it was called on: {:?}",
@@ -937,19 +959,17 @@
 impl InstHole {
     fn fill(&self, goto: InstPtr) -> Inst {
         match *self {
-            InstHole::Save { slot } => {
-                Inst::Save(InstSave { goto: goto, slot: slot })
-            }
+            InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
             InstHole::EmptyLook { look } => {
-                Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
+                Inst::EmptyLook(InstEmptyLook { goto, look })
             }
-            InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
+            InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
             InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
-                goto: goto,
+                goto,
                 ranges: ranges.clone().into_boxed_slice(),
             }),
             InstHole::Bytes { start, end } => {
-                Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
+                Inst::Bytes(InstBytes { goto, start, end })
             }
         }
     }
@@ -1019,7 +1039,7 @@
         let mut last_hole = Hole::None;
         for byte_range in seq {
             let key = SuffixCacheKey {
-                from_inst: from_inst,
+                from_inst,
                 start: byte_range.start,
                 end: byte_range.end,
             };
@@ -1109,7 +1129,7 @@
             }
         }
         *pos = self.dense.len();
-        self.dense.push(SuffixCacheEntry { key: key, pc: pc });
+        self.dense.push(SuffixCacheEntry { key, pc });
         None
     }
 
@@ -1120,8 +1140,8 @@
     fn hash(&self, suffix: &SuffixCacheKey) -> usize {
         // Basic FNV-1a hash as described:
         // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
-        const FNV_PRIME: u64 = 1099511628211;
-        let mut h = 14695981039346656037;
+        const FNV_PRIME: u64 = 1_099_511_628_211;
+        let mut h = 14_695_981_039_346_656_037;
         h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
         h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
         h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
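The new `extra_inst_bytes` comment and `c_empty` above feed the compiler's size-limit check. For context, that same limit is exposed publicly via `RegexBuilder::size_limit`; a small sketch of how it behaves (the 10-byte limit is an arbitrarily tiny value chosen only for illustration):

```rust
use regex::RegexBuilder;

fn main() {
    // With an absurdly small size limit, even a modest pattern is rejected
    // by the same "compiled regex too big" machinery that c_empty feeds.
    assert!(RegexBuilder::new(r"\w+").size_limit(10).build().is_err());

    // The default limit is far larger, so the same pattern compiles fine.
    assert!(RegexBuilder::new(r"\w+").build().is_ok());
}
```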
diff --git a/src/dfa.rs b/src/dfa.rs
index 4b60f4d..dc99521 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -31,7 +31,7 @@
 tricks are employed to make it fast. Tread carefully.
 
 N.B. While this implementation is heavily commented, Russ Cox's series of
-articles on regexes is strongly recommended: https://swtch.com/~rsc/regexp/
+articles on regexes is strongly recommended: <https://swtch.com/~rsc/regexp/>
 (As is the DFA implementation in RE2, which heavily influenced this
 implementation.)
 */
@@ -454,10 +454,10 @@
         let mut cache = cache.borrow_mut();
         let cache = &mut cache.dfa;
         let mut dfa = Fsm {
-            prog: prog,
+            prog,
             start: 0, // filled in below
-            at: at,
-            quit_after_match: quit_after_match,
+            at,
+            quit_after_match,
             last_match_si: STATE_UNKNOWN,
             last_cache_flush: at,
             cache: &mut cache.inner,
@@ -484,10 +484,10 @@
         let mut cache = cache.borrow_mut();
         let cache = &mut cache.dfa_reverse;
         let mut dfa = Fsm {
-            prog: prog,
+            prog,
             start: 0, // filled in below
-            at: at,
-            quit_after_match: quit_after_match,
+            at,
+            quit_after_match,
             last_match_si: STATE_UNKNOWN,
             last_cache_flush: at,
             cache: &mut cache.inner,
@@ -515,9 +515,9 @@
         let mut cache = cache.borrow_mut();
         let cache = &mut cache.dfa;
         let mut dfa = Fsm {
-            prog: prog,
+            prog,
             start: 0, // filled in below
-            at: at,
+            at,
             quit_after_match: false,
             last_match_si: STATE_UNKNOWN,
             last_cache_flush: at,
@@ -1353,7 +1353,6 @@
         match self.cache.trans.next(si, self.byte_class(b)) {
             STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
             STATE_QUIT => None,
-            STATE_DEAD => Some(STATE_DEAD),
             nsi => Some(nsi),
         }
     }
@@ -1387,7 +1386,6 @@
         };
         match self.cache.start_states[flagi] {
             STATE_UNKNOWN => {}
-            STATE_DEAD => return Some(STATE_DEAD),
             si => return Some(si),
         }
         q.clear();
@@ -1608,11 +1606,7 @@
 
 impl StateMap {
     fn new(num_byte_classes: usize) -> StateMap {
-        StateMap {
-            map: HashMap::new(),
-            states: vec![],
-            num_byte_classes: num_byte_classes,
-        }
+        StateMap { map: HashMap::new(), states: vec![], num_byte_classes }
     }
 
     fn len(&self) -> usize {
@@ -1648,7 +1642,7 @@
     /// The number of byte classes corresponds to the stride. Every state will
     /// have `num_byte_classes` slots for transitions.
     fn new(num_byte_classes: usize) -> Transitions {
-        Transitions { table: vec![], num_byte_classes: num_byte_classes }
+        Transitions { table: vec![], num_byte_classes }
     }
 
     /// Returns the total number of states currently in this table.
@@ -1698,27 +1692,27 @@
 
 impl StateFlags {
     fn is_match(&self) -> bool {
-        self.0 & 0b0000000_1 > 0
+        self.0 & 0b0000_0001 > 0
     }
 
     fn set_match(&mut self) {
-        self.0 |= 0b0000000_1;
+        self.0 |= 0b0000_0001;
     }
 
     fn is_word(&self) -> bool {
-        self.0 & 0b000000_1_0 > 0
+        self.0 & 0b0000_0010 > 0
     }
 
     fn set_word(&mut self) {
-        self.0 |= 0b000000_1_0;
+        self.0 |= 0b0000_0010;
     }
 
     fn has_empty(&self) -> bool {
-        self.0 & 0b00000_1_00 > 0
+        self.0 & 0b0000_0100 > 0
     }
 
     fn set_empty(&mut self) {
-        self.0 |= 0b00000_1_00;
+        self.0 |= 0b0000_0100;
     }
 }
 
diff --git a/src/exec.rs b/src/exec.rs
index d5fad1c..e75ca08 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -288,10 +288,10 @@
             exprs.push(expr);
         }
         Ok(Parsed {
-            exprs: exprs,
+            exprs,
             prefixes: prefixes.unwrap_or_else(Literals::empty),
             suffixes: suffixes.unwrap_or_else(Literals::empty),
-            bytes: bytes,
+            bytes,
         })
     }
 
@@ -311,7 +311,7 @@
                 match_type: MatchType::Nothing,
             });
             let pool = ExecReadOnly::new_pool(&ro);
-            return Ok(Exec { ro: ro, pool });
+            return Ok(Exec { ro, pool });
         }
         let parsed = self.parse()?;
         let mut nfa = Compiler::new()
@@ -340,12 +340,12 @@
 
         let mut ro = ExecReadOnly {
             res: self.options.pats,
-            nfa: nfa,
-            dfa: dfa,
-            dfa_reverse: dfa_reverse,
+            nfa,
+            dfa,
+            dfa_reverse,
             suffixes: LiteralSearcher::suffixes(parsed.suffixes),
             #[cfg(feature = "perf-literal")]
-            ac: ac,
+            ac,
             match_type: MatchType::Nothing,
         };
         ro.match_type = ro.choose_match_type(self.match_type);
diff --git a/src/expand.rs b/src/expand.rs
index fd9c2d0..67b5149 100644
--- a/src/expand.rs
+++ b/src/expand.rs
@@ -127,7 +127,7 @@
 /// If no such valid reference could be found, None is returned.
 fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
     let mut i = 0;
-    let rep: &[u8] = replacement.as_ref();
+    let rep: &[u8] = replacement;
     if rep.len() <= 1 || rep[0] != b'$' {
         return None;
     }
@@ -136,7 +136,7 @@
         return find_cap_ref_braced(rep, i + 1);
     }
     let mut cap_end = i;
-    while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
+    while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
         cap_end += 1;
     }
     if cap_end == i {
@@ -183,8 +183,8 @@
 }
 
 /// Returns true if and only if the given byte is allowed in a capture name.
-fn is_valid_cap_letter(b: &u8) -> bool {
-    match *b {
+fn is_valid_cap_letter(b: u8) -> bool {
+    match b {
         b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
         _ => false,
     }
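For context, `find_cap_ref` and `is_valid_cap_letter` parse the `$name`/`${name}` references used in replacement strings. A short usage sketch of that public surface (the pattern and group names are illustrative only):

```rust
use regex::Regex;

fn main() {
    // Both the bare `$first` form and the braced `${last}` form are parsed
    // by the helpers touched above.
    let re = Regex::new(r"(?P<first>\w+)\s+(?P<last>\w+)").unwrap();
    let swapped = re.replace_all("Ada Lovelace", "${last}, $first");
    assert_eq!(swapped, "Lovelace, Ada");
}
```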
diff --git a/src/input.rs b/src/input.rs
index 5d50ee3..df6c3e0 100644
--- a/src/input.rs
+++ b/src/input.rs
@@ -160,7 +160,7 @@
             InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
         } else {
             let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
-            InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
+            InputAt { pos: i, c, byte: None, len: c.len_utf8() }
         }
     }
 
@@ -231,7 +231,7 @@
 impl<'t> ByteInput<'t> {
     /// Return a new byte-based input reader for the given string.
     pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
-        ByteInput { text: text, only_utf8: only_utf8 }
+        ByteInput { text, only_utf8 }
     }
 }
 
diff --git a/src/lib.rs b/src/lib.rs
index 7f2dec8..6b95739 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -353,6 +353,9 @@
 \B    not a Unicode word boundary
 </pre>
 
+The empty regex is valid and matches the empty string. For example, the empty
+regex matches `abc` at positions `0`, `1`, `2` and `3`.
+
 ## Grouping and flags
 
 <pre class="rust">
@@ -628,7 +631,6 @@
 #[cfg(feature = "std")]
 pub use crate::re_set::unicode::*;
 #[cfg(feature = "std")]
-#[cfg(feature = "std")]
 pub use crate::re_unicode::{
     escape, CaptureLocations, CaptureMatches, CaptureNames, Captures,
     Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split,
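The added doc sentence can be checked directly; a tiny sketch of what "matches `abc` at positions `0`, `1`, `2` and `3`" means in practice:

```rust
use regex::Regex;

fn main() {
    // The empty regex is valid and matches the empty string at every
    // position of "abc", i.e. four times in total.
    let re = Regex::new("").unwrap();
    let positions: Vec<usize> = re.find_iter("abc").map(|m| m.start()).collect();
    assert_eq!(positions, vec![0, 1, 2, 3]);
}
```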
diff --git a/src/literal/imp.rs b/src/literal/imp.rs
index 82f050a..90b2f11 100644
--- a/src/literal/imp.rs
+++ b/src/literal/imp.rs
@@ -57,10 +57,10 @@
     fn new(lits: Literals, matcher: Matcher) -> Self {
         let complete = lits.all_complete();
         LiteralSearcher {
-            complete: complete,
+            complete,
             lcp: Memmem::new(lits.longest_common_prefix()),
             lcs: Memmem::new(lits.longest_common_suffix()),
-            matcher: matcher,
+            matcher,
         }
     }
 
diff --git a/src/pattern.rs b/src/pattern.rs
index b4ffd8e..00549e5 100644
--- a/src/pattern.rs
+++ b/src/pattern.rs
@@ -15,7 +15,7 @@
 
     fn into_searcher(self, haystack: &'t str) -> RegexSearcher<'r, 't> {
         RegexSearcher {
-            haystack: haystack,
+            haystack,
             it: self.find_iter(haystack),
             last_step_end: 0,
             next_match: None,
diff --git a/src/pikevm.rs b/src/pikevm.rs
index 9a14240..8c9eac2 100644
--- a/src/pikevm.rs
+++ b/src/pikevm.rs
@@ -100,7 +100,7 @@
         cache.clist.resize(prog.len(), prog.captures.len());
         cache.nlist.resize(prog.len(), prog.captures.len());
         let at = input.at(start);
-        Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_(
+        Fsm { prog, stack: &mut cache.stack, input }.exec_(
             &mut cache.clist,
             &mut cache.nlist,
             matches,
diff --git a/src/prog.rs b/src/prog.rs
index 475a811..c211f71 100644
--- a/src/prog.rs
+++ b/src/prog.rs
@@ -233,7 +233,7 @@
             if pc == self.start {
                 write!(f, " (start)")?;
             }
-            write!(f, "\n")?;
+            writeln!(f)?;
         }
         Ok(())
     }
diff --git a/src/re_bytes.rs b/src/re_bytes.rs
index ae55d6d..d719692 100644
--- a/src/re_bytes.rs
+++ b/src/re_bytes.rs
@@ -53,7 +53,7 @@
     /// Creates a new match from the given haystack and byte offsets.
     #[inline]
     fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> {
-        Match { text: haystack, start: start, end: end }
+        Match { text: haystack, start, end }
     }
 }
 
@@ -255,7 +255,7 @@
     pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
         let mut locs = self.capture_locations();
         self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
-            text: text,
+            text,
             locs: locs.0,
             named_groups: self.0.capture_name_idx().clone(),
         })
@@ -578,7 +578,7 @@
     /// context into consideration. For example, the `\A` anchor can only
     /// match when `start == 0`.
     pub fn is_match_at(&self, text: &[u8], start: usize) -> bool {
-        self.shortest_match_at(text, start).is_some()
+        self.0.searcher().is_match_at(text, start)
     }
 
     /// Returns the same as find, but starts the search at the given
@@ -723,7 +723,7 @@
     fn next(&mut self) -> Option<Captures<'t>> {
         self.0.next().map(|locs| Captures {
             text: self.0.text(),
-            locs: locs,
+            locs,
             named_groups: self.0.regex().capture_name_idx().clone(),
         })
     }
@@ -877,7 +877,7 @@
         self.0.pos(i)
     }
 
-    /// Returns the total number of capturing groups.
+    /// Returns the total number of capture groups (even if they didn't match).
     ///
     /// This is always at least `1` since every regex has at least `1`
     /// capturing group that corresponds to the entire match.
@@ -979,7 +979,7 @@
         expand_bytes(self, replacement, dst)
     }
 
-    /// Returns the number of captured groups.
+    /// Returns the total number of capture groups (even if they didn't match).
     ///
     /// This is always at least `1`, since every regex has at least one capture
     /// group that corresponds to the full match.
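The reworded doc comments above correspond to the 1.6.0 changelog item clarifying `Captures::len`. A hedged sketch of what "even if they didn't match" means for the bytes API:

```rust
use regex::bytes::Regex;

fn main() {
    // len() counts group 0 plus every declared group, whether or not it
    // participated in the match: here only one alternative can match.
    let re = Regex::new(r"(foo)|(bar)").unwrap();
    let caps = re.captures(b"bar").unwrap();
    assert_eq!(caps.len(), 3);
    assert!(caps.get(1).is_none()); // (foo) did not match
    assert!(caps.get(2).is_some()); // (bar) did
}
```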
diff --git a/src/re_set.rs b/src/re_set.rs
index 73d5953..a6d886d 100644
--- a/src/re_set.rs
+++ b/src/re_set.rs
@@ -59,13 +59,45 @@
 /// 1. Does any regex in the set match?
 /// 2. If so, which regexes in the set match?
 ///
-/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
-/// since the matching engines can stop after the first match is found.
+/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1)
+/// instead of (2) since the matching engines can stop after the first match
+/// is found.
 ///
-/// Other features like finding the location of successive matches or their
-/// sub-captures aren't supported. If you need this functionality, the
-/// recommended approach is to compile each regex in the set independently and
-/// selectively match them based on which regexes in the set matched.
+/// You cannot directly extract [`Match`][crate::Match] or
+/// [`Captures`][crate::Captures] objects from a regex set. If you need these
+/// operations, the recommended approach is to compile each pattern in the set
+/// independently and scan the exact same input a second time with those
+/// independently compiled patterns:
+///
+/// ```rust
+/// use regex::{Regex, RegexSet};
+///
+/// let patterns = ["foo", "bar"];
+/// // Both patterns will match different ranges of this string.
+/// let text = "barfoo";
+///
+/// // Compile a set matching any of our patterns.
+/// let set = RegexSet::new(&patterns).unwrap();
+/// // Compile each pattern independently.
+/// let regexes: Vec<_> = set.patterns().iter()
+///     .map(|pat| Regex::new(pat).unwrap())
+///     .collect();
+///
+/// // Match against the whole set first and identify the individual
+/// // matching patterns.
+/// let matches: Vec<&str> = set.matches(text).into_iter()
+///     // Dereference the match index to get the corresponding
+///     // compiled pattern.
+///     .map(|match_idx| &regexes[match_idx])
+///     // To get match locations or any other info, we then have to search
+///     // the exact same text again, using our separately-compiled pattern.
+///     .map(|pat| pat.find(text).unwrap().as_str())
+///     .collect();
+///
+/// // Matches arrive in the order the constituent patterns were declared,
+/// // not the order they appear in the input.
+/// assert_eq!(vec!["foo", "bar"], matches);
+/// ```
 ///
 /// # Performance
 ///
diff --git a/src/re_trait.rs b/src/re_trait.rs
index 680aa54..d0c717d 100644
--- a/src/re_trait.rs
+++ b/src/re_trait.rs
@@ -74,8 +74,19 @@
         self.idx += 1;
         x
     }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let len = self.locs.len() - self.idx;
+        (len, Some(len))
+    }
+
+    fn count(self) -> usize {
+        self.len()
+    }
 }
 
+impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {}
+
 impl<'c> FusedIterator for SubCapturesPosIter<'c> {}
 
 /// `RegularExpression` describes types that can implement regex searching.
@@ -139,7 +150,7 @@
     /// Returns an iterator over all non-overlapping successive leftmost-first
     /// matches.
     fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> {
-        Matches { re: self, text: text, last_end: 0, last_match: None }
+        Matches { re: self, text, last_end: 0, last_match: None }
     }
 
     /// Returns an iterator over all non-overlapping successive leftmost-first
diff --git a/src/re_unicode.rs b/src/re_unicode.rs
index 142c78f..60d81a7 100644
--- a/src/re_unicode.rs
+++ b/src/re_unicode.rs
@@ -61,7 +61,7 @@
     /// Creates a new match from the given haystack and byte offsets.
     #[inline]
     fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
-        Match { text: haystack, start: start, end: end }
+        Match { text: haystack, start, end }
     }
 }
 
@@ -129,7 +129,7 @@
 /// assert!(haystack.contains(&re));
 /// assert_eq!(haystack.find(&re), Some(1));
 /// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(),
-///            vec![(1, 4), (5, 8)]);
+///            vec![(1, "111"), (5, "222")]);
 /// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]);
 /// ```
 #[derive(Clone)]
@@ -311,7 +311,7 @@
     pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
         let mut locs = self.capture_locations();
         self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
-            text: text,
+            text,
             locs: locs.0,
             named_groups: self.0.capture_name_idx().clone(),
         })
@@ -538,7 +538,7 @@
         mut rep: R,
     ) -> Cow<'t, str> {
         // If we know that the replacement doesn't have any capture expansions,
-        // then we can fast path. The fast path can make a tremendous
+        // then we can use the fast path. The fast path can make a tremendous
         // difference:
         //
         //   1) We use `find_iter` instead of `captures_iter`. Not asking for
@@ -636,7 +636,7 @@
     /// context into consideration. For example, the `\A` anchor can only
     /// match when `start == 0`.
     pub fn is_match_at(&self, text: &str, start: usize) -> bool {
-        self.shortest_match_at(text, start).is_some()
+        self.0.searcher_str().is_match_at(text, start)
     }
 
     /// Returns the same as find, but starts the search at the given
@@ -887,7 +887,7 @@
         self.0.pos(i)
     }
 
-    /// Returns the total number of capturing groups.
+    /// Returns the total number of capture groups (even if they didn't match).
     ///
     /// This is always at least `1` since every regex has at least `1`
     /// capturing group that corresponds to the entire match.
@@ -989,7 +989,7 @@
         expand_str(self, replacement, dst)
     }
 
-    /// Returns the number of captured groups.
+    /// Returns the total number of capture groups (even if they didn't match).
     ///
     /// This is always at least `1`, since every regex has at least one capture
     /// group that corresponds to the full match.
@@ -1092,8 +1092,18 @@
             .next()
             .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e)))
     }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.it.size_hint()
+    }
+
+    fn count(self) -> usize {
+        self.it.count()
+    }
 }
 
+impl<'c, 't> ExactSizeIterator for SubCaptureMatches<'c, 't> {}
+
 impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {}
 
 /// An iterator that yields all non-overlapping capture groups matching a
@@ -1114,7 +1124,7 @@
     fn next(&mut self) -> Option<Captures<'t>> {
         self.0.next().map(|locs| Captures {
             text: self.0.text(),
-            locs: locs,
+            locs,
             named_groups: self.0.regex().capture_name_idx().clone(),
         })
     }
diff --git a/src/utf8.rs b/src/utf8.rs
index 6e0608f..2dfd2c0 100644
--- a/src/utf8.rs
+++ b/src/utf8.rs
@@ -108,7 +108,7 @@
                 | ((b2 & !TAG_CONT) as u32) << 6
                 | ((b3 & !TAG_CONT) as u32);
             match cp {
-                0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
+                0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
                 _ => None,
             }
         }
diff --git a/tests/regression.rs b/tests/regression.rs
index 44b9083..e8b2525 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -217,3 +217,6 @@
 // https://en.wikipedia.org/wiki/Je_(Cyrillic)
 ismatch!(empty_group_match, r"()Ј01", "zЈ01", true);
 matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5));
+
+// See: https://github.com/rust-lang/regex/issues/862
+mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1)));
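The new regression test asserts the span `(0, 1)`; equivalently, in terms of the public API (a sketch, not part of the test suite):

```rust
use regex::Regex;

fn main() {
    // `b??` is non-greedy, so the leftmost-first match of `ab??` against
    // "ab" stops after "a" rather than consuming the optional "b".
    let re = Regex::new(r"ab??").unwrap();
    let m = re.find("ab").unwrap();
    assert_eq!((m.start(), m.end()), (0, 1));
    assert_eq!(m.as_str(), "a");
}
```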
diff --git a/tests/test_default.rs b/tests/test_default.rs
index d4365fb..be627f7 100644
--- a/tests/test_default.rs
+++ b/tests/test_default.rs
@@ -150,3 +150,73 @@
     assert_eq!(16, size_of::<bytes::Regex>());
     assert_eq!(16, size_of::<bytes::RegexSet>());
 }
+
+// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+// See: CVE-2022-24713
+//
+// We test that our regex compiler will correctly return a "too big" error when
+// we try to use a very large repetition on an *empty* sub-expression.
+//
+// At the time this test was written, the regex compiler does not represent
+// empty sub-expressions with any bytecode instructions. In effect, it's an
+// "optimization" to leave them out, since they would otherwise correspond
+// to an unconditional JUMP in the regex bytecode (i.e., an unconditional
+// epsilon transition in the NFA graph). Therefore, an empty sub-expression
+// represents an interesting case for the compiler's size limits. Since it
+// doesn't actually contribute any additional memory to the compiled regex
+// instructions, the size limit machinery never detects it. Instead, it just
+// dumbly tries to compile the empty sub-expression N times, where N is the
+// repetition size.
+//
+// When N is very large, this will cause the compiler to essentially spin and
+// do nothing for a decently large amount of time. It causes the regex to take
+// quite a bit of time to compile, despite the concrete syntax of the regex
+// being quite small.
+//
+// The degree to which this is actually a problem is somewhat of a judgment
+// call. Some regexes simply take a long time to compile. But in general, you
+// should be able to reasonably control this by setting lower or higher size
+// limits on the compiled object size. But this mitigation doesn't work at all
+// for this case.
+//
+// This particular test is somewhat narrow. It merely checks that regex
+// compilation will, at some point, return a "too big" error. Before the
+// fix landed, this test would eventually fail because the regex would be
+// successfully compiled (after enough time elapsed). So while this test
+// doesn't check that we exit in a reasonable amount of time, it does at least
+// check that we are properly returning an error at some point.
+#[test]
+fn big_empty_regex_fails() {
+    use regex::Regex;
+
+    let result = Regex::new("(?:){4294967295}");
+    assert!(result.is_err());
+}
+
+// Below is a "billion laughs" variant of the previous test case.
+#[test]
+fn big_empty_reps_chain_regex_fails() {
+    use regex::Regex;
+
+    let result = Regex::new("(?:){64}{64}{64}{64}{64}{64}");
+    assert!(result.is_err());
+}
+
+// Below is another situation where a zero-length sub-expression can be
+// introduced.
+#[test]
+fn big_zero_reps_regex_fails() {
+    use regex::Regex;
+
+    let result = Regex::new(r"x{0}{4294967295}");
+    assert!(result.is_err());
+}
+
+// Testing another case for completeness.
+#[test]
+fn empty_alt_regex_fails() {
+    use regex::Regex;
+
+    let result = Regex::new(r"(?:|){4294967295}");
+    assert!(result.is_err());
+}
diff --git a/tests/unicode.rs b/tests/unicode.rs
index 9f1cd0c..9b32286 100644
--- a/tests/unicode.rs
+++ b/tests/unicode.rs
@@ -232,3 +232,20 @@
 mat!(uni_class_sb3, r"\p{sb=Close}", "\u{FF60}", Some((0, 3)));
 mat!(uni_class_sb4, r"\p{sb=Close}", "\u{1F677}", Some((0, 4)));
 mat!(uni_class_sb5, r"\p{sb=SContinue}", "\u{FF64}", Some((0, 3)));
+
+// Test 'Vithkuqi' support, which was added in Unicode 14.
+// See: https://github.com/rust-lang/regex/issues/877
+mat!(
+    uni_vithkuqi_literal_upper,
+    r"(?i)^\u{10570}$",
+    "\u{10570}",
+    Some((0, 4))
+);
+mat!(
+    uni_vithkuqi_literal_lower,
+    r"(?i)^\u{10570}$",
+    "\u{10597}",
+    Some((0, 4))
+);
+mat!(uni_vithkuqi_word_upper, r"^\w$", "\u{10570}", Some((0, 4)));
+mat!(uni_vithkuqi_word_lower, r"^\w$", "\u{10597}", Some((0, 4)));