| /*! |
| An NFA backed Pike VM for executing regex searches with capturing groups. |
| |
| This module provides a [`PikeVM`] that works by simulating an NFA and |
| resolving all spans of capturing groups that participate in a match. |
| */ |
| |
| #[cfg(feature = "internal-instrument-pikevm")] |
| use core::cell::RefCell; |
| |
| use alloc::{vec, vec::Vec}; |
| |
| use crate::{ |
| nfa::thompson::{self, BuildError, State, NFA}, |
| util::{ |
| captures::Captures, |
| empty, iter, |
| prefilter::Prefilter, |
| primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, |
| search::{ |
| Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, |
| }, |
| sparse_set::SparseSet, |
| }, |
| }; |
| |
| /// A simple macro for conditionally executing instrumentation logic when |
| /// the 'trace' log level is enabled. This is a compile-time no-op when the |
| /// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that |
| /// this makes it easier to avoid doing extra work when instrumentation isn't |
| /// enabled. |
| /// |
| /// This macro accepts a closure of type `|&mut Counters|`. The closure can |
| /// then increment counters (or whatever) in accordance with what one wants |
| /// to track. |
| macro_rules! instrument { |
| ($fun:expr) => { |
| #[cfg(feature = "internal-instrument-pikevm")] |
| { |
| let fun: &mut dyn FnMut(&mut Counters) = &mut $fun; |
| COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut())); |
| } |
| }; |
| } |
| |
| #[cfg(feature = "internal-instrument-pikevm")] |
| std::thread_local! { |
| /// Effectively global state used to keep track of instrumentation |
| /// counters. The "proper" way to do this is to thread it through the |
| /// PikeVM, but it makes the code quite icky. Since this is just a |
| /// debugging feature, we're content to relegate it to thread local |
| /// state. When instrumentation is enabled, the counters are reset at the |
| /// beginning of every search and printed (with the 'trace' log level) at |
| /// the end of every search. |
| static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty()); |
| } |
| |
| /// The configuration used for building a [`PikeVM`]. |
| /// |
| /// A PikeVM configuration is a simple data object that is typically used with |
| /// [`Builder::configure`]. It can be cheaply cloned. |
| /// |
| /// A default configuration can be created either with `Config::new`, or |
| /// perhaps more conveniently, with [`PikeVM::config`]. |
| #[derive(Clone, Debug, Default)] |
| pub struct Config { |
| match_kind: Option<MatchKind>, |
| pre: Option<Option<Prefilter>>, |
| } |
| |
| impl Config { |
| /// Return a new default PikeVM configuration. |
| pub fn new() -> Config { |
| Config::default() |
| } |
| |
| /// Set the desired match semantics. |
| /// |
| /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the |
| /// match semantics of Perl-like regex engines. That is, when multiple |
| /// patterns would match at the same leftmost position, the pattern that |
| /// appears first in the concrete syntax is chosen. |
| /// |
| /// Currently, the only other kind of match semantics supported is |
| /// [`MatchKind::All`]. This corresponds to "classical DFA" construction |
| /// where all possible matches are visited in the NFA by the `PikeVM`. |
| /// |
| /// Typically, `All` is used when one wants to execute an overlapping |
| /// search and `LeftmostFirst` otherwise. In particular, it rarely makes |
| /// sense to use `All` with the various "leftmost" find routines, since the |
| /// leftmost routines depend on the `LeftmostFirst` automata construction |
| /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM` |
| /// simulating dead states as a way to terminate the search and report a |
| /// match. `LeftmostFirst` also supports non-greedy matches using this |
| /// strategy where as `All` does not. |
| pub fn match_kind(mut self, kind: MatchKind) -> Config { |
| self.match_kind = Some(kind); |
| self |
| } |
| |
| /// Set a prefilter to be used whenever a start state is entered. |
| /// |
| /// A [`Prefilter`] in this context is meant to accelerate searches by |
| /// looking for literal prefixes that every match for the corresponding |
| /// pattern (or patterns) must start with. Once a prefilter produces a |
| /// match, the underlying search routine continues on to try and confirm |
| /// the match. |
| /// |
| /// Be warned that setting a prefilter does not guarantee that the search |
| /// will be faster. While it's usually a good bet, if the prefilter |
| /// produces a lot of false positive candidates (i.e., positions matched |
| /// by the prefilter but not by the regex), then the overall result can |
| /// be slower than if you had just executed the regex engine without any |
| /// prefilters. |
| /// |
| /// By default no prefilter is set. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{ |
| /// nfa::thompson::pikevm::PikeVM, |
| /// util::prefilter::Prefilter, |
| /// Input, Match, MatchKind, |
| /// }; |
| /// |
| /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); |
| /// let re = PikeVM::builder() |
| /// .configure(PikeVM::config().prefilter(pre)) |
| /// .build(r"(foo|bar)[a-z]+")?; |
| /// let mut cache = re.create_cache(); |
| /// let input = Input::new("foo1 barfox bar"); |
| /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input)); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// Be warned though that an incorrect prefilter can lead to incorrect |
| /// results! |
| /// |
| /// ``` |
| /// use regex_automata::{ |
| /// nfa::thompson::pikevm::PikeVM, |
| /// util::prefilter::Prefilter, |
| /// Input, HalfMatch, MatchKind, |
| /// }; |
| /// |
| /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); |
| /// let re = PikeVM::builder() |
| /// .configure(PikeVM::config().prefilter(pre)) |
| /// .build(r"(foo|bar)[a-z]+")?; |
| /// let mut cache = re.create_cache(); |
| /// let input = Input::new("foo1 barfox bar"); |
| /// // No match reported even though there clearly is one! |
| /// assert_eq!(None, re.find(&mut cache, input)); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { |
| self.pre = Some(pre); |
| self |
| } |
| |
| /// Returns the match semantics set in this configuration. |
| pub fn get_match_kind(&self) -> MatchKind { |
| self.match_kind.unwrap_or(MatchKind::LeftmostFirst) |
| } |
| |
| /// Returns the prefilter set in this configuration, if one at all. |
| pub fn get_prefilter(&self) -> Option<&Prefilter> { |
| self.pre.as_ref().unwrap_or(&None).as_ref() |
| } |
| |
| /// Overwrite the default configuration such that the options in `o` are |
| /// always used. If an option in `o` is not set, then the corresponding |
| /// option in `self` is used. If it's not set in `self` either, then it |
| /// remains not set. |
| pub(crate) fn overwrite(&self, o: Config) -> Config { |
| Config { |
| match_kind: o.match_kind.or(self.match_kind), |
| pre: o.pre.or_else(|| self.pre.clone()), |
| } |
| } |
| } |
| |
| /// A builder for a `PikeVM`. |
| /// |
| /// This builder permits configuring options for the syntax of a pattern, |
| /// the NFA construction and the `PikeVM` construction. This builder is |
| /// different from a general purpose regex builder in that it permits fine |
| /// grain configuration of the construction process. The trade off for this is |
| /// complexity, and the possibility of setting a configuration that might not |
| /// make sense. For example, there are two different UTF-8 modes: |
| /// |
| /// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8) |
| /// controls whether the pattern itself can contain sub-expressions that match |
| /// invalid UTF-8. |
| /// * [`thompson::Config::utf8`] controls whether empty matches that split a |
| /// Unicode codepoint are reported or not. |
| /// |
| /// Generally speaking, callers will want to either enable all of these or |
| /// disable all of these. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to disable UTF-8 mode in the syntax and the regex |
| /// itself. This is generally what you want for matching on arbitrary bytes. |
| /// |
| /// ``` |
| /// use regex_automata::{ |
| /// nfa::thompson::{self, pikevm::PikeVM}, |
| /// util::syntax, |
| /// Match, |
| /// }; |
| /// |
| /// let re = PikeVM::builder() |
| /// .syntax(syntax::Config::new().utf8(false)) |
| /// .thompson(thompson::Config::new().utf8(false)) |
| /// .build(r"foo(?-u:[^b])ar.*")?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; |
| /// let expected = Some(Match::must(0, 1..9)); |
| /// let got = re.find_iter(&mut cache, haystack).next(); |
| /// assert_eq!(expected, got); |
| /// // Notice that `(?-u:[^b])` matches invalid UTF-8, |
| /// // but the subsequent `.*` does not! Disabling UTF-8 |
| /// // on the syntax permits this. |
| /// // |
| /// // N.B. This example does not show the impact of |
| /// // disabling UTF-8 mode on a PikeVM Config, since that |
| /// // only impacts regexes that can produce matches of |
| /// // length 0. |
| /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[derive(Clone, Debug)] |
| pub struct Builder { |
| config: Config, |
| #[cfg(feature = "syntax")] |
| thompson: thompson::Compiler, |
| } |
| |
| impl Builder { |
| /// Create a new PikeVM builder with its default configuration. |
| pub fn new() -> Builder { |
| Builder { |
| config: Config::default(), |
| #[cfg(feature = "syntax")] |
| thompson: thompson::Compiler::new(), |
| } |
| } |
| |
| /// Build a `PikeVM` from the given pattern. |
| /// |
| /// If there was a problem parsing or compiling the pattern, then an error |
| /// is returned. |
| #[cfg(feature = "syntax")] |
| pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> { |
| self.build_many(&[pattern]) |
| } |
| |
| /// Build a `PikeVM` from the given patterns. |
| #[cfg(feature = "syntax")] |
| pub fn build_many<P: AsRef<str>>( |
| &self, |
| patterns: &[P], |
| ) -> Result<PikeVM, BuildError> { |
| let nfa = self.thompson.build_many(patterns)?; |
| self.build_from_nfa(nfa) |
| } |
| |
| /// Build a `PikeVM` directly from its NFA. |
| /// |
| /// Note that when using this method, any configuration that applies to the |
| /// construction of the NFA itself will of course be ignored, since the NFA |
| /// given here is already built. |
| pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> { |
| nfa.look_set_any().available().map_err(BuildError::word)?; |
| Ok(PikeVM { config: self.config.clone(), nfa }) |
| } |
| |
| /// Apply the given `PikeVM` configuration options to this builder. |
| pub fn configure(&mut self, config: Config) -> &mut Builder { |
| self.config = self.config.overwrite(config); |
| self |
| } |
| |
| /// Set the syntax configuration for this builder using |
| /// [`syntax::Config`](crate::util::syntax::Config). |
| /// |
| /// This permits setting things like case insensitivity, Unicode and multi |
| /// line mode. |
| /// |
| /// These settings only apply when constructing a PikeVM directly from a |
| /// pattern. |
| #[cfg(feature = "syntax")] |
| pub fn syntax( |
| &mut self, |
| config: crate::util::syntax::Config, |
| ) -> &mut Builder { |
| self.thompson.syntax(config); |
| self |
| } |
| |
| /// Set the Thompson NFA configuration for this builder using |
| /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). |
| /// |
| /// This permits setting things like if additional time should be spent |
| /// shrinking the size of the NFA. |
| /// |
| /// These settings only apply when constructing a PikeVM directly from a |
| /// pattern. |
| #[cfg(feature = "syntax")] |
| pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { |
| self.thompson.configure(config); |
| self |
| } |
| } |
| |
| /// A virtual machine for executing regex searches with capturing groups. |
| /// |
| /// # Infallible APIs |
| /// |
| /// Unlike most other regex engines in this crate, a `PikeVM` never returns an |
| /// error at search time. It supports all [`Anchored`] configurations, never |
| /// quits and works on haystacks of arbitrary length. |
| /// |
| /// There are two caveats to mention though: |
| /// |
| /// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`], |
| /// then the PikeVM will report "no match." This is consistent with all other |
| /// regex engines in this crate. |
| /// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`] |
| /// that has insufficient capacity to store all valid pattern IDs, then if a |
| /// match occurs for a `PatternID` that cannot be inserted, it is silently |
| /// dropped as if it did not match. |
| /// |
| /// # Advice |
| /// |
| /// The `PikeVM` is generally the most "powerful" regex engine in this crate. |
| /// "Powerful" in this context means that it can handle any regular expression |
| /// that is parseable by `regex-syntax` and any size haystack. Regretably, |
| /// the `PikeVM` is also simultaneously often the _slowest_ regex engine in |
| /// practice. This results in an annoying situation where one generally tries |
| /// to pick any other regex engine (or perhaps none at all) before being |
| /// forced to fall back to a `PikeVM`. |
| /// |
| /// For example, a common strategy for dealing with capturing groups is to |
| /// actually look for the overall match of the regex using a faster regex |
| /// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall |
| /// match is found, one can then run the `PikeVM` on just the match span to |
| /// find the spans of the capturing groups. In this way, the faster regex |
| /// engine does the majority of the work, while the `PikeVM` only lends its |
| /// power in a more limited role. |
| /// |
| /// Unfortunately, this isn't always possible because the faster regex engines |
| /// don't support all of the regex features in `regex-syntax`. This notably |
| /// includes (and is currently limited to) Unicode word boundaries. So if |
| /// your pattern has Unicode word boundaries, you typically can't use a |
| /// DFA-based regex engine at all (unless you [enable heuristic support for |
| /// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass |
| /// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for |
| /// anchored searches only, but in a cruel sort of joke, many Unicode features |
| /// tend to result in making the regex _not_ one-pass.) |
| /// |
| /// # Example |
| /// |
| /// This example shows that the `PikeVM` implements Unicode word boundaries |
| /// correctly by default. |
| /// |
| /// ``` |
| /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| /// |
| /// let re = PikeVM::new(r"\b\w+\b")?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// let mut it = re.find_iter(&mut cache, "Шерлок Холмс"); |
| /// assert_eq!(Some(Match::must(0, 0..12)), it.next()); |
| /// assert_eq!(Some(Match::must(0, 13..23)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[derive(Clone, Debug)] |
| pub struct PikeVM { |
| config: Config, |
| nfa: NFA, |
| } |
| |
| impl PikeVM { |
| /// Parse the given regular expression using the default configuration and |
| /// return the corresponding `PikeVM`. |
| /// |
| /// If you want a non-default configuration, then use the [`Builder`] to |
| /// set your own configuration. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| /// |
| /// let re = PikeVM::new("foo[0-9]+bar")?; |
| /// let mut cache = re.create_cache(); |
| /// assert_eq!( |
| /// Some(Match::must(0, 3..14)), |
| /// re.find_iter(&mut cache, "zzzfoo12345barzzz").next(), |
| /// ); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[cfg(feature = "syntax")] |
| pub fn new(pattern: &str) -> Result<PikeVM, BuildError> { |
| PikeVM::builder().build(pattern) |
| } |
| |
| /// Like `new`, but parses multiple patterns into a single "multi regex." |
| /// This similarly uses the default regex configuration. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| /// |
| /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); |
| /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); |
| /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); |
| /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); |
| /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); |
| /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); |
| /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[cfg(feature = "syntax")] |
| pub fn new_many<P: AsRef<str>>( |
| patterns: &[P], |
| ) -> Result<PikeVM, BuildError> { |
| PikeVM::builder().build_many(patterns) |
| } |
| |
| /// Like `new`, but builds a PikeVM directly from an NFA. This is useful |
| /// if you already have an NFA, or even if you hand-assembled the NFA. |
| /// |
| /// # Example |
| /// |
| /// This shows how to hand assemble a regular expression via its HIR, |
| /// compile an NFA from it and build a PikeVM from the NFA. |
| /// |
| /// ``` |
| /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
| /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; |
| /// |
| /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ |
| /// ClassBytesRange::new(b'0', b'9'), |
| /// ClassBytesRange::new(b'A', b'Z'), |
| /// ClassBytesRange::new(b'_', b'_'), |
| /// ClassBytesRange::new(b'a', b'z'), |
| /// ]))); |
| /// |
| /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
| /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; |
| /// |
| /// let re = PikeVM::new_from_nfa(nfa)?; |
| /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| /// let expected = Some(Match::must(0, 3..4)); |
| /// re.captures(&mut cache, "!@#A#@!", &mut caps); |
| /// assert_eq!(expected, caps.get_match()); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> { |
| PikeVM::builder().build_from_nfa(nfa) |
| } |
| |
| /// Create a new `PikeVM` that matches every input. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| /// |
| /// let re = PikeVM::always_match()?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// let expected = Match::must(0, 0..0); |
| /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next()); |
| /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next()); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn always_match() -> Result<PikeVM, BuildError> { |
| let nfa = thompson::NFA::always_match(); |
| PikeVM::new_from_nfa(nfa) |
| } |
| |
| /// Create a new `PikeVM` that never matches any input. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| /// |
| /// let re = PikeVM::never_match()?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// assert_eq!(None, re.find_iter(&mut cache, "").next()); |
| /// assert_eq!(None, re.find_iter(&mut cache, "foo").next()); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn never_match() -> Result<PikeVM, BuildError> { |
| let nfa = thompson::NFA::never_match(); |
| PikeVM::new_from_nfa(nfa) |
| } |
| |
| /// Return a default configuration for a `PikeVM`. |
| /// |
| /// This is a convenience routine to avoid needing to import the `Config` |
| /// type when customizing the construction of a `PikeVM`. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to disable UTF-8 mode. When UTF-8 mode is |
| /// disabled, zero-width matches that split a codepoint are allowed. |
| /// Otherwise they are never reported. |
| /// |
| /// In the code below, notice that `""` is permitted to match positions |
| /// that split the encoding of a codepoint. |
| /// |
| /// ``` |
| /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match}; |
| /// |
| /// let re = PikeVM::builder() |
| /// .thompson(thompson::Config::new().utf8(false)) |
| /// .build(r"")?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// let haystack = "a☃z"; |
| /// let mut it = re.find_iter(&mut cache, haystack); |
| /// assert_eq!(Some(Match::must(0, 0..0)), it.next()); |
| /// assert_eq!(Some(Match::must(0, 1..1)), it.next()); |
| /// assert_eq!(Some(Match::must(0, 2..2)), it.next()); |
| /// assert_eq!(Some(Match::must(0, 3..3)), it.next()); |
| /// assert_eq!(Some(Match::must(0, 4..4)), it.next()); |
| /// assert_eq!(Some(Match::must(0, 5..5)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn config() -> Config { |
| Config::new() |
| } |
| |
| /// Return a builder for configuring the construction of a `PikeVM`. |
| /// |
| /// This is a convenience routine to avoid needing to import the |
| /// [`Builder`] type in common cases. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to use the builder to disable UTF-8 mode |
| /// everywhere. |
| /// |
| /// ``` |
| /// use regex_automata::{ |
| /// nfa::thompson::{self, pikevm::PikeVM}, |
| /// util::syntax, |
| /// Match, |
| /// }; |
| /// |
| /// let re = PikeVM::builder() |
| /// .syntax(syntax::Config::new().utf8(false)) |
| /// .thompson(thompson::Config::new().utf8(false)) |
| /// .build(r"foo(?-u:[^b])ar.*")?; |
| /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| /// |
| /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; |
| /// let expected = Some(Match::must(0, 1..9)); |
| /// re.captures(&mut cache, haystack, &mut caps); |
| /// assert_eq!(expected, caps.get_match()); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn builder() -> Builder { |
| Builder::new() |
| } |
| |
| /// Create a new empty set of capturing groups that is guaranteed to be |
| /// valid for the search APIs on this `PikeVM`. |
| /// |
| /// A `Captures` value created for a specific `PikeVM` cannot be used with |
| /// any other `PikeVM`. |
| /// |
| /// This is a convenience function for [`Captures::all`]. See the |
| /// [`Captures`] documentation for an explanation of its alternative |
| /// constructors that permit the `PikeVM` to do less work during a search, |
| /// and thus might make it faster. |
| pub fn create_captures(&self) -> Captures { |
| Captures::all(self.get_nfa().group_info().clone()) |
| } |
| |
| /// Create a new cache for this `PikeVM`. |
| /// |
| /// The cache returned should only be used for searches for this |
| /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then |
| /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently, |
| /// [`PikeVM::reset_cache`]). |
| pub fn create_cache(&self) -> Cache { |
| Cache::new(self) |
| } |
| |
| /// Reset the given cache such that it can be used for searching with the |
| /// this `PikeVM` (and only this `PikeVM`). |
| /// |
| /// A cache reset permits reusing memory already allocated in this cache |
| /// with a different `PikeVM`. |
| /// |
| /// # Example |
| /// |
| /// This shows how to re-purpose a cache for use with a different `PikeVM`. |
| /// |
| /// ``` |
| /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| /// |
| /// let re1 = PikeVM::new(r"\w")?; |
| /// let re2 = PikeVM::new(r"\W")?; |
| /// |
| /// let mut cache = re1.create_cache(); |
| /// assert_eq!( |
| /// Some(Match::must(0, 0..2)), |
| /// re1.find_iter(&mut cache, "Δ").next(), |
| /// ); |
| /// |
| /// // Using 'cache' with re2 is not allowed. It may result in panics or |
| /// // incorrect results. In order to re-purpose the cache, we must reset |
| /// // it with the PikeVM we'd like to use it with. |
| /// // |
| /// // Similarly, after this reset, using the cache with 're1' is also not |
| /// // allowed. |
| /// re2.reset_cache(&mut cache); |
| /// assert_eq!( |
| /// Some(Match::must(0, 0..3)), |
| /// re2.find_iter(&mut cache, "☃").next(), |
| /// ); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn reset_cache(&self, cache: &mut Cache) { |
| cache.reset(self); |
| } |
| |
| /// Returns the total number of patterns compiled into this `PikeVM`. |
| /// |
| /// In the case of a `PikeVM` that contains no patterns, this returns `0`. |
| /// |
| /// # Example |
| /// |
| /// This example shows the pattern length for a `PikeVM` that never |
| /// matches: |
| /// |
| /// ``` |
| /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| /// |
| /// let re = PikeVM::never_match()?; |
| /// assert_eq!(re.pattern_len(), 0); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// And another example for a `PikeVM` that matches at every position: |
| /// |
| /// ``` |
| /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| /// |
| /// let re = PikeVM::always_match()?; |
| /// assert_eq!(re.pattern_len(), 1); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// And finally, a `PikeVM` that was constructed from multiple patterns: |
| /// |
| /// ``` |
| /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| /// |
| /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; |
| /// assert_eq!(re.pattern_len(), 3); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn pattern_len(&self) -> usize { |
| self.nfa.pattern_len() |
| } |
| |
| /// Return the config for this `PikeVM`. |
| #[inline] |
| pub fn get_config(&self) -> &Config { |
| &self.config |
| } |
| |
| /// Returns a reference to the underlying NFA. |
| #[inline] |
| pub fn get_nfa(&self) -> &NFA { |
| &self.nfa |
| } |
| } |
| |
| impl PikeVM { |
| /// Returns true if and only if this `PikeVM` matches the given haystack. |
| /// |
| /// This routine may short circuit if it knows that scanning future |
| /// input will never lead to a different result. In particular, if the |
| /// underlying NFA enters a match state, then this routine will return |
| /// `true` immediately without inspecting any future input. (Consider how |
| /// this might make a difference given the regex `a+` on the haystack |
| /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, |
| /// but routines like `find` need to continue searching because `+` is |
| /// greedy by default.) |
| /// |
| /// # Example |
| /// |
| /// This shows basic usage: |
| /// |
| /// ``` |
| /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| /// |
| /// let re = PikeVM::new("foo[0-9]+bar")?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// assert!(re.is_match(&mut cache, "foo12345bar")); |
| /// assert!(!re.is_match(&mut cache, "foobar")); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// # Example: consistency with search APIs |
| /// |
| /// `is_match` is guaranteed to return `true` whenever `find` returns a |
| /// match. This includes searches that are executed entirely within a |
| /// codepoint: |
| /// |
| /// ``` |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input}; |
| /// |
| /// let re = PikeVM::new("a*")?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2))); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// Notice that when UTF-8 mode is disabled, then the above reports a |
| /// match because the restriction against zero-width matches that split a |
| /// codepoint has been lifted: |
| /// |
| /// ``` |
| /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input}; |
| /// |
| /// let re = PikeVM::builder() |
| /// .thompson(NFA::config().utf8(false)) |
| /// .build("a*")?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2))); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[inline] |
| pub fn is_match<'h, I: Into<Input<'h>>>( |
| &self, |
| cache: &mut Cache, |
| input: I, |
| ) -> bool { |
| let input = input.into().earliest(true); |
| self.search_slots(cache, &input, &mut []).is_some() |
| } |
| |
| /// Executes a leftmost forward search and returns a `Match` if one exists. |
| /// |
| /// This routine only includes the overall match span. To get access to the |
| /// individual spans of each capturing group, use [`PikeVM::captures`]. |
| /// |
| /// # Example |
| /// |
| /// Leftmost first match semantics corresponds to the match with the |
| /// smallest starting offset, but where the end offset is determined by |
| /// preferring earlier branches in the original regular expression. For |
| /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` |
| /// will match `Samwise` in `Samwise`. |
| /// |
| /// Generally speaking, the "leftmost first" match is how most backtracking |
| /// regular expressions tend to work. This is in contrast to POSIX-style |
| /// regular expressions that yield "leftmost longest" matches. Namely, |
| /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using |
| /// leftmost longest semantics. (This crate does not currently support |
| /// leftmost longest semantics.) |
| /// |
| /// ``` |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| /// |
| /// let re = PikeVM::new("foo[0-9]+")?; |
| /// let mut cache = re.create_cache(); |
| /// let expected = Match::must(0, 0..8); |
| /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345")); |
| /// |
| /// // Even though a match is found after reading the first byte (`a`), |
| /// // the leftmost first match semantics demand that we find the earliest |
| /// // match that prefers earlier parts of the pattern over later parts. |
| /// let re = PikeVM::new("abc|a")?; |
| /// let mut cache = re.create_cache(); |
| /// let expected = Match::must(0, 0..3); |
| /// assert_eq!(Some(expected), re.find(&mut cache, "abc")); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[inline] |
| pub fn find<'h, I: Into<Input<'h>>>( |
| &self, |
| cache: &mut Cache, |
| input: I, |
| ) -> Option<Match> { |
| let input = input.into(); |
| if self.get_nfa().pattern_len() == 1 { |
| let mut slots = [None, None]; |
| let pid = self.search_slots(cache, &input, &mut slots)?; |
| let start = slots[0]?.get(); |
| let end = slots[1]?.get(); |
| return Some(Match::new(pid, Span { start, end })); |
| } |
| let ginfo = self.get_nfa().group_info(); |
| let slots_len = ginfo.implicit_slot_len(); |
| let mut slots = vec![None; slots_len]; |
| let pid = self.search_slots(cache, &input, &mut slots)?; |
| let start = slots[pid.as_usize() * 2]?.get(); |
| let end = slots[pid.as_usize() * 2 + 1]?.get(); |
| Some(Match::new(pid, Span { start, end })) |
| } |
| |
| /// Executes a leftmost forward search and writes the spans of capturing |
| /// groups that participated in a match into the provided [`Captures`] |
| /// value. If no match was found, then [`Captures::is_match`] is guaranteed |
| /// to return `false`. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; |
| /// |
| /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; |
| /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| /// |
| /// re.captures(&mut cache, "2010-03-14", &mut caps); |
| /// assert!(caps.is_match()); |
| /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); |
| /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); |
| /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[inline] |
| pub fn captures<'h, I: Into<Input<'h>>>( |
| &self, |
| cache: &mut Cache, |
| input: I, |
| caps: &mut Captures, |
| ) { |
| self.search(cache, &input.into(), caps) |
| } |
| |
| /// Returns an iterator over all non-overlapping leftmost matches in the |
| /// given bytes. If no match exists, then the iterator yields no elements. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| /// |
| /// let re = PikeVM::new("foo[0-9]+")?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// let text = "foo1 foo12 foo123"; |
| /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); |
| /// assert_eq!(matches, vec![ |
| /// Match::must(0, 0..4), |
| /// Match::must(0, 5..10), |
| /// Match::must(0, 11..17), |
| /// ]); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[inline] |
| pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( |
| &'r self, |
| cache: &'c mut Cache, |
| input: I, |
| ) -> FindMatches<'r, 'c, 'h> { |
| let caps = Captures::matches(self.get_nfa().group_info().clone()); |
| let it = iter::Searcher::new(input.into()); |
| FindMatches { re: self, cache, caps, it } |
| } |
| |
| /// Returns an iterator over all non-overlapping `Captures` values. If no |
| /// match exists, then the iterator yields no elements. |
| /// |
| /// This yields the same matches as [`PikeVM::find_iter`], but it includes |
| /// the spans of all capturing groups that participate in each match. |
| /// |
| /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for |
| /// how to correctly iterate over all matches in a haystack while avoiding |
| /// the creation of a new `Captures` value for every match. (Which you are |
| /// forced to do with an `Iterator`.) |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; |
| /// |
| /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// let text = "foo1 foo12 foo123"; |
| /// let matches: Vec<Span> = re |
| /// .captures_iter(&mut cache, text) |
| /// // The unwrap is OK since 'numbers' matches if the pattern matches. |
| /// .map(|caps| caps.get_group_by_name("numbers").unwrap()) |
| /// .collect(); |
| /// assert_eq!(matches, vec![ |
| /// Span::from(3..4), |
| /// Span::from(8..10), |
| /// Span::from(14..17), |
| /// ]); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[inline] |
| pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( |
| &'r self, |
| cache: &'c mut Cache, |
| input: I, |
| ) -> CapturesMatches<'r, 'c, 'h> { |
| let caps = self.create_captures(); |
| let it = iter::Searcher::new(input.into()); |
| CapturesMatches { re: self, cache, caps, it } |
| } |
| } |
| |
| impl PikeVM { |
| /// Executes a leftmost forward search and writes the spans of capturing |
| /// groups that participated in a match into the provided [`Captures`] |
| /// value. If no match was found, then [`Captures::is_match`] is guaranteed |
| /// to return `false`. |
| /// |
| /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input` |
| /// instead of an `Into<Input>`. |
| /// |
| /// # Example: specific pattern search |
| /// |
| /// This example shows how to build a multi-PikeVM that permits searching |
| /// for specific patterns. |
| /// |
| /// ``` |
| /// use regex_automata::{ |
| /// nfa::thompson::pikevm::PikeVM, |
| /// Anchored, Match, PatternID, Input, |
| /// }; |
| /// |
| /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; |
| /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| /// let haystack = "foo123"; |
| /// |
| /// // Since we are using the default leftmost-first match and both |
| /// // patterns match at the same starting position, only the first pattern |
| /// // will be returned in this case when doing a search for any of the |
| /// // patterns. |
| /// let expected = Some(Match::must(0, 0..6)); |
| /// re.search(&mut cache, &Input::new(haystack), &mut caps); |
| /// assert_eq!(expected, caps.get_match()); |
| /// |
| /// // But if we want to check whether some other pattern matches, then we |
| /// // can provide its pattern ID. |
| /// let expected = Some(Match::must(1, 0..6)); |
| /// let input = Input::new(haystack) |
| /// .anchored(Anchored::Pattern(PatternID::must(1))); |
| /// re.search(&mut cache, &input, &mut caps); |
| /// assert_eq!(expected, caps.get_match()); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// # Example: specifying the bounds of a search |
| /// |
| /// This example shows how providing the bounds of a search can produce |
| /// different results than simply sub-slicing the haystack. |
| /// |
| /// ``` |
| /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; |
| /// |
| /// let re = PikeVM::new(r"\b[0-9]{3}\b")?; |
| /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| /// let haystack = "foo123bar"; |
| /// |
| /// // Since we sub-slice the haystack, the search doesn't know about |
| /// // the larger context and assumes that `123` is surrounded by word |
| /// // boundaries. And of course, the match position is reported relative |
| /// // to the sub-slice as well, which means we get `0..3` instead of |
| /// // `3..6`. |
| /// let expected = Some(Match::must(0, 0..3)); |
| /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps); |
| /// assert_eq!(expected, caps.get_match()); |
| /// |
| /// // But if we provide the bounds of the search within the context of the |
| /// // entire haystack, then the search can take the surrounding context |
| /// // into account. (And if we did find a match, it would be reported |
| /// // as a valid offset into `haystack` instead of its sub-slice.) |
| /// let expected = None; |
| /// let input = Input::new(haystack).range(3..6); |
| /// re.search(&mut cache, &input, &mut caps); |
| /// assert_eq!(expected, caps.get_match()); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[inline] |
| pub fn search( |
| &self, |
| cache: &mut Cache, |
| input: &Input<'_>, |
| caps: &mut Captures, |
| ) { |
| caps.set_pattern(None); |
| let pid = self.search_slots(cache, input, caps.slots_mut()); |
| caps.set_pattern(pid); |
| } |
| |
| /// Executes a leftmost forward search and writes the spans of capturing |
| /// groups that participated in a match into the provided `slots`, and |
| /// returns the matching pattern ID. The contents of the slots for patterns |
| /// other than the matching pattern are unspecified. If no match was found, |
| /// then `None` is returned and the contents of `slots` is unspecified. |
| /// |
| /// This is like [`PikeVM::search`], but it accepts a raw slots slice |
| /// instead of a `Captures` value. This is useful in contexts where you |
| /// don't want or need to allocate a `Captures`. |
| /// |
| /// It is legal to pass _any_ number of slots to this routine. If the regex |
| /// engine would otherwise write a slot offset that doesn't fit in the |
| /// provided slice, then it is simply skipped. In general though, there are |
| /// usually three slice lengths you might want to use: |
| /// |
| /// * An empty slice, if you only care about which pattern matched. |
| /// * A slice with |
| /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) |
| /// slots, if you only care about the overall match spans for each matching |
| /// pattern. |
| /// * A slice with |
| /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which |
| /// permits recording match offsets for every capturing group in every |
| /// pattern. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to find the overall match offsets in a |
| /// multi-pattern search without allocating a `Captures` value. Indeed, we |
| /// can put our slots right on the stack. |
| /// |
| /// ``` |
| /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input}; |
| /// |
| /// let re = PikeVM::new_many(&[ |
| /// r"\pL+", |
| /// r"\d+", |
| /// ])?; |
| /// let mut cache = re.create_cache(); |
| /// let input = Input::new("!@#123"); |
| /// |
| /// // We only care about the overall match offsets here, so we just |
| /// // allocate two slots for each pattern. Each slot records the start |
| /// // and end of the match. |
| /// let mut slots = [None; 4]; |
| /// let pid = re.search_slots(&mut cache, &input, &mut slots); |
| /// assert_eq!(Some(PatternID::must(1)), pid); |
| /// |
| /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. |
| /// // See 'GroupInfo' for more details on the mapping between groups and |
| /// // slot indices. |
| /// let slot_start = pid.unwrap().as_usize() * 2; |
| /// let slot_end = slot_start + 1; |
| /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); |
| /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[inline] |
| pub fn search_slots( |
| &self, |
| cache: &mut Cache, |
| input: &Input<'_>, |
| slots: &mut [Option<NonMaxUsize>], |
| ) -> Option<PatternID> { |
| let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| if !utf8empty { |
| let hm = self.search_slots_imp(cache, input, slots)?; |
| return Some(hm.pattern()); |
| } |
| // There is an unfortunate special case where if the regex can |
| // match the empty string and UTF-8 mode is enabled, the search |
| // implementation requires that the slots have at least as much space |
| // to report the bounds of any match. This is so zero-width matches |
| // that split a codepoint can be filtered out. |
| // |
| // Note that if utf8empty is true, we specialize the case for when |
| // the number of patterns is 1. In that case, we can just use a stack |
| // allocation. Otherwise we resort to a heap allocation, which we |
| // convince ourselves we're fine with due to the pathological nature of |
| // this case. |
| let min = self.get_nfa().group_info().implicit_slot_len(); |
| if slots.len() >= min { |
| let hm = self.search_slots_imp(cache, input, slots)?; |
| return Some(hm.pattern()); |
| } |
| if self.get_nfa().pattern_len() == 1 { |
| let mut enough = [None, None]; |
| let got = self.search_slots_imp(cache, input, &mut enough); |
| // This is OK because we know `enough` is strictly bigger than |
| // `slots`, otherwise this special case isn't reached. |
| slots.copy_from_slice(&enough[..slots.len()]); |
| return got.map(|hm| hm.pattern()); |
| } |
| let mut enough = vec![None; min]; |
| let got = self.search_slots_imp(cache, input, &mut enough); |
| // This is OK because we know `enough` is strictly bigger than `slots`, |
| // otherwise this special case isn't reached. |
| slots.copy_from_slice(&enough[..slots.len()]); |
| got.map(|hm| hm.pattern()) |
| } |
| |
| /// This is the actual implementation of `search_slots_imp` that |
| /// doesn't account for the special case when 1) the NFA has UTF-8 mode |
| /// enabled, 2) the NFA can match the empty string and 3) the caller has |
| /// provided an insufficient number of slots to record match offsets. |
| #[inline(never)] |
| fn search_slots_imp( |
| &self, |
| cache: &mut Cache, |
| input: &Input<'_>, |
| slots: &mut [Option<NonMaxUsize>], |
| ) -> Option<HalfMatch> { |
| let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| let hm = match self.search_imp(cache, input, slots) { |
| None => return None, |
| Some(hm) if !utf8empty => return Some(hm), |
| Some(hm) => hm, |
| }; |
| empty::skip_splits_fwd(input, hm, hm.offset(), |input| { |
| Ok(self |
| .search_imp(cache, input, slots) |
| .map(|hm| (hm, hm.offset()))) |
| }) |
| // OK because the PikeVM never errors. |
| .unwrap() |
| } |
| |
| /// Writes the set of patterns that match anywhere in the given search |
| /// configuration to `patset`. If multiple patterns match at the same |
| /// position and this `PikeVM` was configured with [`MatchKind::All`] |
| /// semantics, then all matching patterns are written to the given set. |
| /// |
| /// Unless all of the patterns in this `PikeVM` are anchored, then |
| /// generally speaking, this will visit every byte in the haystack. |
| /// |
| /// This search routine *does not* clear the pattern set. This gives some |
| /// flexibility to the caller (e.g., running multiple searches with the |
| /// same pattern set), but does make the API bug-prone if you're reusing |
| /// the same pattern set for multiple searches but intended them to be |
| /// independent. |
| /// |
| /// If a pattern ID matched but the given `PatternSet` does not have |
| /// sufficient capacity to store it, then it is not inserted and silently |
| /// dropped. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to find all matching patterns in a haystack, |
| /// even when some patterns match at the same position as other patterns. |
| /// |
| /// ``` |
| /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| /// use regex_automata::{ |
| /// nfa::thompson::pikevm::PikeVM, |
| /// Input, MatchKind, PatternSet, |
| /// }; |
| /// |
| /// let patterns = &[ |
| /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", |
| /// ]; |
| /// let re = PikeVM::builder() |
| /// .configure(PikeVM::config().match_kind(MatchKind::All)) |
| /// .build_many(patterns)?; |
| /// let mut cache = re.create_cache(); |
| /// |
| /// let input = Input::new("foobar"); |
| /// let mut patset = PatternSet::new(re.pattern_len()); |
| /// re.which_overlapping_matches(&mut cache, &input, &mut patset); |
| /// let expected = vec![0, 2, 3, 4, 6]; |
| /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); |
| /// assert_eq!(expected, got); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[inline] |
| pub fn which_overlapping_matches( |
| &self, |
| cache: &mut Cache, |
| input: &Input<'_>, |
| patset: &mut PatternSet, |
| ) { |
| self.which_overlapping_imp(cache, input, patset) |
| } |
| } |
| |
| impl PikeVM { |
| /// The implementation of standard leftmost search. |
| /// |
| /// Capturing group spans are written to `slots`, but only if requested. |
| /// `slots` can be any length. Any slot in the NFA that is activated but |
| /// which is out of bounds for the given `slots` is ignored. |
| fn search_imp( |
| &self, |
| cache: &mut Cache, |
| input: &Input<'_>, |
| slots: &mut [Option<NonMaxUsize>], |
| ) -> Option<HalfMatch> { |
| cache.setup_search(slots.len()); |
| if input.is_done() { |
| return None; |
| } |
| // Why do we even care about this? Well, in our 'Captures' |
| // representation, we use usize::MAX as a sentinel to indicate "no |
| // match." This isn't problematic so long as our haystack doesn't have |
| // a maximal length. Byte slices are guaranteed by Rust to have a |
| // length that fits into isize, and so this assert should always pass. |
| // But we put it here to make our assumption explicit. |
| assert!( |
| input.haystack().len() < core::usize::MAX, |
| "byte slice lengths must be less than usize MAX", |
| ); |
| instrument!(|c| c.reset(&self.nfa)); |
| |
| // Whether we want to visit all match states instead of emulating the |
| // 'leftmost' semantics of typical backtracking regex engines. |
| let allmatches = |
| self.config.get_match_kind().continue_past_first_match(); |
| let (anchored, start_id) = match self.start_config(input) { |
| None => return None, |
| Some(config) => config, |
| }; |
| |
| let pre = |
| if anchored { None } else { self.get_config().get_prefilter() }; |
| let Cache { ref mut stack, ref mut curr, ref mut next } = cache; |
| let mut hm = None; |
| // Yes, our search doesn't end at input.end(), but includes it. This |
| // is necessary because matches are delayed by one byte, just like |
| // how the DFA engines work. The delay is used to handle look-behind |
| // assertions. In the case of the PikeVM, the delay is implemented |
| // by not considering a match to exist until it is visited in |
| // 'steps'. Technically, we know a match exists in the previous |
| // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA |
| // determinization. We don't mark a DFA state as a match state if it |
| // contains an NFA match state, but rather, whether the DFA state was |
| // generated by a transition from a DFA state that contains an NFA |
| // match state.) |
| let mut at = input.start(); |
| while at <= input.end() { |
| // If we have no states left to visit, then there are some cases |
| // where we know we can quit early or even skip ahead. |
| if curr.set.is_empty() { |
| // We have a match and we haven't been instructed to continue |
| // on even after finding a match, so we can quit. |
| if hm.is_some() && !allmatches { |
| break; |
| } |
| // If we're running an anchored search and we've advanced |
| // beyond the start position with no other states to try, then |
| // we will never observe a match and thus can stop. |
| if anchored && at > input.start() { |
| break; |
| } |
| // If there no states left to explore at this position and we |
| // know we can't terminate early, then we are effectively at |
| // the starting state of the NFA. If we fell through here, |
| // we'd end up adding our '(?s-u:.)*?' prefix and it would be |
| // the only thing in 'curr'. So we might as well just skip |
| // ahead until we find something that we know might advance us |
| // forward. |
| if let Some(ref pre) = pre { |
| let span = Span::from(at..input.end()); |
| match pre.find(input.haystack(), span) { |
| None => break, |
| Some(ref span) => at = span.start, |
| } |
| } |
| } |
| // Instead of using the NFA's unanchored start state, we actually |
| // always use its anchored starting state. As a result, when doing |
| // an unanchored search, we need to simulate our own '(?s-u:.)*?' |
| // prefix, to permit a match to appear anywhere. |
| // |
| // Now, we don't *have* to do things this way. We could use the |
| // NFA's unanchored starting state and do one 'epsilon_closure' |
| // call from that starting state before the main loop here. And |
| // that is just as correct. However, it turns out to be slower |
| // than our approach here because it slightly increases the cost |
| // of processing each byte by requiring us to visit more NFA |
| // states to deal with the additional NFA states in the unanchored |
| // prefix. By simulating it explicitly here, we lower those costs |
| // substantially. The cost is itself small, but it adds up for |
| // large haystacks. |
| // |
| // In order to simulate the '(?s-u:.)*?' prefix---which is not |
| // greedy---we are careful not to perform an epsilon closure on |
| // the start state if we already have a match. Namely, if we |
| // did otherwise, we would never reach a terminating condition |
| // because there would always be additional states to process. |
| // In effect, the exclusion of running 'epsilon_closure' when |
| // we have a match corresponds to the "dead" states we have in |
| // our DFA regex engines. Namely, in a DFA, match states merely |
| // instruct the search execution to record the current offset as |
| // the most recently seen match. It is the dead state that actually |
| // indicates when to stop the search (other than EOF or quit |
| // states). |
| // |
| // However, when 'allmatches' is true, the caller has asked us to |
| // leave in every possible match state. This tends not to make a |
| // whole lot of sense in unanchored searches, because it means the |
| // search really cannot terminate until EOF. And often, in that |
| // case, you wind up skipping over a bunch of matches and are left |
| // with the "last" match. Arguably, it just doesn't make a lot of |
| // sense to run a 'leftmost' search (which is what this routine is) |
| // with 'allmatches' set to true. But the DFAs support it and this |
| // matches their behavior. (Generally, 'allmatches' is useful for |
| // overlapping searches or leftmost anchored searches to find the |
| // longest possible match by ignoring match priority.) |
| // |
| // Additionally, when we're running an anchored search, this |
| // epsilon closure should only be computed at the beginning of the |
| // search. If we re-computed it at every position, we would be |
| // simulating an unanchored search when we were tasked to perform |
| // an anchored search. |
| if (!hm.is_some() || allmatches) |
| && (!anchored || at == input.start()) |
| { |
| // Since we are adding to the 'curr' active states and since |
| // this is for the start ID, we use a slots slice that is |
| // guaranteed to have the right length but where every element |
| // is absent. This is exactly what we want, because this |
| // epsilon closure is responsible for simulating an unanchored |
| // '(?s:.)*?' prefix. It is specifically outside of any |
| // capturing groups, and thus, using slots that are always |
| // absent is correct. |
| // |
| // Note though that we can't just use '&mut []' here, since |
| // this epsilon closure may traverse through 'Captures' epsilon |
| // transitions, and thus must be able to write offsets to the |
| // slots given which are later copied to slot values in 'curr'. |
| let slots = next.slot_table.all_absent(); |
| self.epsilon_closure(stack, slots, curr, input, at, start_id); |
| } |
| if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) |
| { |
| hm = Some(HalfMatch::new(pid, at)); |
| } |
| // Unless the caller asked us to return early, we need to mush on |
| // to see if we can extend our match. (But note that 'nexts' will |
| // quit right after seeing a match when match_kind==LeftmostFirst, |
| // as is consistent with leftmost-first match priority.) |
| if input.get_earliest() && hm.is_some() { |
| break; |
| } |
| core::mem::swap(curr, next); |
| next.set.clear(); |
| at += 1; |
| } |
| instrument!(|c| c.eprint(&self.nfa)); |
| hm |
| } |
| |
| /// The implementation for the 'which_overlapping_matches' API. Basically, |
| /// we do a single scan through the entire haystack (unless our regex |
| /// or search is anchored) and record every pattern that matched. In |
| /// particular, when MatchKind::All is used, this supports overlapping |
| /// matches. So if we have the regexes 'sam' and 'samwise', they will |
| /// *both* be reported in the pattern set when searching the haystack |
| /// 'samwise'. |
| fn which_overlapping_imp( |
| &self, |
| cache: &mut Cache, |
| input: &Input<'_>, |
| patset: &mut PatternSet, |
| ) { |
| // NOTE: This is effectively a copy of 'search_imp' above, but with no |
| // captures support and instead writes patterns that matched directly |
| // to 'patset'. See that routine for better commentary about what's |
| // going on in this routine. We probably could unify the routines using |
| // generics or more helper routines, but I'm not sure it's worth it. |
| // |
| // NOTE: We somewhat go out of our way here to support things like |
| // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither |
| // of those seem particularly relevant to this routine, but they are |
| // both supported by the DFA analogs of this routine by construction |
| // and composition, so it seems like good sense to have the PikeVM |
| // match that behavior. |
| |
| cache.setup_search(0); |
| if input.is_done() { |
| return; |
| } |
| assert!( |
| input.haystack().len() < core::usize::MAX, |
| "byte slice lengths must be less than usize MAX", |
| ); |
| instrument!(|c| c.reset(&self.nfa)); |
| |
| let allmatches = |
| self.config.get_match_kind().continue_past_first_match(); |
| let (anchored, start_id) = match self.start_config(input) { |
| None => return, |
| Some(config) => config, |
| }; |
| |
| let Cache { ref mut stack, ref mut curr, ref mut next } = cache; |
| for at in input.start()..=input.end() { |
| let any_matches = !patset.is_empty(); |
| if curr.set.is_empty() { |
| if any_matches && !allmatches { |
| break; |
| } |
| if anchored && at > input.start() { |
| break; |
| } |
| } |
| if !any_matches || allmatches { |
| let slots = &mut []; |
| self.epsilon_closure(stack, slots, curr, input, at, start_id); |
| } |
| self.nexts_overlapping(stack, curr, next, input, at, patset); |
| // If we found a match and filled our set, then there is no more |
| // additional info that we can provide. Thus, we can quit. We also |
| // quit if the caller asked us to stop at the earliest point that |
| // we know a match exists. |
| if patset.is_full() || input.get_earliest() { |
| break; |
| } |
| core::mem::swap(curr, next); |
| next.set.clear(); |
| } |
| instrument!(|c| c.eprint(&self.nfa)); |
| } |
| |
| /// Process the active states in 'curr' to find the states (written to |
| /// 'next') we should process for the next byte in the haystack. |
| /// |
| /// 'stack' is used to perform a depth first traversal of the NFA when |
| /// computing an epsilon closure. |
| /// |
| /// When a match is found, the slots for that match state (in 'curr') are |
| /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr' |
| /// stops (unless the PikeVM was configured with MatchKind::All semantics). |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn nexts( |
| &self, |
| stack: &mut Vec<FollowEpsilon>, |
| curr: &mut ActiveStates, |
| next: &mut ActiveStates, |
| input: &Input<'_>, |
| at: usize, |
| slots: &mut [Option<NonMaxUsize>], |
| ) -> Option<PatternID> { |
| instrument!(|c| c.record_state_set(&curr.set)); |
| let mut pid = None; |
| let ActiveStates { ref set, ref mut slot_table } = *curr; |
| for sid in set.iter() { |
| pid = match self.next(stack, slot_table, next, input, at, sid) { |
| None => continue, |
| Some(pid) => Some(pid), |
| }; |
| slots.copy_from_slice(slot_table.for_state(sid)); |
| if !self.config.get_match_kind().continue_past_first_match() { |
| break; |
| } |
| } |
| pid |
| } |
| |
| /// Like 'nexts', but for the overlapping case. This doesn't write any |
| /// slots, and instead just writes which pattern matched in 'patset'. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn nexts_overlapping( |
| &self, |
| stack: &mut Vec<FollowEpsilon>, |
| curr: &mut ActiveStates, |
| next: &mut ActiveStates, |
| input: &Input<'_>, |
| at: usize, |
| patset: &mut PatternSet, |
| ) { |
| instrument!(|c| c.record_state_set(&curr.set)); |
| let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| let ActiveStates { ref set, ref mut slot_table } = *curr; |
| for sid in set.iter() { |
| let pid = match self.next(stack, slot_table, next, input, at, sid) |
| { |
| None => continue, |
| Some(pid) => pid, |
| }; |
| // This handles the case of finding a zero-width match that splits |
| // a codepoint. Namely, if we're in UTF-8 mode AND we know we can |
| // match the empty string, then the only valid way of getting to |
| // this point with an offset that splits a codepoint is when we |
| // have an empty match. Such matches, in UTF-8 mode, must not be |
| // reported. So we just skip them here and pretend as if we did |
| // not see a match. |
| if utf8empty && !input.is_char_boundary(at) { |
| continue; |
| } |
| let _ = patset.try_insert(pid); |
| if !self.config.get_match_kind().continue_past_first_match() { |
| break; |
| } |
| } |
| } |
| |
| /// Starting from 'sid', if the position 'at' in the 'input' haystack has a |
| /// transition defined out of 'sid', then add the state transitioned to and |
| /// its epsilon closure to the 'next' set of states to explore. |
| /// |
| /// 'stack' is used by the epsilon closure computation to perform a depth |
| /// first traversal of the NFA. |
| /// |
| /// 'curr_slot_table' should be the table of slots for the current set of |
| /// states being explored. If there is a transition out of 'sid', then |
| /// sid's row in the slot table is used to perform the epsilon closure. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn next( |
| &self, |
| stack: &mut Vec<FollowEpsilon>, |
| curr_slot_table: &mut SlotTable, |
| next: &mut ActiveStates, |
| input: &Input<'_>, |
| at: usize, |
| sid: StateID, |
| ) -> Option<PatternID> { |
| instrument!(|c| c.record_step(sid)); |
| match *self.nfa.state(sid) { |
| State::Fail |
| | State::Look { .. } |
| | State::Union { .. } |
| | State::BinaryUnion { .. } |
| | State::Capture { .. } => None, |
| State::ByteRange { ref trans } => { |
| if trans.matches(input.haystack(), at) { |
| let slots = curr_slot_table.for_state(sid); |
| // OK because 'at <= haystack.len() < usize::MAX', so |
| // adding 1 will never wrap. |
| let at = at.wrapping_add(1); |
| self.epsilon_closure( |
| stack, slots, next, input, at, trans.next, |
| ); |
| } |
| None |
| } |
| State::Sparse(ref sparse) => { |
| if let Some(next_sid) = sparse.matches(input.haystack(), at) { |
| let slots = curr_slot_table.for_state(sid); |
| // OK because 'at <= haystack.len() < usize::MAX', so |
| // adding 1 will never wrap. |
| let at = at.wrapping_add(1); |
| self.epsilon_closure( |
| stack, slots, next, input, at, next_sid, |
| ); |
| } |
| None |
| } |
| State::Dense(ref dense) => { |
| if let Some(next_sid) = dense.matches(input.haystack(), at) { |
| let slots = curr_slot_table.for_state(sid); |
| // OK because 'at <= haystack.len() < usize::MAX', so |
| // adding 1 will never wrap. |
| let at = at.wrapping_add(1); |
| self.epsilon_closure( |
| stack, slots, next, input, at, next_sid, |
| ); |
| } |
| None |
| } |
| State::Match { pattern_id } => Some(pattern_id), |
| } |
| } |
| |
| /// Compute the epsilon closure of 'sid', writing the closure into 'next' |
| /// while copying slot values from 'curr_slots' into corresponding states |
| /// in 'next'. 'curr_slots' should be the slot values corresponding to |
| /// 'sid'. |
| /// |
| /// The given 'stack' is used to perform a depth first traversal of the |
| /// NFA by recursively following all epsilon transitions out of 'sid'. |
| /// Conditional epsilon transitions are followed if and only if they are |
| /// satisfied for the position 'at' in the 'input' haystack. |
| /// |
| /// While this routine may write to 'curr_slots', once it returns, any |
| /// writes are undone and the original values (even if absent) are |
| /// restored. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn epsilon_closure( |
| &self, |
| stack: &mut Vec<FollowEpsilon>, |
| curr_slots: &mut [Option<NonMaxUsize>], |
| next: &mut ActiveStates, |
| input: &Input<'_>, |
| at: usize, |
| sid: StateID, |
| ) { |
| instrument!(|c| { |
| c.record_closure(sid); |
| c.record_stack_push(sid); |
| }); |
| stack.push(FollowEpsilon::Explore(sid)); |
| while let Some(frame) = stack.pop() { |
| match frame { |
| FollowEpsilon::RestoreCapture { slot, offset: pos } => { |
| curr_slots[slot] = pos; |
| } |
| FollowEpsilon::Explore(sid) => { |
| self.epsilon_closure_explore( |
| stack, curr_slots, next, input, at, sid, |
| ); |
| } |
| } |
| } |
| } |
| |
| /// Explore all of the epsilon transitions out of 'sid'. This is mostly |
| /// split out from 'epsilon_closure' in order to clearly delineate |
| /// the actual work of computing an epsilon closure from the stack |
| /// book-keeping. |
| /// |
| /// This will push any additional explorations needed on to 'stack'. |
| /// |
| /// 'curr_slots' should refer to the slots for the currently active NFA |
| /// state. That is, the current state we are stepping through. These |
| /// slots are mutated in place as new 'Captures' states are traversed |
| /// during epsilon closure, but the slots are restored to their original |
| /// values once the full epsilon closure is completed. The ultimate use of |
| /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that |
| /// the capturing group spans are forwarded from the currently active state |
| /// to the next. |
| /// |
| /// 'next' refers to the next set of active states. Computing an epsilon |
| /// closure may increase the next set of active states. |
| /// |
| /// 'input' refers to the caller's input configuration and 'at' refers to |
| /// the current position in the haystack. These are used to check whether |
| /// conditional epsilon transitions (like look-around) are satisfied at |
| /// the current position. If they aren't, then the epsilon closure won't |
| /// include them. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn epsilon_closure_explore( |
| &self, |
| stack: &mut Vec<FollowEpsilon>, |
| curr_slots: &mut [Option<NonMaxUsize>], |
| next: &mut ActiveStates, |
| input: &Input<'_>, |
| at: usize, |
| mut sid: StateID, |
| ) { |
| // We can avoid pushing some state IDs on to our stack in precisely |
| // the cases where a 'push(x)' would be immediately followed by a 'x |
| // = pop()'. This is achieved by this outer-loop. We simply set 'sid' |
| // to be the next state ID we want to explore once we're done with |
| // our initial exploration. In practice, this avoids a lot of stack |
| // thrashing. |
| loop { |
| instrument!(|c| c.record_set_insert(sid)); |
| // Record this state as part of our next set of active states. If |
| // we've already explored it, then no need to do it again. |
| if !next.set.insert(sid) { |
| return; |
| } |
| match *self.nfa.state(sid) { |
| State::Fail |
| | State::Match { .. } |
| | State::ByteRange { .. } |
| | State::Sparse { .. } |
| | State::Dense { .. } => { |
| next.slot_table.for_state(sid).copy_from_slice(curr_slots); |
| return; |
| } |
| State::Look { look, next } => { |
| // OK because we don't permit building a searcher with a |
| // Unicode word boundary if the requisite Unicode data is |
| // unavailable. |
| if !self.nfa.look_matcher().matches_inline( |
| look, |
| input.haystack(), |
| at, |
| ) { |
| return; |
| } |
| sid = next; |
| } |
| State::Union { ref alternates } => { |
| sid = match alternates.get(0) { |
| None => return, |
| Some(&sid) => sid, |
| }; |
| instrument!(|c| { |
| for &alt in &alternates[1..] { |
| c.record_stack_push(alt); |
| } |
| }); |
| stack.extend( |
| alternates[1..] |
| .iter() |
| .copied() |
| .rev() |
| .map(FollowEpsilon::Explore), |
| ); |
| } |
| State::BinaryUnion { alt1, alt2 } => { |
| sid = alt1; |
| instrument!(|c| c.record_stack_push(sid)); |
| stack.push(FollowEpsilon::Explore(alt2)); |
| } |
| State::Capture { next, slot, .. } => { |
| // There's no need to do anything with slots that |
| // ultimately won't be copied into the caller-provided |
| // 'Captures' value. So we just skip dealing with them at |
| // all. |
| if slot.as_usize() < curr_slots.len() { |
| instrument!(|c| c.record_stack_push(sid)); |
| stack.push(FollowEpsilon::RestoreCapture { |
| slot, |
| offset: curr_slots[slot], |
| }); |
| // OK because length of a slice must fit into an isize. |
| curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap()); |
| } |
| sid = next; |
| } |
| } |
| } |
| } |
| |
| /// Return the starting configuration of a PikeVM search. |
| /// |
| /// The "start config" is basically whether the search should be anchored |
| /// or not and the NFA state ID at which to begin the search. The state ID |
| /// returned always corresponds to an anchored starting state even when the |
| /// search is unanchored. This is because the PikeVM search loop deals with |
| /// unanchored searches with an explicit epsilon closure out of the start |
| /// state. |
| /// |
| /// This routine accounts for both the caller's `Input` configuration |
| /// and the pattern itself. For example, even if the caller asks for an |
| /// unanchored search, if the pattern itself is anchored, then this will |
| /// always return 'true' because implementing an unanchored search in that |
| /// case would be incorrect. |
| /// |
| /// Similarly, if the caller requests an anchored search for a particular |
| /// pattern, then the starting state ID returned will reflect that. |
| /// |
| /// If a pattern ID is given in the input configuration that is not in |
| /// this regex, then `None` is returned. |
| fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> { |
| match input.get_anchored() { |
| // Only way we're unanchored is if both the caller asked for an |
| // unanchored search *and* the pattern is itself not anchored. |
| Anchored::No => Some(( |
| self.nfa.is_always_start_anchored(), |
| self.nfa.start_anchored(), |
| )), |
| Anchored::Yes => Some((true, self.nfa.start_anchored())), |
| Anchored::Pattern(pid) => { |
| Some((true, self.nfa.start_pattern(pid)?)) |
| } |
| } |
| } |
| } |
| |
| /// An iterator over all non-overlapping matches for a particular search. |
| /// |
| /// The iterator yields a [`Match`] value until no more matches could be found. |
| /// |
| /// The lifetime parameters are as follows: |
| /// |
| /// * `'r` represents the lifetime of the PikeVM. |
| /// * `'c` represents the lifetime of the PikeVM's cache. |
| /// * `'h` represents the lifetime of the haystack being searched. |
| /// |
| /// This iterator can be created with the [`PikeVM::find_iter`] method. |
| #[derive(Debug)] |
| pub struct FindMatches<'r, 'c, 'h> { |
| re: &'r PikeVM, |
| cache: &'c mut Cache, |
| caps: Captures, |
| it: iter::Searcher<'h>, |
| } |
| |
| impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { |
| type Item = Match; |
| |
| #[inline] |
| fn next(&mut self) -> Option<Match> { |
| // Splitting 'self' apart seems necessary to appease borrowck. |
| let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = |
| *self; |
| // 'advance' converts errors into panics, which is OK here because |
| // the PikeVM can never return an error. |
| it.advance(|input| { |
| re.search(cache, input, caps); |
| Ok(caps.get_match()) |
| }) |
| } |
| } |
| |
| /// An iterator over all non-overlapping leftmost matches, with their capturing |
| /// groups, for a particular search. |
| /// |
| /// The iterator yields a [`Captures`] value until no more matches could be |
| /// found. |
| /// |
| /// The lifetime parameters are as follows: |
| /// |
| /// * `'r` represents the lifetime of the PikeVM. |
| /// * `'c` represents the lifetime of the PikeVM's cache. |
| /// * `'h` represents the lifetime of the haystack being searched. |
| /// |
| /// This iterator can be created with the [`PikeVM::captures_iter`] method. |
| #[derive(Debug)] |
| pub struct CapturesMatches<'r, 'c, 'h> { |
| re: &'r PikeVM, |
| cache: &'c mut Cache, |
| caps: Captures, |
| it: iter::Searcher<'h>, |
| } |
| |
| impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { |
| type Item = Captures; |
| |
| #[inline] |
| fn next(&mut self) -> Option<Captures> { |
| // Splitting 'self' apart seems necessary to appease borrowck. |
| let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = |
| *self; |
| // 'advance' converts errors into panics, which is OK here because |
| // the PikeVM can never return an error. |
| it.advance(|input| { |
| re.search(cache, input, caps); |
| Ok(caps.get_match()) |
| }); |
| if caps.is_match() { |
| Some(caps.clone()) |
| } else { |
| None |
| } |
| } |
| } |
| |
| /// A cache represents mutable state that a [`PikeVM`] requires during a |
| /// search. |
| /// |
| /// For a given [`PikeVM`], its corresponding cache may be created either via |
| /// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in |
| /// every way, except the former does not require explicitly importing `Cache`. |
| /// |
| /// A particular `Cache` is coupled with the [`PikeVM`] from which it |
| /// was created. It may only be used with that `PikeVM`. A cache and its |
| /// allocations may be re-purposed via [`Cache::reset`], in which case, it can |
| /// only be used with the new `PikeVM` (and not the old one). |
| #[derive(Clone, Debug)] |
| pub struct Cache { |
| /// Stack used while computing epsilon closure. This effectively lets us |
| /// move what is more naturally expressed through recursion to a stack |
| /// on the heap. |
| stack: Vec<FollowEpsilon>, |
| /// The current active states being explored for the current byte in the |
| /// haystack. |
| curr: ActiveStates, |
| /// The next set of states we're building that will be explored for the |
| /// next byte in the haystack. |
| next: ActiveStates, |
| } |
| |
| impl Cache { |
| /// Create a new [`PikeVM`] cache. |
| /// |
| /// A potentially more convenient routine to create a cache is |
| /// [`PikeVM::create_cache`], as it does not require also importing the |
| /// `Cache` type. |
| /// |
| /// If you want to reuse the returned `Cache` with some other `PikeVM`, |
| /// then you must call [`Cache::reset`] with the desired `PikeVM`. |
| pub fn new(re: &PikeVM) -> Cache { |
| Cache { |
| stack: vec![], |
| curr: ActiveStates::new(re), |
| next: ActiveStates::new(re), |
| } |
| } |
| |
| /// Reset this cache such that it can be used for searching with a |
| /// different [`PikeVM`]. |
| /// |
| /// A cache reset permits reusing memory already allocated in this cache |
| /// with a different `PikeVM`. |
| /// |
| /// # Example |
| /// |
| /// This shows how to re-purpose a cache for use with a different `PikeVM`. |
| /// |
| /// ``` |
| /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| /// |
| /// let re1 = PikeVM::new(r"\w")?; |
| /// let re2 = PikeVM::new(r"\W")?; |
| /// |
| /// let mut cache = re1.create_cache(); |
| /// assert_eq!( |
| /// Some(Match::must(0, 0..2)), |
| /// re1.find_iter(&mut cache, "Δ").next(), |
| /// ); |
| /// |
| /// // Using 'cache' with re2 is not allowed. It may result in panics or |
| /// // incorrect results. In order to re-purpose the cache, we must reset |
| /// // it with the PikeVM we'd like to use it with. |
| /// // |
| /// // Similarly, after this reset, using the cache with 're1' is also not |
| /// // allowed. |
| /// cache.reset(&re2); |
| /// assert_eq!( |
| /// Some(Match::must(0, 0..3)), |
| /// re2.find_iter(&mut cache, "☃").next(), |
| /// ); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn reset(&mut self, re: &PikeVM) { |
| self.curr.reset(re); |
| self.next.reset(re); |
| } |
| |
| /// Returns the heap memory usage, in bytes, of this cache. |
| /// |
| /// This does **not** include the stack size used up by this cache. To |
| /// compute that, use `std::mem::size_of::<Cache>()`. |
| pub fn memory_usage(&self) -> usize { |
| use core::mem::size_of; |
| (self.stack.len() * size_of::<FollowEpsilon>()) |
| + self.curr.memory_usage() |
| + self.next.memory_usage() |
| } |
| |
| /// Clears this cache. This should be called at the start of every search |
| /// to ensure we start with a clean slate. |
| /// |
| /// This also sets the length of the capturing groups used in the current |
| /// search. This permits an optimization where by 'SlotTable::for_state' |
| /// only returns the number of slots equivalent to the number of slots |
| /// given in the 'Captures' value. This may be less than the total number |
| /// of possible slots, e.g., when one only wants to track overall match |
| /// offsets. This in turn permits less copying of capturing group spans |
| /// in the PikeVM. |
| fn setup_search(&mut self, captures_slot_len: usize) { |
| self.stack.clear(); |
| self.curr.setup_search(captures_slot_len); |
| self.next.setup_search(captures_slot_len); |
| } |
| } |
| |
| /// A set of active states used to "simulate" the execution of an NFA via the |
| /// PikeVM. |
| /// |
| /// There are two sets of these used during NFA simulation. One set corresponds |
| /// to the "current" set of states being traversed for the current position |
| /// in a haystack. The other set corresponds to the "next" set of states being |
| /// built, which will become the new "current" set for the next position in the |
| /// haystack. These two sets correspond to CLIST and NLIST in Thompson's |
| /// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387 |
| /// |
| /// In addition to representing a set of NFA states, this also maintains slot |
| /// values for each state. These slot values are what turn the NFA simulation |
| /// into the "Pike VM." Namely, they track capturing group values for each |
| /// state. During the computation of epsilon closure, we copy slot values from |
| /// states in the "current" set to the "next" set. Eventually, once a match |
| /// is found, the slot values for that match state are what we write to the |
| /// caller provided 'Captures' value. |
| #[derive(Clone, Debug)] |
| struct ActiveStates { |
| /// The set of active NFA states. This set preserves insertion order, which |
| /// is critical for simulating the match semantics of backtracking regex |
| /// engines. |
| set: SparseSet, |
| /// The slots for every NFA state, where each slot stores a (possibly |
| /// absent) offset. Every capturing group has two slots. One for a start |
| /// offset and one for an end offset. |
| slot_table: SlotTable, |
| } |
| |
| impl ActiveStates { |
| /// Create a new set of active states for the given PikeVM. The active |
| /// states returned may only be used with the given PikeVM. (Use 'reset' |
| /// to re-purpose the allocation for a different PikeVM.) |
| fn new(re: &PikeVM) -> ActiveStates { |
| let mut active = ActiveStates { |
| set: SparseSet::new(0), |
| slot_table: SlotTable::new(), |
| }; |
| active.reset(re); |
| active |
| } |
| |
| /// Reset this set of active states such that it can be used with the given |
| /// PikeVM (and only that PikeVM). |
| fn reset(&mut self, re: &PikeVM) { |
| self.set.resize(re.get_nfa().states().len()); |
| self.slot_table.reset(re); |
| } |
| |
| /// Return the heap memory usage, in bytes, used by this set of active |
| /// states. |
| /// |
| /// This does not include the stack size of this value. |
| fn memory_usage(&self) -> usize { |
| self.set.memory_usage() + self.slot_table.memory_usage() |
| } |
| |
| /// Setup this set of active states for a new search. The given slot |
| /// length should be the number of slots in a caller provided 'Captures' |
| /// (and may be zero). |
| fn setup_search(&mut self, captures_slot_len: usize) { |
| self.set.clear(); |
| self.slot_table.setup_search(captures_slot_len); |
| } |
| } |
| |
| /// A table of slots, where each row represent a state in an NFA. Thus, the |
| /// table has room for storing slots for every single state in an NFA. |
| /// |
| /// This table is represented with a single contiguous allocation. In general, |
| /// the notion of "capturing group" doesn't really exist at this level of |
| /// abstraction, hence the name "slot" instead. (Indeed, every capturing group |
| /// maps to a pair of slots, one for the start offset and one for the end |
| /// offset.) Slots are indexed by the 'Captures' NFA state. |
| /// |
| /// N.B. Not every state actually needs a row of slots. Namely, states that |
| /// only have epsilon transitions currently never have anything written to |
| /// their rows in this table. Thus, the table is somewhat wasteful in its heap |
| /// usage. However, it is important to maintain fast random access by state |
| /// ID, which means one giant table tends to work well. RE2 takes a different |
| /// approach here and allocates each row as its own reference counted thing. |
| /// I explored such a strategy at one point here, but couldn't get it to work |
| /// well using entirely safe code. (To the ambitious reader: I encourage you to |
| /// re-litigate that experiment.) I very much wanted to stick to safe code, but |
| /// could be convinced otherwise if there was a solid argument and the safety |
| /// was encapsulated well. |
| #[derive(Clone, Debug)] |
| struct SlotTable { |
| /// The actual table of offsets. |
| table: Vec<Option<NonMaxUsize>>, |
| /// The number of slots per state, i.e., the table's stride or the length |
| /// of each row. |
| slots_per_state: usize, |
| /// The number of slots in the caller-provided 'Captures' value for the |
| /// current search. Setting this to 'slots_per_state' is always correct, |
| /// but may be wasteful. |
| slots_for_captures: usize, |
| } |
| |
| impl SlotTable { |
| /// Create a new slot table. |
| /// |
| /// One should call 'reset' with the corresponding PikeVM before use. |
| fn new() -> SlotTable { |
| SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } |
| } |
| |
| /// Reset this slot table such that it can be used with the given PikeVM |
| /// (and only that PikeVM). |
| fn reset(&mut self, re: &PikeVM) { |
| let nfa = re.get_nfa(); |
| self.slots_per_state = nfa.group_info().slot_len(); |
| // This is always correct, but may be reduced for a particular search |
| // if a 'Captures' has fewer slots, e.g., none at all or only slots |
| // for tracking the overall match instead of all slots for every |
| // group. |
| self.slots_for_captures = core::cmp::max( |
| self.slots_per_state, |
| nfa.pattern_len().checked_mul(2).unwrap(), |
| ); |
| let len = nfa |
| .states() |
| .len() |
| .checked_mul(self.slots_per_state) |
| // Add space to account for scratch space used during a search. |
| .and_then(|x| x.checked_add(self.slots_for_captures)) |
| // It seems like this could actually panic on legitimate inputs on |
| // 32-bit targets, and very likely to panic on 16-bit. Should we |
| // somehow convert this to an error? What about something similar |
| // for the lazy DFA cache? If you're tripping this assert, please |
| // file a bug. |
| .expect("slot table length doesn't overflow"); |
| // This happens about as often as a regex is compiled, so it probably |
| // should be at debug level, but I found it quite distracting and not |
| // particularly useful. |
| trace!( |
| "resizing PikeVM active states table to {} entries \ |
| (slots_per_state={})", |
| len, |
| self.slots_per_state, |
| ); |
| self.table.resize(len, None); |
| } |
| |
| /// Return the heap memory usage, in bytes, used by this slot table. |
| /// |
| /// This does not include the stack size of this value. |
| fn memory_usage(&self) -> usize { |
| self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>() |
| } |
| |
| /// Perform any per-search setup for this slot table. |
| /// |
| /// In particular, this sets the length of the number of slots used in the |
| /// 'Captures' given by the caller (if any at all). This number may be |
| /// smaller than the total number of slots available, e.g., when the caller |
| /// is only interested in tracking the overall match and not the spans of |
| /// every matching capturing group. Only tracking the overall match can |
| /// save a substantial amount of time copying capturing spans during a |
| /// search. |
| fn setup_search(&mut self, captures_slot_len: usize) { |
| self.slots_for_captures = captures_slot_len; |
| } |
| |
| /// Return a mutable slice of the slots for the given state. |
| /// |
| /// Note that the length of the slice returned may be less than the total |
| /// number of slots available for this state. In particular, the length |
| /// always matches the number of slots indicated via 'setup_search'. |
| fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { |
| let i = sid.as_usize() * self.slots_per_state; |
| &mut self.table[i..i + self.slots_for_captures] |
| } |
| |
| /// Return a slice of slots of appropriate length where every slot offset |
| /// is guaranteed to be absent. This is useful in cases where you need to |
| /// compute an epsilon closure outside of the user supplied regex, and thus |
| /// never want it to have any capturing slots set. |
| fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { |
| let i = self.table.len() - self.slots_for_captures; |
| &mut self.table[i..i + self.slots_for_captures] |
| } |
| } |
| |
| /// Represents a stack frame for use while computing an epsilon closure. |
| /// |
| /// (An "epsilon closure" refers to the set of reachable NFA states from a |
| /// single state without consuming any input. That is, the set of all epsilon |
| /// transitions not only from that single state, but from every other state |
| /// reachable by an epsilon transition as well. This is why it's called a |
| /// "closure." Computing an epsilon closure is also done during DFA |
| /// determinization! Compare and contrast the epsilon closure here in this |
| /// PikeVM and the one used for determinization in crate::util::determinize.) |
| /// |
| /// Computing the epsilon closure in a Thompson NFA proceeds via a depth |
| /// first traversal over all epsilon transitions from a particular state. |
| /// (A depth first traversal is important because it emulates the same priority |
| /// of matches that is typically found in backtracking regex engines.) This |
| /// depth first traversal is naturally expressed using recursion, but to avoid |
| /// a call stack size proportional to the size of a regex, we put our stack on |
| /// the heap instead. |
| /// |
| /// This stack thus consists of call frames. The typical call frame is |
| /// `Explore`, which instructs epsilon closure to explore the epsilon |
| /// transitions from that state. (Subsequent epsilon transitions are then |
| /// pushed on to the stack as more `Explore` frames.) If the state ID being |
| /// explored has no epsilon transitions, then the capturing group slots are |
| /// copied from the original state that sparked the epsilon closure (from the |
| /// 'step' routine) to the state ID being explored. This way, capturing group |
| /// slots are forwarded from the previous state to the next. |
| /// |
| /// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to |
| /// set the position for a particular slot back to some particular offset. This |
| /// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will |
| /// set the offset of the slot indicated in `Capture` to the current offset, |
| /// and then push the old offset on to the stack as a `RestoreCapture` frame. |
| /// Thus, the new offset is only used until the epsilon closure reverts back to |
| /// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon |
| /// transition its "scope" to only states that come "after" it during depth |
| /// first traversal. |
| #[derive(Clone, Debug)] |
| enum FollowEpsilon { |
| /// Explore the epsilon transitions from a state ID. |
| Explore(StateID), |
| /// Reset the given `slot` to the given `offset` (which might be `None`). |
| RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> }, |
| } |
| |
| /// A set of counters that "instruments" a PikeVM search. To enable this, you |
| /// must enable the 'internal-instrument-pikevm' feature. Then run your Rust |
| /// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in |
| /// the environment. The metrics collected will be dumped automatically for |
| /// every search executed by the PikeVM. |
| /// |
| /// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an |
| /// absolute decrease in wall-clock performance, even if the 'trace' log level |
| /// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't |
| /// enabled.) The main point of instrumentation is to get counts of various |
| /// events that occur during the PikeVM's execution. |
| /// |
| /// This is a somewhat hacked together collection of metrics that are useful |
| /// to gather from a PikeVM search. In particular, it lets us scrutinize the |
| /// performance profile of a search beyond what general purpose profiling tools |
| /// give us. Namely, we orient the profiling data around the specific states of |
| /// the NFA. |
| /// |
| /// In other words, this lets us see which parts of the NFA graph are most |
| /// frequently activated. This then provides direction for optimization |
| /// opportunities. |
| /// |
| /// The really sad part about this is that it absolutely clutters up the PikeVM |
| /// implementation. :'( Another approach would be to just manually add this |
| /// code in whenever I want this kind of profiling data, but it's complicated |
| /// and tedious enough that I went with this approach... for now. |
| /// |
| /// When instrumentation is enabled (which also turns on 'logging'), then a |
| /// `Counters` is initialized for every search and `trace`'d just before the |
| /// search returns to the caller. |
| /// |
| /// Tip: When debugging performance problems with the PikeVM, it's best to try |
| /// to work with an NFA that is as small as possible. Otherwise the state graph |
| /// is likely to be too big to digest. |
| #[cfg(feature = "internal-instrument-pikevm")] |
| #[derive(Clone, Debug)] |
| struct Counters { |
| /// The number of times the NFA is in a particular permutation of states. |
| state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>, |
| /// The number of times 'step' is called for a particular state ID (which |
| /// indexes this array). |
| steps: Vec<u64>, |
| /// The number of times an epsilon closure was computed for a state. |
| closures: Vec<u64>, |
| /// The number of times a particular state ID is pushed on to a stack while |
| /// computing an epsilon closure. |
| stack_pushes: Vec<u64>, |
| /// The number of times a particular state ID is inserted into a sparse set |
| /// while computing an epsilon closure. |
| set_inserts: Vec<u64>, |
| } |
| |
| #[cfg(feature = "internal-instrument-pikevm")] |
| impl Counters { |
| fn empty() -> Counters { |
| Counters { |
| state_sets: alloc::collections::BTreeMap::new(), |
| steps: vec![], |
| closures: vec![], |
| stack_pushes: vec![], |
| set_inserts: vec![], |
| } |
| } |
| |
| fn reset(&mut self, nfa: &NFA) { |
| let len = nfa.states().len(); |
| |
| self.state_sets.clear(); |
| |
| self.steps.clear(); |
| self.steps.resize(len, 0); |
| |
| self.closures.clear(); |
| self.closures.resize(len, 0); |
| |
| self.stack_pushes.clear(); |
| self.stack_pushes.resize(len, 0); |
| |
| self.set_inserts.clear(); |
| self.set_inserts.resize(len, 0); |
| } |
| |
| fn eprint(&self, nfa: &NFA) { |
| trace!("===== START PikeVM Instrumentation Output ====="); |
| // We take the top-K most occurring state sets. Otherwise the output |
| // is likely to be overwhelming. And we probably only care about the |
| // most frequently occurring ones anyway. |
| const LIMIT: usize = 20; |
| let mut set_counts = |
| self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>(); |
| set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count)); |
| trace!("## PikeVM frequency of state sets (top {})", LIMIT); |
| for (set, count) in set_counts.iter().take(LIMIT) { |
| trace!("{:?}: {}", set, count); |
| } |
| if set_counts.len() > LIMIT { |
| trace!( |
| "... {} sets omitted (out of {} total)", |
| set_counts.len() - LIMIT, |
| set_counts.len(), |
| ); |
| } |
| |
| trace!(""); |
| trace!("## PikeVM total frequency of events"); |
| trace!( |
| "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}", |
| self.steps.iter().copied().sum::<u64>(), |
| self.closures.iter().copied().sum::<u64>(), |
| self.stack_pushes.iter().copied().sum::<u64>(), |
| self.set_inserts.iter().copied().sum::<u64>(), |
| ); |
| |
| trace!(""); |
| trace!("## PikeVM frequency of events broken down by state"); |
| for sid in 0..self.steps.len() { |
| trace!( |
| "{:06}: steps: {}, closures: {}, \ |
| stack-pushes: {}, set-inserts: {}", |
| sid, |
| self.steps[sid], |
| self.closures[sid], |
| self.stack_pushes[sid], |
| self.set_inserts[sid], |
| ); |
| } |
| |
| trace!(""); |
| trace!("## NFA debug display"); |
| trace!("{:?}", nfa); |
| trace!("===== END PikeVM Instrumentation Output ====="); |
| } |
| |
| fn record_state_set(&mut self, set: &SparseSet) { |
| let set = set.iter().collect::<Vec<StateID>>(); |
| *self.state_sets.entry(set).or_insert(0) += 1; |
| } |
| |
| fn record_step(&mut self, sid: StateID) { |
| self.steps[sid] += 1; |
| } |
| |
| fn record_closure(&mut self, sid: StateID) { |
| self.closures[sid] += 1; |
| } |
| |
| fn record_stack_push(&mut self, sid: StateID) { |
| self.stack_pushes[sid] += 1; |
| } |
| |
| fn record_set_insert(&mut self, sid: StateID) { |
| self.set_inserts[sid] += 1; |
| } |
| } |