| /*! |
| A DFA-backed `Regex`. |
| |
| This module provides [`Regex`], which is defined generically over the |
| [`Automaton`] trait. A `Regex` implements convenience routines you might have |
| come to expect, such as finding the start/end of a match and iterating over |
| all non-overlapping matches. This `Regex` type is limited in its capabilities |
| to what a DFA can provide. Therefore, APIs involving capturing groups, for |
| example, are not provided. |
| |
| Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that |
| finds the end offset of a match, where as the other is a "reverse" DFA that |
| find the start offset of a match. |
| |
| See the [parent module](crate::dfa) for examples. |
| */ |
| |
| #[cfg(feature = "alloc")] |
| use alloc::vec::Vec; |
| |
| use crate::{ |
| dfa::automaton::{Automaton, OverlappingState}, |
| util::prefilter::{self, Prefilter}, |
| MatchError, MultiMatch, |
| }; |
| #[cfg(feature = "alloc")] |
| use crate::{ |
| dfa::{dense, error::Error, sparse}, |
| nfa::thompson, |
| util::matchtypes::MatchKind, |
| }; |
| |
| // When the alloc feature is enabled, the regex type sets its A type parameter |
| // to default to an owned dense DFA. But without alloc, we set no default. This |
| // makes things a lot more convenient in the common case, since writing out the |
| // DFA types is pretty annoying. |
| // |
| // Since we have two different definitions but only want to write one doc |
| // string, we use a macro to capture the doc and other attributes once and then |
| // repeat them for each definition. |
| macro_rules! define_regex_type { |
| ($(#[$doc:meta])*) => { |
| #[cfg(feature = "alloc")] |
| $(#[$doc])* |
| pub struct Regex<A = dense::OwnedDFA, P = prefilter::None> { |
| prefilter: Option<P>, |
| forward: A, |
| reverse: A, |
| utf8: bool, |
| } |
| |
| #[cfg(not(feature = "alloc"))] |
| $(#[$doc])* |
| pub struct Regex<A, P = prefilter::None> { |
| prefilter: Option<P>, |
| forward: A, |
| reverse: A, |
| utf8: bool, |
| } |
| }; |
| } |
| |
| define_regex_type!( |
| /// A regular expression that uses deterministic finite automata for fast |
| /// searching. |
| /// |
| /// A regular expression is comprised of two DFAs, a "forward" DFA and a |
| /// "reverse" DFA. The forward DFA is responsible for detecting the end of |
| /// a match while the reverse DFA is responsible for detecting the start |
| /// of a match. Thus, in order to find the bounds of any given match, a |
| /// forward search must first be run followed by a reverse search. A match |
| /// found by the forward DFA guarantees that the reverse DFA will also find |
| /// a match. |
| /// |
| /// The type of the DFA used by a `Regex` corresponds to the `A` type |
| /// parameter, which must satisfy the [`Automaton`] trait. Typically, |
| /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a |
| /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more |
| /// memory but search faster, while sparse DFAs use less memory but search |
| /// more slowly. |
| /// |
| /// By default, a regex's automaton type parameter is set to |
| /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most |
| /// in-memory work loads, this is the most convenient type that gives the |
| /// best search performance. When the `alloc` feature is disabled, no |
| /// default type is used. |
| /// |
| /// A `Regex` also has a `P` type parameter, which is used to select the |
| /// prefilter used during search. By default, no prefilter is enabled by |
| /// setting the type to default to [`prefilter::None`]. A prefilter can be |
| /// enabled by using the [`Regex::prefilter`] method. |
| /// |
| /// # When should I use this? |
| /// |
| /// Generally speaking, if you can afford the overhead of building a full |
| /// DFA for your regex, and you don't need things like capturing groups, |
| /// then this is a good choice if you're looking to optimize for matching |
| /// speed. Note however that its speed may be worse than a general purpose |
| /// regex engine if you don't select a good [prefilter]. |
| /// |
| /// # Earliest vs Leftmost vs Overlapping |
| /// |
| /// The search routines exposed on a `Regex` reflect three different ways |
| /// of searching: |
| /// |
| /// * "earliest" means to stop as soon as a match has been detected. |
| /// * "leftmost" means to continue matching until the underlying |
| /// automaton cannot advance. This reflects "standard" searching you |
| /// might be used to in other regex engines. e.g., This permits |
| /// non-greedy and greedy searching to work as you would expect. |
| /// * "overlapping" means to find all possible matches, even if they |
| /// overlap. |
| /// |
| /// Generally speaking, when doing an overlapping search, you'll want to |
| /// build your regex DFAs with [`MatchKind::All`] semantics. Using |
| /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is |
| /// likely to lead to odd behavior since `LeftmostFirst` specifically omits |
| /// some matches that can never be reported due to its semantics. |
| /// |
| /// The following example shows the differences between how these different |
| /// types of searches impact looking for matches of `[a-z]+` in the |
| /// haystack `abc`. |
| /// |
| /// ``` |
| /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch}; |
| /// |
| /// let pattern = r"[a-z]+"; |
| /// let haystack = "abc".as_bytes(); |
| /// |
| /// // With leftmost-first semantics, we test "earliest" and "leftmost". |
| /// let re = dfa::regex::Builder::new() |
| /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst)) |
| /// .build(pattern)?; |
| /// |
| /// // "earliest" searching isn't impacted by greediness |
| /// let mut it = re.find_earliest_iter(haystack); |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// |
| /// // "leftmost" searching supports greediness (and non-greediness) |
| /// let mut it = re.find_leftmost_iter(haystack); |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// |
| /// // For overlapping, we want "all" match kind semantics. |
| /// let re = dfa::regex::Builder::new() |
| /// .dense(dense::Config::new().match_kind(MatchKind::All)) |
| /// .build(pattern)?; |
| /// |
| /// // In the overlapping search, we find all three possible matches |
| /// // starting at the beginning of the haystack. |
| /// let mut it = re.find_overlapping_iter(haystack); |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// # Sparse DFAs |
| /// |
| /// Since a `Regex` is generic over the [`Automaton`] trait, it can be |
| /// used with any kind of DFA. While this crate constructs dense DFAs by |
| /// default, it is easy enough to build corresponding sparse DFAs, and then |
| /// build a regex from them: |
| /// |
| /// ``` |
| /// use regex_automata::dfa::regex::Regex; |
| /// |
| /// // First, build a regex that uses dense DFAs. |
| /// let dense_re = Regex::new("foo[0-9]+")?; |
| /// |
| /// // Second, build sparse DFAs from the forward and reverse dense DFAs. |
| /// let fwd = dense_re.forward().to_sparse()?; |
| /// let rev = dense_re.reverse().to_sparse()?; |
| /// |
| /// // Third, build a new regex from the constituent sparse DFAs. |
| /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev); |
| /// |
| /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
| /// assert_eq!(true, sparse_re.is_match(b"foo123")); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// Alternatively, one can use a [`Builder`] to construct a sparse DFA |
| /// more succinctly. (Note though that dense DFAs are still constructed |
| /// first internally, and then converted to sparse DFAs, as in the example |
| /// above.) |
| /// |
| /// ``` |
| /// use regex_automata::dfa::regex::Regex; |
| /// |
| /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?; |
| /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
| /// assert!(sparse_re.is_match(b"foo123")); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// # Fallibility |
| /// |
| /// In non-default configurations, the DFAs generated in this module may |
| /// return an error during a search. (Currently, the only way this happens |
| /// is if quit bytes are added or Unicode word boundaries are heuristically |
| /// enabled, both of which are turned off by default.) For convenience, the |
| /// main search routines, like [`find_leftmost`](Regex::find_leftmost), |
| /// will panic if an error occurs. However, if you need to use DFAs |
| /// which may produce an error at search time, then there are fallible |
| /// equivalents of all search routines. For example, for `find_leftmost`, |
| /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost). |
| /// The routines prefixed with `try_` return `Result<Option<MultiMatch>, |
| /// MatchError>`, where as the infallible routines simply return |
| /// `Option<MultiMatch>`. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to cause a search to terminate if it sees a |
| /// `\n` byte, and handle the error returned. This could be useful if, for |
| /// example, you wanted to prevent a user supplied pattern from matching |
| /// across a line boundary. |
| /// |
| /// ``` |
| /// use regex_automata::{dfa::{self, regex::Regex}, MatchError}; |
| /// |
| /// let re = Regex::builder() |
| /// .dense(dfa::dense::Config::new().quit(b'\n', true)) |
| /// .build(r"foo\p{any}+bar")?; |
| /// |
| /// let haystack = "foo\nbar".as_bytes(); |
| /// // Normally this would produce a match, since \p{any} contains '\n'. |
| /// // But since we instructed the automaton to enter a quit state if a |
| /// // '\n' is observed, this produces a match error instead. |
| /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; |
| /// let got = re.try_find_leftmost(haystack).unwrap_err(); |
| /// assert_eq!(expected, got); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[derive(Clone, Debug)] |
| ); |
| |
| #[cfg(feature = "alloc")] |
| impl Regex { |
| /// Parse the given regular expression using the default configuration and |
| /// return the corresponding regex. |
| /// |
| /// If you want a non-default configuration, then use the [`Builder`] to |
| /// set your own configuration. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
| /// |
| /// let re = Regex::new("foo[0-9]+bar")?; |
| /// assert_eq!( |
| /// Some(MultiMatch::must(0, 3, 14)), |
| /// re.find_leftmost(b"zzzfoo12345barzzz"), |
| /// ); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn new(pattern: &str) -> Result<Regex, Error> { |
| Builder::new().build(pattern) |
| } |
| |
| /// Like `new`, but parses multiple patterns into a single "regex set." |
| /// This similarly uses the default regex configuration. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
| /// |
| /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; |
| /// |
| /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Regex, Error> { |
| Builder::new().build_many(patterns) |
| } |
| } |
| |
| #[cfg(feature = "alloc")] |
| impl Regex<sparse::DFA<Vec<u8>>> { |
| /// Parse the given regular expression using the default configuration, |
| /// except using sparse DFAs, and return the corresponding regex. |
| /// |
| /// If you want a non-default configuration, then use the [`Builder`] to |
| /// set your own configuration. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
| /// |
| /// let re = Regex::new_sparse("foo[0-9]+bar")?; |
| /// assert_eq!( |
| /// Some(MultiMatch::must(0, 3, 14)), |
| /// re.find_leftmost(b"zzzfoo12345barzzz"), |
| /// ); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn new_sparse( |
| pattern: &str, |
| ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
| Builder::new().build_sparse(pattern) |
| } |
| |
| /// Like `new`, but parses multiple patterns into a single "regex set" |
| /// using sparse DFAs. This otherwise similarly uses the default regex |
| /// configuration. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
| /// |
| /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; |
| /// |
| /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn new_many_sparse<P: AsRef<str>>( |
| patterns: &[P], |
| ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
| Builder::new().build_many_sparse(patterns) |
| } |
| } |
| |
| /// Convenience routines for regex construction. |
| #[cfg(feature = "alloc")] |
| impl Regex { |
| /// Return a default configuration for a `Regex`. |
| /// |
| /// This is a convenience routine to avoid needing to import the `Config` |
| /// type when customizing the construction of a regex. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to disable UTF-8 mode for `Regex` iteration. |
| /// When UTF-8 mode is disabled, the position immediately following an |
| /// empty match is where the next search begins, instead of the next |
| /// position of a UTF-8 encoded codepoint. |
| /// |
| /// ``` |
| /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
| /// |
| /// let re = Regex::builder() |
| /// .configure(Regex::config().utf8(false)) |
| /// .build(r"")?; |
| /// let haystack = "a☃z".as_bytes(); |
| /// let mut it = re.find_leftmost_iter(haystack); |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn config() -> Config { |
| Config::new() |
| } |
| |
| /// Return a builder for configuring the construction of a `Regex`. |
| /// |
| /// This is a convenience routine to avoid needing to import the |
| /// [`Builder`] type in common cases. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to use the builder to disable UTF-8 mode |
| /// everywhere. |
| /// |
| /// ``` |
| /// use regex_automata::{ |
| /// dfa::regex::Regex, |
| /// nfa::thompson, |
| /// MultiMatch, SyntaxConfig, |
| /// }; |
| /// |
| /// let re = Regex::builder() |
| /// .configure(Regex::config().utf8(false)) |
| /// .syntax(SyntaxConfig::new().utf8(false)) |
| /// .thompson(thompson::Config::new().utf8(false)) |
| /// .build(r"foo(?-u:[^b])ar.*")?; |
| /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; |
| /// let expected = Some(MultiMatch::must(0, 1, 9)); |
| /// let got = re.find_leftmost(haystack); |
| /// assert_eq!(expected, got); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn builder() -> Builder { |
| Builder::new() |
| } |
| } |
| |
| /// Standard search routines for finding and iterating over matches. |
| impl<A: Automaton, P: Prefilter> Regex<A, P> { |
| /// Returns true if and only if this regex matches the given haystack. |
| /// |
| /// This routine may short circuit if it knows that scanning future input |
| /// will never lead to a different result. In particular, if the underlying |
| /// DFA enters a match state or a dead state, then this routine will return |
| /// `true` or `false`, respectively, without inspecting any future input. |
| /// |
| /// # Panics |
| /// |
| /// If the underlying DFAs return an error, then this routine panics. This |
| /// only occurs in non-default configurations where quit bytes are used or |
| /// Unicode word boundaries are heuristically enabled. |
| /// |
| /// The fallible version of this routine is |
| /// [`try_is_match`](Regex::try_is_match). |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::dfa::regex::Regex; |
| /// |
| /// let re = Regex::new("foo[0-9]+bar")?; |
| /// assert_eq!(true, re.is_match(b"foo12345bar")); |
| /// assert_eq!(false, re.is_match(b"foobar")); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn is_match(&self, haystack: &[u8]) -> bool { |
| self.is_match_at(haystack, 0, haystack.len()) |
| } |
| |
| /// Returns the first position at which a match is found. |
| /// |
| /// This routine stops scanning input in precisely the same circumstances |
| /// as `is_match`. The key difference is that this routine returns the |
| /// position at which it stopped scanning input if and only if a match |
| /// was found. If no match is found, then `None` is returned. |
| /// |
| /// # Panics |
| /// |
| /// If the underlying DFAs return an error, then this routine panics. This |
| /// only occurs in non-default configurations where quit bytes are used or |
| /// Unicode word boundaries are heuristically enabled. |
| /// |
| /// The fallible version of this routine is |
| /// [`try_find_earliest`](Regex::try_find_earliest). |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
| /// |
| /// // Normally, the leftmost first match would greedily consume as many |
| /// // decimal digits as it could. But a match is detected as soon as one |
| /// // digit is seen. |
| /// let re = Regex::new("foo[0-9]+")?; |
| /// assert_eq!( |
| /// Some(MultiMatch::must(0, 0, 4)), |
| /// re.find_earliest(b"foo12345"), |
| /// ); |
| /// |
| /// // Normally, the end of the leftmost first match here would be 3, |
| /// // but the "earliest" match semantics detect a match earlier. |
| /// let re = Regex::new("abc|a")?; |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc")); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn find_earliest(&self, haystack: &[u8]) -> Option<MultiMatch> { |
| self.find_earliest_at(haystack, 0, haystack.len()) |
| } |
| |
| /// Returns the start and end offset of the leftmost match. If no match |
| /// exists, then `None` is returned. |
| /// |
| /// # Panics |
| /// |
| /// If the underlying DFAs return an error, then this routine panics. This |
| /// only occurs in non-default configurations where quit bytes are used or |
| /// Unicode word boundaries are heuristically enabled. |
| /// |
| /// The fallible version of this routine is |
| /// [`try_find_leftmost`](Regex::try_find_leftmost). |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
| /// |
| /// // Greediness is applied appropriately when compared to find_earliest. |
| /// let re = Regex::new("foo[0-9]+")?; |
| /// assert_eq!( |
| /// Some(MultiMatch::must(0, 3, 11)), |
| /// re.find_leftmost(b"zzzfoo12345zzz"), |
| /// ); |
| /// |
| /// // Even though a match is found after reading the first byte (`a`), |
| /// // the default leftmost-first match semantics demand that we find the |
| /// // earliest match that prefers earlier parts of the pattern over latter |
| /// // parts. |
| /// let re = Regex::new("abc|a")?; |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc")); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn find_leftmost(&self, haystack: &[u8]) -> Option<MultiMatch> { |
| self.find_leftmost_at(haystack, 0, haystack.len()) |
| } |
| |
| /// Search for the first overlapping match in `haystack`. |
| /// |
| /// This routine is principally useful when searching for multiple patterns |
| /// on inputs where multiple patterns may match the same regions of text. |
| /// In particular, callers must preserve the automaton's search state from |
| /// prior calls so that the implementation knows where the last match |
| /// occurred and which pattern was reported. |
| /// |
| /// # Panics |
| /// |
| /// If the underlying DFAs return an error, then this routine panics. This |
| /// only occurs in non-default configurations where quit bytes are used or |
| /// Unicode word boundaries are heuristically enabled. |
| /// |
| /// The fallible version of this routine is |
| /// [`try_find_overlapping`](Regex::try_find_overlapping). |
| /// |
| /// # Example |
| /// |
| /// This example shows how to run an overlapping search with multiple |
| /// regexes. |
| /// |
| /// ``` |
| /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; |
| /// |
| /// let re = Regex::builder() |
| /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) |
| /// .build_many(&[r"\w+$", r"\S+$"])?; |
| /// let haystack = "@foo".as_bytes(); |
| /// let mut state = dfa::OverlappingState::start(); |
| /// |
| /// let expected = Some(MultiMatch::must(1, 0, 4)); |
| /// let got = re.find_overlapping(haystack, &mut state); |
| /// assert_eq!(expected, got); |
| /// |
| /// // The first pattern also matches at the same position, so re-running |
| /// // the search will yield another match. Notice also that the first |
| /// // pattern is returned after the second. This is because the second |
| /// // pattern begins its match before the first, is therefore an earlier |
| /// // match and is thus reported first. |
| /// let expected = Some(MultiMatch::must(0, 1, 4)); |
| /// let got = re.find_overlapping(haystack, &mut state); |
| /// assert_eq!(expected, got); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn find_overlapping( |
| &self, |
| haystack: &[u8], |
| state: &mut OverlappingState, |
| ) -> Option<MultiMatch> { |
| self.find_overlapping_at(haystack, 0, haystack.len(), state) |
| } |
| |
| /// Returns an iterator over all non-overlapping "earliest" matches. |
| /// |
| /// Match positions are reported as soon as a match is known to occur, even |
| /// if the standard leftmost match would be longer. |
| /// |
| /// # Panics |
| /// |
| /// If the underlying DFAs return an error during iteration, then iteration |
| /// panics. This only occurs in non-default configurations where quit bytes |
| /// are used or Unicode word boundaries are heuristically enabled. |
| /// |
| /// The fallible version of this routine is |
| /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). |
| /// |
| /// # Example |
| /// |
| /// This example shows how to run an "earliest" iterator. |
| /// |
| /// ``` |
| /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
| /// |
| /// let re = Regex::new("[0-9]+")?; |
| /// let haystack = "123".as_bytes(); |
| /// |
| /// // Normally, a standard leftmost iterator would return a single |
| /// // match, but since "earliest" detects matches earlier, we get |
| /// // three matches. |
| /// let mut it = re.find_earliest_iter(haystack); |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn find_earliest_iter<'r, 't>( |
| &'r self, |
| haystack: &'t [u8], |
| ) -> FindEarliestMatches<'r, 't, A, P> { |
| FindEarliestMatches::new(self, haystack) |
| } |
| |
| /// Returns an iterator over all non-overlapping leftmost matches in the |
| /// given bytes. If no match exists, then the iterator yields no elements. |
| /// |
| /// This corresponds to the "standard" regex search iterator. |
| /// |
| /// # Panics |
| /// |
| /// If the underlying DFAs return an error during iteration, then iteration |
| /// panics. This only occurs in non-default configurations where quit bytes |
| /// are used or Unicode word boundaries are heuristically enabled. |
| /// |
| /// The fallible version of this routine is |
| /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
| /// |
| /// let re = Regex::new("foo[0-9]+")?; |
| /// let text = b"foo1 foo12 foo123"; |
| /// let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); |
| /// assert_eq!(matches, vec![ |
| /// MultiMatch::must(0, 0, 4), |
| /// MultiMatch::must(0, 5, 10), |
| /// MultiMatch::must(0, 11, 17), |
| /// ]); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn find_leftmost_iter<'r, 't>( |
| &'r self, |
| haystack: &'t [u8], |
| ) -> FindLeftmostMatches<'r, 't, A, P> { |
| FindLeftmostMatches::new(self, haystack) |
| } |
| |
| /// Returns an iterator over all overlapping matches in the given haystack. |
| /// |
| /// This routine is principally useful when searching for multiple patterns |
| /// on inputs where multiple patterns may match the same regions of text. |
| /// The iterator takes care of handling the overlapping state that must be |
| /// threaded through every search. |
| /// |
| /// # Panics |
| /// |
| /// If the underlying DFAs return an error during iteration, then iteration |
| /// panics. This only occurs in non-default configurations where quit bytes |
| /// are used or Unicode word boundaries are heuristically enabled. |
| /// |
| /// The fallible version of this routine is |
| /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). |
| /// |
| /// # Example |
| /// |
| /// This example shows how to run an overlapping search with multiple |
| /// regexes. |
| /// |
| /// ``` |
| /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; |
| /// |
| /// let re = Regex::builder() |
| /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) |
| /// .build_many(&[r"\w+$", r"\S+$"])?; |
| /// let haystack = "@foo".as_bytes(); |
| /// |
| /// let mut it = re.find_overlapping_iter(haystack); |
| /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn find_overlapping_iter<'r, 't>( |
| &'r self, |
| haystack: &'t [u8], |
| ) -> FindOverlappingMatches<'r, 't, A, P> { |
| FindOverlappingMatches::new(self, haystack) |
| } |
| } |
| |
| /// Lower level infallible search routines that permit controlling where |
| /// the search starts and ends in a particular sequence. This is useful for |
| /// executing searches that need to take surrounding context into account. This |
| /// is required for correctly implementing iteration because of look-around |
| /// operators (`^`, `$`, `\b`). |
| impl<A: Automaton, P: Prefilter> Regex<A, P> { |
| /// Returns true if and only if this regex matches the given haystack. |
| /// |
| /// This routine may short circuit if it knows that scanning future input |
| /// will never lead to a different result. In particular, if the underlying |
| /// DFA enters a match state or a dead state, then this routine will return |
| /// `true` or `false`, respectively, without inspecting any future input. |
| /// |
| /// # Searching a substring of the haystack |
| /// |
| /// Being an "at" search routine, this permits callers to search a |
| /// substring of `haystack` by specifying a range in `haystack`. |
| /// Why expose this as an API instead of just asking callers to use |
| /// `&input[start..end]`? The reason is that regex matching often wants |
| /// to take the surrounding context into account in order to handle |
| /// look-around (`^`, `$` and `\b`). |
| /// |
| /// # Panics |
| /// |
| /// If the underlying DFAs return an error, then this routine panics. This |
| /// only occurs in non-default configurations where quit bytes are used or |
| /// Unicode word boundaries are heuristically enabled. |
| /// |
| /// The fallible version of this routine is |
| /// [`try_is_match_at`](Regex::try_is_match_at). |
| pub fn is_match_at( |
| &self, |
| haystack: &[u8], |
| start: usize, |
| end: usize, |
| ) -> bool { |
| self.try_is_match_at(haystack, start, end).unwrap() |
| } |
| |
| /// Returns the first position at which a match is found. |
| /// |
| /// This routine stops scanning input in precisely the same circumstances |
| /// as `is_match`. The key difference is that this routine returns the |
| /// position at which it stopped scanning input if and only if a match |
| /// was found. If no match is found, then `None` is returned. |
| /// |
| /// # Searching a substring of the haystack |
| /// |
| /// Being an "at" search routine, this permits callers to search a |
| /// substring of `haystack` by specifying a range in `haystack`. |
| /// Why expose this as an API instead of just asking callers to use |
| /// `&input[start..end]`? The reason is that regex matching often wants |
| /// to take the surrounding context into account in order to handle |
| /// look-around (`^`, `$` and `\b`). |
| /// |
| /// This is useful when implementing an iterator over matches |
| /// within the same haystack, which cannot be done correctly by simply |
| /// providing a subslice of `haystack`. |
| /// |
| /// # Panics |
| /// |
| /// If the underlying DFAs return an error, then this routine panics. This |
| /// only occurs in non-default configurations where quit bytes are used or |
| /// Unicode word boundaries are heuristically enabled. |
| /// |
| /// The fallible version of this routine is |
| /// [`try_find_earliest_at`](Regex::try_find_earliest_at). |
| pub fn find_earliest_at( |
| &self, |
| haystack: &[u8], |
| start: usize, |
| end: usize, |
| ) -> Option<MultiMatch> { |
| self.try_find_earliest_at(haystack, start, end).unwrap() |
| } |
| |
| /// Returns the same as `find_leftmost`, but starts the search at the given |
| /// offset. |
| /// |
| /// The significance of the starting point is that it takes the surrounding |
| /// context into consideration. For example, if the DFA is anchored, then |
| /// a match can only occur when `start == 0`. |
| /// |
| /// # Searching a substring of the haystack |
| /// |
| /// Being an "at" search routine, this permits callers to search a |
| /// substring of `haystack` by specifying a range in `haystack`. |
| /// Why expose this as an API instead of just asking callers to use |
| /// `&input[start..end]`? The reason is that regex matching often wants |
| /// to take the surrounding context into account in order to handle |
| /// look-around (`^`, `$` and `\b`). |
| /// |
| /// This is useful when implementing an iterator over matches within the |
| /// same haystack, which cannot be done correctly by simply providing a |
| /// subslice of `haystack`. |
| /// |
| /// # Panics |
| /// |
| /// If the underlying DFAs return an error, then this routine panics. This |
| /// only occurs in non-default configurations where quit bytes are used or |
| /// Unicode word boundaries are heuristically enabled. |
| /// |
| /// The fallible version of this routine is |
| /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). |
| pub fn find_leftmost_at( |
| &self, |
| haystack: &[u8], |
| start: usize, |
| end: usize, |
| ) -> Option<MultiMatch> { |
| self.try_find_leftmost_at(haystack, start, end).unwrap() |
| } |
| |
| /// Search for the first overlapping match within a given range of |
| /// `haystack`. |
| /// |
| /// This routine is principally useful when searching for multiple patterns |
| /// on inputs where multiple patterns may match the same regions of text. |
| /// In particular, callers must preserve the automaton's search state from |
| /// prior calls so that the implementation knows where the last match |
| /// occurred and which pattern was reported. |
| /// |
| /// # Searching a substring of the haystack |
| /// |
| /// Being an "at" search routine, this permits callers to search a |
| /// substring of `haystack` by specifying a range in `haystack`. |
| /// Why expose this as an API instead of just asking callers to use |
| /// `&input[start..end]`? The reason is that regex matching often wants |
| /// to take the surrounding context into account in order to handle |
| /// look-around (`^`, `$` and `\b`). |
| /// |
| /// This is useful when implementing an iterator over matches |
| /// within the same haystack, which cannot be done correctly by simply |
| /// providing a subslice of `haystack`. |
| /// |
| /// # Panics |
| /// |
| /// If the underlying DFAs return an error, then this routine panics. This |
| /// only occurs in non-default configurations where quit bytes are used or |
| /// Unicode word boundaries are heuristically enabled. |
| /// |
| /// The fallible version of this routine is |
| /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). |
| pub fn find_overlapping_at( |
| &self, |
| haystack: &[u8], |
| start: usize, |
| end: usize, |
| state: &mut OverlappingState, |
| ) -> Option<MultiMatch> { |
| self.try_find_overlapping_at(haystack, start, end, state).unwrap() |
| } |
| } |
| |
| /// Fallible search routines. These may return an error when the underlying |
| /// DFAs have been configured in a way that permits them to fail during a |
| /// search. |
| /// |
| /// Errors during search only occur when the DFA has been explicitly |
| /// configured to do so, usually by specifying one or more "quit" bytes or by |
| /// heuristically enabling Unicode word boundaries. |
| /// |
| /// Errors will never be returned using the default configuration. So these |
| /// fallible routines are only needed for particular configurations. |
| impl<A: Automaton, P: Prefilter> Regex<A, P> { |
| /// Returns true if and only if this regex matches the given haystack. |
| /// |
| /// This routine may short circuit if it knows that scanning future input |
| /// will never lead to a different result. In particular, if the underlying |
| /// DFA enters a match state or a dead state, then this routine will return |
| /// `true` or `false`, respectively, without inspecting any future input. |
| /// |
| /// # Errors |
| /// |
| /// This routine only errors if the search could not complete. For |
| /// DFA-based regexes, this only occurs in a non-default configuration |
| /// where quit bytes are used or Unicode word boundaries are heuristically |
| /// enabled. |
| /// |
| /// When a search cannot complete, callers cannot know whether a match |
| /// exists or not. |
| /// |
| /// The infallible (panics on error) version of this routine is |
| /// [`is_match`](Regex::is_match). |
| pub fn try_is_match(&self, haystack: &[u8]) -> Result<bool, MatchError> { |
| self.try_is_match_at(haystack, 0, haystack.len()) |
| } |
| |
| /// Returns the first position at which a match is found. |
| /// |
| /// This routine stops scanning input in precisely the same circumstances |
| /// as `is_match`. The key difference is that this routine returns the |
| /// position at which it stopped scanning input if and only if a match |
| /// was found. If no match is found, then `None` is returned. |
| /// |
| /// # Errors |
| /// |
| /// This routine only errors if the search could not complete. For |
| /// DFA-based regexes, this only occurs in a non-default configuration |
| /// where quit bytes are used or Unicode word boundaries are heuristically |
| /// enabled. |
| /// |
| /// When a search cannot complete, callers cannot know whether a match |
| /// exists or not. |
| /// |
| /// The infallible (panics on error) version of this routine is |
| /// [`find_earliest`](Regex::find_earliest). |
| pub fn try_find_earliest( |
| &self, |
| haystack: &[u8], |
| ) -> Result<Option<MultiMatch>, MatchError> { |
| self.try_find_earliest_at(haystack, 0, haystack.len()) |
| } |
| |
| /// Returns the start and end offset of the leftmost match. If no match |
| /// exists, then `None` is returned. |
| /// |
| /// # Errors |
| /// |
| /// This routine only errors if the search could not complete. For |
| /// DFA-based regexes, this only occurs in a non-default configuration |
| /// where quit bytes are used or Unicode word boundaries are heuristically |
| /// enabled. |
| /// |
| /// When a search cannot complete, callers cannot know whether a match |
| /// exists or not. |
| /// |
| /// The infallible (panics on error) version of this routine is |
| /// [`find_leftmost`](Regex::find_leftmost). |
| pub fn try_find_leftmost( |
| &self, |
| haystack: &[u8], |
| ) -> Result<Option<MultiMatch>, MatchError> { |
| self.try_find_leftmost_at(haystack, 0, haystack.len()) |
| } |
| |
| /// Search for the first overlapping match in `haystack`. |
| /// |
| /// This routine is principally useful when searching for multiple patterns |
| /// on inputs where multiple patterns may match the same regions of text. |
| /// In particular, callers must preserve the automaton's search state from |
| /// prior calls so that the implementation knows where the last match |
| /// occurred and which pattern was reported. |
| /// |
| /// # Errors |
| /// |
| /// This routine only errors if the search could not complete. For |
| /// DFA-based regexes, this only occurs in a non-default configuration |
| /// where quit bytes are used or Unicode word boundaries are heuristically |
| /// enabled. |
| /// |
| /// When a search cannot complete, callers cannot know whether a match |
| /// exists or not. |
| /// |
| /// The infallible (panics on error) version of this routine is |
| /// [`find_overlapping`](Regex::find_overlapping). |
| pub fn try_find_overlapping( |
| &self, |
| haystack: &[u8], |
| state: &mut OverlappingState, |
| ) -> Result<Option<MultiMatch>, MatchError> { |
| self.try_find_overlapping_at(haystack, 0, haystack.len(), state) |
| } |
| |
| /// Returns an iterator over all non-overlapping "earliest" matches. |
| /// |
| /// Match positions are reported as soon as a match is known to occur, even |
| /// if the standard leftmost match would be longer. |
| /// |
| /// # Errors |
| /// |
| /// This iterator only yields errors if the search could not complete. For |
| /// DFA-based regexes, this only occurs in a non-default configuration |
| /// where quit bytes are used or Unicode word boundaries are heuristically |
| /// enabled. |
| /// |
| /// When a search cannot complete, callers cannot know whether a match |
| /// exists or not. |
| /// |
| /// The infallible (panics on error) version of this routine is |
| /// [`find_earliest_iter`](Regex::find_earliest_iter). |
| pub fn try_find_earliest_iter<'r, 't>( |
| &'r self, |
| haystack: &'t [u8], |
| ) -> TryFindEarliestMatches<'r, 't, A, P> { |
| TryFindEarliestMatches::new(self, haystack) |
| } |
| |
| /// Returns an iterator over all non-overlapping leftmost matches in the |
| /// given bytes. If no match exists, then the iterator yields no elements. |
| /// |
| /// This corresponds to the "standard" regex search iterator. |
| /// |
| /// # Errors |
| /// |
| /// This iterator only yields errors if the search could not complete. For |
| /// DFA-based regexes, this only occurs in a non-default configuration |
| /// where quit bytes are used or Unicode word boundaries are heuristically |
| /// enabled. |
| /// |
| /// When a search cannot complete, callers cannot know whether a match |
| /// exists or not. |
| /// |
| /// The infallible (panics on error) version of this routine is |
| /// [`find_leftmost_iter`](Regex::find_leftmost_iter). |
| pub fn try_find_leftmost_iter<'r, 't>( |
| &'r self, |
| haystack: &'t [u8], |
| ) -> TryFindLeftmostMatches<'r, 't, A, P> { |
| TryFindLeftmostMatches::new(self, haystack) |
| } |
| |
| /// Returns an iterator over all overlapping matches in the given haystack. |
| /// |
| /// This routine is principally useful when searching for multiple patterns |
| /// on inputs where multiple patterns may match the same regions of text. |
| /// The iterator takes care of handling the overlapping state that must be |
| /// threaded through every search. |
| /// |
| /// # Errors |
| /// |
| /// This iterator only yields errors if the search could not complete. For |
| /// DFA-based regexes, this only occurs in a non-default configuration |
| /// where quit bytes are used or Unicode word boundaries are heuristically |
| /// enabled. |
| /// |
| /// When a search cannot complete, callers cannot know whether a match |
| /// exists or not. |
| /// |
| /// The infallible (panics on error) version of this routine is |
| /// [`find_overlapping_iter`](Regex::find_overlapping_iter). |
| pub fn try_find_overlapping_iter<'r, 't>( |
| &'r self, |
| haystack: &'t [u8], |
| ) -> TryFindOverlappingMatches<'r, 't, A, P> { |
| TryFindOverlappingMatches::new(self, haystack) |
| } |
| } |
| |
| /// Lower level fallible search routines that permit controlling where the |
| /// search starts and ends in a particular sequence. |
| impl<A: Automaton, P: Prefilter> Regex<A, P> { |
| /// Returns true if and only if this regex matches the given haystack. |
| /// |
| /// This routine may short circuit if it knows that scanning future input |
| /// will never lead to a different result. In particular, if the underlying |
| /// DFA enters a match state or a dead state, then this routine will return |
| /// `true` or `false`, respectively, without inspecting any future input. |
| /// |
| /// # Searching a substring of the haystack |
| /// |
| /// Being an "at" search routine, this permits callers to search a |
| /// substring of `haystack` by specifying a range in `haystack`. |
| /// Why expose this as an API instead of just asking callers to use |
| /// `&input[start..end]`? The reason is that regex matching often wants |
| /// to take the surrounding context into account in order to handle |
| /// look-around (`^`, `$` and `\b`). |
| /// |
| /// # Errors |
| /// |
| /// This routine only errors if the search could not complete. For |
| /// DFA-based regexes, this only occurs in a non-default configuration |
| /// where quit bytes are used, Unicode word boundaries are heuristically |
| /// enabled or limits are set on the number of times the lazy DFA's cache |
| /// may be cleared. |
| /// |
| /// When a search cannot complete, callers cannot know whether a match |
| /// exists or not. |
| /// |
| /// The infallible (panics on error) version of this routine is |
| /// [`is_match_at`](Regex::is_match_at). |
| pub fn try_is_match_at( |
| &self, |
| haystack: &[u8], |
| start: usize, |
| end: usize, |
| ) -> Result<bool, MatchError> { |
| self.forward() |
| .find_earliest_fwd_at( |
| self.scanner().as_mut(), |
| None, |
| haystack, |
| start, |
| end, |
| ) |
| .map(|x| x.is_some()) |
| } |
| |
| /// Returns the first position at which a match is found. |
| /// |
| /// This routine stops scanning input in precisely the same circumstances |
| /// as `is_match`. The key difference is that this routine returns the |
| /// position at which it stopped scanning input if and only if a match |
| /// was found. If no match is found, then `None` is returned. |
| /// |
| /// # Searching a substring of the haystack |
| /// |
| /// Being an "at" search routine, this permits callers to search a |
| /// substring of `haystack` by specifying a range in `haystack`. |
| /// Why expose this as an API instead of just asking callers to use |
| /// `&input[start..end]`? The reason is that regex matching often wants |
| /// to take the surrounding context into account in order to handle |
| /// look-around (`^`, `$` and `\b`). |
| /// |
| /// This is useful when implementing an iterator over matches |
| /// within the same haystack, which cannot be done correctly by simply |
| /// providing a subslice of `haystack`. |
| /// |
| /// # Errors |
| /// |
| /// This routine only errors if the search could not complete. For |
| /// DFA-based regexes, this only occurs in a non-default configuration |
| /// where quit bytes are used or Unicode word boundaries are heuristically |
| /// enabled. |
| /// |
| /// When a search cannot complete, callers cannot know whether a match |
| /// exists or not. |
| /// |
| /// The infallible (panics on error) version of this routine is |
| /// [`find_earliest_at`](Regex::find_earliest_at). |
| pub fn try_find_earliest_at( |
| &self, |
| haystack: &[u8], |
| start: usize, |
| end: usize, |
| ) -> Result<Option<MultiMatch>, MatchError> { |
| self.try_find_earliest_at_imp( |
| self.scanner().as_mut(), |
| haystack, |
| start, |
| end, |
| ) |
| } |
| |
| /// The implementation of "earliest" searching, where a prefilter scanner |
| /// may be given. |
| fn try_find_earliest_at_imp( |
| &self, |
| pre: Option<&mut prefilter::Scanner>, |
| haystack: &[u8], |
| start: usize, |
| end: usize, |
| ) -> Result<Option<MultiMatch>, MatchError> { |
| // N.B. We use `&&A` here to call `Automaton` methods, which ensures |
| // that we always use the `impl Automaton for &A` for calling methods. |
| // Since this is the usual way that automata are used, this helps |
| // reduce the number of monomorphized copies of the search code. |
| let (fwd, rev) = (self.forward(), self.reverse()); |
| let end = match (&fwd) |
| .find_earliest_fwd_at(pre, None, haystack, start, end)? |
| { |
| None => return Ok(None), |
| Some(end) => end, |
| }; |
| // N.B. The only time we need to tell the reverse searcher the pattern |
| // to match is in the overlapping case, since it's ambiguous. In the |
| // leftmost case, I have tentatively convinced myself that it isn't |
| // necessary and the reverse search will always find the same pattern |
| // to match as the forward search. But I lack a rigorous proof. |
| let start = (&rev) |
| .find_earliest_rev_at(None, haystack, start, end.offset())? |
| .expect("reverse search must match if forward search does"); |
| assert_eq!( |
| start.pattern(), |
| end.pattern(), |
| "forward and reverse search must match same pattern" |
| ); |
| assert!(start.offset() <= end.offset()); |
| Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) |
| } |
| |
| /// Returns the start and end offset of the leftmost match. If no match |
| /// exists, then `None` is returned. |
| /// |
| /// # Searching a substring of the haystack |
| /// |
| /// Being an "at" search routine, this permits callers to search a |
| /// substring of `haystack` by specifying a range in `haystack`. |
| /// Why expose this as an API instead of just asking callers to use |
| /// `&input[start..end]`? The reason is that regex matching often wants |
| /// to take the surrounding context into account in order to handle |
| /// look-around (`^`, `$` and `\b`). |
| /// |
| /// This is useful when implementing an iterator over matches |
| /// within the same haystack, which cannot be done correctly by simply |
| /// providing a subslice of `haystack`. |
| /// |
| /// # Errors |
| /// |
| /// This routine only errors if the search could not complete. For |
| /// DFA-based regexes, this only occurs in a non-default configuration |
| /// where quit bytes are used or Unicode word boundaries are heuristically |
| /// enabled. |
| /// |
| /// When a search cannot complete, callers cannot know whether a match |
| /// exists or not. |
| /// |
| /// The infallible (panics on error) version of this routine is |
| /// [`find_leftmost_at`](Regex::find_leftmost_at). |
| pub fn try_find_leftmost_at( |
| &self, |
| haystack: &[u8], |
| start: usize, |
| end: usize, |
| ) -> Result<Option<MultiMatch>, MatchError> { |
| self.try_find_leftmost_at_imp( |
| self.scanner().as_mut(), |
| haystack, |
| start, |
| end, |
| ) |
| } |
| |
| /// The implementation of leftmost searching, where a prefilter scanner |
| /// may be given. |
| fn try_find_leftmost_at_imp( |
| &self, |
| scanner: Option<&mut prefilter::Scanner>, |
| haystack: &[u8], |
| start: usize, |
| end: usize, |
| ) -> Result<Option<MultiMatch>, MatchError> { |
| // N.B. We use `&&A` here to call `Automaton` methods, which ensures |
| // that we always use the `impl Automaton for &A` for calling methods. |
| // Since this is the usual way that automata are used, this helps |
| // reduce the number of monomorphized copies of the search code. |
| let (fwd, rev) = (self.forward(), self.reverse()); |
| let end = match (&fwd) |
| .find_leftmost_fwd_at(scanner, None, haystack, start, end)? |
| { |
| None => return Ok(None), |
| Some(end) => end, |
| }; |
| // N.B. The only time we need to tell the reverse searcher the pattern |
| // to match is in the overlapping case, since it's ambiguous. In the |
| // leftmost case, I have tentatively convinced myself that it isn't |
| // necessary and the reverse search will always find the same pattern |
| // to match as the forward search. But I lack a rigorous proof. Why not |
| // just provide the pattern anyway? Well, if it is needed, then leaving |
| // it out gives us a chance to find a witness. |
| let start = (&rev) |
| .find_leftmost_rev_at(None, haystack, start, end.offset())? |
| .expect("reverse search must match if forward search does"); |
| assert_eq!( |
| start.pattern(), |
| end.pattern(), |
| "forward and reverse search must match same pattern", |
| ); |
| assert!(start.offset() <= end.offset()); |
| Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) |
| } |
| |
| /// Search for the first overlapping match within a given range of |
| /// `haystack`. |
| /// |
| /// This routine is principally useful when searching for multiple patterns |
| /// on inputs where multiple patterns may match the same regions of text. |
| /// In particular, callers must preserve the automaton's search state from |
| /// prior calls so that the implementation knows where the last match |
| /// occurred and which pattern was reported. |
| /// |
| /// # Searching a substring of the haystack |
| /// |
| /// Being an "at" search routine, this permits callers to search a |
| /// substring of `haystack` by specifying a range in `haystack`. |
| /// Why expose this as an API instead of just asking callers to use |
| /// `&input[start..end]`? The reason is that regex matching often wants |
| /// to take the surrounding context into account in order to handle |
| /// look-around (`^`, `$` and `\b`). |
| /// |
| /// This is useful when implementing an iterator over matches |
| /// within the same haystack, which cannot be done correctly by simply |
| /// providing a subslice of `haystack`. |
| /// |
| /// # Errors |
| /// |
| /// This routine only errors if the search could not complete. For |
| /// DFA-based regexes, this only occurs in a non-default configuration |
| /// where quit bytes are used or Unicode word boundaries are heuristically |
| /// enabled. |
| /// |
| /// When a search cannot complete, callers cannot know whether a match |
| /// exists or not. |
| /// |
| /// The infallible (panics on error) version of this routine is |
| /// [`find_overlapping_at`](Regex::find_overlapping_at). |
| pub fn try_find_overlapping_at( |
| &self, |
| haystack: &[u8], |
| start: usize, |
| end: usize, |
| state: &mut OverlappingState, |
| ) -> Result<Option<MultiMatch>, MatchError> { |
| self.try_find_overlapping_at_imp( |
| self.scanner().as_mut(), |
| haystack, |
| start, |
| end, |
| state, |
| ) |
| } |
| |
| /// The implementation of overlapping search at a given range in |
| /// `haystack`, where `scanner` is a prefilter (if active) and `state` is |
| /// the current state of the search. |
| fn try_find_overlapping_at_imp( |
| &self, |
| scanner: Option<&mut prefilter::Scanner>, |
| haystack: &[u8], |
| start: usize, |
| end: usize, |
| state: &mut OverlappingState, |
| ) -> Result<Option<MultiMatch>, MatchError> { |
| // N.B. We use `&&A` here to call `Automaton` methods, which ensures |
| // that we always use the `impl Automaton for &A` for calling methods. |
| // Since this is the usual way that automata are used, this helps |
| // reduce the number of monomorphized copies of the search code. |
| let (fwd, rev) = (self.forward(), self.reverse()); |
| // TODO: Decide whether it's worth making this assert work. It doesn't |
| // work currently because 'has_starts_for_each_pattern' isn't on the |
| // Automaton trait. Without this assert, we still get a panic, but it's |
| // a bit more inscrutable. |
| // assert!( |
| // rev.has_starts_for_each_pattern(), |
| // "overlapping searches require that the reverse DFA is \ |
| // compiled with the 'starts_for_each_pattern' option", |
| // ); |
| let end = match (&fwd).find_overlapping_fwd_at( |
| scanner, None, haystack, start, end, state, |
| )? { |
| None => return Ok(None), |
| Some(end) => end, |
| }; |
| // Unlike the leftmost cases, the reverse overlapping search may match |
| // a different pattern than the forward search. See test failures when |
| // using `None` instead of `Some(end.pattern())` below. Thus, we must |
| // run our reverse search using the pattern that matched in the forward |
| // direction. |
| let start = (&rev) |
| .find_leftmost_rev_at( |
| Some(end.pattern()), |
| haystack, |
| 0, |
| end.offset(), |
| )? |
| .expect("reverse search must match if forward search does"); |
| assert!(start.offset() <= end.offset()); |
| assert_eq!(start.pattern(), end.pattern()); |
| Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) |
| } |
| } |
| |
| /// Non-search APIs for querying information about the regex and setting a |
| /// prefilter. |
| impl<A: Automaton, P: Prefilter> Regex<A, P> { |
| /// Attach the given prefilter to this regex. |
| pub fn with_prefilter<Q: Prefilter>(self, prefilter: Q) -> Regex<A, Q> { |
| Regex { |
| prefilter: Some(prefilter), |
| forward: self.forward, |
| reverse: self.reverse, |
| utf8: self.utf8, |
| } |
| } |
| |
| /// Remove any prefilter from this regex. |
| pub fn without_prefilter(self) -> Regex<A> { |
| Regex { |
| prefilter: None, |
| forward: self.forward, |
| reverse: self.reverse, |
| utf8: self.utf8, |
| } |
| } |
| |
| /// Return the underlying DFA responsible for forward matching. |
| /// |
| /// This is useful for accessing the underlying DFA and converting it to |
| /// some other format or size. See the [`Builder::build_from_dfas`] docs |
| /// for an example of where this might be useful. |
| pub fn forward(&self) -> &A { |
| &self.forward |
| } |
| |
| /// Return the underlying DFA responsible for reverse matching. |
| /// |
| /// This is useful for accessing the underlying DFA and converting it to |
| /// some other format or size. See the [`Builder::build_from_dfas`] docs |
| /// for an example of where this might be useful. |
| pub fn reverse(&self) -> &A { |
| &self.reverse |
| } |
| |
| /// Returns the total number of patterns matched by this regex. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
| /// |
| /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; |
| /// assert_eq!(3, re.pattern_count()); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn pattern_count(&self) -> usize { |
| assert_eq!( |
| self.forward().pattern_count(), |
| self.reverse().pattern_count() |
| ); |
| self.forward().pattern_count() |
| } |
| |
| /// Convenience function for returning this regex's prefilter as a trait |
| /// object. |
| /// |
| /// If this regex doesn't have a prefilter, then `None` is returned. |
| pub fn prefilter(&self) -> Option<&dyn Prefilter> { |
| match self.prefilter { |
| None => None, |
| Some(ref x) => Some(&*x), |
| } |
| } |
| |
| /// Convenience function for returning a prefilter scanner. |
| fn scanner(&self) -> Option<prefilter::Scanner> { |
| self.prefilter().map(prefilter::Scanner::new) |
| } |
| } |
| |
| /// An iterator over all non-overlapping earliest matches for a particular |
| /// infallible search. |
| /// |
| /// The iterator yields a [`MultiMatch`] value until no more matches could be |
| /// found. If the underlying search returns an error, then this panics. |
| /// |
| /// `A` is the type used to represent the underlying DFAs used by the regex, |
| /// while `P` is the type of prefilter used, if any. The lifetime variables are |
| /// as follows: |
| /// |
| /// * `'r` is the lifetime of the regular expression itself. |
| /// * `'t` is the lifetime of the text being searched. |
| #[derive(Clone, Debug)] |
| pub struct FindEarliestMatches<'r, 't, A, P>( |
| TryFindEarliestMatches<'r, 't, A, P>, |
| ); |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> { |
| fn new( |
| re: &'r Regex<A, P>, |
| text: &'t [u8], |
| ) -> FindEarliestMatches<'r, 't, A, P> { |
| FindEarliestMatches(TryFindEarliestMatches::new(re, text)) |
| } |
| } |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
| for FindEarliestMatches<'r, 't, A, P> |
| { |
| type Item = MultiMatch; |
| |
| fn next(&mut self) -> Option<MultiMatch> { |
| next_unwrap(self.0.next()) |
| } |
| } |
| |
| /// An iterator over all non-overlapping leftmost matches for a particular |
| /// infallible search. |
| /// |
| /// The iterator yields a [`MultiMatch`] value until no more matches could be |
| /// found. If the underlying search returns an error, then this panics. |
| /// |
| /// `A` is the type used to represent the underlying DFAs used by the regex, |
| /// while `P` is the type of prefilter used, if any. The lifetime variables are |
| /// as follows: |
| /// |
| /// * `'r` is the lifetime of the regular expression itself. |
| /// * `'t` is the lifetime of the text being searched. |
| #[derive(Clone, Debug)] |
| pub struct FindLeftmostMatches<'r, 't, A, P>( |
| TryFindLeftmostMatches<'r, 't, A, P>, |
| ); |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> { |
| fn new( |
| re: &'r Regex<A, P>, |
| text: &'t [u8], |
| ) -> FindLeftmostMatches<'r, 't, A, P> { |
| FindLeftmostMatches(TryFindLeftmostMatches::new(re, text)) |
| } |
| } |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
| for FindLeftmostMatches<'r, 't, A, P> |
| { |
| type Item = MultiMatch; |
| |
| fn next(&mut self) -> Option<MultiMatch> { |
| next_unwrap(self.0.next()) |
| } |
| } |
| |
| /// An iterator over all overlapping matches for a particular infallible |
| /// search. |
| /// |
| /// The iterator yields a [`MultiMatch`] value until no more matches could be |
| /// found. If the underlying search returns an error, then this panics. |
| /// |
| /// `A` is the type used to represent the underlying DFAs used by the regex, |
| /// while `P` is the type of prefilter used, if any. The lifetime variables are |
| /// as follows: |
| /// |
| /// * `'r` is the lifetime of the regular expression itself. |
| /// * `'t` is the lifetime of the text being searched. |
| #[derive(Clone, Debug)] |
| pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>( |
| TryFindOverlappingMatches<'r, 't, A, P>, |
| ); |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> { |
| fn new( |
| re: &'r Regex<A, P>, |
| text: &'t [u8], |
| ) -> FindOverlappingMatches<'r, 't, A, P> { |
| FindOverlappingMatches(TryFindOverlappingMatches::new(re, text)) |
| } |
| } |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
| for FindOverlappingMatches<'r, 't, A, P> |
| { |
| type Item = MultiMatch; |
| |
| fn next(&mut self) -> Option<MultiMatch> { |
| next_unwrap(self.0.next()) |
| } |
| } |
| |
| /// An iterator over all non-overlapping earliest matches for a particular |
| /// fallible search. |
| /// |
| /// The iterator yields a [`MultiMatch`] value until no more matches could be |
| /// found. |
| /// |
| /// `A` is the type used to represent the underlying DFAs used by the regex, |
| /// while `P` is the type of prefilter used, if any. The lifetime variables are |
| /// as follows: |
| /// |
| /// * `'r` is the lifetime of the regular expression itself. |
| /// * `'t` is the lifetime of the text being searched. |
| #[derive(Clone, Debug)] |
| pub struct TryFindEarliestMatches<'r, 't, A, P> { |
| re: &'r Regex<A, P>, |
| scanner: Option<prefilter::Scanner<'r>>, |
| text: &'t [u8], |
| last_end: usize, |
| last_match: Option<usize>, |
| } |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> { |
| fn new( |
| re: &'r Regex<A, P>, |
| text: &'t [u8], |
| ) -> TryFindEarliestMatches<'r, 't, A, P> { |
| let scanner = re.scanner(); |
| TryFindEarliestMatches { |
| re, |
| scanner, |
| text, |
| last_end: 0, |
| last_match: None, |
| } |
| } |
| } |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
| for TryFindEarliestMatches<'r, 't, A, P> |
| { |
| type Item = Result<MultiMatch, MatchError>; |
| |
| fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { |
| if self.last_end > self.text.len() { |
| return None; |
| } |
| let result = self.re.try_find_earliest_at_imp( |
| self.scanner.as_mut(), |
| self.text, |
| self.last_end, |
| self.text.len(), |
| ); |
| let m = match result { |
| Err(err) => return Some(Err(err)), |
| Ok(None) => return None, |
| Ok(Some(m)) => m, |
| }; |
| if m.is_empty() { |
| // This is an empty match. To ensure we make progress, start |
| // the next search at the smallest possible starting position |
| // of the next match following this one. |
| self.last_end = if self.re.utf8 { |
| crate::util::next_utf8(self.text, m.end()) |
| } else { |
| m.end() + 1 |
| }; |
| // Don't accept empty matches immediately following a match. |
| // Just move on to the next match. |
| if Some(m.end()) == self.last_match { |
| return self.next(); |
| } |
| } else { |
| self.last_end = m.end(); |
| } |
| self.last_match = Some(m.end()); |
| Some(Ok(m)) |
| } |
| } |
| |
| /// An iterator over all non-overlapping leftmost matches for a particular |
| /// fallible search. |
| /// |
| /// The iterator yields a [`MultiMatch`] value until no more matches could be |
| /// found. |
| /// |
| /// `A` is the type used to represent the underlying DFAs used by the regex, |
| /// while `P` is the type of prefilter used, if any. The lifetime variables are |
| /// as follows: |
| /// |
| /// * `'r` is the lifetime of the regular expression itself. |
| /// * `'t` is the lifetime of the text being searched. |
| #[derive(Clone, Debug)] |
| pub struct TryFindLeftmostMatches<'r, 't, A, P> { |
| re: &'r Regex<A, P>, |
| scanner: Option<prefilter::Scanner<'r>>, |
| text: &'t [u8], |
| last_end: usize, |
| last_match: Option<usize>, |
| } |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> { |
| fn new( |
| re: &'r Regex<A, P>, |
| text: &'t [u8], |
| ) -> TryFindLeftmostMatches<'r, 't, A, P> { |
| let scanner = re.scanner(); |
| TryFindLeftmostMatches { |
| re, |
| scanner, |
| text, |
| last_end: 0, |
| last_match: None, |
| } |
| } |
| } |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
| for TryFindLeftmostMatches<'r, 't, A, P> |
| { |
| type Item = Result<MultiMatch, MatchError>; |
| |
| fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { |
| if self.last_end > self.text.len() { |
| return None; |
| } |
| let result = self.re.try_find_leftmost_at_imp( |
| self.scanner.as_mut(), |
| self.text, |
| self.last_end, |
| self.text.len(), |
| ); |
| let m = match result { |
| Err(err) => return Some(Err(err)), |
| Ok(None) => return None, |
| Ok(Some(m)) => m, |
| }; |
| if m.is_empty() { |
| // This is an empty match. To ensure we make progress, start |
| // the next search at the smallest possible starting position |
| // of the next match following this one. |
| self.last_end = if self.re.utf8 { |
| crate::util::next_utf8(self.text, m.end()) |
| } else { |
| m.end() + 1 |
| }; |
| // Don't accept empty matches immediately following a match. |
| // Just move on to the next match. |
| if Some(m.end()) == self.last_match { |
| return self.next(); |
| } |
| } else { |
| self.last_end = m.end(); |
| } |
| self.last_match = Some(m.end()); |
| Some(Ok(m)) |
| } |
| } |
| |
| /// An iterator over all overlapping matches for a particular fallible search. |
| /// |
| /// The iterator yields a [`MultiMatch`] value until no more matches could be |
| /// found. |
| /// |
| /// `A` is the type used to represent the underlying DFAs used by the regex, |
| /// while `P` is the type of prefilter used, if any. The lifetime variables are |
| /// as follows: |
| /// |
| /// * `'r` is the lifetime of the regular expression itself. |
| /// * `'t` is the lifetime of the text being searched. |
| #[derive(Clone, Debug)] |
| pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> { |
| re: &'r Regex<A, P>, |
| scanner: Option<prefilter::Scanner<'r>>, |
| text: &'t [u8], |
| last_end: usize, |
| state: OverlappingState, |
| } |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> |
| TryFindOverlappingMatches<'r, 't, A, P> |
| { |
| fn new( |
| re: &'r Regex<A, P>, |
| text: &'t [u8], |
| ) -> TryFindOverlappingMatches<'r, 't, A, P> { |
| let scanner = re.scanner(); |
| TryFindOverlappingMatches { |
| re, |
| scanner, |
| text, |
| last_end: 0, |
| state: OverlappingState::start(), |
| } |
| } |
| } |
| |
| impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
| for TryFindOverlappingMatches<'r, 't, A, P> |
| { |
| type Item = Result<MultiMatch, MatchError>; |
| |
| fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { |
| if self.last_end > self.text.len() { |
| return None; |
| } |
| let result = self.re.try_find_overlapping_at_imp( |
| self.scanner.as_mut(), |
| self.text, |
| self.last_end, |
| self.text.len(), |
| &mut self.state, |
| ); |
| let m = match result { |
| Err(err) => return Some(Err(err)), |
| Ok(None) => return None, |
| Ok(Some(m)) => m, |
| }; |
| // Unlike the non-overlapping case, we're OK with empty matches at this |
| // level. In particular, the overlapping search algorithm is itself |
| // responsible for ensuring that progress is always made. |
| self.last_end = m.end(); |
| Some(Ok(m)) |
| } |
| } |
| |
| /// The configuration used for compiling a DFA-backed regex. |
| /// |
| /// A regex configuration is a simple data object that is typically used with |
| /// [`Builder::configure`]. |
| #[cfg(feature = "alloc")] |
| #[derive(Clone, Copy, Debug, Default)] |
| pub struct Config { |
| utf8: Option<bool>, |
| } |
| |
| #[cfg(feature = "alloc")] |
| impl Config { |
| /// Return a new default regex compiler configuration. |
| pub fn new() -> Config { |
| Config::default() |
| } |
| |
| /// Whether to enable UTF-8 mode or not. |
| /// |
| /// When UTF-8 mode is enabled (the default) and an empty match is seen, |
| /// the iterators on [`Regex`] will always start the next search at the |
| /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 |
| /// mode is disabled, such searches are begun at the next byte offset. |
| /// |
| /// If this mode is enabled and invalid UTF-8 is given to search, then |
| /// behavior is unspecified. |
| /// |
| /// Generally speaking, one should enable this when |
| /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) |
| /// and |
| /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) |
| /// are enabled, and disable it otherwise. |
| /// |
| /// # Example |
| /// |
| /// This example demonstrates the differences between when this option is |
| /// enabled and disabled. The differences only arise when the regex can |
| /// return matches of length zero. |
| /// |
| /// In this first snippet, we show the results when UTF-8 mode is disabled. |
| /// |
| /// ``` |
| /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
| /// |
| /// let re = Regex::builder() |
| /// .configure(Regex::config().utf8(false)) |
| /// .build(r"")?; |
| /// let haystack = "a☃z".as_bytes(); |
| /// let mut it = re.find_leftmost_iter(haystack); |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// And in this snippet, we execute the same search on the same haystack, |
| /// but with UTF-8 mode enabled. Notice that byte offsets that would |
| /// otherwise split the encoding of `☃` are not returned. |
| /// |
| /// ``` |
| /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
| /// |
| /// let re = Regex::builder() |
| /// .configure(Regex::config().utf8(true)) |
| /// .build(r"")?; |
| /// let haystack = "a☃z".as_bytes(); |
| /// let mut it = re.find_leftmost_iter(haystack); |
| /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); |
| /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn utf8(mut self, yes: bool) -> Config { |
| self.utf8 = Some(yes); |
| self |
| } |
| |
| /// Returns true if and only if this configuration has UTF-8 mode enabled. |
| /// |
| /// When UTF-8 mode is enabled and an empty match is seen, the iterators on |
| /// [`Regex`] will always start the next search at the next UTF-8 encoded |
| /// codepoint. When UTF-8 mode is disabled, such searches are begun at the |
| /// next byte offset. |
| pub fn get_utf8(&self) -> bool { |
| self.utf8.unwrap_or(true) |
| } |
| |
| /// Overwrite the default configuration such that the options in `o` are |
| /// always used. If an option in `o` is not set, then the corresponding |
| /// option in `self` is used. If it's not set in `self` either, then it |
| /// remains not set. |
| pub(crate) fn overwrite(self, o: Config) -> Config { |
| Config { utf8: o.utf8.or(self.utf8) } |
| } |
| } |
| |
| /// A builder for a regex based on deterministic finite automatons. |
| /// |
| /// This builder permits configuring options for the syntax of a pattern, the |
| /// NFA construction, the DFA construction and finally the regex searching |
| /// itself. This builder is different from a general purpose regex builder in |
| /// that it permits fine grain configuration of the construction process. The |
| /// trade off for this is complexity, and the possibility of setting a |
| /// configuration that might not make sense. For example, there are three |
| /// different UTF-8 modes: |
| /// |
| /// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the |
| /// pattern itself can contain sub-expressions that match invalid UTF-8. |
| /// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) |
| /// controls whether the implicit unanchored prefix added to the NFA can |
| /// match through invalid UTF-8 or not. |
| /// * [`Config::utf8`] controls how the regex iterators themselves advance |
| /// the starting position of the next search when a match with zero length is |
| /// found. |
| /// |
| /// Generally speaking, callers will want to either enable all of these or |
| /// disable all of these. |
| /// |
| /// Internally, building a regex requires building two DFAs, where one is |
| /// responsible for finding the end of a match and the other is responsible |
| /// for finding the start of a match. If you only need to detect whether |
| /// something matched, or only the end of a match, then you should use a |
| /// [`dense::Builder`] to construct a single DFA, which is cheaper than |
| /// building two DFAs. |
| /// |
| /// # Build methods |
| /// |
| /// This builder has a few "build" methods. In general, it's the result of |
| /// combining the following parameters: |
| /// |
| /// * Building one or many regexes. |
| /// * Building a regex with dense or sparse DFAs. |
| /// |
| /// The simplest "build" method is [`Builder::build`]. It accepts a single |
| /// pattern and builds a dense DFA using `usize` for the state identifier |
| /// representation. |
| /// |
| /// The most general "build" method is [`Builder::build_many`], which permits |
| /// building a regex that searches for multiple patterns simultaneously while |
| /// using a specific state identifier representation. |
| /// |
| /// The most flexible "build" method, but hardest to use, is |
| /// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is |
| /// just a pair of DFAs, and this method allows you to specify those DFAs |
| /// exactly. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to disable UTF-8 mode in the syntax, the NFA and |
| /// the regex itself. This is generally what you want for matching on |
| /// arbitrary bytes. |
| /// |
| /// ``` |
| /// use regex_automata::{ |
| /// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig |
| /// }; |
| /// |
| /// let re = Regex::builder() |
| /// .configure(Regex::config().utf8(false)) |
| /// .syntax(SyntaxConfig::new().utf8(false)) |
| /// .thompson(thompson::Config::new().utf8(false)) |
| /// .build(r"foo(?-u:[^b])ar.*")?; |
| /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; |
| /// let expected = Some(MultiMatch::must(0, 1, 9)); |
| /// let got = re.find_leftmost(haystack); |
| /// assert_eq!(expected, got); |
| /// // Notice that `(?-u:[^b])` matches invalid UTF-8, |
| /// // but the subsequent `.*` does not! Disabling UTF-8 |
| /// // on the syntax permits this. Notice also that the |
| /// // search was unanchored and skipped over invalid UTF-8. |
| /// // Disabling UTF-8 on the Thompson NFA permits this. |
| /// // |
| /// // N.B. This example does not show the impact of |
| /// // disabling UTF-8 mode on Config, since that |
| /// // only impacts regexes that can produce matches of |
| /// // length 0. |
| /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); |
| /// |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| #[cfg(feature = "alloc")] |
| #[derive(Clone, Debug)] |
| pub struct Builder { |
| config: Config, |
| dfa: dense::Builder, |
| } |
| |
| #[cfg(feature = "alloc")] |
| impl Builder { |
| /// Create a new regex builder with the default configuration. |
| pub fn new() -> Builder { |
| Builder { config: Config::default(), dfa: dense::Builder::new() } |
| } |
| |
| /// Build a regex from the given pattern. |
| /// |
| /// If there was a problem parsing or compiling the pattern, then an error |
| /// is returned. |
| pub fn build(&self, pattern: &str) -> Result<Regex, Error> { |
| self.build_many(&[pattern]) |
| } |
| |
| /// Build a regex from the given pattern using sparse DFAs. |
| /// |
| /// If there was a problem parsing or compiling the pattern, then an error |
| /// is returned. |
| pub fn build_sparse( |
| &self, |
| pattern: &str, |
| ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
| self.build_many_sparse(&[pattern]) |
| } |
| |
| /// Build a regex from the given patterns. |
| pub fn build_many<P: AsRef<str>>( |
| &self, |
| patterns: &[P], |
| ) -> Result<Regex, Error> { |
| let forward = self.dfa.build_many(patterns)?; |
| let reverse = self |
| .dfa |
| .clone() |
| .configure( |
| dense::Config::new() |
| .anchored(true) |
| .match_kind(MatchKind::All) |
| .starts_for_each_pattern(true), |
| ) |
| .thompson(thompson::Config::new().reverse(true)) |
| .build_many(patterns)?; |
| Ok(self.build_from_dfas(forward, reverse)) |
| } |
| |
| /// Build a sparse regex from the given patterns. |
| pub fn build_many_sparse<P: AsRef<str>>( |
| &self, |
| patterns: &[P], |
| ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
| let re = self.build_many(patterns)?; |
| let forward = re.forward().to_sparse()?; |
| let reverse = re.reverse().to_sparse()?; |
| Ok(self.build_from_dfas(forward, reverse)) |
| } |
| |
| /// Build a regex from its component forward and reverse DFAs. |
| /// |
| /// This is useful when deserializing a regex from some arbitrary |
| /// memory region. This is also useful for building regexes from other |
| /// types of DFAs. |
| /// |
| /// If you're building the DFAs from scratch instead of building new DFAs |
| /// from other DFAs, then you'll need to make sure that the reverse DFA is |
| /// configured correctly to match the intended semantics. Namely: |
| /// |
| /// * It should be anchored. |
| /// * It should use [`MatchKind::All`] semantics. |
| /// * It should match in reverse. |
| /// * It should have anchored start states compiled for each pattern. |
| /// * Otherwise, its configuration should match the forward DFA. |
| /// |
| /// If these conditions are satisfied, then behavior of searches is |
| /// unspecified. |
| /// |
| /// Note that when using this constructor, only the configuration from |
| /// [`Config`] is applied. The only configuration settings on this builder |
| /// only apply when the builder owns the construction of the DFAs |
| /// themselves. |
| /// |
| /// # Example |
| /// |
| /// This example is a bit a contrived. The usual use of these methods |
| /// would involve serializing `initial_re` somewhere and then deserializing |
| /// it later to build a regex. But in this case, we do everything in |
| /// memory. |
| /// |
| /// ``` |
| /// use regex_automata::dfa::regex::Regex; |
| /// |
| /// let initial_re = Regex::new("foo[0-9]+")?; |
| /// assert_eq!(true, initial_re.is_match(b"foo123")); |
| /// |
| /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); |
| /// let re = Regex::builder().build_from_dfas(fwd, rev); |
| /// assert_eq!(true, re.is_match(b"foo123")); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// This example shows how to build a `Regex` that uses sparse DFAs instead |
| /// of dense DFAs without using one of the convenience `build_sparse` |
| /// routines: |
| /// |
| /// ``` |
| /// use regex_automata::dfa::regex::Regex; |
| /// |
| /// let initial_re = Regex::new("foo[0-9]+")?; |
| /// assert_eq!(true, initial_re.is_match(b"foo123")); |
| /// |
| /// let fwd = initial_re.forward().to_sparse()?; |
| /// let rev = initial_re.reverse().to_sparse()?; |
| /// let re = Regex::builder().build_from_dfas(fwd, rev); |
| /// assert_eq!(true, re.is_match(b"foo123")); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| pub fn build_from_dfas<A: Automaton>( |
| &self, |
| forward: A, |
| reverse: A, |
| ) -> Regex<A> { |
| let utf8 = self.config.get_utf8(); |
| Regex { prefilter: None, forward, reverse, utf8 } |
| } |
| |
| /// Apply the given regex configuration options to this builder. |
| pub fn configure(&mut self, config: Config) -> &mut Builder { |
| self.config = self.config.overwrite(config); |
| self |
| } |
| |
| /// Set the syntax configuration for this builder using |
| /// [`SyntaxConfig`](crate::SyntaxConfig). |
| /// |
| /// This permits setting things like case insensitivity, Unicode and multi |
| /// line mode. |
| pub fn syntax( |
| &mut self, |
| config: crate::util::syntax::SyntaxConfig, |
| ) -> &mut Builder { |
| self.dfa.syntax(config); |
| self |
| } |
| |
| /// Set the Thompson NFA configuration for this builder using |
| /// [`nfa::thompson::Config`](thompson::Config). |
| /// |
| /// This permits setting things like whether additional time should be |
| /// spent shrinking the size of the NFA. |
| pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { |
| self.dfa.thompson(config); |
| self |
| } |
| |
| /// Set the dense DFA compilation configuration for this builder using |
| /// [`dense::Config`](dense::Config). |
| /// |
| /// This permits setting things like whether the underlying DFAs should |
| /// be minimized. |
| pub fn dense(&mut self, config: dense::Config) -> &mut Builder { |
| self.dfa.configure(config); |
| self |
| } |
| } |
| |
| #[cfg(feature = "alloc")] |
| impl Default for Builder { |
| fn default() -> Builder { |
| Builder::new() |
| } |
| } |
| |
| #[inline(always)] |
| fn next_unwrap( |
| item: Option<Result<MultiMatch, MatchError>>, |
| ) -> Option<MultiMatch> { |
| match item { |
| None => None, |
| Some(Ok(m)) => Some(m), |
| Some(Err(err)) => panic!( |
| "unexpected regex search error: {}\n\ |
| to handle search errors, use try_ methods", |
| err, |
| ), |
| } |
| } |