| use crate::util::{ |
| prefilter::PrefilterI, |
| search::{MatchKind, Span}, |
| }; |
| |
| #[derive(Clone, Debug)] |
| pub(crate) struct AhoCorasick { |
| #[cfg(not(feature = "perf-literal-multisubstring"))] |
| _unused: (), |
| #[cfg(feature = "perf-literal-multisubstring")] |
| ac: aho_corasick::AhoCorasick, |
| } |
| |
| impl AhoCorasick { |
| pub(crate) fn new<B: AsRef<[u8]>>( |
| kind: MatchKind, |
| needles: &[B], |
| ) -> Option<AhoCorasick> { |
| #[cfg(not(feature = "perf-literal-multisubstring"))] |
| { |
| None |
| } |
| #[cfg(feature = "perf-literal-multisubstring")] |
| { |
| // We used to use `aho_corasick::MatchKind::Standard` here when |
| // `kind` was `MatchKind::All`, but this is not correct. The |
| // "standard" Aho-Corasick match semantics are to report a match |
| // immediately as soon as it is seen, but `All` isn't like that. |
| // In particular, with "standard" semantics, given the needles |
| // "abc" and "b" and the haystack "abc," it would report a match |
| // at offset 1 before a match at offset 0. This is never what we |
| // want in the context of the regex engine, regardless of whether |
| // we have leftmost-first or 'all' semantics. Namely, we always |
| // want the leftmost match. |
| let ac_match_kind = match kind { |
| MatchKind::LeftmostFirst | MatchKind::All => { |
| aho_corasick::MatchKind::LeftmostFirst |
| } |
| }; |
| // This is kind of just an arbitrary number, but basically, if we |
| // have a small enough set of literals, then we try to use the VERY |
| // memory hungry DFA. Otherwise, we whimp out and use an NFA. The |
| // upshot is that the NFA is quite lean and decently fast. Faster |
| // than a naive Aho-Corasick NFA anyway. |
| let ac_kind = if needles.len() <= 500 { |
| aho_corasick::AhoCorasickKind::DFA |
| } else { |
| aho_corasick::AhoCorasickKind::ContiguousNFA |
| }; |
| let result = aho_corasick::AhoCorasick::builder() |
| .kind(Some(ac_kind)) |
| .match_kind(ac_match_kind) |
| .start_kind(aho_corasick::StartKind::Both) |
| // We try to handle all of the prefilter cases in the super |
| // module, and only use Aho-Corasick for the actual automaton. |
| // The aho-corasick crate does have some extra prefilters, |
| // namely, looking for rare bytes to feed to memchr{,2,3} |
| // instead of just the first byte. If we end up wanting |
| // those---and they are somewhat tricky to implement---then |
| // we could port them to this crate. |
| // |
| // The main reason for doing things this way is so we have a |
| // complete and easy to understand picture of which prefilters |
| // are available and how they work. Otherwise it seems too |
| // easy to get into a situation where we have a prefilter |
| // layered on top of prefilter, and that might have unintended |
| // consequences. |
| .prefilter(false) |
| .build(needles); |
| let ac = match result { |
| Ok(ac) => ac, |
| Err(_err) => { |
| debug!("aho-corasick prefilter failed to build: {}", _err); |
| return None; |
| } |
| }; |
| Some(AhoCorasick { ac }) |
| } |
| } |
| } |
| |
| impl PrefilterI for AhoCorasick { |
| fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { |
| #[cfg(not(feature = "perf-literal-multisubstring"))] |
| { |
| unreachable!() |
| } |
| #[cfg(feature = "perf-literal-multisubstring")] |
| { |
| let input = |
| aho_corasick::Input::new(haystack).span(span.start..span.end); |
| self.ac |
| .find(input) |
| .map(|m| Span { start: m.start(), end: m.end() }) |
| } |
| } |
| |
| fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { |
| #[cfg(not(feature = "perf-literal-multisubstring"))] |
| { |
| unreachable!() |
| } |
| #[cfg(feature = "perf-literal-multisubstring")] |
| { |
| let input = aho_corasick::Input::new(haystack) |
| .anchored(aho_corasick::Anchored::Yes) |
| .span(span.start..span.end); |
| self.ac |
| .find(input) |
| .map(|m| Span { start: m.start(), end: m.end() }) |
| } |
| } |
| |
| fn memory_usage(&self) -> usize { |
| #[cfg(not(feature = "perf-literal-multisubstring"))] |
| { |
| unreachable!() |
| } |
| #[cfg(feature = "perf-literal-multisubstring")] |
| { |
| self.ac.memory_usage() |
| } |
| } |
| |
| fn is_fast(&self) -> bool { |
| #[cfg(not(feature = "perf-literal-multisubstring"))] |
| { |
| unreachable!() |
| } |
| #[cfg(feature = "perf-literal-multisubstring")] |
| { |
| // Aho-Corasick is never considered "fast" because it's never |
| // going to be even close to an order of magnitude faster than the |
| // regex engine itself (assuming a DFA is used). In fact, it is |
| // usually slower. The magic of Aho-Corasick is that it can search |
| // a *large* number of literals with a relatively small amount of |
| // memory. The regex engines are far more wasteful. |
| // |
| // Aho-Corasick may be "fast" when the regex engine corresponds |
| // to, say, the PikeVM. That happens when the lazy DFA couldn't be |
| // built or used for some reason. But in these cases, the regex |
| // itself is likely quite big and we're probably hosed no matter |
| // what we do. (In this case, the best bet is for the caller to |
| // increase some of the memory limits on the hybrid cache capacity |
| // and hope that's enough.) |
| false |
| } |
| } |
| } |