blob: 50cce827ee2822a140306c5f6b6f79948f7b7d67 [file] [log] [blame]
use crate::util::{
prefilter::PrefilterI,
search::{MatchKind, Span},
};
#[derive(Clone, Debug)]
pub(crate) struct AhoCorasick {
#[cfg(not(feature = "perf-literal-multisubstring"))]
_unused: (),
#[cfg(feature = "perf-literal-multisubstring")]
ac: aho_corasick::AhoCorasick,
}
impl AhoCorasick {
pub(crate) fn new<B: AsRef<[u8]>>(
kind: MatchKind,
needles: &[B],
) -> Option<AhoCorasick> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
None
}
#[cfg(feature = "perf-literal-multisubstring")]
{
// We used to use `aho_corasick::MatchKind::Standard` here when
// `kind` was `MatchKind::All`, but this is not correct. The
// "standard" Aho-Corasick match semantics are to report a match
// immediately as soon as it is seen, but `All` isn't like that.
// In particular, with "standard" semantics, given the needles
// "abc" and "b" and the haystack "abc," it would report a match
// at offset 1 before a match at offset 0. This is never what we
// want in the context of the regex engine, regardless of whether
// we have leftmost-first or 'all' semantics. Namely, we always
// want the leftmost match.
let ac_match_kind = match kind {
MatchKind::LeftmostFirst | MatchKind::All => {
aho_corasick::MatchKind::LeftmostFirst
}
};
// This is kind of just an arbitrary number, but basically, if we
// have a small enough set of literals, then we try to use the VERY
// memory hungry DFA. Otherwise, we whimp out and use an NFA. The
// upshot is that the NFA is quite lean and decently fast. Faster
// than a naive Aho-Corasick NFA anyway.
let ac_kind = if needles.len() <= 500 {
aho_corasick::AhoCorasickKind::DFA
} else {
aho_corasick::AhoCorasickKind::ContiguousNFA
};
let result = aho_corasick::AhoCorasick::builder()
.kind(Some(ac_kind))
.match_kind(ac_match_kind)
.start_kind(aho_corasick::StartKind::Both)
// We try to handle all of the prefilter cases in the super
// module, and only use Aho-Corasick for the actual automaton.
// The aho-corasick crate does have some extra prefilters,
// namely, looking for rare bytes to feed to memchr{,2,3}
// instead of just the first byte. If we end up wanting
// those---and they are somewhat tricky to implement---then
// we could port them to this crate.
//
// The main reason for doing things this way is so we have a
// complete and easy to understand picture of which prefilters
// are available and how they work. Otherwise it seems too
// easy to get into a situation where we have a prefilter
// layered on top of prefilter, and that might have unintended
// consequences.
.prefilter(false)
.build(needles);
let ac = match result {
Ok(ac) => ac,
Err(_err) => {
debug!("aho-corasick prefilter failed to build: {}", _err);
return None;
}
};
Some(AhoCorasick { ac })
}
}
}
impl PrefilterI for AhoCorasick {
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
let input =
aho_corasick::Input::new(haystack).span(span.start..span.end);
self.ac
.find(input)
.map(|m| Span { start: m.start(), end: m.end() })
}
}
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
let input = aho_corasick::Input::new(haystack)
.anchored(aho_corasick::Anchored::Yes)
.span(span.start..span.end);
self.ac
.find(input)
.map(|m| Span { start: m.start(), end: m.end() })
}
}
fn memory_usage(&self) -> usize {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
self.ac.memory_usage()
}
}
fn is_fast(&self) -> bool {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
// Aho-Corasick is never considered "fast" because it's never
// going to be even close to an order of magnitude faster than the
// regex engine itself (assuming a DFA is used). In fact, it is
// usually slower. The magic of Aho-Corasick is that it can search
// a *large* number of literals with a relatively small amount of
// memory. The regex engines are far more wasteful.
//
// Aho-Corasick may be "fast" when the regex engine corresponds
// to, say, the PikeVM. That happens when the lazy DFA couldn't be
// built or used for some reason. But in these cases, the regex
// itself is likely quite big and we're probably hosed no matter
// what we do. (In this case, the best bet is for the caller to
// increase some of the memory limits on the hybrid cache capacity
// and hope that's enough.)
false
}
}
}