vendor/regex-automata-0.4.3/src/util/prefilter/aho_corasick.rs - toolchain/rustc - Git at Google

 use crate::util::{
     prefilter::PrefilterI,
     search::{MatchKind, Span},
 };

 #[derive(Clone, Debug)]
 pub(crate) struct AhoCorasick {
     #[cfg(not(feature = "perf-literal-multisubstring"))]
     _unused: (),
     #[cfg(feature = "perf-literal-multisubstring")]
     ac: aho_corasick::AhoCorasick,
 }

 impl AhoCorasick {
     pub(crate) fn new<B: AsRef<[u8]>>(
         kind: MatchKind,
         needles: &[B],
     ) -> Option<AhoCorasick> {
         #[cfg(not(feature = "perf-literal-multisubstring"))]
         {
             None
         }
         #[cfg(feature = "perf-literal-multisubstring")]
         {
             // We used to use `aho_corasick::MatchKind::Standard` here when
             // `kind` was `MatchKind::All`, but this is not correct. The
             // "standard" Aho-Corasick match semantics are to report a match
             // immediately as soon as it is seen, but `All` isn't like that.
             // In particular, with "standard" semantics, given the needles
             // "abc" and "b" and the haystack "abc," it would report a match
             // at offset 1 before a match at offset 0. This is never what we
             // want in the context of the regex engine, regardless of whether
             // we have leftmost-first or 'all' semantics. Namely, we always
             // want the leftmost match.
             let ac_match_kind = match kind {
                 MatchKind::LeftmostFirst | MatchKind::All => {
                     aho_corasick::MatchKind::LeftmostFirst
                 }
             };
             // This is kind of just an arbitrary number, but basically, if we
             // have a small enough set of literals, then we try to use the VERY
             // memory hungry DFA. Otherwise, we whimp out and use an NFA. The
             // upshot is that the NFA is quite lean and decently fast. Faster
             // than a naive Aho-Corasick NFA anyway.
             let ac_kind = if needles.len() <= 500 {
                 aho_corasick::AhoCorasickKind::DFA
             } else {
                 aho_corasick::AhoCorasickKind::ContiguousNFA
             };
             let result = aho_corasick::AhoCorasick::builder()
                 .kind(Some(ac_kind))
                 .match_kind(ac_match_kind)
                 .start_kind(aho_corasick::StartKind::Both)
                 // We try to handle all of the prefilter cases in the super
                 // module, and only use Aho-Corasick for the actual automaton.
                 // The aho-corasick crate does have some extra prefilters,
                 // namely, looking for rare bytes to feed to memchr{,2,3}
                 // instead of just the first byte. If we end up wanting
                 // those---and they are somewhat tricky to implement---then
                 // we could port them to this crate.
                 //
                 // The main reason for doing things this way is so we have a
                 // complete and easy to understand picture of which prefilters
                 // are available and how they work. Otherwise it seems too
                 // easy to get into a situation where we have a prefilter
                 // layered on top of prefilter, and that might have unintended
                 // consequences.
                 .prefilter(false)
                 .build(needles);
             let ac = match result {
                 Ok(ac) => ac,
                 Err(_err) => {
                     debug!("aho-corasick prefilter failed to build: {}", _err);
                     return None;
                 }
             };
             Some(AhoCorasick { ac })
         }
     }
 }

 impl PrefilterI for AhoCorasick {
     fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
         #[cfg(not(feature = "perf-literal-multisubstring"))]
         {
             unreachable!()
         }
         #[cfg(feature = "perf-literal-multisubstring")]
         {
             let input =
                 aho_corasick::Input::new(haystack).span(span.start..span.end);
             self.ac
                 .find(input)
                 .map(|m| Span { start: m.start(), end: m.end() })
         }
     }

     fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
         #[cfg(not(feature = "perf-literal-multisubstring"))]
         {
             unreachable!()
         }
         #[cfg(feature = "perf-literal-multisubstring")]
         {
             let input = aho_corasick::Input::new(haystack)
                 .anchored(aho_corasick::Anchored::Yes)
                 .span(span.start..span.end);
             self.ac
                 .find(input)
                 .map(|m| Span { start: m.start(), end: m.end() })
         }
     }

     fn memory_usage(&self) -> usize {
         #[cfg(not(feature = "perf-literal-multisubstring"))]
         {
             unreachable!()
         }
         #[cfg(feature = "perf-literal-multisubstring")]
         {
             self.ac.memory_usage()
         }
     }

     fn is_fast(&self) -> bool {
         #[cfg(not(feature = "perf-literal-multisubstring"))]
         {
             unreachable!()
         }
         #[cfg(feature = "perf-literal-multisubstring")]
         {
             // Aho-Corasick is never considered "fast" because it's never
             // going to be even close to an order of magnitude faster than the
             // regex engine itself (assuming a DFA is used). In fact, it is
             // usually slower. The magic of Aho-Corasick is that it can search
             // a *large* number of literals with a relatively small amount of
             // memory. The regex engines are far more wasteful.
             //
             // Aho-Corasick may be "fast" when the regex engine corresponds
             // to, say, the PikeVM. That happens when the lazy DFA couldn't be
             // built or used for some reason. But in these cases, the regex
             // itself is likely quite big and we're probably hosed no matter
             // what we do. (In this case, the best bet is for the caller to
             // increase some of the memory limits on the hybrid cache capacity
             // and hope that's enough.)
             false
         }
     }
 }
	use crate::util::{
	prefilter::PrefilterI,
	search::{MatchKind, Span},
	};

	#[derive(Clone, Debug)]
	pub(crate) struct AhoCorasick {
	#[cfg(not(feature = "perf-literal-multisubstring"))]
	_unused: (),
	#[cfg(feature = "perf-literal-multisubstring")]
	ac: aho_corasick::AhoCorasick,
	}

	impl AhoCorasick {
	pub(crate) fn new<B: AsRef<[u8]>>(
	kind: MatchKind,
	needles: &[B],
	) -> Option<AhoCorasick> {
	#[cfg(not(feature = "perf-literal-multisubstring"))]
	{
	None
	}
	#[cfg(feature = "perf-literal-multisubstring")]
	{
	// We used to use `aho_corasick::MatchKind::Standard` here when
	// `kind` was `MatchKind::All`, but this is not correct. The
	// "standard" Aho-Corasick match semantics are to report a match
	// immediately as soon as it is seen, but `All` isn't like that.
	// In particular, with "standard" semantics, given the needles
	// "abc" and "b" and the haystack "abc," it would report a match
	// at offset 1 before a match at offset 0. This is never what we
	// want in the context of the regex engine, regardless of whether
	// we have leftmost-first or 'all' semantics. Namely, we always
	// want the leftmost match.
	let ac_match_kind = match kind {
	MatchKind::LeftmostFirst \| MatchKind::All => {
	aho_corasick::MatchKind::LeftmostFirst
	}
	};
	// This is kind of just an arbitrary number, but basically, if we
	// have a small enough set of literals, then we try to use the VERY
	// memory hungry DFA. Otherwise, we whimp out and use an NFA. The
	// upshot is that the NFA is quite lean and decently fast. Faster
	// than a naive Aho-Corasick NFA anyway.
	let ac_kind = if needles.len() <= 500 {
	aho_corasick::AhoCorasickKind::DFA
	} else {
	aho_corasick::AhoCorasickKind::ContiguousNFA
	};
	let result = aho_corasick::AhoCorasick::builder()
	.kind(Some(ac_kind))
	.match_kind(ac_match_kind)
	.start_kind(aho_corasick::StartKind::Both)
	// We try to handle all of the prefilter cases in the super
	// module, and only use Aho-Corasick for the actual automaton.
	// The aho-corasick crate does have some extra prefilters,
	// namely, looking for rare bytes to feed to memchr{,2,3}
	// instead of just the first byte. If we end up wanting
	// those---and they are somewhat tricky to implement---then
	// we could port them to this crate.
	//
	// The main reason for doing things this way is so we have a
	// complete and easy to understand picture of which prefilters
	// are available and how they work. Otherwise it seems too
	// easy to get into a situation where we have a prefilter
	// layered on top of prefilter, and that might have unintended
	// consequences.
	.prefilter(false)
	.build(needles);
	let ac = match result {
	Ok(ac) => ac,
	Err(_err) => {
	debug!("aho-corasick prefilter failed to build: {}", _err);
	return None;
	}
	};
	Some(AhoCorasick { ac })
	}
	}
	}

	impl PrefilterI for AhoCorasick {
	fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
	#[cfg(not(feature = "perf-literal-multisubstring"))]
	{
	unreachable!()
	}
	#[cfg(feature = "perf-literal-multisubstring")]
	{
	let input =
	aho_corasick::Input::new(haystack).span(span.start..span.end);
	self.ac
	.find(input)
	.map(\|m\| Span { start: m.start(), end: m.end() })
	}
	}

	fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
	#[cfg(not(feature = "perf-literal-multisubstring"))]
	{
	unreachable!()
	}
	#[cfg(feature = "perf-literal-multisubstring")]
	{
	let input = aho_corasick::Input::new(haystack)
	.anchored(aho_corasick::Anchored::Yes)
	.span(span.start..span.end);
	self.ac
	.find(input)
	.map(\|m\| Span { start: m.start(), end: m.end() })
	}
	}

	fn memory_usage(&self) -> usize {
	#[cfg(not(feature = "perf-literal-multisubstring"))]
	{
	unreachable!()
	}
	#[cfg(feature = "perf-literal-multisubstring")]
	{
	self.ac.memory_usage()
	}
	}

	fn is_fast(&self) -> bool {
	#[cfg(not(feature = "perf-literal-multisubstring"))]
	{
	unreachable!()
	}
	#[cfg(feature = "perf-literal-multisubstring")]
	{
	// Aho-Corasick is never considered "fast" because it's never
	// going to be even close to an order of magnitude faster than the
	// regex engine itself (assuming a DFA is used). In fact, it is
	// usually slower. The magic of Aho-Corasick is that it can search
	// a large number of literals with a relatively small amount of
	// memory. The regex engines are far more wasteful.
	//
	// Aho-Corasick may be "fast" when the regex engine corresponds
	// to, say, the PikeVM. That happens when the lazy DFA couldn't be
	// built or used for some reason. But in these cases, the regex
	// itself is likely quite big and we're probably hosed no matter
	// what we do. (In this case, the best bet is for the caller to
	// increase some of the memory limits on the hybrid cache capacity
	// and hope that's enough.)
	false
	}
	}
	}