blob: 3a316b5099532084d26cd76e46fd6824421e57a7 [file] [log] [blame]
use std::u16;
use packed::pattern::Patterns;
use packed::rabinkarp::RabinKarp;
use packed::teddy::{self, Teddy};
use Match;
/// This is a limit placed on the total number of patterns we're willing to try
/// and match at once. As more sophisticated algorithms are added, this number
/// may be increased.
const PATTERN_LIMIT: usize = 128;
/// A knob for controlling the match semantics of a packed multiple string
/// searcher.
///
/// This differs from the
/// [`MatchKind`](../enum.MatchKind.html)
/// type in the top-level crate module in that it doesn't support
/// "standard" match semantics, and instead only supports leftmost-first or
/// leftmost-longest. Namely, "standard" semantics cannot be easily supported
/// by packed searchers.
///
/// For more information on the distinction between leftmost-first and
/// leftmost-longest, see the docs on the top-level `MatchKind` type.
///
/// Unlike the top-level `MatchKind` type, the default match semantics for this
/// type are leftmost-first.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MatchKind {
/// Use leftmost-first match semantics, which reports leftmost matches.
/// When there are multiple possible leftmost matches, the match
/// corresponding to the pattern that appeared earlier when constructing
/// the automaton is reported.
///
/// This is the default.
LeftmostFirst,
/// Use leftmost-longest match semantics, which reports leftmost matches.
/// When there are multiple possible leftmost matches, the longest match
/// is chosen.
LeftmostLongest,
/// Hints that destructuring should not be exhaustive.
///
/// This enum may grow additional variants, so this makes sure clients
/// don't count on exhaustive matching. (Otherwise, adding a new variant
/// could break existing code.)
#[doc(hidden)]
__Nonexhaustive,
}
impl Default for MatchKind {
fn default() -> MatchKind {
MatchKind::LeftmostFirst
}
}
/// The configuration for a packed multiple pattern searcher.
///
/// The configuration is currently limited only to being able to select the
/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
/// future, more knobs may be made available.
///
/// A configuration produces a [`packed::Builder`](struct.Builder.html), which
/// in turn can be used to construct a
/// [`packed::Searcher`](struct.Searcher.html) for searching.
///
/// # Example
///
/// This example shows how to use leftmost-longest semantics instead of the
/// default (leftmost-first).
///
/// ```
/// use aho_corasick::packed::{Config, MatchKind};
///
/// # fn example() -> Option<()> {
/// let searcher = Config::new()
/// .match_kind(MatchKind::LeftmostLongest)
/// .builder()
/// .add("foo")
/// .add("foobar")
/// .build()?;
/// let matches: Vec<usize> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![1], matches);
/// # Some(()) }
/// # if cfg!(target_arch = "x86_64") {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Config {
kind: MatchKind,
force: Option<ForceAlgorithm>,
force_teddy_fat: Option<bool>,
force_avx: Option<bool>,
}
/// An internal option for forcing the use of a particular packed algorithm.
///
/// When an algorithm is forced, if a searcher could not be constructed for it,
/// then no searcher will be returned even if an alternative algorithm would
/// work.
#[derive(Clone, Debug)]
enum ForceAlgorithm {
Teddy,
RabinKarp,
}
impl Default for Config {
fn default() -> Config {
Config::new()
}
}
impl Config {
/// Create a new default configuration. A default configuration uses
/// leftmost-first match semantics.
pub fn new() -> Config {
Config {
kind: MatchKind::LeftmostFirst,
force: None,
force_teddy_fat: None,
force_avx: None,
}
}
/// Create a packed builder from this configuration. The builder can be
/// used to accumulate patterns and create a
/// [`Searcher`](struct.Searcher.html)
/// from them.
pub fn builder(&self) -> Builder {
Builder::from_config(self.clone())
}
/// Set the match semantics for this configuration.
pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
self.kind = kind;
self
}
/// An undocumented method for forcing the use of the Teddy algorithm.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn force_teddy(&mut self, yes: bool) -> &mut Config {
if yes {
self.force = Some(ForceAlgorithm::Teddy);
} else {
self.force = None;
}
self
}
/// An undocumented method for forcing the use of the Fat Teddy algorithm.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
self.force_teddy_fat = yes;
self
}
/// An undocumented method for forcing the use of SSE (`Some(false)`) or
/// AVX (`Some(true)`) algorithms.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn force_avx(&mut self, yes: Option<bool>) -> &mut Config {
self.force_avx = yes;
self
}
/// An undocumented method for forcing the use of the Rabin-Karp algorithm.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config {
if yes {
self.force = Some(ForceAlgorithm::RabinKarp);
} else {
self.force = None;
}
self
}
}
/// A builder for constructing a packed searcher from a collection of patterns.
///
/// # Example
///
/// This example shows how to use a builder to construct a searcher. By
/// default, leftmost-first match semantics are used.
///
/// ```
/// use aho_corasick::packed::{Builder, MatchKind};
///
/// # fn example() -> Option<()> {
/// let searcher = Builder::new()
/// .add("foobar")
/// .add("foo")
/// .build()?;
/// let matches: Vec<usize> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![0], matches);
/// # Some(()) }
/// # if cfg!(target_arch = "x86_64") {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Builder {
/// The configuration of this builder and subsequent matcher.
config: Config,
/// Set to true if the builder detects that a matcher cannot be built.
inert: bool,
/// The patterns provided by the caller.
patterns: Patterns,
}
impl Builder {
/// Create a new builder for constructing a multi-pattern searcher. This
/// constructor uses the default configuration.
pub fn new() -> Builder {
Builder::from_config(Config::new())
}
fn from_config(config: Config) -> Builder {
Builder { config, inert: false, patterns: Patterns::new() }
}
/// Build a searcher from the patterns added to this builder so far.
pub fn build(&self) -> Option<Searcher> {
if self.inert || self.patterns.is_empty() {
return None;
}
let mut patterns = self.patterns.clone();
patterns.set_match_kind(self.config.kind);
let rabinkarp = RabinKarp::new(&patterns);
// Effectively, we only want to return a searcher if we can use Teddy,
// since Teddy is our only fast packed searcher at the moment.
// Rabin-Karp is only used when searching haystacks smaller than what
// Teddy can support. Thus, the only way to get a Rabin-Karp searcher
// is to force it using undocumented APIs (for tests/benchmarks).
let (search_kind, minimum_len) = match self.config.force {
None | Some(ForceAlgorithm::Teddy) => {
let teddy = match self.build_teddy(&patterns) {
None => return None,
Some(teddy) => teddy,
};
let minimum_len = teddy.minimum_len();
(SearchKind::Teddy(teddy), minimum_len)
}
Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0),
};
Some(Searcher {
config: self.config.clone(),
patterns,
rabinkarp,
search_kind,
minimum_len,
})
}
fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> {
teddy::Builder::new()
.avx(self.config.force_avx)
.fat(self.config.force_teddy_fat)
.build(&patterns)
}
/// Add the given pattern to this set to match.
///
/// The order in which patterns are added is significant. Namely, when
/// using leftmost-first match semantics, then when multiple patterns can
/// match at a particular location, the pattern that was added first is
/// used as the match.
///
/// If the number of patterns added exceeds the amount supported by packed
/// searchers, then the builder will stop accumulating patterns and render
/// itself inert. At this point, constructing a searcher will always return
/// `None`.
pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
if self.inert {
return self;
} else if self.patterns.len() >= PATTERN_LIMIT {
self.inert = true;
self.patterns.reset();
return self;
}
// Just in case PATTERN_LIMIT increases beyond u16::MAX.
assert!(self.patterns.len() <= u16::MAX as usize);
let pattern = pattern.as_ref();
if pattern.is_empty() {
self.inert = true;
self.patterns.reset();
return self;
}
self.patterns.add(pattern);
self
}
/// Add the given iterator of patterns to this set to match.
///
/// The iterator must yield elements that can be converted into a `&[u8]`.
///
/// The order in which patterns are added is significant. Namely, when
/// using leftmost-first match semantics, then when multiple patterns can
/// match at a particular location, the pattern that was added first is
/// used as the match.
///
/// If the number of patterns added exceeds the amount supported by packed
/// searchers, then the builder will stop accumulating patterns and render
/// itself inert. At this point, constructing a searcher will always return
/// `None`.
pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
for p in patterns {
self.add(p);
}
self
}
}
impl Default for Builder {
fn default() -> Builder {
Builder::new()
}
}
/// A packed searcher for quickly finding occurrences of multiple patterns.
///
/// If callers need more flexible construction, or if one wants to change the
/// match semantics (either leftmost-first or leftmost-longest), then one can
/// use the [`Config`](struct.Config.html) and/or
/// [`Builder`](struct.Builder.html) types for more fine grained control.
///
/// # Example
///
/// This example shows how to create a searcher from an iterator of patterns.
/// By default, leftmost-first match semantics are used.
///
/// ```
/// use aho_corasick::packed::{MatchKind, Searcher};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let matches: Vec<usize> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![0], matches);
/// # Some(()) }
/// # if cfg!(target_arch = "x86_64") {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Searcher {
config: Config,
patterns: Patterns,
rabinkarp: RabinKarp,
search_kind: SearchKind,
minimum_len: usize,
}
#[derive(Clone, Debug)]
enum SearchKind {
Teddy(Teddy),
RabinKarp,
}
impl Searcher {
/// A convenience function for constructing a searcher from an iterator
/// of things that can be converted to a `&[u8]`.
///
/// If a searcher could not be constructed (either because of an
/// unsupported CPU or because there are too many patterns), then `None`
/// is returned.
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::packed::{MatchKind, Searcher};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let matches: Vec<usize> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![0], matches);
/// # Some(()) }
/// # if cfg!(target_arch = "x86_64") {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
pub fn new<I, P>(patterns: I) -> Option<Searcher>
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
Builder::new().extend(patterns).build()
}
/// Return the first occurrence of any of the patterns in this searcher,
/// according to its match semantics, in the given haystack. The `Match`
/// returned will include the identifier of the pattern that matched, which
/// corresponds to the index of the pattern (starting from `0`) in which it
/// was added.
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::packed::{MatchKind, Searcher};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let mat = searcher.find("foobar")?;
/// assert_eq!(0, mat.pattern());
/// assert_eq!(0, mat.start());
/// assert_eq!(6, mat.end());
/// # Some(()) }
/// # if cfg!(target_arch = "x86_64") {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
self.find_at(haystack, 0)
}
/// Return the first occurrence of any of the patterns in this searcher,
/// according to its match semantics, in the given haystack starting from
/// the given position.
///
/// The `Match` returned will include the identifier of the pattern that
/// matched, which corresponds to the index of the pattern (starting from
/// `0`) in which it was added. The offsets in the `Match` will be relative
/// to the start of `haystack` (and not `at`).
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::packed::{MatchKind, Searcher};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let mat = searcher.find_at("foofoobar", 3)?;
/// assert_eq!(0, mat.pattern());
/// assert_eq!(3, mat.start());
/// assert_eq!(9, mat.end());
/// # Some(()) }
/// # if cfg!(target_arch = "x86_64") {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
pub fn find_at<B: AsRef<[u8]>>(
&self,
haystack: B,
at: usize,
) -> Option<Match> {
let haystack = haystack.as_ref();
match self.search_kind {
SearchKind::Teddy(ref teddy) => {
if haystack[at..].len() < teddy.minimum_len() {
return self.slow_at(haystack, at);
}
teddy.find_at(&self.patterns, haystack, at)
}
SearchKind::RabinKarp => {
self.rabinkarp.find_at(&self.patterns, haystack, at)
}
}
}
/// Return an iterator of non-overlapping occurrences of the patterns in
/// this searcher, according to its match semantics, in the given haystack.
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::packed::{MatchKind, Searcher};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let matches: Vec<usize> = searcher
/// .find_iter("foobar fooba foofoo")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![0, 1, 1, 1], matches);
/// # Some(()) }
/// # if cfg!(target_arch = "x86_64") {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
&'a self,
haystack: &'b B,
) -> FindIter<'a, 'b> {
FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 }
}
/// Returns the match kind used by this packed searcher.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use aho_corasick::packed::{MatchKind, Searcher};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// // leftmost-first is the default.
/// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
/// # Some(()) }
/// # if cfg!(target_arch = "x86_64") {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
pub fn match_kind(&self) -> &MatchKind {
self.patterns.match_kind()
}
/// Returns the minimum length of a haystack that is required in order for
/// packed searching to be effective.
///
/// In some cases, the underlying packed searcher may not be able to search
/// very short haystacks. When that occurs, the implementation will defer
/// to a slower non-packed searcher (which is still generally faster than
/// Aho-Corasick for a small number of patterns). However, callers may
/// want to avoid ever using the slower variant, which one can do by
/// never passing a haystack shorter than the minimum length returned by
/// this method.
pub fn minimum_len(&self) -> usize {
self.minimum_len
}
/// Returns the approximate total amount of heap used by this searcher, in
/// units of bytes.
pub fn heap_bytes(&self) -> usize {
self.patterns.heap_bytes()
+ self.rabinkarp.heap_bytes()
+ self.search_kind.heap_bytes()
}
/// Use a slow (non-packed) searcher.
///
/// This is useful when a packed searcher could be constructed, but could
/// not be used to search a specific haystack. For example, if Teddy was
/// built but the haystack is smaller than ~34 bytes, then Teddy might not
/// be able to run.
fn slow_at(&self, haystack: &[u8], at: usize) -> Option<Match> {
self.rabinkarp.find_at(&self.patterns, haystack, at)
}
}
impl SearchKind {
fn heap_bytes(&self) -> usize {
match *self {
SearchKind::Teddy(ref ted) => ted.heap_bytes(),
SearchKind::RabinKarp => 0,
}
}
}
/// An iterator over non-overlapping matches from a packed searcher.
///
/// The lifetime `'s` refers to the lifetime of the underlying
/// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the
/// lifetime of the haystack being searched.
#[derive(Debug)]
pub struct FindIter<'s, 'h> {
searcher: &'s Searcher,
haystack: &'h [u8],
at: usize,
}
impl<'s, 'h> Iterator for FindIter<'s, 'h> {
type Item = Match;
fn next(&mut self) -> Option<Match> {
if self.at > self.haystack.len() {
return None;
}
match self.searcher.find_at(&self.haystack, self.at) {
None => None,
Some(c) => {
self.at = c.end;
Some(c)
}
}
}
}