| /*! |
| This module provides forward and reverse substring search routines. |
| |
| Unlike the standard library's substring search routines, these work on |
| arbitrary bytes. For all non-empty needles, these routines will report exactly |
| the same values as the corresponding routines in the standard library. For |
| the empty needle, the standard library reports matches only at valid UTF-8 |
| boundaries, where as these routines will report matches at every position. |
| |
| Other than being able to work on arbitrary bytes, the primary reason to prefer |
| these routines over the standard library routines is that these will generally |
| be faster. In some cases, significantly so. |
| |
| # Example: iterating over substring matches |
| |
| This example shows how to use [`find_iter`] to find occurrences of a substring |
| in a haystack. |
| |
| ``` |
| use memchr::memmem; |
| |
| let haystack = b"foo bar foo baz foo"; |
| |
| let mut it = memmem::find_iter(haystack, "foo"); |
| assert_eq!(Some(0), it.next()); |
| assert_eq!(Some(8), it.next()); |
| assert_eq!(Some(16), it.next()); |
| assert_eq!(None, it.next()); |
| ``` |
| |
| # Example: iterating over substring matches in reverse |
| |
| This example shows how to use [`rfind_iter`] to find occurrences of a substring |
| in a haystack starting from the end of the haystack. |
| |
| **NOTE:** This module does not implement double ended iterators, so reverse |
| searches aren't done by calling `rev` on a forward iterator. |
| |
| ``` |
| use memchr::memmem; |
| |
| let haystack = b"foo bar foo baz foo"; |
| |
| let mut it = memmem::rfind_iter(haystack, "foo"); |
| assert_eq!(Some(16), it.next()); |
| assert_eq!(Some(8), it.next()); |
| assert_eq!(Some(0), it.next()); |
| assert_eq!(None, it.next()); |
| ``` |
| |
| # Example: repeating a search for the same needle |
| |
| It may be possible for the overhead of constructing a substring searcher to be |
| measurable in some workloads. In cases where the same needle is used to search |
| many haystacks, it is possible to do construction once and thus to avoid it for |
| subsequent searches. This can be done with a [`Finder`] (or a [`FinderRev`] for |
| reverse searches). |
| |
| ``` |
| use memchr::memmem; |
| |
| let finder = memmem::Finder::new("foo"); |
| |
| assert_eq!(Some(4), finder.find(b"baz foo quux")); |
| assert_eq!(None, finder.find(b"quux baz bar")); |
| ``` |
| */ |
| |
| pub use self::prefilter::Prefilter; |
| |
| use crate::{ |
| cow::CowBytes, |
| memmem::{ |
| prefilter::{Pre, PrefilterFn, PrefilterState}, |
| rabinkarp::NeedleHash, |
| rarebytes::RareNeedleBytes, |
| }, |
| }; |
| |
| /// Defines a suite of quickcheck properties for forward and reverse |
| /// substring searching. |
| /// |
| /// This is defined in this specific spot so that it can be used freely among |
| /// the different substring search implementations. I couldn't be bothered to |
| /// fight with the macro-visibility rules enough to figure out how to stuff it |
| /// somewhere more convenient. |
| #[cfg(all(test, feature = "std"))] |
| macro_rules! define_memmem_quickcheck_tests { |
| ($fwd:expr, $rev:expr) => { |
| use crate::memmem::proptests; |
| |
| quickcheck::quickcheck! { |
| fn qc_fwd_prefix_is_substring(bs: Vec<u8>) -> bool { |
| proptests::prefix_is_substring(false, &bs, $fwd) |
| } |
| |
| fn qc_fwd_suffix_is_substring(bs: Vec<u8>) -> bool { |
| proptests::suffix_is_substring(false, &bs, $fwd) |
| } |
| |
| fn qc_fwd_matches_naive( |
| haystack: Vec<u8>, |
| needle: Vec<u8> |
| ) -> bool { |
| proptests::matches_naive(false, &haystack, &needle, $fwd) |
| } |
| |
| fn qc_rev_prefix_is_substring(bs: Vec<u8>) -> bool { |
| proptests::prefix_is_substring(true, &bs, $rev) |
| } |
| |
| fn qc_rev_suffix_is_substring(bs: Vec<u8>) -> bool { |
| proptests::suffix_is_substring(true, &bs, $rev) |
| } |
| |
| fn qc_rev_matches_naive( |
| haystack: Vec<u8>, |
| needle: Vec<u8> |
| ) -> bool { |
| proptests::matches_naive(true, &haystack, &needle, $rev) |
| } |
| } |
| }; |
| } |
| |
| /// Defines a suite of "simple" hand-written tests for a substring |
| /// implementation. |
| /// |
| /// This is defined here for the same reason that |
| /// define_memmem_quickcheck_tests is defined here. |
| #[cfg(test)] |
| macro_rules! define_memmem_simple_tests { |
| ($fwd:expr, $rev:expr) => { |
| use crate::memmem::testsimples; |
| |
| #[test] |
| fn simple_forward() { |
| testsimples::run_search_tests_fwd($fwd); |
| } |
| |
| #[test] |
| fn simple_reverse() { |
| testsimples::run_search_tests_rev($rev); |
| } |
| }; |
| } |
| |
| mod byte_frequencies; |
| #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))] |
| mod genericsimd; |
| mod prefilter; |
| mod rabinkarp; |
| mod rarebytes; |
| mod twoway; |
| mod util; |
| // SIMD is only supported on x86_64 currently. |
| #[cfg(target_arch = "x86_64")] |
| mod vector; |
| #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] |
| mod x86; |
| |
| /// Returns an iterator over all occurrences of a substring in a haystack. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use memchr::memmem; |
| /// |
| /// let haystack = b"foo bar foo baz foo"; |
| /// let mut it = memmem::find_iter(haystack, b"foo"); |
| /// assert_eq!(Some(0), it.next()); |
| /// assert_eq!(Some(8), it.next()); |
| /// assert_eq!(Some(16), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// ``` |
| #[inline] |
| pub fn find_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>( |
| haystack: &'h [u8], |
| needle: &'n N, |
| ) -> FindIter<'h, 'n> { |
| FindIter::new(haystack, Finder::new(needle)) |
| } |
| |
| /// Returns a reverse iterator over all occurrences of a substring in a |
| /// haystack. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use memchr::memmem; |
| /// |
| /// let haystack = b"foo bar foo baz foo"; |
| /// let mut it = memmem::rfind_iter(haystack, b"foo"); |
| /// assert_eq!(Some(16), it.next()); |
| /// assert_eq!(Some(8), it.next()); |
| /// assert_eq!(Some(0), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// ``` |
| #[inline] |
| pub fn rfind_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>( |
| haystack: &'h [u8], |
| needle: &'n N, |
| ) -> FindRevIter<'h, 'n> { |
| FindRevIter::new(haystack, FinderRev::new(needle)) |
| } |
| |
| /// Returns the index of the first occurrence of the given needle. |
| /// |
| /// Note that if you're are searching for the same needle in many different |
| /// small haystacks, it may be faster to initialize a [`Finder`] once, |
| /// and reuse it for each search. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use memchr::memmem; |
| /// |
| /// let haystack = b"foo bar baz"; |
| /// assert_eq!(Some(0), memmem::find(haystack, b"foo")); |
| /// assert_eq!(Some(4), memmem::find(haystack, b"bar")); |
| /// assert_eq!(None, memmem::find(haystack, b"quux")); |
| /// ``` |
| #[inline] |
| pub fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> { |
| if haystack.len() < 64 { |
| rabinkarp::find(haystack, needle) |
| } else { |
| Finder::new(needle).find(haystack) |
| } |
| } |
| |
| /// Returns the index of the last occurrence of the given needle. |
| /// |
| /// Note that if you're are searching for the same needle in many different |
| /// small haystacks, it may be faster to initialize a [`FinderRev`] once, |
| /// and reuse it for each search. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use memchr::memmem; |
| /// |
| /// let haystack = b"foo bar baz"; |
| /// assert_eq!(Some(0), memmem::rfind(haystack, b"foo")); |
| /// assert_eq!(Some(4), memmem::rfind(haystack, b"bar")); |
| /// assert_eq!(Some(8), memmem::rfind(haystack, b"ba")); |
| /// assert_eq!(None, memmem::rfind(haystack, b"quux")); |
| /// ``` |
| #[inline] |
| pub fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> { |
| if haystack.len() < 64 { |
| rabinkarp::rfind(haystack, needle) |
| } else { |
| FinderRev::new(needle).rfind(haystack) |
| } |
| } |
| |
| /// An iterator over non-overlapping substring matches. |
| /// |
| /// Matches are reported by the byte offset at which they begin. |
| /// |
| /// `'h` is the lifetime of the haystack while `'n` is the lifetime of the |
| /// needle. |
| #[derive(Debug)] |
| pub struct FindIter<'h, 'n> { |
| haystack: &'h [u8], |
| prestate: PrefilterState, |
| finder: Finder<'n>, |
| pos: usize, |
| } |
| |
| impl<'h, 'n> FindIter<'h, 'n> { |
| #[inline(always)] |
| pub(crate) fn new( |
| haystack: &'h [u8], |
| finder: Finder<'n>, |
| ) -> FindIter<'h, 'n> { |
| let prestate = finder.searcher.prefilter_state(); |
| FindIter { haystack, prestate, finder, pos: 0 } |
| } |
| } |
| |
| impl<'h, 'n> Iterator for FindIter<'h, 'n> { |
| type Item = usize; |
| |
| fn next(&mut self) -> Option<usize> { |
| if self.pos > self.haystack.len() { |
| return None; |
| } |
| let result = self |
| .finder |
| .searcher |
| .find(&mut self.prestate, &self.haystack[self.pos..]); |
| match result { |
| None => None, |
| Some(i) => { |
| let pos = self.pos + i; |
| self.pos = pos + core::cmp::max(1, self.finder.needle().len()); |
| Some(pos) |
| } |
| } |
| } |
| } |
| |
| /// An iterator over non-overlapping substring matches in reverse. |
| /// |
| /// Matches are reported by the byte offset at which they begin. |
| /// |
| /// `'h` is the lifetime of the haystack while `'n` is the lifetime of the |
| /// needle. |
| #[derive(Debug)] |
| pub struct FindRevIter<'h, 'n> { |
| haystack: &'h [u8], |
| finder: FinderRev<'n>, |
| /// When searching with an empty needle, this gets set to `None` after |
| /// we've yielded the last element at `0`. |
| pos: Option<usize>, |
| } |
| |
| impl<'h, 'n> FindRevIter<'h, 'n> { |
| #[inline(always)] |
| pub(crate) fn new( |
| haystack: &'h [u8], |
| finder: FinderRev<'n>, |
| ) -> FindRevIter<'h, 'n> { |
| let pos = Some(haystack.len()); |
| FindRevIter { haystack, finder, pos } |
| } |
| } |
| |
| impl<'h, 'n> Iterator for FindRevIter<'h, 'n> { |
| type Item = usize; |
| |
| fn next(&mut self) -> Option<usize> { |
| let pos = match self.pos { |
| None => return None, |
| Some(pos) => pos, |
| }; |
| let result = self.finder.rfind(&self.haystack[..pos]); |
| match result { |
| None => None, |
| Some(i) => { |
| if pos == i { |
| self.pos = pos.checked_sub(1); |
| } else { |
| self.pos = Some(i); |
| } |
| Some(i) |
| } |
| } |
| } |
| } |
| |
| /// A single substring searcher fixed to a particular needle. |
| /// |
| /// The purpose of this type is to permit callers to construct a substring |
| /// searcher that can be used to search haystacks without the overhead of |
| /// constructing the searcher in the first place. This is a somewhat niche |
| /// concern when it's necessary to re-use the same needle to search multiple |
| /// different haystacks with as little overhead as possible. In general, using |
| /// [`find`] is good enough, but `Finder` is useful when you can meaningfully |
| /// observe searcher construction time in a profile. |
| /// |
| /// When the `std` feature is enabled, then this type has an `into_owned` |
| /// version which permits building a `Finder` that is not connected to |
| /// the lifetime of its needle. |
| #[derive(Clone, Debug)] |
| pub struct Finder<'n> { |
| searcher: Searcher<'n>, |
| } |
| |
| impl<'n> Finder<'n> { |
| /// Create a new finder for the given needle. |
| #[inline] |
| pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> Finder<'n> { |
| FinderBuilder::new().build_forward(needle) |
| } |
| |
| /// Returns the index of the first occurrence of this needle in the given |
| /// haystack. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use memchr::memmem::Finder; |
| /// |
| /// let haystack = b"foo bar baz"; |
| /// assert_eq!(Some(0), Finder::new("foo").find(haystack)); |
| /// assert_eq!(Some(4), Finder::new("bar").find(haystack)); |
| /// assert_eq!(None, Finder::new("quux").find(haystack)); |
| /// ``` |
| pub fn find(&self, haystack: &[u8]) -> Option<usize> { |
| self.searcher.find(&mut self.searcher.prefilter_state(), haystack) |
| } |
| |
| /// Returns an iterator over all occurrences of a substring in a haystack. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use memchr::memmem::Finder; |
| /// |
| /// let haystack = b"foo bar foo baz foo"; |
| /// let finder = Finder::new(b"foo"); |
| /// let mut it = finder.find_iter(haystack); |
| /// assert_eq!(Some(0), it.next()); |
| /// assert_eq!(Some(8), it.next()); |
| /// assert_eq!(Some(16), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// ``` |
| #[inline] |
| pub fn find_iter<'a, 'h>( |
| &'a self, |
| haystack: &'h [u8], |
| ) -> FindIter<'h, 'a> { |
| FindIter::new(haystack, self.as_ref()) |
| } |
| |
| /// Convert this finder into its owned variant, such that it no longer |
| /// borrows the needle. |
| /// |
| /// If this is already an owned finder, then this is a no-op. Otherwise, |
| /// this copies the needle. |
| /// |
| /// This is only available when the `std` feature is enabled. |
| #[cfg(feature = "std")] |
| #[inline] |
| pub fn into_owned(self) -> Finder<'static> { |
| Finder { searcher: self.searcher.into_owned() } |
| } |
| |
| /// Convert this finder into its borrowed variant. |
| /// |
| /// This is primarily useful if your finder is owned and you'd like to |
| /// store its borrowed variant in some intermediate data structure. |
| /// |
| /// Note that the lifetime parameter of the returned finder is tied to the |
| /// lifetime of `self`, and may be shorter than the `'n` lifetime of the |
| /// needle itself. Namely, a finder's needle can be either borrowed or |
| /// owned, so the lifetime of the needle returned must necessarily be the |
| /// shorter of the two. |
| #[inline] |
| pub fn as_ref(&self) -> Finder<'_> { |
| Finder { searcher: self.searcher.as_ref() } |
| } |
| |
| /// Returns the needle that this finder searches for. |
| /// |
| /// Note that the lifetime of the needle returned is tied to the lifetime |
| /// of the finder, and may be shorter than the `'n` lifetime. Namely, a |
| /// finder's needle can be either borrowed or owned, so the lifetime of the |
| /// needle returned must necessarily be the shorter of the two. |
| #[inline] |
| pub fn needle(&self) -> &[u8] { |
| self.searcher.needle() |
| } |
| } |
| |
| /// A single substring reverse searcher fixed to a particular needle. |
| /// |
| /// The purpose of this type is to permit callers to construct a substring |
| /// searcher that can be used to search haystacks without the overhead of |
| /// constructing the searcher in the first place. This is a somewhat niche |
| /// concern when it's necessary to re-use the same needle to search multiple |
| /// different haystacks with as little overhead as possible. In general, |
| /// using [`rfind`] is good enough, but `FinderRev` is useful when you can |
| /// meaningfully observe searcher construction time in a profile. |
| /// |
| /// When the `std` feature is enabled, then this type has an `into_owned` |
| /// version which permits building a `FinderRev` that is not connected to |
| /// the lifetime of its needle. |
| #[derive(Clone, Debug)] |
| pub struct FinderRev<'n> { |
| searcher: SearcherRev<'n>, |
| } |
| |
| impl<'n> FinderRev<'n> { |
| /// Create a new reverse finder for the given needle. |
| #[inline] |
| pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> FinderRev<'n> { |
| FinderBuilder::new().build_reverse(needle) |
| } |
| |
| /// Returns the index of the last occurrence of this needle in the given |
| /// haystack. |
| /// |
| /// The haystack may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use memchr::memmem::FinderRev; |
| /// |
| /// let haystack = b"foo bar baz"; |
| /// assert_eq!(Some(0), FinderRev::new("foo").rfind(haystack)); |
| /// assert_eq!(Some(4), FinderRev::new("bar").rfind(haystack)); |
| /// assert_eq!(None, FinderRev::new("quux").rfind(haystack)); |
| /// ``` |
| pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> { |
| self.searcher.rfind(haystack.as_ref()) |
| } |
| |
| /// Returns a reverse iterator over all occurrences of a substring in a |
| /// haystack. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use memchr::memmem::FinderRev; |
| /// |
| /// let haystack = b"foo bar foo baz foo"; |
| /// let finder = FinderRev::new(b"foo"); |
| /// let mut it = finder.rfind_iter(haystack); |
| /// assert_eq!(Some(16), it.next()); |
| /// assert_eq!(Some(8), it.next()); |
| /// assert_eq!(Some(0), it.next()); |
| /// assert_eq!(None, it.next()); |
| /// ``` |
| #[inline] |
| pub fn rfind_iter<'a, 'h>( |
| &'a self, |
| haystack: &'h [u8], |
| ) -> FindRevIter<'h, 'a> { |
| FindRevIter::new(haystack, self.as_ref()) |
| } |
| |
| /// Convert this finder into its owned variant, such that it no longer |
| /// borrows the needle. |
| /// |
| /// If this is already an owned finder, then this is a no-op. Otherwise, |
| /// this copies the needle. |
| /// |
| /// This is only available when the `std` feature is enabled. |
| #[cfg(feature = "std")] |
| #[inline] |
| pub fn into_owned(self) -> FinderRev<'static> { |
| FinderRev { searcher: self.searcher.into_owned() } |
| } |
| |
| /// Convert this finder into its borrowed variant. |
| /// |
| /// This is primarily useful if your finder is owned and you'd like to |
| /// store its borrowed variant in some intermediate data structure. |
| /// |
| /// Note that the lifetime parameter of the returned finder is tied to the |
| /// lifetime of `self`, and may be shorter than the `'n` lifetime of the |
| /// needle itself. Namely, a finder's needle can be either borrowed or |
| /// owned, so the lifetime of the needle returned must necessarily be the |
| /// shorter of the two. |
| #[inline] |
| pub fn as_ref(&self) -> FinderRev<'_> { |
| FinderRev { searcher: self.searcher.as_ref() } |
| } |
| |
| /// Returns the needle that this finder searches for. |
| /// |
| /// Note that the lifetime of the needle returned is tied to the lifetime |
| /// of the finder, and may be shorter than the `'n` lifetime. Namely, a |
| /// finder's needle can be either borrowed or owned, so the lifetime of the |
| /// needle returned must necessarily be the shorter of the two. |
| #[inline] |
| pub fn needle(&self) -> &[u8] { |
| self.searcher.needle() |
| } |
| } |
| |
| /// A builder for constructing non-default forward or reverse memmem finders. |
| /// |
| /// A builder is primarily useful for configuring a substring searcher. |
| /// Currently, the only configuration exposed is the ability to disable |
| /// heuristic prefilters used to speed up certain searches. |
| #[derive(Clone, Debug, Default)] |
| pub struct FinderBuilder { |
| config: SearcherConfig, |
| } |
| |
| impl FinderBuilder { |
| /// Create a new finder builder with default settings. |
| pub fn new() -> FinderBuilder { |
| FinderBuilder::default() |
| } |
| |
| /// Build a forward finder using the given needle from the current |
| /// settings. |
| pub fn build_forward<'n, B: ?Sized + AsRef<[u8]>>( |
| &self, |
| needle: &'n B, |
| ) -> Finder<'n> { |
| Finder { searcher: Searcher::new(self.config, needle.as_ref()) } |
| } |
| |
| /// Build a reverse finder using the given needle from the current |
| /// settings. |
| pub fn build_reverse<'n, B: ?Sized + AsRef<[u8]>>( |
| &self, |
| needle: &'n B, |
| ) -> FinderRev<'n> { |
| FinderRev { searcher: SearcherRev::new(needle.as_ref()) } |
| } |
| |
| /// Configure the prefilter setting for the finder. |
| /// |
| /// See the documentation for [`Prefilter`] for more discussion on why |
| /// you might want to configure this. |
| pub fn prefilter(&mut self, prefilter: Prefilter) -> &mut FinderBuilder { |
| self.config.prefilter = prefilter; |
| self |
| } |
| } |
| |
| /// The internal implementation of a forward substring searcher. |
| /// |
| /// The reality is that this is a "meta" searcher. Namely, depending on a |
| /// variety of parameters (CPU support, target, needle size, haystack size and |
| /// even dynamic properties such as prefilter effectiveness), the actual |
| /// algorithm employed to do substring search may change. |
| #[derive(Clone, Debug)] |
| struct Searcher<'n> { |
| /// The actual needle we're searching for. |
| /// |
| /// A CowBytes is like a Cow<[u8]>, except in no_std environments, it is |
| /// specialized to a single variant (the borrowed form). |
| needle: CowBytes<'n>, |
| /// A collection of facts computed on the needle that are useful for more |
| /// than one substring search algorithm. |
| ninfo: NeedleInfo, |
| /// A prefilter function, if it was deemed appropriate. |
| /// |
| /// Some substring search implementations (like Two-Way) benefit greatly |
| /// if we can quickly find candidate starting positions for a match. |
| prefn: Option<PrefilterFn>, |
| /// The actual substring implementation in use. |
| kind: SearcherKind, |
| } |
| |
| /// A collection of facts computed about a search needle. |
| /// |
| /// We group these things together because it's useful to be able to hand them |
| /// to prefilters or substring algorithms that want them. |
| #[derive(Clone, Copy, Debug)] |
| pub(crate) struct NeedleInfo { |
| /// The offsets of "rare" bytes detected in the needle. |
| /// |
| /// This is meant to be a heuristic in order to maximize the effectiveness |
| /// of vectorized code. Namely, vectorized code tends to focus on only |
| /// one or two bytes. If we pick bytes from the needle that occur |
| /// infrequently, then more time will be spent in the vectorized code and |
| /// will likely make the overall search (much) faster. |
| /// |
| /// Of course, this is only a heuristic based on a background frequency |
| /// distribution of bytes. But it tends to work very well in practice. |
| pub(crate) rarebytes: RareNeedleBytes, |
| /// A Rabin-Karp hash of the needle. |
| /// |
| /// This is store here instead of in a more specific Rabin-Karp search |
| /// since Rabin-Karp may be used even if another SearchKind corresponds |
| /// to some other search implementation. e.g., If measurements suggest RK |
| /// is faster in some cases or if a search implementation can't handle |
| /// particularly small haystack. (Moreover, we cannot use RK *generally*, |
| /// since its worst case time is multiplicative. Instead, we only use it |
| /// some small haystacks, where "small" is a constant.) |
| pub(crate) nhash: NeedleHash, |
| } |
| |
| /// Configuration for substring search. |
| #[derive(Clone, Copy, Debug, Default)] |
| struct SearcherConfig { |
| /// This permits changing the behavior of the prefilter, since it can have |
| /// a variable impact on performance. |
| prefilter: Prefilter, |
| } |
| |
| #[derive(Clone, Debug)] |
| enum SearcherKind { |
| /// A special case for empty needles. An empty needle always matches, even |
| /// in an empty haystack. |
| Empty, |
| /// This is used whenever the needle is a single byte. In this case, we |
| /// always use memchr. |
| OneByte(u8), |
| /// Two-Way is the generic work horse and is what provides our additive |
| /// linear time guarantee. In general, it's used when the needle is bigger |
| /// than 8 bytes or so. |
| TwoWay(twoway::Forward), |
| #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] |
| GenericSIMD128(x86::sse::Forward), |
| #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] |
| GenericSIMD256(x86::avx::Forward), |
| } |
| |
| impl<'n> Searcher<'n> { |
| #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] |
| fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> { |
| use self::SearcherKind::*; |
| |
| let ninfo = NeedleInfo::new(needle); |
| let prefn = |
| prefilter::forward(&config.prefilter, &ninfo.rarebytes, needle); |
| let kind = if needle.len() == 0 { |
| Empty |
| } else if needle.len() == 1 { |
| OneByte(needle[0]) |
| } else if let Some(fwd) = x86::avx::Forward::new(&ninfo, needle) { |
| GenericSIMD256(fwd) |
| } else if let Some(fwd) = x86::sse::Forward::new(&ninfo, needle) { |
| GenericSIMD128(fwd) |
| } else { |
| TwoWay(twoway::Forward::new(needle)) |
| }; |
| Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind } |
| } |
| |
| #[cfg(not(all(not(miri), target_arch = "x86_64", memchr_runtime_simd)))] |
| fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> { |
| use self::SearcherKind::*; |
| |
| let ninfo = NeedleInfo::new(needle); |
| let prefn = |
| prefilter::forward(&config.prefilter, &ninfo.rarebytes, needle); |
| let kind = if needle.len() == 0 { |
| Empty |
| } else if needle.len() == 1 { |
| OneByte(needle[0]) |
| } else { |
| TwoWay(twoway::Forward::new(needle)) |
| }; |
| Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind } |
| } |
| |
| /// Return a fresh prefilter state that can be used with this searcher. |
| /// A prefilter state is used to track the effectiveness of a searcher's |
| /// prefilter for speeding up searches. Therefore, the prefilter state |
| /// should generally be reused on subsequent searches (such as in an |
| /// iterator). For searches on a different haystack, then a new prefilter |
| /// state should be used. |
| /// |
| /// This always initializes a valid (but possibly inert) prefilter state |
| /// even if this searcher does not have a prefilter enabled. |
| fn prefilter_state(&self) -> PrefilterState { |
| if self.prefn.is_none() { |
| PrefilterState::inert() |
| } else { |
| PrefilterState::new() |
| } |
| } |
| |
| fn needle(&self) -> &[u8] { |
| self.needle.as_slice() |
| } |
| |
| fn as_ref(&self) -> Searcher<'_> { |
| use self::SearcherKind::*; |
| |
| let kind = match self.kind { |
| Empty => Empty, |
| OneByte(b) => OneByte(b), |
| TwoWay(tw) => TwoWay(tw), |
| #[cfg(all( |
| not(miri), |
| target_arch = "x86_64", |
| memchr_runtime_simd |
| ))] |
| GenericSIMD128(gs) => GenericSIMD128(gs), |
| #[cfg(all( |
| not(miri), |
| target_arch = "x86_64", |
| memchr_runtime_simd |
| ))] |
| GenericSIMD256(gs) => GenericSIMD256(gs), |
| }; |
| Searcher { |
| needle: CowBytes::new(self.needle()), |
| ninfo: self.ninfo, |
| prefn: self.prefn, |
| kind, |
| } |
| } |
| |
| #[cfg(feature = "std")] |
| fn into_owned(self) -> Searcher<'static> { |
| use self::SearcherKind::*; |
| |
| let kind = match self.kind { |
| Empty => Empty, |
| OneByte(b) => OneByte(b), |
| TwoWay(tw) => TwoWay(tw), |
| #[cfg(all( |
| not(miri), |
| target_arch = "x86_64", |
| memchr_runtime_simd |
| ))] |
| GenericSIMD128(gs) => GenericSIMD128(gs), |
| #[cfg(all( |
| not(miri), |
| target_arch = "x86_64", |
| memchr_runtime_simd |
| ))] |
| GenericSIMD256(gs) => GenericSIMD256(gs), |
| }; |
| Searcher { |
| needle: self.needle.into_owned(), |
| ninfo: self.ninfo, |
| prefn: self.prefn, |
| kind, |
| } |
| } |
| |
| /// Implements forward substring search by selecting the implementation |
| /// chosen at construction and executing it on the given haystack with the |
| /// prefilter's current state of effectiveness. |
| #[inline(always)] |
| fn find( |
| &self, |
| state: &mut PrefilterState, |
| haystack: &[u8], |
| ) -> Option<usize> { |
| use self::SearcherKind::*; |
| |
| let needle = self.needle(); |
| if haystack.len() < needle.len() { |
| return None; |
| } |
| match self.kind { |
| Empty => Some(0), |
| OneByte(b) => crate::memchr(b, haystack), |
| TwoWay(ref tw) => { |
| // For very short haystacks (e.g., where the prefilter probably |
| // can't run), it's faster to just run RK. |
| if rabinkarp::is_fast(haystack, needle) { |
| rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) |
| } else { |
| self.find_tw(tw, state, haystack, needle) |
| } |
| } |
| #[cfg(all( |
| not(miri), |
| target_arch = "x86_64", |
| memchr_runtime_simd |
| ))] |
| GenericSIMD128(ref gs) => { |
| // The SIMD matcher can't handle particularly short haystacks, |
| // so we fall back to RK in these cases. |
| if haystack.len() < gs.min_haystack_len() { |
| rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) |
| } else { |
| gs.find(haystack, needle) |
| } |
| } |
| #[cfg(all( |
| not(miri), |
| target_arch = "x86_64", |
| memchr_runtime_simd |
| ))] |
| GenericSIMD256(ref gs) => { |
| // The SIMD matcher can't handle particularly short haystacks, |
| // so we fall back to RK in these cases. |
| if haystack.len() < gs.min_haystack_len() { |
| rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) |
| } else { |
| gs.find(haystack, needle) |
| } |
| } |
| } |
| } |
| |
| /// Calls Two-Way on the given haystack/needle. |
| /// |
| /// This is marked as unlineable since it seems to have a better overall |
| /// effect on benchmarks. However, this is one of those cases where |
| /// inlining it results an improvement in other benchmarks too, so I |
| /// suspect we just don't have enough data yet to make the right call here. |
| /// |
| /// I suspect the main problem is that this function contains two different |
| /// inlined copies of Two-Way: one with and one without prefilters enabled. |
| #[inline(never)] |
| fn find_tw( |
| &self, |
| tw: &twoway::Forward, |
| state: &mut PrefilterState, |
| haystack: &[u8], |
| needle: &[u8], |
| ) -> Option<usize> { |
| if let Some(prefn) = self.prefn { |
| // We used to look at the length of a haystack here. That is, if |
| // it was too small, then don't bother with the prefilter. But two |
| // things changed: the prefilter falls back to memchr for small |
| // haystacks, and, above, Rabin-Karp is employed for tiny haystacks |
| // anyway. |
| if state.is_effective() { |
| let mut pre = Pre { state, prefn, ninfo: &self.ninfo }; |
| return tw.find(Some(&mut pre), haystack, needle); |
| } |
| } |
| tw.find(None, haystack, needle) |
| } |
| } |
| |
| impl NeedleInfo { |
| pub(crate) fn new(needle: &[u8]) -> NeedleInfo { |
| NeedleInfo { |
| rarebytes: RareNeedleBytes::forward(needle), |
| nhash: NeedleHash::forward(needle), |
| } |
| } |
| } |
| |
| /// The internal implementation of a reverse substring searcher. |
| /// |
| /// See the forward searcher docs for more details. Currently, the reverse |
| /// searcher is considerably simpler since it lacks prefilter support. This |
| /// was done because it adds a lot of code, and more surface area to test. And |
| /// in particular, it's not clear whether a prefilter on reverse searching is |
| /// worth it. (If you have a compelling use case, please file an issue!) |
| #[derive(Clone, Debug)] |
| struct SearcherRev<'n> { |
| /// The actual needle we're searching for. |
| needle: CowBytes<'n>, |
| /// A Rabin-Karp hash of the needle. |
| nhash: NeedleHash, |
| /// The actual substring implementation in use. |
| kind: SearcherRevKind, |
| } |
| |
| #[derive(Clone, Debug)] |
| enum SearcherRevKind { |
| /// A special case for empty needles. An empty needle always matches, even |
| /// in an empty haystack. |
| Empty, |
| /// This is used whenever the needle is a single byte. In this case, we |
| /// always use memchr. |
| OneByte(u8), |
| /// Two-Way is the generic work horse and is what provides our additive |
| /// linear time guarantee. In general, it's used when the needle is bigger |
| /// than 8 bytes or so. |
| TwoWay(twoway::Reverse), |
| } |
| |
| impl<'n> SearcherRev<'n> { |
| fn new(needle: &'n [u8]) -> SearcherRev<'n> { |
| use self::SearcherRevKind::*; |
| |
| let kind = if needle.len() == 0 { |
| Empty |
| } else if needle.len() == 1 { |
| OneByte(needle[0]) |
| } else { |
| TwoWay(twoway::Reverse::new(needle)) |
| }; |
| SearcherRev { |
| needle: CowBytes::new(needle), |
| nhash: NeedleHash::reverse(needle), |
| kind, |
| } |
| } |
| |
| fn needle(&self) -> &[u8] { |
| self.needle.as_slice() |
| } |
| |
| fn as_ref(&self) -> SearcherRev<'_> { |
| use self::SearcherRevKind::*; |
| |
| let kind = match self.kind { |
| Empty => Empty, |
| OneByte(b) => OneByte(b), |
| TwoWay(tw) => TwoWay(tw), |
| }; |
| SearcherRev { |
| needle: CowBytes::new(self.needle()), |
| nhash: self.nhash, |
| kind, |
| } |
| } |
| |
| #[cfg(feature = "std")] |
| fn into_owned(self) -> SearcherRev<'static> { |
| use self::SearcherRevKind::*; |
| |
| let kind = match self.kind { |
| Empty => Empty, |
| OneByte(b) => OneByte(b), |
| TwoWay(tw) => TwoWay(tw), |
| }; |
| SearcherRev { |
| needle: self.needle.into_owned(), |
| nhash: self.nhash, |
| kind, |
| } |
| } |
| |
| /// Implements reverse substring search by selecting the implementation |
| /// chosen at construction and executing it on the given haystack with the |
| /// prefilter's current state of effectiveness. |
| #[inline(always)] |
| fn rfind(&self, haystack: &[u8]) -> Option<usize> { |
| use self::SearcherRevKind::*; |
| |
| let needle = self.needle(); |
| if haystack.len() < needle.len() { |
| return None; |
| } |
| match self.kind { |
| Empty => Some(haystack.len()), |
| OneByte(b) => crate::memrchr(b, haystack), |
| TwoWay(ref tw) => { |
| // For very short haystacks (e.g., where the prefilter probably |
| // can't run), it's faster to just run RK. |
| if rabinkarp::is_fast(haystack, needle) { |
| rabinkarp::rfind_with(&self.nhash, haystack, needle) |
| } else { |
| tw.rfind(haystack, needle) |
| } |
| } |
| } |
| } |
| } |
| |
| /// This module defines some generic quickcheck properties useful for testing |
| /// any substring search algorithm. It also runs those properties for the |
| /// top-level public API memmem routines. (The properties are also used to |
| /// test various substring search implementations more granularly elsewhere as |
| /// well.) |
| #[cfg(all(test, feature = "std", not(miri)))] |
| mod proptests { |
| // N.B. This defines the quickcheck tests using the properties defined |
| // below. Because of macro-visibility weirdness, the actual macro is |
| // defined at the top of this file. |
| define_memmem_quickcheck_tests!(super::find, super::rfind); |
| |
| /// Check that every prefix of the given byte string is a substring. |
| pub(crate) fn prefix_is_substring( |
| reverse: bool, |
| bs: &[u8], |
| mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, |
| ) -> bool { |
| if bs.is_empty() { |
| return true; |
| } |
| for i in 0..(bs.len() - 1) { |
| let prefix = &bs[..i]; |
| if reverse { |
| assert_eq!(naive_rfind(bs, prefix), search(bs, prefix)); |
| } else { |
| assert_eq!(naive_find(bs, prefix), search(bs, prefix)); |
| } |
| } |
| true |
| } |
| |
| /// Check that every suffix of the given byte string is a substring. |
| pub(crate) fn suffix_is_substring( |
| reverse: bool, |
| bs: &[u8], |
| mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, |
| ) -> bool { |
| if bs.is_empty() { |
| return true; |
| } |
| for i in 0..(bs.len() - 1) { |
| let suffix = &bs[i..]; |
| if reverse { |
| assert_eq!(naive_rfind(bs, suffix), search(bs, suffix)); |
| } else { |
| assert_eq!(naive_find(bs, suffix), search(bs, suffix)); |
| } |
| } |
| true |
| } |
| |
| /// Check that naive substring search matches the result of the given search |
| /// algorithm. |
| pub(crate) fn matches_naive( |
| reverse: bool, |
| haystack: &[u8], |
| needle: &[u8], |
| mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, |
| ) -> bool { |
| if reverse { |
| naive_rfind(haystack, needle) == search(haystack, needle) |
| } else { |
| naive_find(haystack, needle) == search(haystack, needle) |
| } |
| } |
| |
| /// Naively search forwards for the given needle in the given haystack. |
| fn naive_find(haystack: &[u8], needle: &[u8]) -> Option<usize> { |
| if needle.is_empty() { |
| return Some(0); |
| } else if haystack.len() < needle.len() { |
| return None; |
| } |
| for i in 0..(haystack.len() - needle.len() + 1) { |
| if needle == &haystack[i..i + needle.len()] { |
| return Some(i); |
| } |
| } |
| None |
| } |
| |
| /// Naively search in reverse for the given needle in the given haystack. |
| fn naive_rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> { |
| if needle.is_empty() { |
| return Some(haystack.len()); |
| } else if haystack.len() < needle.len() { |
| return None; |
| } |
| for i in (0..(haystack.len() - needle.len() + 1)).rev() { |
| if needle == &haystack[i..i + needle.len()] { |
| return Some(i); |
| } |
| } |
| None |
| } |
| } |
| |
| /// This module defines some hand-written "simple" substring tests. It |
| /// also provides routines for easily running them on any substring search |
| /// implementation. |
| #[cfg(test)] |
| mod testsimples { |
| define_memmem_simple_tests!(super::find, super::rfind); |
| |
| /// Each test is a (needle, haystack, expected_fwd, expected_rev) tuple. |
| type SearchTest = |
| (&'static str, &'static str, Option<usize>, Option<usize>); |
| |
| const SEARCH_TESTS: &'static [SearchTest] = &[ |
| ("", "", Some(0), Some(0)), |
| ("", "a", Some(0), Some(1)), |
| ("", "ab", Some(0), Some(2)), |
| ("", "abc", Some(0), Some(3)), |
| ("a", "", None, None), |
| ("a", "a", Some(0), Some(0)), |
| ("a", "aa", Some(0), Some(1)), |
| ("a", "ba", Some(1), Some(1)), |
| ("a", "bba", Some(2), Some(2)), |
| ("a", "bbba", Some(3), Some(3)), |
| ("a", "bbbab", Some(3), Some(3)), |
| ("a", "bbbabb", Some(3), Some(3)), |
| ("a", "bbbabbb", Some(3), Some(3)), |
| ("a", "bbbbbb", None, None), |
| ("ab", "", None, None), |
| ("ab", "a", None, None), |
| ("ab", "b", None, None), |
| ("ab", "ab", Some(0), Some(0)), |
| ("ab", "aab", Some(1), Some(1)), |
| ("ab", "aaab", Some(2), Some(2)), |
| ("ab", "abaab", Some(0), Some(3)), |
| ("ab", "baaab", Some(3), Some(3)), |
| ("ab", "acb", None, None), |
| ("ab", "abba", Some(0), Some(0)), |
| ("abc", "ab", None, None), |
| ("abc", "abc", Some(0), Some(0)), |
| ("abc", "abcz", Some(0), Some(0)), |
| ("abc", "abczz", Some(0), Some(0)), |
| ("abc", "zabc", Some(1), Some(1)), |
| ("abc", "zzabc", Some(2), Some(2)), |
| ("abc", "azbc", None, None), |
| ("abc", "abzc", None, None), |
| ("abczdef", "abczdefzzzzzzzzzzzzzzzzzzzz", Some(0), Some(0)), |
| ("abczdef", "zzzzzzzzzzzzzzzzzzzzabczdef", Some(20), Some(20)), |
| ("xyz", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaxyz", Some(32), Some(32)), |
| // Failures caught by quickcheck. |
| ("\u{0}\u{15}", "\u{0}\u{15}\u{15}\u{0}", Some(0), Some(0)), |
| ("\u{0}\u{1e}", "\u{1e}\u{0}", None, None), |
| ]; |
| |
| /// Run the substring search tests. `search` should be a closure that |
| /// accepts a haystack and a needle and returns the starting position |
| /// of the first occurrence of needle in the haystack, or `None` if one |
| /// doesn't exist. |
| pub(crate) fn run_search_tests_fwd( |
| mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, |
| ) { |
| for &(needle, haystack, expected_fwd, _) in SEARCH_TESTS { |
| let (n, h) = (needle.as_bytes(), haystack.as_bytes()); |
| assert_eq!( |
| expected_fwd, |
| search(h, n), |
| "needle: {:?}, haystack: {:?}, expected: {:?}", |
| n, |
| h, |
| expected_fwd |
| ); |
| } |
| } |
| |
| /// Run the substring search tests. `search` should be a closure that |
| /// accepts a haystack and a needle and returns the starting position of |
| /// the last occurrence of needle in the haystack, or `None` if one doesn't |
| /// exist. |
| pub(crate) fn run_search_tests_rev( |
| mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, |
| ) { |
| for &(needle, haystack, _, expected_rev) in SEARCH_TESTS { |
| let (n, h) = (needle.as_bytes(), haystack.as_bytes()); |
| assert_eq!( |
| expected_rev, |
| search(h, n), |
| "needle: {:?}, haystack: {:?}, expected: {:?}", |
| n, |
| h, |
| expected_rev |
| ); |
| } |
| } |
| } |