src/memmem/genericsimd.rs - platform/external/rust/crates/memchr - Git at Google

 use core::mem::size_of;

 use crate::memmem::{util::memcmp, vector::Vector, NeedleInfo};

 /// The minimum length of a needle required for this algorithm. The minimum
 /// is 2 since a length of 1 should just use memchr and a length of 0 isn't
 /// a case handled by this searcher.
 pub(crate) const MIN_NEEDLE_LEN: usize = 2;

 /// The maximum length of a needle permitted for this algorithm.
 ///
 /// In reality, there is no hard max here. The code below can handle any
 /// length needle. (Perhaps that suggests there are missing optimizations.)
 /// Instead, this is a heuristic and a bound guaranteeing our linear time
 /// complexity.
 ///
 /// It is a heuristic because when a candidate match is found, memcmp is run.
 /// For very large needles with lots of false positives, memcmp can make the
 /// code run quite slow.
 ///
 /// It is a bound because the worst case behavior with memcmp is multiplicative
 /// in the size of the needle and haystack, and we want to keep that additive.
 /// This bound ensures we still meet that bound theoretically, since it's just
 /// a constant. We aren't acting in bad faith here, memcmp on tiny needles
 /// is so fast that even in pathological cases (see pathological vector
 /// benchmarks), this is still just as fast or faster in practice.
 ///
 /// This specific number was chosen by tweaking a bit and running benchmarks.
 /// The rare-medium-needle, for example, gets about 5% faster by using this
 /// algorithm instead of a prefilter-accelerated Two-Way. There's also a
 /// theoretical desire to keep this number reasonably low, to mitigate the
 /// impact of pathological cases. I did try 64, and some benchmarks got a
 /// little better, and others (particularly the pathological ones), got a lot
 /// worse. So... 32 it is?
 pub(crate) const MAX_NEEDLE_LEN: usize = 32;

 /// The implementation of the forward vector accelerated substring search.
 ///
 /// This is extremely similar to the prefilter vector module by the same name.
 /// The key difference is that this is not a prefilter. Instead, it handles
 /// confirming its own matches. The trade off is that this only works with
 /// smaller needles. The speed up here is that an inlined memcmp on a tiny
 /// needle is very quick, even on pathological inputs. This is much better than
 /// combining a prefilter with Two-Way, where using Two-Way to confirm the
 /// match has higher latency.
 ///
 /// So why not use this for all needles? We could, and it would probably work
 /// really well on most inputs. But its worst case is multiplicative and we
 /// want to guarantee worst case additive time. Some of the benchmarks try to
 /// justify this (see the pathological ones).
 ///
 /// The prefilter variant of this has more comments. Also note that we only
 /// implement this for forward searches for now. If you have a compelling use
 /// case for accelerated reverse search, please file an issue.
 #[derive(Clone, Copy, Debug)]
 pub(crate) struct Forward {
     // Offset into the needle of the first rare byte. It is used both to
     // index the needle and as the displacement of the first vector load.
     rare1i: u8,
     // Offset into the needle of the second rare byte. The minimum haystack
     // length is derived from this offset, so it is presumably never smaller
     // than `rare1i` (the pair comes from `as_rare_ordered_u8`) -- TODO
     // confirm against the rare byte detection code.
     rare2i: u8,
 }

 impl Forward {
     /// Builds a "generic simd" forward searcher for `needle`.
     ///
     /// The two rare byte offsets are taken from `ninfo.rarebytes`. `None` is
     /// returned when the needle length falls outside of
     /// `MIN_NEEDLE_LEN..=MAX_NEEDLE_LEN`, or when both rare byte offsets are
     /// equal. (The latter likely indicates a degenerate case, although it
     /// should technically not be possible.)
     pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
         let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_u8();
         match needle.len() {
             MIN_NEEDLE_LEN..=MAX_NEEDLE_LEN if rare1i != rare2i => {
                 Some(Forward { rare1i, rare2i })
             }
             _ => None,
         }
     }

     /// Reports the shortest haystack this searcher can handle for the vector
     /// type `V`: the second rare byte offset plus the size of one vector.
     /// `fwd_find` asserts that its haystack is at least this long.
     #[inline(always)]
     pub(crate) fn min_haystack_len<V: Vector>(&self) -> usize {
         let vector_len = size_of::<V>();
         vector_len + self.rare2i as usize
     }
 }

 /// Searches the given haystack for the given needle. The needle given should
 /// be the same as the needle that this searcher was initialized with.
 ///
 /// Returns the haystack-relative offset of the first match encountered while
 /// scanning forward, or `None` when no match is found. A haystack that is
 /// shorter than the needle yields `None` before any other check runs.
 ///
 /// # Panics
 ///
 /// When the given haystack has a length smaller than `min_haystack_len`
 /// (and is at least as long as the needle).
 ///
 /// # Safety
 ///
 /// Since this is meant to be used with vector functions, callers need to
 /// specialize this inside of a function with a `target_feature` attribute.
 /// Therefore, callers must ensure that whatever target feature is being used
 /// supports the vector functions that this function is specialized for. (For
 /// the specific vector functions used, see the Vector trait implementations.)
 #[inline(always)]
 pub(crate) unsafe fn fwd_find<V: Vector>(
     fwd: &Forward,
     haystack: &[u8],
     needle: &[u8],
 ) -> Option<usize> {
     // It would be nice if we didn't have this check here, since the meta
     // searcher should handle it for us. But without this, I don't think we
     // guarantee that end_ptr.sub(needle.len()) won't result in UB. We could
     // put it as part of the safety contract, but it makes it more complicated
     // than necessary.
     if haystack.len() < needle.len() {
         return None;
     }
     // This is a hard assert (not a debug assert) because the pointer
     // arithmetic below subtracts min_haystack_len from the end of the
     // haystack.
     let min_haystack_len = fwd.min_haystack_len::<V>();
     assert!(haystack.len() >= min_haystack_len, "haystack too small");
     debug_assert!(needle.len() <= haystack.len());
     debug_assert!(
         needle.len() >= MIN_NEEDLE_LEN,
         "needle must be at least {} bytes",
         MIN_NEEDLE_LEN,
     );
     debug_assert!(
         needle.len() <= MAX_NEEDLE_LEN,
         "needle must be at most {} bytes",
         MAX_NEEDLE_LEN,
     );

     // Build one vector per rare byte, with that byte repeated in each lane.
     let (rare1i, rare2i) = (fwd.rare1i as usize, fwd.rare2i as usize);
     let rare1chunk = V::splat(needle[rare1i]);
     let rare2chunk = V::splat(needle[rare2i]);

     // max_ptr is the last chunk start for which the vector load at
     // (ptr + rare2i) still ends at or before end_ptr, since
     // min_haystack_len == rare2i + size(V).
     let start_ptr = haystack.as_ptr();
     let end_ptr = start_ptr.add(haystack.len());
     let max_ptr = end_ptr.sub(min_haystack_len);
     let mut ptr = start_ptr;

     // N.B. I did experiment with unrolling the loop to deal with size(V)
     // bytes at a time and 2*size(V) bytes at a time. The double unroll was
     // marginally faster while the quadruple unroll was unambiguously slower.
     // In the end, I decided the complexity from unrolling wasn't worth it. I
     // used the memmem/krate/prebuilt/huge-en/ benchmarks to compare.
     while ptr <= max_ptr {
         // !0 sets every mask bit: all positions in the chunk are candidates.
         let m = fwd_find_in_chunk(
             fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, !0,
         );
         if let Some(chunki) = m {
             return Some(matched(start_ptr, ptr, chunki));
         }
         ptr = ptr.add(size_of::<V>());
     }
     // Handle the tail: ptr has moved past max_ptr, so a chunk starting at
     // ptr can no longer be loaded in full.
     if ptr < end_ptr {
         let remaining = diff(end_ptr, ptr);
         debug_assert!(
             remaining < min_haystack_len,
             "remaining bytes should be smaller than the minimum haystack \
              length of {}, but there are {} bytes remaining",
             min_haystack_len,
             remaining,
         );
         // The needle cannot fit in what is left, so there is nothing to do.
         if remaining < needle.len() {
             return None;
         }
         debug_assert!(
             max_ptr < ptr,
             "after main loop, ptr should have exceeded max_ptr",
         );
         // Number of positions between max_ptr and ptr. These were already
         // examined by the last iteration of the main loop.
         let overlap = diff(ptr, max_ptr);
         debug_assert!(
             overlap > 0,
             "overlap ({}) must always be non-zero",
             overlap,
         );
         debug_assert!(
             overlap < size_of::<V>(),
             "overlap ({}) cannot possibly be >= than a vector ({})",
             overlap,
             size_of::<V>(),
         );
         // The mask has all of its bits set except for the first N least
         // significant bits, where N=overlap. This way, any matches that
         // occur in find_in_chunk within the overlap are automatically
         // ignored.
         let mask = !((1 << overlap) - 1);
         // Back up to the last position from which a full chunk can be
         // loaded and search it once more with the overlap masked out.
         ptr = max_ptr;
         let m = fwd_find_in_chunk(
             fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, mask,
         );
         if let Some(chunki) = m {
             return Some(matched(start_ptr, ptr, chunki));
         }
     }
     None
 }

 /// Looks for the needle within a single chunk starting at `ptr`, where
 /// `end_ptr` points to the end of the haystack.
 ///
 /// Two vectors are loaded, one displaced by each rare byte offset, and
 /// compared against `rare1chunk` and `rare2chunk` (vectors with the rare1 and
 /// rare2 bytes repeated in each 8-bit lane, respectively). Every position at
 /// which both comparisons succeed is a candidate, and each candidate is
 /// confirmed with memcmp. The returned offset is relative to `ptr`.
 ///
 /// `mask` should have bits set corresponding to the positions in the chunk
 /// in which matches are considered. This is only used for the last vector
 /// load, where the beginning of the vector might have overlapped with the
 /// last load in the main loop. The mask lets us avoid visiting positions that
 /// have already been discarded as matches.
 ///
 /// # Safety
 ///
 /// It must be safe to do an unaligned read of size(V) bytes starting at both
 /// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned
 /// loads on ptr up to (end_ptr - needle.len()).
 #[inline(always)]
 unsafe fn fwd_find_in_chunk<V: Vector>(
     fwd: &Forward,
     needle: &[u8],
     ptr: *const u8,
     end_ptr: *const u8,
     rare1chunk: V,
     rare2chunk: V,
     mask: u32,
 ) -> Option<usize> {
     let (off1, off2) = (fwd.rare1i as usize, fwd.rare2i as usize);
     let hits1 = V::load_unaligned(ptr.add(off1)).cmpeq(rare1chunk);
     let hits2 = V::load_unaligned(ptr.add(off2)).cmpeq(rare2chunk);

     // One bit per position at which both rare bytes line up, restricted to
     // the positions the caller asked for.
     let mut candidates = hits1.and(hits2).movemask() & mask;
     while candidates != 0 {
         let lane = candidates.trailing_zeros() as usize;
         let candidate = ptr.add(lane);
         // The needle would run past the end of the haystack. Later
         // candidates sit at even higher addresses, so stop here.
         if candidate > end_ptr.sub(needle.len()) {
             return None;
         }
         let window = core::slice::from_raw_parts(candidate, needle.len());
         if memcmp(needle, window) {
             return Some(lane);
         }
         // Clear the lowest set bit and move on to the next candidate.
         candidates &= candidates - 1;
     }
     None
 }

 /// Translates a chunk-relative match offset into a haystack-relative offset.
 ///
 /// `ptr` is the start of the chunk in which the match was found and
 /// `start_ptr` is the start of the haystack. `ptr` must not precede
 /// `start_ptr`.
 ///
 /// See the function with the same name in the prefilter variant of this
 /// algorithm to learn why it's tagged with inline(never). Even here, where
 /// the function is simpler, inlining it leads to poorer codegen. (Although
 /// it does improve some benchmarks, like prebuiltiter/huge-en/common-you.)
 #[cold]
 #[inline(never)]
 fn matched(start_ptr: *const u8, ptr: *const u8, chunki: usize) -> usize {
     let chunk_start = diff(ptr, start_ptr);
     chunk_start + chunki
 }

 /// Returns the distance in bytes from `b` up to `a`. The caller must ensure
 /// that `a` is not smaller than `b`; this is only checked in debug builds.
 fn diff(a: *const u8, b: *const u8) -> usize {
     debug_assert!(a >= b);
     let (hi, lo) = (a as usize, b as usize);
     hi - lo
 }
	use core::mem::size_of;

	use crate::memmem::{util::memcmp, vector::Vector, NeedleInfo};

	/// The minimum length of a needle required for this algorithm. The minimum
	/// is 2 since a length of 1 should just use memchr and a length of 0 isn't
	/// a case handled by this searcher.
	pub(crate) const MIN_NEEDLE_LEN: usize = 2;

	/// The maximum length of a needle permitted for this algorithm.
	///
	/// In reality, there is no hard max here. The code below can handle any
	/// length needle. (Perhaps that suggests there are missing optimizations.)
	/// Instead, this is a heuristic and a bound guaranteeing our linear time
	/// complexity.
	///
	/// It is a heuristic because when a candidate match is found, memcmp is run.
	/// For very large needles with lots of false positives, memcmp can make the
	/// code run quite slow.
	///
	/// It is a bound because the worst case behavior with memcmp is multiplicative
	/// in the size of the needle and haystack, and we want to keep that additive.
	/// This bound ensures we still meet that bound theoretically, since it's just
	/// a constant. We aren't acting in bad faith here, memcmp on tiny needles
	/// is so fast that even in pathological cases (see pathological vector
	/// benchmarks), this is still just as fast or faster in practice.
	///
	/// This specific number was chosen by tweaking a bit and running benchmarks.
	/// The rare-medium-needle, for example, gets about 5% faster by using this
	/// algorithm instead of a prefilter-accelerated Two-Way. There's also a
	/// theoretical desire to keep this number reasonably low, to mitigate the
	/// impact of pathological cases. I did try 64, and some benchmarks got a
	/// little better, and others (particularly the pathological ones), got a lot
	/// worse. So... 32 it is?
	pub(crate) const MAX_NEEDLE_LEN: usize = 32;

	/// The implementation of the forward vector accelerated substring search.
	///
	/// This is extremely similar to the prefilter vector module by the same name.
	/// The key difference is that this is not a prefilter. Instead, it handles
	/// confirming its own matches. The trade off is that this only works with
	/// smaller needles. The speed up here is that an inlined memcmp on a tiny
	/// needle is very quick, even on pathological inputs. This is much better than
	/// combining a prefilter with Two-Way, where using Two-Way to confirm the
	/// match has higher latency.
	///
	/// So why not use this for all needles? We could, and it would probably work
	/// really well on most inputs. But its worst case is multiplicative and we
	/// want to guarantee worst case additive time. Some of the benchmarks try to
	/// justify this (see the pathological ones).
	///
	/// The prefilter variant of this has more comments. Also note that we only
	/// implement this for forward searches for now. If you have a compelling use
	/// case for accelerated reverse search, please file an issue.
	#[derive(Clone, Copy, Debug)]
	pub(crate) struct Forward {
	// Offset into the needle of the first rare byte. It is used both to
	// index the needle and as the displacement of the first vector load.
	rare1i: u8,
	// Offset into the needle of the second rare byte. The minimum haystack
	// length is derived from this offset, so it is presumably never smaller
	// than `rare1i` (the pair comes from `as_rare_ordered_u8`) -- TODO
	// confirm against the rare byte detection code.
	rare2i: u8,
	}

	impl Forward {
	/// Create a new "generic simd" forward searcher. If one could not be
	/// created from the given inputs, then None is returned.
	pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
	let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_u8();
	// If the needle is too short or too long, give up. Also, give up
	// if the rare bytes detected are at the same position. (It likely
	// suggests a degenerate case, although it should technically not be
	// possible.)
	if needle.len() < MIN_NEEDLE_LEN
	\|\| needle.len() > MAX_NEEDLE_LEN
	\|\| rare1i == rare2i
	{
	return None;
	}
	Some(Forward { rare1i, rare2i })
	}

	/// Returns the minimum length of haystack that is needed for this searcher
	/// to work for a particular vector. Passing a haystack with a length
	/// smaller than this will cause `fwd_find` to panic.
	#[inline(always)]
	pub(crate) fn min_haystack_len<V: Vector>(&self) -> usize {
	self.rare2i as usize + size_of::<V>()
	}
	}

	/// Searches the given haystack for the given needle. The needle given should
	/// be the same as the needle that this searcher was initialized with.
	///
	/// Returns the haystack-relative offset of the first match encountered while
	/// scanning forward, or `None` when no match is found. A haystack that is
	/// shorter than the needle yields `None` before any other check runs.
	///
	/// # Panics
	///
	/// When the given haystack has a length smaller than `min_haystack_len`
	/// (and is at least as long as the needle).
	///
	/// # Safety
	///
	/// Since this is meant to be used with vector functions, callers need to
	/// specialize this inside of a function with a `target_feature` attribute.
	/// Therefore, callers must ensure that whatever target feature is being used
	/// supports the vector functions that this function is specialized for. (For
	/// the specific vector functions used, see the Vector trait implementations.)
	#[inline(always)]
	pub(crate) unsafe fn fwd_find<V: Vector>(
	fwd: &Forward,
	haystack: &[u8],
	needle: &[u8],
	) -> Option<usize> {
	// It would be nice if we didn't have this check here, since the meta
	// searcher should handle it for us. But without this, I don't think we
	// guarantee that end_ptr.sub(needle.len()) won't result in UB. We could
	// put it as part of the safety contract, but it makes it more complicated
	// than necessary.
	if haystack.len() < needle.len() {
	return None;
	}
	// This is a hard assert (not a debug assert) because the pointer
	// arithmetic below subtracts min_haystack_len from the end of the
	// haystack.
	let min_haystack_len = fwd.min_haystack_len::<V>();
	assert!(haystack.len() >= min_haystack_len, "haystack too small");
	debug_assert!(needle.len() <= haystack.len());
	debug_assert!(
	needle.len() >= MIN_NEEDLE_LEN,
	"needle must be at least {} bytes",
	MIN_NEEDLE_LEN,
	);
	debug_assert!(
	needle.len() <= MAX_NEEDLE_LEN,
	"needle must be at most {} bytes",
	MAX_NEEDLE_LEN,
	);

	// Build one vector per rare byte, with that byte repeated in each lane.
	let (rare1i, rare2i) = (fwd.rare1i as usize, fwd.rare2i as usize);
	let rare1chunk = V::splat(needle[rare1i]);
	let rare2chunk = V::splat(needle[rare2i]);

	// max_ptr is the last chunk start for which the vector load at
	// (ptr + rare2i) still ends at or before end_ptr, since
	// min_haystack_len == rare2i + size(V).
	let start_ptr = haystack.as_ptr();
	let end_ptr = start_ptr.add(haystack.len());
	let max_ptr = end_ptr.sub(min_haystack_len);
	let mut ptr = start_ptr;

	// N.B. I did experiment with unrolling the loop to deal with size(V)
	// bytes at a time and 2*size(V) bytes at a time. The double unroll was
	// marginally faster while the quadruple unroll was unambiguously slower.
	// In the end, I decided the complexity from unrolling wasn't worth it. I
	// used the memmem/krate/prebuilt/huge-en/ benchmarks to compare.
	while ptr <= max_ptr {
	// !0 sets every mask bit: all positions in the chunk are candidates.
	let m = fwd_find_in_chunk(
	fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, !0,
	);
	if let Some(chunki) = m {
	return Some(matched(start_ptr, ptr, chunki));
	}
	ptr = ptr.add(size_of::<V>());
	}
	// Handle the tail: ptr has moved past max_ptr, so a chunk starting at
	// ptr can no longer be loaded in full.
	if ptr < end_ptr {
	let remaining = diff(end_ptr, ptr);
	debug_assert!(
	remaining < min_haystack_len,
	"remaining bytes should be smaller than the minimum haystack \
	length of {}, but there are {} bytes remaining",
	min_haystack_len,
	remaining,
	);
	// The needle cannot fit in what is left, so there is nothing to do.
	if remaining < needle.len() {
	return None;
	}
	debug_assert!(
	max_ptr < ptr,
	"after main loop, ptr should have exceeded max_ptr",
	);
	// Number of positions between max_ptr and ptr. These were already
	// examined by the last iteration of the main loop.
	let overlap = diff(ptr, max_ptr);
	debug_assert!(
	overlap > 0,
	"overlap ({}) must always be non-zero",
	overlap,
	);
	debug_assert!(
	overlap < size_of::<V>(),
	"overlap ({}) cannot possibly be >= than a vector ({})",
	overlap,
	size_of::<V>(),
	);
	// The mask has all of its bits set except for the first N least
	// significant bits, where N=overlap. This way, any matches that
	// occur in find_in_chunk within the overlap are automatically
	// ignored.
	let mask = !((1 << overlap) - 1);
	// Back up to the last position from which a full chunk can be
	// loaded and search it once more with the overlap masked out.
	ptr = max_ptr;
	let m = fwd_find_in_chunk(
	fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, mask,
	);
	if let Some(chunki) = m {
	return Some(matched(start_ptr, ptr, chunki));
	}
	}
	None
	}

	/// Looks for the needle within a single chunk starting at `ptr`, where
	/// `end_ptr` points to the end of the haystack.
	///
	/// Two vectors are loaded, one displaced by each rare byte offset, and
	/// compared against `rare1chunk` and `rare2chunk` (vectors with the rare1 and
	/// rare2 bytes repeated in each 8-bit lane, respectively). Every position at
	/// which both comparisons succeed is a candidate, and each candidate is
	/// confirmed with memcmp. The returned offset is relative to `ptr`.
	///
	/// `mask` should have bits set corresponding to the positions in the chunk
	/// in which matches are considered. This is only used for the last vector
	/// load, where the beginning of the vector might have overlapped with the
	/// last load in the main loop. The mask lets us avoid visiting positions that
	/// have already been discarded as matches.
	///
	/// # Safety
	///
	/// It must be safe to do an unaligned read of size(V) bytes starting at both
	/// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned
	/// loads on ptr up to (end_ptr - needle.len()).
	#[inline(always)]
	unsafe fn fwd_find_in_chunk<V: Vector>(
	    fwd: &Forward,
	    needle: &[u8],
	    ptr: *const u8,
	    end_ptr: *const u8,
	    rare1chunk: V,
	    rare2chunk: V,
	    mask: u32,
	) -> Option<usize> {
	    let (off1, off2) = (fwd.rare1i as usize, fwd.rare2i as usize);
	    let hits1 = V::load_unaligned(ptr.add(off1)).cmpeq(rare1chunk);
	    let hits2 = V::load_unaligned(ptr.add(off2)).cmpeq(rare2chunk);

	    // One bit per position at which both rare bytes line up, restricted to
	    // the positions the caller asked for.
	    let mut candidates = hits1.and(hits2).movemask() & mask;
	    while candidates != 0 {
	        let lane = candidates.trailing_zeros() as usize;
	        let candidate = ptr.add(lane);
	        // The needle would run past the end of the haystack. Later
	        // candidates sit at even higher addresses, so stop here.
	        if candidate > end_ptr.sub(needle.len()) {
	            return None;
	        }
	        let window = core::slice::from_raw_parts(candidate, needle.len());
	        if memcmp(needle, window) {
	            return Some(lane);
	        }
	        // Clear the lowest set bit and move on to the next candidate.
	        candidates &= candidates - 1;
	    }
	    None
	}

	/// Accepts a chunk-relative offset and returns a haystack relative offset
	/// after updating the prefilter state.
	///
	/// See the same function with the same name in the prefilter variant of this
	/// algorithm to learned why it's tagged with inline(never). Even here, where
	/// the function is simpler, inlining it leads to poorer codegen. (Although
	/// it does improve some benchmarks, like prebuiltiter/huge-en/common-you.)
	#[cold]
	#[inline(never)]
	fn matched(start_ptr: const u8, ptr: const u8, chunki: usize) -> usize {
	diff(ptr, start_ptr) + chunki
	}

	/// Subtract `b` from `a` and return the difference in bytes. `a` must be
	/// greater than or equal to `b`; this is only checked in debug builds.
	fn diff(a: *const u8, b: *const u8) -> usize {
	    debug_assert!(a >= b);
	    (a as usize) - (b as usize)
	}