| // See the README in this directory for an explanation of the Teddy algorithm. |
| // It is strongly recommended to peruse the README before trying to grok this |
| // code, as its use of SIMD is pretty opaque, although I tried to add comments |
| // where appropriate. |
| // |
| // Moreover, while there is a lot of code in this file, most of it is |
| // repeated variants of the same thing. Specifically, there are three Teddy |
| // variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets) |
| // and Fat 256-bit Teddy (16 buckets). For each variant, there are three |
| // implementations, corresponding to mask lengths of 1, 2 and 3. Bringing it to |
| // a total of nine variants. Each one is structured roughly the same: |
| // |
| // while at <= len(haystack) - CHUNK_SIZE: |
| // let candidate = find_candidate_in_chunk(haystack, at) |
| // if not all zeroes(candidate): |
| // if match = verify(haystack, at, candidate): |
| // return match |
| // |
| // For the most part, this remains unchanged. The parts that vary are the |
| // verification routine (for slim vs fat Teddy) and the candidate extraction |
| // (based on the number of masks). |
| // |
| // In the code below, a "candidate" corresponds to a single vector with 8-bit |
| // lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the |
| // jth lane if and only if the byte occurring at position `j` is in the |
| // bucket `i` (where the `j`th position is the position in the current window |
| // of the haystack, which is always 16 or 32 bytes). Note to be careful here: |
| // the ith bit and the jth lane correspond to the least significant bits of the |
| // vector. So when visualizing how the current window of bytes is stored in a |
| // vector, you often need to flip it around. For example, the text `abcd` in a |
| // 4-byte vector would look like this: |
| // |
| // 01100100 01100011 01100010 01100001 |
| // d c b a |
| // |
| // When the mask length is 1, then finding the candidate is pretty straight |
| // forward: you just apply the shuffle indices (from the haystack window) to |
| // the masks, and then AND them together, as described in the README. But for |
| // masks of length 2 and 3, you need to keep a little state. Specifically, |
| // you need to store the final 1 (for mask length 2) or 2 (for mask length 3) |
| // bytes of the candidate for use when searching the next window. This is for |
| // handling matches that span two windows. |
| // |
| // With respect to the repeated code, it would likely be possible to reduce |
| // the number of copies of code below using polymorphism, but I find this |
| // formulation clearer instead of needing to reason through generics. However, |
| // I admit, there may be a simpler generic construction that I'm missing. |
| // |
| // All variants are fairly heavily tested in src/packed/tests.rs. |
| |
| use std::arch::x86_64::*; |
| use std::mem; |
| |
| use crate::packed::pattern::{PatternID, Patterns}; |
| use crate::packed::teddy::compile; |
| use crate::packed::vector::*; |
| use crate::Match; |
| |
| /// The Teddy runtime. |
| /// |
| /// A Teddy runtime can be used to quickly search for occurrences of one or |
| /// more patterns. While it does not scale to an arbitrary number of patterns |
| /// like Aho-Corasick, it does find occurrences for a small set of patterns |
| /// much more quickly than Aho-Corasick. |
| /// |
| /// Teddy cannot run on small haystacks below a certain size, which is |
| /// dependent on the type of matcher used. This size can be queried via the |
| /// `minimum_len` method. Violating this will result in a panic. |
| /// |
| /// Finally, when callers use a Teddy runtime, they must provide precisely the |
| /// patterns used to construct the Teddy matcher. Violating this will result |
| /// in either a panic or incorrect results, but will never sacrifice memory |
| /// safety. |
| #[derive(Clone, Debug)] |
| pub struct Teddy { |
| /// The allocation of patterns in buckets. This only contains the IDs of |
| /// patterns. In order to do full verification, callers must provide the |
| /// actual patterns when using Teddy. |
| pub buckets: Vec<Vec<PatternID>>, |
| /// The maximum identifier of a pattern. This is used as a sanity check to |
| /// ensure that the patterns provided by the caller are the same as the |
| /// patterns that were used to compile the matcher. This sanity check |
| /// permits safely eliminating bounds checks regardless of what patterns |
| /// are provided by the caller. |
| /// |
| /// Note that users of the aho-corasick crate cannot get this wrong. Only |
| /// code internal to this crate can get it wrong, since neither `Patterns` |
| /// type nor the Teddy runtime are public API items. |
| pub max_pattern_id: PatternID, |
| /// The actual runtime to use. |
| pub exec: Exec, |
| } |
| |
| impl Teddy { |
| /// Return the first occurrence of a match in the given haystack after or |
| /// starting at `at`. |
| /// |
| /// The patterns provided must be precisely the same patterns given to the |
| /// Teddy builder, otherwise this may panic or produce incorrect results. |
| /// |
| /// All matches are consistent with the match semantics (leftmost-first or |
| /// leftmost-longest) set on `pats`. |
| pub fn find_at( |
| &self, |
| pats: &Patterns, |
| haystack: &[u8], |
| at: usize, |
| ) -> Option<Match> { |
| // This assert is a bit subtle, but it's an important guarantee. |
| // Namely, if the maximum pattern ID seen by Teddy is the same as the |
| // one in the patterns given, then we are guaranteed that every pattern |
| // ID in all Teddy buckets are valid indices into `pats`. While this |
| // is nominally true, there is no guarantee that callers provide the |
| // same `pats` to both the Teddy builder and the searcher, which would |
| // otherwise make `find_at` unsafe to call. But this assert lets us |
| // keep this routine safe and eliminate an important bounds check in |
| // verification. |
| assert_eq!( |
| self.max_pattern_id, |
| pats.max_pattern_id(), |
| "teddy must be called with same patterns it was built with", |
| ); |
| // SAFETY: The haystack must have at least a minimum number of bytes |
| // for Teddy to be able to work. The minimum number varies depending on |
| // which matcher is used below. If this is violated, then it's possible |
| // for searching to do out-of-bounds writes. |
| assert!(haystack[at..].len() >= self.minimum_len()); |
| // SAFETY: The various Teddy matchers are always safe to call because |
| // the Teddy builder guarantees that a particular Exec variant is |
| // built only when it can be run the current CPU. That is, the Teddy |
| // builder will not produce a Exec::TeddySlim1Mask256 unless AVX2 is |
| // enabled. That is, our dynamic CPU feature detection is performed |
| // once in the builder, and we rely on the type system to avoid needing |
| // to do it again. |
| unsafe { |
| match self.exec { |
| Exec::TeddySlim1Mask128(ref e) => { |
| e.find_at(pats, self, haystack, at) |
| } |
| Exec::TeddySlim1Mask256(ref e) => { |
| e.find_at(pats, self, haystack, at) |
| } |
| Exec::TeddyFat1Mask256(ref e) => { |
| e.find_at(pats, self, haystack, at) |
| } |
| Exec::TeddySlim2Mask128(ref e) => { |
| e.find_at(pats, self, haystack, at) |
| } |
| Exec::TeddySlim2Mask256(ref e) => { |
| e.find_at(pats, self, haystack, at) |
| } |
| Exec::TeddyFat2Mask256(ref e) => { |
| e.find_at(pats, self, haystack, at) |
| } |
| Exec::TeddySlim3Mask128(ref e) => { |
| e.find_at(pats, self, haystack, at) |
| } |
| Exec::TeddySlim3Mask256(ref e) => { |
| e.find_at(pats, self, haystack, at) |
| } |
| Exec::TeddyFat3Mask256(ref e) => { |
| e.find_at(pats, self, haystack, at) |
| } |
| } |
| } |
| } |
| |
| /// Returns the minimum length of a haystack that must be provided by |
| /// callers to this Teddy searcher. Providing a haystack shorter than this |
| /// will result in a panic, but will never violate memory safety. |
| pub fn minimum_len(&self) -> usize { |
| // SAFETY: These values must be correct in order to ensure safety. |
| // The Teddy runtime assumes their haystacks have at least these |
| // lengths. Violating this will sacrifice memory safety. |
| match self.exec { |
| Exec::TeddySlim1Mask128(_) => 16, |
| Exec::TeddySlim1Mask256(_) => 32, |
| Exec::TeddyFat1Mask256(_) => 16, |
| Exec::TeddySlim2Mask128(_) => 17, |
| Exec::TeddySlim2Mask256(_) => 33, |
| Exec::TeddyFat2Mask256(_) => 17, |
| Exec::TeddySlim3Mask128(_) => 18, |
| Exec::TeddySlim3Mask256(_) => 34, |
| Exec::TeddyFat3Mask256(_) => 34, |
| } |
| } |
| |
| /// Returns the approximate total amount of heap used by this searcher, in |
| /// units of bytes. |
| pub fn heap_bytes(&self) -> usize { |
| let num_patterns = self.max_pattern_id as usize + 1; |
| self.buckets.len() * mem::size_of::<Vec<PatternID>>() |
| + num_patterns * mem::size_of::<PatternID>() |
| } |
| |
| /// Runs the verification routine for Slim 128-bit Teddy. |
| /// |
| /// The candidate given should be a collection of 8-bit bitsets (one bitset |
| /// per lane), where the ith bit is set in the jth lane if and only if the |
| /// byte occurring at `at + j` in `haystack` is in the bucket `i`. |
| /// |
| /// This is not safe to call unless the SSSE3 target feature is enabled. |
| /// The `target_feature` attribute is not applied since this function is |
| /// always forcefully inlined. |
| #[inline(always)] |
| unsafe fn verify128( |
| &self, |
| pats: &Patterns, |
| haystack: &[u8], |
| at: usize, |
| cand: __m128i, |
| ) -> Option<Match> { |
| debug_assert!(!is_all_zeroes128(cand)); |
| debug_assert_eq!(8, self.buckets.len()); |
| |
| // Convert the candidate into 64-bit chunks, and then verify each of |
| // those chunks. |
| let parts = unpack64x128(cand); |
| for (i, &part) in parts.iter().enumerate() { |
| let pos = at + i * 8; |
| if let Some(m) = self.verify64(pats, 8, haystack, pos, part) { |
| return Some(m); |
| } |
| } |
| None |
| } |
| |
| /// Runs the verification routine for Slim 256-bit Teddy. |
| /// |
| /// The candidate given should be a collection of 8-bit bitsets (one bitset |
| /// per lane), where the ith bit is set in the jth lane if and only if the |
| /// byte occurring at `at + j` in `haystack` is in the bucket `i`. |
| /// |
| /// This is not safe to call unless the AVX2 target feature is enabled. |
| /// The `target_feature` attribute is not applied since this function is |
| /// always forcefully inlined. |
| #[inline(always)] |
| unsafe fn verify256( |
| &self, |
| pats: &Patterns, |
| haystack: &[u8], |
| at: usize, |
| cand: __m256i, |
| ) -> Option<Match> { |
| debug_assert!(!is_all_zeroes256(cand)); |
| debug_assert_eq!(8, self.buckets.len()); |
| |
| // Convert the candidate into 64-bit chunks, and then verify each of |
| // those chunks. |
| let parts = unpack64x256(cand); |
| for (i, &part) in parts.iter().enumerate() { |
| let pos = at + i * 8; |
| if let Some(m) = self.verify64(pats, 8, haystack, pos, part) { |
| return Some(m); |
| } |
| } |
| None |
| } |
| |
| /// Runs the verification routine for Fat 256-bit Teddy. |
| /// |
| /// The candidate given should be a collection of 8-bit bitsets (one bitset |
| /// per lane), where the ith bit is set in the jth lane if and only if the |
| /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the |
| /// bucket `j < 16 ? i : i + 8`. |
| /// |
| /// This is not safe to call unless the AVX2 target feature is enabled. |
| /// The `target_feature` attribute is not applied since this function is |
| /// always forcefully inlined. |
| #[inline(always)] |
| unsafe fn verify_fat256( |
| &self, |
| pats: &Patterns, |
| haystack: &[u8], |
| at: usize, |
| cand: __m256i, |
| ) -> Option<Match> { |
| debug_assert!(!is_all_zeroes256(cand)); |
| debug_assert_eq!(16, self.buckets.len()); |
| |
| // This is a bit tricky, but we basically want to convert our |
| // candidate, which looks like this |
| // |
| // a31 a30 ... a17 a16 a15 a14 ... a01 a00 |
| // |
| // where each a(i) is an 8-bit bitset corresponding to the activated |
| // buckets, to this |
| // |
| // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00 |
| // |
| // Namely, for Fat Teddy, the high 128-bits of the candidate correspond |
| // to the same bytes in the haystack in the low 128-bits (so we only |
| // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7. |
| // |
| // The verification routine wants to look at all potentially matching |
| // buckets before moving on to the next lane. So for example, both |
| // a16 and a00 both correspond to the first byte in our window; a00 |
| // contains buckets 0-7 and a16 contains buckets 8-15. Specifically, |
| // a16 should be checked before a01. So the transformation shown above |
| // allows us to use our normal verification procedure with one small |
| // change: we treat each bitset as 16 bits instead of 8 bits. |
| |
| // Swap the 128-bit lanes in the candidate vector. |
| let swap = _mm256_permute4x64_epi64(cand, 0x4E); |
| // Interleave the bytes from the low 128-bit lanes, starting with |
| // cand first. |
| let r1 = _mm256_unpacklo_epi8(cand, swap); |
| // Interleave the bytes from the high 128-bit lanes, starting with |
| // cand first. |
| let r2 = _mm256_unpackhi_epi8(cand, swap); |
| // Now just take the 2 low 64-bit integers from both r1 and r2. We |
| // can drop the high 64-bit integers because they are a mirror image |
| // of the low 64-bit integers. All we care about are the low 128-bit |
| // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets |
| // laid out in the desired order, as described above. |
| let parts = unpacklo64x256(r1, r2); |
| for (i, &part) in parts.iter().enumerate() { |
| let pos = at + i * 4; |
| if let Some(m) = self.verify64(pats, 16, haystack, pos, part) { |
| return Some(m); |
| } |
| } |
| None |
| } |
| |
| /// Verify whether there are any matches starting at or after `at` in the |
| /// given `haystack`. The candidate given should correspond to either 8-bit |
| /// (for 8 buckets) or 16-bit (16 buckets) bitsets. |
| #[inline(always)] |
| fn verify64( |
| &self, |
| pats: &Patterns, |
| bucket_count: usize, |
| haystack: &[u8], |
| at: usize, |
| mut cand: u64, |
| ) -> Option<Match> { |
| // N.B. While the bucket count is known from self.buckets.len(), |
| // requiring it as a parameter makes it easier for the optimizer to |
| // know its value, and thus produce more efficient codegen. |
| debug_assert!(bucket_count == 8 || bucket_count == 16); |
| while cand != 0 { |
| let bit = cand.trailing_zeros() as usize; |
| cand &= !(1 << bit); |
| |
| let at = at + (bit / bucket_count); |
| let bucket = bit % bucket_count; |
| if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) { |
| return Some(m); |
| } |
| } |
| None |
| } |
| |
| /// Verify whether there are any matches starting at `at` in the given |
| /// `haystack` corresponding only to patterns in the given bucket. |
| #[inline(always)] |
| fn verify_bucket( |
| &self, |
| pats: &Patterns, |
| haystack: &[u8], |
| bucket: usize, |
| at: usize, |
| ) -> Option<Match> { |
| // Forcing this function to not inline and be "cold" seems to help |
| // the codegen for Teddy overall. Interestingly, this is good for a |
| // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among |
| // others). Overall, this seems like a problem with codegen, since |
| // creating the Match itself is a very small amount of code. |
| #[cold] |
| #[inline(never)] |
| fn match_from_span( |
| pati: PatternID, |
| start: usize, |
| end: usize, |
| ) -> Match { |
| Match::from_span(pati as usize, start, end) |
| } |
| |
| // N.B. The bounds check for this bucket lookup *should* be elided |
| // since we assert the number of buckets in each `find_at` routine, |
| // and the compiler can prove that the `% 8` (or `% 16`) in callers |
| // of this routine will always be in bounds. |
| for &pati in &self.buckets[bucket] { |
| // SAFETY: This is safe because we are guaranteed that every |
| // index in a Teddy bucket is a valid index into `pats`. This |
| // guarantee is upheld by the assert checking `max_pattern_id` in |
| // the beginning of `find_at` above. |
| // |
| // This explicit bounds check elision is (amazingly) good for a |
| // 25-50% boost in some benchmarks, particularly ones with a lot |
| // of short literals. |
| let pat = unsafe { pats.get_unchecked(pati) }; |
| if pat.is_prefix(&haystack[at..]) { |
| return Some(match_from_span(pati, at, at + pat.len())); |
| } |
| } |
| None |
| } |
| } |
| |
| /// Exec represents the different search strategies supported by the Teddy |
| /// runtime. |
| /// |
| /// This enum is an important safety abstraction. Namely, callers should only |
| /// construct a variant in this enum if it is safe to execute its corresponding |
| /// target features on the current CPU. The 128-bit searchers require SSSE3, |
| /// while the 256-bit searchers require AVX2. |
| #[derive(Clone, Debug)] |
| pub enum Exec { |
| TeddySlim1Mask128(TeddySlim1Mask128), |
| TeddySlim1Mask256(TeddySlim1Mask256), |
| TeddyFat1Mask256(TeddyFat1Mask256), |
| TeddySlim2Mask128(TeddySlim2Mask128), |
| TeddySlim2Mask256(TeddySlim2Mask256), |
| TeddyFat2Mask256(TeddyFat2Mask256), |
| TeddySlim3Mask128(TeddySlim3Mask128), |
| TeddySlim3Mask256(TeddySlim3Mask256), |
| TeddyFat3Mask256(TeddyFat3Mask256), |
| } |
| |
| // Most of the code below remains undocumented because they are effectively |
| // repeated versions of themselves. The general structure is described in the |
| // README and in the comments above. |
| |
| #[derive(Clone, Debug)] |
| pub struct TeddySlim1Mask128 { |
| pub mask1: Mask128, |
| } |
| |
| impl TeddySlim1Mask128 { |
| #[target_feature(enable = "ssse3")] |
| unsafe fn find_at( |
| &self, |
| pats: &Patterns, |
| teddy: &Teddy, |
| haystack: &[u8], |
| mut at: usize, |
| ) -> Option<Match> { |
| debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
| // This assert helps eliminate bounds checks for bucket lookups in |
| // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
| assert_eq!(8, teddy.buckets.len()); |
| |
| let len = haystack.len(); |
| while at <= len - 16 { |
| let c = self.candidate(haystack, at); |
| if !is_all_zeroes128(c) { |
| if let Some(m) = teddy.verify128(pats, haystack, at, c) { |
| return Some(m); |
| } |
| } |
| at += 16; |
| } |
| if at < len { |
| at = len - 16; |
| let c = self.candidate(haystack, at); |
| if !is_all_zeroes128(c) { |
| if let Some(m) = teddy.verify128(pats, haystack, at, c) { |
| return Some(m); |
| } |
| } |
| } |
| None |
| } |
| |
| #[inline(always)] |
| unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i { |
| debug_assert!(haystack[at..].len() >= 16); |
| |
| let chunk = loadu128(haystack, at); |
| members1m128(chunk, self.mask1) |
| } |
| } |
| |
| #[derive(Clone, Debug)] |
| pub struct TeddySlim1Mask256 { |
| pub mask1: Mask256, |
| } |
| |
| impl TeddySlim1Mask256 { |
| #[target_feature(enable = "avx2")] |
| unsafe fn find_at( |
| &self, |
| pats: &Patterns, |
| teddy: &Teddy, |
| haystack: &[u8], |
| mut at: usize, |
| ) -> Option<Match> { |
| debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
| // This assert helps eliminate bounds checks for bucket lookups in |
| // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
| assert_eq!(8, teddy.buckets.len()); |
| |
| let len = haystack.len(); |
| while at <= len - 32 { |
| let c = self.candidate(haystack, at); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify256(pats, haystack, at, c) { |
| return Some(m); |
| } |
| } |
| at += 32; |
| } |
| if at < len { |
| at = len - 32; |
| let c = self.candidate(haystack, at); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify256(pats, haystack, at, c) { |
| return Some(m); |
| } |
| } |
| } |
| None |
| } |
| |
| #[inline(always)] |
| unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i { |
| debug_assert!(haystack[at..].len() >= 32); |
| |
| let chunk = loadu256(haystack, at); |
| members1m256(chunk, self.mask1) |
| } |
| } |
| |
| #[derive(Clone, Debug)] |
| pub struct TeddyFat1Mask256 { |
| pub mask1: Mask256, |
| } |
| |
| impl TeddyFat1Mask256 { |
| #[target_feature(enable = "avx2")] |
| unsafe fn find_at( |
| &self, |
| pats: &Patterns, |
| teddy: &Teddy, |
| haystack: &[u8], |
| mut at: usize, |
| ) -> Option<Match> { |
| debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
| // This assert helps eliminate bounds checks for bucket lookups in |
| // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
| assert_eq!(16, teddy.buckets.len()); |
| |
| let len = haystack.len(); |
| while at <= len - 16 { |
| let c = self.candidate(haystack, at); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) { |
| return Some(m); |
| } |
| } |
| at += 16; |
| } |
| if at < len { |
| at = len - 16; |
| let c = self.candidate(haystack, at); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) { |
| return Some(m); |
| } |
| } |
| } |
| None |
| } |
| |
| #[inline(always)] |
| unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i { |
| debug_assert!(haystack[at..].len() >= 16); |
| |
| let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at)); |
| members1m256(chunk, self.mask1) |
| } |
| } |
| |
| #[derive(Clone, Debug)] |
| pub struct TeddySlim2Mask128 { |
| pub mask1: Mask128, |
| pub mask2: Mask128, |
| } |
| |
| impl TeddySlim2Mask128 { |
| #[target_feature(enable = "ssse3")] |
| unsafe fn find_at( |
| &self, |
| pats: &Patterns, |
| teddy: &Teddy, |
| haystack: &[u8], |
| mut at: usize, |
| ) -> Option<Match> { |
| debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
| // This assert helps eliminate bounds checks for bucket lookups in |
| // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
| assert_eq!(8, teddy.buckets.len()); |
| |
| at += 1; |
| let len = haystack.len(); |
| let mut prev0 = ones128(); |
| while at <= len - 16 { |
| let c = self.candidate(haystack, at, &mut prev0); |
| if !is_all_zeroes128(c) { |
| if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { |
| return Some(m); |
| } |
| } |
| at += 16; |
| } |
| if at < len { |
| at = len - 16; |
| prev0 = ones128(); |
| |
| let c = self.candidate(haystack, at, &mut prev0); |
| if !is_all_zeroes128(c) { |
| if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { |
| return Some(m); |
| } |
| } |
| } |
| None |
| } |
| |
| #[inline(always)] |
| unsafe fn candidate( |
| &self, |
| haystack: &[u8], |
| at: usize, |
| prev0: &mut __m128i, |
| ) -> __m128i { |
| debug_assert!(haystack[at..].len() >= 16); |
| |
| let chunk = loadu128(haystack, at); |
| let (res0, res1) = members2m128(chunk, self.mask1, self.mask2); |
| let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15); |
| _mm_and_si128(res0prev0, res1) |
| } |
| } |
| |
| #[derive(Clone, Debug)] |
| pub struct TeddySlim2Mask256 { |
| pub mask1: Mask256, |
| pub mask2: Mask256, |
| } |
| |
| impl TeddySlim2Mask256 { |
| #[target_feature(enable = "avx2")] |
| unsafe fn find_at( |
| &self, |
| pats: &Patterns, |
| teddy: &Teddy, |
| haystack: &[u8], |
| mut at: usize, |
| ) -> Option<Match> { |
| debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
| // This assert helps eliminate bounds checks for bucket lookups in |
| // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
| assert_eq!(8, teddy.buckets.len()); |
| |
| at += 1; |
| let len = haystack.len(); |
| let mut prev0 = ones256(); |
| while at <= len - 32 { |
| let c = self.candidate(haystack, at, &mut prev0); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) { |
| return Some(m); |
| } |
| } |
| at += 32; |
| } |
| if at < len { |
| at = len - 32; |
| prev0 = ones256(); |
| |
| let c = self.candidate(haystack, at, &mut prev0); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) { |
| return Some(m); |
| } |
| } |
| } |
| None |
| } |
| |
| #[inline(always)] |
| unsafe fn candidate( |
| &self, |
| haystack: &[u8], |
| at: usize, |
| prev0: &mut __m256i, |
| ) -> __m256i { |
| debug_assert!(haystack[at..].len() >= 32); |
| |
| let chunk = loadu256(haystack, at); |
| let (res0, res1) = members2m256(chunk, self.mask1, self.mask2); |
| let res0prev0 = alignr256_15(res0, *prev0); |
| let res = _mm256_and_si256(res0prev0, res1); |
| *prev0 = res0; |
| res |
| } |
| } |
| |
| #[derive(Clone, Debug)] |
| pub struct TeddyFat2Mask256 { |
| pub mask1: Mask256, |
| pub mask2: Mask256, |
| } |
| |
| impl TeddyFat2Mask256 { |
| #[target_feature(enable = "avx2")] |
| unsafe fn find_at( |
| &self, |
| pats: &Patterns, |
| teddy: &Teddy, |
| haystack: &[u8], |
| mut at: usize, |
| ) -> Option<Match> { |
| debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
| // This assert helps eliminate bounds checks for bucket lookups in |
| // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
| assert_eq!(16, teddy.buckets.len()); |
| |
| at += 1; |
| let len = haystack.len(); |
| let mut prev0 = ones256(); |
| while at <= len - 16 { |
| let c = self.candidate(haystack, at, &mut prev0); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c) |
| { |
| return Some(m); |
| } |
| } |
| at += 16; |
| } |
| if at < len { |
| at = len - 16; |
| prev0 = ones256(); |
| |
| let c = self.candidate(haystack, at, &mut prev0); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c) |
| { |
| return Some(m); |
| } |
| } |
| } |
| None |
| } |
| |
| #[inline(always)] |
| unsafe fn candidate( |
| &self, |
| haystack: &[u8], |
| at: usize, |
| prev0: &mut __m256i, |
| ) -> __m256i { |
| debug_assert!(haystack[at..].len() >= 16); |
| |
| let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at)); |
| let (res0, res1) = members2m256(chunk, self.mask1, self.mask2); |
| let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15); |
| let res = _mm256_and_si256(res0prev0, res1); |
| *prev0 = res0; |
| res |
| } |
| } |
| |
| #[derive(Clone, Debug)] |
| pub struct TeddySlim3Mask128 { |
| pub mask1: Mask128, |
| pub mask2: Mask128, |
| pub mask3: Mask128, |
| } |
| |
| impl TeddySlim3Mask128 { |
| #[target_feature(enable = "ssse3")] |
| unsafe fn find_at( |
| &self, |
| pats: &Patterns, |
| teddy: &Teddy, |
| haystack: &[u8], |
| mut at: usize, |
| ) -> Option<Match> { |
| debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
| // This assert helps eliminate bounds checks for bucket lookups in |
| // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
| assert_eq!(8, teddy.buckets.len()); |
| |
| at += 2; |
| let len = haystack.len(); |
| let (mut prev0, mut prev1) = (ones128(), ones128()); |
| while at <= len - 16 { |
| let c = self.candidate(haystack, at, &mut prev0, &mut prev1); |
| if !is_all_zeroes128(c) { |
| if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) { |
| return Some(m); |
| } |
| } |
| at += 16; |
| } |
| if at < len { |
| at = len - 16; |
| prev0 = ones128(); |
| prev1 = ones128(); |
| |
| let c = self.candidate(haystack, at, &mut prev0, &mut prev1); |
| if !is_all_zeroes128(c) { |
| if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) { |
| return Some(m); |
| } |
| } |
| } |
| None |
| } |
| |
| #[inline(always)] |
| unsafe fn candidate( |
| &self, |
| haystack: &[u8], |
| at: usize, |
| prev0: &mut __m128i, |
| prev1: &mut __m128i, |
| ) -> __m128i { |
| debug_assert!(haystack[at..].len() >= 16); |
| |
| let chunk = loadu128(haystack, at); |
| let (res0, res1, res2) = |
| members3m128(chunk, self.mask1, self.mask2, self.mask3); |
| let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14); |
| let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15); |
| let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2); |
| *prev0 = res0; |
| *prev1 = res1; |
| res |
| } |
| } |
| |
| #[derive(Clone, Debug)] |
| pub struct TeddySlim3Mask256 { |
| pub mask1: Mask256, |
| pub mask2: Mask256, |
| pub mask3: Mask256, |
| } |
| |
| impl TeddySlim3Mask256 { |
| #[target_feature(enable = "avx2")] |
| unsafe fn find_at( |
| &self, |
| pats: &Patterns, |
| teddy: &Teddy, |
| haystack: &[u8], |
| mut at: usize, |
| ) -> Option<Match> { |
| debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
| // This assert helps eliminate bounds checks for bucket lookups in |
| // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
| assert_eq!(8, teddy.buckets.len()); |
| |
| at += 2; |
| let len = haystack.len(); |
| let (mut prev0, mut prev1) = (ones256(), ones256()); |
| while at <= len - 32 { |
| let c = self.candidate(haystack, at, &mut prev0, &mut prev1); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) { |
| return Some(m); |
| } |
| } |
| at += 32; |
| } |
| if at < len { |
| at = len - 32; |
| prev0 = ones256(); |
| prev1 = ones256(); |
| |
| let c = self.candidate(haystack, at, &mut prev0, &mut prev1); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) { |
| return Some(m); |
| } |
| } |
| } |
| None |
| } |
| |
| #[inline(always)] |
| unsafe fn candidate( |
| &self, |
| haystack: &[u8], |
| at: usize, |
| prev0: &mut __m256i, |
| prev1: &mut __m256i, |
| ) -> __m256i { |
| debug_assert!(haystack[at..].len() >= 32); |
| |
| let chunk = loadu256(haystack, at); |
| let (res0, res1, res2) = |
| members3m256(chunk, self.mask1, self.mask2, self.mask3); |
| let res0prev0 = alignr256_14(res0, *prev0); |
| let res1prev1 = alignr256_15(res1, *prev1); |
| let res = |
| _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2); |
| *prev0 = res0; |
| *prev1 = res1; |
| res |
| } |
| } |
| |
| #[derive(Clone, Debug)] |
| pub struct TeddyFat3Mask256 { |
| pub mask1: Mask256, |
| pub mask2: Mask256, |
| pub mask3: Mask256, |
| } |
| |
| impl TeddyFat3Mask256 { |
| #[target_feature(enable = "avx2")] |
| unsafe fn find_at( |
| &self, |
| pats: &Patterns, |
| teddy: &Teddy, |
| haystack: &[u8], |
| mut at: usize, |
| ) -> Option<Match> { |
| debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
| // This assert helps eliminate bounds checks for bucket lookups in |
| // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
| assert_eq!(16, teddy.buckets.len()); |
| |
| at += 2; |
| let len = haystack.len(); |
| let (mut prev0, mut prev1) = (ones256(), ones256()); |
| while at <= len - 16 { |
| let c = self.candidate(haystack, at, &mut prev0, &mut prev1); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c) |
| { |
| return Some(m); |
| } |
| } |
| at += 16; |
| } |
| if at < len { |
| at = len - 16; |
| prev0 = ones256(); |
| prev1 = ones256(); |
| |
| let c = self.candidate(haystack, at, &mut prev0, &mut prev1); |
| if !is_all_zeroes256(c) { |
| if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c) |
| { |
| return Some(m); |
| } |
| } |
| } |
| None |
| } |
| |
| #[inline(always)] |
| unsafe fn candidate( |
| &self, |
| haystack: &[u8], |
| at: usize, |
| prev0: &mut __m256i, |
| prev1: &mut __m256i, |
| ) -> __m256i { |
| debug_assert!(haystack[at..].len() >= 16); |
| |
| let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at)); |
| let (res0, res1, res2) = |
| members3m256(chunk, self.mask1, self.mask2, self.mask3); |
| let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14); |
| let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15); |
| let res = |
| _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2); |
| *prev0 = res0; |
| *prev1 = res1; |
| res |
| } |
| } |
| |
| /// A 128-bit mask for the low and high nybbles in a set of patterns. Each |
| /// lane `j` corresponds to a bitset where the `i`th bit is set if and only if |
| /// the nybble `j` is in the bucket `i` at a particular position. |
| #[derive(Clone, Copy, Debug)] |
| pub struct Mask128 { |
| lo: __m128i, |
| hi: __m128i, |
| } |
| |
| impl Mask128 { |
| /// Create a new SIMD mask from the mask produced by the Teddy builder. |
| pub fn new(mask: compile::Mask) -> Mask128 { |
| // SAFETY: This is safe since [u8; 16] has the same representation |
| // as __m128i. |
| unsafe { |
| Mask128 { |
| lo: mem::transmute(mask.lo128()), |
| hi: mem::transmute(mask.hi128()), |
| } |
| } |
| } |
| } |
| |
| /// A 256-bit mask for the low and high nybbles in a set of patterns. Each |
| /// lane `j` corresponds to a bitset where the `i`th bit is set if and only if |
| /// the nybble `j` is in the bucket `i` at a particular position. |
| /// |
| /// This is slightly tweaked dependending on whether Slim or Fat Teddy is being |
| /// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as |
| /// the bitsets in the higher 128-bits, so that we can search 32 bytes at a |
| /// time. (Remember, the nybbles in the haystack are used as indices into these |
| /// masks, and 256-bit shuffles only operate on 128-bit lanes.) |
| /// |
| /// For Fat Teddy, the bitsets are not repeated, but instead, the high 128 |
| /// bits correspond to buckets 8-15. So that a bitset `00100010` has buckets |
| /// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set |
| /// if it's in the higher 128 bits. |
| #[derive(Clone, Copy, Debug)] |
| pub struct Mask256 { |
| lo: __m256i, |
| hi: __m256i, |
| } |
| |
| impl Mask256 { |
| /// Create a new SIMD mask from the mask produced by the Teddy builder. |
| pub fn new(mask: compile::Mask) -> Mask256 { |
| // SAFETY: This is safe since [u8; 32] has the same representation |
| // as __m256i. |
| unsafe { |
| Mask256 { |
| lo: mem::transmute(mask.lo256()), |
| hi: mem::transmute(mask.hi256()), |
| } |
| } |
| } |
| } |
| |
| // The "members" routines below are responsible for taking a chunk of bytes, |
| // a number of nybble masks and returning the result of using the masks to |
| // lookup bytes in the chunk. The results of the high and low nybble masks are |
| // AND'ed together, such that each candidate returned is a vector, with byte |
| // sized lanes, and where each lane is an 8-bit bitset corresponding to the |
| // buckets that contain the corresponding byte. |
| // |
| // In the case of masks of length greater than 1, callers will need to keep |
| // the results from the previous haystack's window, and then shift the vectors |
| // so that they all line up. Then they can be AND'ed together. |
| |
| /// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a |
| /// 16-byte window of the haystack (where the least significant byte |
| /// corresponds to the start of the window), and `mask1` corresponds to a |
| /// low/high mask for the first byte of all patterns that are being searched. |
| #[target_feature(enable = "ssse3")] |
| unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i { |
| let lomask = _mm_set1_epi8(0xF); |
| let hlo = _mm_and_si128(chunk, lomask); |
| let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); |
| _mm_and_si128( |
| _mm_shuffle_epi8(mask1.lo, hlo), |
| _mm_shuffle_epi8(mask1.hi, hhi), |
| ) |
| } |
| |
| /// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a |
| /// 32-byte window of the haystack (where the least significant byte |
| /// corresponds to the start of the window), and `mask1` corresponds to a |
| /// low/high mask for the first byte of all patterns that are being searched. |
| /// |
| /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
| /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
| /// window in the haystack. |
| #[target_feature(enable = "avx2")] |
| unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i { |
| let lomask = _mm256_set1_epi8(0xF); |
| let hlo = _mm256_and_si256(chunk, lomask); |
| let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); |
| _mm256_and_si256( |
| _mm256_shuffle_epi8(mask1.lo, hlo), |
| _mm256_shuffle_epi8(mask1.hi, hhi), |
| ) |
| } |
| |
| /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
| /// to a 16-byte window of the haystack (where the least significant byte |
| /// corresponds to the start of the window), and the masks correspond to a |
| /// low/high mask for the first and second bytes of all patterns that are being |
| /// searched. The vectors returned correspond to candidates for the first and |
| /// second bytes in the patterns represented by the masks. |
| #[target_feature(enable = "ssse3")] |
| unsafe fn members2m128( |
| chunk: __m128i, |
| mask1: Mask128, |
| mask2: Mask128, |
| ) -> (__m128i, __m128i) { |
| let lomask = _mm_set1_epi8(0xF); |
| let hlo = _mm_and_si128(chunk, lomask); |
| let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); |
| let res0 = _mm_and_si128( |
| _mm_shuffle_epi8(mask1.lo, hlo), |
| _mm_shuffle_epi8(mask1.hi, hhi), |
| ); |
| let res1 = _mm_and_si128( |
| _mm_shuffle_epi8(mask2.lo, hlo), |
| _mm_shuffle_epi8(mask2.hi, hhi), |
| ); |
| (res0, res1) |
| } |
| |
| /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
| /// to a 32-byte window of the haystack (where the least significant byte |
| /// corresponds to the start of the window), and the masks correspond to a |
| /// low/high mask for the first and second bytes of all patterns that are being |
| /// searched. The vectors returned correspond to candidates for the first and |
| /// second bytes in the patterns represented by the masks. |
| /// |
| /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
| /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
| /// window in the haystack. |
| #[target_feature(enable = "avx2")] |
| unsafe fn members2m256( |
| chunk: __m256i, |
| mask1: Mask256, |
| mask2: Mask256, |
| ) -> (__m256i, __m256i) { |
| let lomask = _mm256_set1_epi8(0xF); |
| let hlo = _mm256_and_si256(chunk, lomask); |
| let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); |
| let res0 = _mm256_and_si256( |
| _mm256_shuffle_epi8(mask1.lo, hlo), |
| _mm256_shuffle_epi8(mask1.hi, hhi), |
| ); |
| let res1 = _mm256_and_si256( |
| _mm256_shuffle_epi8(mask2.lo, hlo), |
| _mm256_shuffle_epi8(mask2.hi, hhi), |
| ); |
| (res0, res1) |
| } |
| |
| /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
| /// to a 16-byte window of the haystack (where the least significant byte |
| /// corresponds to the start of the window), and the masks correspond to a |
| /// low/high mask for the first, second and third bytes of all patterns that |
| /// are being searched. The vectors returned correspond to candidates for the |
| /// first, second and third bytes in the patterns represented by the masks. |
| #[target_feature(enable = "ssse3")] |
| unsafe fn members3m128( |
| chunk: __m128i, |
| mask1: Mask128, |
| mask2: Mask128, |
| mask3: Mask128, |
| ) -> (__m128i, __m128i, __m128i) { |
| let lomask = _mm_set1_epi8(0xF); |
| let hlo = _mm_and_si128(chunk, lomask); |
| let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); |
| let res0 = _mm_and_si128( |
| _mm_shuffle_epi8(mask1.lo, hlo), |
| _mm_shuffle_epi8(mask1.hi, hhi), |
| ); |
| let res1 = _mm_and_si128( |
| _mm_shuffle_epi8(mask2.lo, hlo), |
| _mm_shuffle_epi8(mask2.hi, hhi), |
| ); |
| let res2 = _mm_and_si128( |
| _mm_shuffle_epi8(mask3.lo, hlo), |
| _mm_shuffle_epi8(mask3.hi, hhi), |
| ); |
| (res0, res1, res2) |
| } |
| |
| /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
| /// to a 32-byte window of the haystack (where the least significant byte |
| /// corresponds to the start of the window), and the masks correspond to a |
| /// low/high mask for the first, second and third bytes of all patterns that |
| /// are being searched. The vectors returned correspond to candidates for the |
| /// first, second and third bytes in the patterns represented by the masks. |
| /// |
| /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
| /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
| /// window in the haystack. |
| #[target_feature(enable = "avx2")] |
| unsafe fn members3m256( |
| chunk: __m256i, |
| mask1: Mask256, |
| mask2: Mask256, |
| mask3: Mask256, |
| ) -> (__m256i, __m256i, __m256i) { |
| let lomask = _mm256_set1_epi8(0xF); |
| let hlo = _mm256_and_si256(chunk, lomask); |
| let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); |
| let res0 = _mm256_and_si256( |
| _mm256_shuffle_epi8(mask1.lo, hlo), |
| _mm256_shuffle_epi8(mask1.hi, hhi), |
| ); |
| let res1 = _mm256_and_si256( |
| _mm256_shuffle_epi8(mask2.lo, hlo), |
| _mm256_shuffle_epi8(mask2.hi, hhi), |
| ); |
| let res2 = _mm256_and_si256( |
| _mm256_shuffle_epi8(mask3.lo, hlo), |
| _mm256_shuffle_epi8(mask3.hi, hhi), |
| ); |
| (res0, res1, res2) |
| } |