vendor/regex-0.2.11/src/literal/teddy_avx2/imp.rs - toolchain/rustc - Git at Google

 /*!
 This is the Teddy searcher, but ported to AVX2.

 See the module comments in the SSSE3 Teddy searcher for a more in depth
 explanation of how this algorithm works. For the most part, this port is
 basically the same as the SSSE3 version, but using 256-bit vectors instead of
 128-bit vectors, which increases throughput.
 */

 use std::cmp;

 use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
 use syntax::hir::literal::Literals;

 use vector::avx2::{AVX2VectorBuilder, u8x32};

 /// Corresponds to the number of bytes read at a time in the haystack.
 const BLOCK_SIZE: usize = 32;

 /// Match reports match information.
 #[derive(Debug, Clone)]
 pub struct Match {
     /// The index of the pattern that matched. The index is in correspondence
     /// with the order of the patterns given at construction.
     pub pat: usize,
     /// The start byte offset of the match.
     pub start: usize,
     /// The end byte offset of the match. This is always `start + pat.len()`.
     pub end: usize,
 }

 /// A SIMD accelerated multi substring searcher.
 #[derive(Debug, Clone)]
 pub struct Teddy {
     /// A builder for AVX2 empowered vectors.
     vb: AVX2VectorBuilder,
     /// A list of substrings to match.
     pats: Vec<Vec<u8>>,
     /// An Aho-Corasick automaton of the patterns. We use this when we need to
     /// search pieces smaller than the Teddy block size.
     ac: FullAcAutomaton<Vec<u8>>,
     /// A set of 8 buckets. Each bucket corresponds to a single member of a
     /// bitset. A bucket contains zero or more substrings. This is useful
     /// when the number of substrings exceeds 8, since our bitsets cannot have
     /// more than 8 members.
     buckets: Vec<Vec<usize>>,
     /// Our set of masks. There's one mask for each byte in the fingerprint.
     masks: Masks,
 }

 impl Teddy {
     /// Returns true if and only if Teddy is supported on this platform.
     ///
     /// If this returns `false`, then `Teddy::new(...)` is guaranteed to
     /// return `None`.
     pub fn available() -> bool {
         AVX2VectorBuilder::new().is_some()
     }

     /// Create a new `Teddy` multi substring matcher.
     ///
     /// If a `Teddy` matcher could not be created (e.g., `pats` is empty or has
     /// an empty substring), then `None` is returned.
     pub fn new(pats: &Literals) -> Option<Teddy> {
         let vb = match AVX2VectorBuilder::new() {
             None => return None,
             Some(vb) => vb,
         };
         if !Teddy::available() {
             return None;
         }

         let pats: Vec<_> = pats.literals().iter().map(|p|p.to_vec()).collect();
         let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0);
         // Don't allow any empty patterns and require that we have at
         // least one pattern.
         if min_len < 1 {
             return None;
         }
         // Pick the largest mask possible, but no larger than 3.
         let nmasks = cmp::min(3, min_len);
         let mut masks = Masks::new(vb, nmasks);
         let mut buckets = vec![vec![]; 8];
         // Assign a substring to each bucket, and add the bucket's bitfield to
         // the appropriate position in the mask.
         for (pati, pat) in pats.iter().enumerate() {
             let bucket = pati % 8;
             buckets[bucket].push(pati);
             masks.add(bucket as u8, pat);
         }
         Some(Teddy {
             vb: vb,
             pats: pats.to_vec(),
             ac: AcAutomaton::new(pats.to_vec()).into_full(),
             buckets: buckets,
             masks: masks,
         })
     }

     /// Returns all of the substrings matched by this `Teddy`.
     pub fn patterns(&self) -> &[Vec<u8>] {
         &self.pats
     }

     /// Returns the number of substrings in this matcher.
     pub fn len(&self) -> usize {
         self.pats.len()
     }

     /// Returns the approximate size on the heap used by this matcher.
     pub fn approximate_size(&self) -> usize {
         self.pats.iter().fold(0, |a, b| a + b.len())
     }

     /// Searches `haystack` for the substrings in this `Teddy`. If a match was
     /// found, then it is returned. Otherwise, `None` is returned.
     pub fn find(&self, haystack: &[u8]) -> Option<Match> {
         // This is safe because the only way we can construct a Teddy type
         // is if AVX2 is available.
         unsafe { self.find_impl(haystack) }
     }

     #[allow(unused_attributes)]
     #[target_feature(enable = "avx2")]
     unsafe fn find_impl(&self, haystack: &[u8]) -> Option<Match> {
         // If our haystack is smaller than the block size, then fall back to
         // a naive brute force search.
         if haystack.is_empty() || haystack.len() < (BLOCK_SIZE + 2) {
             return self.slow(haystack, 0);
         }
         match self.masks.len() {
             0 => None,
             1 => self.find1(haystack),
             2 => self.find2(haystack),
             3 => self.find3(haystack),
             _ => unreachable!(),
         }
     }

     /// `find1` is used when there is only 1 mask. This is the easy case and is
     /// pretty much as described in the module documentation.
     #[inline(always)]
     fn find1(&self, haystack: &[u8]) -> Option<Match> {
         let mut pos = 0;
         let zero = self.vb.u8x32_splat(0);
         let len = haystack.len();
         debug_assert!(len >= BLOCK_SIZE);
         while pos <= len - BLOCK_SIZE {
             let h = unsafe {
                 // I tried and failed to eliminate bounds checks in safe code.
                 // This is safe because of our loop invariant: pos is always
                 // <= len-32.
                 let p = haystack.get_unchecked(pos..);
                 self.vb.u8x32_load_unchecked_unaligned(p)
             };
             // N.B. `res0` is our `C` in the module documentation.
             let res0 = self.masks.members1(h);
             // Only do expensive verification if there are any non-zero bits.
             let bitfield = res0.ne(zero).movemask();
             if bitfield != 0 {
                 if let Some(m) = self.verify(haystack, pos, res0, bitfield) {
                     return Some(m);
                 }
             }
             pos += BLOCK_SIZE;
         }
         self.slow(haystack, pos)
     }

     /// `find2` is used when there are 2 masks, e.g., the fingerprint is 2 bytes
     /// long.
     #[inline(always)]
     fn find2(&self, haystack: &[u8]) -> Option<Match> {
         // This is an exotic way to right shift a SIMD vector across lanes.
         // See below at use for more details.
         let zero = self.vb.u8x32_splat(0);
         let len = haystack.len();
         // The previous value of `C` (from the module documentation) for the
         // *first* byte in the fingerprint. On subsequent iterations, we take
         // the last bitset from the previous `C` and insert it into the first
         // position of the current `C`, shifting all other bitsets to the right
         // one lane. This causes `C` for the first byte to line up with `C` for
         // the second byte, so that they can be `AND`'d together.
         let mut prev0 = self.vb.u8x32_splat(0xFF);
         let mut pos = 1;
         debug_assert!(len >= BLOCK_SIZE);
         while pos <= len - BLOCK_SIZE {
             let h = unsafe {
                 // I tried and failed to eliminate bounds checks in safe code.
                 // This is safe because of our loop invariant: pos is always
                 // <= len-32.
                 let p = haystack.get_unchecked(pos..);
                 self.vb.u8x32_load_unchecked_unaligned(p)
             };
             let (res0, res1) = self.masks.members2(h);

             // Do this:
             //
             //     (prev0 << 15) | (res0 >> 1)
             //
             // This lets us line up our C values for each byte.
             let res0prev0 = res0.alignr_15(prev0);

             // `AND`'s our `C` values together.
             let res = res0prev0.and(res1);
             prev0 = res0;

             let bitfield = res.ne(zero).movemask();
             if bitfield != 0 {
                 let pos = pos.checked_sub(1).unwrap();
                 if let Some(m) = self.verify(haystack, pos, res, bitfield) {
                     return Some(m);
                 }
             }
             pos += BLOCK_SIZE;
         }
         // The windowing above doesn't check the last byte in the last
         // window, so start the slow search at the last byte of the last
         // window.
         self.slow(haystack, pos.checked_sub(1).unwrap())
     }

     /// `find3` is used when there are 3 masks, e.g., the fingerprint is 3 bytes
     /// long.
     ///
     /// N.B. This is a straight-forward extrapolation of `find2`. The only
     /// difference is that we need to keep track of two previous values of `C`,
     /// since we now need to align for three bytes.
     #[inline(always)]
     fn find3(&self, haystack: &[u8]) -> Option<Match> {
         let zero = self.vb.u8x32_splat(0);
         let len = haystack.len();
         let mut prev0 = self.vb.u8x32_splat(0xFF);
         let mut prev1 = self.vb.u8x32_splat(0xFF);
         let mut pos = 2;

         while pos <= len - BLOCK_SIZE {
             let h = unsafe {
                 // I tried and failed to eliminate bounds checks in safe code.
                 // This is safe because of our loop invariant: pos is always
                 // <= len-32.
                 let p = haystack.get_unchecked(pos..);
                 self.vb.u8x32_load_unchecked_unaligned(p)
             };
             let (res0, res1, res2) = self.masks.members3(h);

             let res0prev0 = res0.alignr_14(prev0);
             let res1prev1 = res1.alignr_15(prev1);
             let res = res0prev0.and(res1prev1).and(res2);

             prev0 = res0;
             prev1 = res1;

             let bitfield = res.ne(zero).movemask();
             if bitfield != 0 {
                 let pos = pos.checked_sub(2).unwrap();
                 if let Some(m) = self.verify(haystack, pos, res, bitfield) {
                     return Some(m);
                 }
             }
             pos += BLOCK_SIZE;
         }
         // The windowing above doesn't check the last two bytes in the last
         // window, so start the slow search at the penultimate byte of the
         // last window.
         // self.slow(haystack, pos.saturating_sub(2))
         self.slow(haystack, pos.checked_sub(2).unwrap())
     }

     /// Runs the verification procedure on `res` (i.e., `C` from the module
     /// documentation), where the haystack block starts at `pos` in
     /// `haystack`. `bitfield` has ones in the bit positions that `res` has
     /// non-zero bytes.
     ///
     /// If a match exists, it returns the first one.
     #[inline(always)]
     fn verify(
         &self,
         haystack: &[u8],
         pos: usize,
         res: u8x32,
         mut bitfield: u32,
     ) -> Option<Match> {
         while bitfield != 0 {
             // The next offset, relative to pos, where some fingerprint
             // matched.
             let byte_pos = bitfield.trailing_zeros() as usize;
             bitfield &= !(1 << byte_pos);

             // Offset relative to the beginning of the haystack.
             let start = pos + byte_pos;

             // The bitfield telling us which patterns had fingerprints that
             // match at this starting position.
             let mut patterns = res.extract(byte_pos);
             while patterns != 0 {
                 let bucket = patterns.trailing_zeros() as usize;
                 patterns &= !(1 << bucket);

                 // Actual substring search verification.
                 if let Some(m) = self.verify_bucket(haystack, bucket, start) {
                     return Some(m);
                 }
             }
         }

         None
     }

     /// Verifies whether any substring in the given bucket matches in haystack
     /// at the given starting position.
     #[inline(always)]
     fn verify_bucket(
         &self,
         haystack: &[u8],
         bucket: usize,
         start: usize,
     ) -> Option<Match> {
         // This cycles through the patterns in the bucket in the order that
         // the patterns were given. Therefore, we guarantee leftmost-first
         // semantics.
         for &pati in &self.buckets[bucket] {
             let pat = &*self.pats[pati];
             if start + pat.len() > haystack.len() {
                 continue;
             }
             if pat == &haystack[start..start + pat.len()] {
                 return Some(Match {
                     pat: pati,
                     start: start,
                     end: start + pat.len(),
                 });
             }
         }
         None
     }

     /// Slow substring search through all patterns in this matcher.
     ///
     /// This is used when we don't have enough bytes in the haystack for our
     /// block based approach.
     #[inline(never)]
     fn slow(&self, haystack: &[u8], pos: usize) -> Option<Match> {
         self.ac.find(&haystack[pos..]).next().map(|m| {
             Match {
                 pat: m.pati,
                 start: pos + m.start,
                 end: pos + m.end,
             }
         })
     }
 }

 /// A list of masks. This has length equal to the length of the fingerprint.
 /// The length of the fingerprint is always `min(3, len(smallest_substring))`.
 #[derive(Debug, Clone)]
 struct Masks {
     vb: AVX2VectorBuilder,
     masks: [Mask; 3],
     size: usize,
 }

 impl Masks {
     /// Create a new set of masks of size `n`, where `n` corresponds to the
     /// number of bytes in a fingerprint.
     fn new(vb: AVX2VectorBuilder, n: usize) -> Masks {
         Masks {
             vb: vb,
             masks: [Mask::new(vb), Mask::new(vb), Mask::new(vb)],
             size: n,
         }
     }

     /// Returns the number of masks.
     fn len(&self) -> usize {
         self.size
     }

     /// Adds the given pattern to the given bucket. The bucket should be a
     /// power of `2 <= 2^7`.
     fn add(&mut self, bucket: u8, pat: &[u8]) {
         for i in 0..self.len() {
             self.masks[i].add(bucket, pat[i]);
         }
     }

     /// Finds the fingerprints that are in the given haystack block. i.e., this
     /// returns `C` as described in the module documentation.
     ///
     /// More specifically, `for i in 0..16` and `j in 0..8, C[i][j] == 1` if and
     /// only if `haystack_block[i]` corresponds to a fingerprint that is part
     /// of a pattern in bucket `j`.
     #[inline(always)]
     fn members1(&self, haystack_block: u8x32) -> u8x32 {
         let masklo = self.vb.u8x32_splat(0xF);
         let hlo = haystack_block.and(masklo);
         let hhi = haystack_block.bit_shift_right_4().and(masklo);

         self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi))
     }

     /// Like members1, but computes C for the first and second bytes in the
     /// fingerprint.
     #[inline(always)]
     fn members2(&self, haystack_block: u8x32) -> (u8x32, u8x32) {
         let masklo = self.vb.u8x32_splat(0xF);
         let hlo = haystack_block.and(masklo);
         let hhi = haystack_block.bit_shift_right_4().and(masklo);

         let res0 =
             self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi));
         let res1 =
             self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi));
         (res0, res1)
     }

     /// Like `members1`, but computes `C` for the first, second and third bytes
     /// in the fingerprint.
     #[inline(always)]
     fn members3(&self, haystack_block: u8x32) -> (u8x32, u8x32, u8x32) {
         let masklo = self.vb.u8x32_splat(0xF);
         let hlo = haystack_block.and(masklo);
         let hhi = haystack_block.bit_shift_right_4().and(masklo);

         let res0 =
             self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi));
         let res1 =
             self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi));
         let res2 =
             self.masks[2].lo.shuffle(hlo).and(self.masks[2].hi.shuffle(hhi));
         (res0, res1, res2)
     }
 }

 /// A single mask.
 #[derive(Debug, Clone, Copy)]
 struct Mask {
     /// Bitsets for the low nybbles in a fingerprint.
     lo: u8x32,
     /// Bitsets for the high nybbles in a fingerprint.
     hi: u8x32,
 }

 impl Mask {
     /// Create a new mask with no members.
     fn new(vb: AVX2VectorBuilder) -> Mask {
         Mask {
             lo: vb.u8x32_splat(0),
             hi: vb.u8x32_splat(0),
         }
     }

     /// Adds the given byte to the given bucket.
     fn add(&mut self, bucket: u8, byte: u8) {
         // Split our byte into two nybbles, and add each nybble to our
         // mask.
         let byte_lo = (byte & 0xF) as usize;
         let byte_hi = (byte >> 4) as usize;

         let lo = self.lo.extract(byte_lo) | ((1 << bucket) as u8);
         self.lo.replace(byte_lo, lo);
         self.lo.replace(byte_lo + 16, lo);

         let hi = self.hi.extract(byte_hi) | ((1 << bucket) as u8);
         self.hi.replace(byte_hi, hi);
         self.hi.replace(byte_hi + 16, hi);
     }
 }
	/*!
	This is the Teddy searcher, but ported to AVX2.

	See the module comments in the SSSE3 Teddy searcher for a more in depth
	explanation of how this algorithm works. For the most part, this port is
	basically the same as the SSSE3 version, but using 256-bit vectors instead of
	128-bit vectors, which increases throughput.
	*/

	use std::cmp;

	use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
	use syntax::hir::literal::Literals;

	use vector::avx2::{AVX2VectorBuilder, u8x32};

	/// Corresponds to the number of bytes read at a time in the haystack.
	const BLOCK_SIZE: usize = 32;

	/// Match reports match information.
	#[derive(Debug, Clone)]
	pub struct Match {
	/// The index of the pattern that matched. The index is in correspondence
	/// with the order of the patterns given at construction.
	pub pat: usize,
	/// The start byte offset of the match.
	pub start: usize,
	/// The end byte offset of the match. This is always `start + pat.len()`.
	pub end: usize,
	}

	/// A SIMD accelerated multi substring searcher.
	#[derive(Debug, Clone)]
	pub struct Teddy {
	/// A builder for AVX2 empowered vectors.
	vb: AVX2VectorBuilder,
	/// A list of substrings to match.
	pats: Vec<Vec<u8>>,
	/// An Aho-Corasick automaton of the patterns. We use this when we need to
	/// search pieces smaller than the Teddy block size.
	ac: FullAcAutomaton<Vec<u8>>,
	/// A set of 8 buckets. Each bucket corresponds to a single member of a
	/// bitset. A bucket contains zero or more substrings. This is useful
	/// when the number of substrings exceeds 8, since our bitsets cannot have
	/// more than 8 members.
	buckets: Vec<Vec<usize>>,
	/// Our set of masks. There's one mask for each byte in the fingerprint.
	masks: Masks,
	}

	impl Teddy {
	/// Returns true if and only if Teddy is supported on this platform.
	///
	/// If this returns `false`, then `Teddy::new(...)` is guaranteed to
	/// return `None`.
	pub fn available() -> bool {
	AVX2VectorBuilder::new().is_some()
	}

	/// Create a new `Teddy` multi substring matcher.
	///
	/// If a `Teddy` matcher could not be created (e.g., `pats` is empty or has
	/// an empty substring), then `None` is returned.
	pub fn new(pats: &Literals) -> Option<Teddy> {
	let vb = match AVX2VectorBuilder::new() {
	None => return None,
	Some(vb) => vb,
	};
	if !Teddy::available() {
	return None;
	}

	let pats: Vec<_> = pats.literals().iter().map(\|p\|p.to_vec()).collect();
	let min_len = pats.iter().map(\|p\| p.len()).min().unwrap_or(0);
	// Don't allow any empty patterns and require that we have at
	// least one pattern.
	if min_len < 1 {
	return None;
	}
	// Pick the largest mask possible, but no larger than 3.
	let nmasks = cmp::min(3, min_len);
	let mut masks = Masks::new(vb, nmasks);
	let mut buckets = vec![vec![]; 8];
	// Assign a substring to each bucket, and add the bucket's bitfield to
	// the appropriate position in the mask.
	for (pati, pat) in pats.iter().enumerate() {
	let bucket = pati % 8;
	buckets[bucket].push(pati);
	masks.add(bucket as u8, pat);
	}
	Some(Teddy {
	vb: vb,
	pats: pats.to_vec(),
	ac: AcAutomaton::new(pats.to_vec()).into_full(),
	buckets: buckets,
	masks: masks,
	})
	}

	/// Returns all of the substrings matched by this `Teddy`.
	pub fn patterns(&self) -> &[Vec<u8>] {
	&self.pats
	}

	/// Returns the number of substrings in this matcher.
	pub fn len(&self) -> usize {
	self.pats.len()
	}

	/// Returns the approximate size on the heap used by this matcher.
	pub fn approximate_size(&self) -> usize {
	self.pats.iter().fold(0, \|a, b\| a + b.len())
	}

	/// Searches `haystack` for the substrings in this `Teddy`. If a match was
	/// found, then it is returned. Otherwise, `None` is returned.
	pub fn find(&self, haystack: &[u8]) -> Option<Match> {
	// This is safe because the only way we can construct a Teddy type
	// is if AVX2 is available.
	unsafe { self.find_impl(haystack) }
	}

	#[allow(unused_attributes)]
	#[target_feature(enable = "avx2")]
	unsafe fn find_impl(&self, haystack: &[u8]) -> Option<Match> {
	// If our haystack is smaller than the block size, then fall back to
	// a naive brute force search.
	if haystack.is_empty() \|\| haystack.len() < (BLOCK_SIZE + 2) {
	return self.slow(haystack, 0);
	}
	match self.masks.len() {
	0 => None,
	1 => self.find1(haystack),
	2 => self.find2(haystack),
	3 => self.find3(haystack),
	_ => unreachable!(),
	}
	}

	/// `find1` is used when there is only 1 mask. This is the easy case and is
	/// pretty much as described in the module documentation.
	#[inline(always)]
	fn find1(&self, haystack: &[u8]) -> Option<Match> {
	let mut pos = 0;
	let zero = self.vb.u8x32_splat(0);
	let len = haystack.len();
	debug_assert!(len >= BLOCK_SIZE);
	while pos <= len - BLOCK_SIZE {
	let h = unsafe {
	// I tried and failed to eliminate bounds checks in safe code.
	// This is safe because of our loop invariant: pos is always
	// <= len-32.
	let p = haystack.get_unchecked(pos..);
	self.vb.u8x32_load_unchecked_unaligned(p)
	};
	// N.B. `res0` is our `C` in the module documentation.
	let res0 = self.masks.members1(h);
	// Only do expensive verification if there are any non-zero bits.
	let bitfield = res0.ne(zero).movemask();
	if bitfield != 0 {
	if let Some(m) = self.verify(haystack, pos, res0, bitfield) {
	return Some(m);
	}
	}
	pos += BLOCK_SIZE;
	}
	self.slow(haystack, pos)
	}

	/// `find2` is used when there are 2 masks, e.g., the fingerprint is 2 bytes
	/// long.
	#[inline(always)]
	fn find2(&self, haystack: &[u8]) -> Option<Match> {
	// This is an exotic way to right shift a SIMD vector across lanes.
	// See below at use for more details.
	let zero = self.vb.u8x32_splat(0);
	let len = haystack.len();
	// The previous value of `C` (from the module documentation) for the
	// first byte in the fingerprint. On subsequent iterations, we take
	// the last bitset from the previous `C` and insert it into the first
	// position of the current `C`, shifting all other bitsets to the right
	// one lane. This causes `C` for the first byte to line up with `C` for
	// the second byte, so that they can be `AND`'d together.
	let mut prev0 = self.vb.u8x32_splat(0xFF);
	let mut pos = 1;
	debug_assert!(len >= BLOCK_SIZE);
	while pos <= len - BLOCK_SIZE {
	let h = unsafe {
	// I tried and failed to eliminate bounds checks in safe code.
	// This is safe because of our loop invariant: pos is always
	// <= len-32.
	let p = haystack.get_unchecked(pos..);
	self.vb.u8x32_load_unchecked_unaligned(p)
	};
	let (res0, res1) = self.masks.members2(h);

	// Do this:
	//
	// (prev0 << 15) \| (res0 >> 1)
	//
	// This lets us line up our C values for each byte.
	let res0prev0 = res0.alignr_15(prev0);

	// `AND`'s our `C` values together.
	let res = res0prev0.and(res1);
	prev0 = res0;

	let bitfield = res.ne(zero).movemask();
	if bitfield != 0 {
	let pos = pos.checked_sub(1).unwrap();
	if let Some(m) = self.verify(haystack, pos, res, bitfield) {
	return Some(m);
	}
	}
	pos += BLOCK_SIZE;
	}
	// The windowing above doesn't check the last byte in the last
	// window, so start the slow search at the last byte of the last
	// window.
	self.slow(haystack, pos.checked_sub(1).unwrap())
	}

	/// `find3` is used when there are 3 masks, e.g., the fingerprint is 3 bytes
	/// long.
	///
	/// N.B. This is a straight-forward extrapolation of `find2`. The only
	/// difference is that we need to keep track of two previous values of `C`,
	/// since we now need to align for three bytes.
	#[inline(always)]
	fn find3(&self, haystack: &[u8]) -> Option<Match> {
	let zero = self.vb.u8x32_splat(0);
	let len = haystack.len();
	let mut prev0 = self.vb.u8x32_splat(0xFF);
	let mut prev1 = self.vb.u8x32_splat(0xFF);
	let mut pos = 2;

	while pos <= len - BLOCK_SIZE {
	let h = unsafe {
	// I tried and failed to eliminate bounds checks in safe code.
	// This is safe because of our loop invariant: pos is always
	// <= len-32.
	let p = haystack.get_unchecked(pos..);
	self.vb.u8x32_load_unchecked_unaligned(p)
	};
	let (res0, res1, res2) = self.masks.members3(h);

	let res0prev0 = res0.alignr_14(prev0);
	let res1prev1 = res1.alignr_15(prev1);
	let res = res0prev0.and(res1prev1).and(res2);

	prev0 = res0;
	prev1 = res1;

	let bitfield = res.ne(zero).movemask();
	if bitfield != 0 {
	let pos = pos.checked_sub(2).unwrap();
	if let Some(m) = self.verify(haystack, pos, res, bitfield) {
	return Some(m);
	}
	}
	pos += BLOCK_SIZE;
	}
	// The windowing above doesn't check the last two bytes in the last
	// window, so start the slow search at the penultimate byte of the
	// last window.
	// self.slow(haystack, pos.saturating_sub(2))
	self.slow(haystack, pos.checked_sub(2).unwrap())
	}

	/// Runs the verification procedure on `res` (i.e., `C` from the module
	/// documentation), where the haystack block starts at `pos` in
	/// `haystack`. `bitfield` has ones in the bit positions that `res` has
	/// non-zero bytes.
	///
	/// If a match exists, it returns the first one.
	#[inline(always)]
	fn verify(
	&self,
	haystack: &[u8],
	pos: usize,
	res: u8x32,
	mut bitfield: u32,
	) -> Option<Match> {
	while bitfield != 0 {
	// The next offset, relative to pos, where some fingerprint
	// matched.
	let byte_pos = bitfield.trailing_zeros() as usize;
	bitfield &= !(1 << byte_pos);

	// Offset relative to the beginning of the haystack.
	let start = pos + byte_pos;

	// The bitfield telling us which patterns had fingerprints that
	// match at this starting position.
	let mut patterns = res.extract(byte_pos);
	while patterns != 0 {
	let bucket = patterns.trailing_zeros() as usize;
	patterns &= !(1 << bucket);

	// Actual substring search verification.
	if let Some(m) = self.verify_bucket(haystack, bucket, start) {
	return Some(m);
	}
	}
	}

	None
	}

	/// Verifies whether any substring in the given bucket matches in haystack
	/// at the given starting position.
	#[inline(always)]
	fn verify_bucket(
	&self,
	haystack: &[u8],
	bucket: usize,
	start: usize,
	) -> Option<Match> {
	// This cycles through the patterns in the bucket in the order that
	// the patterns were given. Therefore, we guarantee leftmost-first
	// semantics.
	for &pati in &self.buckets[bucket] {
	let pat = &*self.pats[pati];
	if start + pat.len() > haystack.len() {
	continue;
	}
	if pat == &haystack[start..start + pat.len()] {
	return Some(Match {
	pat: pati,
	start: start,
	end: start + pat.len(),
	});
	}
	}
	None
	}

	/// Slow substring search through all patterns in this matcher.
	///
	/// This is used when we don't have enough bytes in the haystack for our
	/// block based approach.
	#[inline(never)]
	fn slow(&self, haystack: &[u8], pos: usize) -> Option<Match> {
	self.ac.find(&haystack[pos..]).next().map(\|m\| {
	Match {
	pat: m.pati,
	start: pos + m.start,
	end: pos + m.end,
	}
	})
	}
	}

	/// A list of masks. This has length equal to the length of the fingerprint.
	/// The length of the fingerprint is always `min(3, len(smallest_substring))`.
	#[derive(Debug, Clone)]
	struct Masks {
	vb: AVX2VectorBuilder,
	masks: [Mask; 3],
	size: usize,
	}

	impl Masks {
	/// Create a new set of masks of size `n`, where `n` corresponds to the
	/// number of bytes in a fingerprint.
	fn new(vb: AVX2VectorBuilder, n: usize) -> Masks {
	Masks {
	vb: vb,
	masks: [Mask::new(vb), Mask::new(vb), Mask::new(vb)],
	size: n,
	}
	}

	/// Returns the number of masks.
	fn len(&self) -> usize {
	self.size
	}

	/// Adds the given pattern to the given bucket. The bucket should be a
	/// power of `2 <= 2^7`.
	fn add(&mut self, bucket: u8, pat: &[u8]) {
	for i in 0..self.len() {
	self.masks[i].add(bucket, pat[i]);
	}
	}

	/// Finds the fingerprints that are in the given haystack block. i.e., this
	/// returns `C` as described in the module documentation.
	///
	/// More specifically, `for i in 0..16` and `j in 0..8, C[i][j] == 1` if and
	/// only if `haystack_block[i]` corresponds to a fingerprint that is part
	/// of a pattern in bucket `j`.
	#[inline(always)]
	fn members1(&self, haystack_block: u8x32) -> u8x32 {
	let masklo = self.vb.u8x32_splat(0xF);
	let hlo = haystack_block.and(masklo);
	let hhi = haystack_block.bit_shift_right_4().and(masklo);

	self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi))
	}

	/// Like members1, but computes C for the first and second bytes in the
	/// fingerprint.
	#[inline(always)]
	fn members2(&self, haystack_block: u8x32) -> (u8x32, u8x32) {
	let masklo = self.vb.u8x32_splat(0xF);
	let hlo = haystack_block.and(masklo);
	let hhi = haystack_block.bit_shift_right_4().and(masklo);

	let res0 =
	self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi));
	let res1 =
	self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi));
	(res0, res1)
	}

	/// Like `members1`, but computes `C` for the first, second and third bytes
	/// in the fingerprint.
	#[inline(always)]
	fn members3(&self, haystack_block: u8x32) -> (u8x32, u8x32, u8x32) {
	let masklo = self.vb.u8x32_splat(0xF);
	let hlo = haystack_block.and(masklo);
	let hhi = haystack_block.bit_shift_right_4().and(masklo);

	let res0 =
	self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi));
	let res1 =
	self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi));
	let res2 =
	self.masks[2].lo.shuffle(hlo).and(self.masks[2].hi.shuffle(hhi));
	(res0, res1, res2)
	}
	}

	/// A single mask.
	#[derive(Debug, Clone, Copy)]
	struct Mask {
	/// Bitsets for the low nybbles in a fingerprint.
	lo: u8x32,
	/// Bitsets for the high nybbles in a fingerprint.
	hi: u8x32,
	}

	impl Mask {
	/// Create a new mask with no members.
	fn new(vb: AVX2VectorBuilder) -> Mask {
	Mask {
	lo: vb.u8x32_splat(0),
	hi: vb.u8x32_splat(0),
	}
	}

	/// Adds the given byte to the given bucket.
	fn add(&mut self, bucket: u8, byte: u8) {
	// Split our byte into two nybbles, and add each nybble to our
	// mask.
	let byte_lo = (byte & 0xF) as usize;
	let byte_hi = (byte >> 4) as usize;

	let lo = self.lo.extract(byte_lo) \| ((1 << bucket) as u8);
	self.lo.replace(byte_lo, lo);
	self.lo.replace(byte_lo + 16, lo);

	let hi = self.hi.extract(byte_hi) \| ((1 << bucket) as u8);
	self.hi.replace(byte_hi, hi);
	self.hi.replace(byte_hi + 16, hi);
	}
	}