| use std::mem; |
| |
| use packed::pattern::{PatternID, Patterns}; |
| use Match; |
| |
| /// The type of the rolling hash used in the Rabin-Karp algorithm. |
| type Hash = usize; |
| |
| /// The number of buckets to store our patterns in. We don't want this to be |
| /// too big in order to avoid wasting memory, but we don't want it to be too |
| /// small either to avoid spending too much time confirming literals. |
| /// |
| /// The number of buckets MUST be a power of two. Otherwise, determining the |
| /// bucket from a hash will slow down the code considerably. Using a power |
| /// of two means `hash % NUM_BUCKETS` can compile down to a simple `and` |
| /// instruction. |
| const NUM_BUCKETS: usize = 64; |
| |
| /// An implementation of the Rabin-Karp algorithm. The main idea of this |
| /// algorithm is to maintain a rolling hash as it moves through the input, and |
| /// then check whether that hash corresponds to the same hash for any of the |
| /// patterns we're looking for. |
| /// |
| /// A draw back of naively scaling Rabin-Karp to multiple patterns is that |
| /// it requires all of the patterns to be the same length, which in turn |
| /// corresponds to the number of bytes to hash. We adapt this to work for |
| /// multiple patterns of varying size by fixing the number of bytes to hash |
| /// to be the length of the smallest pattern. We also split the patterns into |
| /// several buckets to hopefully make the confirmation step faster. |
| /// |
| /// Wikipedia has a decent explanation, if a bit heavy on the theory: |
| /// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm |
| /// |
| /// But ESMAJ provides something a bit more concrete: |
| /// http://www-igm.univ-mlv.fr/~lecroq/string/node5.html |
| #[derive(Clone, Debug)] |
| pub struct RabinKarp { |
| /// The order of patterns in each bucket is significant. Namely, they are |
| /// arranged such that the first one to match is the correct match. This |
| /// may not necessarily correspond to the order provided by the caller. |
| /// For example, if leftmost-longest semantics are used, then the patterns |
| /// are sorted by their length in descending order. If leftmost-first |
| /// semantics are used, then the patterns are sorted by their pattern ID |
| /// in ascending order (which corresponds to the caller's order). |
| buckets: Vec<Vec<(Hash, PatternID)>>, |
| /// The length of the hashing window. Generally, this corresponds to the |
| /// length of the smallest pattern. |
| hash_len: usize, |
| /// The factor to subtract out of a hash before updating it with a new |
| /// byte. |
| hash_2pow: usize, |
| /// The maximum identifier of a pattern. This is used as a sanity check |
| /// to ensure that the patterns provided by the caller are the same as |
| /// the patterns that were used to compile the matcher. This sanity check |
| /// possibly permits safely eliminating bounds checks regardless of what |
| /// patterns are provided by the caller. |
| /// |
| /// (Currently, we don't use this to elide bounds checks since it doesn't |
| /// result in a measurable performance improvement, but we do use it for |
| /// better failure modes.) |
| max_pattern_id: PatternID, |
| } |
| |
| impl RabinKarp { |
| /// Compile a new Rabin-Karp matcher from the patterns given. |
| /// |
| /// This panics if any of the patterns in the collection are empty, or if |
| /// the collection is itself empty. |
| pub fn new(patterns: &Patterns) -> RabinKarp { |
| assert!(patterns.len() >= 1); |
| let hash_len = patterns.minimum_len(); |
| assert!(hash_len >= 1); |
| |
| let mut hash_2pow = 1usize; |
| for _ in 1..hash_len { |
| hash_2pow = hash_2pow.wrapping_shl(1); |
| } |
| |
| let mut rk = RabinKarp { |
| buckets: vec![vec![]; NUM_BUCKETS], |
| hash_len, |
| hash_2pow, |
| max_pattern_id: patterns.max_pattern_id(), |
| }; |
| for (id, pat) in patterns.iter() { |
| let hash = rk.hash(&pat.bytes()[..rk.hash_len]); |
| let bucket = hash % NUM_BUCKETS; |
| rk.buckets[bucket].push((hash, id)); |
| } |
| rk |
| } |
| |
| /// Return the first matching pattern in the given haystack, begining the |
| /// search at `at`. |
| pub fn find_at( |
| &self, |
| patterns: &Patterns, |
| haystack: &[u8], |
| mut at: usize, |
| ) -> Option<Match> { |
| assert_eq!(NUM_BUCKETS, self.buckets.len()); |
| assert_eq!( |
| self.max_pattern_id, |
| patterns.max_pattern_id(), |
| "Rabin-Karp must be called with same patterns it was built with", |
| ); |
| |
| if at + self.hash_len > haystack.len() { |
| return None; |
| } |
| let mut hash = self.hash(&haystack[at..at + self.hash_len]); |
| loop { |
| let bucket = &self.buckets[hash % NUM_BUCKETS]; |
| for &(phash, pid) in bucket { |
| if phash == hash { |
| if let Some(c) = self.verify(patterns, pid, haystack, at) { |
| return Some(c); |
| } |
| } |
| } |
| if at + self.hash_len >= haystack.len() { |
| return None; |
| } |
| hash = self.update_hash( |
| hash, |
| haystack[at], |
| haystack[at + self.hash_len], |
| ); |
| at += 1; |
| } |
| } |
| |
| /// Returns the approximate total amount of heap used by this searcher, in |
| /// units of bytes. |
| pub fn heap_bytes(&self) -> usize { |
| let num_patterns = self.max_pattern_id as usize + 1; |
| self.buckets.len() * mem::size_of::<Vec<(Hash, PatternID)>>() |
| + num_patterns * mem::size_of::<(Hash, PatternID)>() |
| } |
| |
| /// Verify whether the pattern with the given id matches at |
| /// `haystack[at..]`. |
| /// |
| /// We tag this function as `cold` because it helps improve codegen. |
| /// Intuitively, it would seem like inlining it would be better. However, |
| /// the only time this is called and a match is not found is when there |
| /// there is a hash collision, or when a prefix of a pattern matches but |
| /// the entire pattern doesn't match. This is hopefully fairly rare, and |
| /// if it does occur a lot, it's going to be slow no matter what we do. |
| #[cold] |
| fn verify( |
| &self, |
| patterns: &Patterns, |
| id: PatternID, |
| haystack: &[u8], |
| at: usize, |
| ) -> Option<Match> { |
| let pat = patterns.get(id); |
| if pat.is_prefix(&haystack[at..]) { |
| Some(Match::from_span(id as usize, at, at + pat.len())) |
| } else { |
| None |
| } |
| } |
| |
| /// Hash the given bytes. |
| fn hash(&self, bytes: &[u8]) -> Hash { |
| assert_eq!(self.hash_len, bytes.len()); |
| |
| let mut hash = 0usize; |
| for &b in bytes { |
| hash = hash.wrapping_shl(1).wrapping_add(b as usize); |
| } |
| hash |
| } |
| |
| /// Update the hash given based on removing `old_byte` at the beginning |
| /// of some byte string, and appending `new_byte` to the end of that same |
| /// byte string. |
| fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash { |
| prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow)) |
| .wrapping_shl(1) |
| .wrapping_add(new_byte as usize) |
| } |
| } |