| use std::mem; |
| |
| use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; |
| use memchr::{memchr, memchr2, memchr3, memmem}; |
| use regex_syntax::hir::literal::{Literal, Literals}; |
| |
| /// A prefix extracted from a compiled regular expression. |
| /// |
| /// A regex prefix is a set of literal strings that *must* be matched at the |
| /// beginning of a regex in order for the entire regex to match. Similarly |
| /// for a regex suffix. |
| #[derive(Clone, Debug)] |
| pub struct LiteralSearcher { |
| complete: bool, |
| lcp: Memmem, |
| lcs: Memmem, |
| matcher: Matcher, |
| } |
| |
| #[derive(Clone, Debug)] |
| enum Matcher { |
| /// No literals. (Never advances through the input.) |
| Empty, |
| /// A set of four or more single byte literals. |
| Bytes(SingleByteSet), |
| /// A single substring, using vector accelerated routines when available. |
| Memmem(Memmem), |
| /// An Aho-Corasick automaton. |
| AC { ac: AhoCorasick<u32>, lits: Vec<Literal> }, |
| /// A packed multiple substring searcher, using SIMD. |
| /// |
| /// Note that Aho-Corasick will actually use this packed searcher |
| /// internally automatically, however, there is some overhead associated |
| /// with going through the Aho-Corasick machinery. So using the packed |
| /// searcher directly results in some gains. |
| Packed { s: packed::Searcher, lits: Vec<Literal> }, |
| } |
| |
| impl LiteralSearcher { |
| /// Returns a matcher that never matches and never advances the input. |
| pub fn empty() -> Self { |
| Self::new(Literals::empty(), Matcher::Empty) |
| } |
| |
| /// Returns a matcher for literal prefixes from the given set. |
| pub fn prefixes(lits: Literals) -> Self { |
| let matcher = Matcher::prefixes(&lits); |
| Self::new(lits, matcher) |
| } |
| |
| /// Returns a matcher for literal suffixes from the given set. |
| pub fn suffixes(lits: Literals) -> Self { |
| let matcher = Matcher::suffixes(&lits); |
| Self::new(lits, matcher) |
| } |
| |
| fn new(lits: Literals, matcher: Matcher) -> Self { |
| let complete = lits.all_complete(); |
| LiteralSearcher { |
| complete: complete, |
| lcp: Memmem::new(lits.longest_common_prefix()), |
| lcs: Memmem::new(lits.longest_common_suffix()), |
| matcher: matcher, |
| } |
| } |
| |
| /// Returns true if all matches comprise the entire regular expression. |
| /// |
| /// This does not necessarily mean that a literal match implies a match |
| /// of the regular expression. For example, the regular expression `^a` |
| /// is comprised of a single complete literal `a`, but the regular |
| /// expression demands that it only match at the beginning of a string. |
| pub fn complete(&self) -> bool { |
| self.complete && !self.is_empty() |
| } |
| |
| /// Find the position of a literal in `haystack` if it exists. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
| use self::Matcher::*; |
| match self.matcher { |
| Empty => Some((0, 0)), |
| Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), |
| Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())), |
| AC { ref ac, .. } => { |
| ac.find(haystack).map(|m| (m.start(), m.end())) |
| } |
| Packed { ref s, .. } => { |
| s.find(haystack).map(|m| (m.start(), m.end())) |
| } |
| } |
| } |
| |
| /// Like find, except matches must start at index `0`. |
| pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
| for lit in self.iter() { |
| if lit.len() > haystack.len() { |
| continue; |
| } |
| if lit == &haystack[0..lit.len()] { |
| return Some((0, lit.len())); |
| } |
| } |
| None |
| } |
| |
| /// Like find, except matches must end at index `haystack.len()`. |
| pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
| for lit in self.iter() { |
| if lit.len() > haystack.len() { |
| continue; |
| } |
| if lit == &haystack[haystack.len() - lit.len()..] { |
| return Some((haystack.len() - lit.len(), haystack.len())); |
| } |
| } |
| None |
| } |
| |
| /// Returns an iterator over all literals to be matched. |
| pub fn iter(&self) -> LiteralIter<'_> { |
| match self.matcher { |
| Matcher::Empty => LiteralIter::Empty, |
| Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), |
| Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()), |
| Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), |
| Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), |
| } |
| } |
| |
| /// Returns a matcher for the longest common prefix of this matcher. |
| pub fn lcp(&self) -> &Memmem { |
| &self.lcp |
| } |
| |
| /// Returns a matcher for the longest common suffix of this matcher. |
| pub fn lcs(&self) -> &Memmem { |
| &self.lcs |
| } |
| |
| /// Returns true iff this prefix is empty. |
| pub fn is_empty(&self) -> bool { |
| self.len() == 0 |
| } |
| |
| /// Returns the number of prefixes in this machine. |
| pub fn len(&self) -> usize { |
| use self::Matcher::*; |
| match self.matcher { |
| Empty => 0, |
| Bytes(ref sset) => sset.dense.len(), |
| Memmem(_) => 1, |
| AC { ref ac, .. } => ac.pattern_count(), |
| Packed { ref lits, .. } => lits.len(), |
| } |
| } |
| |
| /// Return the approximate heap usage of literals in bytes. |
| pub fn approximate_size(&self) -> usize { |
| use self::Matcher::*; |
| match self.matcher { |
| Empty => 0, |
| Bytes(ref sset) => sset.approximate_size(), |
| Memmem(ref single) => single.approximate_size(), |
| AC { ref ac, .. } => ac.heap_bytes(), |
| Packed { ref s, .. } => s.heap_bytes(), |
| } |
| } |
| } |
| |
| impl Matcher { |
| fn prefixes(lits: &Literals) -> Self { |
| let sset = SingleByteSet::prefixes(lits); |
| Matcher::new(lits, sset) |
| } |
| |
| fn suffixes(lits: &Literals) -> Self { |
| let sset = SingleByteSet::suffixes(lits); |
| Matcher::new(lits, sset) |
| } |
| |
| fn new(lits: &Literals, sset: SingleByteSet) -> Self { |
| if lits.literals().is_empty() { |
| return Matcher::Empty; |
| } |
| if sset.dense.len() >= 26 { |
| // Avoid trying to match a large number of single bytes. |
| // This is *very* sensitive to a frequency analysis comparison |
| // between the bytes in sset and the composition of the haystack. |
| // No matter the size of sset, if its members all are rare in the |
| // haystack, then it'd be worth using it. How to tune this... IDK. |
| // ---AG |
| return Matcher::Empty; |
| } |
| if sset.complete { |
| return Matcher::Bytes(sset); |
| } |
| if lits.literals().len() == 1 { |
| return Matcher::Memmem(Memmem::new(&lits.literals()[0])); |
| } |
| |
| let pats = lits.literals().to_owned(); |
| let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; |
| if lits.literals().len() <= 100 && !is_aho_corasick_fast { |
| let mut builder = packed::Config::new() |
| .match_kind(packed::MatchKind::LeftmostFirst) |
| .builder(); |
| if let Some(s) = builder.extend(&pats).build() { |
| return Matcher::Packed { s, lits: pats }; |
| } |
| } |
| let ac = AhoCorasickBuilder::new() |
| .match_kind(aho_corasick::MatchKind::LeftmostFirst) |
| .dfa(true) |
| .build_with_size::<u32, _, _>(&pats) |
| .unwrap(); |
| Matcher::AC { ac, lits: pats } |
| } |
| } |
| |
| #[derive(Debug)] |
| pub enum LiteralIter<'a> { |
| Empty, |
| Bytes(&'a [u8]), |
| Single(&'a [u8]), |
| AC(&'a [Literal]), |
| Packed(&'a [Literal]), |
| } |
| |
| impl<'a> Iterator for LiteralIter<'a> { |
| type Item = &'a [u8]; |
| |
| fn next(&mut self) -> Option<Self::Item> { |
| match *self { |
| LiteralIter::Empty => None, |
| LiteralIter::Bytes(ref mut many) => { |
| if many.is_empty() { |
| None |
| } else { |
| let next = &many[0..1]; |
| *many = &many[1..]; |
| Some(next) |
| } |
| } |
| LiteralIter::Single(ref mut one) => { |
| if one.is_empty() { |
| None |
| } else { |
| let next = &one[..]; |
| *one = &[]; |
| Some(next) |
| } |
| } |
| LiteralIter::AC(ref mut lits) => { |
| if lits.is_empty() { |
| None |
| } else { |
| let next = &lits[0]; |
| *lits = &lits[1..]; |
| Some(&**next) |
| } |
| } |
| LiteralIter::Packed(ref mut lits) => { |
| if lits.is_empty() { |
| None |
| } else { |
| let next = &lits[0]; |
| *lits = &lits[1..]; |
| Some(&**next) |
| } |
| } |
| } |
| } |
| } |
| |
| #[derive(Clone, Debug)] |
| struct SingleByteSet { |
| sparse: Vec<bool>, |
| dense: Vec<u8>, |
| complete: bool, |
| all_ascii: bool, |
| } |
| |
| impl SingleByteSet { |
| fn new() -> SingleByteSet { |
| SingleByteSet { |
| sparse: vec![false; 256], |
| dense: vec![], |
| complete: true, |
| all_ascii: true, |
| } |
| } |
| |
| fn prefixes(lits: &Literals) -> SingleByteSet { |
| let mut sset = SingleByteSet::new(); |
| for lit in lits.literals() { |
| sset.complete = sset.complete && lit.len() == 1; |
| if let Some(&b) = lit.get(0) { |
| if !sset.sparse[b as usize] { |
| if b > 0x7F { |
| sset.all_ascii = false; |
| } |
| sset.dense.push(b); |
| sset.sparse[b as usize] = true; |
| } |
| } |
| } |
| sset |
| } |
| |
| fn suffixes(lits: &Literals) -> SingleByteSet { |
| let mut sset = SingleByteSet::new(); |
| for lit in lits.literals() { |
| sset.complete = sset.complete && lit.len() == 1; |
| if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { |
| if !sset.sparse[b as usize] { |
| if b > 0x7F { |
| sset.all_ascii = false; |
| } |
| sset.dense.push(b); |
| sset.sparse[b as usize] = true; |
| } |
| } |
| } |
| sset |
| } |
| |
| /// Faster find that special cases certain sizes to use memchr. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn find(&self, text: &[u8]) -> Option<usize> { |
| match self.dense.len() { |
| 0 => None, |
| 1 => memchr(self.dense[0], text), |
| 2 => memchr2(self.dense[0], self.dense[1], text), |
| 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text), |
| _ => self._find(text), |
| } |
| } |
| |
| /// Generic find that works on any sized set. |
| fn _find(&self, haystack: &[u8]) -> Option<usize> { |
| for (i, &b) in haystack.iter().enumerate() { |
| if self.sparse[b as usize] { |
| return Some(i); |
| } |
| } |
| None |
| } |
| |
| fn approximate_size(&self) -> usize { |
| (self.dense.len() * mem::size_of::<u8>()) |
| + (self.sparse.len() * mem::size_of::<bool>()) |
| } |
| } |
| |
| /// A simple wrapper around the memchr crate's memmem implementation. |
| /// |
| /// The API this exposes mirrors the API of previous substring searchers that |
| /// this supplanted. |
| #[derive(Clone, Debug)] |
| pub struct Memmem { |
| finder: memmem::Finder<'static>, |
| char_len: usize, |
| } |
| |
| impl Memmem { |
| fn new(pat: &[u8]) -> Memmem { |
| Memmem { |
| finder: memmem::Finder::new(pat).into_owned(), |
| char_len: char_len_lossy(pat), |
| } |
| } |
| |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| pub fn find(&self, haystack: &[u8]) -> Option<usize> { |
| self.finder.find(haystack) |
| } |
| |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| pub fn is_suffix(&self, text: &[u8]) -> bool { |
| if text.len() < self.len() { |
| return false; |
| } |
| &text[text.len() - self.len()..] == self.finder.needle() |
| } |
| |
| pub fn len(&self) -> usize { |
| self.finder.needle().len() |
| } |
| |
| pub fn char_len(&self) -> usize { |
| self.char_len |
| } |
| |
| fn approximate_size(&self) -> usize { |
| self.finder.needle().len() * mem::size_of::<u8>() |
| } |
| } |
| |
| fn char_len_lossy(bytes: &[u8]) -> usize { |
| String::from_utf8_lossy(bytes).chars().count() |
| } |