|  | use std::collections::HashMap; | 
|  | use std::fmt; | 
|  | use std::iter; | 
|  | use std::result; | 
|  | use std::sync::Arc; | 
|  |  | 
|  | use regex_syntax::hir::{self, Hir}; | 
|  | use regex_syntax::is_word_byte; | 
|  | use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; | 
|  |  | 
|  | use crate::prog::{ | 
|  | EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, | 
|  | InstSave, InstSplit, Program, | 
|  | }; | 
|  |  | 
|  | use crate::Error; | 
|  |  | 
|  | type Result = result::Result<Patch, Error>; | 
|  | type ResultOrEmpty = result::Result<Option<Patch>, Error>; | 
|  |  | 
|  | #[derive(Debug)] | 
|  | struct Patch { | 
|  | hole: Hole, | 
|  | entry: InstPtr, | 
|  | } | 
|  |  | 
|  | /// A compiler translates a regular expression AST to a sequence of | 
|  | /// instructions. The sequence of instructions represents an NFA. | 
|  | // `Compiler` is only public via the `internal` module, so avoid deriving | 
|  | // `Debug`. | 
|  | #[allow(missing_debug_implementations)] | 
|  | pub struct Compiler { | 
|  | insts: Vec<MaybeInst>, | 
|  | compiled: Program, | 
|  | capture_name_idx: HashMap<String, usize>, | 
|  | num_exprs: usize, | 
|  | size_limit: usize, | 
|  | suffix_cache: SuffixCache, | 
|  | utf8_seqs: Option<Utf8Sequences>, | 
|  | byte_classes: ByteClassSet, | 
|  | // This keeps track of extra bytes allocated while compiling the regex | 
|  | // program. Currently, this corresponds to two things. First is the heap | 
|  | // memory allocated by Unicode character classes ('InstRanges'). Second is | 
|  | // a "fake" amount of memory used by empty sub-expressions, so that enough | 
|  | // empty sub-expressions will ultimately trigger the compiler to bail | 
|  | // because of a size limit restriction. (That empty sub-expressions don't | 
|  | // add to heap memory usage is more-or-less an implementation detail.) In | 
|  | // the second case, if we don't bail, then an excessively large repetition | 
|  | // on an empty sub-expression can result in the compiler using a very large | 
|  | // amount of CPU time. | 
|  | extra_inst_bytes: usize, | 
|  | } | 
|  |  | 
|  | impl Compiler { | 
|  | /// Create a new regular expression compiler. | 
|  | /// | 
|  | /// Various options can be set before calling `compile` on an expression. | 
|  | pub fn new() -> Self { | 
|  | Compiler { | 
|  | insts: vec![], | 
|  | compiled: Program::new(), | 
|  | capture_name_idx: HashMap::new(), | 
|  | num_exprs: 0, | 
|  | size_limit: 10 * (1 << 20), | 
|  | suffix_cache: SuffixCache::new(1000), | 
|  | utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), | 
|  | byte_classes: ByteClassSet::new(), | 
|  | extra_inst_bytes: 0, | 
|  | } | 
|  | } | 
|  |  | 
|  | /// The size of the resulting program is limited by size_limit. If | 
|  | /// the program approximately exceeds the given size (in bytes), then | 
|  | /// compilation will stop and return an error. | 
|  | pub fn size_limit(mut self, size_limit: usize) -> Self { | 
|  | self.size_limit = size_limit; | 
|  | self | 
|  | } | 
|  |  | 
|  | /// If bytes is true, then the program is compiled as a byte based | 
|  | /// automaton, which incorporates UTF-8 decoding into the machine. If it's | 
|  | /// false, then the automaton is Unicode scalar value based, e.g., an | 
|  | /// engine utilizing such an automaton is responsible for UTF-8 decoding. | 
|  | /// | 
|  | /// The specific invariant is that when returning a byte based machine, | 
|  | /// the neither the `Char` nor `Ranges` instructions are produced. | 
|  | /// Conversely, when producing a Unicode scalar value machine, the `Bytes` | 
|  | /// instruction is never produced. | 
|  | /// | 
|  | /// Note that `dfa(true)` implies `bytes(true)`. | 
|  | pub fn bytes(mut self, yes: bool) -> Self { | 
|  | self.compiled.is_bytes = yes; | 
|  | self | 
|  | } | 
|  |  | 
|  | /// When disabled, the program compiled may match arbitrary bytes. | 
|  | /// | 
|  | /// When enabled (the default), all compiled programs exclusively match | 
|  | /// valid UTF-8 bytes. | 
|  | pub fn only_utf8(mut self, yes: bool) -> Self { | 
|  | self.compiled.only_utf8 = yes; | 
|  | self | 
|  | } | 
|  |  | 
|  | /// When set, the machine returned is suitable for use in the DFA matching | 
|  | /// engine. | 
|  | /// | 
|  | /// In particular, this ensures that if the regex is not anchored in the | 
|  | /// beginning, then a preceding `.*?` is included in the program. (The NFA | 
|  | /// based engines handle the preceding `.*?` explicitly, which is difficult | 
|  | /// or impossible in the DFA engine.) | 
|  | pub fn dfa(mut self, yes: bool) -> Self { | 
|  | self.compiled.is_dfa = yes; | 
|  | self | 
|  | } | 
|  |  | 
|  | /// When set, the machine returned is suitable for matching text in | 
|  | /// reverse. In particular, all concatenations are flipped. | 
|  | pub fn reverse(mut self, yes: bool) -> Self { | 
|  | self.compiled.is_reverse = yes; | 
|  | self | 
|  | } | 
|  |  | 
|  | /// Compile a regular expression given its AST. | 
|  | /// | 
|  | /// The compiler is guaranteed to succeed unless the program exceeds the | 
|  | /// specified size limit. If the size limit is exceeded, then compilation | 
|  | /// stops and returns an error. | 
|  | pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> { | 
|  | debug_assert!(!exprs.is_empty()); | 
|  | self.num_exprs = exprs.len(); | 
|  | if exprs.len() == 1 { | 
|  | self.compile_one(&exprs[0]) | 
|  | } else { | 
|  | self.compile_many(exprs) | 
|  | } | 
|  | } | 
|  |  | 
|  | fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { | 
|  | // If we're compiling a forward DFA and we aren't anchored, then | 
|  | // add a `.*?` before the first capture group. | 
|  | // Other matching engines handle this by baking the logic into the | 
|  | // matching engine itself. | 
|  | let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; | 
|  | self.compiled.is_anchored_start = expr.is_anchored_start(); | 
|  | self.compiled.is_anchored_end = expr.is_anchored_end(); | 
|  | if self.compiled.needs_dotstar() { | 
|  | dotstar_patch = self.c_dotstar()?; | 
|  | self.compiled.start = dotstar_patch.entry; | 
|  | } | 
|  | self.compiled.captures = vec![None]; | 
|  | let patch = | 
|  | self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); | 
|  | if self.compiled.needs_dotstar() { | 
|  | self.fill(dotstar_patch.hole, patch.entry); | 
|  | } else { | 
|  | self.compiled.start = patch.entry; | 
|  | } | 
|  | self.fill_to_next(patch.hole); | 
|  | self.compiled.matches = vec![self.insts.len()]; | 
|  | self.push_compiled(Inst::Match(0)); | 
|  | self.compile_finish() | 
|  | } | 
|  |  | 
|  | fn compile_many( | 
|  | mut self, | 
|  | exprs: &[Hir], | 
|  | ) -> result::Result<Program, Error> { | 
|  | debug_assert!(exprs.len() > 1); | 
|  |  | 
|  | self.compiled.is_anchored_start = | 
|  | exprs.iter().all(|e| e.is_anchored_start()); | 
|  | self.compiled.is_anchored_end = | 
|  | exprs.iter().all(|e| e.is_anchored_end()); | 
|  | let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; | 
|  | if self.compiled.needs_dotstar() { | 
|  | dotstar_patch = self.c_dotstar()?; | 
|  | self.compiled.start = dotstar_patch.entry; | 
|  | } else { | 
|  | self.compiled.start = 0; // first instruction is always split | 
|  | } | 
|  | self.fill_to_next(dotstar_patch.hole); | 
|  |  | 
|  | let mut prev_hole = Hole::None; | 
|  | for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { | 
|  | self.fill_to_next(prev_hole); | 
|  | let split = self.push_split_hole(); | 
|  | let Patch { hole, entry } = | 
|  | self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); | 
|  | self.fill_to_next(hole); | 
|  | self.compiled.matches.push(self.insts.len()); | 
|  | self.push_compiled(Inst::Match(i)); | 
|  | prev_hole = self.fill_split(split, Some(entry), None); | 
|  | } | 
|  | let i = exprs.len() - 1; | 
|  | let Patch { hole, entry } = | 
|  | self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst()); | 
|  | self.fill(prev_hole, entry); | 
|  | self.fill_to_next(hole); | 
|  | self.compiled.matches.push(self.insts.len()); | 
|  | self.push_compiled(Inst::Match(i)); | 
|  | self.compile_finish() | 
|  | } | 
|  |  | 
|  | fn compile_finish(mut self) -> result::Result<Program, Error> { | 
|  | self.compiled.insts = | 
|  | self.insts.into_iter().map(|inst| inst.unwrap()).collect(); | 
|  | self.compiled.byte_classes = self.byte_classes.byte_classes(); | 
|  | self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); | 
|  | Ok(self.compiled) | 
|  | } | 
|  |  | 
|  | /// Compile expr into self.insts, returning a patch on success, | 
|  | /// or an error if we run out of memory. | 
|  | /// | 
|  | /// All of the c_* methods of the compiler share the contract outlined | 
|  | /// here. | 
|  | /// | 
|  | /// The main thing that a c_* method does is mutate `self.insts` | 
|  | /// to add a list of mostly compiled instructions required to execute | 
|  | /// the given expression. `self.insts` contains MaybeInsts rather than | 
|  | /// Insts because there is some backpatching required. | 
|  | /// | 
|  | /// The `Patch` value returned by each c_* method provides metadata | 
|  | /// about the compiled instructions emitted to `self.insts`. The | 
|  | /// `entry` member of the patch refers to the first instruction | 
|  | /// (the entry point), while the `hole` member contains zero or | 
|  | /// more offsets to partial instructions that need to be backpatched. | 
|  | /// The c_* routine can't know where its list of instructions are going to | 
|  | /// jump to after execution, so it is up to the caller to patch | 
|  | /// these jumps to point to the right place. So compiling some | 
|  | /// expression, e, we would end up with a situation that looked like: | 
|  | /// | 
|  | /// ```text | 
|  | /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...] | 
|  | ///                     ^              ^             ^ | 
|  | ///                     |                \         / | 
|  | ///                   entry                \     / | 
|  | ///                                         hole | 
|  | /// ``` | 
|  | /// | 
|  | /// To compile two expressions, e1 and e2, concatenated together we | 
|  | /// would do: | 
|  | /// | 
|  | /// ```ignore | 
|  | /// let patch1 = self.c(e1); | 
|  | /// let patch2 = self.c(e2); | 
|  | /// ``` | 
|  | /// | 
|  | /// while leaves us with a situation that looks like | 
|  | /// | 
|  | /// ```text | 
|  | /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ] | 
|  | ///                     ^        ^            ^        ^ | 
|  | ///                     |        |            |        | | 
|  | ///                entry1        hole1   entry2        hole2 | 
|  | /// ``` | 
|  | /// | 
|  | /// Then to merge the two patches together into one we would backpatch | 
|  | /// hole1 with entry2 and return a new patch that enters at entry1 | 
|  | /// and has hole2 for a hole. In fact, if you look at the c_concat | 
|  | /// method you will see that it does exactly this, though it handles | 
|  | /// a list of expressions rather than just the two that we use for | 
|  | /// an example. | 
|  | /// | 
|  | /// Ok(None) is returned when an expression is compiled to no | 
|  | /// instruction, and so no patch.entry value makes sense. | 
|  | fn c(&mut self, expr: &Hir) -> ResultOrEmpty { | 
|  | use crate::prog; | 
|  | use regex_syntax::hir::HirKind::*; | 
|  |  | 
|  | self.check_size()?; | 
|  | match *expr.kind() { | 
|  | Empty => self.c_empty(), | 
|  | Literal(hir::Literal::Unicode(c)) => self.c_char(c), | 
|  | Literal(hir::Literal::Byte(b)) => { | 
|  | assert!(self.compiled.uses_bytes()); | 
|  | self.c_byte(b) | 
|  | } | 
|  | Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), | 
|  | Class(hir::Class::Bytes(ref cls)) => { | 
|  | if self.compiled.uses_bytes() { | 
|  | self.c_class_bytes(cls.ranges()) | 
|  | } else { | 
|  | assert!(cls.is_all_ascii()); | 
|  | let mut char_ranges = vec![]; | 
|  | for r in cls.iter() { | 
|  | let (s, e) = (r.start() as char, r.end() as char); | 
|  | char_ranges.push(hir::ClassUnicodeRange::new(s, e)); | 
|  | } | 
|  | self.c_class(&char_ranges) | 
|  | } | 
|  | } | 
|  | Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { | 
|  | self.byte_classes.set_range(b'\n', b'\n'); | 
|  | self.c_empty_look(prog::EmptyLook::EndLine) | 
|  | } | 
|  | Anchor(hir::Anchor::StartLine) => { | 
|  | self.byte_classes.set_range(b'\n', b'\n'); | 
|  | self.c_empty_look(prog::EmptyLook::StartLine) | 
|  | } | 
|  | Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { | 
|  | self.byte_classes.set_range(b'\n', b'\n'); | 
|  | self.c_empty_look(prog::EmptyLook::StartLine) | 
|  | } | 
|  | Anchor(hir::Anchor::EndLine) => { | 
|  | self.byte_classes.set_range(b'\n', b'\n'); | 
|  | self.c_empty_look(prog::EmptyLook::EndLine) | 
|  | } | 
|  | Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { | 
|  | self.c_empty_look(prog::EmptyLook::EndText) | 
|  | } | 
|  | Anchor(hir::Anchor::StartText) => { | 
|  | self.c_empty_look(prog::EmptyLook::StartText) | 
|  | } | 
|  | Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { | 
|  | self.c_empty_look(prog::EmptyLook::StartText) | 
|  | } | 
|  | Anchor(hir::Anchor::EndText) => { | 
|  | self.c_empty_look(prog::EmptyLook::EndText) | 
|  | } | 
|  | WordBoundary(hir::WordBoundary::Unicode) => { | 
|  | if !cfg!(feature = "unicode-perl") { | 
|  | return Err(Error::Syntax( | 
|  | "Unicode word boundaries are unavailable when \ | 
|  | the unicode-perl feature is disabled" | 
|  | .to_string(), | 
|  | )); | 
|  | } | 
|  | self.compiled.has_unicode_word_boundary = true; | 
|  | self.byte_classes.set_word_boundary(); | 
|  | // We also make sure that all ASCII bytes are in a different | 
|  | // class from non-ASCII bytes. Otherwise, it's possible for | 
|  | // ASCII bytes to get lumped into the same class as non-ASCII | 
|  | // bytes. This in turn may cause the lazy DFA to falsely start | 
|  | // when it sees an ASCII byte that maps to a byte class with | 
|  | // non-ASCII bytes. This ensures that never happens. | 
|  | self.byte_classes.set_range(0, 0x7F); | 
|  | self.c_empty_look(prog::EmptyLook::WordBoundary) | 
|  | } | 
|  | WordBoundary(hir::WordBoundary::UnicodeNegate) => { | 
|  | if !cfg!(feature = "unicode-perl") { | 
|  | return Err(Error::Syntax( | 
|  | "Unicode word boundaries are unavailable when \ | 
|  | the unicode-perl feature is disabled" | 
|  | .to_string(), | 
|  | )); | 
|  | } | 
|  | self.compiled.has_unicode_word_boundary = true; | 
|  | self.byte_classes.set_word_boundary(); | 
|  | // See comments above for why we set the ASCII range here. | 
|  | self.byte_classes.set_range(0, 0x7F); | 
|  | self.c_empty_look(prog::EmptyLook::NotWordBoundary) | 
|  | } | 
|  | WordBoundary(hir::WordBoundary::Ascii) => { | 
|  | self.byte_classes.set_word_boundary(); | 
|  | self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) | 
|  | } | 
|  | WordBoundary(hir::WordBoundary::AsciiNegate) => { | 
|  | self.byte_classes.set_word_boundary(); | 
|  | self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) | 
|  | } | 
|  | Group(ref g) => match g.kind { | 
|  | hir::GroupKind::NonCapturing => self.c(&g.hir), | 
|  | hir::GroupKind::CaptureIndex(index) => { | 
|  | if index as usize >= self.compiled.captures.len() { | 
|  | self.compiled.captures.push(None); | 
|  | } | 
|  | self.c_capture(2 * index as usize, &g.hir) | 
|  | } | 
|  | hir::GroupKind::CaptureName { index, ref name } => { | 
|  | if index as usize >= self.compiled.captures.len() { | 
|  | let n = name.to_string(); | 
|  | self.compiled.captures.push(Some(n.clone())); | 
|  | self.capture_name_idx.insert(n, index as usize); | 
|  | } | 
|  | self.c_capture(2 * index as usize, &g.hir) | 
|  | } | 
|  | }, | 
|  | Concat(ref es) => { | 
|  | if self.compiled.is_reverse { | 
|  | self.c_concat(es.iter().rev()) | 
|  | } else { | 
|  | self.c_concat(es) | 
|  | } | 
|  | } | 
|  | Alternation(ref es) => self.c_alternate(&**es), | 
|  | Repetition(ref rep) => self.c_repeat(rep), | 
|  | } | 
|  | } | 
|  |  | 
|  | fn c_empty(&mut self) -> ResultOrEmpty { | 
|  | // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 | 
|  | // See: CVE-2022-24713 | 
|  | // | 
|  | // Since 'empty' sub-expressions don't increase the size of | 
|  | // the actual compiled object, we "fake" an increase in its | 
|  | // size so that our 'check_size_limit' routine will eventually | 
|  | // stop compilation if there are too many empty sub-expressions | 
|  | // (e.g., via a large repetition). | 
|  | self.extra_inst_bytes += std::mem::size_of::<Inst>(); | 
|  | Ok(None) | 
|  | } | 
|  |  | 
|  | fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { | 
|  | if self.num_exprs > 1 || self.compiled.is_dfa { | 
|  | // Don't ever compile Save instructions for regex sets because | 
|  | // they are never used. They are also never used in DFA programs | 
|  | // because DFAs can't handle captures. | 
|  | self.c(expr) | 
|  | } else { | 
|  | let entry = self.insts.len(); | 
|  | let hole = self.push_hole(InstHole::Save { slot: first_slot }); | 
|  | let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst()); | 
|  | self.fill(hole, patch.entry); | 
|  | self.fill_to_next(patch.hole); | 
|  | let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); | 
|  | Ok(Some(Patch { hole, entry })) | 
|  | } | 
|  | } | 
|  |  | 
|  | fn c_dotstar(&mut self) -> Result { | 
|  | Ok(if !self.compiled.only_utf8() { | 
|  | self.c(&Hir::repetition(hir::Repetition { | 
|  | kind: hir::RepetitionKind::ZeroOrMore, | 
|  | greedy: false, | 
|  | hir: Box::new(Hir::any(true)), | 
|  | }))? | 
|  | .unwrap() | 
|  | } else { | 
|  | self.c(&Hir::repetition(hir::Repetition { | 
|  | kind: hir::RepetitionKind::ZeroOrMore, | 
|  | greedy: false, | 
|  | hir: Box::new(Hir::any(false)), | 
|  | }))? | 
|  | .unwrap() | 
|  | }) | 
|  | } | 
|  |  | 
|  | fn c_char(&mut self, c: char) -> ResultOrEmpty { | 
|  | if self.compiled.uses_bytes() { | 
|  | if c.is_ascii() { | 
|  | let b = c as u8; | 
|  | let hole = | 
|  | self.push_hole(InstHole::Bytes { start: b, end: b }); | 
|  | self.byte_classes.set_range(b, b); | 
|  | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) | 
|  | } else { | 
|  | self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) | 
|  | } | 
|  | } else { | 
|  | let hole = self.push_hole(InstHole::Char { c }); | 
|  | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) | 
|  | } | 
|  | } | 
|  |  | 
|  | fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { | 
|  | use std::mem::size_of; | 
|  |  | 
|  | assert!(!ranges.is_empty()); | 
|  | if self.compiled.uses_bytes() { | 
|  | Ok(Some(CompileClass { c: self, ranges }.compile()?)) | 
|  | } else { | 
|  | let ranges: Vec<(char, char)> = | 
|  | ranges.iter().map(|r| (r.start(), r.end())).collect(); | 
|  | let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { | 
|  | self.push_hole(InstHole::Char { c: ranges[0].0 }) | 
|  | } else { | 
|  | self.extra_inst_bytes += | 
|  | ranges.len() * (size_of::<char>() * 2); | 
|  | self.push_hole(InstHole::Ranges { ranges }) | 
|  | }; | 
|  | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) | 
|  | } | 
|  | } | 
|  |  | 
|  | fn c_byte(&mut self, b: u8) -> ResultOrEmpty { | 
|  | self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) | 
|  | } | 
|  |  | 
|  | fn c_class_bytes( | 
|  | &mut self, | 
|  | ranges: &[hir::ClassBytesRange], | 
|  | ) -> ResultOrEmpty { | 
|  | debug_assert!(!ranges.is_empty()); | 
|  |  | 
|  | let first_split_entry = self.insts.len(); | 
|  | let mut holes = vec![]; | 
|  | let mut prev_hole = Hole::None; | 
|  | for r in &ranges[0..ranges.len() - 1] { | 
|  | self.fill_to_next(prev_hole); | 
|  | let split = self.push_split_hole(); | 
|  | let next = self.insts.len(); | 
|  | self.byte_classes.set_range(r.start(), r.end()); | 
|  | holes.push(self.push_hole(InstHole::Bytes { | 
|  | start: r.start(), | 
|  | end: r.end(), | 
|  | })); | 
|  | prev_hole = self.fill_split(split, Some(next), None); | 
|  | } | 
|  | let next = self.insts.len(); | 
|  | let r = &ranges[ranges.len() - 1]; | 
|  | self.byte_classes.set_range(r.start(), r.end()); | 
|  | holes.push( | 
|  | self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), | 
|  | ); | 
|  | self.fill(prev_hole, next); | 
|  | Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) | 
|  | } | 
|  |  | 
|  | fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { | 
|  | let hole = self.push_hole(InstHole::EmptyLook { look }); | 
|  | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) | 
|  | } | 
|  |  | 
|  | fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty | 
|  | where | 
|  | I: IntoIterator<Item = &'a Hir>, | 
|  | { | 
|  | let mut exprs = exprs.into_iter(); | 
|  | let Patch { mut hole, entry } = loop { | 
|  | match exprs.next() { | 
|  | None => return self.c_empty(), | 
|  | Some(e) => { | 
|  | if let Some(p) = self.c(e)? { | 
|  | break p; | 
|  | } | 
|  | } | 
|  | } | 
|  | }; | 
|  | for e in exprs { | 
|  | if let Some(p) = self.c(e)? { | 
|  | self.fill(hole, p.entry); | 
|  | hole = p.hole; | 
|  | } | 
|  | } | 
|  | Ok(Some(Patch { hole, entry })) | 
|  | } | 
|  |  | 
|  | fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { | 
|  | debug_assert!( | 
|  | exprs.len() >= 2, | 
|  | "alternates must have at least 2 exprs" | 
|  | ); | 
|  |  | 
|  | // Initial entry point is always the first split. | 
|  | let first_split_entry = self.insts.len(); | 
|  |  | 
|  | // Save up all of the holes from each alternate. They will all get | 
|  | // patched to point to the same location. | 
|  | let mut holes = vec![]; | 
|  |  | 
|  | // true indicates that the hole is a split where we want to fill | 
|  | // the second branch. | 
|  | let mut prev_hole = (Hole::None, false); | 
|  | for e in &exprs[0..exprs.len() - 1] { | 
|  | if prev_hole.1 { | 
|  | let next = self.insts.len(); | 
|  | self.fill_split(prev_hole.0, None, Some(next)); | 
|  | } else { | 
|  | self.fill_to_next(prev_hole.0); | 
|  | } | 
|  | let split = self.push_split_hole(); | 
|  | if let Some(Patch { hole, entry }) = self.c(e)? { | 
|  | holes.push(hole); | 
|  | prev_hole = (self.fill_split(split, Some(entry), None), false); | 
|  | } else { | 
|  | let (split1, split2) = split.dup_one(); | 
|  | holes.push(split1); | 
|  | prev_hole = (split2, true); | 
|  | } | 
|  | } | 
|  | if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { | 
|  | holes.push(hole); | 
|  | if prev_hole.1 { | 
|  | self.fill_split(prev_hole.0, None, Some(entry)); | 
|  | } else { | 
|  | self.fill(prev_hole.0, entry); | 
|  | } | 
|  | } else { | 
|  | // We ignore prev_hole.1. When it's true, it means we have two | 
|  | // empty branches both pushing prev_hole.0 into holes, so both | 
|  | // branches will go to the same place anyway. | 
|  | holes.push(prev_hole.0); | 
|  | } | 
|  | Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) | 
|  | } | 
|  |  | 
|  | fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { | 
|  | use regex_syntax::hir::RepetitionKind::*; | 
|  | match rep.kind { | 
|  | ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), | 
|  | ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), | 
|  | OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), | 
|  | Range(hir::RepetitionRange::Exactly(min_max)) => { | 
|  | self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) | 
|  | } | 
|  | Range(hir::RepetitionRange::AtLeast(min)) => { | 
|  | self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) | 
|  | } | 
|  | Range(hir::RepetitionRange::Bounded(min, max)) => { | 
|  | self.c_repeat_range(&rep.hir, rep.greedy, min, max) | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | fn c_repeat_zero_or_one( | 
|  | &mut self, | 
|  | expr: &Hir, | 
|  | greedy: bool, | 
|  | ) -> ResultOrEmpty { | 
|  | let split_entry = self.insts.len(); | 
|  | let split = self.push_split_hole(); | 
|  | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { | 
|  | Some(p) => p, | 
|  | None => return self.pop_split_hole(), | 
|  | }; | 
|  | let split_hole = if greedy { | 
|  | self.fill_split(split, Some(entry_rep), None) | 
|  | } else { | 
|  | self.fill_split(split, None, Some(entry_rep)) | 
|  | }; | 
|  | let holes = vec![hole_rep, split_hole]; | 
|  | Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) | 
|  | } | 
|  |  | 
|  | fn c_repeat_zero_or_more( | 
|  | &mut self, | 
|  | expr: &Hir, | 
|  | greedy: bool, | 
|  | ) -> ResultOrEmpty { | 
|  | let split_entry = self.insts.len(); | 
|  | let split = self.push_split_hole(); | 
|  | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { | 
|  | Some(p) => p, | 
|  | None => return self.pop_split_hole(), | 
|  | }; | 
|  |  | 
|  | self.fill(hole_rep, split_entry); | 
|  | let split_hole = if greedy { | 
|  | self.fill_split(split, Some(entry_rep), None) | 
|  | } else { | 
|  | self.fill_split(split, None, Some(entry_rep)) | 
|  | }; | 
|  | Ok(Some(Patch { hole: split_hole, entry: split_entry })) | 
|  | } | 
|  |  | 
|  | fn c_repeat_one_or_more( | 
|  | &mut self, | 
|  | expr: &Hir, | 
|  | greedy: bool, | 
|  | ) -> ResultOrEmpty { | 
|  | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { | 
|  | Some(p) => p, | 
|  | None => return Ok(None), | 
|  | }; | 
|  | self.fill_to_next(hole_rep); | 
|  | let split = self.push_split_hole(); | 
|  |  | 
|  | let split_hole = if greedy { | 
|  | self.fill_split(split, Some(entry_rep), None) | 
|  | } else { | 
|  | self.fill_split(split, None, Some(entry_rep)) | 
|  | }; | 
|  | Ok(Some(Patch { hole: split_hole, entry: entry_rep })) | 
|  | } | 
|  |  | 
|  | fn c_repeat_range_min_or_more( | 
|  | &mut self, | 
|  | expr: &Hir, | 
|  | greedy: bool, | 
|  | min: u32, | 
|  | ) -> ResultOrEmpty { | 
|  | let min = u32_to_usize(min); | 
|  | // Using next_inst() is ok, because we can't return it (concat would | 
|  | // have to return Some(_) while c_repeat_range_min_or_more returns | 
|  | // None). | 
|  | let patch_concat = self | 
|  | .c_concat(iter::repeat(expr).take(min))? | 
|  | .unwrap_or_else(|| self.next_inst()); | 
|  | if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { | 
|  | self.fill(patch_concat.hole, patch_rep.entry); | 
|  | Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) | 
|  | } else { | 
|  | Ok(None) | 
|  | } | 
|  | } | 
|  |  | 
|  | fn c_repeat_range( | 
|  | &mut self, | 
|  | expr: &Hir, | 
|  | greedy: bool, | 
|  | min: u32, | 
|  | max: u32, | 
|  | ) -> ResultOrEmpty { | 
|  | let (min, max) = (u32_to_usize(min), u32_to_usize(max)); | 
|  | debug_assert!(min <= max); | 
|  | let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; | 
|  | if min == max { | 
|  | return Ok(patch_concat); | 
|  | } | 
|  | // Same reasoning as in c_repeat_range_min_or_more (we know that min < | 
|  | // max at this point). | 
|  | let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst()); | 
|  | let initial_entry = patch_concat.entry; | 
|  | // It is much simpler to compile, e.g., `a{2,5}` as: | 
|  | // | 
|  | //     aaa?a?a? | 
|  | // | 
|  | // But you end up with a sequence of instructions like this: | 
|  | // | 
|  | //     0: 'a' | 
|  | //     1: 'a', | 
|  | //     2: split(3, 4) | 
|  | //     3: 'a' | 
|  | //     4: split(5, 6) | 
|  | //     5: 'a' | 
|  | //     6: split(7, 8) | 
|  | //     7: 'a' | 
|  | //     8: MATCH | 
|  | // | 
|  | // This is *incredibly* inefficient because the splits end | 
|  | // up forming a chain, which has to be resolved everything a | 
|  | // transition is followed. | 
|  | let mut holes = vec![]; | 
|  | let mut prev_hole = patch_concat.hole; | 
|  | for _ in min..max { | 
|  | self.fill_to_next(prev_hole); | 
|  | let split = self.push_split_hole(); | 
|  | let Patch { hole, entry } = match self.c(expr)? { | 
|  | Some(p) => p, | 
|  | None => return self.pop_split_hole(), | 
|  | }; | 
|  | prev_hole = hole; | 
|  | if greedy { | 
|  | holes.push(self.fill_split(split, Some(entry), None)); | 
|  | } else { | 
|  | holes.push(self.fill_split(split, None, Some(entry))); | 
|  | } | 
|  | } | 
|  | holes.push(prev_hole); | 
|  | Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) | 
|  | } | 
|  |  | 
|  | /// Can be used as a default value for the c_* functions when the call to | 
|  | /// c_function is followed by inserting at least one instruction that is | 
|  | /// always executed after the ones written by the c* function. | 
|  | fn next_inst(&self) -> Patch { | 
|  | Patch { hole: Hole::None, entry: self.insts.len() } | 
|  | } | 
|  |  | 
|  | fn fill(&mut self, hole: Hole, goto: InstPtr) { | 
|  | match hole { | 
|  | Hole::None => {} | 
|  | Hole::One(pc) => { | 
|  | self.insts[pc].fill(goto); | 
|  | } | 
|  | Hole::Many(holes) => { | 
|  | for hole in holes { | 
|  | self.fill(hole, goto); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | fn fill_to_next(&mut self, hole: Hole) { | 
|  | let next = self.insts.len(); | 
|  | self.fill(hole, next); | 
|  | } | 
|  |  | 
|  | fn fill_split( | 
|  | &mut self, | 
|  | hole: Hole, | 
|  | goto1: Option<InstPtr>, | 
|  | goto2: Option<InstPtr>, | 
|  | ) -> Hole { | 
|  | match hole { | 
|  | Hole::None => Hole::None, | 
|  | Hole::One(pc) => match (goto1, goto2) { | 
|  | (Some(goto1), Some(goto2)) => { | 
|  | self.insts[pc].fill_split(goto1, goto2); | 
|  | Hole::None | 
|  | } | 
|  | (Some(goto1), None) => { | 
|  | self.insts[pc].half_fill_split_goto1(goto1); | 
|  | Hole::One(pc) | 
|  | } | 
|  | (None, Some(goto2)) => { | 
|  | self.insts[pc].half_fill_split_goto2(goto2); | 
|  | Hole::One(pc) | 
|  | } | 
|  | (None, None) => unreachable!( | 
|  | "at least one of the split \ | 
|  | holes must be filled" | 
|  | ), | 
|  | }, | 
|  | Hole::Many(holes) => { | 
|  | let mut new_holes = vec![]; | 
|  | for hole in holes { | 
|  | new_holes.push(self.fill_split(hole, goto1, goto2)); | 
|  | } | 
|  | if new_holes.is_empty() { | 
|  | Hole::None | 
|  | } else if new_holes.len() == 1 { | 
|  | new_holes.pop().unwrap() | 
|  | } else { | 
|  | Hole::Many(new_holes) | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | fn push_compiled(&mut self, inst: Inst) { | 
|  | self.insts.push(MaybeInst::Compiled(inst)); | 
|  | } | 
|  |  | 
|  | fn push_hole(&mut self, inst: InstHole) -> Hole { | 
|  | let hole = self.insts.len(); | 
|  | self.insts.push(MaybeInst::Uncompiled(inst)); | 
|  | Hole::One(hole) | 
|  | } | 
|  |  | 
|  | fn push_split_hole(&mut self) -> Hole { | 
|  | let hole = self.insts.len(); | 
|  | self.insts.push(MaybeInst::Split); | 
|  | Hole::One(hole) | 
|  | } | 
|  |  | 
|  | fn pop_split_hole(&mut self) -> ResultOrEmpty { | 
|  | self.insts.pop(); | 
|  | Ok(None) | 
|  | } | 
|  |  | 
|  | fn check_size(&self) -> result::Result<(), Error> { | 
|  | use std::mem::size_of; | 
|  |  | 
|  | let size = | 
|  | self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>()); | 
|  | if size > self.size_limit { | 
|  | Err(Error::CompiledTooBig(self.size_limit)) | 
|  | } else { | 
|  | Ok(()) | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | #[derive(Debug)] | 
|  | enum Hole { | 
|  | None, | 
|  | One(InstPtr), | 
|  | Many(Vec<Hole>), | 
|  | } | 
|  |  | 
|  | impl Hole { | 
|  | fn dup_one(self) -> (Self, Self) { | 
|  | match self { | 
|  | Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), | 
|  | Hole::None | Hole::Many(_) => { | 
|  | unreachable!("must be called on single hole") | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | #[derive(Clone, Debug)] | 
|  | enum MaybeInst { | 
|  | Compiled(Inst), | 
|  | Uncompiled(InstHole), | 
|  | Split, | 
|  | Split1(InstPtr), | 
|  | Split2(InstPtr), | 
|  | } | 
|  |  | 
|  | impl MaybeInst { | 
|  | fn fill(&mut self, goto: InstPtr) { | 
|  | let maybeinst = match *self { | 
|  | MaybeInst::Split => MaybeInst::Split1(goto), | 
|  | MaybeInst::Uncompiled(ref inst) => { | 
|  | MaybeInst::Compiled(inst.fill(goto)) | 
|  | } | 
|  | MaybeInst::Split1(goto1) => { | 
|  | MaybeInst::Compiled(Inst::Split(InstSplit { | 
|  | goto1, | 
|  | goto2: goto, | 
|  | })) | 
|  | } | 
|  | MaybeInst::Split2(goto2) => { | 
|  | MaybeInst::Compiled(Inst::Split(InstSplit { | 
|  | goto1: goto, | 
|  | goto2, | 
|  | })) | 
|  | } | 
|  | _ => unreachable!( | 
|  | "not all instructions were compiled! \ | 
|  | found uncompiled instruction: {:?}", | 
|  | self | 
|  | ), | 
|  | }; | 
|  | *self = maybeinst; | 
|  | } | 
|  |  | 
|  | fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { | 
|  | let filled = match *self { | 
|  | MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }), | 
|  | _ => unreachable!( | 
|  | "must be called on Split instruction, \ | 
|  | instead it was called on: {:?}", | 
|  | self | 
|  | ), | 
|  | }; | 
|  | *self = MaybeInst::Compiled(filled); | 
|  | } | 
|  |  | 
|  | fn half_fill_split_goto1(&mut self, goto1: InstPtr) { | 
|  | let half_filled = match *self { | 
|  | MaybeInst::Split => goto1, | 
|  | _ => unreachable!( | 
|  | "must be called on Split instruction, \ | 
|  | instead it was called on: {:?}", | 
|  | self | 
|  | ), | 
|  | }; | 
|  | *self = MaybeInst::Split1(half_filled); | 
|  | } | 
|  |  | 
|  | fn half_fill_split_goto2(&mut self, goto2: InstPtr) { | 
|  | let half_filled = match *self { | 
|  | MaybeInst::Split => goto2, | 
|  | _ => unreachable!( | 
|  | "must be called on Split instruction, \ | 
|  | instead it was called on: {:?}", | 
|  | self | 
|  | ), | 
|  | }; | 
|  | *self = MaybeInst::Split2(half_filled); | 
|  | } | 
|  |  | 
|  | fn unwrap(self) -> Inst { | 
|  | match self { | 
|  | MaybeInst::Compiled(inst) => inst, | 
|  | _ => unreachable!( | 
|  | "must be called on a compiled instruction, \ | 
|  | instead it was called on: {:?}", | 
|  | self | 
|  | ), | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | #[derive(Clone, Debug)] | 
|  | enum InstHole { | 
|  | Save { slot: usize }, | 
|  | EmptyLook { look: EmptyLook }, | 
|  | Char { c: char }, | 
|  | Ranges { ranges: Vec<(char, char)> }, | 
|  | Bytes { start: u8, end: u8 }, | 
|  | } | 
|  |  | 
|  | impl InstHole { | 
|  | fn fill(&self, goto: InstPtr) -> Inst { | 
|  | match *self { | 
|  | InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }), | 
|  | InstHole::EmptyLook { look } => { | 
|  | Inst::EmptyLook(InstEmptyLook { goto, look }) | 
|  | } | 
|  | InstHole::Char { c } => Inst::Char(InstChar { goto, c }), | 
|  | InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges { | 
|  | goto, | 
|  | ranges: ranges.clone().into_boxed_slice(), | 
|  | }), | 
|  | InstHole::Bytes { start, end } => { | 
|  | Inst::Bytes(InstBytes { goto, start, end }) | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | struct CompileClass<'a, 'b> { | 
|  | c: &'a mut Compiler, | 
|  | ranges: &'b [hir::ClassUnicodeRange], | 
|  | } | 
|  |  | 
|  | impl<'a, 'b> CompileClass<'a, 'b> { | 
|  | fn compile(mut self) -> Result { | 
|  | let mut holes = vec![]; | 
|  | let mut initial_entry = None; | 
|  | let mut last_split = Hole::None; | 
|  | let mut utf8_seqs = self.c.utf8_seqs.take().unwrap(); | 
|  | self.c.suffix_cache.clear(); | 
|  |  | 
|  | for (i, range) in self.ranges.iter().enumerate() { | 
|  | let is_last_range = i + 1 == self.ranges.len(); | 
|  | utf8_seqs.reset(range.start(), range.end()); | 
|  | let mut it = (&mut utf8_seqs).peekable(); | 
|  | loop { | 
|  | let utf8_seq = match it.next() { | 
|  | None => break, | 
|  | Some(utf8_seq) => utf8_seq, | 
|  | }; | 
|  | if is_last_range && it.peek().is_none() { | 
|  | let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; | 
|  | holes.push(hole); | 
|  | self.c.fill(last_split, entry); | 
|  | last_split = Hole::None; | 
|  | if initial_entry.is_none() { | 
|  | initial_entry = Some(entry); | 
|  | } | 
|  | } else { | 
|  | if initial_entry.is_none() { | 
|  | initial_entry = Some(self.c.insts.len()); | 
|  | } | 
|  | self.c.fill_to_next(last_split); | 
|  | last_split = self.c.push_split_hole(); | 
|  | let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; | 
|  | holes.push(hole); | 
|  | last_split = | 
|  | self.c.fill_split(last_split, Some(entry), None); | 
|  | } | 
|  | } | 
|  | } | 
|  | self.c.utf8_seqs = Some(utf8_seqs); | 
|  | Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }) | 
|  | } | 
|  |  | 
|  | fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { | 
|  | if self.c.compiled.is_reverse { | 
|  | self.c_utf8_seq_(seq) | 
|  | } else { | 
|  | self.c_utf8_seq_(seq.into_iter().rev()) | 
|  | } | 
|  | } | 
|  |  | 
|  | fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result | 
|  | where | 
|  | I: IntoIterator<Item = &'r Utf8Range>, | 
|  | { | 
|  | // The initial instruction for each UTF-8 sequence should be the same. | 
|  | let mut from_inst = ::std::usize::MAX; | 
|  | let mut last_hole = Hole::None; | 
|  | for byte_range in seq { | 
|  | let key = SuffixCacheKey { | 
|  | from_inst, | 
|  | start: byte_range.start, | 
|  | end: byte_range.end, | 
|  | }; | 
|  | { | 
|  | let pc = self.c.insts.len(); | 
|  | if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) { | 
|  | from_inst = cached_pc; | 
|  | continue; | 
|  | } | 
|  | } | 
|  | self.c.byte_classes.set_range(byte_range.start, byte_range.end); | 
|  | if from_inst == ::std::usize::MAX { | 
|  | last_hole = self.c.push_hole(InstHole::Bytes { | 
|  | start: byte_range.start, | 
|  | end: byte_range.end, | 
|  | }); | 
|  | } else { | 
|  | self.c.push_compiled(Inst::Bytes(InstBytes { | 
|  | goto: from_inst, | 
|  | start: byte_range.start, | 
|  | end: byte_range.end, | 
|  | })); | 
|  | } | 
|  | from_inst = self.c.insts.len().checked_sub(1).unwrap(); | 
|  | debug_assert!(from_inst < ::std::usize::MAX); | 
|  | } | 
|  | debug_assert!(from_inst < ::std::usize::MAX); | 
|  | Ok(Patch { hole: last_hole, entry: from_inst }) | 
|  | } | 
|  | } | 
|  |  | 
|  | /// `SuffixCache` is a simple bounded hash map for caching suffix entries in | 
|  | /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}. | 
|  | /// The set of byte ranges looks like this: | 
|  | /// | 
|  | /// [0-7F] | 
|  | /// [C2-DF][80-BF] | 
|  | /// [E0][A0-BF][80-BF] | 
|  | /// [E1-EC][80-BF][80-BF] | 
|  | /// [ED][80-9F][80-BF] | 
|  | /// [EE-EF][80-BF][80-BF] | 
|  | /// | 
|  | /// Each line above translates to one alternate in the compiled regex program. | 
|  | /// However, all but one of the alternates end in the same suffix, which is | 
|  | /// a waste of an instruction. The suffix cache facilitates reusing them across | 
|  | /// alternates. | 
|  | /// | 
|  | /// Note that a HashMap could be trivially used for this, but we don't need its | 
|  | /// overhead. Some small bounded space (LRU style) is more than enough. | 
|  | /// | 
|  | /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html), | 
|  | /// except it uses hashes as original indices and then compares full keys for | 
|  | /// validation against `dense` array. | 
|  | #[derive(Debug)] | 
|  | struct SuffixCache { | 
|  | sparse: Box<[usize]>, | 
|  | dense: Vec<SuffixCacheEntry>, | 
|  | } | 
|  |  | 
|  | #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] | 
|  | struct SuffixCacheEntry { | 
|  | key: SuffixCacheKey, | 
|  | pc: InstPtr, | 
|  | } | 
|  |  | 
|  | #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] | 
|  | struct SuffixCacheKey { | 
|  | from_inst: InstPtr, | 
|  | start: u8, | 
|  | end: u8, | 
|  | } | 
|  |  | 
|  | impl SuffixCache { | 
|  | fn new(size: usize) -> Self { | 
|  | SuffixCache { | 
|  | sparse: vec![0usize; size].into(), | 
|  | dense: Vec::with_capacity(size), | 
|  | } | 
|  | } | 
|  |  | 
|  | fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> { | 
|  | let hash = self.hash(&key); | 
|  | let pos = &mut self.sparse[hash]; | 
|  | if let Some(entry) = self.dense.get(*pos) { | 
|  | if entry.key == key { | 
|  | return Some(entry.pc); | 
|  | } | 
|  | } | 
|  | *pos = self.dense.len(); | 
|  | self.dense.push(SuffixCacheEntry { key, pc }); | 
|  | None | 
|  | } | 
|  |  | 
|  | fn clear(&mut self) { | 
|  | self.dense.clear(); | 
|  | } | 
|  |  | 
|  | fn hash(&self, suffix: &SuffixCacheKey) -> usize { | 
|  | // Basic FNV-1a hash as described: | 
|  | // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function | 
|  | const FNV_PRIME: u64 = 1_099_511_628_211; | 
|  | let mut h = 14_695_981_039_346_656_037; | 
|  | h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); | 
|  | h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); | 
|  | h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); | 
|  | (h as usize) % self.sparse.len() | 
|  | } | 
|  | } | 
|  |  | 
|  | struct ByteClassSet([bool; 256]); | 
|  |  | 
|  | impl ByteClassSet { | 
|  | fn new() -> Self { | 
|  | ByteClassSet([false; 256]) | 
|  | } | 
|  |  | 
|  | fn set_range(&mut self, start: u8, end: u8) { | 
|  | debug_assert!(start <= end); | 
|  | if start > 0 { | 
|  | self.0[start as usize - 1] = true; | 
|  | } | 
|  | self.0[end as usize] = true; | 
|  | } | 
|  |  | 
|  | fn set_word_boundary(&mut self) { | 
|  | // We need to mark all ranges of bytes whose pairs result in | 
|  | // evaluating \b differently. | 
|  | let iswb = is_word_byte; | 
|  | let mut b1: u16 = 0; | 
|  | let mut b2: u16; | 
|  | while b1 <= 255 { | 
|  | b2 = b1 + 1; | 
|  | while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { | 
|  | b2 += 1; | 
|  | } | 
|  | self.set_range(b1 as u8, (b2 - 1) as u8); | 
|  | b1 = b2; | 
|  | } | 
|  | } | 
|  |  | 
|  | fn byte_classes(&self) -> Vec<u8> { | 
|  | // N.B. If you're debugging the DFA, it's useful to simply return | 
|  | // `(0..256).collect()`, which effectively removes the byte classes | 
|  | // and makes the transitions easier to read. | 
|  | // (0usize..256).map(|x| x as u8).collect() | 
|  | let mut byte_classes = vec![0; 256]; | 
|  | let mut class = 0u8; | 
|  | let mut i = 0; | 
|  | loop { | 
|  | byte_classes[i] = class as u8; | 
|  | if i >= 255 { | 
|  | break; | 
|  | } | 
|  | if self.0[i] { | 
|  | class = class.checked_add(1).unwrap(); | 
|  | } | 
|  | i += 1; | 
|  | } | 
|  | byte_classes | 
|  | } | 
|  | } | 
|  |  | 
|  | impl fmt::Debug for ByteClassSet { | 
|  | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | 
|  | f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish() | 
|  | } | 
|  | } | 
|  |  | 
|  | fn u32_to_usize(n: u32) -> usize { | 
|  | // In case usize is less than 32 bits, we need to guard against overflow. | 
|  | // On most platforms this compiles to nothing. | 
|  | // TODO Use `std::convert::TryFrom` once it's stable. | 
|  | if (n as u64) > (::std::usize::MAX as u64) { | 
|  | panic!("BUG: {} is too big to be pointer sized", n) | 
|  | } | 
|  | n as usize | 
|  | } | 
|  |  | 
|  | #[cfg(test)] | 
|  | mod tests { | 
|  | use super::ByteClassSet; | 
|  |  | 
|  | #[test] | 
|  | fn byte_classes() { | 
|  | let mut set = ByteClassSet::new(); | 
|  | set.set_range(b'a', b'z'); | 
|  | let classes = set.byte_classes(); | 
|  | assert_eq!(classes[0], 0); | 
|  | assert_eq!(classes[1], 0); | 
|  | assert_eq!(classes[2], 0); | 
|  | assert_eq!(classes[b'a' as usize - 1], 0); | 
|  | assert_eq!(classes[b'a' as usize], 1); | 
|  | assert_eq!(classes[b'm' as usize], 1); | 
|  | assert_eq!(classes[b'z' as usize], 1); | 
|  | assert_eq!(classes[b'z' as usize + 1], 2); | 
|  | assert_eq!(classes[254], 2); | 
|  | assert_eq!(classes[255], 2); | 
|  |  | 
|  | let mut set = ByteClassSet::new(); | 
|  | set.set_range(0, 2); | 
|  | set.set_range(4, 6); | 
|  | let classes = set.byte_classes(); | 
|  | assert_eq!(classes[0], 0); | 
|  | assert_eq!(classes[1], 0); | 
|  | assert_eq!(classes[2], 0); | 
|  | assert_eq!(classes[3], 1); | 
|  | assert_eq!(classes[4], 2); | 
|  | assert_eq!(classes[5], 2); | 
|  | assert_eq!(classes[6], 2); | 
|  | assert_eq!(classes[7], 3); | 
|  | assert_eq!(classes[255], 3); | 
|  | } | 
|  |  | 
|  | #[test] | 
|  | fn full_byte_classes() { | 
|  | let mut set = ByteClassSet::new(); | 
|  | for i in 0..256u16 { | 
|  | set.set_range(i as u8, i as u8); | 
|  | } | 
|  | assert_eq!(set.byte_classes().len(), 256); | 
|  | } | 
|  | } |