|  | use std::cmp::Ordering; | 
|  | use std::collections::HashMap; | 
|  | use std::fmt; | 
|  | use std::mem; | 
|  | use std::ops::Deref; | 
|  | use std::slice; | 
|  | use std::sync::Arc; | 
|  |  | 
|  | use crate::input::Char; | 
|  | use crate::literal::LiteralSearcher; | 
|  |  | 
|  | /// `InstPtr` represents the index of an instruction in a regex program. | 
|  | pub type InstPtr = usize; | 
|  |  | 
|  | /// Program is a sequence of instructions and various facts about thos | 
|  | /// instructions. | 
|  | #[derive(Clone)] | 
|  | pub struct Program { | 
|  | /// A sequence of instructions that represents an NFA. | 
|  | pub insts: Vec<Inst>, | 
|  | /// Pointers to each Match instruction in the sequence. | 
|  | /// | 
|  | /// This is always length 1 unless this program represents a regex set. | 
|  | pub matches: Vec<InstPtr>, | 
|  | /// The ordered sequence of all capture groups extracted from the AST. | 
|  | /// Unnamed groups are `None`. | 
|  | pub captures: Vec<Option<String>>, | 
|  | /// Pointers to all named capture groups into `captures`. | 
|  | pub capture_name_idx: Arc<HashMap<String, usize>>, | 
|  | /// A pointer to the start instruction. This can vary depending on how | 
|  | /// the program was compiled. For example, programs for use with the DFA | 
|  | /// engine have a `.*?` inserted at the beginning of unanchored regular | 
|  | /// expressions. The actual starting point of the program is after the | 
|  | /// `.*?`. | 
|  | pub start: InstPtr, | 
|  | /// A set of equivalence classes for discriminating bytes in the compiled | 
|  | /// program. | 
|  | pub byte_classes: Vec<u8>, | 
|  | /// When true, this program can only match valid UTF-8. | 
|  | pub only_utf8: bool, | 
|  | /// When true, this program uses byte range instructions instead of Unicode | 
|  | /// range instructions. | 
|  | pub is_bytes: bool, | 
|  | /// When true, the program is compiled for DFA matching. For example, this | 
|  | /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored | 
|  | /// regexes. | 
|  | pub is_dfa: bool, | 
|  | /// When true, the program matches text in reverse (for use only in the | 
|  | /// DFA). | 
|  | pub is_reverse: bool, | 
|  | /// Whether the regex must match from the start of the input. | 
|  | pub is_anchored_start: bool, | 
|  | /// Whether the regex must match at the end of the input. | 
|  | pub is_anchored_end: bool, | 
|  | /// Whether this program contains a Unicode word boundary instruction. | 
|  | pub has_unicode_word_boundary: bool, | 
|  | /// A possibly empty machine for very quickly matching prefix literals. | 
|  | pub prefixes: LiteralSearcher, | 
|  | /// A limit on the size of the cache that the DFA is allowed to use while | 
|  | /// matching. | 
|  | /// | 
|  | /// The cache limit specifies approximately how much space we're willing to | 
|  | /// give to the state cache. Once the state cache exceeds the size, it is | 
|  | /// wiped and all states must be re-computed. | 
|  | /// | 
|  | /// Note that this value does not impact correctness. It can be set to 0 | 
|  | /// and the DFA will run just fine. (It will only ever store exactly one | 
|  | /// state in the cache, and will likely run very slowly, but it will work.) | 
|  | /// | 
|  | /// Also note that this limit is *per thread of execution*. That is, | 
|  | /// if the same regex is used to search text across multiple threads | 
|  | /// simultaneously, then the DFA cache is not shared. Instead, copies are | 
|  | /// made. | 
|  | pub dfa_size_limit: usize, | 
|  | } | 
|  |  | 
|  | impl Program { | 
|  | /// Creates an empty instruction sequence. Fields are given default | 
|  | /// values. | 
|  | pub fn new() -> Self { | 
|  | Program { | 
|  | insts: vec![], | 
|  | matches: vec![], | 
|  | captures: vec![], | 
|  | capture_name_idx: Arc::new(HashMap::new()), | 
|  | start: 0, | 
|  | byte_classes: vec![0; 256], | 
|  | only_utf8: true, | 
|  | is_bytes: false, | 
|  | is_dfa: false, | 
|  | is_reverse: false, | 
|  | is_anchored_start: false, | 
|  | is_anchored_end: false, | 
|  | has_unicode_word_boundary: false, | 
|  | prefixes: LiteralSearcher::empty(), | 
|  | dfa_size_limit: 2 * (1 << 20), | 
|  | } | 
|  | } | 
|  |  | 
|  | /// If pc is an index to a no-op instruction (like Save), then return the | 
|  | /// next pc that is not a no-op instruction. | 
|  | pub fn skip(&self, mut pc: usize) -> usize { | 
|  | loop { | 
|  | match self[pc] { | 
|  | Inst::Save(ref i) => pc = i.goto, | 
|  | _ => return pc, | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Return true if and only if an execution engine at instruction `pc` will | 
|  | /// always lead to a match. | 
|  | pub fn leads_to_match(&self, pc: usize) -> bool { | 
|  | if self.matches.len() > 1 { | 
|  | // If we have a regex set, then we have more than one ending | 
|  | // state, so leading to one of those states is generally | 
|  | // meaningless. | 
|  | return false; | 
|  | } | 
|  | match self[self.skip(pc)] { | 
|  | Inst::Match(_) => true, | 
|  | _ => false, | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Returns true if the current configuration demands that an implicit | 
|  | /// `.*?` be prepended to the instruction sequence. | 
|  | pub fn needs_dotstar(&self) -> bool { | 
|  | self.is_dfa && !self.is_reverse && !self.is_anchored_start | 
|  | } | 
|  |  | 
|  | /// Returns true if this program uses Byte instructions instead of | 
|  | /// Char/Range instructions. | 
|  | pub fn uses_bytes(&self) -> bool { | 
|  | self.is_bytes || self.is_dfa | 
|  | } | 
|  |  | 
|  | /// Returns true if this program exclusively matches valid UTF-8 bytes. | 
|  | /// | 
|  | /// That is, if an invalid UTF-8 byte is seen, then no match is possible. | 
|  | pub fn only_utf8(&self) -> bool { | 
|  | self.only_utf8 | 
|  | } | 
|  |  | 
|  | /// Return the approximate heap usage of this instruction sequence in | 
|  | /// bytes. | 
|  | pub fn approximate_size(&self) -> usize { | 
|  | // The only instruction that uses heap space is Ranges (for | 
|  | // Unicode codepoint programs) to store non-overlapping codepoint | 
|  | // ranges. To keep this operation constant time, we ignore them. | 
|  | (self.len() * mem::size_of::<Inst>()) | 
|  | + (self.matches.len() * mem::size_of::<InstPtr>()) | 
|  | + (self.captures.len() * mem::size_of::<Option<String>>()) | 
|  | + (self.capture_name_idx.len() | 
|  | * (mem::size_of::<String>() + mem::size_of::<usize>())) | 
|  | + (self.byte_classes.len() * mem::size_of::<u8>()) | 
|  | + self.prefixes.approximate_size() | 
|  | } | 
|  | } | 
|  |  | 
|  | impl Deref for Program { | 
|  | type Target = [Inst]; | 
|  |  | 
|  | #[cfg_attr(feature = "perf-inline", inline(always))] | 
|  | fn deref(&self) -> &Self::Target { | 
|  | &*self.insts | 
|  | } | 
|  | } | 
|  |  | 
|  | impl fmt::Debug for Program { | 
|  | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | 
|  | use self::Inst::*; | 
|  |  | 
|  | fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { | 
|  | if goto == cur + 1 { | 
|  | fmtd | 
|  | } else { | 
|  | format!("{} (goto: {})", fmtd, goto) | 
|  | } | 
|  | } | 
|  |  | 
|  | fn visible_byte(b: u8) -> String { | 
|  | use std::ascii::escape_default; | 
|  | let escaped = escape_default(b).collect::<Vec<u8>>(); | 
|  | String::from_utf8_lossy(&escaped).into_owned() | 
|  | } | 
|  |  | 
|  | for (pc, inst) in self.iter().enumerate() { | 
|  | match *inst { | 
|  | Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?, | 
|  | Save(ref inst) => { | 
|  | let s = format!("{:04} Save({})", pc, inst.slot); | 
|  | write!(f, "{}", with_goto(pc, inst.goto, s))?; | 
|  | } | 
|  | Split(ref inst) => { | 
|  | write!( | 
|  | f, | 
|  | "{:04} Split({}, {})", | 
|  | pc, inst.goto1, inst.goto2 | 
|  | )?; | 
|  | } | 
|  | EmptyLook(ref inst) => { | 
|  | let s = format!("{:?}", inst.look); | 
|  | write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; | 
|  | } | 
|  | Char(ref inst) => { | 
|  | let s = format!("{:?}", inst.c); | 
|  | write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; | 
|  | } | 
|  | Ranges(ref inst) => { | 
|  | let ranges = inst | 
|  | .ranges | 
|  | .iter() | 
|  | .map(|r| format!("{:?}-{:?}", r.0, r.1)) | 
|  | .collect::<Vec<String>>() | 
|  | .join(", "); | 
|  | write!( | 
|  | f, | 
|  | "{:04} {}", | 
|  | pc, | 
|  | with_goto(pc, inst.goto, ranges) | 
|  | )?; | 
|  | } | 
|  | Bytes(ref inst) => { | 
|  | let s = format!( | 
|  | "Bytes({}, {})", | 
|  | visible_byte(inst.start), | 
|  | visible_byte(inst.end) | 
|  | ); | 
|  | write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; | 
|  | } | 
|  | } | 
|  | if pc == self.start { | 
|  | write!(f, " (start)")?; | 
|  | } | 
|  | writeln!(f)?; | 
|  | } | 
|  | Ok(()) | 
|  | } | 
|  | } | 
|  |  | 
|  | impl<'a> IntoIterator for &'a Program { | 
|  | type Item = &'a Inst; | 
|  | type IntoIter = slice::Iter<'a, Inst>; | 
|  | fn into_iter(self) -> Self::IntoIter { | 
|  | self.iter() | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Inst is an instruction code in a Regex program. | 
|  | /// | 
|  | /// Regrettably, a regex program either contains Unicode codepoint | 
|  | /// instructions (Char and Ranges) or it contains byte instructions (Bytes). | 
|  | /// A regex program can never contain both. | 
|  | /// | 
|  | /// It would be worth investigating splitting this into two distinct types and | 
|  | /// then figuring out how to make the matching engines polymorphic over those | 
|  | /// types without sacrificing performance. | 
|  | /// | 
|  | /// Other than the benefit of moving invariants into the type system, another | 
|  | /// benefit is the decreased size. If we remove the `Char` and `Ranges` | 
|  | /// instructions from the `Inst` enum, then its size shrinks from 32 bytes to | 
|  | /// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges` | 
|  | /// variant.) Given that byte based machines are typically much bigger than | 
|  | /// their Unicode analogues (because they can decode UTF-8 directly), this ends | 
|  | /// up being a pretty significant savings. | 
|  | #[derive(Clone, Debug)] | 
|  | pub enum Inst { | 
|  | /// Match indicates that the program has reached a match state. | 
|  | /// | 
|  | /// The number in the match corresponds to the Nth logical regular | 
|  | /// expression in this program. This index is always 0 for normal regex | 
|  | /// programs. Values greater than 0 appear when compiling regex sets, and | 
|  | /// each match instruction gets its own unique value. The value corresponds | 
|  | /// to the Nth regex in the set. | 
|  | Match(usize), | 
|  | /// Save causes the program to save the current location of the input in | 
|  | /// the slot indicated by InstSave. | 
|  | Save(InstSave), | 
|  | /// Split causes the program to diverge to one of two paths in the | 
|  | /// program, preferring goto1 in InstSplit. | 
|  | Split(InstSplit), | 
|  | /// EmptyLook represents a zero-width assertion in a regex program. A | 
|  | /// zero-width assertion does not consume any of the input text. | 
|  | EmptyLook(InstEmptyLook), | 
|  | /// Char requires the regex program to match the character in InstChar at | 
|  | /// the current position in the input. | 
|  | Char(InstChar), | 
|  | /// Ranges requires the regex program to match the character at the current | 
|  | /// position in the input with one of the ranges specified in InstRanges. | 
|  | Ranges(InstRanges), | 
|  | /// Bytes is like Ranges, except it expresses a single byte range. It is | 
|  | /// used in conjunction with Split instructions to implement multi-byte | 
|  | /// character classes. | 
|  | Bytes(InstBytes), | 
|  | } | 
|  |  | 
|  | impl Inst { | 
|  | /// Returns true if and only if this is a match instruction. | 
|  | pub fn is_match(&self) -> bool { | 
|  | match *self { | 
|  | Inst::Match(_) => true, | 
|  | _ => false, | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Representation of the Save instruction. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct InstSave { | 
|  | /// The next location to execute in the program. | 
|  | pub goto: InstPtr, | 
|  | /// The capture slot (there are two slots for every capture in a regex, | 
|  | /// including the zeroth capture for the entire match). | 
|  | pub slot: usize, | 
|  | } | 
|  |  | 
|  | /// Representation of the Split instruction. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct InstSplit { | 
|  | /// The first instruction to try. A match resulting from following goto1 | 
|  | /// has precedence over a match resulting from following goto2. | 
|  | pub goto1: InstPtr, | 
|  | /// The second instruction to try. A match resulting from following goto1 | 
|  | /// has precedence over a match resulting from following goto2. | 
|  | pub goto2: InstPtr, | 
|  | } | 
|  |  | 
|  | /// Representation of the `EmptyLook` instruction. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct InstEmptyLook { | 
|  | /// The next location to execute in the program if this instruction | 
|  | /// succeeds. | 
|  | pub goto: InstPtr, | 
|  | /// The type of zero-width assertion to check. | 
|  | pub look: EmptyLook, | 
|  | } | 
|  |  | 
|  | /// The set of zero-width match instructions. | 
|  | #[derive(Clone, Copy, Debug, PartialEq, Eq)] | 
|  | pub enum EmptyLook { | 
|  | /// Start of line or input. | 
|  | StartLine, | 
|  | /// End of line or input. | 
|  | EndLine, | 
|  | /// Start of input. | 
|  | StartText, | 
|  | /// End of input. | 
|  | EndText, | 
|  | /// Word character on one side and non-word character on other. | 
|  | WordBoundary, | 
|  | /// Word character on both sides or non-word character on both sides. | 
|  | NotWordBoundary, | 
|  | /// ASCII word boundary. | 
|  | WordBoundaryAscii, | 
|  | /// Not ASCII word boundary. | 
|  | NotWordBoundaryAscii, | 
|  | } | 
|  |  | 
|  | /// Representation of the Char instruction. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct InstChar { | 
|  | /// The next location to execute in the program if this instruction | 
|  | /// succeeds. | 
|  | pub goto: InstPtr, | 
|  | /// The character to test. | 
|  | pub c: char, | 
|  | } | 
|  |  | 
|  | /// Representation of the Ranges instruction. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct InstRanges { | 
|  | /// The next location to execute in the program if this instruction | 
|  | /// succeeds. | 
|  | pub goto: InstPtr, | 
|  | /// The set of Unicode scalar value ranges to test. | 
|  | pub ranges: Box<[(char, char)]>, | 
|  | } | 
|  |  | 
|  | impl InstRanges { | 
|  | /// Tests whether the given input character matches this instruction. | 
|  | pub fn matches(&self, c: Char) -> bool { | 
|  | // This speeds up the `match_class_unicode` benchmark by checking | 
|  | // some common cases quickly without binary search. e.g., Matching | 
|  | // a Unicode class on predominantly ASCII text. | 
|  | for r in self.ranges.iter().take(4) { | 
|  | if c < r.0 { | 
|  | return false; | 
|  | } | 
|  | if c <= r.1 { | 
|  | return true; | 
|  | } | 
|  | } | 
|  | self.ranges | 
|  | .binary_search_by(|r| { | 
|  | if r.1 < c { | 
|  | Ordering::Less | 
|  | } else if r.0 > c { | 
|  | Ordering::Greater | 
|  | } else { | 
|  | Ordering::Equal | 
|  | } | 
|  | }) | 
|  | .is_ok() | 
|  | } | 
|  |  | 
|  | /// Return the number of distinct characters represented by all of the | 
|  | /// ranges. | 
|  | pub fn num_chars(&self) -> usize { | 
|  | self.ranges | 
|  | .iter() | 
|  | .map(|&(s, e)| 1 + (e as u32) - (s as u32)) | 
|  | .sum::<u32>() as usize | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Representation of the Bytes instruction. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct InstBytes { | 
|  | /// The next location to execute in the program if this instruction | 
|  | /// succeeds. | 
|  | pub goto: InstPtr, | 
|  | /// The start (inclusive) of this byte range. | 
|  | pub start: u8, | 
|  | /// The end (inclusive) of this byte range. | 
|  | pub end: u8, | 
|  | } | 
|  |  | 
|  | impl InstBytes { | 
|  | /// Returns true if and only if the given byte is in this range. | 
|  | pub fn matches(&self, byte: u8) -> bool { | 
|  | self.start <= byte && byte <= self.end | 
|  | } | 
|  | } | 
|  |  | 
|  | #[cfg(test)] | 
|  | mod test { | 
|  | #[test] | 
|  | #[cfg(target_pointer_width = "64")] | 
|  | fn test_size_of_inst() { | 
|  | use std::mem::size_of; | 
|  |  | 
|  | use super::Inst; | 
|  |  | 
|  | assert_eq!(32, size_of::<Inst>()); | 
|  | } | 
|  | } |