src/pikevm.rs - platform/external/rust/crates/regex - Git at Google

 // This module implements the Pike VM. That is, it guarantees linear time
 // search of a regex on any text with memory use proportional to the size of
 // the regex.
 //
 // It is equal in power to the backtracking engine in this crate, except the
 // backtracking engine is typically faster on small regexes/texts at the
 // expense of a bigger memory footprint.
 //
 // It can do more than the DFA can (specifically, record capture locations
 // and execute Unicode word boundary assertions), but at a slower speed.
 // Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding
 // epsilon transitions. That is, the Pike VM engine can be in multiple states
 // at once where as the DFA is only ever in one state at a time.
 //
 // Therefore, the Pike VM is generally treated as the fallback when the other
 // matching engines either aren't feasible to run or are insufficient.

 use std::mem;

 use exec::ProgramCache;
 use input::{Input, InputAt};
 use prog::{InstPtr, Program};
 use re_trait::Slot;
 use sparse::SparseSet;

 /// An NFA simulation matching engine.
 #[derive(Debug)]
 pub struct Fsm<'r, I> {
     /// The sequence of opcodes (among other things) that is actually executed.
     ///
     /// The program may be byte oriented or Unicode codepoint oriented.
     prog: &'r Program,
     /// An explicit stack used for following epsilon transitions. (This is
     /// borrowed from the cache.)
     stack: &'r mut Vec<FollowEpsilon>,
     /// The input to search.
     input: I,
 }

 /// A cached allocation that can be reused on each execution.
 #[derive(Clone, Debug)]
 pub struct Cache {
     /// A pair of ordered sets for tracking NFA states.
     clist: Threads,
     nlist: Threads,
     /// An explicit stack used for following epsilon transitions.
     stack: Vec<FollowEpsilon>,
 }

 /// An ordered set of NFA states and their captures.
 #[derive(Clone, Debug)]
 struct Threads {
     /// An ordered set of opcodes (each opcode is an NFA state).
     set: SparseSet,
     /// Captures for every NFA state.
     ///
     /// It is stored in row-major order, where the columns are the capture
     /// slots and the rows are the states.
     caps: Vec<Slot>,
     /// The number of capture slots stored per thread. (Every capture has
     /// two slots.)
     slots_per_thread: usize,
 }

 /// A representation of an explicit stack frame when following epsilon
 /// transitions. This is used to avoid recursion.
 #[derive(Clone, Debug)]
 enum FollowEpsilon {
     /// Follow transitions at the given instruction pointer.
     IP(InstPtr),
     /// Restore the capture slot with the given position in the input.
     Capture { slot: usize, pos: Slot },
 }

 impl Cache {
     /// Create a new allocation used by the NFA machine to record execution
     /// and captures.
     pub fn new(_prog: &Program) -> Self {
         Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
     }
 }

 impl<'r, I: Input> Fsm<'r, I> {
     /// Execute the NFA matching engine.
     ///
     /// If there's a match, `exec` returns `true` and populates the given
     /// captures accordingly.
     pub fn exec(
         prog: &'r Program,
         cache: &ProgramCache,
         matches: &mut [bool],
         slots: &mut [Slot],
         quit_after_match: bool,
         input: I,
         start: usize,
         end: usize,
     ) -> bool {
         let mut cache = cache.borrow_mut();
         let cache = &mut cache.pikevm;
         cache.clist.resize(prog.len(), prog.captures.len());
         cache.nlist.resize(prog.len(), prog.captures.len());
         let at = input.at(start);
         Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_(
             &mut cache.clist,
             &mut cache.nlist,
             matches,
             slots,
             quit_after_match,
             at,
             end,
         )
     }

     fn exec_(
         &mut self,
         mut clist: &mut Threads,
         mut nlist: &mut Threads,
         matches: &mut [bool],
         slots: &mut [Slot],
         quit_after_match: bool,
         mut at: InputAt,
         end: usize,
     ) -> bool {
         let mut matched = false;
         let mut all_matched = false;
         clist.set.clear();
         nlist.set.clear();
         'LOOP: loop {
             if clist.set.is_empty() {
                 // Three ways to bail out when our current set of threads is
                 // empty.
                 //
                 // 1. We have a match---so we're done exploring any possible
                 //    alternatives. Time to quit. (We can't do this if we're
                 //    looking for matches for multiple regexes, unless we know
                 //    they all matched.)
                 //
                 // 2. If the expression starts with a '^' we can terminate as
                 //    soon as the last thread dies.
                 if (matched && matches.len() <= 1)
                     || all_matched
                     || (!at.is_start() && self.prog.is_anchored_start)
                 {
                     break;
                 }

                 // 3. If there's a literal prefix for the program, try to
                 //    jump ahead quickly. If it can't be found, then we can
                 //    bail out early.
                 if !self.prog.prefixes.is_empty() {
                     at = match self.input.prefix_at(&self.prog.prefixes, at) {
                         None => break,
                         Some(at) => at,
                     };
                 }
             }

             // This simulates a preceding '.*?' for every regex by adding
             // a state starting at the current position in the input for the
             // beginning of the program only if we don't already have a match.
             if clist.set.is_empty()
                 || (!self.prog.is_anchored_start && !all_matched)
             {
                 self.add(&mut clist, slots, 0, at);
             }
             // The previous call to "add" actually inspects the position just
             // before the current character. For stepping through the machine,
             // we can to look at the current character, so we advance the
             // input.
             let at_next = self.input.at(at.next_pos());
             for i in 0..clist.set.len() {
                 let ip = clist.set[i];
                 if self.step(
                     &mut nlist,
                     matches,
                     slots,
                     clist.caps(ip),
                     ip,
                     at,
                     at_next,
                 ) {
                     matched = true;
                     all_matched = all_matched || matches.iter().all(|&b| b);
                     if quit_after_match {
                         // If we only care if a match occurs (not its
                         // position), then we can quit right now.
                         break 'LOOP;
                     }
                     if self.prog.matches.len() == 1 {
                         // We don't need to check the rest of the threads
                         // in this set because we've matched something
                         // ("leftmost-first"). However, we still need to check
                         // threads in the next set to support things like
                         // greedy matching.
                         //
                         // This is only true on normal regexes. For regex sets,
                         // we need to mush on to observe other matches.
                         break;
                     }
                 }
             }
             if at.pos() >= end {
                 break;
             }
             at = at_next;
             mem::swap(clist, nlist);
             nlist.set.clear();
         }
         matched
     }

     /// Step through the input, one token (byte or codepoint) at a time.
     ///
     /// nlist is the set of states that will be processed on the next token
     /// in the input.
     ///
     /// caps is the set of captures passed by the caller of the NFA. They are
     /// written to only when a match state is visited.
     ///
     /// thread_caps is the set of captures set for the current NFA state, ip.
     ///
     /// at and at_next are the current and next positions in the input. at or
     /// at_next may be EOF.
     fn step(
         &mut self,
         nlist: &mut Threads,
         matches: &mut [bool],
         slots: &mut [Slot],
         thread_caps: &mut [Option<usize>],
         ip: usize,
         at: InputAt,
         at_next: InputAt,
     ) -> bool {
         use prog::Inst::*;
         match self.prog[ip] {
             Match(match_slot) => {
                 if match_slot < matches.len() {
                     matches[match_slot] = true;
                 }
                 for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
                     *slot = *val;
                 }
                 true
             }
             Char(ref inst) => {
                 if inst.c == at.char() {
                     self.add(nlist, thread_caps, inst.goto, at_next);
                 }
                 false
             }
             Ranges(ref inst) => {
                 if inst.matches(at.char()) {
                     self.add(nlist, thread_caps, inst.goto, at_next);
                 }
                 false
             }
             Bytes(ref inst) => {
                 if let Some(b) = at.byte() {
                     if inst.matches(b) {
                         self.add(nlist, thread_caps, inst.goto, at_next);
                     }
                 }
                 false
             }
             EmptyLook(_) | Save(_) | Split(_) => false,
         }
     }

     /// Follows epsilon transitions and adds them for processing to nlist,
     /// starting at and including ip.
     fn add(
         &mut self,
         nlist: &mut Threads,
         thread_caps: &mut [Option<usize>],
         ip: usize,
         at: InputAt,
     ) {
         self.stack.push(FollowEpsilon::IP(ip));
         while let Some(frame) = self.stack.pop() {
             match frame {
                 FollowEpsilon::IP(ip) => {
                     self.add_step(nlist, thread_caps, ip, at);
                 }
                 FollowEpsilon::Capture { slot, pos } => {
                     thread_caps[slot] = pos;
                 }
             }
         }
     }

     /// A helper function for add that avoids excessive pushing to the stack.
     fn add_step(
         &mut self,
         nlist: &mut Threads,
         thread_caps: &mut [Option<usize>],
         mut ip: usize,
         at: InputAt,
     ) {
         // Instead of pushing and popping to the stack, we mutate ip as we
         // traverse the set of states. We only push to the stack when we
         // absolutely need recursion (restoring captures or following a
         // branch).
         use prog::Inst::*;
         loop {
             // Don't visit states we've already added.
             if nlist.set.contains(ip) {
                 return;
             }
             nlist.set.insert(ip);
             match self.prog[ip] {
                 EmptyLook(ref inst) => {
                     if self.input.is_empty_match(at, inst) {
                         ip = inst.goto;
                     }
                 }
                 Save(ref inst) => {
                     if inst.slot < thread_caps.len() {
                         self.stack.push(FollowEpsilon::Capture {
                             slot: inst.slot,
                             pos: thread_caps[inst.slot],
                         });
                         thread_caps[inst.slot] = Some(at.pos());
                     }
                     ip = inst.goto;
                 }
                 Split(ref inst) => {
                     self.stack.push(FollowEpsilon::IP(inst.goto2));
                     ip = inst.goto1;
                 }
                 Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
                     let t = &mut nlist.caps(ip);
                     for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
                         *slot = *val;
                     }
                     return;
                 }
             }
         }
     }
 }

 impl Threads {
     fn new() -> Self {
         Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
     }

     fn resize(&mut self, num_insts: usize, ncaps: usize) {
         if num_insts == self.set.capacity() {
             return;
         }
         self.slots_per_thread = ncaps * 2;
         self.set = SparseSet::new(num_insts);
         self.caps = vec![None; self.slots_per_thread * num_insts];
     }

     fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
         let i = pc * self.slots_per_thread;
         &mut self.caps[i..i + self.slots_per_thread]
     }
 }
	// This module implements the Pike VM. That is, it guarantees linear time
	// search of a regex on any text with memory use proportional to the size of
	// the regex.
	//
	// It is equal in power to the backtracking engine in this crate, except the
	// backtracking engine is typically faster on small regexes/texts at the
	// expense of a bigger memory footprint.
	//
	// It can do more than the DFA can (specifically, record capture locations
	// and execute Unicode word boundary assertions), but at a slower speed.
	// Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding
	// epsilon transitions. That is, the Pike VM engine can be in multiple states
	// at once where as the DFA is only ever in one state at a time.
	//
	// Therefore, the Pike VM is generally treated as the fallback when the other
	// matching engines either aren't feasible to run or are insufficient.

	use std::mem;

	use exec::ProgramCache;
	use input::{Input, InputAt};
	use prog::{InstPtr, Program};
	use re_trait::Slot;
	use sparse::SparseSet;

	/// An NFA simulation matching engine.
	#[derive(Debug)]
	pub struct Fsm<'r, I> {
	/// The sequence of opcodes (among other things) that is actually executed.
	///
	/// The program may be byte oriented or Unicode codepoint oriented.
	prog: &'r Program,
	/// An explicit stack used for following epsilon transitions. (This is
	/// borrowed from the cache.)
	stack: &'r mut Vec<FollowEpsilon>,
	/// The input to search.
	input: I,
	}

	/// A cached allocation that can be reused on each execution.
	#[derive(Clone, Debug)]
	pub struct Cache {
	/// A pair of ordered sets for tracking NFA states.
	clist: Threads,
	nlist: Threads,
	/// An explicit stack used for following epsilon transitions.
	stack: Vec<FollowEpsilon>,
	}

	/// An ordered set of NFA states and their captures.
	#[derive(Clone, Debug)]
	struct Threads {
	/// An ordered set of opcodes (each opcode is an NFA state).
	set: SparseSet,
	/// Captures for every NFA state.
	///
	/// It is stored in row-major order, where the columns are the capture
	/// slots and the rows are the states.
	caps: Vec<Slot>,
	/// The number of capture slots stored per thread. (Every capture has
	/// two slots.)
	slots_per_thread: usize,
	}

	/// A representation of an explicit stack frame when following epsilon
	/// transitions. This is used to avoid recursion.
	#[derive(Clone, Debug)]
	enum FollowEpsilon {
	/// Follow transitions at the given instruction pointer.
	IP(InstPtr),
	/// Restore the capture slot with the given position in the input.
	Capture { slot: usize, pos: Slot },
	}

	impl Cache {
	/// Create a new allocation used by the NFA machine to record execution
	/// and captures.
	pub fn new(_prog: &Program) -> Self {
	Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
	}
	}

	impl<'r, I: Input> Fsm<'r, I> {
	/// Execute the NFA matching engine.
	///
	/// If there's a match, `exec` returns `true` and populates the given
	/// captures accordingly.
	pub fn exec(
	prog: &'r Program,
	cache: &ProgramCache,
	matches: &mut [bool],
	slots: &mut [Slot],
	quit_after_match: bool,
	input: I,
	start: usize,
	end: usize,
	) -> bool {
	let mut cache = cache.borrow_mut();
	let cache = &mut cache.pikevm;
	cache.clist.resize(prog.len(), prog.captures.len());
	cache.nlist.resize(prog.len(), prog.captures.len());
	let at = input.at(start);
	Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_(
	&mut cache.clist,
	&mut cache.nlist,
	matches,
	slots,
	quit_after_match,
	at,
	end,
	)
	}

	fn exec_(
	&mut self,
	mut clist: &mut Threads,
	mut nlist: &mut Threads,
	matches: &mut [bool],
	slots: &mut [Slot],
	quit_after_match: bool,
	mut at: InputAt,
	end: usize,
	) -> bool {
	let mut matched = false;
	let mut all_matched = false;
	clist.set.clear();
	nlist.set.clear();
	'LOOP: loop {
	if clist.set.is_empty() {
	// Three ways to bail out when our current set of threads is
	// empty.
	//
	// 1. We have a match---so we're done exploring any possible
	// alternatives. Time to quit. (We can't do this if we're
	// looking for matches for multiple regexes, unless we know
	// they all matched.)
	//
	// 2. If the expression starts with a '^' we can terminate as
	// soon as the last thread dies.
	if (matched && matches.len() <= 1)
	\|\| all_matched
	\|\| (!at.is_start() && self.prog.is_anchored_start)
	{
	break;
	}

	// 3. If there's a literal prefix for the program, try to
	// jump ahead quickly. If it can't be found, then we can
	// bail out early.
	if !self.prog.prefixes.is_empty() {
	at = match self.input.prefix_at(&self.prog.prefixes, at) {
	None => break,
	Some(at) => at,
	};
	}
	}

	// This simulates a preceding '.*?' for every regex by adding
	// a state starting at the current position in the input for the
	// beginning of the program only if we don't already have a match.
	if clist.set.is_empty()
	\|\| (!self.prog.is_anchored_start && !all_matched)
	{
	self.add(&mut clist, slots, 0, at);
	}
	// The previous call to "add" actually inspects the position just
	// before the current character. For stepping through the machine,
	// we can to look at the current character, so we advance the
	// input.
	let at_next = self.input.at(at.next_pos());
	for i in 0..clist.set.len() {
	let ip = clist.set[i];
	if self.step(
	&mut nlist,
	matches,
	slots,
	clist.caps(ip),
	ip,
	at,
	at_next,
	) {
	matched = true;
	all_matched = all_matched \|\| matches.iter().all(\|&b\| b);
	if quit_after_match {
	// If we only care if a match occurs (not its
	// position), then we can quit right now.
	break 'LOOP;
	}
	if self.prog.matches.len() == 1 {
	// We don't need to check the rest of the threads
	// in this set because we've matched something
	// ("leftmost-first"). However, we still need to check
	// threads in the next set to support things like
	// greedy matching.
	//
	// This is only true on normal regexes. For regex sets,
	// we need to mush on to observe other matches.
	break;
	}
	}
	}
	if at.pos() >= end {
	break;
	}
	at = at_next;
	mem::swap(clist, nlist);
	nlist.set.clear();
	}
	matched
	}

	/// Step through the input, one token (byte or codepoint) at a time.
	///
	/// nlist is the set of states that will be processed on the next token
	/// in the input.
	///
	/// caps is the set of captures passed by the caller of the NFA. They are
	/// written to only when a match state is visited.
	///
	/// thread_caps is the set of captures set for the current NFA state, ip.
	///
	/// at and at_next are the current and next positions in the input. at or
	/// at_next may be EOF.
	fn step(
	&mut self,
	nlist: &mut Threads,
	matches: &mut [bool],
	slots: &mut [Slot],
	thread_caps: &mut [Option<usize>],
	ip: usize,
	at: InputAt,
	at_next: InputAt,
	) -> bool {
	use prog::Inst::*;
	match self.prog[ip] {
	Match(match_slot) => {
	if match_slot < matches.len() {
	matches[match_slot] = true;
	}
	for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
	slot = val;
	}
	true
	}
	Char(ref inst) => {
	if inst.c == at.char() {
	self.add(nlist, thread_caps, inst.goto, at_next);
	}
	false
	}
	Ranges(ref inst) => {
	if inst.matches(at.char()) {
	self.add(nlist, thread_caps, inst.goto, at_next);
	}
	false
	}
	Bytes(ref inst) => {
	if let Some(b) = at.byte() {
	if inst.matches(b) {
	self.add(nlist, thread_caps, inst.goto, at_next);
	}
	}
	false
	}
	EmptyLook(_) \| Save(_) \| Split(_) => false,
	}
	}

	/// Follows epsilon transitions and adds them for processing to nlist,
	/// starting at and including ip.
	fn add(
	&mut self,
	nlist: &mut Threads,
	thread_caps: &mut [Option<usize>],
	ip: usize,
	at: InputAt,
	) {
	self.stack.push(FollowEpsilon::IP(ip));
	while let Some(frame) = self.stack.pop() {
	match frame {
	FollowEpsilon::IP(ip) => {
	self.add_step(nlist, thread_caps, ip, at);
	}
	FollowEpsilon::Capture { slot, pos } => {
	thread_caps[slot] = pos;
	}
	}
	}
	}

	/// A helper function for add that avoids excessive pushing to the stack.
	fn add_step(
	&mut self,
	nlist: &mut Threads,
	thread_caps: &mut [Option<usize>],
	mut ip: usize,
	at: InputAt,
	) {
	// Instead of pushing and popping to the stack, we mutate ip as we
	// traverse the set of states. We only push to the stack when we
	// absolutely need recursion (restoring captures or following a
	// branch).
	use prog::Inst::*;
	loop {
	// Don't visit states we've already added.
	if nlist.set.contains(ip) {
	return;
	}
	nlist.set.insert(ip);
	match self.prog[ip] {
	EmptyLook(ref inst) => {
	if self.input.is_empty_match(at, inst) {
	ip = inst.goto;
	}
	}
	Save(ref inst) => {
	if inst.slot < thread_caps.len() {
	self.stack.push(FollowEpsilon::Capture {
	slot: inst.slot,
	pos: thread_caps[inst.slot],
	});
	thread_caps[inst.slot] = Some(at.pos());
	}
	ip = inst.goto;
	}
	Split(ref inst) => {
	self.stack.push(FollowEpsilon::IP(inst.goto2));
	ip = inst.goto1;
	}
	Match(_) \| Char(_) \| Ranges(_) \| Bytes(_) => {
	let t = &mut nlist.caps(ip);
	for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
	slot = val;
	}
	return;
	}
	}
	}
	}
	}

	impl Threads {
	fn new() -> Self {
	Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
	}

	fn resize(&mut self, num_insts: usize, ncaps: usize) {
	if num_insts == self.set.capacity() {
	return;
	}
	self.slots_per_thread = ncaps * 2;
	self.set = SparseSet::new(num_insts);
	self.caps = vec![None; self.slots_per_thread * num_insts];
	}

	fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
	let i = pc * self.slots_per_thread;
	&mut self.caps[i..i + self.slots_per_thread]
	}
	}