| use core::{cell::RefCell, fmt, mem}; |
| |
| use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec}; |
| |
| use crate::{ |
| dfa::{automaton::Automaton, dense, DEAD}, |
| util::{ |
| alphabet, |
| primitives::{PatternID, StateID}, |
| }, |
| }; |
| |
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
///    Hopcroft's algorithm starts by creating two partitions of DFA states
///    that are known to NOT be equivalent: match states and non-match states.
///    The algorithm proceeds by progressively refining these partitions into
///    smaller partitions. If we could start with more partitions, then we
///    could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
///    every state in the partition for *every* element in the alphabet. (This
///    is why using byte classes can significantly decrease minimization times,
///    since byte classes shrink the alphabet.) This is quite costly and there
///    is perhaps some redundant work being performed depending on the specific
///    states in the set. For example, we might be able to only visit some
///    elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
///    fewer states to deal with, then it should run faster. A prime example
///    of this might be large Unicode classes, which are generated in way that
///    can create a lot of redundant states. (Some work has been done on this
///    point during NFA compilation via the algorithm described in the
///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
///    paper.)
pub(crate) struct Minimizer<'a> {
    /// The dense DFA being minimized in place.
    dfa: &'a mut dense::OwnedDFA,
    /// For each state (by index) and each alphabet unit, the list of states
    /// that transition on that unit *into* the state. Built once up front by
    /// `incoming_transitions`.
    in_transitions: Vec<Vec<Vec<StateID>>>,
    /// The current partitioning of states into candidate equivalence classes.
    partitions: Vec<StateSet>,
    /// The worklist of sets still waiting to be used to refine `partitions`.
    waiting: Vec<StateSet>,
}
| |
| impl<'a> fmt::Debug for Minimizer<'a> { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| f.debug_struct("Minimizer") |
| .field("dfa", &self.dfa) |
| .field("in_transitions", &self.in_transitions) |
| .field("partitions", &self.partitions) |
| .field("waiting", &self.waiting) |
| .finish() |
| } |
| } |
| |
/// A set of states. A state set makes up a single partition in Hopcroft's
/// algorithm.
///
/// It is represented by an ordered set of state identifiers. We use shared
/// ownership so that a single state set can be in both the set of partitions
/// and in the set of waiting sets simultaneously without an additional
/// allocation. Generally, once a state set is built, it becomes immutable.
///
/// We use this representation because it avoids the overhead of more
/// traditional set data structures (HashSet/BTreeSet), and also because
/// computing intersection/subtraction on this representation is especially
/// fast.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
struct StateSet {
    // After `canonicalize` is called, this vec is sorted ascending with no
    // duplicates; `min`, `intersection` and `subtract` rely on that order.
    ids: Rc<RefCell<Vec<StateID>>>,
}
| |
| impl<'a> Minimizer<'a> { |
| pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> { |
| let in_transitions = Minimizer::incoming_transitions(dfa); |
| let partitions = Minimizer::initial_partitions(dfa); |
| let waiting = partitions.clone(); |
| Minimizer { dfa, in_transitions, partitions, waiting } |
| } |
| |
| pub fn run(mut self) { |
| let stride2 = self.dfa.stride2(); |
| let as_state_id = |index: usize| -> StateID { |
| StateID::new(index << stride2).unwrap() |
| }; |
| let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 }; |
| |
| let mut incoming = StateSet::empty(); |
| let mut scratch1 = StateSet::empty(); |
| let mut scratch2 = StateSet::empty(); |
| let mut newparts = vec![]; |
| |
| // This loop is basically Hopcroft's algorithm. Everything else is just |
| // shuffling data around to fit our representation. |
| while let Some(set) = self.waiting.pop() { |
| for b in self.dfa.byte_classes().iter() { |
| self.find_incoming_to(b, &set, &mut incoming); |
| // If incoming is empty, then the intersection with any other |
| // set must also be empty. So 'newparts' just ends up being |
| // 'self.partitions'. So there's no need to go through the loop |
| // below. |
| // |
| // This actually turns out to be rather large optimization. On |
| // the order of making minimization 4-5x faster. It's likely |
| // that the vast majority of all states have very few incoming |
| // transitions. |
| if incoming.is_empty() { |
| continue; |
| } |
| |
| for p in 0..self.partitions.len() { |
| self.partitions[p].intersection(&incoming, &mut scratch1); |
| if scratch1.is_empty() { |
| newparts.push(self.partitions[p].clone()); |
| continue; |
| } |
| |
| self.partitions[p].subtract(&incoming, &mut scratch2); |
| if scratch2.is_empty() { |
| newparts.push(self.partitions[p].clone()); |
| continue; |
| } |
| |
| let (x, y) = |
| (scratch1.deep_clone(), scratch2.deep_clone()); |
| newparts.push(x.clone()); |
| newparts.push(y.clone()); |
| match self.find_waiting(&self.partitions[p]) { |
| Some(i) => { |
| self.waiting[i] = x; |
| self.waiting.push(y); |
| } |
| None => { |
| if x.len() <= y.len() { |
| self.waiting.push(x); |
| } else { |
| self.waiting.push(y); |
| } |
| } |
| } |
| } |
| newparts = mem::replace(&mut self.partitions, newparts); |
| newparts.clear(); |
| } |
| } |
| |
| // At this point, we now have a minimal partitioning of states, where |
| // each partition is an equivalence class of DFA states. Now we need to |
| // use this partitioning to update the DFA to only contain one state for |
| // each partition. |
| |
| // Create a map from DFA state ID to the representative ID of the |
| // equivalence class to which it belongs. The representative ID of an |
| // equivalence class of states is the minimum ID in that class. |
| let mut state_to_part = vec![DEAD; self.dfa.state_len()]; |
| for p in &self.partitions { |
| p.iter(|id| state_to_part[as_index(id)] = p.min()); |
| } |
| |
| // Generate a new contiguous sequence of IDs for minimal states, and |
| // create a map from equivalence IDs to the new IDs. Thus, the new |
| // minimal ID of *any* state in the unminimized DFA can be obtained |
| // with minimals_ids[state_to_part[old_id]]. |
| let mut minimal_ids = vec![DEAD; self.dfa.state_len()]; |
| let mut new_index = 0; |
| for state in self.dfa.states() { |
| if state_to_part[as_index(state.id())] == state.id() { |
| minimal_ids[as_index(state.id())] = as_state_id(new_index); |
| new_index += 1; |
| } |
| } |
| // The total number of states in the minimal DFA. |
| let minimal_count = new_index; |
| // Convenience function for remapping state IDs. This takes an old ID, |
| // looks up its Hopcroft partition and then maps that to the new ID |
| // range. |
| let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])]; |
| |
| // Re-map this DFA in place such that the only states remaining |
| // correspond to the representative states of every equivalence class. |
| for id in (0..self.dfa.state_len()).map(as_state_id) { |
| // If this state isn't a representative for an equivalence class, |
| // then we skip it since it won't appear in the minimal DFA. |
| if state_to_part[as_index(id)] != id { |
| continue; |
| } |
| self.dfa.remap_state(id, remap); |
| self.dfa.swap_states(id, minimal_ids[as_index(id)]); |
| } |
| // Trim off all unused states from the pre-minimized DFA. This |
| // represents all states that were merged into a non-singleton |
| // equivalence class of states, and appeared after the first state |
| // in each such class. (Because the state with the smallest ID in each |
| // equivalence class is its representative ID.) |
| self.dfa.truncate_states(minimal_count); |
| |
| // Update the new start states, which is now just the minimal ID of |
| // whatever state the old start state was collapsed into. Also, we |
| // collect everything before-hand to work around the borrow checker. |
| // We're already allocating so much that this is probably fine. If this |
| // turns out to be costly, then I guess add a `starts_mut` iterator. |
| let starts: Vec<_> = self.dfa.starts().collect(); |
| for (old_start_id, anchored, start_type) in starts { |
| self.dfa.set_start_state( |
| anchored, |
| start_type, |
| remap(old_start_id), |
| ); |
| } |
| |
| // Update the match state pattern ID list for multi-regexes. All we |
| // need to do is remap the match state IDs. The pattern ID lists are |
| // always the same as they were since match states with distinct |
| // pattern ID lists are always considered distinct states. |
| let mut pmap = BTreeMap::new(); |
| for (match_id, pattern_ids) in self.dfa.pattern_map() { |
| let new_id = remap(match_id); |
| pmap.insert(new_id, pattern_ids); |
| } |
| // This unwrap is OK because minimization never increases the number of |
| // match states or patterns in those match states. Since minimization |
| // runs after the pattern map has already been set at least once, we |
| // know that our match states cannot error. |
| self.dfa.set_pattern_map(&pmap).unwrap(); |
| |
| // In order to update the ID of the maximum match state, we need to |
| // find the maximum ID among all of the match states in the minimized |
| // DFA. This is not necessarily the new ID of the unminimized maximum |
| // match state, since that could have been collapsed with a much |
| // earlier match state. Therefore, to find the new max match state, |
| // we iterate over all previous match states, find their corresponding |
| // new minimal ID, and take the maximum of those. |
| let old = self.dfa.special().clone(); |
| let new = self.dfa.special_mut(); |
| // ... but only remap if we had match states. |
| if old.matches() { |
| new.min_match = StateID::MAX; |
| new.max_match = StateID::ZERO; |
| for i in as_index(old.min_match)..=as_index(old.max_match) { |
| let new_id = remap(as_state_id(i)); |
| if new_id < new.min_match { |
| new.min_match = new_id; |
| } |
| if new_id > new.max_match { |
| new.max_match = new_id; |
| } |
| } |
| } |
| // ... same, but for start states. |
| if old.starts() { |
| new.min_start = StateID::MAX; |
| new.max_start = StateID::ZERO; |
| for i in as_index(old.min_start)..=as_index(old.max_start) { |
| let new_id = remap(as_state_id(i)); |
| if new_id == DEAD { |
| continue; |
| } |
| if new_id < new.min_start { |
| new.min_start = new_id; |
| } |
| if new_id > new.max_start { |
| new.max_start = new_id; |
| } |
| } |
| if new.max_start == DEAD { |
| new.min_start = DEAD; |
| } |
| } |
| new.quit_id = remap(new.quit_id); |
| new.set_max(); |
| } |
| |
| fn find_waiting(&self, set: &StateSet) -> Option<usize> { |
| self.waiting.iter().position(|s| s == set) |
| } |
| |
| fn find_incoming_to( |
| &self, |
| b: alphabet::Unit, |
| set: &StateSet, |
| incoming: &mut StateSet, |
| ) { |
| incoming.clear(); |
| set.iter(|id| { |
| for &inid in |
| &self.in_transitions[self.dfa.to_index(id)][b.as_usize()] |
| { |
| incoming.add(inid); |
| } |
| }); |
| incoming.canonicalize(); |
| } |
| |
| fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> { |
| // For match states, we know that two match states with different |
| // pattern ID lists will *always* be distinct, so we can partition them |
| // initially based on that. |
| let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new(); |
| let mut is_quit = StateSet::empty(); |
| let mut no_match = StateSet::empty(); |
| for state in dfa.states() { |
| if dfa.is_match_state(state.id()) { |
| let mut pids = vec![]; |
| for i in 0..dfa.match_len(state.id()) { |
| pids.push(dfa.match_pattern(state.id(), i)); |
| } |
| matching |
| .entry(pids) |
| .or_insert(StateSet::empty()) |
| .add(state.id()); |
| } else if dfa.is_quit_state(state.id()) { |
| is_quit.add(state.id()); |
| } else { |
| no_match.add(state.id()); |
| } |
| } |
| |
| let mut sets: Vec<StateSet> = |
| matching.into_iter().map(|(_, set)| set).collect(); |
| sets.push(no_match); |
| sets.push(is_quit); |
| sets |
| } |
| |
| fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> { |
| let mut incoming = vec![]; |
| for _ in dfa.states() { |
| incoming.push(vec![vec![]; dfa.alphabet_len()]); |
| } |
| for state in dfa.states() { |
| for (b, next) in state.transitions() { |
| incoming[dfa.to_index(next)][b.as_usize()].push(state.id()); |
| } |
| } |
| incoming |
| } |
| } |
| |
| impl StateSet { |
| fn empty() -> StateSet { |
| StateSet { ids: Rc::new(RefCell::new(vec![])) } |
| } |
| |
| fn add(&mut self, id: StateID) { |
| self.ids.borrow_mut().push(id); |
| } |
| |
| fn min(&self) -> StateID { |
| self.ids.borrow()[0] |
| } |
| |
| fn canonicalize(&mut self) { |
| self.ids.borrow_mut().sort(); |
| self.ids.borrow_mut().dedup(); |
| } |
| |
| fn clear(&mut self) { |
| self.ids.borrow_mut().clear(); |
| } |
| |
| fn len(&self) -> usize { |
| self.ids.borrow().len() |
| } |
| |
| fn is_empty(&self) -> bool { |
| self.len() == 0 |
| } |
| |
| fn deep_clone(&self) -> StateSet { |
| let ids = self.ids.borrow().iter().cloned().collect(); |
| StateSet { ids: Rc::new(RefCell::new(ids)) } |
| } |
| |
| fn iter<F: FnMut(StateID)>(&self, mut f: F) { |
| for &id in self.ids.borrow().iter() { |
| f(id); |
| } |
| } |
| |
| fn intersection(&self, other: &StateSet, dest: &mut StateSet) { |
| dest.clear(); |
| if self.is_empty() || other.is_empty() { |
| return; |
| } |
| |
| let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); |
| let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); |
| let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); |
| loop { |
| if a == b { |
| dest.add(a); |
| a = match ita.next() { |
| None => break, |
| Some(a) => a, |
| }; |
| b = match itb.next() { |
| None => break, |
| Some(b) => b, |
| }; |
| } else if a < b { |
| a = match ita.next() { |
| None => break, |
| Some(a) => a, |
| }; |
| } else { |
| b = match itb.next() { |
| None => break, |
| Some(b) => b, |
| }; |
| } |
| } |
| } |
| |
| fn subtract(&self, other: &StateSet, dest: &mut StateSet) { |
| dest.clear(); |
| if self.is_empty() || other.is_empty() { |
| self.iter(|s| dest.add(s)); |
| return; |
| } |
| |
| let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); |
| let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); |
| let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); |
| loop { |
| if a == b { |
| a = match ita.next() { |
| None => break, |
| Some(a) => a, |
| }; |
| b = match itb.next() { |
| None => { |
| dest.add(a); |
| break; |
| } |
| Some(b) => b, |
| }; |
| } else if a < b { |
| dest.add(a); |
| a = match ita.next() { |
| None => break, |
| Some(a) => a, |
| }; |
| } else { |
| b = match itb.next() { |
| None => { |
| dest.add(a); |
| break; |
| } |
| Some(b) => b, |
| }; |
| } |
| } |
| for a in ita { |
| dest.add(a); |
| } |
| } |
| } |