|  | // This module provides an NFA compiler using Thompson's construction | 
|  | // algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA | 
|  | // graph as output. The NFA graph is structured in a way that permits it to be | 
|  | // executed by a virtual machine and also used to efficiently build a DFA. | 
|  | // | 
|  | // The compiler deals with a slightly expanded set of NFA states that notably | 
|  | // includes an empty node that has exactly one epsilon transition to the next | 
|  | // state. In other words, it's a "goto" instruction if one views Thompson's NFA | 
|  | // as a set of bytecode instructions. These goto instructions are removed in | 
|  | // a subsequent phase before returning the NFA to the caller. The purpose of | 
|  | // these empty nodes is that they make the construction algorithm substantially | 
|  | // simpler to implement. We remove them before returning to the caller because | 
|  | // they can represent substantial overhead when traversing the NFA graph | 
|  | // (either while searching using the NFA directly or while building a DFA). | 
|  | // | 
|  | // In the future, it would be nice to provide a Glushkov compiler as well, | 
|  | // as it would work well as a bit-parallel NFA for smaller regexes. But | 
|  | // the Thompson construction is one I'm more familiar with and seems more | 
|  | // straight-forward to deal with when it comes to large Unicode character | 
|  | // classes. | 
|  | // | 
|  | // Internally, the compiler uses interior mutability to improve composition | 
|  | // in the face of the borrow checker. In particular, we'd really like to be | 
|  | // able to write things like this: | 
|  | // | 
|  | //     self.c_concat(exprs.iter().map(|e| self.c(e))) | 
|  | // | 
|  | // Which elegantly uses iterators to build up a sequence of compiled regex | 
|  | // sub-expressions and then hands it off to the concatenating compiler | 
|  | // routine. Without interior mutability, the borrow checker won't let us | 
|  | // borrow `self` mutably both inside and outside the closure at the same | 
|  | // time. | 
|  |  | 
|  | use std::cell::RefCell; | 
|  | use std::mem; | 
|  |  | 
|  | use regex_syntax::hir::{self, Hir, HirKind}; | 
|  | use regex_syntax::utf8::{Utf8Range, Utf8Sequences}; | 
|  |  | 
|  | use classes::ByteClassSet; | 
|  | use error::{Error, Result}; | 
|  | use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}; | 
|  | use nfa::range_trie::RangeTrie; | 
|  | use nfa::{State, StateID, Transition, NFA}; | 
|  |  | 
|  | /// Config knobs for the NFA compiler. See the builder's methods for more | 
|  | /// docs on each one. | 
|  | #[derive(Clone, Copy, Debug)] | 
|  | struct Config { | 
|  | anchored: bool, | 
|  | allow_invalid_utf8: bool, | 
|  | reverse: bool, | 
|  | shrink: bool, | 
|  | } | 
|  |  | 
|  | impl Default for Config { | 
|  | fn default() -> Config { | 
|  | Config { | 
|  | anchored: false, | 
|  | allow_invalid_utf8: false, | 
|  | reverse: false, | 
|  | shrink: true, | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /// A builder for compiling an NFA. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct Builder { | 
|  | config: Config, | 
|  | } | 
|  |  | 
|  | impl Builder { | 
|  | /// Create a new NFA builder with its default configuration. | 
|  | pub fn new() -> Builder { | 
|  | Builder { config: Config::default() } | 
|  | } | 
|  |  | 
|  | /// Compile the given high level intermediate representation of a regular | 
|  | /// expression into an NFA. | 
|  | /// | 
|  | /// If there was a problem building the NFA, then an error is returned. | 
|  | /// For example, if the regex uses unsupported features (such as zero-width | 
|  | /// assertions), then an error is returned. | 
|  | pub fn build(&self, expr: &Hir) -> Result<NFA> { | 
|  | let mut nfa = NFA::always_match(); | 
|  | self.build_with(&mut Compiler::new(), &mut nfa, expr)?; | 
|  | Ok(nfa) | 
|  | } | 
|  |  | 
|  | /// Compile the given high level intermediate representation of a regular | 
|  | /// expression into the NFA given using the given compiler. Callers may | 
|  | /// prefer this over `build` if they would like to reuse allocations while | 
|  | /// compiling many regular expressions. | 
|  | /// | 
|  | /// On success, the given NFA is completely overwritten with the NFA | 
|  | /// produced by the compiler. | 
|  | /// | 
|  | /// If there was a problem building the NFA, then an error is returned. For | 
|  | /// example, if the regex uses unsupported features (such as zero-width | 
|  | /// assertions), then an error is returned. When an error is returned, | 
|  | /// the contents of `nfa` are unspecified and should not be relied upon. | 
|  | /// However, it can still be reused in subsequent calls to this method. | 
|  | pub fn build_with( | 
|  | &self, | 
|  | compiler: &mut Compiler, | 
|  | nfa: &mut NFA, | 
|  | expr: &Hir, | 
|  | ) -> Result<()> { | 
|  | compiler.clear(); | 
|  | compiler.configure(self.config); | 
|  | compiler.compile(nfa, expr) | 
|  | } | 
|  |  | 
|  | /// Set whether matching must be anchored at the beginning of the input. | 
|  | /// | 
|  | /// When enabled, a match must begin at the start of the input. When | 
|  | /// disabled, the NFA will act as if the pattern started with a `.*?`, | 
|  | /// which enables a match to appear anywhere. | 
|  | /// | 
|  | /// By default this is disabled. | 
|  | pub fn anchored(&mut self, yes: bool) -> &mut Builder { | 
|  | self.config.anchored = yes; | 
|  | self | 
|  | } | 
|  |  | 
|  | /// When enabled, the builder will permit the construction of an NFA that | 
|  | /// may match invalid UTF-8. | 
|  | /// | 
|  | /// When disabled (the default), the builder is guaranteed to produce a | 
|  | /// regex that will only ever match valid UTF-8 (otherwise, the builder | 
|  | /// will return an error). | 
|  | pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder { | 
|  | self.config.allow_invalid_utf8 = yes; | 
|  | self | 
|  | } | 
|  |  | 
|  | /// Reverse the NFA. | 
|  | /// | 
|  | /// A NFA reversal is performed by reversing all of the concatenated | 
|  | /// sub-expressions in the original pattern, recursively. The resulting | 
|  | /// NFA can be used to match the pattern starting from the end of a string | 
|  | /// instead of the beginning of a string. | 
|  | /// | 
|  | /// Reversing the NFA is useful for building a reverse DFA, which is most | 
|  | /// useful for finding the start of a match. | 
|  | pub fn reverse(&mut self, yes: bool) -> &mut Builder { | 
|  | self.config.reverse = yes; | 
|  | self | 
|  | } | 
|  |  | 
|  | /// Apply best effort heuristics to shrink the NFA at the expense of more | 
|  | /// time/memory. | 
|  | /// | 
|  | /// This is enabled by default. Generally speaking, if one is using an NFA | 
|  | /// to compile DFA, then the extra time used to shrink the NFA will be | 
|  | /// more than made up for during DFA construction (potentially by a lot). | 
|  | /// In other words, enabling this can substantially decrease the overall | 
|  | /// amount of time it takes to build a DFA. | 
|  | /// | 
|  | /// The only reason to disable this if you want to compile an NFA and start | 
|  | /// using it as quickly as possible without needing to build a DFA. | 
|  | pub fn shrink(&mut self, yes: bool) -> &mut Builder { | 
|  | self.config.shrink = yes; | 
|  | self | 
|  | } | 
|  | } | 
|  |  | 
|  | /// A compiler that converts a regex abstract syntax to an NFA via Thompson's | 
|  | /// construction. Namely, this compiler permits epsilon transitions between | 
|  | /// states. | 
|  | /// | 
|  | /// Users of this crate cannot use a compiler directly. Instead, all one can | 
|  | /// do is create one and use it via the | 
|  | /// [`Builder::build_with`](struct.Builder.html#method.build_with) | 
|  | /// method. This permits callers to reuse compilers in order to amortize | 
|  | /// allocations. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct Compiler { | 
|  | /// The set of compiled NFA states. Once a state is compiled, it is | 
|  | /// assigned a state ID equivalent to its index in this list. Subsequent | 
|  | /// compilation can modify previous states by adding new transitions. | 
|  | states: RefCell<Vec<CState>>, | 
|  | /// The configuration from the builder. | 
|  | config: Config, | 
|  | /// State used for compiling character classes to UTF-8 byte automata. | 
|  | /// State is not retained between character class compilations. This just | 
|  | /// serves to amortize allocation to the extent possible. | 
|  | utf8_state: RefCell<Utf8State>, | 
|  | /// State used for arranging character classes in reverse into a trie. | 
|  | trie_state: RefCell<RangeTrie>, | 
|  | /// State used for caching common suffixes when compiling reverse UTF-8 | 
|  | /// automata (for Unicode character classes). | 
|  | utf8_suffix: RefCell<Utf8SuffixMap>, | 
|  | /// A map used to re-map state IDs when translating the compiler's internal | 
|  | /// NFA state representation to the external NFA representation. | 
|  | remap: RefCell<Vec<StateID>>, | 
|  | /// A set of compiler internal state IDs that correspond to states that are | 
|  | /// exclusively epsilon transitions, i.e., goto instructions, combined with | 
|  | /// the state that they point to. This is used to record said states while | 
|  | /// transforming the compiler's internal NFA representation to the external | 
|  | /// form. | 
|  | empties: RefCell<Vec<(StateID, StateID)>>, | 
|  | } | 
|  |  | 
|  | /// A compiler intermediate state representation for an NFA that is only used | 
|  | /// during compilation. Once compilation is done, `CState`s are converted to | 
|  | /// `State`s, which have a much simpler representation. | 
|  | #[derive(Clone, Debug, Eq, PartialEq)] | 
|  | enum CState { | 
|  | /// An empty state whose only purpose is to forward the automaton to | 
|  | /// another state via en epsilon transition. These are useful during | 
|  | /// compilation but are otherwise removed at the end. | 
|  | Empty { next: StateID }, | 
|  | /// A state that only transitions to `next` if the current input byte is | 
|  | /// in the range `[start, end]` (inclusive on both ends). | 
|  | Range { range: Transition }, | 
|  | /// A state with possibly many transitions, represented in a sparse | 
|  | /// fashion. Transitions are ordered lexicographically by input range. | 
|  | /// As such, this may only be used when every transition has equal | 
|  | /// priority. (In practice, this is only used for encoding large UTF-8 | 
|  | /// automata.) | 
|  | Sparse { ranges: Vec<Transition> }, | 
|  | /// An alternation such that there exists an epsilon transition to all | 
|  | /// states in `alternates`, where matches found via earlier transitions | 
|  | /// are preferred over later transitions. | 
|  | Union { alternates: Vec<StateID> }, | 
|  | /// An alternation such that there exists an epsilon transition to all | 
|  | /// states in `alternates`, where matches found via later transitions | 
|  | /// are preferred over earlier transitions. | 
|  | /// | 
|  | /// This "reverse" state exists for convenience during compilation that | 
|  | /// permits easy construction of non-greedy combinations of NFA states. | 
|  | /// At the end of compilation, Union and UnionReverse states are merged | 
|  | /// into one Union type of state, where the latter has its epsilon | 
|  | /// transitions reversed to reflect the priority inversion. | 
|  | UnionReverse { alternates: Vec<StateID> }, | 
|  | /// A match state. There is exactly one such occurrence of this state in | 
|  | /// an NFA. | 
|  | Match, | 
|  | } | 
|  |  | 
|  | /// A value that represents the result of compiling a sub-expression of a | 
|  | /// regex's HIR. Specifically, this represents a sub-graph of the NFA that | 
|  | /// has an initial state at `start` and a final state at `end`. | 
|  | #[derive(Clone, Copy, Debug)] | 
|  | pub struct ThompsonRef { | 
|  | start: StateID, | 
|  | end: StateID, | 
|  | } | 
|  |  | 
|  | impl Compiler { | 
|  | /// Create a new compiler. | 
|  | pub fn new() -> Compiler { | 
|  | Compiler { | 
|  | states: RefCell::new(vec![]), | 
|  | config: Config::default(), | 
|  | utf8_state: RefCell::new(Utf8State::new()), | 
|  | trie_state: RefCell::new(RangeTrie::new()), | 
|  | utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), | 
|  | remap: RefCell::new(vec![]), | 
|  | empties: RefCell::new(vec![]), | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Clear any memory used by this compiler such that it is ready to compile | 
|  | /// a new regex. | 
|  | /// | 
|  | /// It is preferrable to reuse a compiler if possible in order to reuse | 
|  | /// allocations. | 
|  | fn clear(&self) { | 
|  | self.states.borrow_mut().clear(); | 
|  | // We don't need to clear anything else since they are cleared on | 
|  | // their own and only when they are used. | 
|  | } | 
|  |  | 
|  | /// Configure this compiler from the builder's knobs. | 
|  | /// | 
|  | /// The compiler is always reconfigured by the builder before using it to | 
|  | /// build an NFA. | 
|  | fn configure(&mut self, config: Config) { | 
|  | self.config = config; | 
|  | } | 
|  |  | 
|  | /// Convert the current intermediate NFA to its final compiled form. | 
|  | fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> { | 
|  | nfa.anchored = self.config.anchored; | 
|  |  | 
|  | let mut start = self.add_empty(); | 
|  | if !nfa.anchored { | 
|  | let compiled = if self.config.allow_invalid_utf8 { | 
|  | self.c_unanchored_prefix_invalid_utf8()? | 
|  | } else { | 
|  | self.c_unanchored_prefix_valid_utf8()? | 
|  | }; | 
|  | self.patch(start, compiled.start); | 
|  | start = compiled.end; | 
|  | } | 
|  | let compiled = self.c(&expr)?; | 
|  | let match_id = self.add_match(); | 
|  | self.patch(start, compiled.start); | 
|  | self.patch(compiled.end, match_id); | 
|  | self.finish(nfa); | 
|  | Ok(()) | 
|  | } | 
|  |  | 
|  | /// Finishes the compilation process and populates the provide NFA with | 
|  | /// the final graph. | 
|  | fn finish(&self, nfa: &mut NFA) { | 
|  | let mut bstates = self.states.borrow_mut(); | 
|  | let mut remap = self.remap.borrow_mut(); | 
|  | remap.resize(bstates.len(), 0); | 
|  | let mut empties = self.empties.borrow_mut(); | 
|  | empties.clear(); | 
|  |  | 
|  | // We don't reuse allocations here becuase this is what we're | 
|  | // returning. | 
|  | nfa.states.clear(); | 
|  | let mut byteset = ByteClassSet::new(); | 
|  |  | 
|  | // The idea here is to convert our intermediate states to their final | 
|  | // form. The only real complexity here is the process of converting | 
|  | // transitions, which are expressed in terms of state IDs. The new | 
|  | // set of states will be smaller because of partial epsilon removal, | 
|  | // so the state IDs will not be the same. | 
|  | for (id, bstate) in bstates.iter_mut().enumerate() { | 
|  | match *bstate { | 
|  | CState::Empty { next } => { | 
|  | // Since we're removing empty states, we need to handle | 
|  | // them later since we don't yet know which new state this | 
|  | // empty state will be mapped to. | 
|  | empties.push((id, next)); | 
|  | } | 
|  | CState::Range { ref range } => { | 
|  | remap[id] = nfa.states.len(); | 
|  | byteset.set_range(range.start, range.end); | 
|  | nfa.states.push(State::Range { range: range.clone() }); | 
|  | } | 
|  | CState::Sparse { ref mut ranges } => { | 
|  | remap[id] = nfa.states.len(); | 
|  |  | 
|  | let ranges = mem::replace(ranges, vec![]); | 
|  | for r in &ranges { | 
|  | byteset.set_range(r.start, r.end); | 
|  | } | 
|  | nfa.states.push(State::Sparse { | 
|  | ranges: ranges.into_boxed_slice(), | 
|  | }); | 
|  | } | 
|  | CState::Union { ref mut alternates } => { | 
|  | remap[id] = nfa.states.len(); | 
|  |  | 
|  | let alternates = mem::replace(alternates, vec![]); | 
|  | nfa.states.push(State::Union { | 
|  | alternates: alternates.into_boxed_slice(), | 
|  | }); | 
|  | } | 
|  | CState::UnionReverse { ref mut alternates } => { | 
|  | remap[id] = nfa.states.len(); | 
|  |  | 
|  | let mut alternates = mem::replace(alternates, vec![]); | 
|  | alternates.reverse(); | 
|  | nfa.states.push(State::Union { | 
|  | alternates: alternates.into_boxed_slice(), | 
|  | }); | 
|  | } | 
|  | CState::Match => { | 
|  | remap[id] = nfa.states.len(); | 
|  | nfa.states.push(State::Match); | 
|  | } | 
|  | } | 
|  | } | 
|  | for &(empty_id, mut empty_next) in empties.iter() { | 
|  | // empty states can point to other empty states, forming a chain. | 
|  | // So we must follow the chain until the end, which must end at | 
|  | // a non-empty state, and therefore, a state that is correctly | 
|  | // remapped. We are guaranteed to terminate because our compiler | 
|  | // never builds a loop among empty states. | 
|  | while let CState::Empty { next } = bstates[empty_next] { | 
|  | empty_next = next; | 
|  | } | 
|  | remap[empty_id] = remap[empty_next]; | 
|  | } | 
|  | for state in &mut nfa.states { | 
|  | state.remap(&remap); | 
|  | } | 
|  | // The compiler always begins the NFA at the first state. | 
|  | nfa.start = remap[0]; | 
|  | nfa.byte_classes = byteset.byte_classes(); | 
|  | } | 
|  |  | 
|  | fn c(&self, expr: &Hir) -> Result<ThompsonRef> { | 
|  | match *expr.kind() { | 
|  | HirKind::Empty => { | 
|  | let id = self.add_empty(); | 
|  | Ok(ThompsonRef { start: id, end: id }) | 
|  | } | 
|  | HirKind::Literal(hir::Literal::Unicode(ch)) => { | 
|  | let mut buf = [0; 4]; | 
|  | let it = ch | 
|  | .encode_utf8(&mut buf) | 
|  | .as_bytes() | 
|  | .iter() | 
|  | .map(|&b| Ok(self.c_range(b, b))); | 
|  | self.c_concat(it) | 
|  | } | 
|  | HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)), | 
|  | HirKind::Class(hir::Class::Bytes(ref cls)) => { | 
|  | self.c_byte_class(cls) | 
|  | } | 
|  | HirKind::Class(hir::Class::Unicode(ref cls)) => { | 
|  | self.c_unicode_class(cls) | 
|  | } | 
|  | HirKind::Repetition(ref rep) => self.c_repetition(rep), | 
|  | HirKind::Group(ref group) => self.c(&*group.hir), | 
|  | HirKind::Concat(ref exprs) => { | 
|  | self.c_concat(exprs.iter().map(|e| self.c(e))) | 
|  | } | 
|  | HirKind::Alternation(ref exprs) => { | 
|  | self.c_alternation(exprs.iter().map(|e| self.c(e))) | 
|  | } | 
|  | HirKind::Anchor(_) => Err(Error::unsupported_anchor()), | 
|  | HirKind::WordBoundary(_) => Err(Error::unsupported_word()), | 
|  | } | 
|  | } | 
|  |  | 
|  | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> | 
|  | where | 
|  | I: DoubleEndedIterator<Item = Result<ThompsonRef>>, | 
|  | { | 
|  | let first = | 
|  | if self.config.reverse { it.next_back() } else { it.next() }; | 
|  | let ThompsonRef { start, mut end } = match first { | 
|  | Some(result) => result?, | 
|  | None => return Ok(self.c_empty()), | 
|  | }; | 
|  | loop { | 
|  | let next = | 
|  | if self.config.reverse { it.next_back() } else { it.next() }; | 
|  | let compiled = match next { | 
|  | Some(result) => result?, | 
|  | None => break, | 
|  | }; | 
|  | self.patch(end, compiled.start); | 
|  | end = compiled.end; | 
|  | } | 
|  | Ok(ThompsonRef { start, end }) | 
|  | } | 
|  |  | 
|  | fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> | 
|  | where | 
|  | I: Iterator<Item = Result<ThompsonRef>>, | 
|  | { | 
|  | let first = it.next().expect("alternations must be non-empty")?; | 
|  | let second = match it.next() { | 
|  | None => return Ok(first), | 
|  | Some(result) => result?, | 
|  | }; | 
|  |  | 
|  | let union = self.add_union(); | 
|  | let end = self.add_empty(); | 
|  | self.patch(union, first.start); | 
|  | self.patch(first.end, end); | 
|  | self.patch(union, second.start); | 
|  | self.patch(second.end, end); | 
|  | for result in it { | 
|  | let compiled = result?; | 
|  | self.patch(union, compiled.start); | 
|  | self.patch(compiled.end, end); | 
|  | } | 
|  | Ok(ThompsonRef { start: union, end }) | 
|  | } | 
|  |  | 
|  | fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> { | 
|  | match rep.kind { | 
|  | hir::RepetitionKind::ZeroOrOne => { | 
|  | self.c_zero_or_one(&rep.hir, rep.greedy) | 
|  | } | 
|  | hir::RepetitionKind::ZeroOrMore => { | 
|  | self.c_at_least(&rep.hir, rep.greedy, 0) | 
|  | } | 
|  | hir::RepetitionKind::OneOrMore => { | 
|  | self.c_at_least(&rep.hir, rep.greedy, 1) | 
|  | } | 
|  | hir::RepetitionKind::Range(ref rng) => match *rng { | 
|  | hir::RepetitionRange::Exactly(count) => { | 
|  | self.c_exactly(&rep.hir, count) | 
|  | } | 
|  | hir::RepetitionRange::AtLeast(m) => { | 
|  | self.c_at_least(&rep.hir, rep.greedy, m) | 
|  | } | 
|  | hir::RepetitionRange::Bounded(min, max) => { | 
|  | self.c_bounded(&rep.hir, rep.greedy, min, max) | 
|  | } | 
|  | }, | 
|  | } | 
|  | } | 
|  |  | 
|  | fn c_bounded( | 
|  | &self, | 
|  | expr: &Hir, | 
|  | greedy: bool, | 
|  | min: u32, | 
|  | max: u32, | 
|  | ) -> Result<ThompsonRef> { | 
|  | let prefix = self.c_exactly(expr, min)?; | 
|  | if min == max { | 
|  | return Ok(prefix); | 
|  | } | 
|  |  | 
|  | // It is tempting here to compile the rest here as a concatenation | 
|  | // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it | 
|  | // were `aaa?a?a?`. The problem here is that it leads to this program: | 
|  | // | 
|  | //     >000000: 61 => 01 | 
|  | //     000001: 61 => 02 | 
|  | //     000002: alt(03, 04) | 
|  | //     000003: 61 => 04 | 
|  | //     000004: alt(05, 06) | 
|  | //     000005: 61 => 06 | 
|  | //     000006: alt(07, 08) | 
|  | //     000007: 61 => 08 | 
|  | //     000008: MATCH | 
|  | // | 
|  | // And effectively, once you hit state 2, the epsilon closure will | 
|  | // include states 3, 5, 5, 6, 7 and 8, which is quite a bit. It is | 
|  | // better to instead compile it like so: | 
|  | // | 
|  | //     >000000: 61 => 01 | 
|  | //      000001: 61 => 02 | 
|  | //      000002: alt(03, 08) | 
|  | //      000003: 61 => 04 | 
|  | //      000004: alt(05, 08) | 
|  | //      000005: 61 => 06 | 
|  | //      000006: alt(07, 08) | 
|  | //      000007: 61 => 08 | 
|  | //      000008: MATCH | 
|  | // | 
|  | // So that the epsilon closure of state 2 is now just 3 and 8. | 
|  | let empty = self.add_empty(); | 
|  | let mut prev_end = prefix.end; | 
|  | for _ in min..max { | 
|  | let union = if greedy { | 
|  | self.add_union() | 
|  | } else { | 
|  | self.add_reverse_union() | 
|  | }; | 
|  | let compiled = self.c(expr)?; | 
|  | self.patch(prev_end, union); | 
|  | self.patch(union, compiled.start); | 
|  | self.patch(union, empty); | 
|  | prev_end = compiled.end; | 
|  | } | 
|  | self.patch(prev_end, empty); | 
|  | Ok(ThompsonRef { start: prefix.start, end: empty }) | 
|  | } | 
|  |  | 
|  | fn c_at_least( | 
|  | &self, | 
|  | expr: &Hir, | 
|  | greedy: bool, | 
|  | n: u32, | 
|  | ) -> Result<ThompsonRef> { | 
|  | if n == 0 { | 
|  | let union = if greedy { | 
|  | self.add_union() | 
|  | } else { | 
|  | self.add_reverse_union() | 
|  | }; | 
|  | let compiled = self.c(expr)?; | 
|  | self.patch(union, compiled.start); | 
|  | self.patch(compiled.end, union); | 
|  | Ok(ThompsonRef { start: union, end: union }) | 
|  | } else if n == 1 { | 
|  | let compiled = self.c(expr)?; | 
|  | let union = if greedy { | 
|  | self.add_union() | 
|  | } else { | 
|  | self.add_reverse_union() | 
|  | }; | 
|  | self.patch(compiled.end, union); | 
|  | self.patch(union, compiled.start); | 
|  | Ok(ThompsonRef { start: compiled.start, end: union }) | 
|  | } else { | 
|  | let prefix = self.c_exactly(expr, n - 1)?; | 
|  | let last = self.c(expr)?; | 
|  | let union = if greedy { | 
|  | self.add_union() | 
|  | } else { | 
|  | self.add_reverse_union() | 
|  | }; | 
|  | self.patch(prefix.end, last.start); | 
|  | self.patch(last.end, union); | 
|  | self.patch(union, last.start); | 
|  | Ok(ThompsonRef { start: prefix.start, end: union }) | 
|  | } | 
|  | } | 
|  |  | 
|  | fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> { | 
|  | let union = | 
|  | if greedy { self.add_union() } else { self.add_reverse_union() }; | 
|  | let compiled = self.c(expr)?; | 
|  | let empty = self.add_empty(); | 
|  | self.patch(union, compiled.start); | 
|  | self.patch(union, empty); | 
|  | self.patch(compiled.end, empty); | 
|  | Ok(ThompsonRef { start: union, end: empty }) | 
|  | } | 
|  |  | 
|  | fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> { | 
|  | let it = (0..n).map(|_| self.c(expr)); | 
|  | self.c_concat(it) | 
|  | } | 
|  |  | 
|  | fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> { | 
|  | let end = self.add_empty(); | 
|  | let mut trans = Vec::with_capacity(cls.ranges().len()); | 
|  | for r in cls.iter() { | 
|  | trans.push(Transition { | 
|  | start: r.start(), | 
|  | end: r.end(), | 
|  | next: end, | 
|  | }); | 
|  | } | 
|  | Ok(ThompsonRef { start: self.add_sparse(trans), end }) | 
|  | } | 
|  |  | 
|  | fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> { | 
|  | // If all we have are ASCII ranges wrapped in a Unicode package, then | 
|  | // there is zero reason to bring out the big guns. We can fit all ASCII | 
|  | // ranges within a single sparse transition. | 
|  | if cls.is_all_ascii() { | 
|  | let end = self.add_empty(); | 
|  | let mut trans = Vec::with_capacity(cls.ranges().len()); | 
|  | for r in cls.iter() { | 
|  | assert!(r.start() <= '\x7F'); | 
|  | assert!(r.end() <= '\x7F'); | 
|  | trans.push(Transition { | 
|  | start: r.start() as u8, | 
|  | end: r.end() as u8, | 
|  | next: end, | 
|  | }); | 
|  | } | 
|  | Ok(ThompsonRef { start: self.add_sparse(trans), end }) | 
|  | } else if self.config.reverse { | 
|  | if !self.config.shrink { | 
|  | // When we don't want to spend the extra time shrinking, we | 
|  | // compile the UTF-8 automaton in reverse using something like | 
|  | // the "naive" approach, but will attempt to re-use common | 
|  | // suffixes. | 
|  | self.c_unicode_class_reverse_with_suffix(cls) | 
|  | } else { | 
|  | // When we want to shrink our NFA for reverse UTF-8 automata, | 
|  | // we cannot feed UTF-8 sequences directly to the UTF-8 | 
|  | // compiler, since the UTF-8 compiler requires all sequences | 
|  | // to be lexicographically sorted. Instead, we organize our | 
|  | // sequences into a range trie, which can then output our | 
|  | // sequences in the correct order. Unfortunately, building the | 
|  | // range trie is fairly expensive (but not nearly as expensive | 
|  | // as building a DFA). Hence the reason why the 'shrink' option | 
|  | // exists, so that this path can be toggled off. | 
|  | let mut trie = self.trie_state.borrow_mut(); | 
|  | trie.clear(); | 
|  |  | 
|  | for rng in cls.iter() { | 
|  | for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { | 
|  | seq.reverse(); | 
|  | trie.insert(seq.as_slice()); | 
|  | } | 
|  | } | 
|  | let mut utf8_state = self.utf8_state.borrow_mut(); | 
|  | let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); | 
|  | trie.iter(|seq| { | 
|  | utf8c.add(&seq); | 
|  | }); | 
|  | Ok(utf8c.finish()) | 
|  | } | 
|  | } else { | 
|  | // In the forward direction, we always shrink our UTF-8 automata | 
|  | // because we can stream it right into the UTF-8 compiler. There | 
|  | // is almost no downside (in either memory or time) to using this | 
|  | // approach. | 
|  | let mut utf8_state = self.utf8_state.borrow_mut(); | 
|  | let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); | 
|  | for rng in cls.iter() { | 
|  | for seq in Utf8Sequences::new(rng.start(), rng.end()) { | 
|  | utf8c.add(seq.as_slice()); | 
|  | } | 
|  | } | 
|  | Ok(utf8c.finish()) | 
|  | } | 
|  |  | 
|  | // For reference, the code below is the "naive" version of compiling a | 
|  | // UTF-8 automaton. It is deliciously simple (and works for both the | 
|  | // forward and reverse cases), but will unfortunately produce very | 
|  | // large NFAs. When compiling a forward automaton, the size difference | 
|  | // can sometimes be an order of magnitude. For example, the '\w' regex | 
|  | // will generate about ~3000 NFA states using the naive approach below, | 
|  | // but only 283 states when using the approach above. This is because | 
|  | // the approach above actually compiles a *minimal* (or near minimal, | 
|  | // because of the bounded hashmap) UTF-8 automaton. | 
|  | // | 
|  | // The code below is kept as a reference point in order to make it | 
|  | // easier to understand the higher level goal here. | 
|  | /* | 
|  | let it = cls | 
|  | .iter() | 
|  | .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) | 
|  | .map(|seq| { | 
|  | let it = seq | 
|  | .as_slice() | 
|  | .iter() | 
|  | .map(|rng| Ok(self.c_range(rng.start, rng.end))); | 
|  | self.c_concat(it) | 
|  | }); | 
|  | self.c_alternation(it); | 
|  | */ | 
|  | } | 
|  |  | 
|  | fn c_unicode_class_reverse_with_suffix( | 
|  | &self, | 
|  | cls: &hir::ClassUnicode, | 
|  | ) -> Result<ThompsonRef> { | 
|  | // N.B. It would likely be better to cache common *prefixes* in the | 
|  | // reverse direction, but it's not quite clear how to do that. The | 
|  | // advantage of caching suffixes is that it does give us a win, and | 
|  | // has a very small additional overhead. | 
|  | let mut cache = self.utf8_suffix.borrow_mut(); | 
|  | cache.clear(); | 
|  |  | 
|  | let union = self.add_union(); | 
|  | let alt_end = self.add_empty(); | 
|  | for urng in cls.iter() { | 
|  | for seq in Utf8Sequences::new(urng.start(), urng.end()) { | 
|  | let mut end = alt_end; | 
|  | for brng in seq.as_slice() { | 
|  | let key = Utf8SuffixKey { | 
|  | from: end, | 
|  | start: brng.start, | 
|  | end: brng.end, | 
|  | }; | 
|  | let hash = cache.hash(&key); | 
|  | if let Some(id) = cache.get(&key, hash) { | 
|  | end = id; | 
|  | continue; | 
|  | } | 
|  |  | 
|  | let compiled = self.c_range(brng.start, brng.end); | 
|  | self.patch(compiled.end, end); | 
|  | end = compiled.start; | 
|  | cache.set(key, hash, end); | 
|  | } | 
|  | self.patch(union, end); | 
|  | } | 
|  | } | 
|  | Ok(ThompsonRef { start: union, end: alt_end }) | 
|  | } | 
|  |  | 
|  | fn c_range(&self, start: u8, end: u8) -> ThompsonRef { | 
|  | let id = self.add_range(start, end); | 
|  | ThompsonRef { start: id, end: id } | 
|  | } | 
|  |  | 
|  | fn c_empty(&self) -> ThompsonRef { | 
|  | let id = self.add_empty(); | 
|  | ThompsonRef { start: id, end: id } | 
|  | } | 
|  |  | 
|  | fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> { | 
|  | self.c(&Hir::repetition(hir::Repetition { | 
|  | kind: hir::RepetitionKind::ZeroOrMore, | 
|  | greedy: false, | 
|  | hir: Box::new(Hir::any(false)), | 
|  | })) | 
|  | } | 
|  |  | 
|  | fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> { | 
|  | self.c(&Hir::repetition(hir::Repetition { | 
|  | kind: hir::RepetitionKind::ZeroOrMore, | 
|  | greedy: false, | 
|  | hir: Box::new(Hir::any(true)), | 
|  | })) | 
|  | } | 
|  |  | 
|  | fn patch(&self, from: StateID, to: StateID) { | 
|  | match self.states.borrow_mut()[from] { | 
|  | CState::Empty { ref mut next } => { | 
|  | *next = to; | 
|  | } | 
|  | CState::Range { ref mut range } => { | 
|  | range.next = to; | 
|  | } | 
|  | CState::Sparse { .. } => { | 
|  | panic!("cannot patch from a sparse NFA state") | 
|  | } | 
|  | CState::Union { ref mut alternates } => { | 
|  | alternates.push(to); | 
|  | } | 
|  | CState::UnionReverse { ref mut alternates } => { | 
|  | alternates.push(to); | 
|  | } | 
|  | CState::Match => {} | 
|  | } | 
|  | } | 
|  |  | 
|  | fn add_empty(&self) -> StateID { | 
|  | let id = self.states.borrow().len(); | 
|  | self.states.borrow_mut().push(CState::Empty { next: 0 }); | 
|  | id | 
|  | } | 
|  |  | 
|  | fn add_range(&self, start: u8, end: u8) -> StateID { | 
|  | let id = self.states.borrow().len(); | 
|  | let trans = Transition { start, end, next: 0 }; | 
|  | let state = CState::Range { range: trans }; | 
|  | self.states.borrow_mut().push(state); | 
|  | id | 
|  | } | 
|  |  | 
|  | fn add_sparse(&self, ranges: Vec<Transition>) -> StateID { | 
|  | if ranges.len() == 1 { | 
|  | let id = self.states.borrow().len(); | 
|  | let state = CState::Range { range: ranges[0] }; | 
|  | self.states.borrow_mut().push(state); | 
|  | return id; | 
|  | } | 
|  | let id = self.states.borrow().len(); | 
|  | let state = CState::Sparse { ranges }; | 
|  | self.states.borrow_mut().push(state); | 
|  | id | 
|  | } | 
|  |  | 
|  | fn add_union(&self) -> StateID { | 
|  | let id = self.states.borrow().len(); | 
|  | let state = CState::Union { alternates: vec![] }; | 
|  | self.states.borrow_mut().push(state); | 
|  | id | 
|  | } | 
|  |  | 
|  | fn add_reverse_union(&self) -> StateID { | 
|  | let id = self.states.borrow().len(); | 
|  | let state = CState::UnionReverse { alternates: vec![] }; | 
|  | self.states.borrow_mut().push(state); | 
|  | id | 
|  | } | 
|  |  | 
|  | fn add_match(&self) -> StateID { | 
|  | let id = self.states.borrow().len(); | 
|  | self.states.borrow_mut().push(CState::Match); | 
|  | id | 
|  | } | 
|  | } | 
|  |  | 
|  | #[derive(Debug)] | 
|  | struct Utf8Compiler<'a> { | 
|  | nfac: &'a Compiler, | 
|  | state: &'a mut Utf8State, | 
|  | target: StateID, | 
|  | } | 
|  |  | 
|  | #[derive(Clone, Debug)] | 
|  | struct Utf8State { | 
|  | compiled: Utf8BoundedMap, | 
|  | uncompiled: Vec<Utf8Node>, | 
|  | } | 
|  |  | 
|  | #[derive(Clone, Debug)] | 
|  | struct Utf8Node { | 
|  | trans: Vec<Transition>, | 
|  | last: Option<Utf8LastTransition>, | 
|  | } | 
|  |  | 
|  | #[derive(Clone, Debug)] | 
|  | struct Utf8LastTransition { | 
|  | start: u8, | 
|  | end: u8, | 
|  | } | 
|  |  | 
|  | impl Utf8State { | 
|  | fn new() -> Utf8State { | 
|  | Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] } | 
|  | } | 
|  |  | 
|  | fn clear(&mut self) { | 
|  | self.compiled.clear(); | 
|  | self.uncompiled.clear(); | 
|  | } | 
|  | } | 
|  |  | 
|  | impl<'a> Utf8Compiler<'a> { | 
|  | fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> { | 
|  | let target = nfac.add_empty(); | 
|  | state.clear(); | 
|  | let mut utf8c = Utf8Compiler { nfac, state, target }; | 
|  | utf8c.add_empty(); | 
|  | utf8c | 
|  | } | 
|  |  | 
|  | fn finish(&mut self) -> ThompsonRef { | 
|  | self.compile_from(0); | 
|  | let node = self.pop_root(); | 
|  | let start = self.compile(node); | 
|  | ThompsonRef { start, end: self.target } | 
|  | } | 
|  |  | 
|  | fn add(&mut self, ranges: &[Utf8Range]) { | 
|  | let prefix_len = ranges | 
|  | .iter() | 
|  | .zip(&self.state.uncompiled) | 
|  | .take_while(|&(range, node)| { | 
|  | node.last.as_ref().map_or(false, |t| { | 
|  | (t.start, t.end) == (range.start, range.end) | 
|  | }) | 
|  | }) | 
|  | .count(); | 
|  | assert!(prefix_len < ranges.len()); | 
|  | self.compile_from(prefix_len); | 
|  | self.add_suffix(&ranges[prefix_len..]); | 
|  | } | 
|  |  | 
|  | fn compile_from(&mut self, from: usize) { | 
|  | let mut next = self.target; | 
|  | while from + 1 < self.state.uncompiled.len() { | 
|  | let node = self.pop_freeze(next); | 
|  | next = self.compile(node); | 
|  | } | 
|  | self.top_last_freeze(next); | 
|  | } | 
|  |  | 
|  | fn compile(&mut self, node: Vec<Transition>) -> StateID { | 
|  | let hash = self.state.compiled.hash(&node); | 
|  | if let Some(id) = self.state.compiled.get(&node, hash) { | 
|  | return id; | 
|  | } | 
|  | let id = self.nfac.add_sparse(node.clone()); | 
|  | self.state.compiled.set(node, hash, id); | 
|  | id | 
|  | } | 
|  |  | 
|  | fn add_suffix(&mut self, ranges: &[Utf8Range]) { | 
|  | assert!(!ranges.is_empty()); | 
|  | let last = self | 
|  | .state | 
|  | .uncompiled | 
|  | .len() | 
|  | .checked_sub(1) | 
|  | .expect("non-empty nodes"); | 
|  | assert!(self.state.uncompiled[last].last.is_none()); | 
|  | self.state.uncompiled[last].last = Some(Utf8LastTransition { | 
|  | start: ranges[0].start, | 
|  | end: ranges[0].end, | 
|  | }); | 
|  | for r in &ranges[1..] { | 
|  | self.state.uncompiled.push(Utf8Node { | 
|  | trans: vec![], | 
|  | last: Some(Utf8LastTransition { start: r.start, end: r.end }), | 
|  | }); | 
|  | } | 
|  | } | 
|  |  | 
|  | fn add_empty(&mut self) { | 
|  | self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); | 
|  | } | 
|  |  | 
|  | fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { | 
|  | let mut uncompiled = self.state.uncompiled.pop().unwrap(); | 
|  | uncompiled.set_last_transition(next); | 
|  | uncompiled.trans | 
|  | } | 
|  |  | 
|  | fn pop_root(&mut self) -> Vec<Transition> { | 
|  | assert_eq!(self.state.uncompiled.len(), 1); | 
|  | assert!(self.state.uncompiled[0].last.is_none()); | 
|  | self.state.uncompiled.pop().expect("non-empty nodes").trans | 
|  | } | 
|  |  | 
|  | fn top_last_freeze(&mut self, next: StateID) { | 
|  | let last = self | 
|  | .state | 
|  | .uncompiled | 
|  | .len() | 
|  | .checked_sub(1) | 
|  | .expect("non-empty nodes"); | 
|  | self.state.uncompiled[last].set_last_transition(next); | 
|  | } | 
|  | } | 
|  |  | 
|  | impl Utf8Node { | 
|  | fn set_last_transition(&mut self, next: StateID) { | 
|  | if let Some(last) = self.last.take() { | 
|  | self.trans.push(Transition { | 
|  | start: last.start, | 
|  | end: last.end, | 
|  | next, | 
|  | }); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | #[cfg(test)] | 
|  | mod tests { | 
|  | use regex_syntax::hir::Hir; | 
|  | use regex_syntax::ParserBuilder; | 
|  |  | 
|  | use super::{Builder, State, StateID, Transition, NFA}; | 
|  |  | 
|  | fn parse(pattern: &str) -> Hir { | 
|  | ParserBuilder::new().build().parse(pattern).unwrap() | 
|  | } | 
|  |  | 
|  | fn build(pattern: &str) -> NFA { | 
|  | Builder::new().anchored(true).build(&parse(pattern)).unwrap() | 
|  | } | 
|  |  | 
|  | fn s_byte(byte: u8, next: StateID) -> State { | 
|  | let trans = Transition { start: byte, end: byte, next }; | 
|  | State::Range { range: trans } | 
|  | } | 
|  |  | 
|  | fn s_range(start: u8, end: u8, next: StateID) -> State { | 
|  | let trans = Transition { start, end, next }; | 
|  | State::Range { range: trans } | 
|  | } | 
|  |  | 
|  | fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State { | 
|  | let ranges = ranges | 
|  | .iter() | 
|  | .map(|&(start, end, next)| Transition { start, end, next }) | 
|  | .collect(); | 
|  | State::Sparse { ranges } | 
|  | } | 
|  |  | 
|  | fn s_union(alts: &[StateID]) -> State { | 
|  | State::Union { alternates: alts.to_vec().into_boxed_slice() } | 
|  | } | 
|  |  | 
|  | fn s_match() -> State { | 
|  | State::Match | 
|  | } | 
|  |  | 
|  | #[test] | 
|  | fn errors() { | 
|  | // unsupported anchors | 
|  | assert!(Builder::new().build(&parse(r"^")).is_err()); | 
|  | assert!(Builder::new().build(&parse(r"$")).is_err()); | 
|  | assert!(Builder::new().build(&parse(r"\A")).is_err()); | 
|  | assert!(Builder::new().build(&parse(r"\z")).is_err()); | 
|  |  | 
|  | // unsupported word boundaries | 
|  | assert!(Builder::new().build(&parse(r"\b")).is_err()); | 
|  | assert!(Builder::new().build(&parse(r"\B")).is_err()); | 
|  | assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err()); | 
|  | } | 
|  |  | 
|  | // Test that building an unanchored NFA has an appropriate `.*?` prefix. | 
|  | #[test] | 
|  | fn compile_unanchored_prefix() { | 
|  | // When the machine can only match valid UTF-8. | 
|  | let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap(); | 
|  | // There should be many states since the `.` in `.*?` matches any | 
|  | // Unicode scalar value. | 
|  | assert_eq!(11, nfa.len()); | 
|  | assert_eq!(nfa.states[10], s_match()); | 
|  | assert_eq!(nfa.states[9], s_byte(b'a', 10)); | 
|  |  | 
|  | // When the machine can match invalid UTF-8. | 
|  | let nfa = Builder::new() | 
|  | .anchored(false) | 
|  | .allow_invalid_utf8(true) | 
|  | .build(&parse(r"a")) | 
|  | .unwrap(); | 
|  | assert_eq!( | 
|  | nfa.states, | 
|  | &[ | 
|  | s_union(&[2, 1]), | 
|  | s_range(0, 255, 0), | 
|  | s_byte(b'a', 3), | 
|  | s_match(), | 
|  | ] | 
|  | ); | 
|  | } | 
|  |  | 
|  | #[test] | 
|  | fn compile_empty() { | 
|  | assert_eq!(build("").states, &[s_match(),]); | 
|  | } | 
|  |  | 
|  | #[test] | 
|  | fn compile_literal() { | 
|  | assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]); | 
|  | assert_eq!( | 
|  | build("ab").states, | 
|  | &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] | 
|  | ); | 
|  | assert_eq!( | 
|  | build("ā").states, | 
|  | &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),] | 
|  | ); | 
|  |  | 
|  | // Check that non-UTF-8 literals work. | 
|  | let hir = ParserBuilder::new() | 
|  | .allow_invalid_utf8(true) | 
|  | .build() | 
|  | .parse(r"(?-u)\xFF") | 
|  | .unwrap(); | 
|  | let nfa = Builder::new() | 
|  | .anchored(true) | 
|  | .allow_invalid_utf8(true) | 
|  | .build(&hir) | 
|  | .unwrap(); | 
|  | assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]); | 
|  | } | 
|  |  | 
|  | #[test] | 
|  | fn compile_class() { | 
|  | assert_eq!( | 
|  | build(r"[a-z]").states, | 
|  | &[s_range(b'a', b'z', 1), s_match(),] | 
|  | ); | 
|  | assert_eq!( | 
|  | build(r"[x-za-c]").states, | 
|  | &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()] | 
|  | ); | 
|  | assert_eq!( | 
|  | build(r"[\u03B1-\u03B4]").states, | 
|  | &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()] | 
|  | ); | 
|  | assert_eq!( | 
|  | build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, | 
|  | &[ | 
|  | s_range(0xB1, 0xB4, 5), | 
|  | s_range(0x99, 0x9E, 5), | 
|  | s_byte(0xA4, 1), | 
|  | s_byte(0x9F, 2), | 
|  | s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), | 
|  | s_match(), | 
|  | ] | 
|  | ); | 
|  | assert_eq!( | 
|  | build(r"[a-zā]").states, | 
|  | &[ | 
|  | s_byte(0x83, 3), | 
|  | s_byte(0x98, 0), | 
|  | s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), | 
|  | s_match(), | 
|  | ] | 
|  | ); | 
|  | } | 
|  |  | 
|  | #[test] | 
|  | fn compile_repetition() { | 
|  | assert_eq!( | 
|  | build(r"a?").states, | 
|  | &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),] | 
|  | ); | 
|  | assert_eq!( | 
|  | build(r"a??").states, | 
|  | &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),] | 
|  | ); | 
|  | } | 
|  |  | 
|  | #[test] | 
|  | fn compile_group() { | 
|  | assert_eq!( | 
|  | build(r"ab+").states, | 
|  | &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),] | 
|  | ); | 
|  | assert_eq!( | 
|  | build(r"(ab)").states, | 
|  | &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] | 
|  | ); | 
|  | assert_eq!( | 
|  | build(r"(ab)+").states, | 
|  | &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),] | 
|  | ); | 
|  | } | 
|  |  | 
|  | #[test] | 
|  | fn compile_alternation() { | 
|  | assert_eq!( | 
|  | build(r"a|b").states, | 
|  | &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),] | 
|  | ); | 
|  | assert_eq!( | 
|  | build(r"|b").states, | 
|  | &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),] | 
|  | ); | 
|  | assert_eq!( | 
|  | build(r"a|").states, | 
|  | &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),] | 
|  | ); | 
|  | } | 
|  | } |