|  | #[cfg(feature = "std")] | 
|  | use core::fmt; | 
|  | #[cfg(feature = "std")] | 
|  | use core::iter; | 
|  | use core::marker::PhantomData; | 
|  | use core::mem::size_of; | 
|  | #[cfg(feature = "std")] | 
|  | use std::collections::HashMap; | 
|  |  | 
|  | #[cfg(feature = "std")] | 
|  | use byteorder::{BigEndian, LittleEndian}; | 
|  | use byteorder::{ByteOrder, NativeEndian}; | 
|  |  | 
|  | use classes::ByteClasses; | 
|  | use dense; | 
|  | use dfa::DFA; | 
|  | #[cfg(feature = "std")] | 
|  | use error::{Error, Result}; | 
|  | #[cfg(feature = "std")] | 
|  | use state_id::{dead_id, usize_to_state_id, write_state_id_bytes, StateID}; | 
|  | #[cfg(not(feature = "std"))] | 
|  | use state_id::{dead_id, StateID}; | 
|  |  | 
|  | /// A sparse table-based deterministic finite automaton (DFA). | 
|  | /// | 
|  | /// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a | 
/// more space-efficient representation for its transition table. Consequently,
/// sparse DFAs can use much less memory than dense DFAs, but this comes at a
/// price. In particular, reading the more space-efficient transitions takes
/// more work, and consequently, searching with a sparse DFA is typically
/// slower than searching with a dense DFA.
|  | /// | 
|  | /// A sparse DFA can be built using the default configuration via the | 
|  | /// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise, | 
|  | /// one can configure various aspects of a dense DFA via | 
|  | /// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense | 
|  | /// DFA to a sparse DFA using | 
|  | /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse). | 
|  | /// | 
|  | /// In general, a sparse DFA supports all the same operations as a dense DFA. | 
|  | /// | 
|  | /// Making the choice between a dense and sparse DFA depends on your specific | 
|  | /// work load. If you can sacrifice a bit of search time performance, then a | 
|  | /// sparse DFA might be the best choice. In particular, while sparse DFAs are | 
|  | /// probably always slower than dense DFAs, you may find that they are easily | 
|  | /// fast enough for your purposes! | 
|  | /// | 
|  | /// # State size | 
|  | /// | 
|  | /// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to | 
|  | /// the type of the DFA's transition table while `S` corresponds to the | 
|  | /// representation used for the DFA's state identifiers as described by the | 
/// [`StateID`](trait.StateID.html) trait. `S` is typically `usize`, but other
/// valid choices provided by this crate include `u8`, `u16`, `u32` and `u64`.
/// The primary reason for choosing a different state
|  | /// identifier representation than the default is to reduce the amount of | 
|  | /// memory used by a DFA. Note though, that if the chosen representation cannot | 
|  | /// accommodate the size of your DFA, then building the DFA will fail and | 
|  | /// return an error. | 
|  | /// | 
|  | /// While the reduction in heap memory used by a DFA is one reason for choosing | 
|  | /// a smaller state identifier representation, another possible reason is for | 
|  | /// decreasing the serialization size of a DFA, as returned by | 
|  | /// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian), | 
|  | /// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian) | 
|  | /// or | 
/// [`to_bytes_native_endian`](enum.SparseDFA.html#method.to_bytes_native_endian).
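///
/// For example, here is one way to shrink the state identifier
/// representation. (This is only a sketch; it assumes the DFA built from
/// this pattern has few enough states for its identifiers to fit in a
/// `u16`.)
///
/// ```
/// use regex_automata::SparseDFA;
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// let dfa = SparseDFA::new("foo[0-9]+")?;
/// let small = dfa.to_u16()?;
/// // Smaller state identifiers shrink the transition table, which shows up
/// // in both heap memory usage and serialization size.
/// assert!(small.memory_usage() <= dfa.memory_usage());
/// assert!(small.to_bytes_native_endian()?.len()
///     <= dfa.to_bytes_native_endian()?.len());
/// # Ok(()) }; example().unwrap()
/// ```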
|  | /// | 
|  | /// The type of the transition table is typically either `Vec<u8>` or `&[u8]`, | 
|  | /// depending on where the transition table is stored. Note that this is | 
|  | /// different than a dense DFA, whose transition table is typically | 
|  | /// `Vec<S>` or `&[S]`. The reason for this is that a sparse DFA always reads | 
|  | /// its transition table from raw bytes because the table is compactly packed. | 
|  | /// | 
|  | /// # Variants | 
|  | /// | 
|  | /// This DFA is defined as a non-exhaustive enumeration of different types of | 
/// sparse DFAs. All of the variants use the same internal representation
|  | /// for the transition table, but they vary in how the transition table is | 
|  | /// read. A DFA's specific variant depends on the configuration options set via | 
|  | /// [`dense::Builder`](dense/struct.Builder.html). The default variant is | 
|  | /// `ByteClass`. | 
|  | /// | 
|  | /// # The `DFA` trait | 
|  | /// | 
|  | /// This type implements the [`DFA`](trait.DFA.html) trait, which means it | 
|  | /// can be used for searching. For example: | 
|  | /// | 
|  | /// ``` | 
|  | /// use regex_automata::{DFA, SparseDFA}; | 
|  | /// | 
|  | /// # fn example() -> Result<(), regex_automata::Error> { | 
|  | /// let dfa = SparseDFA::new("foo[0-9]+")?; | 
|  | /// assert_eq!(Some(8), dfa.find(b"foo12345")); | 
|  | /// # Ok(()) }; example().unwrap() | 
|  | /// ``` | 
|  | /// | 
|  | /// The `DFA` trait also provides an assortment of other lower level methods | 
|  | /// for DFAs, such as `start_state` and `next_state`. While these are correctly | 
|  | /// implemented, it is an anti-pattern to use them in performance sensitive | 
|  | /// code on the `SparseDFA` type directly. Namely, each implementation requires | 
|  | /// a branch to determine which type of sparse DFA is being used. Instead, | 
|  | /// this branch should be pushed up a layer in the code since walking the | 
|  | /// transitions of a DFA is usually a hot path. If you do need to use these | 
|  | /// lower level methods in performance critical code, then you should match on | 
|  | /// the variants of this DFA and use each variant's implementation of the `DFA` | 
|  | /// trait directly. | 
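///
/// For example, here is a sketch of how one might lift that branch out of a
/// simple search loop. The `shortest` helper below is hypothetical (it is not
/// part of this crate) and merely approximates the semantics of the
/// `shortest_match` routine provided by the `DFA` trait:
///
/// ```
/// use regex_automata::{DFA, SparseDFA};
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// // A toy search routine, monomorphized for whichever concrete DFA
/// // implementation it is called with.
/// fn shortest<D: DFA>(dfa: &D, bytes: &[u8]) -> Option<usize> {
///     let mut state = dfa.start_state();
///     if dfa.is_match_state(state) {
///         return Some(0);
///     }
///     for (i, &b) in bytes.iter().enumerate() {
///         state = dfa.next_state(state, b);
///         if dfa.is_match_state(state) {
///             return Some(i + 1);
///         } else if dfa.is_dead_state(state) {
///             return None;
///         }
///     }
///     None
/// }
///
/// let dfa = SparseDFA::new("foo[0-9]+")?;
/// // Do the case analysis once, before the search loop, so that the loop
/// // itself runs against a concrete variant.
/// let result = match dfa {
///     SparseDFA::Standard(ref r) => shortest(r, b"foo12345"),
///     SparseDFA::ByteClass(ref r) => shortest(r, b"foo12345"),
///     _ => unreachable!(),
/// };
/// assert_eq!(dfa.shortest_match(b"foo12345"), result);
/// # Ok(()) }; example().unwrap()
/// ```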
|  | #[derive(Clone, Debug)] | 
|  | pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> { | 
|  | /// A standard DFA that does not use byte classes. | 
|  | Standard(Standard<T, S>), | 
|  | /// A DFA that shrinks its alphabet to a set of equivalence classes instead | 
|  | /// of using all possible byte values. Any two bytes belong to the same | 
|  | /// equivalence class if and only if they can be used interchangeably | 
|  | /// anywhere in the DFA while never discriminating between a match and a | 
|  | /// non-match. | 
|  | /// | 
|  | /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much | 
|  | /// from using byte classes. In some cases, using byte classes can even | 
|  | /// marginally increase the size of a sparse DFA's transition table. The | 
/// reason for this is that a sparse DFA already compacts each state's
/// transitions, regardless of whether byte classes are used.
|  | ByteClass(ByteClass<T, S>), | 
|  | /// Hints that destructuring should not be exhaustive. | 
|  | /// | 
|  | /// This enum may grow additional variants, so this makes sure clients | 
|  | /// don't count on exhaustive matching. (Otherwise, adding a new variant | 
|  | /// could break existing code.) | 
|  | #[doc(hidden)] | 
|  | __Nonexhaustive, | 
|  | } | 
|  |  | 
|  | #[cfg(feature = "std")] | 
|  | impl SparseDFA<Vec<u8>, usize> { | 
|  | /// Parse the given regular expression using a default configuration and | 
|  | /// return the corresponding sparse DFA. | 
|  | /// | 
|  | /// The default configuration uses `usize` for state IDs and reduces the | 
|  | /// alphabet size by splitting bytes into equivalence classes. The | 
|  | /// resulting DFA is *not* minimized. | 
|  | /// | 
|  | /// If you want a non-default configuration, then use the | 
|  | /// [`dense::Builder`](dense/struct.Builder.html) | 
|  | /// to set your own configuration, and then call | 
|  | /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse) | 
/// to create a sparse DFA, as shown in the second example below.
|  | /// | 
|  | /// # Example | 
|  | /// | 
|  | /// ``` | 
|  | /// use regex_automata::{DFA, SparseDFA}; | 
|  | /// | 
|  | /// # fn example() -> Result<(), regex_automata::Error> { | 
|  | /// let dfa = SparseDFA::new("foo[0-9]+bar")?; | 
|  | /// assert_eq!(Some(11), dfa.find(b"foo12345bar")); | 
|  | /// # Ok(()) }; example().unwrap() | 
|  | /// ``` | 
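///
/// And here is a sketch of the non-default path described above, which goes
/// through [`dense::Builder`](dense/struct.Builder.html) first. (This assumes
/// the builder's `anchored` option; any other builder option would be used
/// the same way.)
///
/// ```
/// use regex_automata::{dense, DFA};
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// let dense_dfa = dense::Builder::new().anchored(true).build("foo[0-9]+")?;
/// let sparse = dense_dfa.to_sparse()?;
/// assert!(sparse.is_anchored());
/// assert_eq!(Some(8), sparse.find(b"foo12345"));
/// // Since the DFA is anchored, a match must start at the beginning of the
/// // input.
/// assert_eq!(None, sparse.find(b"quux foo12345"));
/// # Ok(()) }; example().unwrap()
/// ```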
|  | pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> { | 
|  | dense::Builder::new() | 
|  | .build(pattern) | 
|  | .and_then(|dense| dense.to_sparse()) | 
|  | } | 
|  | } | 
|  |  | 
|  | #[cfg(feature = "std")] | 
|  | impl<S: StateID> SparseDFA<Vec<u8>, S> { | 
|  | /// Create a new empty sparse DFA that never matches any input. | 
|  | /// | 
|  | /// # Example | 
|  | /// | 
|  | /// In order to build an empty DFA, callers must provide a type hint | 
|  | /// indicating their choice of state identifier representation. | 
|  | /// | 
|  | /// ``` | 
|  | /// use regex_automata::{DFA, SparseDFA}; | 
|  | /// | 
|  | /// # fn example() -> Result<(), regex_automata::Error> { | 
|  | /// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty(); | 
|  | /// assert_eq!(None, dfa.find(b"")); | 
|  | /// assert_eq!(None, dfa.find(b"foo")); | 
|  | /// # Ok(()) }; example().unwrap() | 
|  | /// ``` | 
|  | pub fn empty() -> SparseDFA<Vec<u8>, S> { | 
|  | dense::DenseDFA::empty().to_sparse().unwrap() | 
|  | } | 
|  |  | 
|  | pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>( | 
|  | dfa: &dense::Repr<T, S>, | 
|  | ) -> Result<SparseDFA<Vec<u8>, A>> { | 
|  | Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa()) | 
|  | } | 
|  | } | 
|  |  | 
|  | impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { | 
|  | /// Cheaply return a borrowed version of this sparse DFA. Specifically, the | 
|  | /// DFA returned always uses `&[u8]` for its transition table while keeping | 
|  | /// the same state identifier representation. | 
|  | pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> { | 
|  | match *self { | 
|  | SparseDFA::Standard(Standard(ref r)) => { | 
|  | SparseDFA::Standard(Standard(r.as_ref())) | 
|  | } | 
|  | SparseDFA::ByteClass(ByteClass(ref r)) => { | 
|  | SparseDFA::ByteClass(ByteClass(r.as_ref())) | 
|  | } | 
|  | SparseDFA::__Nonexhaustive => unreachable!(), | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Return an owned version of this sparse DFA. Specifically, the DFA | 
|  | /// returned always uses `Vec<u8>` for its transition table while keeping | 
|  | /// the same state identifier representation. | 
|  | /// | 
|  | /// Effectively, this returns a sparse DFA whose transition table lives | 
|  | /// on the heap. | 
|  | #[cfg(feature = "std")] | 
|  | pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> { | 
|  | match *self { | 
|  | SparseDFA::Standard(Standard(ref r)) => { | 
|  | SparseDFA::Standard(Standard(r.to_owned())) | 
|  | } | 
|  | SparseDFA::ByteClass(ByteClass(ref r)) => { | 
|  | SparseDFA::ByteClass(ByteClass(r.to_owned())) | 
|  | } | 
|  | SparseDFA::__Nonexhaustive => unreachable!(), | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Returns the memory usage, in bytes, of this DFA. | 
|  | /// | 
|  | /// The memory usage is computed based on the number of bytes used to | 
|  | /// represent this DFA's transition table. This typically corresponds to | 
|  | /// heap memory usage. | 
|  | /// | 
|  | /// This does **not** include the stack size used up by this DFA. To | 
/// compute that, use `std::mem::size_of::<SparseDFA>()`.
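///
/// # Example
///
/// A rough sketch of inspecting heap usage (the exact number depends on the
/// pattern and on the state identifier representation):
///
/// ```
/// use regex_automata::SparseDFA;
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// let dfa = SparseDFA::new("foo[0-9]+")?;
/// // Even a tiny DFA stores its transition table on the heap.
/// assert!(dfa.memory_usage() > 0);
/// # Ok(()) }; example().unwrap()
/// ```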
|  | pub fn memory_usage(&self) -> usize { | 
|  | self.repr().memory_usage() | 
|  | } | 
|  |  | 
|  | fn repr(&self) -> &Repr<T, S> { | 
|  | match *self { | 
|  | SparseDFA::Standard(ref r) => &r.0, | 
|  | SparseDFA::ByteClass(ref r) => &r.0, | 
|  | SparseDFA::__Nonexhaustive => unreachable!(), | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Routines for converting a sparse DFA to other representations, such as | 
|  | /// smaller state identifiers or raw bytes suitable for persistent storage. | 
|  | #[cfg(feature = "std")] | 
|  | impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { | 
|  | /// Create a new sparse DFA whose match semantics are equivalent to | 
|  | /// this DFA, but attempt to use `u8` for the representation of state | 
|  | /// identifiers. If `u8` is insufficient to represent all state identifiers | 
|  | /// in this DFA, then this returns an error. | 
|  | /// | 
|  | /// This is a convenience routine for `to_sized::<u8>()`. | 
|  | pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> { | 
|  | self.to_sized() | 
|  | } | 
|  |  | 
|  | /// Create a new sparse DFA whose match semantics are equivalent to | 
|  | /// this DFA, but attempt to use `u16` for the representation of state | 
|  | /// identifiers. If `u16` is insufficient to represent all state | 
|  | /// identifiers in this DFA, then this returns an error. | 
|  | /// | 
|  | /// This is a convenience routine for `to_sized::<u16>()`. | 
|  | pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> { | 
|  | self.to_sized() | 
|  | } | 
|  |  | 
|  | /// Create a new sparse DFA whose match semantics are equivalent to | 
|  | /// this DFA, but attempt to use `u32` for the representation of state | 
|  | /// identifiers. If `u32` is insufficient to represent all state | 
|  | /// identifiers in this DFA, then this returns an error. | 
|  | /// | 
|  | /// This is a convenience routine for `to_sized::<u32>()`. | 
|  | #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] | 
|  | pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> { | 
|  | self.to_sized() | 
|  | } | 
|  |  | 
|  | /// Create a new sparse DFA whose match semantics are equivalent to | 
|  | /// this DFA, but attempt to use `u64` for the representation of state | 
|  | /// identifiers. If `u64` is insufficient to represent all state | 
|  | /// identifiers in this DFA, then this returns an error. | 
|  | /// | 
|  | /// This is a convenience routine for `to_sized::<u64>()`. | 
|  | #[cfg(target_pointer_width = "64")] | 
|  | pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> { | 
|  | self.to_sized() | 
|  | } | 
|  |  | 
|  | /// Create a new sparse DFA whose match semantics are equivalent to | 
|  | /// this DFA, but attempt to use `A` for the representation of state | 
|  | /// identifiers. If `A` is insufficient to represent all state identifiers | 
|  | /// in this DFA, then this returns an error. | 
|  | /// | 
|  | /// An alternative way to construct such a DFA is to use | 
|  | /// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized). | 
|  | /// In general, picking the appropriate size upon initial construction of | 
|  | /// a sparse DFA is preferred, since it will do the conversion in one | 
|  | /// step instead of two. | 
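///
/// # Example
///
/// This is a sketch that assumes the DFA built from this pattern has few
/// enough states for its identifiers to fit in a `u16`:
///
/// ```
/// use regex_automata::{DFA, SparseDFA};
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// let dfa = SparseDFA::new("foo[0-9]+")?;
/// let small = dfa.to_sized::<u16>()?;
/// assert_eq!(Some(8), small.find(b"foo12345"));
/// # Ok(()) }; example().unwrap()
/// ```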
|  | pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> { | 
|  | self.repr().to_sized().map(|r| r.into_sparse_dfa()) | 
|  | } | 
|  |  | 
|  | /// Serialize a sparse DFA to raw bytes in little endian format. | 
|  | /// | 
|  | /// If the state identifier representation of this DFA has a size different | 
|  | /// than 1, 2, 4 or 8 bytes, then this returns an error. All | 
|  | /// implementations of `StateID` provided by this crate satisfy this | 
|  | /// requirement. | 
|  | pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> { | 
|  | self.repr().to_bytes::<LittleEndian>() | 
|  | } | 
|  |  | 
|  | /// Serialize a sparse DFA to raw bytes in big endian format. | 
|  | /// | 
|  | /// If the state identifier representation of this DFA has a size different | 
|  | /// than 1, 2, 4 or 8 bytes, then this returns an error. All | 
|  | /// implementations of `StateID` provided by this crate satisfy this | 
|  | /// requirement. | 
|  | pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> { | 
|  | self.repr().to_bytes::<BigEndian>() | 
|  | } | 
|  |  | 
/// Serialize a sparse DFA to raw bytes in native endian format.
///
/// Generally, it is better to pick an explicit endianness using either
|  | /// `to_bytes_little_endian` or `to_bytes_big_endian`. This routine is | 
|  | /// useful in tests where the DFA is serialized and deserialized on the | 
|  | /// same platform. | 
|  | /// | 
|  | /// If the state identifier representation of this DFA has a size different | 
|  | /// than 1, 2, 4 or 8 bytes, then this returns an error. All | 
|  | /// implementations of `StateID` provided by this crate satisfy this | 
|  | /// requirement. | 
|  | pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> { | 
|  | self.repr().to_bytes::<NativeEndian>() | 
|  | } | 
|  | } | 
|  |  | 
|  | impl<'a, S: StateID> SparseDFA<&'a [u8], S> { | 
|  | /// Deserialize a sparse DFA with a specific state identifier | 
|  | /// representation. | 
|  | /// | 
|  | /// Deserializing a DFA using this routine will never allocate heap memory. | 
|  | /// This is also guaranteed to be a constant time operation that does not | 
|  | /// vary with the size of the DFA. | 
|  | /// | 
|  | /// The bytes given should be generated by the serialization of a DFA with | 
|  | /// either the | 
/// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian)
/// method or the
/// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian)
/// method, depending on the endianness of the machine that is
/// deserializing this DFA.
|  | /// | 
|  | /// If the state identifier representation is `usize`, then deserialization | 
|  | /// is dependent on the pointer size. For this reason, it is best to | 
|  | /// serialize DFAs using a fixed size representation for your state | 
|  | /// identifiers, such as `u8`, `u16`, `u32` or `u64`. | 
|  | /// | 
|  | /// # Panics | 
|  | /// | 
|  | /// The bytes given should be *trusted*. In particular, if the bytes | 
|  | /// are not a valid serialization of a DFA, or if the endianness of the | 
|  | /// serialized bytes is different than the endianness of the machine that | 
|  | /// is deserializing the DFA, then this routine will panic. Moreover, it | 
|  | /// is possible for this deserialization routine to succeed even if the | 
|  | /// given bytes do not represent a valid serialized sparse DFA. | 
|  | /// | 
|  | /// # Safety | 
|  | /// | 
|  | /// This routine is unsafe because it permits callers to provide an | 
|  | /// arbitrary transition table with possibly incorrect transitions. While | 
|  | /// the various serialization routines will never return an incorrect | 
|  | /// transition table, there is no guarantee that the bytes provided here | 
|  | /// are correct. While deserialization does many checks (as documented | 
|  | /// above in the panic conditions), this routine does not check that the | 
|  | /// transition table is correct. Given an incorrect transition table, it is | 
|  | /// possible for the search routines to access out-of-bounds memory because | 
|  | /// of explicit bounds check elision. | 
|  | /// | 
|  | /// # Example | 
|  | /// | 
|  | /// This example shows how to serialize a DFA to raw bytes, deserialize it | 
/// and then use it for searching. Note that we first convert the DFA to
/// use `u16` for its state identifier representation before serializing
|  | /// it. While this isn't strictly necessary, it's good practice in order to | 
|  | /// decrease the size of the DFA and to avoid platform specific pitfalls | 
|  | /// such as differing pointer sizes. | 
|  | /// | 
|  | /// ``` | 
|  | /// use regex_automata::{DFA, DenseDFA, SparseDFA}; | 
|  | /// | 
|  | /// # fn example() -> Result<(), regex_automata::Error> { | 
|  | /// let sparse = SparseDFA::new("foo[0-9]+")?; | 
|  | /// let bytes = sparse.to_u16()?.to_bytes_native_endian()?; | 
|  | /// | 
|  | /// let dfa: SparseDFA<&[u8], u16> = unsafe { | 
|  | ///     SparseDFA::from_bytes(&bytes) | 
|  | /// }; | 
|  | /// | 
|  | /// assert_eq!(Some(8), dfa.find(b"foo12345")); | 
|  | /// # Ok(()) }; example().unwrap() | 
|  | /// ``` | 
|  | pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> { | 
|  | Repr::from_bytes(buf).into_sparse_dfa() | 
|  | } | 
|  | } | 
|  |  | 
|  | impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> { | 
|  | type ID = S; | 
|  |  | 
|  | #[inline] | 
|  | fn start_state(&self) -> S { | 
|  | self.repr().start_state() | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_match_state(&self, id: S) -> bool { | 
|  | self.repr().is_match_state(id) | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_dead_state(&self, id: S) -> bool { | 
|  | self.repr().is_dead_state(id) | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_match_or_dead_state(&self, id: S) -> bool { | 
|  | self.repr().is_match_or_dead_state(id) | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_anchored(&self) -> bool { | 
|  | self.repr().is_anchored() | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn next_state(&self, current: S, input: u8) -> S { | 
|  | match *self { | 
|  | SparseDFA::Standard(ref r) => r.next_state(current, input), | 
|  | SparseDFA::ByteClass(ref r) => r.next_state(current, input), | 
|  | SparseDFA::__Nonexhaustive => unreachable!(), | 
|  | } | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { | 
|  | self.next_state(current, input) | 
|  | } | 
|  |  | 
|  | // We specialize the following methods because it lets us lift the | 
|  | // case analysis between the different types of sparse DFAs. Instead of | 
|  | // doing the case analysis for every transition, we do it once before | 
|  | // searching. For sparse DFAs, this doesn't seem to benefit performance as | 
|  | // much as it does for the dense DFAs, but it's easy to do so we might as | 
|  | // well do it. | 
|  |  | 
|  | #[inline] | 
|  | fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { | 
|  | match *self { | 
|  | SparseDFA::Standard(ref r) => r.is_match_at(bytes, start), | 
|  | SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start), | 
|  | SparseDFA::__Nonexhaustive => unreachable!(), | 
|  | } | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { | 
|  | match *self { | 
|  | SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start), | 
|  | SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start), | 
|  | SparseDFA::__Nonexhaustive => unreachable!(), | 
|  | } | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { | 
|  | match *self { | 
|  | SparseDFA::Standard(ref r) => r.find_at(bytes, start), | 
|  | SparseDFA::ByteClass(ref r) => r.find_at(bytes, start), | 
|  | SparseDFA::__Nonexhaustive => unreachable!(), | 
|  | } | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { | 
|  | match *self { | 
|  | SparseDFA::Standard(ref r) => r.rfind_at(bytes, start), | 
|  | SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start), | 
|  | SparseDFA::__Nonexhaustive => unreachable!(), | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
/// A standard sparse DFA that does not use byte classes.
|  | /// | 
|  | /// Generally, it isn't necessary to use this type directly, since a | 
|  | /// `SparseDFA` can be used for searching directly. One possible reason why | 
|  | /// one might want to use this type directly is if you are implementing your | 
|  | /// own search routines by walking a DFA's transitions directly. In that case, | 
|  | /// you'll want to use this type (or any of the other DFA variant types) | 
|  | /// directly, since they implement `next_state` more efficiently. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>); | 
|  |  | 
|  | impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> { | 
|  | type ID = S; | 
|  |  | 
|  | #[inline] | 
|  | fn start_state(&self) -> S { | 
|  | self.0.start_state() | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_match_state(&self, id: S) -> bool { | 
|  | self.0.is_match_state(id) | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_dead_state(&self, id: S) -> bool { | 
|  | self.0.is_dead_state(id) | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_match_or_dead_state(&self, id: S) -> bool { | 
|  | self.0.is_match_or_dead_state(id) | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_anchored(&self) -> bool { | 
|  | self.0.is_anchored() | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn next_state(&self, current: S, input: u8) -> S { | 
|  | self.0.state(current).next(input) | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { | 
|  | self.next_state(current, input) | 
|  | } | 
|  | } | 
|  |  | 
|  | /// A sparse DFA that shrinks its alphabet. | 
|  | /// | 
|  | /// Alphabet shrinking is achieved by using a set of equivalence classes | 
|  | /// instead of using all possible byte values. Any two bytes belong to the same | 
|  | /// equivalence class if and only if they can be used interchangeably anywhere | 
|  | /// in the DFA while never discriminating between a match and a non-match. | 
|  | /// | 
|  | /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from | 
|  | /// using byte classes. In some cases, using byte classes can even marginally | 
|  | /// increase the size of a sparse DFA's transition table. The reason for this | 
/// is that a sparse DFA already compacts each state's transitions, regardless
/// of whether byte classes are used.
|  | /// | 
|  | /// Generally, it isn't necessary to use this type directly, since a | 
|  | /// `SparseDFA` can be used for searching directly. One possible reason why | 
|  | /// one might want to use this type directly is if you are implementing your | 
|  | /// own search routines by walking a DFA's transitions directly. In that case, | 
|  | /// you'll want to use this type (or any of the other DFA variant types) | 
|  | /// directly, since they implement `next_state` more efficiently. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>); | 
|  |  | 
|  | impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> { | 
|  | type ID = S; | 
|  |  | 
|  | #[inline] | 
|  | fn start_state(&self) -> S { | 
|  | self.0.start_state() | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_match_state(&self, id: S) -> bool { | 
|  | self.0.is_match_state(id) | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_dead_state(&self, id: S) -> bool { | 
|  | self.0.is_dead_state(id) | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_match_or_dead_state(&self, id: S) -> bool { | 
|  | self.0.is_match_or_dead_state(id) | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn is_anchored(&self) -> bool { | 
|  | self.0.is_anchored() | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | fn next_state(&self, current: S, input: u8) -> S { | 
|  | let input = self.0.byte_classes.get(input); | 
|  | self.0.state(current).next(input) | 
|  | } | 
|  |  | 
|  | #[inline] | 
|  | unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { | 
|  | self.next_state(current, input) | 
|  | } | 
|  | } | 
|  |  | 
|  | /// The underlying representation of a sparse DFA. This is shared by all of | 
|  | /// the different variants of a sparse DFA. | 
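///
/// The transition table is laid out as a contiguous sequence of encoded
/// states, where each state is encoded as follows:
///
/// * a native endian `u16` giving the number of transitions, `ntrans`;
/// * `ntrans` pairs of bytes, where each pair is the inclusive start and end
///   of an input byte range for the corresponding transition;
/// * `ntrans` state identifiers, each occupying `size_of::<S>()` bytes in
///   native endian format.
///
/// A state's identifier is the byte offset in `trans` at which its encoding
/// begins.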
|  | #[derive(Clone)] | 
|  | #[cfg_attr(not(feature = "std"), derive(Debug))] | 
|  | struct Repr<T: AsRef<[u8]>, S: StateID = usize> { | 
|  | anchored: bool, | 
|  | start: S, | 
|  | state_count: usize, | 
|  | max_match: S, | 
|  | byte_classes: ByteClasses, | 
|  | trans: T, | 
|  | } | 
|  |  | 
|  | impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> { | 
|  | fn into_sparse_dfa(self) -> SparseDFA<T, S> { | 
|  | if self.byte_classes.is_singleton() { | 
|  | SparseDFA::Standard(Standard(self)) | 
|  | } else { | 
|  | SparseDFA::ByteClass(ByteClass(self)) | 
|  | } | 
|  | } | 
|  |  | 
|  | fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> { | 
|  | Repr { | 
|  | anchored: self.anchored, | 
|  | start: self.start, | 
|  | state_count: self.state_count, | 
|  | max_match: self.max_match, | 
|  | byte_classes: self.byte_classes.clone(), | 
|  | trans: self.trans(), | 
|  | } | 
|  | } | 
|  |  | 
|  | #[cfg(feature = "std")] | 
|  | fn to_owned(&self) -> Repr<Vec<u8>, S> { | 
|  | Repr { | 
|  | anchored: self.anchored, | 
|  | start: self.start, | 
|  | state_count: self.state_count, | 
|  | max_match: self.max_match, | 
|  | byte_classes: self.byte_classes.clone(), | 
|  | trans: self.trans().to_vec(), | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Return a convenient representation of the given state. | 
|  | /// | 
|  | /// This is marked as inline because it doesn't seem to get inlined | 
|  | /// otherwise, which leads to a fairly significant performance loss (~25%). | 
|  | #[inline] | 
|  | fn state<'a>(&'a self, id: S) -> State<'a, S> { | 
|  | let mut pos = id.to_usize(); | 
|  | let ntrans = NativeEndian::read_u16(&self.trans()[pos..]) as usize; | 
|  | pos += 2; | 
|  | let input_ranges = &self.trans()[pos..pos + (ntrans * 2)]; | 
|  | pos += 2 * ntrans; | 
|  | let next = &self.trans()[pos..pos + (ntrans * size_of::<S>())]; | 
|  | State { _state_id_repr: PhantomData, ntrans, input_ranges, next } | 
|  | } | 
|  |  | 
|  | /// Return an iterator over all of the states in this DFA. | 
|  | /// | 
|  | /// The iterator returned yields tuples, where the first element is the | 
|  | /// state ID and the second element is the state itself. | 
|  | #[cfg(feature = "std")] | 
|  | fn states<'a>(&'a self) -> StateIter<'a, T, S> { | 
|  | StateIter { dfa: self, id: dead_id() } | 
|  | } | 
|  |  | 
|  | fn memory_usage(&self) -> usize { | 
|  | self.trans().len() | 
|  | } | 
|  |  | 
|  | fn start_state(&self) -> S { | 
|  | self.start | 
|  | } | 
|  |  | 
|  | fn is_match_state(&self, id: S) -> bool { | 
|  | self.is_match_or_dead_state(id) && !self.is_dead_state(id) | 
|  | } | 
|  |  | 
|  | fn is_dead_state(&self, id: S) -> bool { | 
|  | id == dead_id() | 
|  | } | 
|  |  | 
|  | fn is_match_or_dead_state(&self, id: S) -> bool { | 
|  | id <= self.max_match | 
|  | } | 
|  |  | 
|  | fn is_anchored(&self) -> bool { | 
|  | self.anchored | 
|  | } | 
|  |  | 
|  | fn trans(&self) -> &[u8] { | 
|  | self.trans.as_ref() | 
|  | } | 
|  |  | 
|  | /// Create a new sparse DFA whose match semantics are equivalent to this | 
|  | /// DFA, but attempt to use `A` for the representation of state | 
|  | /// identifiers. If `A` is insufficient to represent all state identifiers | 
|  | /// in this DFA, then this returns an error. | 
|  | #[cfg(feature = "std")] | 
|  | fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> { | 
|  | // To build the new DFA, we proceed much like the initial construction | 
|  | // of the sparse DFA. Namely, since the state ID size is changing, | 
|  | // we don't actually know all of our state IDs until we've allocated | 
|  | // all necessary space. So we do one pass that allocates all of the | 
|  | // storage we need, and then another pass to fill in the transitions. | 
|  |  | 
|  | let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count); | 
|  | let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count); | 
|  | for (old_id, state) in self.states() { | 
|  | let pos = trans.len(); | 
|  | map.insert(old_id, usize_to_state_id(pos)?); | 
|  |  | 
|  | let n = state.ntrans; | 
|  | let zeros = 2 + (n * 2) + (n * size_of::<A>()); | 
|  | trans.extend(iter::repeat(0).take(zeros)); | 
|  |  | 
|  | NativeEndian::write_u16(&mut trans[pos..], n as u16); | 
|  | let (s, e) = (pos + 2, pos + 2 + (n * 2)); | 
|  | trans[s..e].copy_from_slice(state.input_ranges); | 
|  | } | 
|  |  | 
|  | let mut new = Repr { | 
|  | anchored: self.anchored, | 
|  | start: map[&self.start], | 
|  | state_count: self.state_count, | 
|  | max_match: map[&self.max_match], | 
|  | byte_classes: self.byte_classes.clone(), | 
|  | trans, | 
|  | }; | 
|  | for (&old_id, &new_id) in map.iter() { | 
|  | let old_state = self.state(old_id); | 
|  | let mut new_state = new.state_mut(new_id); | 
|  | for i in 0..new_state.ntrans { | 
|  | let next = map[&old_state.next_at(i)]; | 
|  | new_state.set_next_at(i, usize_to_state_id(next.to_usize())?); | 
|  | } | 
|  | } | 
|  | new.start = map[&self.start]; | 
|  | new.max_match = map[&self.max_match]; | 
|  | Ok(new) | 
|  | } | 
|  |  | 
|  | /// Serialize a sparse DFA to raw bytes using the provided endianness. | 
|  | /// | 
|  | /// If the state identifier representation of this DFA has a size different | 
|  | /// than 1, 2, 4 or 8 bytes, then this returns an error. All | 
|  | /// implementations of `StateID` provided by this crate satisfy this | 
|  | /// requirement. | 
|  | /// | 
|  | /// Unlike dense DFAs, the result is not necessarily aligned since a | 
|  | /// sparse DFA's transition table is always read as a sequence of bytes. | 
|  | #[cfg(feature = "std")] | 
|  | fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> { | 
|  | let label = b"rust-regex-automata-sparse-dfa\x00"; | 
|  | let size = | 
|  | // For human readable label. | 
|  | label.len() | 
// For endianness check. Must be equal to 0xFEFF when read in native endian.
|  | + 2 | 
|  | // For version number. | 
|  | + 2 | 
|  | // Size of state ID representation, in bytes. | 
|  | // Must be 1, 2, 4 or 8. | 
|  | + 2 | 
|  | // For DFA misc options. (Currently unused.) | 
|  | + 2 | 
|  | // For start state. | 
|  | + 8 | 
|  | // For state count. | 
|  | + 8 | 
|  | // For max match state. | 
|  | + 8 | 
|  | // For byte class map. | 
|  | + 256 | 
|  | // For transition table. | 
|  | + self.trans().len(); | 
|  |  | 
|  | let mut i = 0; | 
|  | let mut buf = vec![0; size]; | 
|  |  | 
|  | // write label | 
|  | for &b in label { | 
|  | buf[i] = b; | 
|  | i += 1; | 
|  | } | 
|  | // endianness check | 
|  | A::write_u16(&mut buf[i..], 0xFEFF); | 
|  | i += 2; | 
|  | // version number | 
|  | A::write_u16(&mut buf[i..], 1); | 
|  | i += 2; | 
|  | // size of state ID | 
|  | let state_size = size_of::<S>(); | 
|  | if ![1, 2, 4, 8].contains(&state_size) { | 
|  | return Err(Error::serialize(&format!( | 
|  | "state size of {} not supported, must be 1, 2, 4 or 8", | 
|  | state_size | 
|  | ))); | 
|  | } | 
|  | A::write_u16(&mut buf[i..], state_size as u16); | 
|  | i += 2; | 
|  | // DFA misc options | 
|  | let mut options = 0u16; | 
|  | if self.anchored { | 
|  | options |= dense::MASK_ANCHORED; | 
|  | } | 
|  | A::write_u16(&mut buf[i..], options); | 
|  | i += 2; | 
|  | // start state | 
|  | A::write_u64(&mut buf[i..], self.start.to_usize() as u64); | 
|  | i += 8; | 
|  | // state count | 
|  | A::write_u64(&mut buf[i..], self.state_count as u64); | 
|  | i += 8; | 
|  | // max match state | 
|  | A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64); | 
|  | i += 8; | 
|  | // byte class map | 
|  | for b in (0..256).map(|b| b as u8) { | 
|  | buf[i] = self.byte_classes.get(b); | 
|  | i += 1; | 
|  | } | 
|  | // transition table | 
|  | for (_, state) in self.states() { | 
|  | A::write_u16(&mut buf[i..], state.ntrans as u16); | 
|  | i += 2; | 
|  | buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges); | 
|  | i += state.ntrans * 2; | 
|  | for j in 0..state.ntrans { | 
|  | write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j)); | 
|  | i += size_of::<S>(); | 
|  | } | 
|  | } | 
|  |  | 
|  | assert_eq!(size, i, "expected to consume entire buffer"); | 
|  |  | 
|  | Ok(buf) | 
|  | } | 
|  | } | 
|  |  | 
|  | impl<'a, S: StateID> Repr<&'a [u8], S> { | 
|  | /// The implementation for deserializing a sparse DFA from raw bytes. | 
|  | unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> { | 
|  | // skip over label | 
|  | match buf.iter().position(|&b| b == b'\x00') { | 
|  | None => panic!("could not find label"), | 
|  | Some(i) => buf = &buf[i + 1..], | 
|  | } | 
|  |  | 
|  | // check that current endianness is same as endianness of DFA | 
|  | let endian_check = NativeEndian::read_u16(buf); | 
|  | buf = &buf[2..]; | 
|  | if endian_check != 0xFEFF { | 
|  | panic!( | 
|  | "endianness mismatch, expected 0xFEFF but got 0x{:X}. \ | 
|  | are you trying to load a SparseDFA serialized with a \ | 
|  | different endianness?", | 
|  | endian_check, | 
|  | ); | 
|  | } | 
|  |  | 
|  | // check that the version number is supported | 
|  | let version = NativeEndian::read_u16(buf); | 
|  | buf = &buf[2..]; | 
|  | if version != 1 { | 
|  | panic!( | 
|  | "expected version 1, but found unsupported version {}", | 
|  | version, | 
|  | ); | 
|  | } | 
|  |  | 
|  | // read size of state | 
|  | let state_size = NativeEndian::read_u16(buf) as usize; | 
|  | if state_size != size_of::<S>() { | 
|  | panic!( | 
|  | "state size of SparseDFA ({}) does not match \ | 
|  | requested state size ({})", | 
|  | state_size, | 
|  | size_of::<S>(), | 
|  | ); | 
|  | } | 
|  | buf = &buf[2..]; | 
|  |  | 
|  | // read miscellaneous options | 
|  | let opts = NativeEndian::read_u16(buf); | 
|  | buf = &buf[2..]; | 
|  |  | 
|  | // read start state | 
|  | let start = S::from_usize(NativeEndian::read_u64(buf) as usize); | 
|  | buf = &buf[8..]; | 
|  |  | 
|  | // read state count | 
|  | let state_count = NativeEndian::read_u64(buf) as usize; | 
|  | buf = &buf[8..]; | 
|  |  | 
|  | // read max match state | 
|  | let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize); | 
|  | buf = &buf[8..]; | 
|  |  | 
|  | // read byte classes | 
|  | let byte_classes = ByteClasses::from_slice(&buf[..256]); | 
|  | buf = &buf[256..]; | 
|  |  | 
|  | Repr { | 
|  | anchored: opts & dense::MASK_ANCHORED > 0, | 
|  | start, | 
|  | state_count, | 
|  | max_match, | 
|  | byte_classes, | 
|  | trans: buf, | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | #[cfg(feature = "std")] | 
|  | impl<S: StateID> Repr<Vec<u8>, S> { | 
|  | /// The implementation for constructing a sparse DFA from a dense DFA. | 
|  | fn from_dense_sized<T: AsRef<[S]>, A: StateID>( | 
|  | dfa: &dense::Repr<T, S>, | 
|  | ) -> Result<Repr<Vec<u8>, A>> { | 
|  | // In order to build the transition table, we need to be able to write | 
|  | // state identifiers for each of the "next" transitions in each state. | 
|  | // Our state identifiers correspond to the byte offset in the | 
|  | // transition table at which the state is encoded. Therefore, we do not | 
|  | // actually know what the state identifiers are until we've allocated | 
|  | // exactly as much space as we need for each state. Thus, construction | 
|  | // of the transition table happens in two passes. | 
|  | // | 
|  | // In the first pass, we fill out the shell of each state, which | 
|  | // includes the transition count, the input byte ranges and zero-filled | 
|  | // space for the transitions. In this first pass, we also build up a | 
|  | // map from the state identifier index of the dense DFA to the state | 
|  | // identifier in this sparse DFA. | 
|  | // | 
|  | // In the second pass, we fill in the transitions based on the map | 
|  | // built in the first pass. | 
|  |  | 
|  | let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count()); | 
|  | let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()]; | 
|  | for (old_id, state) in dfa.states() { | 
|  | let pos = trans.len(); | 
|  |  | 
|  | remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?; | 
|  | // zero-filled space for the transition count | 
|  | trans.push(0); | 
|  | trans.push(0); | 
|  |  | 
|  | let mut trans_count = 0; | 
|  | for (b1, b2, _) in state.sparse_transitions() { | 
|  | trans_count += 1; | 
|  | trans.push(b1); | 
|  | trans.push(b2); | 
|  | } | 
|  | // fill in the transition count | 
|  | NativeEndian::write_u16(&mut trans[pos..], trans_count); | 
|  |  | 
|  | // zero-fill the actual transitions | 
|  | let zeros = trans_count as usize * size_of::<A>(); | 
|  | trans.extend(iter::repeat(0).take(zeros)); | 
|  | } | 
|  |  | 
|  | let mut new = Repr { | 
|  | anchored: dfa.is_anchored(), | 
|  | start: remap[dfa.state_id_to_index(dfa.start_state())], | 
|  | state_count: dfa.state_count(), | 
|  | max_match: remap[dfa.state_id_to_index(dfa.max_match_state())], | 
|  | byte_classes: dfa.byte_classes().clone(), | 
|  | trans, | 
|  | }; | 
|  | for (old_id, old_state) in dfa.states() { | 
|  | let new_id = remap[dfa.state_id_to_index(old_id)]; | 
|  | let mut new_state = new.state_mut(new_id); | 
|  | let sparse = old_state.sparse_transitions(); | 
|  | for (i, (_, _, next)) in sparse.enumerate() { | 
|  | let next = remap[dfa.state_id_to_index(next)]; | 
|  | new_state.set_next_at(i, next); | 
|  | } | 
|  | } | 
|  | Ok(new) | 
|  | } | 
|  |  | 
|  | /// Return a convenient mutable representation of the given state. | 
|  | fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> { | 
|  | let mut pos = id.to_usize(); | 
|  | let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize; | 
|  | pos += 2; | 
|  |  | 
|  | let size = (ntrans * 2) + (ntrans * size_of::<S>()); | 
|  | let ranges_and_next = &mut self.trans[pos..pos + size]; | 
|  | let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2); | 
|  | StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next } | 
|  | } | 
|  | } | 
|  |  | 
|  | #[cfg(feature = "std")] | 
|  | impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> { | 
|  | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | 
|  | fn state_status<T: AsRef<[u8]>, S: StateID>( | 
|  | dfa: &Repr<T, S>, | 
|  | id: S, | 
|  | ) -> &'static str { | 
|  | if id == dead_id() { | 
|  | if dfa.is_match_state(id) { | 
|  | "D*" | 
|  | } else { | 
|  | "D " | 
|  | } | 
|  | } else if id == dfa.start_state() { | 
|  | if dfa.is_match_state(id) { | 
|  | ">*" | 
|  | } else { | 
|  | "> " | 
|  | } | 
} else if dfa.is_match_state(id) {
    " *"
} else {
    "  "
}
|  | } | 
|  |  | 
|  | writeln!(f, "SparseDFA(")?; | 
|  | for (id, state) in self.states() { | 
|  | let status = state_status(self, id); | 
|  | writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?; | 
|  | } | 
|  | writeln!(f, ")")?; | 
|  | Ok(()) | 
|  | } | 
|  | } | 
|  |  | 
|  | /// An iterator over all states in a sparse DFA. | 
|  | /// | 
|  | /// This iterator yields tuples, where the first element is the state ID and | 
|  | /// the second element is the state itself. | 
|  | #[cfg(feature = "std")] | 
|  | #[derive(Debug)] | 
|  | struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> { | 
|  | dfa: &'a Repr<T, S>, | 
|  | id: S, | 
|  | } | 
|  |  | 
|  | #[cfg(feature = "std")] | 
|  | impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> { | 
|  | type Item = (S, State<'a, S>); | 
|  |  | 
|  | fn next(&mut self) -> Option<(S, State<'a, S>)> { | 
|  | if self.id.to_usize() >= self.dfa.trans().len() { | 
|  | return None; | 
|  | } | 
|  | let id = self.id; | 
|  | let state = self.dfa.state(id); | 
|  | self.id = S::from_usize(self.id.to_usize() + state.bytes()); | 
|  | Some((id, state)) | 
|  | } | 
|  | } | 
|  |  | 
|  | /// A representation of a sparse DFA state that can be cheaply materialized | 
|  | /// from a state identifier. | 
|  | #[derive(Clone)] | 
|  | struct State<'a, S: StateID = usize> { | 
|  | /// The state identifier representation used by the DFA from which this | 
|  | /// state was extracted. Since our transition table is compacted in a | 
|  | /// &[u8], we don't actually use the state ID type parameter explicitly | 
|  | /// anywhere, so we fake it. This prevents callers from using an incorrect | 
|  | /// state ID representation to read from this state. | 
|  | _state_id_repr: PhantomData<S>, | 
|  | /// The number of transitions in this state. | 
|  | ntrans: usize, | 
|  | /// Pairs of input ranges, where there is one pair for each transition. | 
|  | /// Each pair specifies an inclusive start and end byte range for the | 
|  | /// corresponding transition. | 
|  | input_ranges: &'a [u8], | 
|  | /// Transitions to the next state. This slice contains native endian | 
|  | /// encoded state identifiers, with `S` as the representation. Thus, there | 
|  | /// are `ntrans * size_of::<S>()` bytes in this slice. | 
|  | next: &'a [u8], | 
|  | } | 
|  |  | 
|  | impl<'a, S: StateID> State<'a, S> { | 
|  | /// Searches for the next transition given an input byte. If no such | 
|  | /// transition could be found, then a dead state is returned. | 
|  | fn next(&self, input: u8) -> S { | 
|  | // This straight linear search was observed to be much better than | 
|  | // binary search on ASCII haystacks, likely because a binary search | 
|  | // visits the ASCII case last but a linear search sees it first. A | 
|  | // binary search does do a little better on non-ASCII haystacks, but | 
|  | // not by much. There might be a better trade off lurking here. | 
|  | for i in 0..self.ntrans { | 
|  | let (start, end) = self.range(i); | 
|  | if start <= input && input <= end { | 
|  | return self.next_at(i); | 
|  | } | 
// We could bail early with an extra branch: if input < start, then
|  | // we know we'll never find a matching transition. Interestingly, | 
|  | // this extra branch seems to not help performance, or will even | 
|  | // hurt it. It's likely very dependent on the DFA itself and what | 
|  | // is being searched. | 
|  | } | 
|  | dead_id() | 
|  | } | 
|  |  | 
|  | /// Returns the inclusive input byte range for the ith transition in this | 
|  | /// state. | 
|  | fn range(&self, i: usize) -> (u8, u8) { | 
|  | (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1]) | 
|  | } | 
|  |  | 
|  | /// Returns the next state for the ith transition in this state. | 
|  | fn next_at(&self, i: usize) -> S { | 
|  | S::read_bytes(&self.next[i * size_of::<S>()..]) | 
|  | } | 
|  |  | 
|  | /// Return the total number of bytes that this state consumes in its | 
|  | /// encoded form. | 
|  | #[cfg(feature = "std")] | 
|  | fn bytes(&self) -> usize { | 
|  | 2 + (self.ntrans * 2) + (self.ntrans * size_of::<S>()) | 
|  | } | 
|  | } | 
|  |  | 
|  | #[cfg(feature = "std")] | 
|  | impl<'a, S: StateID> fmt::Debug for State<'a, S> { | 
|  | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | 
|  | let mut transitions = vec![]; | 
|  | for i in 0..self.ntrans { | 
|  | let next = self.next_at(i); | 
|  | if next == dead_id() { | 
|  | continue; | 
|  | } | 
|  |  | 
|  | let (start, end) = self.range(i); | 
|  | if start == end { | 
|  | transitions.push(format!( | 
|  | "{} => {}", | 
|  | escape(start), | 
|  | next.to_usize() | 
|  | )); | 
|  | } else { | 
|  | transitions.push(format!( | 
|  | "{}-{} => {}", | 
|  | escape(start), | 
|  | escape(end), | 
|  | next.to_usize(), | 
|  | )); | 
|  | } | 
|  | } | 
|  | write!(f, "{}", transitions.join(", ")) | 
|  | } | 
|  | } | 
|  |  | 
|  | /// A representation of a mutable sparse DFA state that can be cheaply | 
|  | /// materialized from a state identifier. | 
|  | #[cfg(feature = "std")] | 
|  | struct StateMut<'a, S: StateID = usize> { | 
|  | /// The state identifier representation used by the DFA from which this | 
|  | /// state was extracted. Since our transition table is compacted in a | 
|  | /// &[u8], we don't actually use the state ID type parameter explicitly | 
|  | /// anywhere, so we fake it. This prevents callers from using an incorrect | 
|  | /// state ID representation to read from this state. | 
|  | _state_id_repr: PhantomData<S>, | 
|  | /// The number of transitions in this state. | 
|  | ntrans: usize, | 
|  | /// Pairs of input ranges, where there is one pair for each transition. | 
|  | /// Each pair specifies an inclusive start and end byte range for the | 
|  | /// corresponding transition. | 
|  | input_ranges: &'a mut [u8], | 
|  | /// Transitions to the next state. This slice contains native endian | 
|  | /// encoded state identifiers, with `S` as the representation. Thus, there | 
|  | /// are `ntrans * size_of::<S>()` bytes in this slice. | 
|  | next: &'a mut [u8], | 
|  | } | 
|  |  | 
|  | #[cfg(feature = "std")] | 
|  | impl<'a, S: StateID> StateMut<'a, S> { | 
|  | /// Sets the ith transition to the given state. | 
|  | fn set_next_at(&mut self, i: usize, next: S) { | 
|  | next.write_bytes(&mut self.next[i * size_of::<S>()..]); | 
|  | } | 
|  | } | 
|  |  | 
|  | #[cfg(feature = "std")] | 
|  | impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> { | 
|  | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | 
|  | let state = State { | 
|  | _state_id_repr: self._state_id_repr, | 
|  | ntrans: self.ntrans, | 
|  | input_ranges: self.input_ranges, | 
|  | next: self.next, | 
|  | }; | 
|  | fmt::Debug::fmt(&state, f) | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Return the given byte as its escaped string form. | 
|  | #[cfg(feature = "std")] | 
|  | fn escape(b: u8) -> String { | 
|  | use std::ascii; | 
|  |  | 
|  | String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap() | 
|  | } | 
|  |  | 
|  | /// A binary search routine specialized specifically to a sparse DFA state's | 
|  | /// transitions. Specifically, the transitions are defined as a set of pairs | 
|  | /// of input bytes that delineate an inclusive range of bytes. If the input | 
|  | /// byte is in the range, then the corresponding transition is a match. | 
|  | /// | 
|  | /// This binary search accepts a slice of these pairs and returns the position | 
|  | /// of the matching pair (the ith transition), or None if no matching pair | 
|  | /// could be found. | 
|  | /// | 
/// Note that this routine is not currently used, since it was observed either
/// to decrease performance when searching ASCII haystacks, or to provide too
/// little of a boost on non-ASCII haystacks to be worth it. However, we leave
/// it here
|  | /// for posterity in case we can find a way to use it. | 
|  | /// | 
|  | /// In theory, we could use the standard library's search routine if we could | 
|  | /// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently | 
|  | /// guaranteed to be safe and is thus UB (since I don't think the in-memory | 
|  | /// representation of `(u8, u8)` has been nailed down). | 
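///
/// For example, for a state whose transitions cover the ranges `a-c` and
/// `x-z` (encoded as `&[b'a', b'c', b'x', b'z']`), a needle of `b'y'` yields
/// `Some(1)` while a needle of `b'd'` yields `None`.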
|  | #[inline(always)] | 
|  | #[allow(dead_code)] | 
|  | fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> { | 
|  | debug_assert!(ranges.len() % 2 == 0, "ranges must have even length"); | 
|  | debug_assert!(ranges.len() <= 512, "ranges should be short"); | 
|  |  | 
|  | let (mut left, mut right) = (0, ranges.len() / 2); | 
|  | while left < right { | 
|  | let mid = (left + right) / 2; | 
|  | let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]); | 
|  | if needle < b1 { | 
|  | right = mid; | 
|  | } else if needle > b2 { | 
|  | left = mid + 1; | 
|  | } else { | 
|  | return Some(mid); | 
|  | } | 
|  | } | 
|  | None | 
|  | } |