crates/regex-automata/src/classes.rs - platform/external/rust/android-crates-io - Git at Google

 use core::fmt;

 /// A representation of byte oriented equivalence classes.
 ///
 /// This is used in a DFA to reduce the size of the transition table. This can
 /// have a particularly large impact not only on the total size of a dense DFA,
 /// but also on compile times.
 #[derive(Clone, Copy)]
 pub struct ByteClasses([u8; 256]);

 impl ByteClasses {
     /// Creates a new set of equivalence classes where all bytes are mapped to
     /// the same class.
     pub fn empty() -> ByteClasses {
         ByteClasses([0; 256])
     }

     /// Creates a new set of equivalence classes where each byte belongs to
     /// its own equivalence class.
     pub fn singletons() -> ByteClasses {
         let mut classes = ByteClasses::empty();
         for i in 0..256 {
             classes.set(i as u8, i as u8);
         }
         classes
     }

     /// Copies the byte classes given. The given slice must have length 0 or
     /// length 256. Slices of length 0 are treated as singletons (every byte
     /// is its own class).
     pub fn from_slice(slice: &[u8]) -> ByteClasses {
         assert!(slice.is_empty() || slice.len() == 256);

         if slice.is_empty() {
             ByteClasses::singletons()
         } else {
             let mut classes = ByteClasses::empty();
             for (b, &class) in slice.iter().enumerate() {
                 classes.set(b as u8, class);
             }
             classes
         }
     }

     /// Set the equivalence class for the given byte.
     #[inline]
     pub fn set(&mut self, byte: u8, class: u8) {
         self.0[byte as usize] = class;
     }

     /// Get the equivalence class for the given byte.
     #[inline]
     pub fn get(&self, byte: u8) -> u8 {
         self.0[byte as usize]
     }

     /// Get the equivalence class for the given byte while forcefully
     /// eliding bounds checks.
     #[inline]
     pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
         *self.0.get_unchecked(byte as usize)
     }

     /// Return the total number of elements in the alphabet represented by
     /// these equivalence classes. Equivalently, this returns the total number
     /// of equivalence classes.
     #[inline]
     pub fn alphabet_len(&self) -> usize {
         self.0[255] as usize + 1
     }

     /// Returns true if and only if every byte in this class maps to its own
     /// equivalence class. Equivalently, there are 256 equivalence classes
     /// and each class contains exactly one byte.
     #[inline]
     pub fn is_singleton(&self) -> bool {
         self.alphabet_len() == 256
     }

     /// Returns an iterator over a sequence of representative bytes from each
     /// equivalence class. Namely, this yields exactly N items, where N is
     /// equivalent to the number of equivalence classes. Each item is an
     /// arbitrary byte drawn from each equivalence class.
     ///
     /// This is useful when one is determinizing an NFA and the NFA's alphabet
     /// hasn't been converted to equivalence classes yet. Picking an arbitrary
     /// byte from each equivalence class then permits a full exploration of
     /// the NFA instead of using every possible byte value.
     #[cfg(feature = "std")]
     pub fn representatives(&self) -> ByteClassRepresentatives {
         ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
     }

     /// Returns all of the bytes in the given equivalence class.
     ///
     /// The second element in the tuple indicates the number of elements in
     /// the array.
     fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
         let (mut array, mut len) = ([0; 256], 0);
         for b in 0..256 {
             if self.get(b as u8) == equiv {
                 array[len] = b as u8;
                 len += 1;
             }
         }
         (array, len)
     }
 }

 impl fmt::Debug for ByteClasses {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         if self.is_singleton() {
             write!(f, "ByteClasses({{singletons}})")
         } else {
             write!(f, "ByteClasses(")?;
             for equiv in 0..self.alphabet_len() {
                 let (members, len) = self.elements(equiv as u8);
                 write!(f, "{} => {:?}", equiv, &members[..len])?;
             }
             write!(f, ")")
         }
     }
 }

 /// An iterator over representative bytes from each equivalence class.
 #[cfg(feature = "std")]
 #[derive(Debug)]
 pub struct ByteClassRepresentatives<'a> {
     classes: &'a ByteClasses,
     byte: usize,
     last_class: Option<u8>,
 }

 #[cfg(feature = "std")]
 impl<'a> Iterator for ByteClassRepresentatives<'a> {
     type Item = u8;

     fn next(&mut self) -> Option<u8> {
         while self.byte < 256 {
             let byte = self.byte as u8;
             let class = self.classes.get(byte);
             self.byte += 1;

             if self.last_class != Some(class) {
                 self.last_class = Some(class);
                 return Some(byte);
             }
         }
         None
     }
 }

 /// A byte class set keeps track of an *approximation* of equivalence classes
 /// of bytes during NFA construction. That is, every byte in an equivalence
 /// class cannot discriminate between a match and a non-match.
 ///
 /// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
 /// same equivalence class because it never matters whether an `a` or a `b` is
 /// seen, and no combination of `a`s and `b`s in the text can discriminate
 /// a match.
 ///
 /// Note though that this does not compute the minimal set of equivalence
 /// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
 /// same equivalence class for the same reason that `a` and `b` are in the
 /// same equivalence class in the aforementioned regex. However, in this
 /// implementation, `a` and `c` are put into distinct equivalence classes.
 /// The reason for this is implementation complexity. In the future, we should
 /// endeavor to compute the minimal equivalence classes since they can have a
 /// rather large impact on the size of the DFA.
 ///
 /// The representation here is 256 booleans, all initially set to false. Each
 /// boolean maps to its corresponding byte based on position. A `true` value
 /// indicates the end of an equivalence class, where its corresponding byte
 /// and all of the bytes corresponding to all previous contiguous `false`
 /// values are in the same equivalence class.
 ///
 /// This particular representation only permits contiguous ranges of bytes to
 /// be in the same equivalence class, which means that we can never discover
 /// the true minimal set of equivalence classes.
 #[cfg(feature = "std")]
 #[derive(Debug)]
 pub struct ByteClassSet(Vec<bool>);

 #[cfg(feature = "std")]
 impl ByteClassSet {
     /// Create a new set of byte classes where all bytes are part of the same
     /// equivalence class.
     pub fn new() -> Self {
         ByteClassSet(vec![false; 256])
     }

     /// Indicate the the range of byte given (inclusive) can discriminate a
     /// match between it and all other bytes outside of the range.
     pub fn set_range(&mut self, start: u8, end: u8) {
         debug_assert!(start <= end);
         if start > 0 {
             self.0[start as usize - 1] = true;
         }
         self.0[end as usize] = true;
     }

     /// Convert this boolean set to a map that maps all byte values to their
     /// corresponding equivalence class. The last mapping indicates the largest
     /// equivalence class identifier (which is never bigger than 255).
     pub fn byte_classes(&self) -> ByteClasses {
         let mut classes = ByteClasses::empty();
         let mut class = 0u8;
         let mut i = 0;
         loop {
             classes.set(i as u8, class as u8);
             if i >= 255 {
                 break;
             }
             if self.0[i] {
                 class = class.checked_add(1).unwrap();
             }
             i += 1;
         }
         classes
     }
 }

 #[cfg(test)]
 mod tests {
     #[cfg(feature = "std")]
     #[test]
     fn byte_classes() {
         use super::ByteClassSet;

         let mut set = ByteClassSet::new();
         set.set_range(b'a', b'z');

         let classes = set.byte_classes();
         assert_eq!(classes.get(0), 0);
         assert_eq!(classes.get(1), 0);
         assert_eq!(classes.get(2), 0);
         assert_eq!(classes.get(b'a' - 1), 0);
         assert_eq!(classes.get(b'a'), 1);
         assert_eq!(classes.get(b'm'), 1);
         assert_eq!(classes.get(b'z'), 1);
         assert_eq!(classes.get(b'z' + 1), 2);
         assert_eq!(classes.get(254), 2);
         assert_eq!(classes.get(255), 2);

         let mut set = ByteClassSet::new();
         set.set_range(0, 2);
         set.set_range(4, 6);
         let classes = set.byte_classes();
         assert_eq!(classes.get(0), 0);
         assert_eq!(classes.get(1), 0);
         assert_eq!(classes.get(2), 0);
         assert_eq!(classes.get(3), 1);
         assert_eq!(classes.get(4), 2);
         assert_eq!(classes.get(5), 2);
         assert_eq!(classes.get(6), 2);
         assert_eq!(classes.get(7), 3);
         assert_eq!(classes.get(255), 3);
     }

     #[cfg(feature = "std")]
     #[test]
     fn full_byte_classes() {
         use super::ByteClassSet;

         let mut set = ByteClassSet::new();
         for i in 0..256u16 {
             set.set_range(i as u8, i as u8);
         }
         assert_eq!(set.byte_classes().alphabet_len(), 256);
     }
 }
	use core::fmt;

	/// A representation of byte oriented equivalence classes.
	///
	/// This is used in a DFA to reduce the size of the transition table. This can
	/// have a particularly large impact not only on the total size of a dense DFA,
	/// but also on compile times.
	#[derive(Clone, Copy)]
	pub struct ByteClasses([u8; 256]);

	impl ByteClasses {
	/// Creates a new set of equivalence classes where all bytes are mapped to
	/// the same class.
	pub fn empty() -> ByteClasses {
	ByteClasses([0; 256])
	}

	/// Creates a new set of equivalence classes where each byte belongs to
	/// its own equivalence class.
	pub fn singletons() -> ByteClasses {
	let mut classes = ByteClasses::empty();
	for i in 0..256 {
	classes.set(i as u8, i as u8);
	}
	classes
	}

	/// Copies the byte classes given. The given slice must have length 0 or
	/// length 256. Slices of length 0 are treated as singletons (every byte
	/// is its own class).
	pub fn from_slice(slice: &[u8]) -> ByteClasses {
	assert!(slice.is_empty() \|\| slice.len() == 256);

	if slice.is_empty() {
	ByteClasses::singletons()
	} else {
	let mut classes = ByteClasses::empty();
	for (b, &class) in slice.iter().enumerate() {
	classes.set(b as u8, class);
	}
	classes
	}
	}

	/// Set the equivalence class for the given byte.
	#[inline]
	pub fn set(&mut self, byte: u8, class: u8) {
	self.0[byte as usize] = class;
	}

	/// Get the equivalence class for the given byte.
	#[inline]
	pub fn get(&self, byte: u8) -> u8 {
	self.0[byte as usize]
	}

	/// Get the equivalence class for the given byte while forcefully
	/// eliding bounds checks.
	#[inline]
	pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
	*self.0.get_unchecked(byte as usize)
	}

	/// Return the total number of elements in the alphabet represented by
	/// these equivalence classes. Equivalently, this returns the total number
	/// of equivalence classes.
	#[inline]
	pub fn alphabet_len(&self) -> usize {
	self.0[255] as usize + 1
	}

	/// Returns true if and only if every byte in this class maps to its own
	/// equivalence class. Equivalently, there are 256 equivalence classes
	/// and each class contains exactly one byte.
	#[inline]
	pub fn is_singleton(&self) -> bool {
	self.alphabet_len() == 256
	}

	/// Returns an iterator over a sequence of representative bytes from each
	/// equivalence class. Namely, this yields exactly N items, where N is
	/// equivalent to the number of equivalence classes. Each item is an
	/// arbitrary byte drawn from each equivalence class.
	///
	/// This is useful when one is determinizing an NFA and the NFA's alphabet
	/// hasn't been converted to equivalence classes yet. Picking an arbitrary
	/// byte from each equivalence class then permits a full exploration of
	/// the NFA instead of using every possible byte value.
	#[cfg(feature = "std")]
	pub fn representatives(&self) -> ByteClassRepresentatives {
	ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
	}

	/// Returns all of the bytes in the given equivalence class.
	///
	/// The second element in the tuple indicates the number of elements in
	/// the array.
	fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
	let (mut array, mut len) = ([0; 256], 0);
	for b in 0..256 {
	if self.get(b as u8) == equiv {
	array[len] = b as u8;
	len += 1;
	}
	}
	(array, len)
	}
	}

	impl fmt::Debug for ByteClasses {
	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
	if self.is_singleton() {
	write!(f, "ByteClasses({{singletons}})")
	} else {
	write!(f, "ByteClasses(")?;
	for equiv in 0..self.alphabet_len() {
	let (members, len) = self.elements(equiv as u8);
	write!(f, "{} => {:?}", equiv, &members[..len])?;
	}
	write!(f, ")")
	}
	}
	}

	/// An iterator over representative bytes from each equivalence class.
	#[cfg(feature = "std")]
	#[derive(Debug)]
	pub struct ByteClassRepresentatives<'a> {
	classes: &'a ByteClasses,
	byte: usize,
	last_class: Option<u8>,
	}

	#[cfg(feature = "std")]
	impl<'a> Iterator for ByteClassRepresentatives<'a> {
	type Item = u8;

	fn next(&mut self) -> Option<u8> {
	while self.byte < 256 {
	let byte = self.byte as u8;
	let class = self.classes.get(byte);
	self.byte += 1;

	if self.last_class != Some(class) {
	self.last_class = Some(class);
	return Some(byte);
	}
	}
	None
	}
	}

	/// A byte class set keeps track of an approximation of equivalence classes
	/// of bytes during NFA construction. That is, every byte in an equivalence
	/// class cannot discriminate between a match and a non-match.
	///
	/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
	/// same equivalence class because it never matters whether an `a` or a `b` is
	/// seen, and no combination of `a`s and `b`s in the text can discriminate
	/// a match.
	///
	/// Note though that this does not compute the minimal set of equivalence
	/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
	/// same equivalence class for the same reason that `a` and `b` are in the
	/// same equivalence class in the aforementioned regex. However, in this
	/// implementation, `a` and `c` are put into distinct equivalence classes.
	/// The reason for this is implementation complexity. In the future, we should
	/// endeavor to compute the minimal equivalence classes since they can have a
	/// rather large impact on the size of the DFA.
	///
	/// The representation here is 256 booleans, all initially set to false. Each
	/// boolean maps to its corresponding byte based on position. A `true` value
	/// indicates the end of an equivalence class, where its corresponding byte
	/// and all of the bytes corresponding to all previous contiguous `false`
	/// values are in the same equivalence class.
	///
	/// This particular representation only permits contiguous ranges of bytes to
	/// be in the same equivalence class, which means that we can never discover
	/// the true minimal set of equivalence classes.
	#[cfg(feature = "std")]
	#[derive(Debug)]
	pub struct ByteClassSet(Vec<bool>);

	#[cfg(feature = "std")]
	impl ByteClassSet {
	/// Create a new set of byte classes where all bytes are part of the same
	/// equivalence class.
	pub fn new() -> Self {
	ByteClassSet(vec![false; 256])
	}

	/// Indicate the the range of byte given (inclusive) can discriminate a
	/// match between it and all other bytes outside of the range.
	pub fn set_range(&mut self, start: u8, end: u8) {
	debug_assert!(start <= end);
	if start > 0 {
	self.0[start as usize - 1] = true;
	}
	self.0[end as usize] = true;
	}

	/// Convert this boolean set to a map that maps all byte values to their
	/// corresponding equivalence class. The last mapping indicates the largest
	/// equivalence class identifier (which is never bigger than 255).
	pub fn byte_classes(&self) -> ByteClasses {
	let mut classes = ByteClasses::empty();
	let mut class = 0u8;
	let mut i = 0;
	loop {
	classes.set(i as u8, class as u8);
	if i >= 255 {
	break;
	}
	if self.0[i] {
	class = class.checked_add(1).unwrap();
	}
	i += 1;
	}
	classes
	}
	}

	#[cfg(test)]
	mod tests {
	#[cfg(feature = "std")]
	#[test]
	fn byte_classes() {
	use super::ByteClassSet;

	let mut set = ByteClassSet::new();
	set.set_range(b'a', b'z');

	let classes = set.byte_classes();
	assert_eq!(classes.get(0), 0);
	assert_eq!(classes.get(1), 0);
	assert_eq!(classes.get(2), 0);
	assert_eq!(classes.get(b'a' - 1), 0);
	assert_eq!(classes.get(b'a'), 1);
	assert_eq!(classes.get(b'm'), 1);
	assert_eq!(classes.get(b'z'), 1);
	assert_eq!(classes.get(b'z' + 1), 2);
	assert_eq!(classes.get(254), 2);
	assert_eq!(classes.get(255), 2);

	let mut set = ByteClassSet::new();
	set.set_range(0, 2);
	set.set_range(4, 6);
	let classes = set.byte_classes();
	assert_eq!(classes.get(0), 0);
	assert_eq!(classes.get(1), 0);
	assert_eq!(classes.get(2), 0);
	assert_eq!(classes.get(3), 1);
	assert_eq!(classes.get(4), 2);
	assert_eq!(classes.get(5), 2);
	assert_eq!(classes.get(6), 2);
	assert_eq!(classes.get(7), 3);
	assert_eq!(classes.get(255), 3);
	}

	#[cfg(feature = "std")]
	#[test]
	fn full_byte_classes() {
	use super::ByteClassSet;

	let mut set = ByteClassSet::new();
	for i in 0..256u16 {
	set.set_range(i as u8, i as u8);
	}
	assert_eq!(set.byte_classes().alphabet_len(), 256);
	}
	}