vendor/regex-automata-0.4.3/src/util/utf8.rs - toolchain/rustc - Git at Google

 /*!
 Utilities for dealing with UTF-8.

 This module provides some UTF-8 related helper routines, including an
 incremental decoder.
 */

 /// Reports whether `b` is an ASCII word byte, i.e. one of `_`, `0-9`, `A-Z`
 /// or `a-z`. Every other byte, including every non-ASCII byte, yields
 /// `false`.
 ///
 /// The answer is read out of a 256-entry table that is computed at compile
 /// time, so the check itself is just an array lookup.
 ///
 /// This was copied from regex-syntax so that the starting DFA state can be
 /// determined while searching without depending on regex-syntax. The
 /// definition is never going to change, so there's no maintenance/bit-rot
 /// hazard here.
 #[cfg_attr(feature = "perf-inline", inline(always))]
 pub(crate) fn is_word_byte(b: u8) -> bool {
     /// Builds the membership table, indexed by byte value.
     const fn build_table() -> [bool; 256] {
         let mut table = [false; 256];
         let mut idx: usize = 0;
         // `for` loops are not available in `const fn`, hence the manual loop.
         while idx < 256 {
             // `idx` is always below 256 here, so narrowing to `u8` is lossless.
             let byte = idx as u8;
             table[idx] = byte == b'_' || byte.is_ascii_alphanumeric();
             idx += 1;
         }
         table
     }
     const IS_WORD: [bool; 256] = build_table();
     IS_WORD[usize::from(b)]
 }

 /// Decodes one UTF-8 encoded codepoint from the front of `bytes`.
 ///
 /// * `None` is returned if and only if `bytes` is empty.
 /// * `Some(Ok(ch))` is returned when `bytes` begins with a valid encoding
 ///   of `ch`.
 /// * `Some(Err(b))` is returned in every other case, where `b` is the first
 ///   byte of `bytes`.
 ///
 /// This never panics.
 ///
 /// *WARNING*: This is not designed for performance. If you're looking for a
 /// fast UTF-8 decoder, this is not it. If you feel like you need one in this
 /// crate, then please file an issue and discuss your use case.
 #[cfg_attr(feature = "perf-inline", inline(always))]
 pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
     let first = *bytes.first()?;
     let width = match len(first) {
         // ASCII needs no further validation.
         Some(1) => return Some(Ok(char::from(first))),
         // Only continue when the whole claimed sequence is actually present.
         Some(width) if width <= bytes.len() => width,
         // Either not a leading byte, or the input ends mid-sequence.
         _ => return Some(Err(first)),
     };
     // `len` only looks at the first byte, so the candidate sequence still has
     // to be validated in full.
     let decoded = core::str::from_utf8(&bytes[..width])
         .ok()
         .and_then(|s| s.chars().next())
         .ok_or(first);
     Some(decoded)
 }

 /// Decodes the last UTF-8 encoded codepoint from the given byte slice.
 ///
 /// If no valid encoding of a codepoint exists at the end of the given byte
 /// slice, then the last byte is returned instead. In particular, a codepoint
 /// is only reported when its encoding extends all the way to the final byte
 /// of `bytes`; a valid codepoint followed by stray continuation bytes (for
 /// example `b"a\x80"`) yields `Err` with the final byte.
 ///
 /// This returns `None` if and only if `bytes` is empty.
 ///
 /// This never panics.
 #[cfg_attr(feature = "perf-inline", inline(always))]
 pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
     let last = *bytes.last()?;
     // Walk backwards over continuation bytes to find where the final
     // codepoint could begin. An encoding is at most 4 bytes long, so never
     // look further back than that.
     let mut start = bytes.len() - 1;
     let limit = bytes.len().saturating_sub(4);
     while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
         start -= 1;
     }
     match decode(&bytes[start..])? {
         // `decode` only reports the codepoint at the *front* of its input.
         // Accept it only if it also ends exactly at the end of `bytes`;
         // otherwise there are trailing bytes that belong to no codepoint.
         Ok(ch) if start + ch.len_utf8() == bytes.len() => Some(Ok(ch)),
         _ => Some(Err(last)),
     }
 }

 /// Maps the first byte of a UTF-8 sequence to the total number of code units
 /// (bytes) that the sequence claims to occupy.
 ///
 /// `None` is returned for continuation bytes (`\x80` through `\xBF`) and for
 /// `\xF8` through `\xFF`.
 ///
 /// Note that the classification is based purely on the bit prefix of the
 /// byte. Bytes such as `\xC0`, `\xC1` and `\xF5` through `\xF7`, which never
 /// occur in valid UTF-8, are still assigned a length here, so callers must
 /// validate the complete sequence themselves.
 #[cfg_attr(feature = "perf-inline", inline(always))]
 fn len(byte: u8) -> Option<usize> {
     match byte {
         // 0xxxxxxx: ASCII.
         0x00..=0x7F => Some(1),
         // 10xxxxxx: continuation byte, never first in a sequence.
         0x80..=0xBF => None,
         // 110xxxxx
         0xC0..=0xDF => Some(2),
         // 1110xxxx
         0xE0..=0xEF => Some(3),
         // 11110xxx
         0xF0..=0xF7 => Some(4),
         // 11111xxx: not part of UTF-8 at all.
         0xF8..=0xFF => None,
     }
 }

 /// Reports whether offset `i` in `bytes` lies on a UTF-8 codepoint boundary.
 ///
 /// The offset `bytes.len()` counts as a boundary; any larger offset does
 /// not.
 ///
 /// If `bytes` is not valid UTF-8, then the behavior of this routine is
 /// unspecified.
 #[cfg_attr(feature = "perf-inline", inline(always))]
 pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
     // When `i` is out of range, only the position immediately past the final
     // byte (the empty suffix) is treated as a boundary.
     let at_end = i == bytes.len();
     // Within valid UTF-8, a codepoint begins at every byte that is not a
     // continuation byte. Continuation bytes are exactly those of the form
     // 10xxxxxx; ASCII starts with a 0 bit and multi-byte leading bytes start
     // with 11.
     bytes.get(i).map_or(at_end, |&byte| (byte & 0b1100_0000) != 0b1000_0000)
 }

 /// Reports whether `b` either starts a UTF-8 sequence or is a byte that can
 /// never occur anywhere in valid UTF-8. Put differently, this is `false`
 /// exactly when `b` is a continuation byte.
 #[cfg_attr(feature = "perf-inline", inline(always))]
 fn is_leading_or_invalid_byte(b: u8) -> bool {
     // ASCII bytes have a clear most significant bit, and the leading byte of
     // a 2/3/4-byte sequence has its top two bits set. The bytes that never
     // appear in valid UTF-8 (\xC0, \xC1 and \xF5 through \xFF) also all have
     // their top two bits set. That leaves only the continuation bytes, which
     // have the form 10xxxxxx (\x80 through \xBF), to be rejected.
     !matches!(b, 0x80..=0xBF)
 }

 /*
 /// Returns the smallest possible index of the next valid UTF-8 sequence
 /// starting after `i`.
 ///
 /// For all inputs, including invalid UTF-8 and any value of `i`, the return
 /// value is guaranteed to be greater than `i`. (If there is no value greater
 /// than `i` that fits in `usize`, then this panics.)
 ///
 /// Generally speaking, this should only be called on `text` when it is
 /// permitted to assume that it is valid UTF-8 and where either `i >=
 /// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
 ///
 /// NOTE: This method was used in a previous conception of iterators where we
 /// specifically tried to skip over empty matches that split a codepoint by
 /// simply requiring that our next search begin at the beginning of codepoint.
 /// But we ended up changing that technique to always advance by 1 byte and
 /// then filter out matches that split a codepoint after-the-fact. Thus, we no
 /// longer use this method. But I've kept it around in case we want to switch
 /// back to this approach. Its guarantees are a little subtle, so I'd prefer
 /// not to rebuild it from whole cloth.
 pub(crate) fn next(text: &[u8], i: usize) -> usize {
     let b = match text.get(i) {
         None => return i.checked_add(1).unwrap(),
         Some(&b) => b,
     };
     // For cases where we see an invalid UTF-8 byte, there isn't much we can do
     // other than just start at the next byte.
     let inc = len(b).unwrap_or(1);
     i.checked_add(inc).unwrap()
 }
 */
	/*!
	Utilities for dealing with UTF-8.

	This module provides some UTF-8 related helper routines, including an
	incremental decoder.
	*/

	/// Reports whether `b` is an ASCII word byte, i.e. one of `_`, `0-9`, `A-Z`
	/// or `a-z`. Every other byte, including every non-ASCII byte, yields
	/// `false`.
	///
	/// The result comes from a 256-entry lookup table that is evaluated at
	/// compile time.
	///
	/// This was copied from regex-syntax so that the starting DFA state can be
	/// determined while searching without depending on regex-syntax. The
	/// definition is never going to change, so there's no maintenance/bit-rot
	/// hazard here.
	#[cfg_attr(feature = "perf-inline", inline(always))]
	pub(crate) fn is_word_byte(b: u8) -> bool {
	    /// Computes, for every possible byte value, whether it is a word byte.
	    const fn word_table() -> [bool; 256] {
	        let mut out = [false; 256];
	        let mut i: usize = 0;
	        // `for` loops are not available in `const fn`, hence the manual loop.
	        while i < 256 {
	            // `i` stays below 256, so the cast to `u8` never truncates.
	            out[i] = matches!(
	                i as u8,
	                b'_' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z'
	            );
	            i += 1;
	        }
	        out
	    }
	    const TABLE: [bool; 256] = word_table();
	    TABLE[usize::from(b)]
	}

	/// Decodes one UTF-8 encoded codepoint from the front of `bytes`.
	///
	/// When `bytes` does not begin with a valid encoding of a codepoint, the
	/// first byte is handed back as the error value instead.
	///
	/// This returns `None` if and only if `bytes` is empty.
	///
	/// This never panics.
	///
	/// WARNING: This is not designed for performance. If you're looking for a
	/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
	/// crate, then please file an issue and discuss your use case.
	#[cfg_attr(feature = "perf-inline", inline(always))]
	pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
	    let lead = *bytes.first()?;
	    // Every failure mode reports the first byte.
	    let invalid = Some(Err(lead));
	    let n = match len(lead) {
	        // Not a leading byte.
	        None => return invalid,
	        // The input ends before the claimed sequence does.
	        Some(n) if n > bytes.len() => return invalid,
	        // ASCII needs no further validation.
	        Some(1) => return Some(Ok(char::from(lead))),
	        Some(n) => n,
	    };
	    // `len` only looks at the first byte, so the candidate sequence still
	    // has to be validated in full.
	    match core::str::from_utf8(&bytes[..n]) {
	        Ok(s) => s.chars().next().map(Ok),
	        Err(_) => invalid,
	    }
	}

	/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
	///
	/// If no valid encoding of a codepoint exists at the end of the given byte
	/// slice, then the last byte is returned instead. In particular, a codepoint
	/// is only reported when its encoding extends all the way to the final byte
	/// of `bytes`; a valid codepoint followed by stray continuation bytes (for
	/// example `b"a\x80"`) yields `Err` with the final byte.
	///
	/// This returns `None` if and only if `bytes` is empty.
	///
	/// This never panics.
	#[cfg_attr(feature = "perf-inline", inline(always))]
	pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
	    let last = *bytes.last()?;
	    // Walk backwards over continuation bytes to find where the final
	    // codepoint could begin. An encoding is at most 4 bytes long, so never
	    // look further back than that.
	    let mut start = bytes.len() - 1;
	    let limit = bytes.len().saturating_sub(4);
	    while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
	        start -= 1;
	    }
	    match decode(&bytes[start..])? {
	        // `decode` only reports the codepoint at the *front* of its input.
	        // Accept it only if it also ends exactly at the end of `bytes`;
	        // otherwise there are trailing bytes that belong to no codepoint.
	        Ok(ch) if start + ch.len_utf8() == bytes.len() => Some(Ok(ch)),
	        _ => Some(Err(last)),
	    }
	}

	/// Maps the first byte of a UTF-8 sequence to the total number of code units
	/// (bytes) that the sequence claims to occupy.
	///
	/// `None` is returned for continuation bytes (`\x80` through `\xBF`) and for
	/// `\xF8` through `\xFF`.
	///
	/// Note that the classification is based purely on the bit prefix of the
	/// byte. Bytes such as `\xC0`, `\xC1` and `\xF5` through `\xF7`, which never
	/// occur in valid UTF-8, are still assigned a length here, so callers must
	/// validate the complete sequence themselves.
	#[cfg_attr(feature = "perf-inline", inline(always))]
	fn len(byte: u8) -> Option<usize> {
	    // The number of leading 1 bits determines the class of the byte.
	    match byte.leading_ones() {
	        // 0xxxxxxx: ASCII.
	        0 => Some(1),
	        // 110xxxxx
	        2 => Some(2),
	        // 1110xxxx
	        3 => Some(3),
	        // 11110xxx
	        4 => Some(4),
	        // 10xxxxxx is a continuation byte, and five or more leading ones
	        // never occur in UTF-8.
	        _ => None,
	    }
	}

	/// Returns true if and only if the given offset in the given bytes falls on a
	/// valid UTF-8 encoded codepoint boundary.
	///
	/// The offset `bytes.len()` counts as a boundary; any larger offset does
	/// not.
	///
	/// If `bytes` is not valid UTF-8, then the behavior of this routine is
	/// unspecified.
	#[cfg_attr(feature = "perf-inline", inline(always))]
	pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
	    match bytes.get(i) {
	        // The position at the end of the bytes always represents an empty
	        // string, which is a valid boundary. But anything after that
	        // doesn't make much sense to call a valid boundary.
	        None => i == bytes.len(),
	        // Other than ASCII (where the most significant bit is never set),
	        // valid starting bytes always have their most significant two bits
	        // set, whereas continuation bytes never have their second most
	        // significant bit set. Therefore, this only returns true when
	        // bytes[i] corresponds to a byte that begins a valid UTF-8 encoding
	        // of a Unicode scalar value.
	        Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000,
	    }
	}

	/// Reports whether `b` either starts a UTF-8 sequence or is a byte that can
	/// never occur anywhere in valid UTF-8. Put differently, this is `false`
	/// exactly when `b` is a continuation byte.
	#[cfg_attr(feature = "perf-inline", inline(always))]
	fn is_leading_or_invalid_byte(b: u8) -> bool {
	    // ASCII bytes have a clear most significant bit, and the leading byte of
	    // a 2/3/4-byte sequence has its top two bits set. The bytes that never
	    // appear in valid UTF-8 (\xC0, \xC1 and \xF5 through \xFF) also all have
	    // their top two bits set. So the only bytes to reject are those whose
	    // top two bits are exactly `10`, i.e. the continuation bytes.
	    let top_two_bits = b >> 6;
	    top_two_bits != 0b10
	}

	/*
	/// Returns the smallest possible index of the next valid UTF-8 sequence
	/// starting after `i`.
	///
	/// For all inputs, including invalid UTF-8 and any value of `i`, the return
	/// value is guaranteed to be greater than `i`. (If there is no value greater
	/// than `i` that fits in `usize`, then this panics.)
	///
	/// Generally speaking, this should only be called on `text` when it is
	/// permitted to assume that it is valid UTF-8 and where either `i >=
	/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
	///
	/// NOTE: This method was used in a previous conception of iterators where we
	/// specifically tried to skip over empty matches that split a codepoint by
	/// simply requiring that our next search begin at the beginning of codepoint.
	/// But we ended up changing that technique to always advance by 1 byte and
	/// then filter out matches that split a codepoint after-the-fact. Thus, we no
	/// longer use this method. But I've kept it around in case we want to switch
	/// back to this approach. Its guarantees are a little subtle, so I'd prefer
	/// not to rebuild it from whole cloth.
	pub(crate) fn next(text: &[u8], i: usize) -> usize {
	let b = match text.get(i) {
	None => return i.checked_add(1).unwrap(),
	Some(&b) => b,
	};
	// For cases where we see an invalid UTF-8 byte, there isn't much we can do
	// other than just start at the next byte.
	let inc = len(b).unwrap_or(1);
	i.checked_add(inc).unwrap()
	}
	*/