| /*! |
| Utilities for dealing with UTF-8. |
| |
| This module provides some UTF-8 related helper routines, including an |
| incremental decoder. |
| */ |
| |
/// Returns true if and only if the given byte is considered a word character.
/// This only applies to ASCII.
///
/// This was copied from regex-syntax so that we can use it to determine the
/// starting DFA state while searching without depending on regex-syntax. The
/// definition is never going to change, so there's no maintenance/bit-rot
/// hazard here.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_word_byte(b: u8) -> bool {
    // Build the full 256-entry membership table at compile time so the hot
    // path below is a single branch-free array load.
    const fn word_table() -> [bool; 256] {
        let mut table = [false; 256];
        // FIXME: Use as_usize() once const functions in traits are stable.
        // Iterators aren't usable in const fns either, so walk every byte
        // value with a plain while loop.
        let mut i = 0;
        while i < 256 {
            let b = i as u8;
            table[i] = b == b'_'
                || (b'0' <= b && b <= b'9')
                || (b'A' <= b && b <= b'Z')
                || (b'a' <= b && b <= b'z');
            i += 1;
        }
        table
    }
    const WORD: [bool; 256] = word_table();
    WORD[b as usize]
}
| |
| /// Decodes the next UTF-8 encoded codepoint from the given byte slice. |
| /// |
| /// If no valid encoding of a codepoint exists at the beginning of the given |
| /// byte slice, then the first byte is returned instead. |
| /// |
| /// This returns `None` if and only if `bytes` is empty. |
| /// |
| /// This never panics. |
| /// |
| /// *WARNING*: This is not designed for performance. If you're looking for a |
| /// fast UTF-8 decoder, this is not it. If you feel like you need one in this |
| /// crate, then please file an issue and discuss your use case. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> { |
| if bytes.is_empty() { |
| return None; |
| } |
| let len = match len(bytes[0]) { |
| None => return Some(Err(bytes[0])), |
| Some(len) if len > bytes.len() => return Some(Err(bytes[0])), |
| Some(1) => return Some(Ok(char::from(bytes[0]))), |
| Some(len) => len, |
| }; |
| match core::str::from_utf8(&bytes[..len]) { |
| Ok(s) => Some(Ok(s.chars().next().unwrap())), |
| Err(_) => Some(Err(bytes[0])), |
| } |
| } |
| |
| /// Decodes the last UTF-8 encoded codepoint from the given byte slice. |
| /// |
| /// If no valid encoding of a codepoint exists at the end of the given byte |
| /// slice, then the last byte is returned instead. |
| /// |
| /// This returns `None` if and only if `bytes` is empty. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> { |
| if bytes.is_empty() { |
| return None; |
| } |
| let mut start = bytes.len() - 1; |
| let limit = bytes.len().saturating_sub(4); |
| while start > limit && !is_leading_or_invalid_byte(bytes[start]) { |
| start -= 1; |
| } |
| match decode(&bytes[start..]) { |
| None => None, |
| Some(Ok(ch)) => Some(Ok(ch)), |
| Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), |
| } |
| } |
| |
/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn len(byte: u8) -> Option<usize> {
    // A single exhaustive range match replaces the original mix of
    // `return`-statements and tail expressions (clippy: needless_return).
    // The ranges follow the UTF-8 leading-byte encoding directly.
    match byte {
        // 0xxxxxxx: ASCII, a one-byte sequence.
        0b0000_0000..=0b0111_1111 => Some(1),
        // 10xxxxxx: a continuation byte can never start a sequence.
        0b1000_0000..=0b1011_1111 => None,
        // 110xxxxx: leading byte of a two-byte sequence.
        0b1100_0000..=0b1101_1111 => Some(2),
        // 1110xxxx: leading byte of a three-byte sequence.
        0b1110_0000..=0b1110_1111 => Some(3),
        // 11110xxx: leading byte of a four-byte sequence.
        0b1111_0000..=0b1111_0111 => Some(4),
        // 0xF8-0xFF can never appear anywhere in valid UTF-8.
        _ => None,
    }
}
| |
/// Returns true if and only if the given offset in the given bytes falls on a
/// valid UTF-8 encoded codepoint boundary.
///
/// If `bytes` is not valid UTF-8, then the behavior of this routine is
/// unspecified.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
    // When `i` is out of bounds, only the one-past-the-end position counts
    // as a boundary (the empty string); anything beyond that does not.
    //
    // When `i` is in bounds, the only non-boundary bytes in valid UTF-8 are
    // continuation bytes, whose two most significant bits are `10`. ASCII
    // bytes have a clear top bit and leading bytes of multi-byte sequences
    // have both top bits set, so either of those begins a codepoint.
    bytes
        .get(i)
        .map_or(i == bytes.len(), |&b| (b & 0b1100_0000) != 0b1000_0000)
}
| |
/// Returns true if and only if the given byte is either a valid leading UTF-8
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
/// valid UTF-8 sequence.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_leading_or_invalid_byte(b: u8) -> bool {
    // The only bytes rejected here are continuation bytes, `10xxxxxx`
    // (\x80-\xBF). Everything else qualifies:
    //
    // * ASCII (\x00-\x7F) has a clear most significant bit.
    // * Leading bytes of 2/3/4-byte sequences have their two most
    //   significant bits set.
    // * Every byte that can never appear anywhere in valid UTF-8 (\xC0,
    //   \xC1 and \xF5-\xFF) also has its two most significant bits set.
    !matches!(b, 0b1000_0000..=0b1011_1111)
}
| |
| /* |
| /// Returns the smallest possible index of the next valid UTF-8 sequence |
| /// starting after `i`. |
| /// |
| /// For all inputs, including invalid UTF-8 and any value of `i`, the return |
| /// value is guaranteed to be greater than `i`. (If there is no value greater |
| /// than `i` that fits in `usize`, then this panics.) |
| /// |
| /// Generally speaking, this should only be called on `text` when it is |
| /// permitted to assume that it is valid UTF-8 and where either `i >= |
| /// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. |
| /// |
| /// NOTE: This method was used in a previous conception of iterators where we |
| /// specifically tried to skip over empty matches that split a codepoint by |
| /// simply requiring that our next search begin at the beginning of codepoint. |
| /// But we ended up changing that technique to always advance by 1 byte and |
| /// then filter out matches that split a codepoint after-the-fact. Thus, we no |
| /// longer use this method. But I've kept it around in case we want to switch |
| /// back to this approach. Its guarantees are a little subtle, so I'd prefer |
| /// not to rebuild it from whole cloth. |
| pub(crate) fn next(text: &[u8], i: usize) -> usize { |
| let b = match text.get(i) { |
| None => return i.checked_add(1).unwrap(), |
| Some(&b) => b, |
| }; |
| // For cases where we see an invalid UTF-8 byte, there isn't much we can do |
| // other than just start at the next byte. |
| let inc = len(b).unwrap_or(1); |
| i.checked_add(inc).unwrap() |
| } |
| */ |