| use std::fmt; |
| use std::io::{self, Read}; |
| use std::str::{self, FromStr}; |
| |
| #[derive(Debug)] |
| pub enum CharReadError { |
| UnexpectedEof, |
| Utf8(str::Utf8Error), |
| Io(io::Error), |
| } |
| |
| impl From<str::Utf8Error> for CharReadError { |
| #[cold] |
| fn from(e: str::Utf8Error) -> CharReadError { |
| CharReadError::Utf8(e) |
| } |
| } |
| |
| impl From<io::Error> for CharReadError { |
| #[cold] |
| fn from(e: io::Error) -> CharReadError { |
| CharReadError::Io(e) |
| } |
| } |
| |
| impl fmt::Display for CharReadError { |
| #[cold] |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| use self::CharReadError::{Io, UnexpectedEof, Utf8}; |
| match *self { |
| UnexpectedEof => write!(f, "unexpected end of stream"), |
| Utf8(ref e) => write!(f, "UTF-8 decoding error: {e}"), |
| Io(ref e) => write!(f, "I/O error: {e}"), |
| } |
| } |
| } |
| |
| /// Character encoding used for parsing |
| #[derive(Debug, Copy, Clone, Eq, PartialEq)] |
| #[non_exhaustive] |
| pub enum Encoding { |
| /// Explicitly UTF-8 only |
| Utf8, |
| /// UTF-8 fallback, but can be any 8-bit encoding |
| Default, |
| /// ISO-8859-1 |
| Latin1, |
| /// US-ASCII |
| Ascii, |
| /// Big-Endian |
| Utf16Be, |
| /// Little-Endian |
| Utf16Le, |
| /// Unknown endianness yet, will be sniffed |
| Utf16, |
| /// Not determined yet, may be sniffed to be anything |
| Unknown, |
| } |
| |
| // Rustc inlines eq_ignore_ascii_case and creates kilobytes of code! |
| #[inline(never)] |
| fn icmp(lower: &str, varcase: &str) -> bool { |
| lower.bytes().zip(varcase.bytes()).all(|(l, v)| l == v.to_ascii_lowercase()) |
| } |
| |
| impl FromStr for Encoding { |
| type Err = &'static str; |
| |
| fn from_str(val: &str) -> Result<Self, Self::Err> { |
| if ["utf-8", "utf8"].into_iter().any(move |label| icmp(label, val)) { |
| Ok(Encoding::Utf8) |
| } else if ["iso-8859-1", "latin1"].into_iter().any(move |label| icmp(label, val)) { |
| Ok(Encoding::Latin1) |
| } else if ["utf-16", "utf16"].into_iter().any(move |label| icmp(label, val)) { |
| Ok(Encoding::Utf16) |
| } else if ["ascii", "us-ascii"].into_iter().any(move |label| icmp(label, val)) { |
| Ok(Encoding::Ascii) |
| } else { |
| Err("unknown encoding name") |
| } |
| } |
| } |
| |
| impl fmt::Display for Encoding { |
| #[cold] |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| f.write_str(match self { |
| Encoding::Utf8 => "UTF-8", |
| Encoding::Default => "UTF-8", |
| Encoding::Latin1 => "ISO-8859-1", |
| Encoding::Ascii => "US-ASCII", |
| Encoding::Utf16Be => "UTF-16", |
| Encoding::Utf16Le => "UTF-16", |
| Encoding::Utf16 => "UTF-16", |
| Encoding::Unknown => "(unknown)", |
| }) |
| } |
| } |
| |
| pub(crate) struct CharReader { |
| pub encoding: Encoding, |
| } |
| |
| impl CharReader { |
| pub fn new() -> Self { |
| Self { |
| encoding: Encoding::Unknown, |
| } |
| } |
| |
| pub fn next_char_from<R: Read>(&mut self, source: &mut R) -> Result<Option<char>, CharReadError> { |
| let mut bytes = source.bytes(); |
| const MAX_CODEPOINT_LEN: usize = 4; |
| |
| let mut buf = [0u8; MAX_CODEPOINT_LEN]; |
| let mut pos = 0; |
| loop { |
| let next = match bytes.next() { |
| Some(Ok(b)) => b, |
| Some(Err(e)) => return Err(e.into()), |
| None if pos == 0 => return Ok(None), |
| None => return Err(CharReadError::UnexpectedEof), |
| }; |
| |
| match self.encoding { |
| Encoding::Utf8 | Encoding::Default => { |
| // fast path for ASCII subset |
| if pos == 0 && next.is_ascii() { |
| return Ok(Some(next.into())); |
| } |
| |
| buf[pos] = next; |
| pos += 1; |
| |
| match str::from_utf8(&buf[..pos]) { |
| Ok(s) => return Ok(s.chars().next()), // always Some(..) |
| Err(_) if pos < MAX_CODEPOINT_LEN => continue, |
| Err(e) => return Err(e.into()), |
| } |
| }, |
| Encoding::Latin1 => { |
| return Ok(Some(next.into())); |
| }, |
| Encoding::Ascii => { |
| if next.is_ascii() { |
| return Ok(Some(next.into())); |
| } else { |
| return Err(CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, "char is not ASCII"))); |
| } |
| }, |
| Encoding::Unknown | Encoding::Utf16 => { |
| buf[pos] = next; |
| pos += 1; |
| |
| // sniff BOM |
| if pos <= 3 && buf[..pos] == [0xEF, 0xBB, 0xBF][..pos] { |
| if pos == 3 && self.encoding != Encoding::Utf16 { |
| pos = 0; |
| self.encoding = Encoding::Utf8; |
| } |
| } else if pos <= 2 && buf[..pos] == [0xFE, 0xFF][..pos] { |
| if pos == 2 { |
| pos = 0; |
| self.encoding = Encoding::Utf16Be; |
| } |
| } else if pos <= 2 && buf[..pos] == [0xFF, 0xFE][..pos] { |
| if pos == 2 { |
| pos = 0; |
| self.encoding = Encoding::Utf16Le; |
| } |
| } else if pos == 1 && self.encoding == Encoding::Utf16 { |
| // sniff ASCII char in UTF-16 |
| self.encoding = if next == 0 { Encoding::Utf16Be } else { Encoding::Utf16Le }; |
| } else { |
| // UTF-8 is the default, but XML decl can change it to other 8-bit encoding |
| self.encoding = Encoding::Default; |
| if pos == 1 && next.is_ascii() { |
| return Ok(Some(next.into())); |
| } |
| } |
| }, |
| Encoding::Utf16Be => { |
| buf[pos] = next; |
| pos += 1; |
| if pos == 2 { |
| if let Some(Ok(c)) = char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap())]).next() { |
| return Ok(Some(c)); |
| } |
| } else if pos == 4 { // surrogate |
| return char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap()), u16::from_be_bytes(buf[2..4].try_into().unwrap())]) |
| .next().transpose() |
| .map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e))); |
| } |
| }, |
| Encoding::Utf16Le => { |
| buf[pos] = next; |
| pos += 1; |
| if pos == 2 { |
| if let Some(Ok(c)) = char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap())]).next() { |
| return Ok(Some(c)); |
| } |
| } else if pos == 4 { // surrogate |
| return char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap()), u16::from_le_bytes(buf[2..4].try_into().unwrap())]) |
| .next().transpose() |
| .map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e))); |
| } |
| }, |
| } |
| } |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::{CharReadError, CharReader, Encoding}; |
| |
| #[test] |
| fn test_next_char_from() { |
| use std::io; |
| |
| let mut bytes: &[u8] = "correct".as_bytes(); // correct ASCII |
| assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('c')); |
| |
| let mut bytes: &[u8] = b"\xEF\xBB\xBF\xE2\x80\xA2!"; // BOM |
| assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('•')); |
| |
| let mut bytes: &[u8] = b"\xEF\xBB\xBFx123"; // BOM |
| assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('x')); |
| |
| let mut bytes: &[u8] = b"\xEF\xBB\xBF"; // Nothing after BOM |
| assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); |
| |
| let mut bytes: &[u8] = b"\xEF\xBB"; // Nothing after BO |
| assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof))); |
| |
| let mut bytes: &[u8] = b"\xEF\xBB\x42"; // Nothing after BO |
| assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(_))); |
| |
| let mut bytes: &[u8] = b"\xFE\xFF\x00\x42"; // UTF-16 |
| assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B')); |
| |
| let mut bytes: &[u8] = b"\xFF\xFE\x42\x00"; // UTF-16 |
| assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B')); |
| |
| let mut bytes: &[u8] = b"\xFF\xFE"; // UTF-16 |
| assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); |
| |
| let mut bytes: &[u8] = b"\xFF\xFE\x00"; // UTF-16 |
| assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof))); |
| |
| let mut bytes: &[u8] = "правильно".as_bytes(); // correct BMP |
| assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('п')); |
| |
| let mut bytes: &[u8] = "правильно".as_bytes(); |
| assert_eq!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes).unwrap(), Some('킿')); |
| |
| let mut bytes: &[u8] = "правильно".as_bytes(); |
| assert_eq!(CharReader { encoding: Encoding::Utf16Le }.next_char_from(&mut bytes).unwrap(), Some('뿐')); |
| |
| let mut bytes: &[u8] = b"\xD8\xD8\x80"; |
| assert!(matches!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes), Err(_))); |
| |
| let mut bytes: &[u8] = b"\x00\x42"; |
| assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B')); |
| |
| let mut bytes: &[u8] = b"\x42\x00"; |
| assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B')); |
| |
| let mut bytes: &[u8] = b"\x00"; |
| assert!(matches!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes), Err(_))); |
| |
| let mut bytes: &[u8] = "😊".as_bytes(); // correct non-BMP |
| assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('😊')); |
| |
| let mut bytes: &[u8] = b""; // empty |
| assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); |
| |
| let mut bytes: &[u8] = b"\xf0\x9f\x98"; // incomplete code point |
| match CharReader::new().next_char_from(&mut bytes).unwrap_err() { |
| super::CharReadError::UnexpectedEof => {}, |
| e => panic!("Unexpected result: {e:?}") |
| }; |
| |
| let mut bytes: &[u8] = b"\xff\x9f\x98\x32"; // invalid code point |
| match CharReader::new().next_char_from(&mut bytes).unwrap_err() { |
| super::CharReadError::Utf8(_) => {}, |
| e => panic!("Unexpected result: {e:?}") |
| }; |
| |
| // error during read |
| struct ErrorReader; |
| impl io::Read for ErrorReader { |
| fn read(&mut self, _: &mut [u8]) -> io::Result<usize> { |
| Err(io::Error::new(io::ErrorKind::Other, "test error")) |
| } |
| } |
| |
| let mut r = ErrorReader; |
| match CharReader::new().next_char_from(&mut r).unwrap_err() { |
| super::CharReadError::Io(ref e) if e.kind() == io::ErrorKind::Other && |
| e.to_string().contains("test error") => {}, |
| e => panic!("Unexpected result: {e:?}") |
| } |
| } |
| } |