blob: 9800d781fc39256cd80f7365ef58e09d51499308 [file] [log] [blame]
use std::iter::FusedIterator;
use std::iter::Peekable;
use std::mem;
use crate::util::is_continuation;
use crate::util::BYTE_SHIFT;
use crate::util::CONT_MASK;
use super::EncodingError;
use super::Result;
/// Iterator adapter that decodes a stream of bytes into code points.
///
/// Each item is either a decoded code point or an [`EncodingError`]
/// describing why the bytes at the current position could not be decoded.
/// Surrogate code points are accepted by the decoder, and their occurrence is
/// tracked in the fields below.
pub(in super::super) struct CodePoints<I>
where
I: Iterator<Item = u8>,
{
/// Byte source. It is peekable so that a byte that is not a continuation
/// byte can be reported without being consumed.
iter: Peekable<I>,
/// `true` while the most recently started sequence was a surrogate whose
/// `0x10` bit was clear after its first continuation byte (a high
/// surrogate). Cleared at the start of every sequence and whenever a
/// non-continuation byte interrupts one.
surrogate: bool,
/// Starts as `true` and becomes `false` permanently once any surrogate
/// sequence has been encountered.
still_utf8: bool,
}
impl<I> CodePoints<I>
where
    I: Iterator<Item = u8>,
{
    /// Creates a decoder over the bytes yielded by `string`.
    ///
    /// The decoder starts with no pending surrogate and with the input still
    /// considered UTF-8.
    pub(in super::super) fn new<S>(string: S) -> Self
    where
        S: IntoIterator<IntoIter = I>,
    {
        let iter = string.into_iter().peekable();
        Self {
            iter,
            surrogate: false,
            still_utf8: true,
        }
    }

    /// Returns `false` once a surrogate sequence has been encountered, and
    /// `true` otherwise.
    pub(super) fn is_still_utf8(&self) -> bool {
        self.still_utf8
    }

    /// Appends the payload bits of the next byte to `code_point`.
    ///
    /// The next byte is only consumed when it is a continuation byte.
    ///
    /// # Errors
    ///
    /// - `EncodingError::End()` when the input is exhausted.
    /// - `EncodingError::Byte(byte)` when the next byte is not a continuation
    ///   byte; the byte stays in the input and the pending-surrogate flag is
    ///   cleared.
    fn consume_next(&mut self, code_point: &mut u32) -> Result<()> {
        let byte = match self.iter.peek() {
            Some(&peeked) => peeked,
            None => return Err(EncodingError::End()),
        };

        if is_continuation(byte) {
            let payload = u32::from(byte & CONT_MASK);
            *code_point = (*code_point << BYTE_SHIFT) | payload;

            let removed = self.iter.next();
            debug_assert_eq!(Some(byte), removed);
            return Ok(());
        }

        self.surrogate = false;
        // Not consuming this byte will be useful if this crate ever offers
        // a way to encode lossily.
        Err(EncodingError::Byte(byte))
    }

    /// Returns the size hint of the underlying byte iterator. The bounds
    /// count remaining bytes, not remaining code points.
    pub(super) fn inner_size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
/// Once the underlying bytes are exhausted, decoding yields `None`, so the
/// decoder is fused whenever the byte iterator is.
impl<I> FusedIterator for CodePoints<I>
where
    I: Iterator<Item = u8> + FusedIterator,
{
}
impl<I> Iterator for CodePoints<I>
where
    I: Iterator<Item = u8>,
{
    type Item = Result<u32>;

    /// Decodes the next code point, or returns `None` when no bytes remain.
    ///
    /// A byte that interrupts a sequence is left in the input, so it is
    /// decoded as the start of the following item.
    fn next(&mut self) -> Option<Self::Item> {
        let lead = self.iter.next()?;
        Some(self.decode_sequence(lead))
    }
}

impl<I> CodePoints<I>
where
    I: Iterator<Item = u8>,
{
    /// Decodes the sequence introduced by `lead`, which has already been
    /// taken from the input.
    ///
    /// # Errors
    ///
    /// - `EncodingError::Byte` when `lead` cannot start a sequence, or when a
    ///   byte that should continue the sequence does not.
    /// - `EncodingError::End()` when the input ends inside the sequence.
    /// - `EncodingError::CodePoint` when the sequence is complete but the
    ///   value is rejected (out of range, overlong three-byte form, or a low
    ///   surrogate directly after a high surrogate).
    fn decode_sequence(&mut self, lead: u8) -> Result<u32> {
        let mut code_point = u32::from(lead);

        // Every sequence starts with the flag cleared; only remember whether
        // the previous item was a high surrogate.
        let after_high_surrogate = mem::replace(&mut self.surrogate, false);

        if lead.is_ascii() {
            return Ok(code_point);
        }
        // Non-ASCII bytes below 0xC2 are never accepted as a lead byte.
        if lead < 0xC2 {
            return Err(EncodingError::Byte(lead));
        }

        let mut rejected = false;
        if lead >= 0xE0 {
            code_point &= 0x0F;
            self.consume_next(&mut code_point)?;

            if lead >= 0xF0 {
                // After one continuation byte, a four-byte sequence must hold
                // a value in 0x10..=0x10F to end up within
                // U+10000..=U+10FFFF.
                rejected = code_point.wrapping_sub(0x10) >= 0x100;
                self.consume_next(&mut code_point)?;
            // This condition is optimized to detect surrogate code points.
            } else if code_point & 0xFE0 == 0x360 {
                self.still_utf8 = false;
                if code_point & 0x10 == 0 {
                    self.surrogate = true;
                } else {
                    // Decoding a broken surrogate pair would be lossy.
                    rejected = after_high_surrogate;
                }
            }

            // For three-byte sequences, this rejects values that would end up
            // below U+0800. A four-byte sequence can only get here with such
            // a small value if it was already rejected above.
            rejected |= code_point < 0x20;
        } else {
            code_point &= 0x1F;
        }
        self.consume_next(&mut code_point)?;

        if rejected {
            Err(EncodingError::CodePoint(code_point))
        } else {
            Ok(code_point)
        }
    }
}