// blob: e6f0fb1fcd7bda59adbf324f5cff350654da1f2b [file] [log] [blame]
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use handles::*;
use variant::*;
/// State for decoding UTF-16 whose input may be split at arbitrary byte
/// boundaries across calls, so a partial code unit or an unpaired lead
/// surrogate has to be remembered between calls.
pub struct Utf16Decoder {
// Doubles as storage for a deferred BMP code unit when `pending_bmp` is set.
lead_surrogate: u16, // If non-zero and pending_bmp == false, a pending lead surrogate
// First byte of a two-byte code unit whose second byte has not been seen yet.
lead_byte: Option<u8>,
// Byte order: `true` combines bytes big-endian, `false` little-endian.
be: bool,
pending_bmp: bool, // if true, lead_surrogate is actually pending BMP
}
impl Utf16Decoder {
/// Creates a UTF-16 decoder in its initial state (nothing pending) and
/// returns it wrapped in `VariantDecoder::Utf16`.
///
/// `big_endian` selects the byte order: `true` for big-endian input,
/// `false` for little-endian input.
pub fn new(big_endian: bool) -> VariantDecoder {
    let decoder = Utf16Decoder {
        be: big_endian,
        lead_byte: None,
        lead_surrogate: 0,
        pending_bmp: false,
    };
    VariantDecoder::Utf16(decoder)
}
/// Returns the number of extra input bytes the buffer-length estimates
/// add on top of the caller-supplied byte length.
///
/// The result is always at least 1, plus 1 when a lead byte is pending,
/// plus 2 when `lead_surrogate` is non-zero.
pub fn additional_from_state(&self) -> usize {
    let from_lead_byte = match self.lead_byte {
        Some(_) => 1,
        None => 0,
    };
    let from_lead_surrogate = if self.lead_surrogate != 0 { 2 } else { 0 };
    1 + from_lead_byte + from_lead_surrogate
}
/// Computes an output buffer length, in UTF-16 code units, for decoding
/// `byte_length` more input bytes given the current decoder state.
///
/// The value is `(byte_length + additional_from_state()) / 2 + 1`,
/// computed with checked arithmetic; `None` is produced when an
/// intermediate step yields `None`.
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
    let total_bytes = byte_length.checked_add(self.additional_from_state());
    let code_units = checked_div(total_bytes, 2);
    checked_add(1, code_units)
}
/// Computes an output buffer length, in UTF-8 bytes, for decoding
/// `byte_length` more input bytes without replacement.
///
/// The value is `((byte_length + additional_from_state()) / 2) * 3 + 1`,
/// computed with checked arithmetic; `None` is produced when an
/// intermediate step yields `None`.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
    let total_bytes = byte_length.checked_add(self.additional_from_state());
    let code_units = checked_div(total_bytes, 2);
    // Three output bytes are budgeted per code unit.
    let utf8_bytes = checked_mul(3, code_units);
    checked_add(1, utf8_bytes)
}
/// Computes an output buffer length, in UTF-8 bytes, for decoding
/// `byte_length` more input bytes.
///
/// For this decoder the formula is the same as the one used by
/// `max_utf8_buffer_length_without_replacement`:
/// `((byte_length + additional_from_state()) / 2) * 3 + 1`, computed with
/// checked arithmetic; `None` is produced when an intermediate step
/// yields `None`.
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
    let total_bytes = byte_length.checked_add(self.additional_from_state());
    let code_units = checked_div(total_bytes, 2);
    // Three output bytes are budgeted per code unit.
    let utf8_bytes = checked_mul(3, code_units);
    checked_add(1, utf8_bytes)
}
// Invokes the `decoder_functions!` macro (defined outside this file) with four
// code blocks followed by the identifiers those blocks refer to.
// NOTE(review): the role of each block described below is inferred from its
// contents; confirm against the macro definition.
decoder_functions!(
{
// Block 1: flush a BMP code unit that was decoded earlier but deferred
// because an error had to be reported first. While `pending_bmp` is set,
// `lead_surrogate` holds that BMP code unit rather than a surrogate.
if self.pending_bmp {
match dest.check_space_bmp() {
Space::Full(_) => {
// No room for the deferred code unit: nothing read, nothing written.
return (DecoderResult::OutputFull, 0, 0);
}
Space::Available(destination_handle) => {
destination_handle.write_bmp(self.lead_surrogate);
self.pending_bmp = false;
self.lead_surrogate = 0;
}
}
}
},
{
// This is the fast path. The rest runs only at the
// start and end for partial sequences.
// It is taken only when neither a lead byte nor a lead surrogate is pending.
if self.lead_byte.is_none() && self.lead_surrogate == 0 {
// NOTE(review): a `Some((read, written))` result is reported as a
// two-byte malformed sequence, so it presumably signals an unpaired
// surrogate found by the bulk copy; confirm against `copy_utf16_from`.
if let Some((read, written)) = if self.be {
dest.copy_utf16_from::<BigEndian>(&mut source)
} else {
dest.copy_utf16_from::<LittleEndian>(&mut source)
} {
return (DecoderResult::Malformed(2, 0), read, written);
}
}
},
{
// Block 3: report leftover partial state as malformed and clear it.
// NOTE(review): presumably runs when the input has ended; confirm
// against the macro definition.
debug_assert!(!self.pending_bmp);
if self.lead_surrogate != 0 {
self.lead_surrogate = 0;
match self.lead_byte {
None => {
// Only an unpaired lead surrogate (two bytes) is left over.
return (DecoderResult::Malformed(2, 0), src_consumed, dest.written());
}
Some(_) => {
// An unpaired lead surrogate plus one dangling byte (three bytes).
self.lead_byte = None;
return (DecoderResult::Malformed(3, 0), src_consumed, dest.written());
}
}
}
match self.lead_byte {
None => {}
Some(_) => {
// A single dangling byte that never completed a code unit.
self.lead_byte = None;
return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
}
}
},
{
// Block 4: handle one input byte `b`. Bytes are collected in pairs; a
// completed pair is classified as lead surrogate, trail surrogate or
// other (BMP) code unit.
match self.lead_byte {
None => {
// First byte of a code unit: remember it and wait for the second.
self.lead_byte = Some(b);
continue;
}
Some(lead) => {
self.lead_byte = None;
// Combine the two bytes according to the configured byte order.
let code_unit = if self.be {
u16::from(lead) << 8 | u16::from(b)
} else {
u16::from(b) << 8 | u16::from(lead)
};
// The top six bits tell lead surrogates (0xD800) and trail
// surrogates (0xDC00) apart from everything else.
let high_bits = code_unit & 0xFC00u16;
if high_bits == 0xD800u16 {
// high surrogate
if self.lead_surrogate != 0 {
// The previous high surrogate was in
// error and this one becomes the new
// pending one.
self.lead_surrogate = code_unit as u16;
// NOTE(review): `Malformed(2, 2)` presumably means a two-byte
// malformed sequence followed by two further consumed bytes;
// confirm against `DecoderResult`.
return (
DecoderResult::Malformed(2, 2),
unread_handle.consumed(),
destination_handle.written(),
);
}
self.lead_surrogate = code_unit;
continue;
}
if high_bits == 0xDC00u16 {
// low surrogate
if self.lead_surrogate == 0 {
// A trail surrogate with no pending lead surrogate is malformed.
return (
DecoderResult::Malformed(2, 0),
unread_handle.consumed(),
destination_handle.written(),
);
}
destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit);
self.lead_surrogate = 0;
continue;
}
// bmp
if self.lead_surrogate != 0 {
// The previous high surrogate was in
// error and this code unit becomes a
// pending BMP character.
self.lead_surrogate = code_unit;
self.pending_bmp = true;
return (
DecoderResult::Malformed(2, 2),
unread_handle.consumed(),
destination_handle.written(),
);
}
destination_handle.write_bmp(code_unit);
continue;
}
}
},
// Identifiers used by the blocks above, followed by the name of a
// space-checking method.
// NOTE(review): presumably the macro binds these names in the code it
// generates; confirm against the macro definition.
self,
src_consumed,
dest,
source,
b,
destination_handle,
unread_handle,
check_space_astral
);
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(test)]
mod tests {
use super::super::testing::*;
use super::super::*;
// Runs the shared `decode_without_padding` test helper with the UTF-16LE
// encoding, `bytes` as input and `expect` as the expected text.
fn decode_utf_16le(bytes: &[u8], expect: &str) {
decode_without_padding(UTF_16LE, bytes, expect);
}
// Runs the shared `decode_without_padding` test helper with the UTF-16BE
// encoding, `bytes` as input and `expect` as the expected text.
fn decode_utf_16be(bytes: &[u8], expect: &str) {
decode_without_padding(UTF_16BE, bytes, expect);
}
// Runs the shared `encode` test helper with the UTF-16LE encoding, `string`
// as input and `expect` as the expected bytes.
fn encode_utf_16le(string: &str, expect: &[u8]) {
encode(UTF_16LE, string, expect);
}
// Runs the shared `encode` test helper with the UTF-16BE encoding, `string`
// as input and `expect` as the expected bytes.
fn encode_utf_16be(string: &str, expect: &[u8]) {
encode(UTF_16BE, string, expect);
}
#[test]
fn test_utf_16_decode() {
    // Each row: (input for the UTF-16LE decoder, input for the UTF-16BE
    // decoder, expected text). Rows run in order, LE first and then BE.
    let short_cases: [(&[u8], &[u8], &str); 8] = [
        (b"", b"", ""),
        (b"\x61\x00\x62\x00", b"\x00\x61\x00\x62", "\u{0061}\u{0062}"),
        (
            b"\xFE\xFF\x00\x61\x00\x62",
            b"\xFF\xFE\x61\x00\x62\x00",
            "\u{0061}\u{0062}",
        ),
        (b"\x61\x00\x62", b"\x00\x61\x00", "\u{0061}\u{FFFD}"),
        (b"\x3D\xD8\xA9", b"\xD8\x3D\xDC", "\u{FFFD}"),
        (
            b"\x3D\xD8\xA9\xDC\x03\x26",
            b"\xD8\x3D\xDC\xA9\x26\x03",
            "\u{1F4A9}\u{2603}",
        ),
        (b"\xA9\xDC\x03\x26", b"\xDC\xA9\x26\x03", "\u{FFFD}\u{2603}"),
        (b"\x3D\xD8\x03\x26", b"\xD8\x3D\x26\x03", "\u{FFFD}\u{2603}"),
    ];
    for &(le_bytes, be_bytes, expected) in short_cases.iter() {
        decode_utf_16le(le_bytes, expected);
        decode_utf_16be(be_bytes, expected);
    }

    // The \xFF makes sure that the parts before and after have different alignment
    let long_le = b"\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8";
    let long_be = b"\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D";
    let long_expect = "\x00\x00\x00\x00\u{1F4A9}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\x00\x00\x00\x00\u{FFFD}";

    // Split each long input at its midpoint; the second part starts with
    // the \xFF separator byte, which is skipped below.
    let (le_head, le_tail) = long_le[..].split_at(long_le.len() / 2);
    let (be_head, be_tail) = long_be[..].split_at(long_be.len() / 2);
    decode_utf_16le(le_head, long_expect);
    decode_utf_16be(be_head, long_expect);
    decode_utf_16le(&le_tail[1..], long_expect);
    decode_utf_16be(&be_tail[1..], long_expect);
}
#[test]
fn test_utf_16_encode() {
    // Empty
    let empty_text = "";
    encode_utf_16be(empty_text, b"");
    encode_utf_16le(empty_text, b"");

    // Encodes as UTF-8
    let le_encoder = UTF_16LE.new_encoder();
    assert_eq!(le_encoder.encoding(), UTF_8);
    let be_encoder = UTF_16BE.new_encoder();
    assert_eq!(be_encoder.encoding(), UTF_8);

    // The expected output bytes are the UTF-8 bytes of the input itself.
    let text = "\u{1F4A9}\u{2603}";
    encode_utf_16le(text, "\u{1F4A9}\u{2603}".as_bytes());
    encode_utf_16be(text, "\u{1F4A9}\u{2603}".as_bytes());
}
#[test]
fn test_utf_16be_decode_one_by_one() {
    // Feed the decoder a single byte per call and check that every call
    // consumes its byte without reporting an error.
    let bytes = b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9";
    let mut decoder = UTF_16BE.new_decoder();
    let mut buffer = [0u16; 20];
    for chunk in bytes.chunks(1) {
        assert_eq!(chunk.len(), 1);
        // Size the output window from the decoder's own estimate.
        let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        let window = &mut buffer[..capacity];
        let (status, consumed, _written, errors_seen) =
            decoder.decode_to_utf16(chunk, window, false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert!(!errors_seen);
    }
}
#[test]
fn test_utf_16le_decode_one_by_one() {
    // Feed the decoder a single byte per call and check that every call
    // consumes its byte without reporting an error.
    let bytes = b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC";
    let mut decoder = UTF_16LE.new_decoder();
    let mut buffer = [0u16; 20];
    for chunk in bytes.chunks(1) {
        assert_eq!(chunk.len(), 1);
        // Size the output window from the decoder's own estimate.
        let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        let window = &mut buffer[..capacity];
        let (status, consumed, _written, errors_seen) =
            decoder.decode_to_utf16(chunk, window, false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert!(!errors_seen);
    }
}
#[test]
fn test_utf_16be_decode_three_at_a_time() {
    // Three-byte chunks make every other call start or end in the middle
    // of a two-byte code unit.
    let bytes = b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4";
    let mut decoder = UTF_16BE.new_decoder();
    let mut buffer = [0u16; 20];
    for chunk in bytes.chunks(3) {
        assert_eq!(chunk.len(), 3);
        // Size the output window from the decoder's own estimate.
        let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        let window = &mut buffer[..capacity];
        let (status, consumed, _written, errors_seen) =
            decoder.decode_to_utf16(chunk, window, false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, chunk.len());
        assert!(!errors_seen);
    }
}
#[test]
fn test_utf_16le_decode_three_at_a_time() {
    // Three-byte chunks make every other call start or end in the middle
    // of a two-byte code unit.
    let bytes = b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00";
    let mut decoder = UTF_16LE.new_decoder();
    let mut buffer = [0u16; 20];
    for chunk in bytes.chunks(3) {
        assert_eq!(chunk.len(), 3);
        // Size the output window from the decoder's own estimate.
        let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        let window = &mut buffer[..capacity];
        let (status, consumed, _written, errors_seen) =
            decoder.decode_to_utf16(chunk, window, false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, chunk.len());
        assert!(!errors_seen);
    }
}
#[test]
fn test_utf_16le_decode_bom_prefixed_split_byte_pair() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut buffer = [0u16; 20];

    // First call: a lone 0xFF is consumed but produces no output yet.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, errors_seen) =
        decoder.decode_to_utf16(b"\xFF", &mut buffer[..capacity], false);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!((consumed, produced), (1, 0));
    assert!(!errors_seen);

    // Second (final) call: 0xFD completes the little-endian code unit 0xFDFF.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, errors_seen) =
        decoder.decode_to_utf16(b"\xFD", &mut buffer[..capacity], true);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!((consumed, produced), (1, 1));
    assert!(!errors_seen);
    assert_eq!(buffer[0], 0xFDFF);
}
#[test]
fn test_utf_16be_decode_bom_prefixed_split_byte_pair() {
    let mut decoder = UTF_16BE.new_decoder();
    let mut buffer = [0u16; 20];

    // First call: a lone 0xFE is consumed but produces no output yet.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, errors_seen) =
        decoder.decode_to_utf16(b"\xFE", &mut buffer[..capacity], false);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!((consumed, produced), (1, 0));
    assert!(!errors_seen);

    // Second (final) call: 0xFD completes the big-endian code unit 0xFEFD.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, errors_seen) =
        decoder.decode_to_utf16(b"\xFD", &mut buffer[..capacity], true);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!((consumed, produced), (1, 1));
    assert!(!errors_seen);
    assert_eq!(buffer[0], 0xFEFD);
}
#[test]
fn test_utf_16le_decode_bom_prefix() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut buffer = [0u16; 20];

    // A single 0xFF passed as the final input is consumed, flagged as an
    // error and decoded to one U+FFFD.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, errors_seen) =
        decoder.decode_to_utf16(b"\xFF", &mut buffer[..capacity], true);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!((consumed, produced), (1, 1));
    assert!(errors_seen);
    assert_eq!(buffer[0], 0xFFFD);
}
#[test]
fn test_utf_16be_decode_bom_prefix() {
    let mut decoder = UTF_16BE.new_decoder();
    let mut buffer = [0u16; 20];

    // A single 0xFE passed as the final input is consumed, flagged as an
    // error and decoded to one U+FFFD.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, errors_seen) =
        decoder.decode_to_utf16(b"\xFE", &mut buffer[..capacity], true);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!((consumed, produced), (1, 1));
    assert!(errors_seen);
    assert_eq!(buffer[0], 0xFFFD);
}
#[test]
fn test_utf_16le_decode_near_end() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut utf8_out = [0u8; 4];

    // First call: half a code unit is consumed and nothing is written.
    let (status, consumed, produced, errors_seen) =
        decoder.decode_to_utf8(&[0x03], &mut utf8_out[..], false);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!((consumed, produced), (1, 0));
    assert!(!errors_seen);
    assert_eq!(utf8_out[0], 0x0);

    // Second call: one more byte completes U+2603 (E2 98 83 in UTF-8); the
    // decoder then reports OutputFull with the last byte of the four-byte
    // buffer left untouched.
    let (status, consumed, produced, errors_seen) =
        decoder.decode_to_utf8(&[0x26, 0x03, 0x26], &mut utf8_out[..], false);
    assert_eq!(status, CoderResult::OutputFull);
    assert_eq!((consumed, produced), (1, 3));
    assert!(!errors_seen);
    assert_eq!(utf8_out, [0xE2, 0x98, 0x83, 0x00]);
}
#[test]
fn test_utf_16be_decode_near_end() {
    let mut decoder = UTF_16BE.new_decoder();
    let mut utf8_out = [0u8; 4];

    // First call: half a code unit is consumed and nothing is written.
    let (status, consumed, produced, errors_seen) =
        decoder.decode_to_utf8(&[0x26], &mut utf8_out[..], false);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!((consumed, produced), (1, 0));
    assert!(!errors_seen);
    assert_eq!(utf8_out[0], 0x0);

    // Second call: one more byte completes U+2603 (E2 98 83 in UTF-8); the
    // decoder then reports OutputFull with the last byte of the four-byte
    // buffer left untouched.
    let (status, consumed, produced, errors_seen) =
        decoder.decode_to_utf8(&[0x03, 0x26, 0x03], &mut utf8_out[..], false);
    assert_eq!(status, CoderResult::OutputFull);
    assert_eq!((consumed, produced), (1, 3));
    assert!(!errors_seen);
    assert_eq!(utf8_out, [0xE2, 0x98, 0x83, 0x00]);
}
}