// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range32;

/// Streaming decoder state for Big5.
pub struct Big5Decoder {
    /// A lead byte held over from earlier input, or `None` when no lead
    /// byte is pending.
    lead: Option<u8>,
}

| impl Big5Decoder { |
| pub fn new() -> VariantDecoder { |
| VariantDecoder::Big5(Big5Decoder { lead: None }) |
| } |
| |
| pub fn in_neutral_state(&self) -> bool { |
| self.lead.is_none() |
| } |
| |
| fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> { |
| byte_length.checked_add(match self.lead { |
| None => 0, |
| Some(_) => 1, |
| }) |
| } |
| |
| pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { |
| // If there is a lead but the next byte isn't a valid trail, an |
| // error is generated for the lead (+1). Then another iteration checks |
| // space, which needs +1 to account for the possibility of astral |
| // output or combining pair. |
| checked_add(1, self.plus_one_if_lead(byte_length)) |
| } |
| |
| pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { |
| // No need to account for REPLACEMENT CHARACTERS. |
| // Cases: |
| // ASCII: 1 to 1 |
| // Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4 |
| // lead set and first byte is trail: 1 to 4 worst case |
| // |
| // When checking for space for the last byte: |
| // no lead: the last byte must be ASCII (or fatal error): 1 to 1 |
| // lead set: space for 4 bytes was already checked when reading the |
| // lead, hence the last lead and the last trail together are worst |
| // case 2 to 4. |
| // |
| // If lead set and the input is a single trail byte, the worst-case |
| // output is 4, so we need to add one before multiplying if lead is |
| // set. |
| // |
| // Finally, add two so that if input is non-zero, the output is at |
| // least 4. |
| checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length))) |
| } |
| |
| pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { |
| // If there is a lead but the next byte isn't a valid trail, an |
| // error is generated for the lead (+(1*3)). Then another iteration |
| // checks space, which needs +3 to account for the possibility of astral |
| // output or combining pair. In between start and end, the worst case |
| // is that every byte is bad: *3. |
| checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length))) |
| } |
| |
| ascii_compatible_two_byte_decoder_functions!( |
| { |
| // If lead is between 0x81 and 0xFE, inclusive, |
| // subtract offset 0x81. |
| let non_ascii_minus_offset = |
| non_ascii.wrapping_sub(0x81); |
| if non_ascii_minus_offset > (0xFE - 0x81) { |
| return (DecoderResult::Malformed(1, 0), |
| source.consumed(), |
| handle.written()); |
| } |
| non_ascii_minus_offset |
| }, |
| { |
| // If trail is between 0x40 and 0x7E, inclusive, |
| // subtract offset 0x40. Else if trail is |
| // between 0xA1 and 0xFE, inclusive, subtract |
| // offset 0x62. |
| // TODO: Find out which range is more probable. |
| let mut trail_minus_offset = |
| byte.wrapping_sub(0x40); |
| if trail_minus_offset > (0x7E - 0x40) { |
| let trail_minus_range_start = |
| byte.wrapping_sub(0xA1); |
| if trail_minus_range_start > |
| (0xFE - 0xA1) { |
| if byte < 0x80 { |
| return (DecoderResult::Malformed(1, 0), |
| unread_handle_trail.unread(), |
| handle.written()); |
| } |
| return (DecoderResult::Malformed(2, 0), |
| unread_handle_trail.consumed(), |
| handle.written()); |
| } |
| trail_minus_offset = byte - 0x62; |
| } |
| let pointer = lead_minus_offset as usize * |
| 157usize + |
| trail_minus_offset as usize; |
| let rebased_pointer = pointer.wrapping_sub(942); |
| let low_bits = big5_low_bits(rebased_pointer); |
| if low_bits == 0 { |
| match pointer { |
| 1133 => { |
| handle.write_big5_combination(0x00CAu16, |
| 0x0304u16) |
| } |
| 1135 => { |
| handle.write_big5_combination(0x00CAu16, |
| 0x030Cu16) |
| } |
| 1164 => { |
| handle.write_big5_combination(0x00EAu16, |
| 0x0304u16) |
| } |
| 1166 => { |
| handle.write_big5_combination(0x00EAu16, |
| 0x030Cu16) |
| } |
| _ => { |
| if byte < 0x80 { |
| return (DecoderResult::Malformed(1, 0), |
| unread_handle_trail.unread(), |
| handle.written()); |
| } |
| return (DecoderResult::Malformed(2, 0), |
| unread_handle_trail.consumed(), |
| handle.written()); |
| } |
| } |
| } else if big5_is_astral(rebased_pointer) { |
| handle.write_astral(u32::from(low_bits) | |
| 0x20000u32) |
| } else { |
| handle.write_bmp_excl_ascii(low_bits) |
| } |
| }, |
| self, |
| non_ascii, |
| byte, |
| lead_minus_offset, |
| unread_handle_trail, |
| source, |
| handle, |
| 'outermost, |
| copy_ascii_from_check_space_astral, |
| check_space_astral, |
| false); |
| } |
| |
/// Encoder for Big5. It is a unit struct and carries no state.
pub struct Big5Encoder;

| impl Big5Encoder { |
| pub fn new(encoding: &'static Encoding) -> Encoder { |
| Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder)) |
| } |
| |
| pub fn max_buffer_length_from_utf16_without_replacement( |
| &self, |
| u16_length: usize, |
| ) -> Option<usize> { |
| // Astral: 2 to 2 |
| // ASCII: 1 to 1 |
| // Other: 1 to 2 |
| u16_length.checked_mul(2) |
| } |
| |
| pub fn max_buffer_length_from_utf8_without_replacement( |
| &self, |
| byte_length: usize, |
| ) -> Option<usize> { |
| // Astral: 4 to 2 |
| // Upper BMP: 3 to 2 |
| // Lower BMP: 2 to 2 |
| // ASCII: 1 to 1 |
| byte_length.checked_add(1) |
| } |
| |
| ascii_compatible_encoder_functions!( |
| { |
| // For simplicity, unified ideographs |
| // in the pointer range 11206...11212 are handled |
| // as Level 1 Hanzi. |
| if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) { |
| handle.write_two(lead, trail) |
| } else { |
| let pointer = if let Some(pointer) = big5_box_encode(bmp) { |
| pointer |
| } else if let Some(pointer) = big5_other_encode(bmp) { |
| pointer |
| } else { |
| return ( |
| EncoderResult::unmappable_from_bmp(bmp), |
| source.consumed(), |
| handle.written(), |
| ); |
| }; |
| let lead = pointer / 157 + 0x81; |
| let remainder = pointer % 157; |
| let trail = if remainder < 0x3F { |
| remainder + 0x40 |
| } else { |
| remainder + 0x62 |
| }; |
| handle.write_two(lead as u8, trail as u8) |
| } |
| }, |
| { |
| if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) { |
| if let Some(rebased_pointer) = big5_astral_encode(astral as u16) { |
| // big5_astral_encode returns rebased pointer, |
| // so adding 0x87 instead of 0x81. |
| let lead = rebased_pointer / 157 + 0x87; |
| let remainder = rebased_pointer % 157; |
| let trail = if remainder < 0x3F { |
| remainder + 0x40 |
| } else { |
| remainder + 0x62 |
| }; |
| handle.write_two(lead as u8, trail as u8) |
| } else { |
| return ( |
| EncoderResult::Unmappable(astral), |
| source.consumed(), |
| handle.written(), |
| ); |
| } |
| } else { |
| return ( |
| EncoderResult::Unmappable(astral), |
| source.consumed(), |
| handle.written(), |
| ); |
| } |
| }, |
| bmp, |
| astral, |
| self, |
| source, |
| handle, |
| copy_ascii_to_check_space_two, |
| check_space_two, |
| false |
| ); |
| } |
| |
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Big5 and checks the result against `expect`.
    fn decode_big5(bytes: &[u8], expect: &str) {
        decode(BIG5, bytes, expect);
    }

    /// Encodes `string` as Big5 and checks the result against `expect`.
    fn encode_big5(string: &str, expect: &[u8]) {
        encode(BIG5, string, expect);
    }

    #[test]
    fn test_big5_decode() {
        // Empty
        decode_big5(b"", "");

        // ASCII
        decode_big5(&[0x61u8, 0x62u8], "\u{0061}\u{0062}");

        // Edge cases
        decode_big5(&[0x87u8, 0x40u8], "\u{43F0}");
        decode_big5(&[0xFEu8, 0xFEu8], "\u{79D4}");
        decode_big5(&[0xFEu8, 0xFDu8], "\u{2910D}");
        decode_big5(&[0x88u8, 0x62u8], "\u{00CA}\u{0304}");
        decode_big5(&[0x88u8, 0x64u8], "\u{00CA}\u{030C}");
        decode_big5(&[0x88u8, 0x66u8], "\u{00CA}");
        decode_big5(&[0x88u8, 0xA3u8], "\u{00EA}\u{0304}");
        decode_big5(&[0x88u8, 0xA5u8], "\u{00EA}\u{030C}");
        decode_big5(&[0x88u8, 0xA7u8], "\u{00EA}");
        decode_big5(&[0x99u8, 0xD4u8], "\u{8991}");
        decode_big5(&[0x99u8, 0xD5u8], "\u{27967}");
        decode_big5(&[0x99u8, 0xD6u8], "\u{8A29}");

        // Edge cases surrounded with ASCII
        decode_big5(
            &[0x61u8, 0x87u8, 0x40u8, 0x62u8],
            "\u{0061}\u{43F0}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFEu8, 0x62u8],
            "\u{0061}\u{79D4}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFDu8, 0x62u8],
            "\u{0061}\u{2910D}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x62u8, 0x62u8],
            "\u{0061}\u{00CA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x64u8, 0x62u8],
            "\u{0061}\u{00CA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x66u8, 0x62u8],
            "\u{0061}\u{00CA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA3u8, 0x62u8],
            "\u{0061}\u{00EA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA5u8, 0x62u8],
            "\u{0061}\u{00EA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA7u8, 0x62u8],
            "\u{0061}\u{00EA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD4u8, 0x62u8],
            "\u{0061}\u{8991}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD5u8, 0x62u8],
            "\u{0061}\u{27967}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD6u8, 0x62u8],
            "\u{0061}\u{8A29}\u{0062}",
        );

        // Bad sequences
        decode_big5(&[0x80u8, 0x61u8], "\u{FFFD}\u{0061}");
        decode_big5(&[0xFFu8, 0x61u8], "\u{FFFD}\u{0061}");
        decode_big5(&[0xFEu8, 0x39u8], "\u{FFFD}\u{0039}");
        decode_big5(&[0x87u8, 0x66u8], "\u{FFFD}\u{0066}");
        decode_big5(&[0x81u8, 0x40u8], "\u{FFFD}\u{0040}");
        decode_big5(&[0x61u8, 0x81u8], "\u{0061}\u{FFFD}");
    }

    #[test]
    fn test_big5_encode() {
        // Empty
        encode_big5("", b"");

        // ASCII
        encode_big5("\u{0061}\u{0062}", b"\x61\x62");

        if !cfg!(miri) {
            // Miri is too slow
            // Edge cases
            // Unmappable characters are expected as decimal numeric
            // character references (U+9EA6 == 40614, U+2626B == 156267).
            encode_big5("\u{9EA6}\u{0061}", b"&#40614;\x61");
            encode_big5("\u{2626B}\u{0061}", b"&#156267;\x61");
            encode_big5("\u{3000}", b"\xA1\x40");
            encode_big5("\u{20AC}", b"\xA3\xE1");
            encode_big5("\u{4E00}", b"\xA4\x40");
            encode_big5("\u{27607}", b"\xC8\xA4");
            encode_big5("\u{FFE2}", b"\xC8\xCD");
            encode_big5("\u{79D4}", b"\xFE\xFE");

            // Not in index (U+2603 == 9731)
            encode_big5("\u{2603}\u{0061}", b"&#9731;\x61");
        }

        // duplicate low bits
        encode_big5("\u{203B5}", b"\xFD\x6A");
        encode_big5("\u{25605}", b"\xFE\x46");

        // prefer last
        encode_big5("\u{2550}", b"\xF9\xF9");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_decode_all() {
        let input = include_bytes!("test_data/big5_in.txt");
        let expectation = include_str!("test_data/big5_in_ref.txt");
        let (cow, had_errors) = BIG5.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_all() {
        let input = include_str!("test_data/big5_out.txt");
        let expectation = include_bytes!("test_data/big5_out_ref.txt");
        let (cow, encoding, had_errors) = BIG5.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, BIG5);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_from_two_low_surrogates() {
        // Each unpaired low surrogate is expected as the numeric character
        // reference for U+FFFD (65533).
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = BIG5.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}