| // Copyright Mozilla Foundation. See the COPYRIGHT |
| // file at the top-level directory of this distribution. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| use super::*; |
| use crate::data::*; |
| use crate::handles::*; |
| use crate::variant::*; |
| // Rust 1.14.0 requires the following despite the asterisk above. |
| use super::in_inclusive_range; |
| use super::in_inclusive_range16; |
| |
| pub struct ShiftJisDecoder { |
| lead: Option<u8>, |
| } |
| |
| impl ShiftJisDecoder { |
| pub fn new() -> VariantDecoder { |
| VariantDecoder::ShiftJis(ShiftJisDecoder { lead: None }) |
| } |
| |
| pub fn in_neutral_state(&self) -> bool { |
| self.lead.is_none() |
| } |
| |
| fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> { |
| byte_length.checked_add(match self.lead { |
| None => 0, |
| Some(_) => 1, |
| }) |
| } |
| |
| pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { |
| self.plus_one_if_lead(byte_length) |
| } |
| |
| pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { |
| // worst case: 1 to 3 (half-width katakana) |
| self.max_utf8_buffer_length(byte_length) |
| } |
| |
| pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { |
| checked_mul(3, self.plus_one_if_lead(byte_length)) |
| } |
| |
| ascii_compatible_two_byte_decoder_functions!( |
| { |
| // If lead is between 0x81 and 0x9F, inclusive, |
| // subtract offset 0x81. Else if lead is |
| // between 0xE0 and 0xFC, inclusive, subtract |
| // offset 0xC1. Else if lead is between |
| // 0xA1 and 0xDF, inclusive, map to half-width |
| // Katakana. Else if lead is 0x80, pass through. |
| let mut non_ascii_minus_offset = |
| non_ascii.wrapping_sub(0x81); |
| if non_ascii_minus_offset > (0x9F - 0x81) { |
| let non_ascii_minus_range_start = non_ascii.wrapping_sub(0xE0); |
| if non_ascii_minus_range_start > (0xFC - 0xE0) { |
| let non_ascii_minus_half_with_katakana_start = non_ascii.wrapping_sub(0xA1); |
| if non_ascii_minus_half_with_katakana_start > (0xDF - 0xA1) { |
| if non_ascii == 0x80 { |
| handle.write_mid_bmp(0x80); |
| // Not caring about optimizing subsequent non-ASCII |
| continue 'outermost; |
| } |
| return (DecoderResult::Malformed(1, 0), |
| source.consumed(), |
| handle.written()); |
| } |
| handle.write_upper_bmp(0xFF61 + u16::from(non_ascii_minus_half_with_katakana_start)); |
| // Not caring about optimizing subsequent non-ASCII |
| continue 'outermost; |
| } |
| non_ascii_minus_offset = non_ascii - 0xC1; |
| } |
| non_ascii_minus_offset |
| }, |
| { |
| // If trail is between 0x40 and 0x7E, inclusive, |
| // subtract offset 0x40. Else if trail is |
| // between 0x80 and 0xFC, inclusive, subtract |
| // offset 0x41. |
| // Fast-track Hiragana (60% according to Lunde) |
| // and Katakana (10% acconding to Lunde). |
| // Hiragana doesn't cross 0x7F, but Katakana does. |
| // We can check for Hiragana before normalizing |
| // trail. |
| let trail_minus_hiragana = byte.wrapping_sub(0x9F); |
| if lead_minus_offset == 0x01 && trail_minus_hiragana < 0x53 { |
| // Hiragana |
| handle.write_upper_bmp(0x3041 + u16::from(trail_minus_hiragana)) |
| } else { |
| let mut trail_minus_offset = |
| byte.wrapping_sub(0x40); |
| if trail_minus_offset > (0x7E - 0x40) { |
| let trail_minus_range_start = |
| byte.wrapping_sub(0x80); |
| if trail_minus_range_start > (0xFC - 0x80) { |
| if byte < 0x80 { |
| return (DecoderResult::Malformed(1, 0), |
| unread_handle_trail.unread(), |
| handle.written()); |
| } |
| return (DecoderResult::Malformed(2, 0), |
| unread_handle_trail.consumed(), |
| handle.written()); |
| } |
| trail_minus_offset = byte - 0x41; |
| } |
| if lead_minus_offset == 0x02 && |
| trail_minus_offset < 0x56 { |
| // Katakana |
| handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset)) |
| } else { |
| let pointer = lead_minus_offset as usize * |
| 188usize + |
| trail_minus_offset as usize; |
| let level1_pointer = pointer.wrapping_sub(1410); |
| if level1_pointer < JIS0208_LEVEL1_KANJI.len() { |
| handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer]) |
| } else { |
| let level2_pointer = pointer.wrapping_sub(4418); |
| if level2_pointer < |
| JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() { |
| handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer]) |
| } else { |
| let upper_ibm_pointer = pointer.wrapping_sub(10744); |
| if upper_ibm_pointer < IBM_KANJI.len() { |
| handle.write_upper_bmp(IBM_KANJI[upper_ibm_pointer]) |
| } else { |
| let lower_ibm_pointer = pointer.wrapping_sub(8272); |
| if lower_ibm_pointer < IBM_KANJI.len() { |
| handle.write_upper_bmp(IBM_KANJI[lower_ibm_pointer]) |
| } else if in_inclusive_range(pointer, 8836, 10715) { |
| handle.write_upper_bmp((0xE000 - 8836 + pointer) as u16) |
| } else if let Some(bmp) = jis0208_symbol_decode(pointer) { |
| handle.write_bmp_excl_ascii(bmp) |
| } else if let Some(bmp) = jis0208_range_decode(pointer) { |
| handle.write_bmp_excl_ascii(bmp) |
| } else { |
| if byte < 0x80 { |
| return (DecoderResult::Malformed(1, 0), |
| unread_handle_trail.unread(), |
| handle.written()); |
| } |
| return (DecoderResult::Malformed(2, 0), |
| unread_handle_trail.consumed(), |
| handle.written()); |
| } |
| } |
| } |
| } |
| } |
| } |
| }, |
| self, |
| non_ascii, |
| byte, |
| lead_minus_offset, |
| unread_handle_trail, |
| source, |
| handle, |
| 'outermost, |
| copy_ascii_from_check_space_bmp, |
| check_space_bmp, |
| false); |
| } |
| |
| #[cfg(feature = "fast-kanji-encode")] |
| #[inline(always)] |
| fn encode_kanji(bmp: u16) -> Option<(u8, u8)> { |
| jis0208_kanji_shift_jis_encode(bmp) |
| } |
| |
| #[cfg(not(feature = "fast-kanji-encode"))] |
| #[inline(always)] |
| fn encode_kanji(bmp: u16) -> Option<(u8, u8)> { |
| if let Some((lead, trail)) = jis0208_level1_kanji_shift_jis_encode(bmp) { |
| return Some((lead, trail)); |
| } |
| let pointer = if 0x4EDD == bmp { |
| // Ideograph on the symbol row! |
| 23 |
| } else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) { |
| 4418 + pos |
| } else if let Some(pos) = position(&IBM_KANJI[..], bmp) { |
| 10744 + pos |
| } else { |
| return None; |
| }; |
| let lead = pointer / 188; |
| let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize }; |
| let trail = pointer % 188; |
| let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize }; |
| Some(((lead + lead_offset) as u8, (trail + trail_offset) as u8)) |
| } |
| |
| pub struct ShiftJisEncoder; |
| |
| impl ShiftJisEncoder { |
| pub fn new(encoding: &'static Encoding) -> Encoder { |
| Encoder::new(encoding, VariantEncoder::ShiftJis(ShiftJisEncoder)) |
| } |
| |
| pub fn max_buffer_length_from_utf16_without_replacement( |
| &self, |
| u16_length: usize, |
| ) -> Option<usize> { |
| u16_length.checked_mul(2) |
| } |
| |
| pub fn max_buffer_length_from_utf8_without_replacement( |
| &self, |
| byte_length: usize, |
| ) -> Option<usize> { |
| byte_length.checked_add(1) |
| } |
| |
| ascii_compatible_bmp_encoder_functions!( |
| { |
| // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana |
| let bmp_minus_hiragana = bmp.wrapping_sub(0x3041); |
| if bmp_minus_hiragana < 0x53 { |
| handle.write_two(0x82, 0x9F + bmp_minus_hiragana as u8) |
| } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) { |
| if let Some((lead, trail)) = encode_kanji(bmp) { |
| handle.write_two(lead, trail) |
| } else { |
| return ( |
| EncoderResult::unmappable_from_bmp(bmp), |
| source.consumed(), |
| handle.written(), |
| ); |
| } |
| } else { |
| let bmp_minus_katakana = bmp.wrapping_sub(0x30A1); |
| if bmp_minus_katakana < 0x56 { |
| let trail_offset = if bmp_minus_katakana < 0x3F { |
| 0x40 |
| } else { |
| 0x41 |
| }; |
| handle.write_two(0x83, (trail_offset + bmp_minus_katakana) as u8) |
| } else { |
| let bmp_minus_space = bmp.wrapping_sub(0x3000); |
| if bmp_minus_space < 3 { |
| // fast-track common punctuation |
| handle.write_two(0x81, 0x40 + bmp_minus_space as u8) |
| } else if bmp == 0xA5 { |
| handle.write_one(0x5Cu8) |
| } else if bmp == 0x80 { |
| handle.write_one(0x80u8) |
| } else if bmp == 0x203E { |
| handle.write_one(0x7Eu8) |
| } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) { |
| handle.write_one((bmp - (0xFF61 - 0xA1)) as u8) |
| } else if bmp == 0x2212 { |
| handle.write_two(0x81u8, 0x7Cu8) |
| } else { |
| let bmp_minus_roman = bmp.wrapping_sub(0x2170); |
| let pointer = if bmp_minus_roman <= (0x2179 - 0x2170) { |
| 10716 + bmp_minus_roman as usize |
| } else if let Some(pointer) = jis0208_range_encode(bmp) { |
| pointer |
| } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) |
| || bmp == 0xF929 |
| || bmp == 0xF9DC |
| { |
| // Guaranteed to be found in IBM_KANJI |
| let pos = position(&IBM_KANJI[..], bmp).unwrap(); |
| 10744 + pos |
| } else if let Some(pointer) = jis0208_symbol_encode(bmp) { |
| pointer |
| } else { |
| return ( |
| EncoderResult::unmappable_from_bmp(bmp), |
| source.consumed(), |
| handle.written(), |
| ); |
| }; |
| let lead = pointer / 188; |
| let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize }; |
| let trail = pointer % 188; |
| let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize }; |
| handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8) |
| } |
| } |
| } |
| }, |
| bmp, |
| self, |
| source, |
| handle, |
| copy_ascii_to_check_space_two, |
| check_space_two, |
| false |
| ); |
| } |
| |
| // Any copyright to the test code below this comment is dedicated to the |
| // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
| |
| #[cfg(all(test, feature = "alloc"))] |
| mod tests { |
| use super::super::testing::*; |
| use super::super::*; |
| |
| fn decode_shift_jis(bytes: &[u8], expect: &str) { |
| decode(SHIFT_JIS, bytes, expect); |
| } |
| |
| fn encode_shift_jis(string: &str, expect: &[u8]) { |
| encode(SHIFT_JIS, string, expect); |
| } |
| |
| #[test] |
| fn test_shift_jis_decode() { |
| // Empty |
| decode_shift_jis(b"", &""); |
| |
| // ASCII |
| decode_shift_jis(b"\x61\x62", "\u{0061}\u{0062}"); |
| |
| // Half-width |
| decode_shift_jis(b"\xA1", "\u{FF61}"); |
| decode_shift_jis(b"\xDF", "\u{FF9F}"); |
| decode_shift_jis(b"\xA0", "\u{FFFD}"); |
| decode_shift_jis(b"\xE0", "\u{FFFD}"); |
| decode_shift_jis(b"\xA0+", "\u{FFFD}+"); |
| decode_shift_jis(b"\xE0+", "\u{FFFD}+"); |
| |
| // EUDC |
| decode_shift_jis(b"\xF0\x40", "\u{E000}"); |
| decode_shift_jis(b"\xF9\xFC", "\u{E757}"); |
| decode_shift_jis(b"\xEF\xFC", "\u{FFFD}"); |
| decode_shift_jis(b"\xFA\x40", "\u{2170}"); |
| |
| // JIS 0208 |
| decode_shift_jis(b"\x81\x40", "\u{3000}"); |
| decode_shift_jis(b"\x81\x3F", "\u{FFFD}?"); |
| decode_shift_jis(b"\xEE\xFC", "\u{FF02}"); |
| decode_shift_jis(b"\xEE\xFD", "\u{FFFD}"); |
| decode_shift_jis(b"\xFA\x40", "\u{2170}"); |
| decode_shift_jis(b"\xFA\x3F", "\u{FFFD}?"); |
| decode_shift_jis(b"\xFC\x4B", "\u{9ED1}"); |
| decode_shift_jis(b"\xFC\x4C", "\u{FFFD}L"); |
| // |
| } |
| |
| #[test] |
| fn test_shift_jis_encode() { |
| // Empty |
| encode_shift_jis("", b""); |
| |
| // ASCII |
| encode_shift_jis("\u{0061}\u{0062}", b"\x61\x62"); |
| |
| // Exceptional code points |
| encode_shift_jis("\u{0080}", b"\x80"); |
| encode_shift_jis("\u{00A5}", b"\x5C"); |
| encode_shift_jis("\u{203E}", b"\x7E"); |
| encode_shift_jis("\u{2212}", b"\x81\x7C"); |
| |
| // Half-width |
| encode_shift_jis("\u{FF61}", b"\xA1"); |
| encode_shift_jis("\u{FF9F}", b"\xDF"); |
| |
| // EUDC |
| encode_shift_jis("\u{E000}", b""); |
| encode_shift_jis("\u{E757}", b""); |
| |
| // JIS 0212 |
| encode_shift_jis("\u{02D8}", b"˘"); |
| |
| // JIS 0208 |
| encode_shift_jis("\u{3000}", b"\x81\x40"); |
| encode_shift_jis("\u{FF02}", b"\xFA\x57"); |
| encode_shift_jis("\u{2170}", b"\xFA\x40"); |
| encode_shift_jis("\u{9ED1}", b"\xFC\x4B"); |
| } |
| |
| #[test] |
| #[cfg_attr(miri, ignore)] // Miri is too slow |
| fn test_shift_jis_decode_all() { |
| let input = include_bytes!("test_data/shift_jis_in.txt"); |
| let expectation = include_str!("test_data/shift_jis_in_ref.txt"); |
| let (cow, had_errors) = SHIFT_JIS.decode_without_bom_handling(input); |
| assert!(had_errors, "Should have had errors."); |
| assert_eq!(&cow[..], expectation); |
| } |
| |
| #[test] |
| #[cfg_attr(miri, ignore)] // Miri is too slow |
| fn test_shift_jis_encode_all() { |
| let input = include_str!("test_data/shift_jis_out.txt"); |
| let expectation = include_bytes!("test_data/shift_jis_out_ref.txt"); |
| let (cow, encoding, had_errors) = SHIFT_JIS.encode(input); |
| assert!(!had_errors, "Should not have had errors."); |
| assert_eq!(encoding, SHIFT_JIS); |
| assert_eq!(&cow[..], &expectation[..]); |
| } |
| |
| #[test] |
| fn test_shift_jis_half_width_katakana_length() { |
| let mut output = [0u8; 20]; |
| let mut decoder = SHIFT_JIS.new_decoder(); |
| { |
| let needed = decoder |
| .max_utf8_buffer_length_without_replacement(1) |
| .unwrap(); |
| let (result, read, written) = |
| decoder.decode_to_utf8_without_replacement(b"\xA1", &mut output[..needed], true); |
| assert_eq!(result, DecoderResult::InputEmpty); |
| assert_eq!(read, 1); |
| assert_eq!(written, 3); |
| assert_eq!(output[0], 0xEF); |
| assert_eq!(output[1], 0xBD); |
| assert_eq!(output[2], 0xA1); |
| } |
| } |
| } |