| // This file is part of ICU4X. For terms of use, please see the file |
| // called LICENSE at the top level of the ICU4X source tree |
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
| |
| //! This is the main module pertaining to casemapping exceptions. |
| //! |
| //! A single exception is represented by the [`Exception`] type and its ULE equivalent. |
| //! |
| //! The storage format is complicated (and documented on [`Exception`]), but the data format is |
| //! represented equally by [`DecodedException`], which is more human-readable. |
| use icu_provider::prelude::*; |
| |
| use super::data::MappingKind; |
| use super::exception_helpers::{ExceptionBits, ExceptionSlot, SlotPresence}; |
| use crate::set::ClosureSink; |
| use alloc::borrow::Cow; |
| use core::fmt; |
| #[cfg(any(feature = "serde", feature = "datagen"))] |
| use core::ops::Range; |
| use core::ptr; |
| use zerovec::ule::AsULE; |
| use zerovec::VarZeroVec; |
| |
/// First code point of the surrogate range `0xD800-0xDFFF`; no `char` can hold a value in
/// that range, so deltas stored as `char`s have to skip over it.
const SURROGATES_START: u32 = 0xD800;
/// Number of code points in the surrogate range `0xD800-0xDFFF`.
const SURROGATES_LEN: u32 = 0xE000 - SURROGATES_START;
| |
| /// This represents case mapping exceptions that can't be represented as a delta applied to |
| /// the original code point. The codepoint |
| /// trie in CaseMapper stores indices into this VarZeroVec. |
| /// |
| /// <div class="stab unstable"> |
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed |
| /// to be stable, their Rust representation might not be. Use with caution. |
| /// </div> |
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] |
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] |
| #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::exceptions))] |
| #[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] |
| pub struct CaseMapExceptions<'data> { |
| #[cfg_attr(feature = "serde", serde(borrow))] |
| /// The list of exceptions |
| pub exceptions: VarZeroVec<'data, ExceptionULE>, |
| } |
| |
| impl CaseMapExceptions<'_> { |
| /// Obtain the exception at index `idx`. Will |
| /// return a default value if not present (GIGO behavior), |
| /// as these indices should come from a paired CaseMapData object |
| /// |
| /// Will also panic in debug mode |
| pub fn get(&self, idx: u16) -> &ExceptionULE { |
| let exception = self.exceptions.get(idx.into()); |
| debug_assert!(exception.is_some()); |
| |
| exception.unwrap_or(ExceptionULE::empty_exception()) |
| } |
| |
| #[cfg(any(feature = "serde", feature = "datagen"))] |
| pub(crate) fn validate(&self) -> Result<Range<u16>, &'static str> { |
| for exception in self.exceptions.iter() { |
| exception.validate()?; |
| } |
| u16::try_from(self.exceptions.len()) |
| .map_err(|_| "Too many exceptions") |
| .map(|l| 0..l) |
| } |
| } |
| /// A type representing the wire format of `Exception`. The data contained is |
| /// equivalently represented by [`DecodedException`]. |
| /// |
| /// This type is itself not used that much, most of its relevant methods live |
| /// on [`ExceptionULE`]. |
| /// |
| /// The `bits` contain supplementary data, whereas |
| /// `slot_presence` marks te presence of various extra data |
| /// in the `data` field. |
| /// |
| /// The `data` field is not validated to contain all of this data, |
| /// this type will have GIGO behavior when constructed with invalid `data`. |
| /// |
| /// The format of `data` is documented on the field |
| /// |
| /// <div class="stab unstable"> |
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed |
| /// to be stable, their Rust representation might not be. Use with caution. |
| /// </div> |
| #[zerovec::make_varule(ExceptionULE)] |
| #[derive(PartialEq, Eq, Clone, Default, Debug)] |
| #[zerovec::skip_derive(Ord)] |
| #[cfg_attr( |
| feature = "serde", |
| derive(serde::Deserialize), |
| zerovec::derive(Deserialize) |
| )] |
| #[cfg_attr( |
| feature = "datagen", |
| derive(serde::Serialize), |
| zerovec::derive(Serialize) |
| )] |
| pub struct Exception<'a> { |
| /// The various bit based exception data associated with this. |
| /// |
| /// Format: Just a u8 of bitflags, some flags unused. See [`ExceptionBits`] and its ULE type for more. |
| pub bits: ExceptionBits, |
| /// Which slots are present in `data`. |
| /// |
| /// Format: a u8 of bitflags |
| pub slot_presence: SlotPresence, |
| /// Format : `[char slots] [optional closure length] [ closure slot ] [ full mappings data ]` |
| /// |
| /// For each set SlotPresence bit, except for the two stringy slots (Closure/FullMapping), |
| /// this will have one entry in the string, packed together. |
| /// |
| /// Note that the simple_case delta is stored as a u32 normalized to a `char`, where u32s |
| /// which are from or beyond the surrogate range 0xD800-0xDFFF are stored as chars |
| /// starting from 0xE000. The sign is stored in bits.negative_delta. |
| /// |
| /// If both Closure/FullMapping are present, the next char will be the length of the closure slot, |
| /// bisecting the rest of the data. |
| /// If only one is present, the rest of the data represents that slot. |
| /// |
| /// The closure slot simply represents one string. The full-mappings slot represents four strings, |
| /// packed in a way similar to VarZeroVec, in the following format: |
| /// `i1 i2 i3 [ str0 ] [ str1 ] [ str2 ] [ str3 ]` |
| /// |
| /// where `i1 i2 i3` are the indices of the relevant mappings string. The strings are stored in |
| /// the order corresponding to the MappingKind enum. |
| pub data: Cow<'a, str>, |
| } |
| |
| impl ExceptionULE { |
| #[inline] |
| fn empty_exception() -> &'static Self { |
| static EMPTY_BYTES: &[u8] = &[0, 0]; |
| // Safety: |
| // ExceptionULE is a packed DST with `(u8, u8, unsized)` fields. All bit patterns are valid for the two u8s |
| // |
| // An "empty" one can be constructed from a slice of two u8s |
| unsafe { |
| let slice: *const [u8] = ptr::slice_from_raw_parts(EMPTY_BYTES.as_ptr(), 0); |
| &*(slice as *const Self) |
| } |
| } |
| pub(crate) fn has_slot(&self, slot: ExceptionSlot) -> bool { |
| self.slot_presence.has_slot(slot) |
| } |
| /// Obtain a `char` slot, if occupied. If `slot` represents a string slot, |
| /// will return `None` |
| pub(crate) fn get_char_slot(&self, slot: ExceptionSlot) -> Option<char> { |
| if slot >= ExceptionSlot::STRING_SLOTS_START { |
| return None; |
| } |
| let bit = 1 << (slot as u8); |
| // check if slot is occupied |
| if self.slot_presence.0 & bit == 0 { |
| return None; |
| } |
| |
| let previous_slot_mask = bit - 1; |
| let previous_slots = self.slot_presence.0 & previous_slot_mask; |
| let slot_num = previous_slots.count_ones() as usize; |
| self.data.chars().nth(slot_num) |
| } |
| |
| /// Get the `simple_case` delta (i.e. the `delta` slot), given the character |
| /// this data belongs to. |
| /// |
| /// Normalizes the delta from char-format to u32 format |
| /// |
| /// Does *not* handle the sign of the delta; see self.bits.negative_delta |
| fn get_simple_case_delta(&self) -> Option<u32> { |
| let delta_ch = self.get_char_slot(ExceptionSlot::Delta)?; |
| let mut delta = u32::from(delta_ch); |
| // We "fill in" the surrogates range by offsetting deltas greater than it |
| if delta >= SURROGATES_START { |
| delta -= SURROGATES_LEN; |
| } |
| Some(delta) |
| } |
| |
| /// Get the `simple_case` value (i.e. the `delta` slot), given the character |
| /// this data belongs to. |
| /// |
| /// The data is stored as a delta so the character must be provided. |
| /// |
| /// The data cannot be stored directly as a character because the trie is more |
| /// compact with adjacent characters sharing deltas. |
| pub(crate) fn get_simple_case_slot_for(&self, ch: char) -> Option<char> { |
| let delta = self.get_simple_case_delta()?; |
| let mut delta = i32::try_from(delta).ok()?; |
| if self.bits.negative_delta() { |
| delta = -delta; |
| } |
| |
| let new_ch = i32::try_from(u32::from(ch)).ok()? + delta; |
| |
| char::try_from(u32::try_from(new_ch).ok()?).ok() |
| } |
| |
| /// Returns *all* the data in the closure/full slots, including length metadata |
| fn get_stringy_data(&self) -> Option<&str> { |
| const CHAR_MASK: u8 = (1 << ExceptionSlot::STRING_SLOTS_START as u8) - 1; |
| let char_slot_count = (self.slot_presence.0 & CHAR_MASK).count_ones() as usize; |
| let mut chars = self.data.chars(); |
| for _ in 0..char_slot_count { |
| let res = chars.next(); |
| res?; |
| } |
| Some(chars.as_str()) |
| } |
| |
| /// Returns a single stringy slot, either ExceptionSlot::Closure |
| /// or ExceptionSlot::FullMappings. |
| fn get_stringy_slot(&self, slot: ExceptionSlot) -> Option<&str> { |
| debug_assert!(slot == ExceptionSlot::Closure || slot == ExceptionSlot::FullMappings); |
| let other_slot = if slot == ExceptionSlot::Closure { |
| ExceptionSlot::FullMappings |
| } else { |
| ExceptionSlot::Closure |
| }; |
| if !self.slot_presence.has_slot(slot) { |
| return None; |
| } |
| let stringy_data = self.get_stringy_data()?; |
| |
| if self.slot_presence.has_slot(other_slot) { |
| // both stringy slots are used, we need a length |
| let mut chars = stringy_data.chars(); |
| // GIGO: to have two strings there must be a length, if not present return None |
| let length_char = chars.next()?; |
| |
| let length = usize::try_from(u32::from(length_char)).unwrap_or(0); |
| // The length indexes into the string after the first char |
| let remaining_slice = chars.as_str(); |
| // GIGO: will return none if there wasn't enough space in this slot |
| if slot == ExceptionSlot::Closure { |
| remaining_slice.get(0..length) |
| } else { |
| remaining_slice.get(length..) |
| } |
| } else { |
| // only a single stringy slot, there is no length stored |
| Some(stringy_data) |
| } |
| } |
| |
| /// Get the data behind the `closure` slot |
| pub(crate) fn get_closure_slot(&self) -> Option<&str> { |
| self.get_stringy_slot(ExceptionSlot::Closure) |
| } |
| |
| /// Get all the slot data for the FullMappings slot |
| /// |
| /// This needs to be further segmented into four based on length metadata |
| fn get_fullmappings_slot_data(&self) -> Option<&str> { |
| self.get_stringy_slot(ExceptionSlot::FullMappings) |
| } |
| |
| /// Get a specific FullMappings slot value |
| pub(crate) fn get_fullmappings_slot_for_kind(&self, kind: MappingKind) -> Option<&str> { |
| let data = self.get_fullmappings_slot_data()?; |
| |
| let mut chars = data.chars(); |
| // GIGO: must have three index strings, else return None |
| let i1 = usize::try_from(u32::from(chars.next()?)).ok()?; |
| let i2 = usize::try_from(u32::from(chars.next()?)).ok()?; |
| let i3 = usize::try_from(u32::from(chars.next()?)).ok()?; |
| let remaining_slice = chars.as_str(); |
| // GIGO: if the indices are wrong, return None |
| match kind { |
| MappingKind::Lower => remaining_slice.get(..i1), |
| MappingKind::Fold => remaining_slice.get(i1..i2), |
| MappingKind::Upper => remaining_slice.get(i2..i3), |
| MappingKind::Title => remaining_slice.get(i3..), |
| } |
| } |
| |
| // convenience function that lets us use the ? operator |
| fn get_all_fullmapping_slots(&self) -> Option<[Cow<'_, str>; 4]> { |
| Some([ |
| self.get_fullmappings_slot_for_kind(MappingKind::Lower)? |
| .into(), |
| self.get_fullmappings_slot_for_kind(MappingKind::Fold)? |
| .into(), |
| self.get_fullmappings_slot_for_kind(MappingKind::Upper)? |
| .into(), |
| self.get_fullmappings_slot_for_kind(MappingKind::Title)? |
| .into(), |
| ]) |
| } |
| |
| // Given a mapping kind, returns the character for that kind, if it exists. Fold falls |
| // back to Lower; Title falls back to Upper. |
| #[inline] |
| pub(crate) fn slot_char_for_kind(&self, kind: MappingKind) -> Option<char> { |
| match kind { |
| MappingKind::Lower | MappingKind::Upper => self.get_char_slot(kind.into()), |
| MappingKind::Fold => self |
| .get_char_slot(ExceptionSlot::Fold) |
| .or_else(|| self.get_char_slot(ExceptionSlot::Lower)), |
| MappingKind::Title => self |
| .get_char_slot(ExceptionSlot::Title) |
| .or_else(|| self.get_char_slot(ExceptionSlot::Upper)), |
| } |
| } |
| |
| pub(crate) fn add_full_and_closure_mappings<S: ClosureSink>(&self, set: &mut S) { |
| if let Some(full) = self.get_fullmappings_slot_for_kind(MappingKind::Fold) { |
| if !full.is_empty() { |
| set.add_string(full); |
| } |
| }; |
| if let Some(closure) = self.get_closure_slot() { |
| for c in closure.chars() { |
| set.add_char(c); |
| } |
| }; |
| } |
| |
| /// Extract all the data out into a structured form |
| /// |
| /// Useful for serialization and debugging |
| pub fn decode(&self) -> DecodedException<'_> { |
| // Potential future optimization: This can |
| // directly access each bit one after the other and iterate the string |
| // which avoids recomputing slot offsets over and over again. |
| // |
| // If we're doing so we may wish to retain this older impl so that we can still roundtrip test |
| let bits = self.bits; |
| let lowercase = self.get_char_slot(ExceptionSlot::Lower); |
| let casefold = self.get_char_slot(ExceptionSlot::Fold); |
| let uppercase = self.get_char_slot(ExceptionSlot::Upper); |
| let titlecase = self.get_char_slot(ExceptionSlot::Title); |
| let simple_case_delta = self.get_simple_case_delta(); |
| let closure = self.get_closure_slot().map(Into::into); |
| let full = self.get_all_fullmapping_slots(); |
| |
| DecodedException { |
| bits: ExceptionBits::from_unaligned(bits), |
| lowercase, |
| casefold, |
| uppercase, |
| titlecase, |
| simple_case_delta, |
| closure, |
| full, |
| } |
| } |
| |
/// Check that this exception is well-formed, i.e. that every slot announced by
/// `slot_presence` can actually be decoded from `data`.
///
/// # Errors
///
/// Returns a static description of the first problem found:
/// - the ICU4C-only double-width-slots flag is set,
/// - a present char, delta or closure slot fails to decode,
/// - the full-mappings slot is too short, has non-sequential or out-of-bounds offsets,
///   or otherwise fails to decode.
#[cfg(any(feature = "serde", feature = "datagen"))]
pub(crate) fn validate(&self) -> Result<(), &'static str> {
    // check that ICU4C specific fields are not set
    if self.bits.double_width_slots() {
        return Err("double-width-slots should not be used in ICU4X");
    }

    // just run all of the slot getters at once and then check
    let decoded = self.decode();

    for (slot, was_decoded) in [
        (ExceptionSlot::Lower, decoded.lowercase.is_some()),
        (ExceptionSlot::Fold, decoded.casefold.is_some()),
        (ExceptionSlot::Upper, decoded.uppercase.is_some()),
        (ExceptionSlot::Title, decoded.titlecase.is_some()),
        (ExceptionSlot::Delta, decoded.simple_case_delta.is_some()),
        (ExceptionSlot::Closure, decoded.closure.is_some()),
    ] {
        if self.has_slot(slot) && !was_decoded {
            // decoding hit GIGO behavior, oops!
            return Err("Slot decoding failed");
        }
    }

    if self.has_slot(ExceptionSlot::FullMappings) {
        // Inspect the offsets *before* consulting `decoded.full`: a successfully decoded
        // slot already implies that the offsets are ordered and in bounds, so checking
        // them only afterwards would make the specific diagnostics below unreachable.
        let data = self
            .get_fullmappings_slot_data()
            .ok_or("fullmappings slot doesn't parse")?;
        let mut chars = data.chars();
        let i1 = u32::from(chars.next().ok_or("fullmappings string too small")?);
        let i2 = u32::from(chars.next().ok_or("fullmappings string too small")?);
        let i3 = u32::from(chars.next().ok_or("fullmappings string too small")?);

        if i2 < i1 || i3 < i2 {
            return Err("fullmappings string contains non-sequential indices");
        }
        let rest = chars.as_str();
        let len = u32::try_from(rest.len()).map_err(|_| "len too large for u32")?;

        if i1 > len || i2 > len || i3 > len {
            return Err("fullmappings string contains out-of-bounds indices");
        }

        if decoded.full.is_none() {
            // Anything the checks above did not catch, e.g. an offset that does not
            // fall on a char boundary
            return Err("Slot decoding failed");
        }
    }

    Ok(())
}
| } |
| |
| impl fmt::Debug for ExceptionULE { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| self.decode().fmt(f) |
| } |
| } |
| |
| /// A decoded [`Exception`] type, with all of the data parsed out into |
| /// separate fields. |
| /// |
| /// <div class="stab unstable"> |
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed |
| /// to be stable, their Rust representation might not be. Use with caution. |
| /// </div> |
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] |
| #[cfg_attr(feature = "datagen", derive(serde::Serialize))] |
| #[derive(Debug, Clone, PartialEq, Eq, Default)] |
| pub struct DecodedException<'a> { |
| /// The various bit-based data associated with this exception |
| pub bits: ExceptionBits, |
| /// Lowercase mapping |
| pub lowercase: Option<char>, |
| /// Case folding |
| pub casefold: Option<char>, |
| /// Uppercase mapping |
| pub uppercase: Option<char>, |
| /// Titlecase mapping |
| pub titlecase: Option<char>, |
| /// The simple casefold delta. Its sign is stored in bits.negative_delta |
| pub simple_case_delta: Option<u32>, |
| /// Closure mappings |
| pub closure: Option<Cow<'a, str>>, |
| /// The four full-mappings strings, indexed by MappingKind u8 value |
| pub full: Option<[Cow<'a, str>; 4]>, |
| } |
| |
| impl DecodedException<'_> { |
| /// Convert to a wire-format encodeable (VarULE-encodeable) [`Exception`] |
| pub fn encode(&self) -> Exception<'static> { |
| let bits = self.bits; |
| let mut slot_presence = SlotPresence(0); |
| let mut data = alloc::string::String::new(); |
| if let Some(lowercase) = self.lowercase { |
| slot_presence.add_slot(ExceptionSlot::Lower); |
| data.push(lowercase) |
| } |
| if let Some(casefold) = self.casefold { |
| slot_presence.add_slot(ExceptionSlot::Fold); |
| data.push(casefold) |
| } |
| if let Some(uppercase) = self.uppercase { |
| slot_presence.add_slot(ExceptionSlot::Upper); |
| data.push(uppercase) |
| } |
| if let Some(titlecase) = self.titlecase { |
| slot_presence.add_slot(ExceptionSlot::Title); |
| data.push(titlecase) |
| } |
| if let Some(mut simple_case_delta) = self.simple_case_delta { |
| slot_presence.add_slot(ExceptionSlot::Delta); |
| |
| if simple_case_delta >= SURROGATES_START { |
| simple_case_delta += SURROGATES_LEN; |
| } |
| let simple_case_delta = char::try_from(simple_case_delta).unwrap_or('\0'); |
| data.push(simple_case_delta) |
| } |
| |
| if let Some(ref closure) = self.closure { |
| slot_presence.add_slot(ExceptionSlot::Closure); |
| if self.full.is_some() { |
| // GIGO: if the closure length is more than 0xD800 this will error. Plenty of space. |
| debug_assert!( |
| closure.len() < 0xD800, |
| "Found overlarge closure value when encoding exception" |
| ); |
| let len_char = u32::try_from(closure.len()) |
| .ok() |
| .and_then(|c| char::try_from(c).ok()) |
| .unwrap_or('\0'); |
| data.push(len_char); |
| } |
| data.push_str(closure); |
| } |
| if let Some(ref full) = self.full { |
| slot_presence.add_slot(ExceptionSlot::FullMappings); |
| let mut idx = 0; |
| // iterate all elements except the last, whose length we can calculate from context |
| for mapping in full.iter().take(3) { |
| idx += mapping.len(); |
| data.push(char::try_from(u32::try_from(idx).unwrap_or(0)).unwrap_or('\0')); |
| } |
| for mapping in full { |
| data.push_str(mapping); |
| } |
| } |
| Exception { |
| bits, |
| slot_presence, |
| data: data.into(), |
| } |
| } |
| |
| // Potential optimization: Write an `EncodeAsVarULE` that |
| // directly produces an ExceptionULE |
| } |
| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `exception` to its VarULE form and checks that decoding it again
    /// yields the original value.
    fn test_roundtrip_once(exception: DecodedException) {
        let encoded = exception.encode();
        let encoded = zerovec::ule::encode_varule_to_box(&encoded);
        let decoded = encoded.decode();
        assert_eq!(decoded, exception);
    }

    #[test]
    fn test_roundtrip() {
        test_roundtrip_once(DecodedException {
            lowercase: Some('ø'),
            ..Default::default()
        });
        test_roundtrip_once(DecodedException {
            titlecase: Some('X'),
            lowercase: Some('ø'),
            ..Default::default()
        });
        test_roundtrip_once(DecodedException {
            titlecase: Some('X'),
            ..Default::default()
        });
        test_roundtrip_once(DecodedException {
            titlecase: Some('X'),
            simple_case_delta: Some(0xE999),
            closure: Some("hello world".into()),
            ..Default::default()
        });
        test_roundtrip_once(DecodedException {
            simple_case_delta: Some(10),
            closure: Some("hello world".into()),
            full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]),
            ..Default::default()
        });
        test_roundtrip_once(DecodedException {
            closure: Some("hello world".into()),
            full: Some(["aa".into(), "ț".into(), "".into(), "å".into()]),
            ..Default::default()
        });
        test_roundtrip_once(DecodedException {
            full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]),
            ..Default::default()
        });
    }

    #[test]
    fn test_roundtrip_delta_around_surrogates() {
        // Deltas on either side of the point where the char-format starts skipping
        // over the surrogates range
        for delta in [0xD7FF, 0xD800, 0xD801] {
            test_roundtrip_once(DecodedException {
                simple_case_delta: Some(delta),
                ..Default::default()
            });
        }
    }
}