blob: f1e43efcd038d5b6d0cb3441cdd68d3c2fe35e9b [file] [log] [blame]
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This is the main module pertaining to casemapping exceptions.
//!
//! A single exception is represented by the [`Exception`] type and its ULE equivalent.
//!
//! The storage format is complicated (and documented on [`Exception`]), but the data format is
//! represented equally by [`DecodedException`], which is more human-readable.
use icu_provider::prelude::*;
use super::data::MappingKind;
use super::exception_helpers::{ExceptionBits, ExceptionSlot, SlotPresence};
use crate::set::ClosureSink;
use alloc::borrow::Cow;
use core::fmt;
#[cfg(any(feature = "serde", feature = "datagen"))]
use core::ops::Range;
use core::ptr;
use zerovec::ule::AsULE;
use zerovec::VarZeroVec;
const SURROGATES_START: u32 = 0xD800;
const SURROGATES_LEN: u32 = 0xDFFF - SURROGATES_START + 1;
/// This represents case mapping exceptions that can't be represented as a delta applied to
/// the original code point. The codepoint
/// trie in CaseMapper stores indices into this VarZeroVec.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::exceptions))]
#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
pub struct CaseMapExceptions<'data> {
#[cfg_attr(feature = "serde", serde(borrow))]
/// The list of exceptions
pub exceptions: VarZeroVec<'data, ExceptionULE>,
}
impl CaseMapExceptions<'_> {
/// Obtain the exception at index `idx`. Will
/// return a default value if not present (GIGO behavior),
/// as these indices should come from a paired CaseMapData object
///
/// Will also panic in debug mode
pub fn get(&self, idx: u16) -> &ExceptionULE {
let exception = self.exceptions.get(idx.into());
debug_assert!(exception.is_some());
exception.unwrap_or(ExceptionULE::empty_exception())
}
#[cfg(any(feature = "serde", feature = "datagen"))]
pub(crate) fn validate(&self) -> Result<Range<u16>, &'static str> {
for exception in self.exceptions.iter() {
exception.validate()?;
}
u16::try_from(self.exceptions.len())
.map_err(|_| "Too many exceptions")
.map(|l| 0..l)
}
}
/// A type representing the wire format of `Exception`. The data contained is
/// equivalently represented by [`DecodedException`].
///
/// This type is itself not used that much, most of its relevant methods live
/// on [`ExceptionULE`].
///
/// The `bits` contain supplementary data, whereas
/// `slot_presence` marks te presence of various extra data
/// in the `data` field.
///
/// The `data` field is not validated to contain all of this data,
/// this type will have GIGO behavior when constructed with invalid `data`.
///
/// The format of `data` is documented on the field
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[zerovec::make_varule(ExceptionULE)]
#[derive(PartialEq, Eq, Clone, Default, Debug)]
#[zerovec::skip_derive(Ord)]
#[cfg_attr(
feature = "serde",
derive(serde::Deserialize),
zerovec::derive(Deserialize)
)]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize),
zerovec::derive(Serialize)
)]
pub struct Exception<'a> {
/// The various bit based exception data associated with this.
///
/// Format: Just a u8 of bitflags, some flags unused. See [`ExceptionBits`] and its ULE type for more.
pub bits: ExceptionBits,
/// Which slots are present in `data`.
///
/// Format: a u8 of bitflags
pub slot_presence: SlotPresence,
/// Format : `[char slots] [optional closure length] [ closure slot ] [ full mappings data ]`
///
/// For each set SlotPresence bit, except for the two stringy slots (Closure/FullMapping),
/// this will have one entry in the string, packed together.
///
/// Note that the simple_case delta is stored as a u32 normalized to a `char`, where u32s
/// which are from or beyond the surrogate range 0xD800-0xDFFF are stored as chars
/// starting from 0xE000. The sign is stored in bits.negative_delta.
///
/// If both Closure/FullMapping are present, the next char will be the length of the closure slot,
/// bisecting the rest of the data.
/// If only one is present, the rest of the data represents that slot.
///
/// The closure slot simply represents one string. The full-mappings slot represents four strings,
/// packed in a way similar to VarZeroVec, in the following format:
/// `i1 i2 i3 [ str0 ] [ str1 ] [ str2 ] [ str3 ]`
///
/// where `i1 i2 i3` are the indices of the relevant mappings string. The strings are stored in
/// the order corresponding to the MappingKind enum.
pub data: Cow<'a, str>,
}
impl ExceptionULE {
#[inline]
fn empty_exception() -> &'static Self {
static EMPTY_BYTES: &[u8] = &[0, 0];
// Safety:
// ExceptionULE is a packed DST with `(u8, u8, unsized)` fields. All bit patterns are valid for the two u8s
//
// An "empty" one can be constructed from a slice of two u8s
unsafe {
let slice: *const [u8] = ptr::slice_from_raw_parts(EMPTY_BYTES.as_ptr(), 0);
&*(slice as *const Self)
}
}
pub(crate) fn has_slot(&self, slot: ExceptionSlot) -> bool {
self.slot_presence.has_slot(slot)
}
/// Obtain a `char` slot, if occupied. If `slot` represents a string slot,
/// will return `None`
pub(crate) fn get_char_slot(&self, slot: ExceptionSlot) -> Option<char> {
if slot >= ExceptionSlot::STRING_SLOTS_START {
return None;
}
let bit = 1 << (slot as u8);
// check if slot is occupied
if self.slot_presence.0 & bit == 0 {
return None;
}
let previous_slot_mask = bit - 1;
let previous_slots = self.slot_presence.0 & previous_slot_mask;
let slot_num = previous_slots.count_ones() as usize;
self.data.chars().nth(slot_num)
}
/// Get the `simple_case` delta (i.e. the `delta` slot), given the character
/// this data belongs to.
///
/// Normalizes the delta from char-format to u32 format
///
/// Does *not* handle the sign of the delta; see self.bits.negative_delta
fn get_simple_case_delta(&self) -> Option<u32> {
let delta_ch = self.get_char_slot(ExceptionSlot::Delta)?;
let mut delta = u32::from(delta_ch);
// We "fill in" the surrogates range by offsetting deltas greater than it
if delta >= SURROGATES_START {
delta -= SURROGATES_LEN;
}
Some(delta)
}
/// Get the `simple_case` value (i.e. the `delta` slot), given the character
/// this data belongs to.
///
/// The data is stored as a delta so the character must be provided.
///
/// The data cannot be stored directly as a character because the trie is more
/// compact with adjacent characters sharing deltas.
pub(crate) fn get_simple_case_slot_for(&self, ch: char) -> Option<char> {
let delta = self.get_simple_case_delta()?;
let mut delta = i32::try_from(delta).ok()?;
if self.bits.negative_delta() {
delta = -delta;
}
let new_ch = i32::try_from(u32::from(ch)).ok()? + delta;
char::try_from(u32::try_from(new_ch).ok()?).ok()
}
/// Returns *all* the data in the closure/full slots, including length metadata
fn get_stringy_data(&self) -> Option<&str> {
const CHAR_MASK: u8 = (1 << ExceptionSlot::STRING_SLOTS_START as u8) - 1;
let char_slot_count = (self.slot_presence.0 & CHAR_MASK).count_ones() as usize;
let mut chars = self.data.chars();
for _ in 0..char_slot_count {
let res = chars.next();
res?;
}
Some(chars.as_str())
}
/// Returns a single stringy slot, either ExceptionSlot::Closure
/// or ExceptionSlot::FullMappings.
fn get_stringy_slot(&self, slot: ExceptionSlot) -> Option<&str> {
debug_assert!(slot == ExceptionSlot::Closure || slot == ExceptionSlot::FullMappings);
let other_slot = if slot == ExceptionSlot::Closure {
ExceptionSlot::FullMappings
} else {
ExceptionSlot::Closure
};
if !self.slot_presence.has_slot(slot) {
return None;
}
let stringy_data = self.get_stringy_data()?;
if self.slot_presence.has_slot(other_slot) {
// both stringy slots are used, we need a length
let mut chars = stringy_data.chars();
// GIGO: to have two strings there must be a length, if not present return None
let length_char = chars.next()?;
let length = usize::try_from(u32::from(length_char)).unwrap_or(0);
// The length indexes into the string after the first char
let remaining_slice = chars.as_str();
// GIGO: will return none if there wasn't enough space in this slot
if slot == ExceptionSlot::Closure {
remaining_slice.get(0..length)
} else {
remaining_slice.get(length..)
}
} else {
// only a single stringy slot, there is no length stored
Some(stringy_data)
}
}
/// Get the data behind the `closure` slot
pub(crate) fn get_closure_slot(&self) -> Option<&str> {
self.get_stringy_slot(ExceptionSlot::Closure)
}
/// Get all the slot data for the FullMappings slot
///
/// This needs to be further segmented into four based on length metadata
fn get_fullmappings_slot_data(&self) -> Option<&str> {
self.get_stringy_slot(ExceptionSlot::FullMappings)
}
/// Get a specific FullMappings slot value
pub(crate) fn get_fullmappings_slot_for_kind(&self, kind: MappingKind) -> Option<&str> {
let data = self.get_fullmappings_slot_data()?;
let mut chars = data.chars();
// GIGO: must have three index strings, else return None
let i1 = usize::try_from(u32::from(chars.next()?)).ok()?;
let i2 = usize::try_from(u32::from(chars.next()?)).ok()?;
let i3 = usize::try_from(u32::from(chars.next()?)).ok()?;
let remaining_slice = chars.as_str();
// GIGO: if the indices are wrong, return None
match kind {
MappingKind::Lower => remaining_slice.get(..i1),
MappingKind::Fold => remaining_slice.get(i1..i2),
MappingKind::Upper => remaining_slice.get(i2..i3),
MappingKind::Title => remaining_slice.get(i3..),
}
}
// convenience function that lets us use the ? operator
fn get_all_fullmapping_slots(&self) -> Option<[Cow<'_, str>; 4]> {
Some([
self.get_fullmappings_slot_for_kind(MappingKind::Lower)?
.into(),
self.get_fullmappings_slot_for_kind(MappingKind::Fold)?
.into(),
self.get_fullmappings_slot_for_kind(MappingKind::Upper)?
.into(),
self.get_fullmappings_slot_for_kind(MappingKind::Title)?
.into(),
])
}
// Given a mapping kind, returns the character for that kind, if it exists. Fold falls
// back to Lower; Title falls back to Upper.
#[inline]
pub(crate) fn slot_char_for_kind(&self, kind: MappingKind) -> Option<char> {
match kind {
MappingKind::Lower | MappingKind::Upper => self.get_char_slot(kind.into()),
MappingKind::Fold => self
.get_char_slot(ExceptionSlot::Fold)
.or_else(|| self.get_char_slot(ExceptionSlot::Lower)),
MappingKind::Title => self
.get_char_slot(ExceptionSlot::Title)
.or_else(|| self.get_char_slot(ExceptionSlot::Upper)),
}
}
pub(crate) fn add_full_and_closure_mappings<S: ClosureSink>(&self, set: &mut S) {
if let Some(full) = self.get_fullmappings_slot_for_kind(MappingKind::Fold) {
if !full.is_empty() {
set.add_string(full);
}
};
if let Some(closure) = self.get_closure_slot() {
for c in closure.chars() {
set.add_char(c);
}
};
}
/// Extract all the data out into a structured form
///
/// Useful for serialization and debugging
pub fn decode(&self) -> DecodedException<'_> {
// Potential future optimization: This can
// directly access each bit one after the other and iterate the string
// which avoids recomputing slot offsets over and over again.
//
// If we're doing so we may wish to retain this older impl so that we can still roundtrip test
let bits = self.bits;
let lowercase = self.get_char_slot(ExceptionSlot::Lower);
let casefold = self.get_char_slot(ExceptionSlot::Fold);
let uppercase = self.get_char_slot(ExceptionSlot::Upper);
let titlecase = self.get_char_slot(ExceptionSlot::Title);
let simple_case_delta = self.get_simple_case_delta();
let closure = self.get_closure_slot().map(Into::into);
let full = self.get_all_fullmapping_slots();
DecodedException {
bits: ExceptionBits::from_unaligned(bits),
lowercase,
casefold,
uppercase,
titlecase,
simple_case_delta,
closure,
full,
}
}
#[cfg(any(feature = "serde", feature = "datagen"))]
pub(crate) fn validate(&self) -> Result<(), &'static str> {
// check that ICU4C specific fields are not set
// check that there is enough space for all the offsets
if self.bits.double_width_slots() {
return Err("double-width-slots should not be used in ICU4C");
}
// just run all of the slot getters at once and then check
let decoded = self.decode();
for (slot, decoded_slot) in [
(ExceptionSlot::Lower, &decoded.lowercase),
(ExceptionSlot::Fold, &decoded.casefold),
(ExceptionSlot::Upper, &decoded.uppercase),
(ExceptionSlot::Title, &decoded.titlecase),
] {
if self.has_slot(slot) && decoded_slot.is_none() {
// decoding hit GIGO behavior, oops!
return Err("Slot decoding failed");
}
}
if self.has_slot(ExceptionSlot::Delta) && decoded.simple_case_delta.is_none() {
// decoding hit GIGO behavior, oops!
return Err("Slot decoding failed");
}
if self.has_slot(ExceptionSlot::Closure) && decoded.closure.is_none() {
return Err("Slot decoding failed");
}
if self.has_slot(ExceptionSlot::FullMappings) {
if decoded.full.is_some() {
let data = self
.get_fullmappings_slot_data()
.ok_or("fullmappings slot doesn't parse")?;
let mut chars = data.chars();
let i1 = u32::from(chars.next().ok_or("fullmappings string too small")?);
let i2 = u32::from(chars.next().ok_or("fullmappings string too small")?);
let i3 = u32::from(chars.next().ok_or("fullmappings string too small")?);
if i2 < i1 || i3 < i2 {
return Err("fullmappings string contains non-sequential indices");
}
let rest = chars.as_str();
let len = u32::try_from(rest.len()).map_err(|_| "len too large for u32")?;
if i1 > len || i2 > len || i3 > len {
return Err("fullmappings string contains out-of-bounds indices");
}
} else {
return Err("Slot decoding failed");
}
}
Ok(())
}
}
impl fmt::Debug for ExceptionULE {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.decode().fmt(f)
}
}
/// A decoded [`Exception`] type, with all of the data parsed out into
/// separate fields.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct DecodedException<'a> {
/// The various bit-based data associated with this exception
pub bits: ExceptionBits,
/// Lowercase mapping
pub lowercase: Option<char>,
/// Case folding
pub casefold: Option<char>,
/// Uppercase mapping
pub uppercase: Option<char>,
/// Titlecase mapping
pub titlecase: Option<char>,
/// The simple casefold delta. Its sign is stored in bits.negative_delta
pub simple_case_delta: Option<u32>,
/// Closure mappings
pub closure: Option<Cow<'a, str>>,
/// The four full-mappings strings, indexed by MappingKind u8 value
pub full: Option<[Cow<'a, str>; 4]>,
}
impl DecodedException<'_> {
/// Convert to a wire-format encodeable (VarULE-encodeable) [`Exception`]
pub fn encode(&self) -> Exception<'static> {
let bits = self.bits;
let mut slot_presence = SlotPresence(0);
let mut data = alloc::string::String::new();
if let Some(lowercase) = self.lowercase {
slot_presence.add_slot(ExceptionSlot::Lower);
data.push(lowercase)
}
if let Some(casefold) = self.casefold {
slot_presence.add_slot(ExceptionSlot::Fold);
data.push(casefold)
}
if let Some(uppercase) = self.uppercase {
slot_presence.add_slot(ExceptionSlot::Upper);
data.push(uppercase)
}
if let Some(titlecase) = self.titlecase {
slot_presence.add_slot(ExceptionSlot::Title);
data.push(titlecase)
}
if let Some(mut simple_case_delta) = self.simple_case_delta {
slot_presence.add_slot(ExceptionSlot::Delta);
if simple_case_delta >= SURROGATES_START {
simple_case_delta += SURROGATES_LEN;
}
let simple_case_delta = char::try_from(simple_case_delta).unwrap_or('\0');
data.push(simple_case_delta)
}
if let Some(ref closure) = self.closure {
slot_presence.add_slot(ExceptionSlot::Closure);
if self.full.is_some() {
// GIGO: if the closure length is more than 0xD800 this will error. Plenty of space.
debug_assert!(
closure.len() < 0xD800,
"Found overlarge closure value when encoding exception"
);
let len_char = u32::try_from(closure.len())
.ok()
.and_then(|c| char::try_from(c).ok())
.unwrap_or('\0');
data.push(len_char);
}
data.push_str(closure);
}
if let Some(ref full) = self.full {
slot_presence.add_slot(ExceptionSlot::FullMappings);
let mut idx = 0;
// iterate all elements except the last, whose length we can calculate from context
for mapping in full.iter().take(3) {
idx += mapping.len();
data.push(char::try_from(u32::try_from(idx).unwrap_or(0)).unwrap_or('\0'));
}
for mapping in full {
data.push_str(mapping);
}
}
Exception {
bits,
slot_presence,
data: data.into(),
}
}
// Potential optimization: Write an `EncodeAsVarULE` that
// directly produces an ExceptionULE
}
#[cfg(test)]
mod tests {
use super::*;
fn test_roundtrip_once(exception: DecodedException) {
let encoded = exception.encode();
let encoded = zerovec::ule::encode_varule_to_box(&encoded);
let decoded = encoded.decode();
assert_eq!(decoded, exception);
}
#[test]
fn test_roundtrip() {
test_roundtrip_once(DecodedException {
lowercase: Some('ø'),
..Default::default()
});
test_roundtrip_once(DecodedException {
titlecase: Some('X'),
lowercase: Some('ø'),
..Default::default()
});
test_roundtrip_once(DecodedException {
titlecase: Some('X'),
..Default::default()
});
test_roundtrip_once(DecodedException {
titlecase: Some('X'),
simple_case_delta: Some(0xE999),
closure: Some("hello world".into()),
..Default::default()
});
test_roundtrip_once(DecodedException {
simple_case_delta: Some(10),
closure: Some("hello world".into()),
full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]),
..Default::default()
});
test_roundtrip_once(DecodedException {
closure: Some("hello world".into()),
full: Some(["aa".into(), "È›".into(), "".into(), "Ã¥".into()]),
..Default::default()
});
test_roundtrip_once(DecodedException {
full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]),
..Default::default()
});
}
}