blob: 839d914baa9548ee856e7913ad76bcc9894daa05 [file] [log] [blame]
use std::collections::{BTreeMap, HashMap};
use std::ops::Range;
use ucd_parse::Codepoints;
mod case_mapping;
mod raw_emitter;
mod unicode_download;
use raw_emitter::{emit_codepoints, RawEmitter};
/// Names of the properties for which codepoint range tables are collected.
///
/// `load_data` compares these names against the property name of every parsed
/// `CoreProperty` and `Property` row, and against the general category of every
/// `UnicodeData` row. `"N"` is not matched literally against a category:
/// `load_data` substitutes it for the `Nd`, `Nl` and `No` general categories.
static PROPERTIES: &[&str] = &[
"Alphabetic",
"Lowercase",
"Uppercase",
"Cased",
"Case_Ignorable",
"Grapheme_Extend",
"White_Space",
"Cc",
"N",
];
/// The data assembled by `load_data` from the Unicode data files.
struct UnicodeData {
/// `(property name, codepoint ranges)` pairs; `load_data` sorts them by property name.
ranges: Vec<(&'static str, Vec<Range<u32>>)>,
/// Uppercase mappings keyed by source codepoint. Each value holds up to three mapped
/// codepoints; unused trailing slots are stored as 0.
to_upper: BTreeMap<u32, (u32, u32, u32)>,
/// Lowercase mappings keyed by source codepoint. Each value holds up to three mapped
/// codepoints; unused trailing slots are stored as 0.
to_lower: BTreeMap<u32, (u32, u32, u32)>,
}
/// Packs the codepoints of a multi-character case mapping into a fixed 3-tuple.
///
/// Returns `None` as soon as one of `codepoints` is equal to `origin`. Otherwise returns the
/// values of the first, second and third codepoint, with 0 standing in for a missing second
/// or third entry.
///
/// # Panics
///
/// Panics with "more than 3 mapped codepoints" if a fourth codepoint (different from
/// `origin`) is encountered, and panics if `codepoints` is empty.
fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> {
    let mut slots: [Option<u32>; 3] = [None; 3];
    let mut filled = 0;

    for codepoint in codepoints {
        let value = codepoint.value();
        if value == origin {
            return None;
        }
        // Store the value in the next free slot; running out of slots is a hard error.
        match slots.get_mut(filled) {
            Some(slot) => *slot = Some(value),
            None => panic!("more than 3 mapped codepoints"),
        }
        filled += 1;
    }

    let [first, second, third] = slots;
    Some((first.unwrap(), second.unwrap_or(0), third.unwrap_or(0)))
}
/// Directory the Unicode data files are read from: `load_data` hands it to
/// `ucd_parse::parse`, and `version` reads `ReadMe.txt` inside it.
static UNICODE_DIRECTORY: &str = "unicode-downloads";
/// Parses the Unicode data files in `UNICODE_DIRECTORY` and assembles a `UnicodeData`.
///
/// The result contains, for every name in `PROPERTIES` that was encountered, the list of
/// codepoint ranges carrying that property (sorted by property name), plus the lowercase
/// and uppercase mapping tables.
///
/// # Panics
///
/// Panics if any of the `ucd_parse::parse` calls returns an error, and wherever
/// `to_mapping` panics.
fn load_data() -> UnicodeData {
// NOTE(review): presumably makes sure the data files are present in `UNICODE_DIRECTORY`
// before parsing starts -- confirm in the `unicode_download` module.
unicode_download::fetch_latest();
// Property name -> every `Codepoints` entry seen for that property, in parse order.
let mut properties = HashMap::new();
// Collect the rows of the two property files, keeping only the names listed in `PROPERTIES`.
for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
}
}
for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
}
}
let mut to_lower = BTreeMap::new();
let mut to_upper = BTreeMap::new();
// Walk the `UnicodeData` rows, which provide both the general category and the simple
// (single-codepoint) case mappings.
for row in ucd_parse::UnicodeDataExpander::new(
ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
) {
// The three numeric general categories are folded into the single name "N".
let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
"N"
} else {
row.general_category.as_str()
};
if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) {
properties
.entry(*name)
.or_insert_with(Vec::new)
.push(Codepoints::Single(row.codepoint));
}
// Simple mappings are recorded only when they actually change the codepoint; the two
// unused slots of the tuple are filled with 0.
if let Some(mapped) = row.simple_lowercase_mapping {
if mapped != row.codepoint {
to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
}
}
if let Some(mapped) = row.simple_uppercase_mapping {
if mapped != row.codepoint {
to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
}
}
}
// Special case mappings are processed after the simple ones, so an unconditional special
// mapping replaces the simple mapping stored under the same key.
for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
if !row.conditions.is_empty() {
// Skip conditional case mappings
continue;
}
let key = row.codepoint.value();
// `to_mapping` returns `None` when one of the mapped codepoints equals `key`; in that
// case whatever is already stored for `key` is left untouched.
if let Some(lower) = to_mapping(key, row.lowercase) {
to_lower.insert(key, lower);
}
if let Some(upper) = to_mapping(key, row.uppercase) {
to_upper.insert(key, upper);
}
}
// Flatten every `Codepoints` entry into one-element ranges `ch..ch + 1`. Codepoints for
// which `scalar()` returns `None` are dropped (presumably surrogates -- TODO confirm
// against ucd_parse).
let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties
.into_iter()
.map(|(k, v)| {
(
k,
v.into_iter()
.flat_map(|codepoints| match codepoints {
Codepoints::Single(c) => c
.scalar()
.map(|ch| (ch as u32..ch as u32 + 1))
.into_iter()
.collect::<Vec<_>>(),
// A range is expanded codepoint by codepoint; `merge_ranges` below is what
// turns the resulting unit ranges back into contiguous ranges.
Codepoints::Range(c) => c
.into_iter()
.flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1)))
.collect::<Vec<_>>(),
})
.collect::<Vec<Range<u32>>>(),
)
})
.collect();
for ranges in properties.values_mut() {
merge_ranges(ranges);
}
// `HashMap` iteration order is unspecified, so order the properties by name before
// handing them out.
let mut properties = properties.into_iter().collect::<Vec<_>>();
properties.sort_by_key(|p| p.0);
UnicodeData { ranges: properties, to_lower, to_upper }
}
/// Generates the Unicode tables and writes them to the path given as the first command-line
/// argument.
///
/// One module is emitted per entry of `UnicodeData::ranges` (named after the lowercased
/// property), followed by a `conversions` module produced by
/// `case_mapping::generate_case_mapping`. Per-property statistics and the total table size
/// are printed to stdout.
///
/// Prints a usage message to stderr and exits with status 1 when no output path is given;
/// panics if the output file cannot be written.
fn main() {
    let write_location = match std::env::args().nth(1) {
        Some(path) => path,
        None => {
            eprintln!("Must provide path to write unicode tables to");
            eprintln!(
                "e.g. {} src/libcore/unicode/unicode_data.rs",
                std::env::args().next().unwrap_or_default()
            );
            std::process::exit(1);
        }
    };

    let unicode_data = load_data();

    // Emit one table per property, keeping track of how much space they take in total.
    let mut total_bytes = 0;
    let mut modules = Vec::new();
    for (property, ranges) in &unicode_data.ranges {
        let datapoints: u32 = ranges.iter().map(|range| range.end - range.start).sum();
        let mut emitter = RawEmitter::new();
        emit_codepoints(&mut emitter, ranges);
        println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints);
        total_bytes += emitter.bytes_used;
        modules.push((property.to_lowercase(), emitter.file));
    }

    // File header: generated-file notice, the shared import, and the version constant.
    let mut table_file = String::from(
        "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
    );
    table_file += "use super::range_search;\n\n";
    table_file += &version();
    table_file.push('\n');

    modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data)));

    for (name, contents) in modules {
        table_file += "#[rustfmt::skip]\n";
        table_file += &format!("pub mod {} {{\n", name);
        for line in contents.lines() {
            // Blank lines are kept, but without the indentation prefix.
            if !line.trim().is_empty() {
                table_file += " ";
                table_file += line;
            }
            table_file.push('\n');
        }
        table_file += "}\n\n";
    }

    // The file ends with exactly one trailing newline.
    std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap();
    println!("Total table sizes: {} bytes", total_bytes);
}
/// Renders the `UNICODE_VERSION` constant for the generated file.
///
/// The version is read from `ReadMe.txt` inside `UNICODE_DIRECTORY`: the text between
/// `"for Version "` and the next `" of the Unicode Standard."` is split on `.` and its first
/// three components become `(major, minor, micro)`.
///
/// # Panics
///
/// Panics with a descriptive message if the file cannot be read, if either marker is
/// missing, if a component is not a valid `u32`, or if fewer than three components are found.
fn version() -> String {
    let readme_path = std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt");
    let readme = std::fs::read_to_string(&readme_path)
        .unwrap_or_else(|err| panic!("failed to read {}: {}", readme_path.display(), err));

    let prefix = "for Version ";
    let suffix = " of the Unicode Standard.";
    let start = readme
        .find(prefix)
        .unwrap_or_else(|| panic!("{:?} not found in {}", prefix, readme_path.display()))
        + prefix.len();
    // Search for the closing marker only *after* the prefix: an earlier occurrence elsewhere
    // in the file would otherwise give `end < start` and an opaque slice panic.
    let end = start
        + readme[start..]
            .find(suffix)
            .unwrap_or_else(|| panic!("{:?} not found in {}", suffix, readme_path.display()));

    let version_text = &readme[start..end];
    let components = version_text
        .split('.')
        .map(|part| {
            part.parse::<u32>()
                .unwrap_or_else(|err| panic!("invalid version component {:?}: {}", part, err))
        })
        .collect::<Vec<_>>();
    assert!(
        components.len() >= 3,
        "expected a major.minor.micro version, found {:?}",
        version_text
    );

    format!(
        "pub const UNICODE_VERSION: (u32, u32, u32) = ({}, {}, {});\n",
        components[0], components[1], components[2]
    )
}
/// Formats `values` with their `Debug` representation as a comma-separated, line-wrapped list.
///
/// The result starts with a newline, every item is followed by `", "`, each line begins with
/// a single space, trailing whitespace is trimmed from every line, and the result ends with a
/// newline. A new line is started whenever appending the next item would bring the current
/// line to 98 bytes or more.
fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
    let mut rendered = String::new();
    // The first line carries the leading newline of the whole list.
    let mut current = String::from("\n ");

    for value in values {
        let piece = format!("{:?}, ", value);
        if current.len() + piece.len() >= 98 {
            // The line is full: flush it and start the next one with this item.
            rendered.push_str(current.trim_end());
            rendered.push('\n');
            current = format!(" {}", piece);
        } else {
            current.push_str(&piece);
        }
    }

    rendered.push_str(current.trim_end());
    rendered.push('\n');
    rendered
}
/// Coalesces neighbouring entries of `ranges` that touch each other, in place.
///
/// Two consecutive entries are merged when the end of the first equals the start of the
/// second, so `[0..1, 1..2, 5..6]` becomes `[0..2, 5..6]`. Entries are only compared with
/// their direct neighbour in the list; the list is neither sorted nor checked for overlaps.
/// An empty list is left unchanged.
fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
    let mut merged: Vec<Range<u32>> = Vec::with_capacity(ranges.len());
    for range in ranges.drain(..) {
        match merged.last_mut() {
            // The incoming range starts exactly where the previous one stops: extend the
            // previous range instead of storing a new entry. Because the comparison is made
            // against the already-extended range, a single pass merges whole runs.
            Some(prev) if prev.end == range.start => prev.end = range.end,
            _ => merged.push(range),
        }
    }
    *ranges = merged;
}