// blob: 52984e557437515b3addf90a73b17ee52a90c008 [file] [log] [blame]
package org.unicode.cldr.util;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.test.DisplayAndInputProcessor;
import org.unicode.cldr.tool.GenerateSubdivisions;
import org.unicode.cldr.util.CLDRFile.NumberingSystem;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.ChainedMap.M4;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.Validity.Status;
import com.google.common.base.Splitter;
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.Row.R3;
import com.ibm.icu.impl.Row.R4;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.ULocale;
public final class WikiSubdivisionLanguages {
/** When true, main() writes each generated file to System.out instead of to CLDRPaths.SUBDIVISIONS_DIRECTORY. */
private static final boolean DEBUG_CONSOLE = false;
/** When non-null, the loader keeps only TSV rows whose language id equals this value. */
private static final String DEBUG_LANG_FILTER = null; // "az";
/** Leading part of a subdivision-name path, up to and including the opening quote of the type attribute value. */
private static final String BEFORE_TYPE = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"";
/** Shared configuration; main() obtains its CLDR file factory from it. */
private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
/** Normalizer applied to every translation read from the TSV file. */
private static final Normalizer2 NFC = Normalizer2.getNFCInstance();
/**
 * Columns of a line in wikiSubdivisionLanguages.tsv, in file order.
 * The loader indexes the split line with {@code ordinal()}, so the order of these
 * constants must match the column order of the file.
 */
enum Items {
// Sample line (tab-separated):
// http://www.wikidata.org/entity/Q24260 كانيلو AD-02 ar
wid,
translation,
subdivisionId,
languageId
}
/** Subdivision code, then language id, mapped to the name; filled by the static initializer. */
private static ChainedMap.M3<String, String, String> SUB_LANG_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(), String.class);
/** Language id, then subdivision code, mapped to the name; same data as SUB_LANG_NAME with the key order swapped. */
private static ChainedMap.M3<String, String, String> LANG_SUB_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(), String.class);
/** Subdivision codes found in the TSV file for which Validity reports no status at all. */
private static Set<String> bogus = new TreeSet<>();
/** Subdivision codes found in the TSV file that have a status other than regular, grouped by that status. */
private static Multimap<Status, String> bogusStatus = TreeMultimap.create();
/**
 * Looks up the Wikidata-derived name of a subdivision in one language.
 *
 * @param subdivisionId subdivision code, in the form stored by the loader
 * @param languageId language id as it appears in the TSV file
 * @return the stored name, or whatever the underlying map returns for a missing entry
 */
public static String getSubdivisionName(String subdivisionId, String languageId) {
    // The table is keyed language first, then subdivision.
    final String storedName = LANG_SUB_NAME.get(languageId, subdivisionId);
    return storedName;
}
/**
 * Returns the most English-like name available for a subdivision.
 * Tries English, then Spanish, then French; if none of those exists, falls back to
 * the first name stored for the subdivision in any language.
 *
 * @param subdivisionId subdivision code to look up
 * @return the chosen name, or null when no name is stored for the subdivision
 */
public static String getBestWikiEnglishName(String subdivisionId) {
    // Preferred languages, in priority order.
    final String[] preferredLanguages = {"en", "es", "fr"};
    for (String languageId : preferredLanguages) {
        String candidate = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, languageId);
        if (candidate != null) {
            return candidate;
        }
    }
    // None of the preferred languages has a name: take the first one in map order.
    Map<String, String> namesByLanguage = WikiSubdivisionLanguages.SUB_LANG_NAME.get(subdivisionId);
    if (namesByLanguage == null) {
        return null;
    }
    Entry<String, String> firstEntry = namesByLanguage.entrySet().iterator().next();
    return firstEntry.getValue();
}
//static Map<String, String> WIKIDATA_TO_MID = new TreeMap<>();
// Class initializer. Phase 1 loads wikiSubdivisionLanguages.tsv into SUB_LANG_NAME and
// LANG_SUB_NAME, recording rejected subdivision codes in bogus / bogusStatus.
// Phase 2 passes every loaded name through DisplayAndInputProcessor and stores the
// processed form back into both maps when it differs.
static {
// Fields are tab-separated; each field is trimmed.
Splitter TAB = Splitter.on('\t').trimResults();
// This File is used only to print a canonical path for diagnostics; the data is read
// through FileUtilities.in(...) below.
File file = new File("data/external", "wikiSubdivisionLanguages.tsv");
try {
System.out.println(file.getCanonicalFile());
} catch (IOException e) {
e.printStackTrace();
}
Set<String> regularSubdivisions = Validity.getInstance().getStatusToCodes(LstrType.subdivision).get(Status.regular);
Map<String, Status> codeToStatus = Validity.getInstance().getCodeToStatus(LstrType.subdivision);
// NOTE(review): presumably resolves the path relative to this class — confirm against FileUtilities.in.
for (String line : FileUtilities.in(WikiSubdivisionLanguages.class, "data/external/wikiSubdivisionLanguages.tsv")) {
// Column positions come from the declaration order of Items.
List<String> data = TAB.splitToList(line);
String subdivision = GenerateSubdivisions.convertToCldr(data.get(Items.subdivisionId.ordinal()));
// Only subdivisions with status "regular" are kept; everything else is recorded and skipped.
if (!regularSubdivisions.contains(subdivision)) {
Status status = codeToStatus.get(subdivision);
if (status == null) {
bogus.add(subdivision);
} else {
bogusStatus.put(status, subdivision);
}
continue;
}
String lang = data.get(Items.languageId.ordinal());
// Optional debugging filter: keep a single language only.
if (DEBUG_LANG_FILTER != null && !DEBUG_LANG_FILTER.equals(lang)) {
continue;
}
String name = NFC.normalize(data.get(Items.translation.ordinal()));
// Store the name under both key orders.
SUB_LANG_NAME.put(subdivision, lang, name);
// WIKIDATA_TO_MID.put(subdivision, data.get(2));
LANG_SUB_NAME.put(lang, subdivision, name);
}
// postprocess
String oldLang = null;
DisplayAndInputProcessor daip = null;
// Slot 0 is checked after each processInput call; a non-null value is treated as a failure.
Exception[] internalException = {null};
// NOTE(review): the loop body puts into LANG_SUB_NAME while iterating rows(); presumably rows()
// returns a snapshot — confirm against ChainedMap.
for (R3<String, String, String> row : LANG_SUB_NAME.rows()) {
String lang = row.get0();
String subdivision = row.get1();
String name = row.get2();
// A new processor is created each time the language changes from the previous row.
// NOTE(review): presumably rows arrive grouped by language (the map is keyed by language first) — confirm.
if (!lang.equals(oldLang)) {
oldLang = lang;
daip = new DisplayAndInputProcessor(new ULocale(lang));
}
String path = getSubdivisionPath(subdivision);
// Soft hyphens (U+00AD) are stripped before processing.
String name2 = daip.processInput(
path,
name.replace("\u00AD", ""),
internalException);
// TODO remove soft hyphen in DAIP
if (internalException[0] != null) {
throw new IllegalArgumentException(lang + "\t" + subdivision + "\t" + name, internalException[0]);
} else if (!name.equals(name2)) {
// The comparison is against the original name, so a name that differs only by a
// removed soft hyphen is also replaced here.
//System.out.println(lang + "\t" + subdivision + "\t" + name + "\t" + name2);
SUB_LANG_NAME.put(subdivision, lang, name2);
LANG_SUB_NAME.put(lang, subdivision, name2);
}
}
}
/**
 * Builds the path used for a subdivision's name: the common prefix, the subdivision
 * code as the type attribute value, and a draft attribute fixed to "contributed".
 *
 * @param subdivision subdivision code to embed in the path
 * @return the full path string
 */
private static String getSubdivisionPath(String subdivision) {
    final String afterType = "\"][@draft=\"contributed\"]";
    return BEFORE_TYPE + subdivision + afterType;
}
/**
 * Extracts the subdivision code from a path: the text that starts right after the
 * BEFORE_TYPE prefix length and ends before the next double quote.
 * The path is not checked to actually start with BEFORE_TYPE.
 *
 * @param path a subdivision-name path
 * @return the value of the type attribute
 */
private static String getSubdivisionFromPath(String path) {
    final int valueStart = BEFORE_TYPE.length();
    final int valueEnd = path.indexOf('"', valueStart);
    return path.substring(valueStart, valueEnd);
}
/**
 * Merges the loaded Wikidata names into the per-language subdivision files.
 *
 * <p>For every language in LANG_SUB_NAME this:
 * <ol>
 * <li>loads the language's CLDR file (for exemplars) and its existing subdivision file,
 *     substituting an empty file when the existing one cannot be made;</li>
 * <li>adds each Wikidata name whose path has no value yet and whose characters are all
 *     covered by the language's exemplars (plus the whole script of the main exemplars);</li>
 * <li>disambiguates names shared by several subdivisions of the same region by appending
 *     a superscript marker to every occurrence after the first;</li>
 * <li>writes the result to CLDRPaths.SUBDIVISIONS_DIRECTORY, or to System.out when
 *     DEBUG_CONSOLE is set.</li>
 * </ol>
 * Afterwards it prints exemplar failures, per-language counts of added names, and the
 * rejected subdivision codes.
 *
 * @param args unused
 * @throws ICUUncheckedIOException if an output file cannot be written
 * @throws IndexOutOfBoundsException if more than three subdivisions of one region share a name
 */
public static void main(String[] args) {
    Counter<String> counter = new Counter<>();
    Factory cldrFactory = CLDR_CONFIG.getCldrFactory();
    Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");
    ChainedMap.M4<Integer, String, String, String> exemplarFailureLangSubdivisionName = ChainedMap.of(
        new TreeMap<Integer, Object>(),
        new TreeMap<String, Object>(),
        new TreeMap<String, Object>(),
        String.class);

    for (Entry<String, Map<String, String>> entry : LANG_SUB_NAME) {
        String lang = entry.getKey();
        CLDRFile file = cldrFactory.make(lang, true);
        CLDRFile oldFileSubdivisions;
        try {
            oldFileSubdivisions = cldrFactorySubdivisions.make(lang, false);
        } catch (Exception e) {
            // No usable subdivision file for this language yet: start from an empty one.
            oldFileSubdivisions = new CLDRFile(new SimpleXMLSource(lang)).freeze();
        }

        // For fixing collisions: name -> paths carrying that name.
        // Existing items go in first so that they come first in iteration order.
        Multimap<String, String> inverse = LinkedHashMultimap.create();
        for (String path : oldFileSubdivisions) {
            String name = oldFileSubdivisions.getStringValue(path);
            if (name == null) {
                // A path without a value has no name that could collide.
                continue;
            }
            inverse.put(name, path);
        }

        CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();

        UnicodeSet main = file.getExemplarSet("", WinningChoice.WINNING, 0);
        UnicodeSet auxiliary = file.getExemplarSet("auxiliary", WinningChoice.WINNING);
        UnicodeSet punctuation = file.getExemplarSet("punctuation", WinningChoice.WINNING);
        UnicodeSet numbers = file.getExemplarsNumeric(NumberingSystem.defaultSystem);
        UnicodeSet exemplars = new UnicodeSet()
            .addAll(main)
            .addAll(auxiliary)
            .addAll(scriptsFor(main)) // broad test,...
            .addAll(punctuation)
            .addAll(numbers)
            .addAll(new UnicodeSet("[\\ ]")).freeze();

        for (Entry<String, String> entry2 : entry.getValue().entrySet()) {
            String subdivision = entry2.getKey();
            String name = entry2.getValue();
            String path = getSubdivisionPath(subdivision);
            if (fileSubdivisions.getStringValue(path) != null) {
                // An existing translation always wins over the Wikidata one.
                continue;
            }
            if (!exemplars.containsAll(name)) {
                // Record every offending character, then skip the name.
                UnicodeSet exemplarFailures = new UnicodeSet().addAll(name).removeAll(exemplars);
                addExemplarFailures(exemplarFailureLangSubdivisionName, exemplarFailures, lang, subdivision, name);
                continue;
            }
            fileSubdivisions.add(path, name);
            inverse.put(name, path);
            counter.add(lang, 1);
        }

        // We now fix collisions
        for (Entry<String, Collection<String>> entry3 : inverse.asMap().entrySet()) {
            String name = entry3.getKey();
            if (name.isEmpty()) {
                continue;
            }
            Collection<String> paths = entry3.getValue();
            if (paths.size() <= 1) {
                continue;
            }
            // we only care about collisions *within* a region.
            // so group them together
            Multimap<String, String> inverse2 = LinkedHashMultimap.create();
            for (String path : paths) {
                String region = getSubdivisionFromPath(path).substring(0, 2).toUpperCase(Locale.ROOT);
                inverse2.put(region, path);
            }
            // Now fix as necessary
            for (Entry<String, Collection<String>> entry4 : inverse2.asMap().entrySet()) {
                Collection<String> paths2 = entry4.getValue();
                if (paths2.size() <= 1) {
                    continue;
                }
                // The first path keeps the plain name; later ones get MARKERS.get(1), get(2), ...
                int markerIndex = 0;
                for (String path : paths2) {
                    if (markerIndex != 0) {
                        String fixedName = name + MARKERS.get(markerIndex);
                        fail("Superscripting ", lang, getSubdivisionFromPath(path), fixedName, -1);
                        fileSubdivisions.add(path, fixedName); // overwrite with superscripted
                    }
                    ++markerIndex;
                }
            }
        }

        if (DEBUG_CONSOLE) {
            // Write through the same writer that is flushed; System.out is deliberately not closed.
            PrintWriter pw = new PrintWriter(System.out);
            fileSubdivisions.write(pw);
            pw.flush();
        } else {
            try (PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.SUBDIVISIONS_DIRECTORY, lang + ".xml")) {
                fileSubdivisions.write(out);
            } catch (Exception e) {
                throw new ICUUncheckedIOException(e);
            }
        }
    }

    fail("ExemplarFailures", exemplarFailureLangSubdivisionName);
    // NOTE(review): counter holds the number of names added per language, yet the line is
    // titled "Superscripting" and the count is passed in the subdivision column — confirm intent.
    for (String lang : counter.getKeysetSortedByKey()) {
        fail("Superscripting", lang, String.valueOf(counter.get(lang)), null, -1);
    }
    System.out.println("Bogus subdivisionIds:\t" + "*" + "\t" + bogus.size() + "\t" + bogus);
    for (Entry<Status, Collection<String>> entry : bogusStatus.asMap().entrySet()) {
        System.out.println("SubdivisionId:\t\t"
            + ":\t" + entry.getKey() + "\t" + entry.getValue().size() + "\t" + entry.getValue());
    }
}
/**
 * Records a rejected name once for each element of the set of characters that caused
 * the rejection, keyed by the first code point of that element.
 *
 * @param exemplarFailureLangSubdivisionName accumulator: code point, language, subdivision, name
 * @param exemplarFailures characters of the name that are outside the allowed exemplars
 * @param language language id of the rejected name
 * @param subdivision subdivision code of the rejected name
 * @param name the rejected name
 */
private static void addExemplarFailures(M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures, String language, String subdivision, String name) {
    for (String failure : exemplarFailures) {
        final int firstCodePoint = failure.codePointAt(0);
        exemplarFailureLangSubdivisionName.put(firstCodePoint, language, subdivision, name);
    }
}
/**
 * Prints one report line for every row of the exemplar-failure table.
 *
 * @param title label printed at the start of each line
 * @param exemplarFailureLangSubdivisionName rows of code point, language, subdivision, name
 */
private static void fail(String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName) {
    for (R4<Integer, String, String, String> row : exemplarFailureLangSubdivisionName.rows()) {
        final Integer codePoint = row.get0();
        final String lang = row.get1();
        final String subdivision = row.get2();
        final String name = row.get3();
        fail(title, lang, subdivision, name, codePoint);
    }
}
/**
 * Prints a single tab-separated report line to System.out.
 * Columns: title followed by a colon, language, subdivision, the offending character in
 * guillemets, its code point as U+hex, the best English-like Wikidata name of the
 * subdivision, and the name with double quotes replaced by {@code &quot;}.
 * The two character columns are empty when {@code exemplarFailure} is negative.
 *
 * @param title label for the line
 * @param lang language id
 * @param subdivision subdivision code
 * @param name name to report; null is printed as an empty column
 * @param exemplarFailure offending code point, or a negative value for none
 */
private static void fail(String title, String lang, String subdivision, String name, int exemplarFailure) {
    final boolean hasCodePoint = exemplarFailure >= 0;
    final String shownCharacter = hasCodePoint ? "«" + UTF16.valueOf(exemplarFailure) + "»" : "";
    final String shownHex = hasCodePoint ? "U+" + Utility.hex(exemplarFailure) : "";
    final String englishName = CldrUtility.ifNull(getBestWikiEnglishName(subdivision), "");
    final String escapedName = CldrUtility.ifNull(name, "").replace("\"", "&quot;");

    StringBuilder line = new StringBuilder();
    line.append(title).append(":\t").append(lang);
    line.append("\t").append(subdivision);
    line.append("\t").append(shownCharacter);
    line.append("\t").append(shownHex);
    line.append("\t").append(englishName);
    line.append("\t").append(escapedName);
    System.out.println(line.toString());
}
/**
 * Superscript suffixes used by main() to tell apart identical names within one region.
 * main() indexes this list by the position of the colliding path and leaves position 0
 * unmarked, so the first entry is never appended; a fourth collision indexes past the end.
 */
static final List<String> MARKERS = Arrays.asList("¹","²","³"); // if there are more than 3 of the same kind, throw exception
/**
 * Returns all code points of the script used by the given exemplar set.
 * The script is taken from the first element of {@code main} whose first code point is
 * in neither the Common nor the Inherited script. For Latin, a few extra characters
 * (ʻ ’ &amp;) are added as well.
 *
 * @param main exemplar set to inspect
 * @return the script's code points, or {@code UnicodeSet.EMPTY} when no element of
 *         {@code main} has an explicit script
 */
private static UnicodeSet scriptsFor(UnicodeSet main) {
    for (String s : main) {
        int scriptCode = UScript.getScript(s.codePointAt(0));
        // Both tests must hold. The original joined them with ||, which is always true,
        // so a leading Common/Inherited character decided the script.
        if (scriptCode != UScript.COMMON && scriptCode != UScript.INHERITED) {
            UnicodeSet result = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, scriptCode);
            if (scriptCode == UScript.LATIN) {
                result.addAll("ʻ’&");
            }
            return result;
        }
    }
    return UnicodeSet.EMPTY;
}
}