blob: d92208f12ed425b453d2ed7b8c26bc0ea6d6bead [file] [log] [blame]
package org.unicode.cldr.tool;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.draft.ScriptMetadata;
import org.unicode.cldr.draft.ScriptMetadata.Info;
import org.unicode.cldr.util.Builder;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRLocale;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Containment;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.Iso639Data;
import org.unicode.cldr.util.Iso639Data.Scope;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.Log;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.SimpleFactory;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type;
import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.Row;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.impl.Row.R3;
import com.ibm.icu.impl.Row.R4;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
/**
* Problems:
* "und_Hani", "zh_Hani"
* "und_Sinh", "si_Sinh"
*
* @author markdavis
*
*/
public class GenerateMaximalLocales {
// Placeholder region code used while the table is being built. main() later drops every key that
// contains it and rewrites it to UNKNOWN_REGION inside values.
private static final String TEMP_UNKNOWN_REGION = "XZ";
// NOTE(review): not referenced in the visible part of this file; presumably a key to trace while debugging.
private static final String DEBUG_ADD_KEY = "und_Latn_ZA";
// Enables the verbose "Data:", "Retaining" and help-text output; read from the
// "GenerateMaximalLocalesDebug" property (false is passed as the second getProperty argument).
private static final boolean SHOW_ADD = CldrUtility.getProperty("GenerateMaximalLocalesDebug", false);
// Read from the "GenerateMaximalLocalesSuppress" property; its consumer is outside the visible part of the file.
private static final boolean SUPPRESS_CHANGES = CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false);
// Compile-time debug switch; its consumer is outside the visible part of the file.
private static final boolean SHOW_CONTAINERS = false;
/**
 * Output flavors this tool can be asked for (selected through the "OutputStyle" property).
 * The C flavors switch the field separator to a line separator, and C_ALT additionally
 * switches the subtag separator to "-".
 */
enum OutputStyle {
    PLAINTEXT,
    C,
    C_ALT,
    XML
}
// Selected output flavor: the "OutputStyle" property value, upper-cased and resolved to an enum constant.
// "XML" is passed for both remaining getProperty arguments (presumably the defaults -- confirm in CldrUtility).
private static OutputStyle OUTPUT_STYLE = OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML")
.toUpperCase());
// set based on above
// Field separator: a line separator for the two C flavors, a tab otherwise.
private static final String SEPARATOR = OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT ? CldrUtility.LINE_SEPARATOR
: "\t";
// Subtag separator: "-" for C_ALT, "_" for every other flavor.
private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? "-" : "_";
// private static final boolean FAVOR_REGION = true; // OUTPUT_STYLE == OutputStyle.C_ALT;
// NOTE(review): not referenced in the visible part of this file.
private static final boolean tryDifferent = true;
// Directories the locale factory below is built over: main, seed and exemplars.
private static final File list[] = {
new File(CLDRPaths.MAIN_DIRECTORY),
new File(CLDRPaths.SEED_DIRECTORY),
new File(CLDRPaths.EXEMPLARS_DIRECTORY) };
// Factory over the directories above, created with the filter ".*".
private static Factory factory = SimpleFactory.make(list, ".*");
// Supplemental data (population, official status, ...) loaded from the supplemental directory.
private static SupplementalDataInfo supplementalData = SupplementalDataInfo
.getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY);
// Source of the available language/territory codes and their registry data.
private static StandardCodes standardCodes = StandardCodes.make();
// The "en" file from the factory above; used to look up English display names for the reports.
private static CLDRFile english = factory.make("en", false);
/**
 * For every available CLDR locale whose region is a container (non-empty and not a leaf
 * according to {@link Containment#isLeaf}), maps that region to the locale's language.
 * Frozen once populated.
 */
static Relation<String, String> cldrContainerToLanguages = Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
static {
    for (CLDRLocale cldrLocale : ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) {
        final String regionCode = cldrLocale.getCountry();
        final boolean isContainer = regionCode != null
            && !regionCode.isEmpty()
            && !Containment.isLeaf(regionCode);
        if (isContainer) {
            cldrContainerToLanguages.put(regionCode, cldrLocale.getLanguage());
        }
    }
    cldrContainerToLanguages.freeze();
    System.out.println("Keep containers " + cldrContainerToLanguages);
}
// NOTE(review): consumer is outside the visible part of this file; presumably keys that must survive minimization.
private static final List<String> KEEP_TARGETS = Arrays.asList("und_Arab_PK", "und_Latn_ET");
// NOTE(review): consumer is outside the visible part of this file; by its name, deprecated ISO codes absent from the language subtag registry.
private static final ImmutableSet<String> deprecatedISONotInLST = ImmutableSet.of("scc", "scr");
/**
 * This is the simplest way to override, by supplying the max value.
 * It gets a very low weight, so doesn't override any stronger value:
 * tryDifferentAlgorithm only adds an entry (with order 1.0, whereas population-derived
 * orders are negative) when the language has no data yet.
 * <p>
 * Each entry is a full language_script_region tag. The entries are also used to build
 * FALLBACK_SCRIPTS (language to script). Where a language appears twice (see "rhg"), the first
 * entry is the one added to the population data, while the last one wins in FALLBACK_SCRIPTS.
 */
private static final String[] MAX_ADDITIONS = new String[] {
"bss_Latn_CM",
"gez_Ethi_ET",
"ken_Latn_CM",
"und_Arab_PK",
"wa_Latn_BE",
"fub_Arab_CM",
"fuf_Latn_GN",
"kby_Arab_NE",
"kdh_Arab_TG",
"apd_Arab_TG",
"zlm_Latn_TG",
"cr_Cans_CA",
"hif_Latn_FJ",
"gon_Telu_IN",
"lzz_Latn_TR",
"lif_Deva_NP",
"unx_Beng_IN",
"unr_Beng_IN",
"ttt_Latn_AZ",
"pnt_Grek_GR",
"tly_Latn_AZ",
"tkr_Latn_AZ",
"bsq_Bass_LR",
"ccp_Cakm_BD",
"blt_Tavt_VN",
"rhg_Arab_MM",
"rhg_Rohg_MM",
};
/**
 * Hard overrides, keyed by a partial locale ID and mapping to the full
 * language_script_region value to force for it.
 * <p>
 * These overrides replace the final computed values outright, so they may produce an
 * inconsistent table. It is safer to add to MAX_ADDITIONS instead.
 * If you do add an override here, add both the language and the language+script mappings.
 */
// Many of the overrides below can be removed once the language/pop/country data is updated.
private static final Map<String, String> LANGUAGE_OVERRIDES = CldrUtility.asMap(new String[][] {
{ "eo", "eo_Latn_001" },
{ "eo_Latn", "eo_Latn_001" },
{ "es", "es_Latn_ES" },
{ "es_Latn", "es_Latn_ES" },
{ "ff_BF", "ff_Latn_BF" },
{ "ff_GM", "ff_Latn_GM" },
{ "ff_GH", "ff_Latn_GH" },
{ "ff_GW", "ff_Latn_GW" },
{ "ff_LR", "ff_Latn_LR" },
{ "ff_NE", "ff_Latn_NE" },
{ "ff_NG", "ff_Latn_NG" },
{ "ff_SL", "ff_Latn_SL" },
{ "ff_Adlm", "ff_Adlm_GN" },
{ "ia", "ia_Latn_001" },
{ "ia_Latn", "ia_Latn_001" },
{ "io", "io_Latn_001" },
{ "io_Latn", "io_Latn_001" },
{ "jbo", "jbo_Latn_001" },
{ "jbo_Latn", "jbo_Latn_001" },
{ "ku_Arab", "ku_Arab_IQ" },
{ "lrc", "lrc_Arab_IR" },
{ "lrc_Arab", "lrc_Arab_IR" },
{ "man", "man_Latn_GM" },
{ "man_Latn", "man_Latn_GM" },
{ "mas", "mas_Latn_KE" },
{ "mas_Latn", "mas_Latn_KE" },
{ "mn", "mn_Cyrl_MN" },
{ "mn_Cyrl", "mn_Cyrl_MN" },
{ "mro", "mro_Mroo_BD" },
{ "mro_BD", "mro_Mroo_BD" },
{ "ms_Arab", "ms_Arab_MY" },
{ "pap", "pap_Latn_AW" },
{ "pap_Latn", "pap_Latn_AW" },
{ "prg", "prg_Latn_001" },
{ "prg_Latn", "prg_Latn_001" },
{ "rif", "rif_Tfng_MA" },
{ "rif_Latn", "rif_Latn_MA" },
{ "rif_Tfng", "rif_Tfng_MA" },
{ "rif_MA", "rif_Tfng_MA" },
{ "shi", "shi_Tfng_MA" },
{ "shi_Tfng", "shi_Tfng_MA" },
{ "shi_MA", "shi_Tfng_MA" },
{ "sr_Latn", "sr_Latn_RS" },
{ "ss", "ss_Latn_ZA" },
{ "ss_Latn", "ss_Latn_ZA" },
{ "swc", "swc_Latn_CD" },
{ "ti", "ti_Ethi_ET" },
{ "ti_Ethi", "ti_Ethi_ET" },
{ "und", "en_Latn_US" },
{ "und_Adlm", "ff_Adlm_GN" },
{ "und_Adlm_GN", "ff_Adlm_GN" },
{ "und_Arab", "ar_Arab_EG" },
{ "und_Arab_PK", "ur_Arab_PK" },
{ "und_Bopo", "zh_Bopo_TW" },
{ "und_Deva_FJ", "hif_Deva_FJ" },
{ "und_EZ", "de_Latn_EZ" },
{ "und_Hani", "zh_Hani_CN" },
{ "und_Hani_CN", "zh_Hani_CN" },
{ "und_Kana", "ja_Kana_JP" },
{ "und_Kana_JP", "ja_Kana_JP" },
{ "und_Latn", "en_Latn_US" },
{ "und_Latn_ET", "en_Latn_ET" },
{ "und_Latn_NE", "ha_Latn_NE" },
{ "und_Latn_PH", "fil_Latn_PH" },
{ "und_ML", "bm_Latn_ML" },
{ "und_Latn_ML", "bm_Latn_ML" },
{ "und_MU", "mfe_Latn_MU" },
{ "und_NE", "ha_Latn_NE" },
{ "und_PH", "fil_Latn_PH" },
{ "und_PK", "ur_Arab_PK" },
{ "und_SO", "so_Latn_SO" },
{ "und_SS", "en_Latn_SS" },
{ "und_TK", "tkl_Latn_TK" },
{ "und_UN", "en_Latn_UN" },
{ "vo", "vo_Latn_001" },
{ "vo_Latn", "vo_Latn_001" },
{ "yi", "yi_Hebr_001" },
{ "yi_Hebr", "yi_Hebr_001" },
{ "yue", "yue_Hant_HK" },
{ "yue_Hant", "yue_Hant_HK" },
{ "yue_Hans", "yue_Hans_CN" },
{ "yue_CN", "yue_Hans_CN" },
{ "zh_Hani", "zh_Hani_CN" },
{ "zh_Bopo", "zh_Bopo_TW" },
{ "ccp", "ccp_Cakm_BD" },
{ "ccp_Cakm", "ccp_Cakm_BD" },
{ "und_Cakm", "ccp_Cakm_BD" },
{ "cu_Glag", "cu_Glag_BG" },
{ "sd_Khoj", "sd_Khoj_IN" },
{ "lif_Limb", "lif_Limb_IN" },
{ "grc_Linb", "grc_Linb_GR" },
{ "arc_Nbat", "arc_Nbat_JO" },
{ "arc_Palm", "arc_Palm_SY" },
{ "pal_Phlp", "pal_Phlp_CN" },
{ "en_Shaw", "en_Shaw_GB" },
{ "sd_Sind", "sd_Sind_IN" },
{ "und_Brai", "fr_Brai_FR" }, // hack
{ "und_Hanb", "zh_Hanb_TW" }, // Special script code
{ "zh_Hanb", "zh_Hanb_TW" }, // Special script code
{ "und_Jamo", "ko_Jamo_KR" }, // Special script code
//{"und_Cyrl_PL", "be_Cyrl_PL"},
// {"cr", "cr_Cans_CA"},
// {"hif", "hif_Latn_FJ"},
// {"gon", "gon_Telu_IN"},
// {"lzz", "lzz_Latn_TR"},
// {"lif", "lif_Deva_NP"},
// {"unx", "unx_Beng_IN"},
// {"unr", "unr_Beng_IN"},
// {"ttt", "ttt_Latn_AZ"},
// {"pnt", "pnt_Grek_GR"},
// {"tly", "tly_Latn_AZ"},
// {"tkr", "tkr_Latn_AZ"},
// {"bsq", "bsq_Bass_LR"},
// {"ccp", "ccp_Cakm_BD"},
// {"blt", "blt_Tavt_VN"},
{ "mis_Medf", "mis_Medf_NG" },
});
/**
 * The following supplements the suppress-script. It overrides info from exemplars and the locale info.
 * Each row is { language or language_region, script }; the rows are written into
 * localeToScriptCache after the registry's Suppress-Script values, so they win on conflict.
 */
private static String[][] SpecialScripts = {
{ "zh", "Hans" }, // Hans (not Hani)
{ "yue", "Hant" }, // Hant (not Hani)
{ "chk", "Latn" }, // Chuukese (Micronesia)
{ "fil", "Latn" }, // Filipino (Philippines)
{ "ko", "Kore" }, // Korean
{ "ko_KR", "Kore" }, // Korean (South Korea)
{ "pap", "Latn" }, // Papiamento (Netherlands Antilles)
{ "pau", "Latn" }, // Palauan (Palau)
{ "su", "Latn" }, // Sundanese (Indonesia)
{ "tet", "Latn" }, // Tetum (East Timor)
{ "tk", "Latn" }, // Turkmen (Turkmenistan)
{ "ty", "Latn" }, // Tahitian (French Polynesia)
{ "ja", "Jpan" }, // Special script for japan
{ "und", "Latn" }, // Ultimate fallback
};
/**
 * Locale (language, or language_region) to default script. Seeded from the "Suppress-Script"
 * value of every available language code, then overlaid with the SpecialScripts table, whose
 * rows replace any seeded value for the same key.
 */
private static Map<String, String> localeToScriptCache = new TreeMap<String, String>();
static {
    for (String code : standardCodes.getAvailableCodes("language")) {
        final String suppressed = standardCodes.getLangData("language", code).get("Suppress-Script");
        if (suppressed == null) {
            continue;
        }
        localeToScriptCache.put(code, suppressed);
    }
    for (int i = 0; i < SpecialScripts.length; ++i) {
        final String[] row = SpecialScripts[i];
        localeToScriptCache.put(row[0], row[1]);
    }
}
/**
 * Immutable language-to-script map derived from MAX_ADDITIONS. When a language occurs in
 * more than one addition, the script of the last occurrence is the one retained.
 */
private static Map<String, String> FALLBACK_SCRIPTS;
static {
    final LanguageTagParser parser = new LanguageTagParser();
    final Map<String, String> scriptByLanguage = new TreeMap<>();
    for (int i = 0; i < MAX_ADDITIONS.length; ++i) {
        parser.set(MAX_ADDITIONS[i]);
        final String languageCode = parser.getLanguage();
        scriptByLanguage.put(languageCode, parser.getScript());
    }
    FALLBACK_SCRIPTS = ImmutableMap.copyOf(scriptByLanguage);
}
// Running error tally; main() prints it on the final "ERRORS:" line. The code that increments it is outside the visible part of this file.
private static int errorCount;
/**
 * Tool entry point. Prints the default-language report, builds the maximized (likely subtags)
 * table, minimizes it, strips the temporary unknown-region placeholder, reports differences
 * against the current likely-subtags data, and finally prints the likely subtags and the
 * default-content list.
 *
 * @param args ignored
 * @throws IOException if writing the generated default-content file fails
 */
public static void main(String[] args) throws IOException {
    printDefaultLanguagesAndScripts();

    final Map<String, String> toMaximized = new TreeMap<String, String>();
    tryDifferentAlgorithm(toMaximized);
    minimize(toMaximized);

    // HACK TEMP_UNKNOWN_REGION
    // this is to get around the removal of items with ZZ in minimize.
    // probably cleaner way to do it, but this provides control over just those we want to retain.
    // Collect first and apply afterwards, so the map is never modified while it is iterated.
    final Set<String> placeholderKeys = new TreeSet<>();
    final Map<String, String> repairedValues = new TreeMap<>();
    for (Entry<String, String> mapping : toMaximized.entrySet()) {
        final String source = mapping.getKey();
        if (source.contains(TEMP_UNKNOWN_REGION)) {
            placeholderKeys.add(source);
            continue;
        }
        final String target = mapping.getValue();
        if (target.contains(TEMP_UNKNOWN_REGION)) {
            repairedValues.put(source, target.replace(TEMP_UNKNOWN_REGION, UNKNOWN_REGION));
        }
    }
    for (String placeholderKey : placeholderKeys) {
        toMaximized.remove(placeholderKey);
    }
    toMaximized.putAll(repairedValues);

    Map<String, String> oldLikely = SupplementalDataInfo.getInstance().getLikelySubtags();
    Set<String> changes = compareMapsAndFixNew("*WARNING* Likely Subtags: ", oldLikely, toMaximized, "ms_Arab",
        "ms_Arab_ID");
    System.out.println(CollectionUtilities.join(changes, "\n"));

    if (OUTPUT_STYLE == OutputStyle.C_ALT) {
        doAlt(toMaximized);
    }

    if (SHOW_ADD) {
        // Explanatory block comment, one table entry per output line ("" yields a blank line).
        final String[] helpLines = {
            " To Maximize:",
            " If using raw strings, make sure the input language/locale uses the right separator, and has the right casing.",
            " Remove the script Zzzz and the region ZZ if they occur; change an empty language subtag to 'und'.",
            " Get the language, region, and script from the cleaned-up tag, plus any variants/extensions",
            " Try each of the following in order (where the field exists)",
            " Lookup language-script-region. If in the table, return the result + variants",
            " Lookup language-script. If in the table, return the result (substituting the original region if it exists) + variants",
            " Lookup language-region. If in the table, return the result (substituting the original script if it exists) + variants",
            " Lookup language. If in the table, return the result (substituting the original region and script if either or both exist) + variants",
            "",
            " Example: Input is zh-ZZZZ-SG.",
            " Normalize to zh-SG. Lookup in table. No match.",
            " Remove SG, but remember it. Lookup zh, and get the match (zh-Hans-CN). Substitute SG, and return zh-Hans-SG.",
            "",
            " To Minimize:",
            " First get max = maximize(input).",
            " Then for trial in {language, language-region, language-script}",
            " If maximize(trial) == max, then return trial.",
            " If you don't get a match, return max.",
            "",
            " Example: Input is zh-Hant. Maximize to get zh-Hant-TW.",
            " zh => zh-Hans-CN. No match, so continue.",
            " zh-TW => zh-Hans-TW. Match, so return zh-TW.",
            "",
            " (A variant of this uses {language, language-script, language-region}): that is, tries script before language.",
        };
        final StringBuilder help = new StringBuilder("/*");
        for (String helpLine : helpLines) {
            help.append(CldrUtility.LINE_SEPARATOR).append(helpLine);
        }
        help.append(CldrUtility.LINE_SEPARATOR)
            .append(" toMaximal size:\t")
            .append(toMaximized.size())
            .append(CldrUtility.LINE_SEPARATOR)
            .append("*/");
        System.out.println(help.toString());
    }

    printLikelySubtags(toMaximized);
    // if (OUTPUT_STYLE != OutputStyle.XML) {
    // printMap("const MapToMinimalSubtags default_subtags[]", toMinimized, null);
    // }
    printDefaultContent(toMaximized);
    System.out.println(CldrUtility.LINE_SEPARATOR + "ERRORS:\t" + errorCount + CldrUtility.LINE_SEPARATOR);
}
/**
 * One "reason" a language is included in the report: an official status, a display name
 * (territory name plus code) and a literate population.
 * <p>
 * Natural order puts the most significant reason first: higher official status, then larger
 * population, then name. {@code equals} and {@code hashCode} are consistent with that order.
 */
static class RowData implements Comparable<RowData> {
    OfficialStatus os;
    String name;
    Long pop;

    public RowData(OfficialStatus os, String name, Long pop) {
        this.os = os;
        this.name = name;
        this.pop = pop;
    }

    /** @return the official status recorded for this row */
    public OfficialStatus getStatus() {
        return os;
    }

    /** @return the display name recorded for this row */
    public CharSequence getName() {
        return name;
    }

    /** @return the literate population recorded for this row */
    public Long getLiteratePopulation() {
        return pop;
    }

    /**
     * Orders by status (descending), then population (descending), then name (ascending).
     */
    @Override
    public int compareTo(RowData o) {
        int byStatus = os.compareTo(o.os);
        if (byStatus != 0) {
            return -byStatus;
        }
        // Long.compare instead of subtraction: a difference of two longs can overflow and flip sign.
        int byPopulation = Long.compare(o.pop, pop);
        if (byPopulation != 0) {
            return byPopulation;
        }
        return name.compareTo(o.name);
    }

    /**
     * Equal when the other object is a RowData that compares as 0; any other object
     * (including null) is simply unequal rather than causing an exception.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof RowData)) {
            return false;
        }
        return 0 == compareTo((RowData) o);
    }

    /** Hash over the same three fields that {@link #compareTo} examines. */
    @Override
    public int hashCode() {
        int hash = os.hashCode();
        hash = 31 * hash + pop.hashCode();
        return 31 * hash + name.hashCode();
    }
}
/**
 * Prints a report of the languages that qualify for inclusion, with the reasons for each,
 * followed by name lists of the included and the excluded languages.
 * <p>
 * A language qualifies when at least one of the following holds:
 * <ol>
 * <li>it has at least {@code official_regional} status in some territory;</li>
 * <li>its summed writing population over all territories exceeds {@code minTotalPopulation};</li>
 * <li>in some territory its writing population exceeds both {@code minTerritoryPopulation} and
 * {@code minTerritoryPercent} of that territory's literate population.</li>
 * </ol>
 * "und" is always included.
 */
private static void printDefaultLanguagesAndScripts() {
    // Thresholds actually applied below.
    final int minTotalPopulation = 10000000;
    final int minTerritoryPopulation = 1000000;
    final double minTerritoryPercent = 1.0 / 3;

    Map<String, Set<RowData>> languageToReason = new TreeMap<String, Set<RowData>>();
    Counter<String> languageToLiteratePopulation = new Counter<String>();
    NumberFormat nf = NumberFormat.getIntegerInstance(ULocale.ENGLISH);
    nf.setGroupingUsed(true);
    LanguageTagParser ltp = new LanguageTagParser();
    OfficialStatus minimalStatus = OfficialStatus.official_regional; // OfficialStatus.de_facto_official;

    // Dump every language code that has an English name.
    Map<String, String> languages = new TreeMap<String, String>();
    for (String language : standardCodes.getAvailableCodes("language")) {
        String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language);
        String result = english.getStringValue(path);
        if (result != null) {
            languages.put(language, result);
        }
    }
    for (Entry<String, String> named : languages.entrySet()) {
        System.out.println(named.getKey() + "\t" + named.getValue());
    }

    for (String territory : supplementalData.getTerritoriesWithPopulationData()) {
        PopulationData territoryPop = supplementalData.getPopulationDataForTerritory(territory);
        double territoryPopulation = territoryPop.getLiteratePopulation();
        for (String languageScript : supplementalData.getLanguagesForTerritoryWithPopulationData(territory)) {
            PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(languageScript,
                territory);
            // Reasons are keyed by bare language, so drop any script on the population entry.
            ltp.set(languageScript);
            String language = ltp.getLanguage();
            boolean add = false;
            // #1
            OfficialStatus status = popData.getOfficialStatus();
            if (status.compareTo(minimalStatus) >= 0) {
                add = true;
            }
            long literatePopulation = getWritingPopulation(popData);
            // #2 (accumulate now, decide after the loop)
            languageToLiteratePopulation.add(language, literatePopulation);
            // #3
            if (literatePopulation > minTerritoryPopulation
                && literatePopulation > minTerritoryPercent * territoryPopulation) {
                add = true;
            }
            if (add) {
                add(languageToReason, language, territory, status, literatePopulation);
                // Add the containing regions. The lookup is null-guarded, as in MaxData.add,
                // so a territory without containers is skipped instead of failing.
                Set<String> containers = Containment.leafToContainer(territory);
                if (containers != null) {
                    for (String container : containers) {
                        add(languageToReason, language, container, OfficialStatus.unknown, literatePopulation);
                    }
                }
            }
        }
    }

    // #2, now that we have the data
    for (String language : languageToLiteratePopulation.keySet()) {
        long totalPop = languageToLiteratePopulation.getCount(language);
        if (totalPop > minTotalPopulation) {
            add(languageToReason, language, "001", OfficialStatus.unknown, totalPop);
        }
    }

    // Specials
    add(languageToReason, "und", "001", OfficialStatus.unknown, 0);
    // for (String language : Iso639Data.getAvailable()) {
    // Scope scope = Iso639Data.getScope(language);
    // Type type = Iso639Data.getType(language);
    // if (scope == Scope.Special) {
    // add(languageToReason, language, "001", OfficialStatus.unknown, -1);
    // }
    // }

    // print them
    System.out.println("Detailed - Including:\t" + languageToReason.size());
    for (Entry<String, Set<RowData>> included : languageToReason.entrySet()) {
        String language = included.getKey();
        Set<RowData> reasons = included.getValue();
        // Sets are only ever created together with their first element, so next() is safe;
        // the first element is the strongest reason in RowData order.
        RowData topReason = reasons.iterator().next();
        System.out.append(language)
            .append("\t")
            .append(english.getName(language))
            .append("\t")
            .append(topReason.getStatus().toShortString())
            .append("\t")
            .append(nf.format(languageToLiteratePopulation.getCount(language)));
        for (RowData reason : reasons) {
            String status = reason.getStatus().toShortString();
            System.out.append("\t")
                .append(status)
                .append("-")
                .append(reason.getName())
                .append("-")
                .append(nf.format(reason.getLiteratePopulation()));
        }
        System.out.append("\n");
    }

    // now list them
    Set<String> others = new TreeSet<String>();
    others.addAll(standardCodes.getGoodAvailableCodes("language"));
    others.removeAll(languageToReason.keySet());
    System.out.println("\nIncluded Languages:\t" + languageToReason.keySet().size());
    showLanguages(languageToReason.keySet(), languageToReason);
    System.out.println("\nExcluded Languages:\t" + others.size());
    showLanguages(others, languageToReason);
}
/**
 * Returns the writing population of the given entry, truncated to a long, falling back to
 * its literate population when the writing population is NaN.
 */
private static long getWritingPopulation(PopulationData popData) {
    final double writers = popData.getWritingPopulation();
    if (Double.isNaN(writers)) {
        return (long) popData.getLiteratePopulation();
    }
    return (long) writers;
}
/**
 * Prints the formatted names of the given languages in English collation order, starting a
 * new output line whenever the first character changes and separating names that share a
 * first character with ", ".
 */
private static void showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason) {
    final Set<String> formattedNames = new TreeSet<String>(Collator.getInstance(ULocale.ENGLISH));
    for (String code : others) {
        formattedNames.add(getLanguageName(code, languageToReason));
    }

    char previousInitial = 0;
    for (String formatted : formattedNames) {
        final char initial = formatted.charAt(0);
        if (initial == previousInitial) {
            if (previousInitial != '\u0000') {
                System.out.print(", ");
            }
        } else {
            System.out.println();
        }
        System.out.print(formatted);
        previousInitial = initial;
    }
    System.out.println();
}
/**
 * Formats a language as "English name [code]-status", where status is the short form of the
 * highest official status among the language's recorded reasons ("unknown" when there are
 * none), or "S" when ISO 639 classifies the code as Special.
 */
private static String getLanguageName(String language,
    Map<String, Set<RowData>> languageToReason) {
    OfficialStatus highest = OfficialStatus.unknown;
    final Set<RowData> rows = languageToReason.get(language);
    if (rows != null) {
        for (RowData row : rows) {
            final OfficialStatus candidate = row.getStatus();
            if (candidate.compareTo(highest) > 0) {
                highest = candidate;
            }
        }
    }
    String label = highest.toShortString();
    if (Iso639Data.getScope(language) == Scope.Special) {
        label = "S";
    }
    return english.getName(language) + " [" + language + "]-" + label;
}
/**
 * Records one reason for a language: the territory (shown as "English name [code]"), the
 * official status there and the population. The per-language set is created on first use.
 */
private static void add(Map<String, Set<RowData>> languageToReason, String language,
    String territoryRaw, OfficialStatus status, long population) {
    final String label = english.getName("territory", territoryRaw) + " [" + territoryRaw + "]";
    Set<RowData> rows = languageToReason.get(language);
    if (rows == null) {
        rows = new TreeSet<RowData>();
        languageToReason.put(language, rows);
    }
    rows.add(new RowData(status, label, population));
}
/**
 * Computes the default-content locales from the maximized table and writes a regenerated
 * supplementalMetadata.xml into the generation directory: the existing file is copied
 * through, with its {@code <defaultContent locales="...">} element replaced by the new list.
 * <p>
 * A child locale is default content for its parent when both maximize to the same value;
 * at most one child is chosen per parent.
 *
 * @param toMaximized the likely-subtags table used for maximization
 * @throws IOException if reading the old file or writing the new one fails
 */
private static void printDefaultContent(Map<String, String> toMaximized) throws IOException {
    Set<String> defaultLocaleContent = new TreeSet<String>();

    // go through all the cldr locales, and add default contents
    // now computed from toMaximized
    Set<String> available = factory.getAvailable();
    Relation<String, String> toChildren = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
    LanguageTagParser ltp = new LanguageTagParser();
    Set<String> hasScript = new TreeSet<String>();

    // first get a mapping to children
    for (String locale : available) {
        if (locale.equals("root")) {
            continue;
        }
        if (ltp.set(locale).getVariants().size() != 0) {
            continue;
        }
        String parent = LocaleIDParser.getSimpleParent(locale);
        if (ltp.getScript().length() != 0) {
            hasScript.add(parent);
        }
        if (parent.equals("root")) {
            continue;
        }
        toChildren.put(parent, locale);
    }

    // Suppress script for locales for which we only have one locale in common/main. See ticket #7834.
    Set<String> suppressScriptLocales = new HashSet<String>(Arrays.asList(
        "bm_ML", "en_US", "ha_NG", "iu_CA", "ms_MY", "mn_MN",
        "byn_ER", "ff_SN", "dyo_SN", "kk_KZ", "ku_TR", "ky_KG", "ml_IN", "so_SO", "sw_TZ", "wo_SN", "yo_NG", "dje_NE",
        "blt_VN"));

    // if any have a script, then throw out any that don't have a script (unless they're specifically included.)
    Set<String> toRemove = new TreeSet<String>();
    for (String locale : hasScript) {
        toRemove.clear();
        Set<String> children = toChildren.getAll(locale);
        if (children == null) {
            // hasScript is filled before the "root" parent check above, so it may hold a
            // parent that never received children.
            continue;
        }
        for (String child : children) {
            if (ltp.set(child).getScript().length() == 0 && !suppressScriptLocales.contains(child)) {
                toRemove.add(child);
            }
        }
        if (toRemove.size() != 0) {
            System.out.println("Removing:\t" + locale + "\t" + toRemove + "\tfrom\t" + children);
            toChildren.removeAll(locale, toRemove);
        }
    }

    // we add a child as a default locale if it has the same maximization
    main: for (String locale : toChildren.keySet()) {
        String maximized = maximize(locale, toMaximized);
        if (maximized == null) {
            if (SHOW_ADD) {
                System.out.println("Missing maximized:\t" + locale);
            }
            continue;
        }
        Set<String> children = toChildren.getAll(locale);
        Map<String, String> debugStuff = new TreeMap<String, String>();
        for (String child : children) {
            String maximizedChild = maximize(child, toMaximized);
            if (maximized.equals(maximizedChild)) {
                defaultLocaleContent.add(child);
                continue main; // first matching child wins
            }
            debugStuff.put(child, maximizedChild);
        }
        if (SHOW_ADD) {
            System.out.println("Can't find maximized: " + locale + "=" + maximized
                + "\tin\t" + debugStuff);
        }
    }
    defaultLocaleContent.remove("und_ZZ"); // und_ZZ isn't ever a real locale.
    showDefaultContentDifferencesAndFix(defaultLocaleContent);

    Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalMetadata.xml");
    // The reader and the log are released even when copying fails part-way.
    try (BufferedReader oldFile = FileUtilities.openUTF8Reader(CLDRPaths.SUPPLEMENTAL_DIRECTORY,
        "supplementalMetadata.xml")) {
        // copy everything before the old element
        CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<defaultContent locales=\"\\s*"), Log.getLog(), false);
        String sep = CldrUtility.LINE_SEPARATOR + "\t\t\t";
        String broken = CldrUtility.breakLines(CldrUtility.join(defaultLocaleContent, " "), sep,
            PatternCache.get("(\\S)\\S*").matcher(""), 80);
        Log.println("\t\t<defaultContent locales=\"" + broken + "\"");
        Log.println("\t\t/>");
        CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*/>\\s*(<!--.*)?"), null, true); // skip to matching >
        CldrUtility.copyUpTo(oldFile, null, Log.getLog(), true); // copy the rest
    } finally {
        Log.close();
    }
}
// private static void oldAlgorithm(Map<String,String> toMaximized) {
// Set<String> defaultContentLocales = supplementalData.getDefaultContentLocales();
// LanguageTagParser parser = new LanguageTagParser();
// for (String locale : defaultContentLocales) {
// String parent = parser.getParent(locale);
// toMaximized.put(parent, locale);
// if (SHOW_ADD) System.out.println("Adding:\t" + parent + "\t=>\t" + locale + "\t\tDefaultContent");
// }
//
// for (String[] specialCase : SpecialCases) {
// toMaximized.put(specialCase[0], specialCase[1]);
// if (SHOW_ADD) System.out.println("Adding:\t" + specialCase[0] + "\t=>\t" + specialCase[1] + "\t\tSpecial");
// }
//
// // recurse and close
// closeMapping(toMaximized);
//
// addScript(toMaximized, parser);
//
// closeMapping(toMaximized);
//
// addLanguageScript(toMaximized, parser);
//
// closeMapping(toMaximized);
//
// addLanguageCountry(toMaximized, parser);
//
// closeMapping(toMaximized);
//
// addCountries(toMaximized);
// addScript(toMaximized, parser);
// closeMapping(toMaximized);
// closeUnd(toMaximized);
//
// addDeprecated(toMaximized);
//
// closeMapping(toMaximized);
//
// checkConsistency(toMaximized);
// }
/**
 * Accumulates weighted (language, script, region) observations, indexed by every single
 * component, every pair of components, and every region container.
 */
private static class MaxData {
    // Single-component indexes: key -> (order, remaining two components).
    Relation<String, R3<Double, String, String>> languages = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
    Map<String, Counter<String>> languagesToScripts = new TreeMap<>();
    Map<String, Counter<String>> languagesToRegions = new TreeMap<>();

    Relation<String, R3<Double, String, String>> scripts = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
    Map<String, Counter<String>> scriptsToLanguages = new TreeMap<>();
    Map<String, Counter<String>> scriptsToRegions = new TreeMap<>();

    Relation<String, R3<Double, String, String>> regions = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
    Map<String, Counter<String>> regionsToLanguages = new TreeMap<>();
    Map<String, Counter<String>> regionsToScripts = new TreeMap<>();

    // Container indexes, filled for each container returned for the region.
    Map<String, Counter<R2<String, String>>> containersToLanguage = new TreeMap<>();
    Relation<String, R4<Double, String, String, String>> containersToLangRegion = Relation.of(
        new TreeMap<String, Set<Row.R4<Double, String, String, String>>>(), TreeSet.class);

    // Pair indexes: (a, b) -> (order, third component).
    Relation<R2<String, String>, R2<Double, String>> languageScripts = Relation.of(
        new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
        TreeSet.class);
    Relation<R2<String, String>, R2<Double, String>> scriptRegions = Relation.of(
        new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
        TreeSet.class);
    Relation<R2<String, String>, R2<Double, String>> languageRegions = Relation.of(
        new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
        TreeSet.class);

    /**
     * Add population information. "order" is the negative of the population (makes the first be the highest).
     *
     * @param language language subtag
     * @param script script subtag
     * @param region region subtag
     * @param order sort weight for this observation
     */
    void add(String language, String script, String region, Double order) {
        if (language.equals("cpp")) {
            System.out.println(language + "\t" + script + "\t" + region + "\t" + -order);
        }

        // one entry per single component
        languages.put(language, Row.of(order, script, region));
        scripts.put(script, Row.of(order, language, region));
        regions.put(region, Row.of(order, language, script));

        // one entry per pair of components
        languageScripts.put(Row.of(language, script), Row.of(order, region));
        scriptRegions.put(Row.of(script, region), Row.of(order, language));
        languageRegions.put(Row.of(language, region), Row.of(order, script));

        // one entry per container of the region, when the region has any
        final Set<String> containers = Containment.leafToContainer(region);
        if (containers != null) {
            for (String container : containers) {
                containersToLangRegion.put(container, Row.of(order, language, script, region));
                Counter<R2<String, String>> tally = containersToLanguage.get(container);
                if (tally == null) {
                    tally = new Counter<R2<String, String>>();
                    containersToLanguage.put(container, tally);
                }
                tally.add(Row.of(language, script), order.longValue());
            }
        }

        if (SHOW_ADD) {
            System.out.println("Data:\t" + language + "\t" + script + "\t" + region + "\t" + order);
        }
    }
}
// Lower bound applied to the per-territory minimal literate population computed in tryDifferentAlgorithm.
private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000;
// Fraction of a territory's literate population used as the starting value for that minimum.
private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20;
// Smaller population bar that applies when language_region is itself an available CLDR locale.
private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000;
// Factor applied to the order (negated population) of entries whose official status is unknown.
private static final double UNOFFICIAL_SCALE_DOWN = 0.2;
// Formatters for the debug output; created without an explicit locale argument.
private static NumberFormat percent = NumberFormat.getPercentInstance();
private static NumberFormat number = NumberFormat.getIntegerInstance();
/**
 * Fills {@code toMaximized} with likely-subtag mappings derived from territory/language population data.
 * <p>
 * The method first collects weighted (language, script, region) triples into a {@code MaxData}, then
 * supplements them with {@code MAX_ADDITIONS}, single-script languages from {@code DeriveScripts},
 * an English/Latin default for territories without population data, and language aliases. It then emits
 * one mapping per key shape (L, S, R, container, LS, SR, LR) through {@code add(...)} with
 * {@code Override.REPLACE_EXISTING}, so later loops can overwrite what earlier loops stored for the same
 * key. Finally it adds {@code ScriptMetadata} fallbacks with {@code KEEP_EXISTING} and applies
 * {@code LANGUAGE_OVERRIDES} with {@code REPLACE_EXISTING}.
 *
 * @param toMaximized map from a partial tag to its maximized tag; modified in place
 */
private static void tryDifferentAlgorithm(Map<String, String> toMaximized) {
// we are going to try a different approach.
// first gather counts for maximized values
// Set<Row.R3<String,String,String>,Double> rowsToCounts = new TreeMap();
MaxData maxData = new MaxData();
Set<String> cldrLocales = factory.getAvailable();
// starts as all good territory codes; territories that have population data are removed in the loop below
Set<String> otherTerritories = new TreeSet<String>(standardCodes.getGoodAvailableCodes("territory"));
// process all the information to get the top values for each triple.
// each of the combinations of 1 or 2 components gets to be a key.
for (String region : supplementalData.getTerritoriesWithPopulationData()) {
otherTerritories.remove(region);
PopulationData regionData = supplementalData.getPopulationDataForTerritory(region);
final double literateTerritoryPopulation = regionData.getLiteratePopulation();
// we need any unofficial language to meet a certain absolute size requirement and proportion size
// requirement.
// so the bar is x percent of the population, reset up to y absolute size.
double minimalLiteratePopulation = literateTerritoryPopulation * MIN_UNOFFICIAL_LANGUAGE_PROPORTION;
if (minimalLiteratePopulation < MIN_UNOFFICIAL_LANGUAGE_SIZE) {
minimalLiteratePopulation = MIN_UNOFFICIAL_LANGUAGE_SIZE;
}
for (String writtenLanguage : supplementalData.getLanguagesForTerritoryWithPopulationData(region)) {
PopulationData data = supplementalData.getLanguageAndTerritoryPopulationData(writtenLanguage, region);
final double literatePopulation = getWritingPopulation(data); //data.getLiteratePopulation();
double order = -literatePopulation; // negative so we get the inverse order
if (data.getOfficialStatus() == OfficialStatus.unknown) {
final String locale = writtenLanguage + "_" + region;
// NOTE: all three branches below are currently no-ops because the "continue" in the last branch is
// commented out. Every language with unknown official status is therefore retained and only
// scaled down.
if (literatePopulation >= minimalLiteratePopulation) {
// ok, skip
} else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE && cldrLocales.contains(locale)) {
// ok, skip
} else {
// if (SHOW_ADD)
// System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t"
// + english.getName(locale)
// + "\t-- too small:\t" + number.format(literatePopulation));
// continue;
}
// shrink the (negative) weight so unofficial languages rank behind official ones of similar size
order *= UNOFFICIAL_SCALE_DOWN;
if (SHOW_ADD)
System.out.println("Retaining\t" + writtenLanguage + "\t" + region + "\t"
+ english.getName(locale)
+ "\t" + number.format(literatePopulation)
+ "\t" + percent.format(literatePopulation / literateTerritoryPopulation)
+ (cldrLocales.contains(locale) ? "\tin-CLDR" : ""));
}
// split "language_script" keys; when no script is given, look it up with getScriptForLocale2
String script;
String language = writtenLanguage;
final int pos = writtenLanguage.indexOf('_');
if (pos > 0) {
language = writtenLanguage.substring(0, pos);
script = writtenLanguage.substring(pos + 1);
} else {
script = getScriptForLocale2(language);
}
maxData.add(language, script, region, order);
}
}
// add the entries from MAX_ADDITIONS, but only for languages that have no data yet
LanguageTagParser additionLtp = new LanguageTagParser();
for (String addition : MAX_ADDITIONS) {
additionLtp.set(addition);
String lan = additionLtp.getLanguage();
Set<R3<Double, String, String>> key = maxData.languages.get(lan);
if (key == null) {
maxData.add(lan, additionLtp.getScript(), additionLtp.getRegion(), 1.0);
} else {
// no effect; the local is never read (looks like a breakpoint anchor)
int debug = 0;
}
}
// add languages that DeriveScripts maps to exactly one script, using TEMP_UNKNOWN_REGION as the region
for (Entry<String, Collection<String>> entry : DeriveScripts.getLanguageToScript().asMap().entrySet()) {
String language = entry.getKey();
final Collection<String> values = entry.getValue();
if (values.size() != 1) {
continue; // skip, no either way
}
// NOTE: "old" is never read; the check below uses containsKey instead
Set<R3<Double, String, String>> old = maxData.languages.get(language);
if (!maxData.languages.containsKey(language)) {
maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0);
}
}
// add others, with English default
for (String region : otherTerritories) {
if (region.length() == 3) continue; // FIX ONCE WE ADD REGIONS
maxData.add("en", "Latn", region, 1.0);
}
// get a reverse mapping, so that we can add the aliases
Map<String, R2<List<String>, String>> languageAliases = SupplementalDataInfo.getInstance().getLocaleAliasInfo()
.get("language");
for (Entry<String, R2<List<String>, String>> str : languageAliases.entrySet()) {
String reason = str.getValue().get1();
if ("overlong".equals(reason) || "bibliographic".equals(reason) || "macrolanguage".equals(reason)) {
continue;
}
List<String> replacements = str.getValue().get0();
if (replacements == null) {
continue;
}
String goodLanguage = replacements.get(0);
String badLanguage = str.getKey();
// aliases whose source contains an underscore (more than a bare language code) are not handled here
if (badLanguage.contains("_")) {
continue;
}
if (deprecatedISONotInLST.contains(badLanguage)) {
continue;
}
Set<R3<Double, String, String>> goodLanguageData = maxData.languages.getAll(goodLanguage);
if (goodLanguageData == null) {
continue;
}
// copy the script and region of the first entry recorded for the replacement language
R3<Double, String, String> value = goodLanguageData.iterator().next();
final String script = value.get1();
final String region = value.get2();
maxData.add(badLanguage, script, region, 1.0);
System.out.println("Adding aliases: " + badLanguage + ", " + script + ", " + region + ", " + reason);
}
// now, get the best for each one
// NOTE(review): each loop takes the first element of the collection; presumably that is the best-ranked
// entry given the negative weights above, but the ordering is defined by MaxData (not visible here).
for (String language : maxData.languages.keySet()) {
R3<Double, String, String> value = maxData.languages.getAll(language).iterator().next();
final Comparable<String> script = value.get1();
final Comparable<String> region = value.get2();
add(language, language + "_" + script + "_" + region, toMaximized, "L->SR", Override.REPLACE_EXISTING,
SHOW_ADD);
}
// The L->S and L->R loops use the same key as L->SR above with REPLACE_EXISTING, so the value stored
// for a language after these three loops is the one written last.
for (String language : maxData.languagesToScripts.keySet()) {
String script = maxData.languagesToScripts.get(language).getKeysetSortedByCount(true).iterator().next();
add(language, language + "_" + script, toMaximized, "L->S", Override.REPLACE_EXISTING, SHOW_ADD);
}
for (String language : maxData.languagesToRegions.keySet()) {
String region = maxData.languagesToRegions.get(language).getKeysetSortedByCount(true).iterator().next();
add(language, language + "_" + region, toMaximized, "L->R", Override.REPLACE_EXISTING, SHOW_ADD);
}
// script-keyed mappings ("und_" + script)
for (String script : maxData.scripts.keySet()) {
R3<Double, String, String> value = maxData.scripts.getAll(script).iterator().next();
final Comparable<String> language = value.get1();
final Comparable<String> region = value.get2();
add("und_" + script, language + "_" + script + "_" + region, toMaximized, "S->LR",
Override.REPLACE_EXISTING, SHOW_ADD);
}
for (String script : maxData.scriptsToLanguages.keySet()) {
String language = maxData.scriptsToLanguages.get(script).getKeysetSortedByCount(true).iterator().next();
add("und_" + script, language + "_" + script, toMaximized, "S->L", Override.REPLACE_EXISTING, SHOW_ADD);
}
for (String script : maxData.scriptsToRegions.keySet()) {
String region = maxData.scriptsToRegions.get(script).getKeysetSortedByCount(true).iterator().next();
add("und_" + script, "und_" + script + "_" + region, toMaximized, "S->R", Override.REPLACE_EXISTING,
SHOW_ADD);
}
// region-keyed mappings ("und_" + region)
for (String region : maxData.regions.keySet()) {
R3<Double, String, String> value = maxData.regions.getAll(region).iterator().next();
final Comparable<String> language = value.get1();
final Comparable<String> script = value.get2();
add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R->LS",
Override.REPLACE_EXISTING, SHOW_ADD);
}
for (String region : maxData.regionsToLanguages.keySet()) {
String language = maxData.regionsToLanguages.get(region).getKeysetSortedByCount(true).iterator().next();
add("und_" + region, language + "_" + region, toMaximized, "R->L", Override.REPLACE_EXISTING, SHOW_ADD);
}
for (String region : maxData.regionsToScripts.keySet()) {
String script = maxData.regionsToScripts.get(region).getKeysetSortedByCount(true).iterator().next();
add("und_" + region, "und_" + script + "_" + region, toMaximized, "R->S", Override.REPLACE_EXISTING,
SHOW_ADD);
}
// container-keyed mappings; the container "001" is skipped
for (Entry<String, Counter<R2<String, String>>> containerAndInfo : maxData.containersToLanguage.entrySet()) {
String region = containerAndInfo.getKey();
if (region.equals("001")) {
continue;
}
Counter<R2<String, String>> data = containerAndInfo.getValue();
Set<R2<String, String>> keysetSortedByCount = data.getKeysetSortedByCount(true);
if (SHOW_CONTAINERS) { // debug
System.out.println("Container2L:\t" + region + "\t" + shorten(data.getEntrySetSortedByCount(true, null)));
System.out.println("Container2LR:\t" + region + "\t" + maxData.containersToLangRegion.get(region));
}
R2<String, String> value = keysetSortedByCount.iterator().next(); // will get most negative
final Comparable<String> language = value.get0();
final Comparable<String> script = value.get1();
// fix special cases like es-419, where a locale exists.
// for those cases, what we add as output is the container. Otherwise the region.
Set<String> skipLanguages = cldrContainerToLanguages.get(region);
if (skipLanguages != null
&& skipLanguages.contains(language)) {
add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R*->LS",
Override.REPLACE_EXISTING, SHOW_ADD);
continue;
}
// we now have the best language and script. Find the best region for that
// (the first entry for this container whose language and script both match is used)
for (R4<Double, String, String, String> e : maxData.containersToLangRegion.get(region)) {
final Comparable<String> language2 = e.get1();
final Comparable<String> script2 = e.get2();
if (language2.equals(language) && script2.equals(script)) {
add("und_" + region, language + "_" + script + "_" + e.get3(), toMaximized, "R*->LS",
Override.REPLACE_EXISTING, SHOW_ADD);
break;
}
}
}
// two-component keys: language+script, script+region, language+region
for (R2<String, String> languageScript : maxData.languageScripts.keySet()) {
R2<Double, String> value = maxData.languageScripts.getAll(languageScript).iterator().next();
final Comparable<String> language = languageScript.get0();
final Comparable<String> script = languageScript.get1();
final Comparable<String> region = value.get1();
add(language + "_" + script, language + "_" + script + "_" + region, toMaximized, "LS->R",
Override.REPLACE_EXISTING, SHOW_ADD);
}
for (R2<String, String> scriptRegion : maxData.scriptRegions.keySet()) {
R2<Double, String> value = maxData.scriptRegions.getAll(scriptRegion).iterator().next();
final Comparable<String> script = scriptRegion.get0();
final Comparable<String> region = scriptRegion.get1();
final Comparable<String> language = value.get1();
add("und_" + script + "_" + region, language + "_" + script + "_" + region, toMaximized, "SR->L",
Override.REPLACE_EXISTING, SHOW_ADD);
}
for (R2<String, String> languageRegion : maxData.languageRegions.keySet()) {
R2<Double, String> value = maxData.languageRegions.getAll(languageRegion).iterator().next();
final Comparable<String> language = languageRegion.get0();
final Comparable<String> region = languageRegion.get1();
final Comparable<String> script = value.get1();
add(language + "_" + region, language + "_" + script + "_" + region, toMaximized, "LR->S",
Override.REPLACE_EXISTING, SHOW_ADD);
}
// get the script info from metadata as fallback
// (KEEP_EXISTING: these only fill keys that have no mapping yet)
TreeSet<String> sorted = new TreeSet<String>(ScriptMetadata.getScripts());
for (String script : sorted) {
Info i = ScriptMetadata.getInfo(script);
String likelyLanguage = i.likelyLanguage;
String originCountry = i.originCountry;
final String result = likelyLanguage + "_" + script + "_" + originCountry;
add("und_" + script, result, toMaximized, "S->LR•",
Override.KEEP_EXISTING, SHOW_ADD);
add(likelyLanguage, result, toMaximized, "L->SR•",
Override.KEEP_EXISTING, SHOW_ADD);
}
// add overrides
// (applied last with REPLACE_EXISTING and always logged, so they win over everything above)
for (String key : LANGUAGE_OVERRIDES.keySet()) {
add(key, LANGUAGE_OVERRIDES.get(key), toMaximized, "OVERRIDE", Override.REPLACE_EXISTING, true);
}
}
/**
 * Converts {@code data} to its string form and abbreviates it for debug output.
 * Strings longer than 255 characters are cut to their first 127 characters followed by an ellipsis;
 * shorter strings are returned unchanged.
 *
 * @param data object to render; must not be null
 * @return the (possibly abbreviated) string form of {@code data}
 */
public static String shorten(Object data) {
    final String text = data.toString();
    return text.length() <= 255 ? text : text.substring(0, 127) + "…";
}
/**
 * Replaces the contents of {@code toMaximized} with the same mappings after passing every key and
 * every value through {@code toAlt(..., true)}.
 *
 * @param toMaximized map to convert; modified in place
 */
private static void doAlt(Map<String, String> toMaximized) {
    // Build the converted mappings in a separate sorted map first, so the input is not mutated
    // while it is being iterated.
    Map<String, String> converted = new TreeMap<String, String>();
    for (Entry<String, String> mapping : toMaximized.entrySet()) {
        final String altSource = toAlt(mapping.getKey(), true);
        final String altTarget = toAlt(mapping.getValue(), true);
        converted.put(altSource, altTarget);
    }
    toMaximized.clear();
    toMaximized.putAll(converted);
}
/**
 * Maximizes a language tag using the supplied likely-subtags table.
 * <p>
 * The tag is first normalized: an empty language becomes {@code "und"}, and a script equal to
 * {@code UNKNOWN_SCRIPT} or a region equal to {@code UNKNOWN_REGION} is removed. Lookups are then tried
 * in this order: the whole tag, the tag without its region, the tag without its script, and the tag
 * without script and region. When a shortened lookup succeeds, the subtags that were removed for the
 * lookup are written back over the result.
 *
 * @param languageTag tag to maximize, in the form accepted by {@code LanguageTagParser.set}
 * @param toMaximized table mapping partial tags to maximized tags
 * @return the maximized tag; the normalized input itself if no lookup matched but it already has a
 *         non-"und" language, a script and a region; otherwise {@code null}
 */
public static String maximize(String languageTag, Map<String, String> toMaximized) {
    LanguageTagParser ltp = new LanguageTagParser();

    // clean up the input by removing Zzzz, ZZ, and changing "" into und.
    ltp.set(languageTag);
    String language = ltp.getLanguage();
    String region = ltp.getRegion();
    String script = ltp.getScript();
    boolean changed = false;
    if (language.equals("")) {
        ltp.setLanguage(language = "und");
        changed = true;
    }
    // Fix: the script (not the region) must be compared with the unknown-script code; the previous
    // comparison against the region could never match, so "Zzzz" was never stripped.
    if (script.equals(UNKNOWN_SCRIPT)) {
        ltp.setScript(script = "");
        changed = true;
    }
    if (region.equals(UNKNOWN_REGION)) {
        ltp.setRegion(region = "");
        changed = true;
    }
    if (changed) {
        languageTag = ltp.toString();
    }

    // check whole
    String result = toMaximized.get(languageTag);
    if (result != null) {
        return result;
    }

    // try empty region
    if (region.length() != 0) {
        result = toMaximized.get(ltp.setRegion("").toString());
        if (result != null) {
            return ltp.set(result).setRegion(region).toString();
        }
        ltp.setRegion(region); // restore
    }

    // try empty script
    if (script.length() != 0) {
        result = toMaximized.get(ltp.setScript("").toString());
        if (result != null) {
            return ltp.set(result).setScript(script).toString();
        }
        // try empty script and region
        if (region.length() != 0) {
            result = toMaximized.get(ltp.setRegion("").toString());
            if (result != null) {
                return ltp.set(result).setScript(script).setRegion(region).toString();
            }
        }
    }

    if (!language.equals("und") && script.length() != 0 && region.length() != 0) {
        return languageTag; // it was ok, and we couldn't do anything with it
    }
    return null; // couldn't maximize
}
/**
 * Finds the shortest tag that maximizes to the same result as {@code input}.
 * <p>
 * The input is maximized, then three candidates are tried in order: the language alone, the language
 * plus one subtag, and the language plus the other subtag. {@code favorRegion} selects whether the
 * region (true) or the script (false) is tried first. The first candidate whose maximization equals
 * the maximized input is returned.
 *
 * @param input tag to minimize
 * @param toMaximized table mapping partial tags to maximized tags
 * @param favorRegion true to try language+region before language+script
 * @return the first matching candidate, the maximized tag if none matches, or {@code null} if the
 *         input cannot be maximized
 */
public static String minimize(String input, Map<String, String> toMaximized, boolean favorRegion) {
    if (input.equals("nb_Latn_SJ")) {
        System.out.print(""); // debug
    }
    final String full = maximize(input, toMaximized);
    if (full == null) {
        return null; // failed
    }
    LanguageTagParser parsed = new LanguageTagParser().set(full);
    final String lang = parsed.getLanguage();
    final String rgn = parsed.getRegion();
    final String scr = parsed.getScript();

    // candidates go from shortest to longest; simplicity is preferred over speed here
    final String firstExtra = favorRegion ? rgn : scr;
    final String secondExtra = favorRegion ? scr : rgn;
    final String[] candidates = {
        lang,
        lang + TAG_SEPARATOR + firstExtra,
        lang + TAG_SEPARATOR + secondExtra
    };
    for (int i = 0; i < candidates.length; ++i) {
        final String remaximized = maximize(candidates[i], toMaximized);
        if (full.equals(remaximized)) {
            return candidates[i];
        }
    }
    return full;
}
// /**
// * Verify that we can map from each language, script, and country to something.
// * @param toMaximized
// */
// private static void checkConsistency(Map<String, String> toMaximized) {
// Map<String,String> needMappings = new TreeMap();
// LanguageTagParser parser = new LanguageTagParser();
// for (String maximized : new TreeSet<String>(toMaximized.values())) {
// parser.set(maximized);
// final String language = parser.getLanguage();
// final String script = parser.getScript();
// final String region = parser.getRegion();
// if (language.length() == 0 || script.length() == 0 || region.length() == 0) {
// failure(" { \"" + maximized + "\", \"" + maximized + "\" }, // " + english.getName(maximized) +
// "\t\tFailed-Consistency");
// continue;
// }
// addIfNotIn(language, maximized, needMappings, toMaximized, "Consistency");
// addIfNotIn(language + "_" + script, maximized, needMappings, toMaximized, "Consistency");
// addIfNotIn(language + "_" + region, maximized, needMappings, toMaximized, "Consistency");
// addIfNotIn("und_" + script, maximized, needMappings, toMaximized, "Consistency");
// addIfNotIn("und_" + script + "_" + region, maximized, needMappings, toMaximized, "Consistency");
// addIfNotIn("und_" + region, maximized, needMappings, toMaximized, "Consistency");
// }
// toMaximized.putAll(needMappings);
// }
// private static void failure(String string) {
// System.out.println(string);
// errorCount++;
// }
// private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Map<String, String>
// otherToCheck, String kind) {
// addIfNotIn(key, value, toAdd, otherToCheck == null ? null : otherToCheck.keySet(), null, kind);
// }
// private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Set<String> skipKey,
// Set<String> skipValue, String kind) {
// if (!key.equals(value)
// && !toAdd.containsKey(key)
// && (skipKey == null || !skipKey.contains(key))
// && (skipValue == null || !skipValue.contains(value))) {
// add(key, value, toAdd, kind);
// }
// }
/**
 * Policy used by {@code add(...)} when the key already has a different value.
 */
enum Override {
    /** Leave the value already stored for the key untouched. */
    KEEP_EXISTING,
    /** Overwrite the value already stored for the key. */
    REPLACE_EXISTING
}
/**
 * Stores {@code key -> value} in {@code toAdd}, honoring the override policy.
 * <p>
 * A key with no current value is always stored. A key that already has a value is left alone when the
 * policy is {@code KEEP_EXISTING} or when the new value equals the old one; otherwise the value is
 * replaced.
 *
 * @param key source tag
 * @param value target tag
 * @param toAdd map to update
 * @param kind label appended to the diagnostic output
 * @param override what to do when the key already has a different value
 * @param showAction whether to print "Adding"/"Replacing" diagnostics to standard output
 */
private static void add(String key, String value, Map<String, String> toAdd, String kind, Override override,
    boolean showAction) {
    if (key.equals(DEBUG_ADD_KEY)) {
        System.out.println("*debug*");
    }
    final String previous = toAdd.get(key);
    if (previous != null) {
        if (override == Override.KEEP_EXISTING || value.equals(previous)) {
            return; // nothing to change
        }
        if (showAction) {
            System.out.println("Replacing:\t" + getName(key) + "\t=>\t" + getName(value) + "\t, was\t" + getName(previous) + "\t\t" + kind);
        }
    } else if (showAction) {
        System.out.println("Adding:\t\t" + getName(key) + "\t=>\t" + getName(value) + "\t\t\t\t" + kind);
    }
    toAdd.put(key, value);
}
/**
 * Returns the display form of a tag for diagnostics by delegating to
 * {@code ConvertLanguageData.getLanguageCodeAndName}.
 *
 * @param value tag to describe
 * @return whatever {@code ConvertLanguageData.getLanguageCodeAndName} returns for {@code value}
 */
private static String getName(String value) {
    final String codeAndName = ConvertLanguageData.getLanguageCodeAndName(value);
    return codeAndName;
}
// private static void addCountries(Map<String, String> toMaximized) {
// Map <String, Map<String, Double>> scriptToLanguageToSize = new TreeMap();
//
// for (String territory : supplementalData.getTerritoriesWithPopulationData()) {
// Set<String> languages = supplementalData.getLanguagesForTerritoryWithPopulationData(territory);
// String biggestOfficial = null;
// double biggest = -1;
// for (String language : languages) {
// PopulationData info = supplementalData.getLanguageAndTerritoryPopulationData(language, territory);
// // add to info about script
//
// String script = getScriptForLocale(language);
// if (script != null) {
// Map<String, Double> languageInfo = scriptToLanguageToSize.get(script);
// if (languageInfo == null) scriptToLanguageToSize.put(script, languageInfo = new TreeMap());
// String baseLanguage = language;
// int pos = baseLanguage.indexOf('_');
// if (pos >= 0) {
// baseLanguage = baseLanguage.substring(0,pos);
// }
// Double size = languageInfo.get(baseLanguage);
// languageInfo.put(baseLanguage, (size == null ? 0 : size) + info.getLiteratePopulation());
// }
//
//
// final OfficialStatus officialStatus = info.getOfficialStatus();
// if (officialStatus == OfficialStatus.de_facto_official || officialStatus == OfficialStatus.official) {
// double size2 = info.getLiteratePopulation();
// if (biggest < size2) {
// biggest = size2;
// biggestOfficial = language;
// }
// }
// }
// if (biggestOfficial != null) {
// final String replacementTag = "und_" + territory;
// String maximized = biggestOfficial + "_" + territory;
// toMaximized.put(replacementTag, maximized);
// if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tLanguage-Territory");
// }
// }
//
// for (String script : scriptToLanguageToSize.keySet()) {
// String biggestOfficial = null;
// double biggest = -1;
//
// final Map<String, Double> languageToSize = scriptToLanguageToSize.get(script);
// for (String language : languageToSize.keySet()) {
// double size = languageToSize.get(language);
// if (biggest < size) {
// biggest = size;
// biggestOfficial = language;
// }
// }
// if (biggestOfficial != null) {
// final String replacementTag = "und_" + script;
// String maximized = biggestOfficial + "_" + script;
// toMaximized.put(replacementTag, maximized);
// if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tUnd-Script");
// }
// }
// }
// private static void closeUnd(Map<String, String> toMaximized) {
// Map<String,String> toAdd = new TreeMap<String,String>();
// for (String oldSource : toMaximized.keySet()) {
// String maximized = toMaximized.get(oldSource);
// if (!maximized.startsWith("und")) {
// int pos = maximized.indexOf("_");
// if (pos >= 0) {
// addIfNotIn( "und" + maximized.substring(pos), maximized, toAdd, toMaximized, "CloseUnd");
// }
// }
// }
// toMaximized.putAll(toAdd);
// }
/**
* Generate tags where the deprecated values map to the expanded values
*
* @param toMaximized
*/
// private static void addDeprecated(Map<String, String> toMaximized) {
// Map<String, Map<String, List<String>>> typeToTagToReplacement = supplementalData.getLocaleAliasInfo();
// LanguageTagParser temp = new LanguageTagParser();
// LanguageTagParser tagParsed = new LanguageTagParser();
// LanguageTagParser replacementParsed = new LanguageTagParser();
// Map<String,String> toAdd = new TreeMap<String,String>();
// while (true) {
// toAdd.clear();
// for (String type : typeToTagToReplacement.keySet()) {
// if (type.equals("variant") || type.equals("zone")) continue;
// boolean addUnd = !type.equals("language");
//
// Map<String, List<String>> tagToReplacement = typeToTagToReplacement.get(type);
// System.out.println("*" + type + " = " + tagToReplacement);
//
// for (String tag: tagToReplacement.keySet()) {
//
// final List<String> list = tagToReplacement.get(tag);
// if (list == null) continue; // we don't have any information
// String replacement = list.get(0);
//
// // only do multiples
// if (tag.contains("_") || !replacement.contains("_")) {
// continue;
// }
//
// // we now have a tag and a replacement value
// // make parsers that we can use
// try {
// tagParsed.set(addUnd ? "und-" + tag : tag);
// replacementParsed.set(addUnd ? "und-" + replacement : replacement);
// } catch (RuntimeException e) {
// continue;
// }
// addIfNotIn(tag, replacement, toAdd, toMaximized,"Deprecated");
//
// for (String locale : toMaximized.keySet()) {
// String maximized = toMaximized.get(locale);
// addIfMatches(temp.set(locale), maximized, replacementParsed, tagParsed, toAdd, toMaximized);
// addIfMatches(temp.set(maximized), maximized, replacementParsed, tagParsed, toAdd, toMaximized);
// }
// }
// }
// if (toAdd.size() == 0) {
// break;
// }
// toMaximized.putAll(toAdd);
// }
// }
// private static void addIfMatches(LanguageTagParser locale, String maximized, LanguageTagParser tagParsed,
// LanguageTagParser replacementParsed, Map<String, String> toAdd, Map<String, String> toMaximized) {
// if (!tagParsed.getLanguage().equals(locale.getLanguage()) && !tagParsed.getLanguage().equals("und")) {
// return;
// }
// if (!tagParsed.getScript().equals(locale.getScript()) && !tagParsed.getScript().equals("")) {
// return;
// }
// if (!tagParsed.getRegion().equals(locale.getRegion()) && !tagParsed.getRegion().equals("")) {
// return;
// }
// if (!replacementParsed.getLanguage().equals("und")) {
// locale.setLanguage(replacementParsed.getLanguage());
// }
// if (!replacementParsed.getScript().equals("")) {
// locale.setScript(replacementParsed.getScript());
// }
// if (!replacementParsed.getRegion().equals("")) {
// locale.setRegion(replacementParsed.getRegion());
// }
// addIfNotIn(locale.toString(), maximized, toAdd, toMaximized,"Deprecated");
// }
// private static int getSubtagPosition(String locale, String subtags) {
// int pos = -1;
// while (true) {
// pos = locale.indexOf(subtags, pos + 1);
// if (pos < 0) return -1;
// // make sure boundaries are ok
// if (pos != 0) {
// char charBefore = locale.charAt(pos-1);
// if (charBefore != '_' && charBefore != '_') return -1;
// }
// int limit = pos + subtags.length();
// if (limit != locale.length()) {
// char charAfter = locale.charAt(limit);
// if (charAfter != '_' && charAfter != '_') return -1;
// }
// return pos;
// }
// }
/*
* Format
* const DefaultSubtags default_subtags[] = {
* {
* // Afar => Afar (Latin, Ethiopia)
* "aa",
* "aa_Latn_ET"
* },{
* // Afrikaans => Afrikaans (Latin, South Africa)
* "af",
* "af_Latn_ZA"
* },{
*/
/**
 * Writes the likely-subtags table to {@code supplemental/likelySubtags} (".xml" or ".txt", depending on
 * {@code OUTPUT_STYLE}) under {@code CLDRPaths.GEN_DIRECTORY}.
 * <p>
 * Entries are written in {@code LocaleStringComparator} order. In XML style each entry is a
 * {@code <likelySubtag>} element followed by a comment with the readable names; otherwise each entry is
 * a C-style initializer block with the names in a line comment.
 *
 * @param fluffup map from source tag to maximized tag
 * @throws IOException if the output file cannot be opened
 */
private static void printLikelySubtags(Map<String, String> fluffup) throws IOException {
    PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY,
        "/supplemental/likelySubtags" + (OUTPUT_STYLE == OutputStyle.XML ? ".xml" : ".txt"));
    // Fix: close the writer even when formatting an entry throws, so the file handle is not leaked.
    try {
        String spacing = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t" : " ";
        String header = OUTPUT_STYLE != OutputStyle.XML ? "const MapToMaximalSubtags default_subtags[] = {"
            : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + CldrUtility.LINE_SEPARATOR
                + "<!DOCTYPE supplementalData SYSTEM \"../../common/dtd/ldmlSupplemental.dtd\">"
                + CldrUtility.LINE_SEPARATOR
                + "<!--"
                + CldrUtility.LINE_SEPARATOR
                + CldrUtility.getCopyrightString()
                + CldrUtility.LINE_SEPARATOR
                + "-->"
                + CldrUtility.LINE_SEPARATOR
                + "<!--"
                + CldrUtility.LINE_SEPARATOR
                + "Likely subtags data is generated programatically from CLDR's language/territory/population" + CldrUtility.LINE_SEPARATOR
                + "data using the GenerateMaximalLocales tool. Under normal circumstances, this file should" + CldrUtility.LINE_SEPARATOR
                + "not be patched by hand, as any changes made in that fashion may be lost."
                + CldrUtility.LINE_SEPARATOR
                + "-->"
                + CldrUtility.LINE_SEPARATOR
                + "<supplementalData>" + CldrUtility.LINE_SEPARATOR
                // the keyword is split across two literals so the literal "$Revision$" text does not
                // appear in this source file
                + " <version number=\"$" +
                "Revision$\"/>" + CldrUtility.LINE_SEPARATOR
                + " <likelySubtags>";
        String footer = OUTPUT_STYLE != OutputStyle.XML ? SEPARATOR + "};"
            : " </likelySubtags>" + CldrUtility.LINE_SEPARATOR
                + "</supplementalData>";
        out.println(header);
        boolean first = true;
        Set<String> keys = new TreeSet<String>(new LocaleStringComparator());
        keys.addAll(fluffup.keySet());
        for (String printingLocale : keys) {
            String printingTarget = fluffup.get(printingLocale);
            String comment = printingName(printingLocale, spacing) + spacing + "=>" + spacing
                + printingName(printingTarget, spacing);
            if (OUTPUT_STYLE == OutputStyle.XML) {
                out.println("\t\t<likelySubtag from=\"" + printingLocale +
                    "\" to=\"" + printingTarget + "\"" +
                    "/>" + CldrUtility.LINE_SEPARATOR + "\t\t" + "<!--" + comment + "-->");
            } else {
                // entries after the first are preceded by a comma
                if (first) {
                    first = false;
                } else {
                    out.print(",");
                }
                // long comments are split over two comment lines when entries are line-separated
                if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) {
                    comment = printingName(printingLocale, spacing) + SEPARATOR + " // " + spacing + "=>" + spacing
                        + printingName(printingTarget, spacing);
                }
                out.print(
                    " {"
                        + SEPARATOR + " // " + comment
                        + SEPARATOR + " \"" + printingLocale + "\","
                        + SEPARATOR + " \"" + printingTarget + "\""
                        + CldrUtility.LINE_SEPARATOR + " }");
            }
        }
        out.println(footer);
    } finally {
        out.close();
    }
}
/**
 * Builds a readable "{ language; script; region }" description of a locale for output comments.
 * The language is shown as "?" when it is "und"; the script and region are shown as "?" when they
 * are null or empty. Names are looked up in the {@code english} file.
 *
 * @param locale tag to describe; may be null
 * @param spacing separator text placed between the parts
 * @return the description, or {@code null} when {@code locale} is null
 */
public static String printingName(String locale, String spacing) {
    if (locale == null) {
        return null;
    }
    LanguageTagParser tag = new LanguageTagParser().set(locale);
    final String lang = tag.getLanguage();
    final String script = tag.getScript();
    final String region = tag.getRegion();

    // look the names up in language, script, region order
    final String langName = lang.equals("und")
        ? "?"
        : english.getName(CLDRFile.LANGUAGE_NAME, lang);
    final String scriptName = (script == null || script.equals(""))
        ? "?"
        : english.getName(CLDRFile.SCRIPT_NAME, script);
    final String regionName = (region == null || region.equals(""))
        ? "?"
        : english.getName(CLDRFile.TERRITORY_NAME, region);

    return "{" + spacing + langName + ";" + spacing + scriptName + ";" + spacing + regionName + spacing + "}";
}
/**
 * Language subtag swaps applied by {@link #toAlt}: each row is {from, to}.
 */
private static final String[][] ALT_REVERSAL = {
    { "nb", "no" },
    { "no", "nb" },
    { "he", "iw" },
    { "iw", "he" },
};

/**
 * Converts a locale to its alternate form: the first subtag is swapped according to
 * {@code ALT_REVERSAL} (if listed there) and every underscore is replaced by a hyphen.
 *
 * @param locale underscore-separated locale; may be null
 * @param change when false, the locale is returned untouched
 * @return the converted locale, or {@code locale} itself when {@code change} is false or
 *         {@code locale} is null
 */
public static String toAlt(String locale, boolean change) {
    if (!change || locale == null) {
        return locale;
    }
    String firstTag = getFirstTag(locale);
    for (String[] pair : ALT_REVERSAL) {
        if (firstTag.equals(pair[0])) {
            // Fix: drop the matched tag by its own length rather than the replacement's length, so
            // that pairs of unequal length would be substituted correctly.
            locale = pair[1] + locale.substring(firstTag.length());
            break;
        }
    }
    return locale.replace("_", "-");
}

/**
 * Returns the part of {@code locale} before the first underscore, or the whole string if it contains
 * no underscore.
 */
private static String getFirstTag(String locale) {
    int pos = locale.indexOf('_');
    return pos < 0 ? locale : locale.substring(0, pos);
}
// private static Map<String, String> getBackMapping(Map<String, String> fluffup) {
// Relation<String,String> backMap = new Relation(new TreeMap(), TreeSet.class, BEST_LANGUAGE_COMPARATOR);
// for (String source : fluffup.keySet()) {
// if (source.startsWith("und")) {
// continue;
// }
// String maximized = fluffup.get(source);
// backMap.put(maximized, source); // put in right order
// }
// Map<String,String> returnBackMap = new TreeMap();
// for (String maximized : backMap.keySet()) {
// final Set<String> all = backMap.getAll(maximized);
// final String minimized = all.iterator().next();
// returnBackMap.put(maximized, minimized);
// }
// return returnBackMap;
// }
/**
* Language tags are presumed to share the first language, except possibly "und". Best is least
*/
// private static Comparator BEST_LANGUAGE_COMPARATOR = new Comparator<String>() {
// LanguageTagParser p1 = new LanguageTagParser();
// LanguageTagParser p2 = new LanguageTagParser();
// public int compare(String o1, String o2) {
// if (o1.equals(o2)) return 0;
// p1.set(o1);
// p2.set(o2);
// String lang1 = p1.getLanguage();
// String lang2 = p2.getLanguage();
//
// // compare languages first
// // put und at the end
// int result = lang1.compareTo(lang2);
// if (result != 0) {
// if (lang1.equals("und")) return 1;
// if (lang2.equals("und")) return -1;
// return result;
// }
//
// // now scripts and regions.
// // if they have different numbers of fields, the shorter wins.
// // If there are two fields, region is lowest.
// // The simplest way is to just compare scripts first
// // so zh-TW < zh-Hant, because we first compare "" to Hant
// String script1 = p1.getScript();
// String script2 = p2.getScript();
// int scriptOrder = script1.compareTo(script2);
// if (scriptOrder != 0) return scriptOrder;
//
// String region1 = p1.getRegion();
// String region2 = p2.getRegion();
// int regionOrder = region1.compareTo(region2);
// if (regionOrder != 0) return regionOrder;
//
// return o1.compareTo(o2);
// }
//
// };
/**
 * Prunes {@code fluffup} in place until no more entries can be removed.
 * <p>
 * An entry is removed when its target has an unknown region or script, when its source has an unknown
 * region or script, or when it is redundant: dropping the source's region (or script), looking up the
 * shorter source, and putting the region (or script) back onto that result yields the same target.
 * Redundant entries listed in {@code KEEP_TARGETS} are retained. The scan is repeated after each batch
 * of removals because a removal can change the outcome for other entries.
 *
 * @param fluffup map from source tag to maximized tag; modified in place
 */
public static void minimize(Map<String, String> fluffup) {
    LanguageTagParser sourceParser = new LanguageTagParser();
    LanguageTagParser resultParser = new LanguageTagParser();
    Set<String> doomed = new TreeSet<String>();
    do {
        doomed.clear();
        for (Entry<String, String> mapping : fluffup.entrySet()) {
            final String source = mapping.getKey();
            final String target = mapping.getValue();

            // targets with unknown region or script are never useful
            if (resultParser.set(target).getRegion().equals(UNKNOWN_REGION)) {
                doomed.add(source);
                if (SHOW_ADD) {
                    System.out.println("Removing:\t" + getName(source) + "\t=>\t" + getName(target)
                        + "\t\t - Unknown Region in target");
                }
                continue;
            }
            if (resultParser.getScript().equals(UNKNOWN_SCRIPT)) {
                doomed.add(source);
                if (SHOW_ADD) {
                    System.out.println("Removing:\t" + getName(source) + "\t=>\t" + getName(target)
                        + "\t\t - Unknown Script in target");
                }
                continue;
            }

            // region check: unknown region in the source, or redundant with the region-less source
            final String sourceRegion = sourceParser.set(source).getRegion();
            if (sourceRegion.length() != 0) {
                if (sourceRegion.equals(UNKNOWN_REGION)) {
                    doomed.add(source);
                    if (SHOW_ADD) {
                        System.out.println("Removing:\t" + getName(source) + "\t=>\t" + getName(target)
                            + "\t\t - Unknown Region in source");
                    }
                    continue;
                }
                sourceParser.setRegion("");
                final String shorterSource = sourceParser.toString();
                final String shorterTarget = fluffup.get(shorterSource);
                if (shorterTarget != null) {
                    final String rebuilt = resultParser.set(shorterTarget).setRegion(sourceRegion).toString();
                    if (target.equals(rebuilt) && !KEEP_TARGETS.contains(source)) {
                        doomed.add(source);
                        if (SHOW_ADD) {
                            System.out.println("Removing:\t" + source + "\t=>\t" + target + "\t\tRedundant with "
                                + shorterSource);
                        }
                        continue;
                    }
                }
            }

            // script check: same idea, starting again from the full source
            final String sourceScript = sourceParser.set(source).getScript();
            if (source.equals(DEBUG_ADD_KEY)) {
                System.out.println("*debug*");
            }
            if (sourceScript.length() != 0) {
                if (sourceScript.equals(UNKNOWN_SCRIPT)) {
                    doomed.add(source);
                    if (SHOW_ADD) {
                        System.out.println("Removing:\t" + source + "\t=>\t" + target + "\t\t - Unknown Script");
                    }
                    continue;
                }
                sourceParser.setScript("");
                final String shorterSource = sourceParser.toString();
                final String shorterTarget = fluffup.get(shorterSource);
                if (shorterTarget != null) {
                    final String rebuilt = resultParser.set(shorterTarget).setScript(sourceScript).toString();
                    if (target.equals(rebuilt) && !KEEP_TARGETS.contains(source)) {
                        doomed.add(source);
                        if (SHOW_ADD) {
                            System.out.println("Removing:\t" + source + "\t=>\t" + target + "\t\tRedundant with "
                                + shorterSource);
                        }
                        continue;
                    }
                }
            }
        }
        // apply this pass's removals after the scan, so the map is not modified while iterating
        for (String source : doomed) {
            fluffup.remove(source);
        }
    } while (!doomed.isEmpty());
}
// private static void addLanguageScript(Map<String, String> fluffup, LanguageTagParser parser) {
// // add script
// Map<String, String> temp = new TreeMap<String, String>();
// while (true) {
// temp.clear();
// for (String target : new TreeSet<String>(fluffup.values())) {
// parser.set(target);
// final String territory = parser.getRegion();
// if (territory.length() == 0) {
// continue;
// }
// parser.setRegion("");
// String possibleSource = parser.toString();
// if (fluffup.containsKey(possibleSource)) {
// continue;
// }
// String other = temp.get(possibleSource);
// if (other != null) {
// if (!target.equals(other)) {
// System.out.println("**Failure with multiple sources in addLanguageScript: "
// + possibleSource + "\t=>\t" + target + ", " + other);
// }
// continue;
// }
// temp.put(possibleSource, target);
// if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguage-Script");
// }
// if (temp.size() == 0) {
// break;
// }
// fluffup.putAll(temp);
// }
//
// }
// private static void addLanguageCountry(Map<String, String> fluffup, LanguageTagParser parser) {
// // add script
// Map<String, String> temp = new TreeMap<String, String>();
// while (true) {
// temp.clear();
// for (String target : new TreeSet<String>(fluffup.values())) {
// parser.set(target);
// String script = parser.getScript();
// if (script.length() == 0) {
// continue;
// }
// parser.setScript("");
// String possibleSource = parser.toString();
// if (fluffup.containsKey(possibleSource)) {
// continue;
// }
// String other = temp.get(possibleSource);
//
// if (other != null) {
// if (!target.equals(other)) {
// script = getScriptForLocale(possibleSource);
// if (script == null) {
// System.out.println("**Failure with multiple sources in addLanguageCountry: "
// + possibleSource + "\t=>\t" + target + ", " + other);
// continue; // error message in routine
// }
// parser.setScript(script);
// target = parser.toString();
// }
// }
//
// temp.put(possibleSource, target);
// if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguageCountry");
// }
// if (temp.size() == 0) {
// break;
// }
// fluffup.putAll(temp);
// }
//
// }
// private static void addScript(Map<String, String> fluffup, LanguageTagParser parser) {
// // add script
// Map<String, String> temp = new TreeMap<String, String>();
// while (true) {
// temp.clear();
// Set skipTarget = fluffup.keySet();
// for (String locale : fluffup.keySet()) {
// String target = fluffup.get(locale);
// parser.set(target);
// if (parser.getScript().length() != 0) {
// continue;
// }
// String script = getScriptForLocale(target);
//
// if (script == null) {
// continue; // error message in routine
// }
// parser.setScript(script);
// String furtherTarget = parser.toString();
// addIfNotIn(target, furtherTarget, temp, fluffup, "Script");
// }
// if (temp.size() == 0) {
// break;
// }
// fluffup.putAll(temp);
// }
// }
// private static String getScriptForLocale(String locale) {
// String result = getScriptForLocale2(locale);
// if (result != null) return result;
// int pos = locale.indexOf('_');
// if (pos >= 0) {
// result = getScriptForLocale2(locale.substring(0,pos));
// }
// return result;
// }
// Script code that getScriptForLocale2 returns when it cannot settle on a single script for a
// locale; it is also removed from the script set derived from exemplar characters there.
private static String UNKNOWN_SCRIPT = "Zzzz";
// Region-code counterpart of UNKNOWN_SCRIPT (presumably "unknown region"; it is not referenced in
// the code near this declaration -- confirm usage elsewhere in the file).
private static String UNKNOWN_REGION = "ZZ";
/**
 * Returns the script code to use for {@code locale}, reading from and then filling
 * {@code localeToScriptCache}.
 *
 * <p>If the uncached lookup yields {@code UNKNOWN_SCRIPT} and {@code LANGUAGE_OVERRIDES} has an
 * entry for the locale, the script subtag of that entry is used instead.
 *
 * <p>Fix over the previous version: the override used to be applied inside a {@code finally}
 * block, i.e. after the {@code return} expression had already been evaluated. The first call
 * therefore returned {@code UNKNOWN_SCRIPT} while the cache stored the override, so later calls
 * returned a different value. The override is now applied before returning, so the returned and
 * cached values always agree. In addition, an exception thrown during the lookup is no longer
 * masked by a {@code NullPointerException} from that {@code finally} block; it propagates as is
 * and nothing is cached.
 *
 * @param locale locale identifier to look up
 * @return the script code chosen for the locale, or {@code UNKNOWN_SCRIPT} if none was found and
 *         no override applies
 */
private static String getScriptForLocale2(String locale) {
    String result = localeToScriptCache.get(locale);
    if (result != null) {
        return result;
    }
    result = lookupScriptUncached(locale);
    if (result.equals(UNKNOWN_SCRIPT)) {
        String temp = LANGUAGE_OVERRIDES.get(locale);
        if (temp != null) {
            result = new LanguageTagParser().set(temp).getScript();
            System.out.println("Getting script from LANGUAGE_OVERRIDES for " + locale + " => " + result);
        }
    }
    localeToScriptCache.put(locale, result);
    if (SHOW_ADD) {
        System.out.println("Script:\t" + locale + "\t" + english.getName(locale) + "\t=>\t" + result + "\t"
            + english.getName(CLDRFile.SCRIPT_NAME, result));
    }
    return result;
}

/**
 * Determines the script for {@code locale} without touching the cache or the overrides.
 * Sources are tried in order: the basic language data, then the exemplar characters of the
 * locale's CLDR file, with {@code FALLBACK_SCRIPTS} used when that file cannot be created.
 * Never returns null; {@code UNKNOWN_SCRIPT} signals failure.
 */
private static String lookupScriptUncached(String locale) {
    Map<Type, BasicLanguageData> data = supplementalData.getBasicLanguageDataMap(locale);
    if (data != null) {
        // First single-script entry seen that is not marked primary; used only if no primary
        // single-script entry turns up.
        String firstSingleScript = null;
        for (BasicLanguageData datum : data.values()) {
            final Set<String> scripts = datum.getScripts();
            boolean isPrimary = datum.getType() == BasicLanguageData.Type.primary;
            if (scripts.size() != 1) {
                if (scripts.size() > 1 && isPrimary) {
                    // The primary entry lists several scripts: stop scanning further entries.
                    break;
                }
                continue;
            }
            String script = scripts.iterator().next();
            if (isPrimary) {
                return script;
            }
            if (firstSingleScript == null) {
                firstSingleScript = script;
            }
        }
        if (firstSingleScript != null) {
            return firstSingleScript;
        }
    }

    CLDRFile cldrFile;
    try {
        cldrFile = factory.make(locale, true);
    } catch (RuntimeException e) {
        // No CLDR file could be made for this locale: use the hard-coded fallback table.
        String fallback = FALLBACK_SCRIPTS.get(locale);
        if (fallback == null) {
            System.out.println("***Failed to find script for: " + locale + "\t" + english.getName(locale));
            return UNKNOWN_SCRIPT;
        }
        return fallback;
    }

    UnicodeSet exemplars = getExemplarSet(cldrFile, "");
    Set<String> CLDRScripts = getScriptsFromUnicodeSet(exemplars);
    CLDRScripts.remove(UNKNOWN_SCRIPT);
    if (CLDRScripts.size() == 1) {
        return CLDRScripts.iterator().next();
    }
    if (CLDRScripts.size() == 0) {
        System.out.println("**Failed to get script for:\t" + locale);
    } else {
        System.out.println("**Failed, too many scripts for:\t" + locale + ", " + CLDRScripts);
    }
    return UNKNOWN_SCRIPT;
}
// private static Map<String, String> closeMapping(Map<String, String> fluffup) {
// if (SHOW_ADD) System.out.flush();
// Map<String,String> temp = new TreeMap<String,String>();
// while (true) {
// temp.clear();
// for (String locale : fluffup.keySet()) {
// String target = fluffup.get(locale);
// if (target.equals("si_Sinh") || target.equals("zh-Hani")) {
// System.out.println("????");
// }
// String furtherTarget = fluffup.get(target);
// if (furtherTarget == null) {
// continue;
// }
// addIfNotIn(locale, furtherTarget, temp, null, "Close");
// }
// if (temp.size() == 0) {
// break;
// }
// fluffup.putAll(temp);
// }
// if (SHOW_ADD) System.out.flush();
// return temp;
// }
/**
 * Collects the scripts of every code point in {@code exemplars}, including the code points
 * inside multi-character strings of the set.
 *
 * @param exemplars set whose code points and strings are inspected
 * @return a new sorted set of short script names, with Common and Inherited left out
 */
public static Set<String> getScriptsFromUnicodeSet(UnicodeSet exemplars) {
    // Accumulate numeric script codes in a bit set first; names are looked up once at the end.
    BitSet seenScripts = new BitSet();
    UnicodeSetIterator iterator = new UnicodeSetIterator(exemplars);
    while (iterator.next()) {
        if (iterator.codepoint == UnicodeSetIterator.IS_STRING) {
            // A string element: walk it one code point at a time.
            final String text = iterator.string;
            int offset = 0;
            while (offset < text.length()) {
                final int codePoint = UTF16.charAt(text, offset);
                seenScripts.set(UScript.getScript(codePoint));
                offset += UTF16.getCharCount(codePoint);
            }
        } else {
            seenScripts.set(UScript.getScript(iterator.codepoint));
        }
    }
    seenScripts.clear(UScript.COMMON);
    seenScripts.clear(UScript.INHERITED);

    Set<String> scriptNames = new TreeSet<String>();
    for (int code = seenScripts.nextSetBit(0); code >= 0; code = seenScripts.nextSetBit(code + 1)) {
        scriptNames.add(UScript.getShortName(code));
    }
    return scriptNames;
}
/**
 * Reads the exemplar characters of a CLDR file as a {@code UnicodeSet}.
 *
 * @param cldrfile file to read the value from
 * @param type value for the {@code type} attribute of the path; an empty string selects the
 *        path without a type attribute
 * @return the set parsed from the stored value, or an empty set when the file has no value
 *         at that path
 */
public static UnicodeSet getExemplarSet(CLDRFile cldrfile, String type) {
    final String typeAttribute = type.length() == 0 ? "" : "[@type=\"" + type + "\"]";
    final String pattern = cldrfile.getStringValue("//ldml/characters/exemplarCharacters" + typeAttribute);
    return pattern == null ? new UnicodeSet() : new UnicodeSet(pattern);
}
// private static String[][] SpecialCases = {
// { "zh_Hani", "zh_Hans_CN"},
// { "si_Sinh", "si_Sinh_LK"},
// { "ii", "ii_CN"}, // Sichuan Yi (Yi)
// { "iu", "iu_CA"}, // Inuktitut (Unified Canadian Aboriginal Syllabics)
// { "und", "en"}, // English default
// };
/**
 * Compares the proposed default-content locales with the ones already in the supplemental data,
 * prints the differences, and rewrites {@code defaultLocaleContent} in place with the values of
 * the reconciled mapping.
 *
 * <p>Both locale sets are turned into maps with
 * {@code SupplementalDataInfo.makeLocaleToDefaultContents}; errors reported by that call are
 * printed to standard output. The reconciliation itself is done by
 * {@code compareMapsAndFixNew}, with "ar" forced to "ar_001".
 *
 * @param defaultLocaleContent proposed default-content locales; cleared and refilled by this
 *        method
 */
static void showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent) {
    Set<String> problems = new LinkedHashSet<String>();

    // Mapping for what the supplemental data currently contains.
    Map<String, String> existingMapping = SupplementalDataInfo.makeLocaleToDefaultContents(
        ConvertLanguageData.supplementalData.getDefaultContentLocales(),
        new TreeMap<String, String>(),
        problems);
    if (problems.size() > 0) {
        System.out.println(CollectionUtilities.join(problems, "\n"));
        problems.clear();
    }

    // Mapping for the proposed locales.
    Map<String, String> proposedMapping = SupplementalDataInfo.makeLocaleToDefaultContents(
        defaultLocaleContent,
        new TreeMap<String, String>(),
        problems);
    if (problems.size() > 0) {
        System.out.println("Default Content errors: " + CollectionUtilities.join(problems, "\n"));
        problems.clear();
    }

    // Report the differences; this call also edits proposedMapping.
    Set<String> differences = compareMapsAndFixNew(
        "*WARNING* Default Content: ", existingMapping, proposedMapping, "ar", "ar_001");
    System.out.println(CollectionUtilities.join(differences, "\n"));

    // Replace the caller's set with the values of the adjusted mapping.
    defaultLocaleContent.clear();
    defaultLocaleContent.addAll(proposedMapping.values());

    // Rebuild the mapping from the rewritten set purely to surface any errors it now has.
    SupplementalDataInfo.makeLocaleToDefaultContents(
        defaultLocaleContent,
        new TreeMap<String, String>(),
        problems);
    if (problems.size() > 0) {
        System.out.println("***New Errors: " + CollectionUtilities.join(problems, "\n"));
    }
}
/**
 * Walks every key ("parent") found in either map, adjusts {@code newContent} in place, and
 * returns one message per parent whose old and new values differ.
 *
 * <p>For each parent: a value listed in {@code allowedOverrideValues} is forced into
 * {@code newContent} first. Then, if the old and new values differ, the difference is either
 * accepted or, when {@code SUPPRESS_CHANGES} is set, undone by writing the old value back into
 * {@code newContent}.
 *
 * <p>NOTE(review): the {@code newContent.remove(...)} calls below are keyed by a mapped value
 * (old or new) rather than by the parent. They are reproduced unchanged; confirm that this is
 * the intended entry to drop.
 *
 * @param title prefix put in front of every returned message
 * @param oldContent previous parent-to-value mapping; only read
 * @param newContent new parent-to-value mapping; modified by this method
 * @param allowedOverrideValues flat list of (parent, forced value) pairs
 * @return sorted set of messages describing the differences
 */
private static Set<String> compareMapsAndFixNew(String title,
    Map<String, String> oldContent,
    Map<String, String> newContent, String... allowedOverrideValues) {
    // Pair up the varargs: even positions are parents, odd positions their forced values.
    Map<String, String> forcedValues = new HashMap<String, String>();
    int pairStart = 0;
    while (pairStart < allowedOverrideValues.length) {
        forcedValues.put(allowedOverrideValues[pairStart], allowedOverrideValues[pairStart + 1]);
        pairStart += 2;
    }

    Set<String> changes = new TreeSet<String>();
    // The loop runs over a separate sorted copy of the keys, so newContent may be edited inside.
    for (String parent : Builder.with(new TreeSet<String>())
        .addAll(newContent.keySet())
        .addAll(oldContent.keySet())
        .get()) {
        final String before = oldContent.get(parent);
        String after = newContent.get(parent);
        final String forced = forcedValues.get(parent);
        if (forced != null) {
            newContent.put(parent, forced);
            after = forced;
        }
        if (!CldrUtility.equals(before, after)) {
            final String parentName = ConvertLanguageData.getLanguageCodeAndName(parent);
            final String message;
            if (before == null) {
                // Present only in the new mapping.
                message = "Adding " + parentName + " => "
                    + ConvertLanguageData.getLanguageCodeAndName(after);
                newContent.put(parent, after);
            } else {
                final String oldMapping = parentName + " => "
                    + ConvertLanguageData.getLanguageCodeAndName(before);
                if (after == null) {
                    // Present only in the old mapping.
                    if (SUPPRESS_CHANGES) {
                        message = "Suppressing removal of " + oldMapping;
                        newContent.put(parent, before);
                    } else {
                        message = "Removing " + oldMapping;
                        newContent.remove(before);
                    }
                } else {
                    // Present in both mappings with different values.
                    final String transition = oldMapping + " to "
                        + ConvertLanguageData.getLanguageCodeAndName(after);
                    if (SUPPRESS_CHANGES) {
                        message = "Suppressing change of " + transition;
                        newContent.remove(after);
                        newContent.put(parent, before);
                    } else {
                        message = "Changing " + transition;
                        newContent.remove(before);
                        newContent.put(parent, after);
                    }
                }
            }
            changes.add(title + message);
        }
    }
    return changes;
}
/**
 * Orders locale identifier strings by language, then script, then region, each compared as
 * plain strings, except that the language "und" sorts after every other language. Identifiers
 * that agree on all three fields are ordered by their full text.
 *
 * <p>Every call re-parses both arguments into the two parser fields, so concurrent calls on the
 * same instance would overwrite each other's parser state.
 */
public static class LocaleStringComparator implements Comparator<String> {
    LanguageTagParser ltp0 = new LanguageTagParser();
    LanguageTagParser ltp1 = new LanguageTagParser();

    @Override
    public int compare(String arg0, String arg1) {
        ltp0.set(arg0);
        ltp1.set(arg1);

        final String language0 = ltp0.getLanguage();
        final String language1 = ltp1.getLanguage();
        int order = language0.compareTo(language1);
        if (order != 0) {
            // Different languages: "und" always goes last.
            if (language0.equals("und")) {
                return 1;
            }
            if (language1.equals("und")) {
                return -1;
            }
            return order;
        }

        final String script0 = ltp0.getScript();
        final String script1 = ltp1.getScript();
        order = script0.compareTo(script1);
        if (order != 0) {
            return order;
        }

        final String region0 = ltp0.getRegion();
        final String region1 = ltp1.getRegion();
        order = region0.compareTo(region1);
        // Last resort when language, script and region all match: compare the raw strings.
        return order != 0 ? order : arg0.compareTo(arg1);
    }
}
}