| package org.unicode.cldr.tool; |
| |
| import java.io.BufferedReader; |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.PrintWriter; |
| import java.util.Arrays; |
| import java.util.BitSet; |
| import java.util.Collection; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.Set; |
| import java.util.TreeMap; |
| import java.util.TreeSet; |
| |
| import org.unicode.cldr.draft.FileUtilities; |
| import org.unicode.cldr.draft.ScriptMetadata; |
| import org.unicode.cldr.draft.ScriptMetadata.Info; |
| import org.unicode.cldr.util.Builder; |
| import org.unicode.cldr.util.CLDRFile; |
| import org.unicode.cldr.util.CLDRLocale; |
| import org.unicode.cldr.util.CLDRPaths; |
| import org.unicode.cldr.util.CldrUtility; |
| import org.unicode.cldr.util.Containment; |
| import org.unicode.cldr.util.Counter; |
| import org.unicode.cldr.util.Factory; |
| import org.unicode.cldr.util.Iso639Data; |
| import org.unicode.cldr.util.Iso639Data.Scope; |
| import org.unicode.cldr.util.LanguageTagParser; |
| import org.unicode.cldr.util.LocaleIDParser; |
| import org.unicode.cldr.util.Log; |
| import org.unicode.cldr.util.PatternCache; |
| import org.unicode.cldr.util.SimpleFactory; |
| import org.unicode.cldr.util.StandardCodes; |
| import org.unicode.cldr.util.SupplementalDataInfo; |
| import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; |
| import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type; |
| import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; |
| import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; |
| |
| import com.google.common.collect.ImmutableMap; |
| import com.google.common.collect.ImmutableSet; |
| import com.ibm.icu.dev.util.CollectionUtilities; |
| import com.ibm.icu.impl.Relation; |
| import com.ibm.icu.impl.Row; |
| import com.ibm.icu.impl.Row.R2; |
| import com.ibm.icu.impl.Row.R3; |
| import com.ibm.icu.impl.Row.R4; |
| import com.ibm.icu.lang.UScript; |
| import com.ibm.icu.text.Collator; |
| import com.ibm.icu.text.NumberFormat; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.text.UnicodeSetIterator; |
| import com.ibm.icu.util.ULocale; |
| |
| /** |
| * Problems: |
| * "und_Hani", "zh_Hani" |
| * "und_Sinh", "si_Sinh" |
| * |
| * @author markdavis |
| * |
| */ |
| public class GenerateMaximalLocales { |
| |
| /** Placeholder region code: {@code main} drops result keys containing it and replaces it with UNKNOWN_REGION in result values. */ |
| private static final String TEMP_UNKNOWN_REGION = "XZ"; |
| |
| // NOTE(review): presumably the key traced by debugging output elsewhere in this file -- confirm against its users. |
| private static final String DEBUG_ADD_KEY = "und_Latn_ZA"; |
| |
| /** Verbose diagnostics switch, read from the "GenerateMaximalLocalesDebug" property. */ |
| private static final boolean SHOW_ADD = CldrUtility.getProperty("GenerateMaximalLocalesDebug", false); |
| // Read from the "GenerateMaximalLocalesSuppress" property; its consumer is outside this section. |
| private static final boolean SUPPRESS_CHANGES = CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false); |
| private static final boolean SHOW_CONTAINERS = false; |
| |
| /** Output formats selectable through the "OutputStyle" property. */ |
| enum OutputStyle { |
|     PLAINTEXT, C, C_ALT, XML |
| } |
| |
| // The property value is upper-cased with Locale.ROOT so that the enum lookup does not depend on the |
| // JVM default locale (e.g. "plaintext" would not map to PLAINTEXT under a Turkish default locale). |
| private static OutputStyle OUTPUT_STYLE = OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML") |
|     .toUpperCase(java.util.Locale.ROOT)); |
| |
| // set based on above |
| private static final String SEPARATOR = OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT ? CldrUtility.LINE_SEPARATOR |
|     : "\t"; |
| private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? "-" : "_"; |
| // private static final boolean FAVOR_REGION = true; // OUTPUT_STYLE == OutputStyle.C_ALT; |
| |
| private static final boolean tryDifferent = true; |
| |
| /** Directories handed to the locale factory below: main, seed and exemplars. */ |
| private static final File[] list = { |
|     new File(CLDRPaths.MAIN_DIRECTORY), |
|     new File(CLDRPaths.SEED_DIRECTORY), |
|     new File(CLDRPaths.EXEMPLARS_DIRECTORY) }; |
| |
| // Shared data sources. Declaration order matters: english is made from factory, which is made from list. |
| private static Factory factory = SimpleFactory.make(list, ".*"); |
| private static SupplementalDataInfo supplementalData = SupplementalDataInfo |
|     .getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY); |
| private static StandardCodes standardCodes = StandardCodes.make(); |
| private static CLDRFile english = factory.make("en", false); |
| /** Maps each non-leaf (container) region used by an available CLDR locale to the languages of those locales. */ |
| static Relation<String, String> cldrContainerToLanguages = Relation.of(new HashMap<String, Set<String>>(), HashSet.class); |
| static { |
|     for (CLDRLocale cldrLocale : ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) { |
|         final String country = cldrLocale.getCountry(); |
|         // Only regions that are present and are not leaves of the containment tree are recorded. |
|         final boolean isContainer = country != null && !country.isEmpty() && !Containment.isLeaf(country); |
|         if (isContainer) { |
|             cldrContainerToLanguages.put(country, cldrLocale.getLanguage()); |
|         } |
|     } |
|     cldrContainerToLanguages.freeze(); |
|     System.out.println("Keep containers " + cldrContainerToLanguages); |
| } |
| |
| // NOTE(review): both constants are consumed outside this section; their exact role should be confirmed there. |
| private static final List<String> KEEP_TARGETS = Arrays.asList("und_Arab_PK", "und_Latn_ET"); |
| private static final ImmutableSet<String> deprecatedISONotInLST = ImmutableSet.of("scc", "scr"); |
| |
| /** |
| * Fully specified language_script_region values supplied as additions. |
| * This is the simplest way to override, by supplying the max value. |
| * It gets a very low weight, so doesn't override any stronger value. |
| * The static initializer of FALLBACK_SCRIPTS also parses every entry to map its language to its script; |
| * when a language occurs more than once (e.g. rhg), the later entry wins there, so the order of entries matters. |
| */ |
| private static final String[] MAX_ADDITIONS = new String[] { |
| "bss_Latn_CM", |
| "gez_Ethi_ET", |
| "ken_Latn_CM", |
| "und_Arab_PK", |
| "wa_Latn_BE", |
| |
| "fub_Arab_CM", |
| "fuf_Latn_GN", |
| "kby_Arab_NE", |
| "kdh_Arab_TG", |
| "apd_Arab_TG", |
| "zlm_Latn_TG", |
| |
| "cr_Cans_CA", |
| "hif_Latn_FJ", |
| "gon_Telu_IN", |
| "lzz_Latn_TR", |
| "lif_Deva_NP", |
| "unx_Beng_IN", |
| "unr_Beng_IN", |
| "ttt_Latn_AZ", |
| "pnt_Grek_GR", |
| "tly_Latn_AZ", |
| "tkr_Latn_AZ", |
| "bsq_Bass_LR", |
| "ccp_Cakm_BD", |
| "blt_Tavt_VN", |
| "rhg_Arab_MM", |
| "rhg_Rohg_MM", |
| }; |
| |
| /** |
| * Hard overrides, keyed by a partial locale (language, language_script, language_region or an und_* form) |
| * and mapping to a complete language_script_region value. |
| * The following overrides MASH (overwrite) the final values, so they may not result in consistent results. Safer is to add to MAX_ADDITIONS. |
| * However, if you add, add both the language and language+script mappings. |
| * NOTE(review): the code that applies this map is outside this section; confirm there when the overrides take effect. |
| */ |
| // Many of the overrides below can be removed once the language/pop/country data is updated. |
| private static final Map<String, String> LANGUAGE_OVERRIDES = CldrUtility.asMap(new String[][] { |
| { "eo", "eo_Latn_001" }, |
| { "eo_Latn", "eo_Latn_001" }, |
| { "es", "es_Latn_ES" }, |
| { "es_Latn", "es_Latn_ES" }, |
| { "ff_BF", "ff_Latn_BF" }, |
| { "ff_GM", "ff_Latn_GM" }, |
| { "ff_GH", "ff_Latn_GH" }, |
| { "ff_GW", "ff_Latn_GW" }, |
| { "ff_LR", "ff_Latn_LR" }, |
| { "ff_NE", "ff_Latn_NE" }, |
| { "ff_NG", "ff_Latn_NG" }, |
| { "ff_SL", "ff_Latn_SL" }, |
| { "ff_Adlm", "ff_Adlm_GN" }, |
| { "ia", "ia_Latn_001" }, |
| { "ia_Latn", "ia_Latn_001" }, |
| { "io", "io_Latn_001" }, |
| { "io_Latn", "io_Latn_001" }, |
| { "jbo", "jbo_Latn_001" }, |
| { "jbo_Latn", "jbo_Latn_001" }, |
| { "ku_Arab", "ku_Arab_IQ" }, |
| { "lrc", "lrc_Arab_IR" }, |
| { "lrc_Arab", "lrc_Arab_IR" }, |
| { "man", "man_Latn_GM" }, |
| { "man_Latn", "man_Latn_GM" }, |
| { "mas", "mas_Latn_KE" }, |
| { "mas_Latn", "mas_Latn_KE" }, |
| { "mn", "mn_Cyrl_MN" }, |
| { "mn_Cyrl", "mn_Cyrl_MN" }, |
| { "mro", "mro_Mroo_BD" }, |
| { "mro_BD", "mro_Mroo_BD" }, |
| { "ms_Arab", "ms_Arab_MY" }, |
| { "pap", "pap_Latn_AW" }, |
| { "pap_Latn", "pap_Latn_AW" }, |
| { "prg", "prg_Latn_001" }, |
| { "prg_Latn", "prg_Latn_001" }, |
| { "rif", "rif_Tfng_MA" }, |
| { "rif_Latn", "rif_Latn_MA" }, |
| { "rif_Tfng", "rif_Tfng_MA" }, |
| { "rif_MA", "rif_Tfng_MA" }, |
| { "shi", "shi_Tfng_MA" }, |
| { "shi_Tfng", "shi_Tfng_MA" }, |
| { "shi_MA", "shi_Tfng_MA" }, |
| { "sr_Latn", "sr_Latn_RS" }, |
| { "ss", "ss_Latn_ZA" }, |
| { "ss_Latn", "ss_Latn_ZA" }, |
| { "swc", "swc_Latn_CD" }, |
| { "ti", "ti_Ethi_ET" }, |
| { "ti_Ethi", "ti_Ethi_ET" }, |
| { "und", "en_Latn_US" }, |
| { "und_Adlm", "ff_Adlm_GN" }, |
| { "und_Adlm_GN", "ff_Adlm_GN" }, |
| { "und_Arab", "ar_Arab_EG" }, |
| { "und_Arab_PK", "ur_Arab_PK" }, |
| { "und_Bopo", "zh_Bopo_TW" }, |
| { "und_Deva_FJ", "hif_Deva_FJ" }, |
| { "und_EZ", "de_Latn_EZ" }, |
| { "und_Hani", "zh_Hani_CN" }, |
| { "und_Hani_CN", "zh_Hani_CN" }, |
| { "und_Kana", "ja_Kana_JP" }, |
| { "und_Kana_JP", "ja_Kana_JP" }, |
| { "und_Latn", "en_Latn_US" }, |
| { "und_Latn_ET", "en_Latn_ET" }, |
| { "und_Latn_NE", "ha_Latn_NE" }, |
| { "und_Latn_PH", "fil_Latn_PH" }, |
| { "und_ML", "bm_Latn_ML" }, |
| { "und_Latn_ML", "bm_Latn_ML" }, |
| { "und_MU", "mfe_Latn_MU" }, |
| { "und_NE", "ha_Latn_NE" }, |
| { "und_PH", "fil_Latn_PH" }, |
| { "und_PK", "ur_Arab_PK" }, |
| { "und_SO", "so_Latn_SO" }, |
| { "und_SS", "en_Latn_SS" }, |
| { "und_TK", "tkl_Latn_TK" }, |
| { "und_UN", "en_Latn_UN" }, |
| { "vo", "vo_Latn_001" }, |
| { "vo_Latn", "vo_Latn_001" }, |
| { "yi", "yi_Hebr_001" }, |
| { "yi_Hebr", "yi_Hebr_001" }, |
| { "yue", "yue_Hant_HK" }, |
| { "yue_Hant", "yue_Hant_HK" }, |
| { "yue_Hans", "yue_Hans_CN" }, |
| { "yue_CN", "yue_Hans_CN" }, |
| { "zh_Hani", "zh_Hani_CN" }, |
| |
| { "zh_Bopo", "zh_Bopo_TW" }, |
| { "ccp", "ccp_Cakm_BD" }, |
| { "ccp_Cakm", "ccp_Cakm_BD" }, |
| { "und_Cakm", "ccp_Cakm_BD" }, |
| { "cu_Glag", "cu_Glag_BG" }, |
| { "sd_Khoj", "sd_Khoj_IN" }, |
| { "lif_Limb", "lif_Limb_IN" }, |
| { "grc_Linb", "grc_Linb_GR" }, |
| { "arc_Nbat", "arc_Nbat_JO" }, |
| { "arc_Palm", "arc_Palm_SY" }, |
| { "pal_Phlp", "pal_Phlp_CN" }, |
| { "en_Shaw", "en_Shaw_GB" }, |
| { "sd_Sind", "sd_Sind_IN" }, |
| { "und_Brai", "fr_Brai_FR" }, // hack |
| { "und_Hanb", "zh_Hanb_TW" }, // Special script code |
| { "zh_Hanb", "zh_Hanb_TW" }, // Special script code |
| { "und_Jamo", "ko_Jamo_KR" }, // Special script code |
| |
| //{"und_Cyrl_PL", "be_Cyrl_PL"}, |
| |
| // {"cr", "cr_Cans_CA"}, |
| // {"hif", "hif_Latn_FJ"}, |
| // {"gon", "gon_Telu_IN"}, |
| // {"lzz", "lzz_Latn_TR"}, |
| // {"lif", "lif_Deva_NP"}, |
| // {"unx", "unx_Beng_IN"}, |
| // {"unr", "unr_Beng_IN"}, |
| // {"ttt", "ttt_Latn_AZ"}, |
| // {"pnt", "pnt_Grek_GR"}, |
| // {"tly", "tly_Latn_AZ"}, |
| // {"tkr", "tkr_Latn_AZ"}, |
| // {"bsq", "bsq_Bass_LR"}, |
| // {"ccp", "ccp_Cakm_BD"}, |
| // {"blt", "blt_Tavt_VN"}, |
| { "mis_Medf", "mis_Medf_NG" }, |
| }); |
| |
| /** |
| * The following supplements the suppress-script. It overrides info from exemplars and the locale info. |
| * Each row is { locale, script }. The static initializer of localeToScriptCache stores these rows after the |
| * Suppress-Script values, so a row here replaces any Suppress-Script entry for the same key. |
| */ |
| private static String[][] SpecialScripts = { |
| { "zh", "Hans" }, // Hans (not Hani) |
| { "yue", "Hant" }, // Hant (not Hani) |
| { "chk", "Latn" }, // Chuukese (Micronesia) |
| { "fil", "Latn" }, // Filipino (Philippines) |
| { "ko", "Kore" }, // Korean (North Korea) |
| { "ko_KR", "Kore" }, // Korean (South Korea) |
| { "pap", "Latn" }, // Papiamento (Netherlands Antilles) |
| { "pau", "Latn" }, // Palauan (Palau) |
| { "su", "Latn" }, // Sundanese (Indonesia) |
| { "tet", "Latn" }, // Tetum (East Timor) |
| { "tk", "Latn" }, // Turkmen (Turkmenistan) |
| { "ty", "Latn" }, // Tahitian (French Polynesia) |
| { "ja", "Jpan" }, // Special script for japan |
| { "und", "Latn" }, // Ultimate fallback |
| }; |
| |
| private static Map<String, String> localeToScriptCache = new TreeMap<String, String>(); |
| static { |
| for (String language : standardCodes.getAvailableCodes("language")) { |
| Map<String, String> info = standardCodes.getLangData("language", language); |
| String script = info.get("Suppress-Script"); |
| if (script != null) { |
| localeToScriptCache.put(language, script); |
| } |
| } |
| for (String[] pair : SpecialScripts) { |
| localeToScriptCache.put(pair[0], pair[1]); |
| } |
| } |
| |
| private static Map<String, String> FALLBACK_SCRIPTS; |
| static { |
| LanguageTagParser additionLtp = new LanguageTagParser(); |
| Map<String, String> _FALLBACK_SCRIPTS = new TreeMap<>(); |
| for (String addition : MAX_ADDITIONS) { |
| additionLtp.set(addition); |
| String lan = additionLtp.getLanguage(); |
| _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript()); |
| } |
| FALLBACK_SCRIPTS = ImmutableMap.copyOf(_FALLBACK_SCRIPTS); |
| } |
| |
| private static int errorCount; |
| |
| /** |
|  * Tool entry point: prints the language diagnostics, builds and minimizes the likely-subtags mapping, |
|  * cleans out the temporary unknown-region placeholder, compares against the existing likely subtags, |
|  * and finally prints the likely subtags and the default content. |
|  * |
|  * @param args not used |
|  * @throws IOException propagated from the output steps |
|  */ |
| public static void main(String[] args) throws IOException { |
| |
|     printDefaultLanguagesAndScripts(); |
| |
|     final Map<String, String> toMaximized = new TreeMap<String, String>(); |
|     tryDifferentAlgorithm(toMaximized); |
|     minimize(toMaximized); |
| |
|     // HACK TEMP_UNKNOWN_REGION |
|     // this is to get around the removal of items with ZZ in minimize. |
|     // probably cleaner way to do it, but this provides control over just those we want to retain. |
|     // Keys with the placeholder are dropped; values with the placeholder are rewritten in place. |
|     final Set<String> staleKeys = new TreeSet<>(); |
|     for (Entry<String, String> mapping : toMaximized.entrySet()) { |
|         final String source = mapping.getKey(); |
|         final String target = mapping.getValue(); |
|         if (source.contains(TEMP_UNKNOWN_REGION)) { |
|             staleKeys.add(source); |
|             continue; |
|         } |
|         if (target.contains(TEMP_UNKNOWN_REGION)) { |
|             mapping.setValue(target.replace(TEMP_UNKNOWN_REGION, UNKNOWN_REGION)); |
|         } |
|     } |
|     toMaximized.keySet().removeAll(staleKeys); |
| |
|     final Map<String, String> oldLikely = SupplementalDataInfo.getInstance().getLikelySubtags(); |
|     final Set<String> changes = compareMapsAndFixNew("*WARNING* Likely Subtags: ", oldLikely, toMaximized, "ms_Arab", |
|         "ms_Arab_ID"); |
|     System.out.println(CollectionUtilities.join(changes, "\n")); |
| |
|     if (OUTPUT_STYLE == OutputStyle.C_ALT) { |
|         doAlt(toMaximized); |
|     } |
| |
|     if (SHOW_ADD) { |
|         // Explanatory comment block emitted ahead of the data; lines are joined with the CLDR line separator. |
|         final String[] helpLines = { |
|             "/*", |
|             " To Maximize:", |
|             " If using raw strings, make sure the input language/locale uses the right separator, and has the right casing.", |
|             " Remove the script Zzzz and the region ZZ if they occur; change an empty language subtag to 'und'.", |
|             " Get the language, region, and script from the cleaned-up tag, plus any variants/extensions", |
|             " Try each of the following in order (where the field exists)", |
|             " Lookup language-script-region. If in the table, return the result + variants", |
|             " Lookup language-script. If in the table, return the result (substituting the original region if it exists) + variants", |
|             " Lookup language-region. If in the table, return the result (substituting the original script if it exists) + variants", |
|             " Lookup language. If in the table, return the result (substituting the original region and script if either or both exist) + variants", |
|             "", |
|             " Example: Input is zh-ZZZZ-SG.", |
|             " Normalize to zh-SG. Lookup in table. No match.", |
|             " Remove SG, but remember it. Lookup zh, and get the match (zh-Hans-CN). Substitute SG, and return zh-Hans-SG.", |
|             "", |
|             " To Minimize:", |
|             " First get max = maximize(input).", |
|             " Then for trial in {language, language-region, language-script}", |
|             " If maximize(trial) == max, then return trial.", |
|             " If you don't get a match, return max.", |
|             "", |
|             " Example: Input is zh-Hant. Maximize to get zh-Hant-TW.", |
|             " zh => zh-Hans-CN. No match, so continue.", |
|             " zh-TW => zh-Hans-TW. Match, so return zh-TW.", |
|             "", |
|             " (A variant of this uses {language, language-script, language-region}): that is, tries script before language.", |
|             " toMaximal size:\t" + toMaximized.size(), |
|             "*/" |
|         }; |
|         final StringBuilder help = new StringBuilder(); |
|         for (int i = 0; i < helpLines.length; ++i) { |
|             if (i > 0) { |
|                 help.append(CldrUtility.LINE_SEPARATOR); |
|             } |
|             help.append(helpLines[i]); |
|         } |
|         System.out.println(help.toString()); |
|     } |
| |
|     printLikelySubtags(toMaximized); |
| |
|     // if (OUTPUT_STYLE != OutputStyle.XML) { |
|     // printMap("const MapToMinimalSubtags default_subtags[]", toMinimized, null); |
|     // } |
| |
|     printDefaultContent(toMaximized); |
| |
|     System.out.println(CldrUtility.LINE_SEPARATOR + "ERRORS:\t" + errorCount + CldrUtility.LINE_SEPARATOR); |
| |
| } |
| |
| static class RowData implements Comparable<RowData> { |
| OfficialStatus os; |
| String name; |
| Long pop; |
| |
| public RowData(OfficialStatus os, String name, Long pop) { |
| this.os = os; |
| this.name = name; |
| this.pop = pop; |
| } |
| |
| public OfficialStatus getStatus() { |
| // TODO Auto-generated method stub |
| return os; |
| } |
| |
| public CharSequence getName() { |
| // TODO Auto-generated method stub |
| return name; |
| } |
| |
| public Long getLiteratePopulation() { |
| // TODO Auto-generated method stub |
| return pop; |
| } |
| |
| public int compareTo(RowData o) { |
| // TODO Auto-generated method stub |
| int result = os.compareTo(o.os); |
| if (result != 0) return -result; |
| long result2 = pop - o.pop; |
| if (result2 != 0) return result2 < 0 ? 1 : -1; |
| return name.compareTo(o.name); |
| } |
| |
| public boolean equals(Object o) { |
| return 0 == compareTo((RowData) o); |
| } |
| |
| public int hashCode() { |
| throw new UnsupportedOperationException(); |
| } |
| } |
| |
| /** |
|  * Prints diagnostic listings to standard output: every language code that has an English name, then the |
|  * languages that qualify for inclusion together with the reasons (territory, status, population), and |
|  * finally the included and excluded language names. |
|  * A language qualifies in a territory when its official status is at least official_regional, or when |
|  * its writing population there exceeds both 1,000,000 and one third of the territory's literate |
|  * population; it qualifies worldwide ("001") when its summed population exceeds 10,000,000. |
|  */ |
| private static void printDefaultLanguagesAndScripts() { |
| |
|     final int minTotalPopulation = 10000000; |
|     final int minTerritoryPopulation = 1000000; |
|     final double minTerritoryPercent = 1.0 / 3; |
|     Map<String, Set<RowData>> languageToReason = new TreeMap<String, Set<RowData>>(); |
|     Counter<String> languageToLiteratePopulation = new Counter<String>(); |
|     NumberFormat nf = NumberFormat.getIntegerInstance(ULocale.ENGLISH); |
|     nf.setGroupingUsed(true); |
|     LanguageTagParser ltp = new LanguageTagParser(); |
|     // NOTE(review): only referenced by the commented-out maximization further down; kept so construction still happens. |
|     LikelySubtags likelySubtags = new LikelySubtags(); |
|     /* |
|      * A. X is a qualified language**, and at least one of the following is true: |
|      * |
|      * 1. X has official status* in any country |
|      * 2. X exceeds a threshold population† of literate users worldwide: 1M |
|      * 3. X exceeds a threshold population† in some country Z: 100K and 20% of Z's population†. |
|      * |
|      * B. X is an exception explicitly approved by the committee or X has minimal |
|      * language coverage‡ in CLDR itself. |
|      * |
|      * NOTE: the thresholds applied by this method are the local constants above (10M worldwide; |
|      * 1M and one third per territory), not the figures quoted in this list. |
|      */ |
|     OfficialStatus minimalStatus = OfficialStatus.official_regional; // OfficialStatus.de_facto_official; |
|     Map<String, String> languages = new TreeMap<String, String>(); |
|     for (String language : standardCodes.getAvailableCodes("language")) { |
|         String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language); |
|         String result = english.getStringValue(path); |
|         if (result != null) { |
|             languages.put(language, result); |
|         } |
|     } |
|     for (Entry<String, String> named : languages.entrySet()) { |
|         System.out.println(named.getKey() + "\t" + named.getValue()); |
|     } |
| |
|     for (String territory : supplementalData.getTerritoriesWithPopulationData()) { |
|         PopulationData territoryPop = supplementalData.getPopulationDataForTerritory(territory); |
|         double territoryPopulation = territoryPop.getLiteratePopulation(); |
|         for (String languageScript : supplementalData.getLanguagesForTerritoryWithPopulationData(territory)) { |
|             PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(languageScript, |
|                 territory); |
|             ltp.set(languageScript); |
|             String language = ltp.getLanguage(); |
|             // if (ltp.getScript().isEmpty()) { |
|             // String max = likelySubtags.maximize(languageScript); |
|             // if (max != null) { |
|             // ltp.set(max).setRegion(""); |
|             // languageScript = ltp.toString(); |
|             // } |
|             // } |
|             boolean add = false; |
|             // #1 |
|             OfficialStatus status = popData.getOfficialStatus(); |
|             if (status.compareTo(minimalStatus) >= 0) { |
|                 add = true; |
|             } |
|             long literatePopulation = getWritingPopulation(popData); |
|             // #2 |
|             languageToLiteratePopulation.add(language, literatePopulation); |
|             // #3 |
|             if (literatePopulation > minTerritoryPopulation |
|                 && literatePopulation > minTerritoryPercent * territoryPopulation) { |
|                 add = true; |
|             } |
|             if (add) { |
|                 add(languageToReason, language, territory, status, literatePopulation); |
|                 // Add the containing regions. The lookup is null-guarded in the same way MaxData.add |
|                 // guards it, so a territory without containers is simply skipped. |
|                 Set<String> containers = Containment.leafToContainer(territory); |
|                 if (containers != null) { |
|                     for (String container : containers) { |
|                         add(languageToReason, language, container, OfficialStatus.unknown, literatePopulation); |
|                     } |
|                 } |
|             } |
|         } |
|     } |
|     // #2, now that we have the data |
|     for (String language : languageToLiteratePopulation.keySet()) { |
|         long totalPop = languageToLiteratePopulation.getCount(language); |
|         if (totalPop > minTotalPopulation) { |
|             add(languageToReason, language, "001", OfficialStatus.unknown, totalPop); |
|         } |
|     } |
| |
|     // Specials |
|     add(languageToReason, "und", "001", OfficialStatus.unknown, 0); |
| |
|     // for (String language : Iso639Data.getAvailable()) { |
|     // Scope scope = Iso639Data.getScope(language); |
|     // Type type = Iso639Data.getType(language); |
|     // if (scope == Scope.Special) { |
|     // add(languageToReason, language, "001", OfficialStatus.unknown, -1); |
|     // } |
|     // } |
|     // print them |
| |
|     System.out.println("Detailed - Including:\t" + languageToReason.size()); |
| |
|     for (Entry<String, Set<RowData>> included : languageToReason.entrySet()) { |
|         String language = included.getKey(); |
|         Set<RowData> reasons = included.getValue(); |
| |
|         // First element of the sorted set: highest status, then largest population (see RowData.compareTo). |
|         RowData topReason = reasons.iterator().next(); |
| |
|         System.out.append(language) |
|             .append("\t") |
|             .append(english.getName(language)) |
|             .append("\t") |
|             .append(topReason.getStatus().toShortString()) |
|             .append("\t") |
|             .append(nf.format(languageToLiteratePopulation.getCount(language))); |
|         for (RowData reason : reasons) { |
|             String status = reason.getStatus().toShortString(); |
|             System.out.append("\t") |
|                 .append(status) |
|                 .append("-") |
|                 .append(reason.getName()) |
|                 .append("-") |
|                 .append(nf.format(reason.getLiteratePopulation())); |
|         } |
|         System.out.append("\n"); |
|     } |
| |
|     // now list them |
| |
|     Set<String> others = new TreeSet<String>(); |
|     others.addAll(standardCodes.getGoodAvailableCodes("language")); |
|     others.removeAll(languageToReason.keySet()); |
|     System.out.println("\nIncluded Languages:\t" + languageToReason.keySet().size()); |
|     showLanguages(languageToReason.keySet(), languageToReason); |
|     System.out.println("\nExcluded Languages:\t" + others.size()); |
|     showLanguages(others, languageToReason); |
| } |
| |
| /** |
|  * Returns the writing population truncated to a long, or the literate population when the |
|  * writing population is NaN. |
|  */ |
| private static long getWritingPopulation(PopulationData popData) { |
|     final double writers = popData.getWritingPopulation(); |
|     final boolean writersUnknown = Double.isNaN(writers); |
|     return writersUnknown ? (long) popData.getLiteratePopulation() : (long) writers; |
| } |
| |
| /** |
|  * Prints the formatted names of the given languages in English collation order, starting a new |
|  * output line whenever the first character changes and separating names on the same line with ", ". |
|  */ |
| private static void showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason) { |
|     final Set<String> displayNames = new TreeSet<String>(Collator.getInstance(ULocale.ENGLISH)); |
|     for (String code : others) { |
|         displayNames.add(getLanguageName(code, languageToReason)); |
|     } |
|     char previousInitial = 0; |
|     for (String displayName : displayNames) { |
|         final char initial = displayName.charAt(0); |
|         final boolean sameGroup = previousInitial == initial; |
|         if (!sameGroup) { |
|             // New initial (including the very first name): break the line. |
|             System.out.println(); |
|         } else if (previousInitial != '\u0000') { |
|             System.out.print(", "); |
|         } |
|         System.out.print(displayName); |
|         previousInitial = initial; |
|     } |
|     System.out.println(); |
| } |
| |
| /** |
|  * Formats a language as "English name [code]-status", where status is the short string of the highest |
|  * official status among the language's reasons, or "S" when the ISO 639 scope of the code is Special. |
|  */ |
| private static String getLanguageName(String language, |
|     Map<String, Set<RowData>> languageToReason) { |
|     OfficialStatus highest = OfficialStatus.unknown; |
|     final Set<RowData> reasons = languageToReason.get(language); |
|     if (reasons != null) { |
|         for (RowData reason : reasons) { |
|             final OfficialStatus candidate = reason.getStatus(); |
|             if (candidate.compareTo(highest) > 0) { |
|                 highest = candidate; |
|             } |
|         } |
|     } |
|     final String shortStatus = highest.toShortString(); |
|     final boolean special = Iso639Data.getScope(language) == Scope.Special; |
|     return english.getName(language) + " [" + language + "]-" + (special ? "S" : shortStatus); |
| } |
| |
| private static void add(Map<String, Set<RowData>> languageToReason, String language, |
| String territoryRaw, OfficialStatus status, long population) { |
| String territory = english.getName("territory", territoryRaw) + " [" + territoryRaw + "]"; |
| Set<RowData> set = languageToReason.get(language); |
| if (set == null) { |
| languageToReason.put(language, set = new TreeSet<RowData>()); |
| } |
| set.add(new RowData(status, territory, population)); |
| } |
| |
| /** |
|  * Derives the default-content locales from the likely-subtags mapping and writes a copy of |
|  * supplementalMetadata.xml, with its defaultContent element replaced, under the generation directory. |
|  * A child locale becomes default content for its parent when both maximize to the same value; |
|  * only the first such child per parent is taken. |
|  * |
|  * @param toMaximized likely-subtags mapping used for maximization |
|  * @throws IOException if reading the existing file or writing the new one fails |
|  */ |
| private static void printDefaultContent(Map<String, String> toMaximized) throws IOException { |
| |
|     Set<String> defaultLocaleContent = new TreeSet<String>(); |
| |
|     // go through all the cldr locales, and add default contents |
|     // now computed from toMaximized |
|     Set<String> available = factory.getAvailable(); |
|     Relation<String, String> toChildren = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); |
|     LanguageTagParser ltp = new LanguageTagParser(); |
| |
|     // System.out.println(maximize("az_Latn_AZ", toMaximized)); |
|     Set<String> hasScript = new TreeSet<String>(); |
| |
|     // first get a mapping to children |
|     for (String locale : available) { |
|         if (locale.equals("root")) { |
|             continue; |
|         } |
|         if (ltp.set(locale).getVariants().size() != 0) { |
|             continue; |
|         } |
|         String parent = LocaleIDParser.getSimpleParent(locale); |
|         if (ltp.getScript().length() != 0) { |
|             hasScript.add(parent); |
|         } |
|         if (parent.equals("root")) { |
|             continue; |
|         } |
|         toChildren.put(parent, locale); |
|     } |
| |
|     // Suppress script for locales for which we only have one locale in common/main. See ticket #7834. |
|     Set<String> suppressScriptLocales = new HashSet<String>(Arrays.asList( |
|         "bm_ML", "en_US", "ha_NG", "iu_CA", "ms_MY", "mn_MN", |
|         "byn_ER", "ff_SN", "dyo_SN", "kk_KZ", "ku_TR", "ky_KG", "ml_IN", "so_SO", "sw_TZ", "wo_SN", "yo_NG", "dje_NE", |
|         "blt_VN")); |
| |
|     // if any have a script, then throw out any that don't have a script (unless they're specifically included.) |
|     Set<String> toRemove = new TreeSet<String>(); |
|     for (String locale : hasScript) { |
|         toRemove.clear(); |
|         Set<String> children = toChildren.getAll(locale); |
|         if (children == null) { |
|             // hasScript can hold a parent that was never registered in toChildren (e.g. "root"). |
|             continue; |
|         } |
|         for (String child : children) { |
|             if (ltp.set(child).getScript().length() == 0 && !suppressScriptLocales.contains(child)) { |
|                 toRemove.add(child); |
|             } |
|         } |
|         if (toRemove.size() != 0) { |
|             System.out.println("Removing:\t" + locale + "\t" + toRemove + "\tfrom\t" + children); |
|             toChildren.removeAll(locale, toRemove); |
|         } |
|     } |
| |
|     // we add a child as a default locale if it has the same maximization |
|     main: for (String locale : toChildren.keySet()) { |
|         String maximized = maximize(locale, toMaximized); |
|         if (maximized == null) { |
|             if (SHOW_ADD) System.out.println("Missing maximized:\t" + locale); |
|             continue; |
|         } |
|         Set<String> children = toChildren.getAll(locale); |
|         Map<String, String> debugStuff = new TreeMap<String, String>(); |
|         for (String child : children) { |
|             String maximizedChild = maximize(child, toMaximized); |
|             if (maximized.equals(maximizedChild)) { |
|                 defaultLocaleContent.add(child); |
|                 continue main; |
|             } |
|             debugStuff.put(child, maximizedChild); |
|         } |
|         if (SHOW_ADD) System.out.println("Can't find maximized: " + locale + "=" + maximized |
|             + "\tin\t" + debugStuff); |
|     } |
| |
|     defaultLocaleContent.remove("und_ZZ"); // und_ZZ isn't ever a real locale. |
| |
|     showDefaultContentDifferencesAndFix(defaultLocaleContent); |
| |
|     Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalMetadata.xml"); |
|     // The reader and the log are released even when copying fails part-way through. |
|     try (BufferedReader oldFile = FileUtilities.openUTF8Reader(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "supplementalMetadata.xml")) { |
|         // Copy everything before the existing defaultContent element. |
|         CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<defaultContent locales=\"\\s*"), Log.getLog(), false); |
| |
|         String sep = CldrUtility.LINE_SEPARATOR + "\t\t\t"; |
|         String broken = CldrUtility.breakLines(CldrUtility.join(defaultLocaleContent, " "), sep, |
|             PatternCache.get("(\\S)\\S*").matcher(""), 80); |
| |
|         Log.println("\t\t<defaultContent locales=\"" + broken + "\""); |
|         Log.println("\t\t/>"); |
| |
|         // Log.println("</supplementalData>"); |
|         CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*/>\\s*(<!--.*)?"), null, true); // skip to matching > |
|         CldrUtility.copyUpTo(oldFile, null, Log.getLog(), true); // copy the rest |
|     } finally { |
|         Log.close(); |
|     } |
| } |
| |
| // private static void oldAlgorithm(Map<String,String> toMaximized) { |
| // Set<String> defaultContentLocales = supplementalData.getDefaultContentLocales(); |
| // LanguageTagParser parser = new LanguageTagParser(); |
| // for (String locale : defaultContentLocales) { |
| // String parent = parser.getParent(locale); |
| // toMaximized.put(parent, locale); |
| // if (SHOW_ADD) System.out.println("Adding:\t" + parent + "\t=>\t" + locale + "\t\tDefaultContent"); |
| // } |
| // |
| // for (String[] specialCase : SpecialCases) { |
| // toMaximized.put(specialCase[0], specialCase[1]); |
| // if (SHOW_ADD) System.out.println("Adding:\t" + specialCase[0] + "\t=>\t" + specialCase[1] + "\t\tSpecial"); |
| // } |
| // |
| // // recurse and close |
| // closeMapping(toMaximized); |
| // |
| // addScript(toMaximized, parser); |
| // |
| // closeMapping(toMaximized); |
| // |
| // addLanguageScript(toMaximized, parser); |
| // |
| // closeMapping(toMaximized); |
| // |
| // addLanguageCountry(toMaximized, parser); |
| // |
| // closeMapping(toMaximized); |
| // |
| // addCountries(toMaximized); |
| // addScript(toMaximized, parser); |
| // closeMapping(toMaximized); |
| // closeUnd(toMaximized); |
| // |
| // addDeprecated(toMaximized); |
| // |
| // closeMapping(toMaximized); |
| // |
| // checkConsistency(toMaximized); |
| // } |
| |
| private static class MaxData { |
| Relation<String, Row.R3<Double, String, String>> languages = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); |
| Map<String, Counter<String>> languagesToScripts = new TreeMap<String, Counter<String>>(); |
| Map<String, Counter<String>> languagesToRegions = new TreeMap<String, Counter<String>>(); |
| |
| Relation<String, Row.R3<Double, String, String>> scripts = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); |
| Map<String, Counter<String>> scriptsToLanguages = new TreeMap<String, Counter<String>>(); |
| Map<String, Counter<String>> scriptsToRegions = new TreeMap<String, Counter<String>>(); |
| |
| Relation<String, Row.R3<Double, String, String>> regions = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); |
| Map<String, Counter<String>> regionsToLanguages = new TreeMap<String, Counter<String>>(); |
| Map<String, Counter<String>> regionsToScripts = new TreeMap<String, Counter<String>>(); |
| |
| Map<String, Counter<Row.R2<String, String>>> containersToLanguage = new TreeMap<String, Counter<Row.R2<String, String>>>(); |
| Relation<String, Row.R4<Double, String, String, String>> containersToLangRegion = Relation.of( |
| new TreeMap<String, Set<Row.R4<Double, String, String, String>>>(), TreeSet.class); |
| |
| Relation<Row.R2<String, String>, Row.R2<Double, String>> languageScripts = Relation.of( |
| new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), |
| TreeSet.class); |
| Relation<Row.R2<String, String>, Row.R2<Double, String>> scriptRegions = Relation.of( |
| new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), |
| TreeSet.class); |
| Relation<Row.R2<String, String>, Row.R2<Double, String>> languageRegions = Relation.of( |
| new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), |
| TreeSet.class); |
| |
| /** |
| * Add population information. "order" is the negative of the population (makes the first be the highest). |
| * @param language |
| * @param script |
| * @param region |
| * @param order |
| */ |
| void add(String language, String script, String region, Double order) { |
| if (language.equals("cpp")) { |
| System.out.println(language + "\t" + script + "\t" + region + "\t" + -order); |
| } |
| languages.put(language, Row.of(order, script, region)); |
| // addCounter(languagesToScripts, language, script, order); |
| // addCounter(languagesToRegions, language, region, order); |
| |
| scripts.put(script, Row.of(order, language, region)); |
| // addCounter(scriptsToLanguages, script, language, order); |
| // addCounter(scriptsToRegions, script, region, order); |
| |
| regions.put(region, Row.of(order, language, script)); |
| // addCounter(regionsToLanguages, region, language, order); |
| // addCounter(regionsToScripts, region, script, order); |
| |
| languageScripts.put(Row.of(language, script), Row.of(order, region)); |
| scriptRegions.put(Row.of(script, region), Row.of(order, language)); |
| languageRegions.put(Row.of(language, region), Row.of(order, script)); |
| |
| Set<String> containerSet = Containment.leafToContainer(region); |
| if (containerSet != null) { |
| for (String container : containerSet) { |
| |
| containersToLangRegion.put(container, Row.of(order, language, script, region)); |
| Counter<R2<String, String>> data = containersToLanguage.get(container); |
| if (data == null) { |
| containersToLanguage.put(container, data = new Counter<R2<String, String>>()); |
| } |
| data.add(Row.of(language, script), (long) (double) order); |
| |
| } |
| } |
| |
| if (SHOW_ADD) System.out.println("Data:\t" + language + "\t" + script + "\t" + region + "\t" + order); |
| } |
| // private void addCounter(Map<String, Counter<String>> map, String key, String key2, Double count) { |
| // Counter<String> counter = map.get(key); |
| // if (counter == null) { |
| // map.put(key, counter = new Counter<String>()); |
| // } |
| // counter.add(key2, count.longValue()); |
| // } |
| } |
| |
// The three MIN_* values are meant as thresholds for a size filter on languages of unknown official status
// in tryDifferentAlgorithm. NOTE(review): that filter is inactive (its skip/continue is commented out),
// so they do not currently exclude anything.
// Absolute floor (literate population) for the per-territory threshold.
| private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000; |
// Share of a territory's literate population used as the starting threshold.
| private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20; |
// Lower absolute bar meant for language_region combinations that are available CLDR locales.
| private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000; |
// Factor applied to the (negative) population order of languages whose official status is unknown.
| private static final double UNOFFICIAL_SCALE_DOWN = 0.2; |
| |
// Formatters; in the visible code they are used only for SHOW_ADD diagnostic output.
| private static NumberFormat percent = NumberFormat.getPercentInstance(); |
| private static NumberFormat number = NumberFormat.getIntegerInstance(); |
| |
| private static void tryDifferentAlgorithm(Map<String, String> toMaximized) { |
| // we are going to try a different approach. |
| // first gather counts for maximized values |
| // Set<Row.R3<String,String,String>,Double> rowsToCounts = new TreeMap(); |
| MaxData maxData = new MaxData(); |
| Set<String> cldrLocales = factory.getAvailable(); |
| Set<String> otherTerritories = new TreeSet<String>(standardCodes.getGoodAvailableCodes("territory")); |
| |
| // process all the information to get the top values for each triple. |
| // each of the combinations of 1 or 2 components gets to be a key. |
| for (String region : supplementalData.getTerritoriesWithPopulationData()) { |
| otherTerritories.remove(region); |
| PopulationData regionData = supplementalData.getPopulationDataForTerritory(region); |
| final double literateTerritoryPopulation = regionData.getLiteratePopulation(); |
| // we need any unofficial language to meet a certain absolute size requirement and proportion size |
| // requirement. |
| // so the bar is x percent of the population, reset up to y absolute size. |
| double minimalLiteratePopulation = literateTerritoryPopulation * MIN_UNOFFICIAL_LANGUAGE_PROPORTION; |
| if (minimalLiteratePopulation < MIN_UNOFFICIAL_LANGUAGE_SIZE) { |
| minimalLiteratePopulation = MIN_UNOFFICIAL_LANGUAGE_SIZE; |
| } |
| |
| for (String writtenLanguage : supplementalData.getLanguagesForTerritoryWithPopulationData(region)) { |
| PopulationData data = supplementalData.getLanguageAndTerritoryPopulationData(writtenLanguage, region); |
| final double literatePopulation = getWritingPopulation(data); //data.getLiteratePopulation(); |
| double order = -literatePopulation; // negative so we get the inverse order |
| |
| if (data.getOfficialStatus() == OfficialStatus.unknown) { |
| final String locale = writtenLanguage + "_" + region; |
| if (literatePopulation >= minimalLiteratePopulation) { |
| // ok, skip |
| } else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE && cldrLocales.contains(locale)) { |
| // ok, skip |
| } else { |
| // if (SHOW_ADD) |
| // System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t" |
| // + english.getName(locale) |
| // + "\t-- too small:\t" + number.format(literatePopulation)); |
| // continue; |
| } |
| order *= UNOFFICIAL_SCALE_DOWN; |
| if (SHOW_ADD) |
| System.out.println("Retaining\t" + writtenLanguage + "\t" + region + "\t" |
| + english.getName(locale) |
| + "\t" + number.format(literatePopulation) |
| + "\t" + percent.format(literatePopulation / literateTerritoryPopulation) |
| + (cldrLocales.contains(locale) ? "\tin-CLDR" : "")); |
| } |
| String script; |
| String language = writtenLanguage; |
| final int pos = writtenLanguage.indexOf('_'); |
| if (pos > 0) { |
| language = writtenLanguage.substring(0, pos); |
| script = writtenLanguage.substring(pos + 1); |
| } else { |
| script = getScriptForLocale2(language); |
| } |
| maxData.add(language, script, region, order); |
| } |
| } |
| |
| LanguageTagParser additionLtp = new LanguageTagParser(); |
| |
| for (String addition : MAX_ADDITIONS) { |
| additionLtp.set(addition); |
| String lan = additionLtp.getLanguage(); |
| Set<R3<Double, String, String>> key = maxData.languages.get(lan); |
| if (key == null) { |
| maxData.add(lan, additionLtp.getScript(), additionLtp.getRegion(), 1.0); |
| } else { |
| int debug = 0; |
| } |
| } |
| |
| for (Entry<String, Collection<String>> entry : DeriveScripts.getLanguageToScript().asMap().entrySet()) { |
| String language = entry.getKey(); |
| final Collection<String> values = entry.getValue(); |
| if (values.size() != 1) { |
| continue; // skip, no either way |
| } |
| Set<R3<Double, String, String>> old = maxData.languages.get(language); |
| if (!maxData.languages.containsKey(language)) { |
| maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0); |
| } |
| } |
| |
| // add others, with English default |
| for (String region : otherTerritories) { |
| if (region.length() == 3) continue; // FIX ONCE WE ADD REGIONS |
| maxData.add("en", "Latn", region, 1.0); |
| } |
| |
| // get a reverse mapping, so that we can add the aliases |
| |
| Map<String, R2<List<String>, String>> languageAliases = SupplementalDataInfo.getInstance().getLocaleAliasInfo() |
| .get("language"); |
| for (Entry<String, R2<List<String>, String>> str : languageAliases.entrySet()) { |
| String reason = str.getValue().get1(); |
| if ("overlong".equals(reason) || "bibliographic".equals(reason) || "macrolanguage".equals(reason)) { |
| continue; |
| } |
| List<String> replacements = str.getValue().get0(); |
| if (replacements == null) { |
| continue; |
| } |
| String goodLanguage = replacements.get(0); |
| |
| String badLanguage = str.getKey(); |
| if (badLanguage.contains("_")) { |
| continue; |
| } |
| if (deprecatedISONotInLST.contains(badLanguage)) { |
| continue; |
| } |
| Set<R3<Double, String, String>> goodLanguageData = maxData.languages.getAll(goodLanguage); |
| if (goodLanguageData == null) { |
| continue; |
| } |
| R3<Double, String, String> value = goodLanguageData.iterator().next(); |
| final String script = value.get1(); |
| final String region = value.get2(); |
| maxData.add(badLanguage, script, region, 1.0); |
| System.out.println("Adding aliases: " + badLanguage + ", " + script + ", " + region + ", " + reason); |
| } |
| |
| // now, get the best for each one |
| for (String language : maxData.languages.keySet()) { |
| R3<Double, String, String> value = maxData.languages.getAll(language).iterator().next(); |
| final Comparable<String> script = value.get1(); |
| final Comparable<String> region = value.get2(); |
| add(language, language + "_" + script + "_" + region, toMaximized, "L->SR", Override.REPLACE_EXISTING, |
| SHOW_ADD); |
| } |
| for (String language : maxData.languagesToScripts.keySet()) { |
| String script = maxData.languagesToScripts.get(language).getKeysetSortedByCount(true).iterator().next(); |
| add(language, language + "_" + script, toMaximized, "L->S", Override.REPLACE_EXISTING, SHOW_ADD); |
| } |
| for (String language : maxData.languagesToRegions.keySet()) { |
| String region = maxData.languagesToRegions.get(language).getKeysetSortedByCount(true).iterator().next(); |
| add(language, language + "_" + region, toMaximized, "L->R", Override.REPLACE_EXISTING, SHOW_ADD); |
| } |
| |
| for (String script : maxData.scripts.keySet()) { |
| R3<Double, String, String> value = maxData.scripts.getAll(script).iterator().next(); |
| final Comparable<String> language = value.get1(); |
| final Comparable<String> region = value.get2(); |
| add("und_" + script, language + "_" + script + "_" + region, toMaximized, "S->LR", |
| Override.REPLACE_EXISTING, SHOW_ADD); |
| } |
| for (String script : maxData.scriptsToLanguages.keySet()) { |
| String language = maxData.scriptsToLanguages.get(script).getKeysetSortedByCount(true).iterator().next(); |
| add("und_" + script, language + "_" + script, toMaximized, "S->L", Override.REPLACE_EXISTING, SHOW_ADD); |
| } |
| for (String script : maxData.scriptsToRegions.keySet()) { |
| String region = maxData.scriptsToRegions.get(script).getKeysetSortedByCount(true).iterator().next(); |
| add("und_" + script, "und_" + script + "_" + region, toMaximized, "S->R", Override.REPLACE_EXISTING, |
| SHOW_ADD); |
| } |
| |
| for (String region : maxData.regions.keySet()) { |
| R3<Double, String, String> value = maxData.regions.getAll(region).iterator().next(); |
| final Comparable<String> language = value.get1(); |
| final Comparable<String> script = value.get2(); |
| add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R->LS", |
| Override.REPLACE_EXISTING, SHOW_ADD); |
| } |
| for (String region : maxData.regionsToLanguages.keySet()) { |
| String language = maxData.regionsToLanguages.get(region).getKeysetSortedByCount(true).iterator().next(); |
| add("und_" + region, language + "_" + region, toMaximized, "R->L", Override.REPLACE_EXISTING, SHOW_ADD); |
| } |
| for (String region : maxData.regionsToScripts.keySet()) { |
| String script = maxData.regionsToScripts.get(region).getKeysetSortedByCount(true).iterator().next(); |
| add("und_" + region, "und_" + script + "_" + region, toMaximized, "R->S", Override.REPLACE_EXISTING, |
| SHOW_ADD); |
| } |
| |
| for (Entry<String, Counter<R2<String, String>>> containerAndInfo : maxData.containersToLanguage.entrySet()) { |
| String region = containerAndInfo.getKey(); |
| if (region.equals("001")) { |
| continue; |
| } |
| Counter<R2<String, String>> data = containerAndInfo.getValue(); |
| Set<R2<String, String>> keysetSortedByCount = data.getKeysetSortedByCount(true); |
| if (SHOW_CONTAINERS) { // debug |
| System.out.println("Container2L:\t" + region + "\t" + shorten(data.getEntrySetSortedByCount(true, null))); |
| System.out.println("Container2LR:\t" + region + "\t" + maxData.containersToLangRegion.get(region)); |
| } |
| R2<String, String> value = keysetSortedByCount.iterator().next(); // will get most negative |
| final Comparable<String> language = value.get0(); |
| final Comparable<String> script = value.get1(); |
| |
| // fix special cases like es-419, where a locale exists. |
| // for those cases, what we add as output is the container. Otherwise the region. |
| Set<String> skipLanguages = cldrContainerToLanguages.get(region); |
| if (skipLanguages != null |
| && skipLanguages.contains(language)) { |
| add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R*->LS", |
| Override.REPLACE_EXISTING, SHOW_ADD); |
| continue; |
| } |
| |
| // we now have the best language and script. Find the best region for that |
| for (R4<Double, String, String, String> e : maxData.containersToLangRegion.get(region)) { |
| final Comparable<String> language2 = e.get1(); |
| final Comparable<String> script2 = e.get2(); |
| if (language2.equals(language) && script2.equals(script)) { |
| add("und_" + region, language + "_" + script + "_" + e.get3(), toMaximized, "R*->LS", |
| Override.REPLACE_EXISTING, SHOW_ADD); |
| break; |
| } |
| } |
| } |
| |
| for (R2<String, String> languageScript : maxData.languageScripts.keySet()) { |
| R2<Double, String> value = maxData.languageScripts.getAll(languageScript).iterator().next(); |
| final Comparable<String> language = languageScript.get0(); |
| final Comparable<String> script = languageScript.get1(); |
| final Comparable<String> region = value.get1(); |
| add(language + "_" + script, language + "_" + script + "_" + region, toMaximized, "LS->R", |
| Override.REPLACE_EXISTING, SHOW_ADD); |
| } |
| |
| for (R2<String, String> scriptRegion : maxData.scriptRegions.keySet()) { |
| R2<Double, String> value = maxData.scriptRegions.getAll(scriptRegion).iterator().next(); |
| final Comparable<String> script = scriptRegion.get0(); |
| final Comparable<String> region = scriptRegion.get1(); |
| final Comparable<String> language = value.get1(); |
| add("und_" + script + "_" + region, language + "_" + script + "_" + region, toMaximized, "SR->L", |
| Override.REPLACE_EXISTING, SHOW_ADD); |
| } |
| |
| for (R2<String, String> languageRegion : maxData.languageRegions.keySet()) { |
| R2<Double, String> value = maxData.languageRegions.getAll(languageRegion).iterator().next(); |
| final Comparable<String> language = languageRegion.get0(); |
| final Comparable<String> region = languageRegion.get1(); |
| final Comparable<String> script = value.get1(); |
| add(language + "_" + region, language + "_" + script + "_" + region, toMaximized, "LR->S", |
| Override.REPLACE_EXISTING, SHOW_ADD); |
| } |
| |
| // get the script info from metadata as fallback |
| |
| TreeSet<String> sorted = new TreeSet<String>(ScriptMetadata.getScripts()); |
| for (String script : sorted) { |
| Info i = ScriptMetadata.getInfo(script); |
| String likelyLanguage = i.likelyLanguage; |
| String originCountry = i.originCountry; |
| final String result = likelyLanguage + "_" + script + "_" + originCountry; |
| add("und_" + script, result, toMaximized, "S->LR•", |
| Override.KEEP_EXISTING, SHOW_ADD); |
| add(likelyLanguage, result, toMaximized, "L->SR•", |
| Override.KEEP_EXISTING, SHOW_ADD); |
| } |
| |
| // add overrides |
| for (String key : LANGUAGE_OVERRIDES.keySet()) { |
| add(key, LANGUAGE_OVERRIDES.get(key), toMaximized, "OVERRIDE", Override.REPLACE_EXISTING, true); |
| } |
| } |
| |
| public static String shorten(Object data) { |
| String info = data.toString(); |
| if (info.length() > 255) { |
| info = info.substring(0, 127) + "…"; |
| } |
| return info; |
| } |
| |
| private static void doAlt(Map<String, String> toMaximized) { |
| // TODO Auto-generated method stub |
| Map<String, String> temp = new TreeMap<String, String>(); |
| for (String locale : toMaximized.keySet()) { |
| String target = toMaximized.get(locale); |
| temp.put(toAlt(locale, true), toAlt(target, true)); |
| } |
| toMaximized.clear(); |
| toMaximized.putAll(temp); |
| } |
| |
| public static String maximize(String languageTag, Map<String, String> toMaximized) { |
| LanguageTagParser ltp = new LanguageTagParser(); |
| |
| // clean up the input by removing Zzzz, ZZ, and changing "" into und. |
| ltp.set(languageTag); |
| String language = ltp.getLanguage(); |
| String region = ltp.getRegion(); |
| String script = ltp.getScript(); |
| boolean changed = false; |
| if (language.equals("")) { |
| ltp.setLanguage(language = "und"); |
| changed = true; |
| } |
| if (region.equals(UNKNOWN_SCRIPT)) { |
| ltp.setScript(script = ""); |
| changed = true; |
| } |
| if (ltp.getRegion().equals(UNKNOWN_REGION)) { |
| ltp.setRegion(region = ""); |
| changed = true; |
| } |
| if (changed) { |
| languageTag = ltp.toString(); |
| } |
| // check whole |
| String result = toMaximized.get(languageTag); |
| if (result != null) { |
| return result; |
| } |
| // try empty region |
| if (region.length() != 0) { |
| result = toMaximized.get(ltp.setRegion("").toString()); |
| if (result != null) { |
| return ltp.set(result).setRegion(region).toString(); |
| } |
| ltp.setRegion(region); // restore |
| } |
| // try empty script |
| if (script.length() != 0) { |
| result = toMaximized.get(ltp.setScript("").toString()); |
| if (result != null) { |
| return ltp.set(result).setScript(script).toString(); |
| } |
| // try empty script and region |
| if (region.length() != 0) { |
| result = toMaximized.get(ltp.setRegion("").toString()); |
| if (result != null) { |
| return ltp.set(result).setScript(script).setRegion(region).toString(); |
| } |
| } |
| } |
| if (!language.equals("und") && script.length() != 0 && region.length() != 0) { |
| return languageTag; // it was ok, and we couldn't do anything with it |
| } |
| return null; // couldn't maximize |
| } |
| |
| public static String minimize(String input, Map<String, String> toMaximized, boolean favorRegion) { |
| if (input.equals("nb_Latn_SJ")) { |
| System.out.print(""); // debug |
| } |
| String maximized = maximize(input, toMaximized); |
| if (maximized == null) { |
| return null; // failed |
| } |
| LanguageTagParser ltp = new LanguageTagParser().set(maximized); |
| String language = ltp.getLanguage(); |
| String region = ltp.getRegion(); |
| String script = ltp.getScript(); |
| // try building up from shorter to longer, and find the first that matches |
| // could be more optimized, but for this code we want simplest |
| String[] trials = { language, |
| language + TAG_SEPARATOR + (favorRegion ? region : script), |
| language + TAG_SEPARATOR + (!favorRegion ? region : script) }; |
| for (String trial : trials) { |
| String newMaximized = maximize(trial, toMaximized); |
| if (maximized.equals(newMaximized)) { |
| return trial; |
| } |
| } |
| return maximized; |
| } |
| |
| // /** |
| // * Verify that we can map from each language, script, and country to something. |
| // * @param toMaximized |
| // */ |
| // private static void checkConsistency(Map<String, String> toMaximized) { |
| // Map<String,String> needMappings = new TreeMap(); |
| // LanguageTagParser parser = new LanguageTagParser(); |
| // for (String maximized : new TreeSet<String>(toMaximized.values())) { |
| // parser.set(maximized); |
| // final String language = parser.getLanguage(); |
| // final String script = parser.getScript(); |
| // final String region = parser.getRegion(); |
| // if (language.length() == 0 || script.length() == 0 || region.length() == 0) { |
| // failure(" { \"" + maximized + "\", \"" + maximized + "\" }, // " + english.getName(maximized) + |
| // "\t\tFailed-Consistency"); |
| // continue; |
| // } |
| // addIfNotIn(language, maximized, needMappings, toMaximized, "Consistency"); |
| // addIfNotIn(language + "_" + script, maximized, needMappings, toMaximized, "Consistency"); |
| // addIfNotIn(language + "_" + region, maximized, needMappings, toMaximized, "Consistency"); |
| // addIfNotIn("und_" + script, maximized, needMappings, toMaximized, "Consistency"); |
| // addIfNotIn("und_" + script + "_" + region, maximized, needMappings, toMaximized, "Consistency"); |
| // addIfNotIn("und_" + region, maximized, needMappings, toMaximized, "Consistency"); |
| // } |
| // toMaximized.putAll(needMappings); |
| // } |
| |
| // private static void failure(String string) { |
| // System.out.println(string); |
| // errorCount++; |
| // } |
| |
| // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Map<String, String> |
| // otherToCheck, String kind) { |
| // addIfNotIn(key, value, toAdd, otherToCheck == null ? null : otherToCheck.keySet(), null, kind); |
| // } |
| |
| // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Set<String> skipKey, |
| // Set<String> skipValue, String kind) { |
| // if (!key.equals(value) |
| // && !toAdd.containsKey(key) |
| // && (skipKey == null || !skipKey.contains(key)) |
| // && (skipValue == null || !skipValue.contains(value))) { |
| // add(key, value, toAdd, kind); |
| // } |
| // } |
| |
/**
 * Policy that add(key, value, ...) applies when the key already has a value in the target map.
 * NOTE: this enum shares its simple name with java.lang.Override and shadows it inside this class.
 */
| enum Override { |
// KEEP_EXISTING: add(...) returns without touching an existing entry.
// REPLACE_EXISTING: add(...) overwrites an existing entry (it still returns early if the value is unchanged).
| KEEP_EXISTING, REPLACE_EXISTING |
| } |
| |
| private static void add(String key, String value, Map<String, String> toAdd, String kind, Override override, |
| boolean showAction) { |
| if (key.equals(DEBUG_ADD_KEY)) { |
| System.out.println("*debug*"); |
| } |
| String oldValue = toAdd.get(key); |
| if (oldValue == null) { |
| if (showAction) { |
| System.out.println("Adding:\t\t" + getName(key) + "\t=>\t" + getName(value) + "\t\t\t\t" + kind); |
| } |
| } else if (override == Override.KEEP_EXISTING || value.equals(oldValue)) { |
| // if (showAction) { |
| // System.out.println("Skipping:\t" + key + "\t=>\t" + value + "\t\t\t\t" + kind); |
| // } |
| return; |
| } else { |
| if (showAction) { |
| System.out.println("Replacing:\t" + getName(key) + "\t=>\t" + getName(value) + "\t, was\t" + getName(oldValue) + "\t\t" + kind); |
| } |
| } |
| toAdd.put(key, value); |
| } |
| |
// Delegates to ConvertLanguageData.getLanguageCodeAndName(value); add(...) uses it to render keys and values in its log lines.
| private static String getName(String value) { |
| return ConvertLanguageData.getLanguageCodeAndName(value); |
| } |
| |
| // private static void addCountries(Map<String, String> toMaximized) { |
| // Map <String, Map<String, Double>> scriptToLanguageToSize = new TreeMap(); |
| // |
| // for (String territory : supplementalData.getTerritoriesWithPopulationData()) { |
| // Set<String> languages = supplementalData.getLanguagesForTerritoryWithPopulationData(territory); |
| // String biggestOfficial = null; |
| // double biggest = -1; |
| // for (String language : languages) { |
| // PopulationData info = supplementalData.getLanguageAndTerritoryPopulationData(language, territory); |
| // // add to info about script |
| // |
| // String script = getScriptForLocale(language); |
| // if (script != null) { |
| // Map<String, Double> languageInfo = scriptToLanguageToSize.get(script); |
| // if (languageInfo == null) scriptToLanguageToSize.put(script, languageInfo = new TreeMap()); |
| // String baseLanguage = language; |
| // int pos = baseLanguage.indexOf('_'); |
| // if (pos >= 0) { |
| // baseLanguage = baseLanguage.substring(0,pos); |
| // } |
| // Double size = languageInfo.get(baseLanguage); |
| // languageInfo.put(baseLanguage, (size == null ? 0 : size) + info.getLiteratePopulation()); |
| // } |
| // |
| // |
| // final OfficialStatus officialStatus = info.getOfficialStatus(); |
| // if (officialStatus == OfficialStatus.de_facto_official || officialStatus == OfficialStatus.official) { |
| // double size2 = info.getLiteratePopulation(); |
| // if (biggest < size2) { |
| // biggest = size2; |
| // biggestOfficial = language; |
| // } |
| // } |
| // } |
| // if (biggestOfficial != null) { |
| // final String replacementTag = "und_" + territory; |
| // String maximized = biggestOfficial + "_" + territory; |
| // toMaximized.put(replacementTag, maximized); |
| // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tLanguage-Territory"); |
| // } |
| // } |
| // |
| // for (String script : scriptToLanguageToSize.keySet()) { |
| // String biggestOfficial = null; |
| // double biggest = -1; |
| // |
| // final Map<String, Double> languageToSize = scriptToLanguageToSize.get(script); |
| // for (String language : languageToSize.keySet()) { |
| // double size = languageToSize.get(language); |
| // if (biggest < size) { |
| // biggest = size; |
| // biggestOfficial = language; |
| // } |
| // } |
| // if (biggestOfficial != null) { |
| // final String replacementTag = "und_" + script; |
| // String maximized = biggestOfficial + "_" + script; |
| // toMaximized.put(replacementTag, maximized); |
| // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tUnd-Script"); |
| // } |
| // } |
| // } |
| |
| // private static void closeUnd(Map<String, String> toMaximized) { |
| // Map<String,String> toAdd = new TreeMap<String,String>(); |
| // for (String oldSource : toMaximized.keySet()) { |
| // String maximized = toMaximized.get(oldSource); |
| // if (!maximized.startsWith("und")) { |
| // int pos = maximized.indexOf("_"); |
| // if (pos >= 0) { |
| // addIfNotIn( "und" + maximized.substring(pos), maximized, toAdd, toMaximized, "CloseUnd"); |
| // } |
| // } |
| // } |
| // toMaximized.putAll(toAdd); |
| // } |
| |
| /** |
| * Generate tags where the deprecated values map to the expanded values |
| * |
| * @param toMaximized |
| */ |
| // private static void addDeprecated(Map<String, String> toMaximized) { |
| // Map<String, Map<String, List<String>>> typeToTagToReplacement = supplementalData.getLocaleAliasInfo(); |
| // LanguageTagParser temp = new LanguageTagParser(); |
| // LanguageTagParser tagParsed = new LanguageTagParser(); |
| // LanguageTagParser replacementParsed = new LanguageTagParser(); |
| // Map<String,String> toAdd = new TreeMap<String,String>(); |
| // while (true) { |
| // toAdd.clear(); |
| // for (String type : typeToTagToReplacement.keySet()) { |
| // if (type.equals("variant") || type.equals("zone")) continue; |
| // boolean addUnd = !type.equals("language"); |
| // |
| // Map<String, List<String>> tagToReplacement = typeToTagToReplacement.get(type); |
| // System.out.println("*" + type + " = " + tagToReplacement); |
| // |
| // for (String tag: tagToReplacement.keySet()) { |
| // |
| // final List<String> list = tagToReplacement.get(tag); |
| // if (list == null) continue; // we don't have any information |
| // String replacement = list.get(0); |
| // |
| // // only do multiples |
| // if (tag.contains("_") || !replacement.contains("_")) { |
| // continue; |
| // } |
| // |
| // // we now have a tag and a replacement value |
| // // make parsers that we can use |
| // try { |
| // tagParsed.set(addUnd ? "und-" + tag : tag); |
| // replacementParsed.set(addUnd ? "und-" + replacement : replacement); |
| // } catch (RuntimeException e) { |
| // continue; |
| // } |
| // addIfNotIn(tag, replacement, toAdd, toMaximized,"Deprecated"); |
| // |
| // for (String locale : toMaximized.keySet()) { |
| // String maximized = toMaximized.get(locale); |
| // addIfMatches(temp.set(locale), maximized, replacementParsed, tagParsed, toAdd, toMaximized); |
| // addIfMatches(temp.set(maximized), maximized, replacementParsed, tagParsed, toAdd, toMaximized); |
| // } |
| // } |
| // } |
| // if (toAdd.size() == 0) { |
| // break; |
| // } |
| // toMaximized.putAll(toAdd); |
| // } |
| // } |
| |
| // private static void addIfMatches(LanguageTagParser locale, String maximized, LanguageTagParser tagParsed, |
| // LanguageTagParser replacementParsed, Map<String, String> toAdd, Map<String, String> toMaximized) { |
| // if (!tagParsed.getLanguage().equals(locale.getLanguage()) && !tagParsed.getLanguage().equals("und")) { |
| // return; |
| // } |
| // if (!tagParsed.getScript().equals(locale.getScript()) && !tagParsed.getScript().equals("")) { |
| // return; |
| // } |
| // if (!tagParsed.getRegion().equals(locale.getRegion()) && !tagParsed.getRegion().equals("")) { |
| // return; |
| // } |
| // if (!replacementParsed.getLanguage().equals("und")) { |
| // locale.setLanguage(replacementParsed.getLanguage()); |
| // } |
| // if (!replacementParsed.getScript().equals("")) { |
| // locale.setScript(replacementParsed.getScript()); |
| // } |
| // if (!replacementParsed.getRegion().equals("")) { |
| // locale.setRegion(replacementParsed.getRegion()); |
| // } |
| // addIfNotIn(locale.toString(), maximized, toAdd, toMaximized,"Deprecated"); |
| // } |
| |
| // private static int getSubtagPosition(String locale, String subtags) { |
| // int pos = -1; |
| // while (true) { |
| // pos = locale.indexOf(subtags, pos + 1); |
| // if (pos < 0) return -1; |
| // // make sure boundaries are ok |
| // if (pos != 0) { |
| // char charBefore = locale.charAt(pos-1); |
| // if (charBefore != '_' && charBefore != '_') return -1; |
| // } |
| // int limit = pos + subtags.length(); |
| // if (limit != locale.length()) { |
| // char charAfter = locale.charAt(limit); |
| // if (charAfter != '_' && charAfter != '_') return -1; |
| // } |
| // return pos; |
| // } |
| // } |
| |
| /* |
| * Format |
| * const DefaultSubtags default_subtags[] = { |
| * { |
| * // Afar => Afar (Latin, Ethiopia) |
| * "aa", |
| * "aa_Latn_ET" |
| * },{ |
| * // Afrikaans => Afrikaans (Latin, South Africa) |
| * "af", |
| * "af_Latn_ZA" |
| * },{ |
| */ |
| |
| private static void printLikelySubtags(Map<String, String> fluffup) throws IOException { |
| |
| PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, |
| "/supplemental/likelySubtags" + (OUTPUT_STYLE == OutputStyle.XML ? ".xml" : ".txt")); |
| String spacing = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t" : " "; |
| String header = OUTPUT_STYLE != OutputStyle.XML ? "const MapToMaximalSubtags default_subtags[] = {" |
| : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + CldrUtility.LINE_SEPARATOR |
| + "<!DOCTYPE supplementalData SYSTEM \"../../common/dtd/ldmlSupplemental.dtd\">" |
| + CldrUtility.LINE_SEPARATOR |
| + "<!--" |
| + CldrUtility.LINE_SEPARATOR |
| + CldrUtility.getCopyrightString() |
| + CldrUtility.LINE_SEPARATOR |
| + "-->" |
| + CldrUtility.LINE_SEPARATOR |
| + "<!--" |
| + CldrUtility.LINE_SEPARATOR |
| + "Likely subtags data is generated programatically from CLDR's language/territory/population" + CldrUtility.LINE_SEPARATOR |
| + "data using the GenerateMaximalLocales tool. Under normal circumstances, this file should" + CldrUtility.LINE_SEPARATOR |
| + "not be patched by hand, as any changes made in that fashion may be lost." |
| + CldrUtility.LINE_SEPARATOR |
| + "-->" |
| + CldrUtility.LINE_SEPARATOR |
| + "<supplementalData>" + CldrUtility.LINE_SEPARATOR |
| + " <version number=\"$" + |
| "Revision$\"/>" + CldrUtility.LINE_SEPARATOR |
| + " <likelySubtags>"; |
| String footer = OUTPUT_STYLE != OutputStyle.XML ? SEPARATOR + "};" |
| : " </likelySubtags>" + CldrUtility.LINE_SEPARATOR |
| + "</supplementalData>"; |
| out.println(header); |
| boolean first = true; |
| Set<String> keys = new TreeSet<String>(new LocaleStringComparator()); |
| keys.addAll(fluffup.keySet()); |
| for (String printingLocale : keys) { |
| String printingTarget = fluffup.get(printingLocale); |
| String comment = printingName(printingLocale, spacing) + spacing + "=>" + spacing |
| + printingName(printingTarget, spacing); |
| |
| if (OUTPUT_STYLE == OutputStyle.XML) { |
| out.println("\t\t<likelySubtag from=\"" + printingLocale + |
| "\" to=\"" + printingTarget + "\"" + |
| "/>" + CldrUtility.LINE_SEPARATOR + "\t\t" + "<!--" + comment + "-->"); |
| } else { |
| if (first) { |
| first = false; |
| } else { |
| out.print(","); |
| } |
| if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) { |
| comment = printingName(printingLocale, spacing) + SEPARATOR + " // " + spacing + "=>" + spacing |
| + printingName(printingTarget, spacing); |
| } |
| out.print( |
| " {" |
| + SEPARATOR + " // " + comment |
| + SEPARATOR + " \"" + printingLocale + "\"," |
| + SEPARATOR + " \"" + printingTarget + "\"" |
| + CldrUtility.LINE_SEPARATOR + " }"); |
| } |
| } |
| out.println(footer); |
| out.close(); |
| } |
| |
| public static String printingName(String locale, String spacing) { |
| if (locale == null) { |
| return null; |
| } |
| LanguageTagParser parser = new LanguageTagParser().set(locale); |
| String lang = parser.getLanguage(); |
| String script = parser.getScript(); |
| String region = parser.getRegion(); |
| return "{" + spacing + |
| (lang.equals("und") ? "?" : english.getName(CLDRFile.LANGUAGE_NAME, lang)) + ";" + spacing + |
| (script == null || script.equals("") ? "?" : english.getName(CLDRFile.SCRIPT_NAME, script)) + ";" + spacing |
| + |
| (region == null || region.equals("") ? "?" : english.getName(CLDRFile.TERRITORY_NAME, region)) + spacing |
| + "}"; |
| } |
| |
| private static final String[][] ALT_REVERSAL = { |
| { "nb", "no" }, |
| { "no", "nb" }, |
| { "he", "iw" }, |
| { "iw", "he" }, |
| }; |
| |
| public static String toAlt(String locale, boolean change) { |
| if (!change || locale == null) { |
| return locale; |
| } |
| String firstTag = getFirstTag(locale); |
| for (String[] pair : ALT_REVERSAL) { |
| if (firstTag.equals(pair[0])) { |
| locale = pair[1] + locale.substring(pair[1].length()); |
| break; |
| } |
| } |
| locale = locale.replace("_", "-"); |
| return locale; |
| } |
| |
| private static String getFirstTag(String locale) { |
| int pos = locale.indexOf('_'); |
| return pos < 0 ? locale : locale.substring(0, pos); |
| } |
| |
| // private static Map<String, String> getBackMapping(Map<String, String> fluffup) { |
| // Relation<String,String> backMap = new Relation(new TreeMap(), TreeSet.class, BEST_LANGUAGE_COMPARATOR); |
| // for (String source : fluffup.keySet()) { |
| // if (source.startsWith("und")) { |
| // continue; |
| // } |
| // String maximized = fluffup.get(source); |
| // backMap.put(maximized, source); // put in right order |
| // } |
| // Map<String,String> returnBackMap = new TreeMap(); |
| // for (String maximized : backMap.keySet()) { |
| // final Set<String> all = backMap.getAll(maximized); |
| // final String minimized = all.iterator().next(); |
| // returnBackMap.put(maximized, minimized); |
| // } |
| // return returnBackMap; |
| // } |
| |
| /** |
| * Language tags are presumed to share the first language, except possibly "und". Best is least |
| */ |
| // private static Comparator BEST_LANGUAGE_COMPARATOR = new Comparator<String>() { |
| // LanguageTagParser p1 = new LanguageTagParser(); |
| // LanguageTagParser p2 = new LanguageTagParser(); |
| // public int compare(String o1, String o2) { |
| // if (o1.equals(o2)) return 0; |
| // p1.set(o1); |
| // p2.set(o2); |
| // String lang1 = p1.getLanguage(); |
| // String lang2 = p2.getLanguage(); |
| // |
| // // compare languages first |
| // // put und at the end |
| // int result = lang1.compareTo(lang2); |
| // if (result != 0) { |
| // if (lang1.equals("und")) return 1; |
| // if (lang2.equals("und")) return -1; |
| // return result; |
| // } |
| // |
| // // now scripts and regions. |
| // // if they have different numbers of fields, the shorter wins. |
| // // If there are two fields, region is lowest. |
| // // The simplest way is to just compare scripts first |
| // // so zh-TW < zh-Hant, because we first compare "" to Hant |
| // String script1 = p1.getScript(); |
| // String script2 = p2.getScript(); |
| // int scriptOrder = script1.compareTo(script2); |
| // if (scriptOrder != 0) return scriptOrder; |
| // |
| // String region1 = p1.getRegion(); |
| // String region2 = p2.getRegion(); |
| // int regionOrder = region1.compareTo(region2); |
| // if (regionOrder != 0) return regionOrder; |
| // |
| // return o1.compareTo(o2); |
| // } |
| // |
| // }; |
| |
| public static void minimize(Map<String, String> fluffup) { |
| LanguageTagParser parser = new LanguageTagParser(); |
| LanguageTagParser targetParser = new LanguageTagParser(); |
| Set<String> removals = new TreeSet<String>(); |
| while (true) { |
| removals.clear(); |
| for (String locale : fluffup.keySet()) { |
| String target = fluffup.get(locale); |
| if (targetParser.set(target).getRegion().equals(UNKNOWN_REGION)) { |
| removals.add(locale); |
| if (SHOW_ADD) |
| System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) |
| + "\t\t - Unknown Region in target"); |
| continue; |
| } |
| if (targetParser.getScript().equals(UNKNOWN_SCRIPT)) { |
| removals.add(locale); |
| if (SHOW_ADD) |
| System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) |
| + "\t\t - Unknown Script in target"); |
| continue; |
| } |
| |
| String region = parser.set(locale).getRegion(); |
| if (region.length() != 0) { |
| if (region.equals(UNKNOWN_REGION)) { |
| removals.add(locale); |
| if (SHOW_ADD) |
| System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) |
| + "\t\t - Unknown Region in source"); |
| continue; |
| } |
| parser.setRegion(""); |
| String newLocale = parser.toString(); |
| String newTarget = fluffup.get(newLocale); |
| if (newTarget != null) { |
| newTarget = targetParser.set(newTarget).setRegion(region).toString(); |
| if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { |
| removals.add(locale); |
| if (SHOW_ADD) |
| System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with " |
| + newLocale); |
| continue; |
| } |
| } |
| } |
| String script = parser.set(locale).getScript(); |
| if (locale.equals(DEBUG_ADD_KEY)) { |
| System.out.println("*debug*"); |
| } |
| if (script.length() != 0) { |
| if (script.equals(UNKNOWN_SCRIPT)) { |
| removals.add(locale); |
| if (SHOW_ADD) |
| System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\t - Unknown Script"); |
| continue; |
| } |
| parser.setScript(""); |
| String newLocale = parser.toString(); |
| String newTarget = fluffup.get(newLocale); |
| if (newTarget != null) { |
| newTarget = targetParser.set(newTarget).setScript(script).toString(); |
| if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { |
| removals.add(locale); |
| if (SHOW_ADD) |
| System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with " |
| + newLocale); |
| continue; |
| } |
| } |
| } |
| } |
| if (removals.size() == 0) { |
| break; |
| } |
| for (String locale : removals) { |
| fluffup.remove(locale); |
| } |
| } |
| } |
| |
| // private static void addLanguageScript(Map<String, String> fluffup, LanguageTagParser parser) { |
| // // add script |
| // Map<String, String> temp = new TreeMap<String, String>(); |
| // while (true) { |
| // temp.clear(); |
| // for (String target : new TreeSet<String>(fluffup.values())) { |
| // parser.set(target); |
| // final String territory = parser.getRegion(); |
| // if (territory.length() == 0) { |
| // continue; |
| // } |
| // parser.setRegion(""); |
| // String possibleSource = parser.toString(); |
| // if (fluffup.containsKey(possibleSource)) { |
| // continue; |
| // } |
| // String other = temp.get(possibleSource); |
| // if (other != null) { |
| // if (!target.equals(other)) { |
| // System.out.println("**Failure with multiple sources in addLanguageScript: " |
| // + possibleSource + "\t=>\t" + target + ", " + other); |
| // } |
| // continue; |
| // } |
| // temp.put(possibleSource, target); |
| // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguage-Script"); |
| // } |
| // if (temp.size() == 0) { |
| // break; |
| // } |
| // fluffup.putAll(temp); |
| // } |
| // |
| // } |
| |
| // private static void addLanguageCountry(Map<String, String> fluffup, LanguageTagParser parser) { |
| // // add script |
| // Map<String, String> temp = new TreeMap<String, String>(); |
| // while (true) { |
| // temp.clear(); |
| // for (String target : new TreeSet<String>(fluffup.values())) { |
| // parser.set(target); |
| // String script = parser.getScript(); |
| // if (script.length() == 0) { |
| // continue; |
| // } |
| // parser.setScript(""); |
| // String possibleSource = parser.toString(); |
| // if (fluffup.containsKey(possibleSource)) { |
| // continue; |
| // } |
| // String other = temp.get(possibleSource); |
| // |
| // if (other != null) { |
| // if (!target.equals(other)) { |
| // script = getScriptForLocale(possibleSource); |
| // if (script == null) { |
| // System.out.println("**Failure with multiple sources in addLanguageCountry: " |
| // + possibleSource + "\t=>\t" + target + ", " + other); |
| // continue; // error message in routine |
| // } |
| // parser.setScript(script); |
| // target = parser.toString(); |
| // } |
| // } |
| // |
| // temp.put(possibleSource, target); |
| // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguageCountry"); |
| // } |
| // if (temp.size() == 0) { |
| // break; |
| // } |
| // fluffup.putAll(temp); |
| // } |
| // |
| // } |
| |
| // private static void addScript(Map<String, String> fluffup, LanguageTagParser parser) { |
| // // add script |
| // Map<String, String> temp = new TreeMap<String, String>(); |
| // while (true) { |
| // temp.clear(); |
| // Set skipTarget = fluffup.keySet(); |
| // for (String locale : fluffup.keySet()) { |
| // String target = fluffup.get(locale); |
| // parser.set(target); |
| // if (parser.getScript().length() != 0) { |
| // continue; |
| // } |
| // String script = getScriptForLocale(target); |
| // |
| // if (script == null) { |
| // continue; // error message in routine |
| // } |
| // parser.setScript(script); |
| // String furtherTarget = parser.toString(); |
| // addIfNotIn(target, furtherTarget, temp, fluffup, "Script"); |
| // } |
| // if (temp.size() == 0) { |
| // break; |
| // } |
| // fluffup.putAll(temp); |
| // } |
| // } |
| |
| // private static String getScriptForLocale(String locale) { |
| // String result = getScriptForLocale2(locale); |
| // if (result != null) return result; |
| // int pos = locale.indexOf('_'); |
| // if (pos >= 0) { |
| // result = getScriptForLocale2(locale.substring(0,pos)); |
| // } |
| // return result; |
| // } |
| |
/** Script code used in this file as the "no script could be determined" marker. */
private static String UNKNOWN_SCRIPT = "Zzzz";
/** Region code used in this file as the "unknown region" marker. */
private static String UNKNOWN_REGION = "ZZ";
| |
| private static String getScriptForLocale2(String locale) { |
| String result = localeToScriptCache.get(locale); |
| if (result != null) { |
| return result; |
| } |
| if (locale.equals("ky")) { |
| int debug = 0; |
| } |
| try { |
| Map<Type, BasicLanguageData> data = supplementalData.getBasicLanguageDataMap(locale); |
| if (data != null) { |
| for (BasicLanguageData datum : data.values()) { |
| final Set<String> scripts = datum.getScripts(); |
| boolean isPrimary = datum.getType() == BasicLanguageData.Type.primary; |
| if (scripts.size() != 1) { |
| if (scripts.size() > 1 && isPrimary) { |
| break; |
| } |
| continue; |
| } |
| String script = scripts.iterator().next(); |
| if (isPrimary) { |
| return result = script; |
| } else if (result == null) { |
| result = script; |
| } |
| } |
| if (result != null) { |
| return result; |
| } |
| } |
| CLDRFile cldrFile; |
| try { |
| cldrFile = factory.make(locale, true); |
| } catch (RuntimeException e) { |
| result = FALLBACK_SCRIPTS.get(locale); |
| if (result == null) { |
| System.out.println("***Failed to find script for: " + locale + "\t" + english.getName(locale)); |
| return result = UNKNOWN_SCRIPT; |
| } else { |
| return result; |
| } |
| } |
| UnicodeSet exemplars = getExemplarSet(cldrFile, ""); |
| Set<String> CLDRScripts = getScriptsFromUnicodeSet(exemplars); |
| CLDRScripts.remove(UNKNOWN_SCRIPT); |
| if (CLDRScripts.size() == 1) { |
| return result = CLDRScripts.iterator().next(); |
| } else if (CLDRScripts.size() == 0) { |
| System.out.println("**Failed to get script for:\t" + locale); |
| return result = UNKNOWN_SCRIPT; |
| } else { |
| System.out.println("**Failed, too many scripts for:\t" + locale + ", " + CLDRScripts); |
| return result = UNKNOWN_SCRIPT; |
| } |
| } finally { |
| if (result.equals(UNKNOWN_SCRIPT)) { |
| String temp = LANGUAGE_OVERRIDES.get(locale); |
| if (temp != null) { |
| result = new LanguageTagParser().set(temp).getScript(); |
| System.out.println("Getting script from LANGUAGE_OVERRIDES for " + locale + " => " + result); |
| } |
| } |
| localeToScriptCache.put(locale, result); |
| if (SHOW_ADD) |
| System.out.println("Script:\t" + locale + "\t" + english.getName(locale) + "\t=>\t" + result + "\t" |
| + english.getName(CLDRFile.SCRIPT_NAME, result)); |
| } |
| } |
| |
| // private static Map<String, String> closeMapping(Map<String, String> fluffup) { |
| // if (SHOW_ADD) System.out.flush(); |
| // Map<String,String> temp = new TreeMap<String,String>(); |
| // while (true) { |
| // temp.clear(); |
| // for (String locale : fluffup.keySet()) { |
| // String target = fluffup.get(locale); |
| // if (target.equals("si_Sinh") || target.equals("zh-Hani")) { |
| // System.out.println("????"); |
| // } |
| // String furtherTarget = fluffup.get(target); |
| // if (furtherTarget == null) { |
| // continue; |
| // } |
| // addIfNotIn(locale, furtherTarget, temp, null, "Close"); |
| // } |
| // if (temp.size() == 0) { |
| // break; |
| // } |
| // fluffup.putAll(temp); |
| // } |
| // if (SHOW_ADD) System.out.flush(); |
| // return temp; |
| // } |
| |
| public static Set<String> getScriptsFromUnicodeSet(UnicodeSet exemplars) { |
| // use bits first, since that's faster |
| BitSet scriptBits = new BitSet(); |
| boolean show = false; |
| for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) { |
| if (show) |
| System.out.println(Integer.toHexString(it.codepoint)); |
| if (it.codepoint != UnicodeSetIterator.IS_STRING) { |
| scriptBits.set(UScript.getScript(it.codepoint)); |
| } else { |
| int cp; |
| for (int i = 0; i < it.string.length(); i += UTF16.getCharCount(cp)) { |
| scriptBits.set(UScript.getScript(cp = UTF16.charAt(it.string, i))); |
| } |
| } |
| } |
| scriptBits.clear(UScript.COMMON); |
| scriptBits.clear(UScript.INHERITED); |
| Set<String> scripts = new TreeSet<String>(); |
| for (int j = 0; j < scriptBits.size(); ++j) { |
| if (scriptBits.get(j)) { |
| scripts.add(UScript.getShortName(j)); |
| } |
| } |
| return scripts; |
| } |
| |
| public static UnicodeSet getExemplarSet(CLDRFile cldrfile, String type) { |
| if (type.length() != 0) |
| type = "[@type=\"" + type + "\"]"; |
| String v = cldrfile.getStringValue("//ldml/characters/exemplarCharacters" |
| + type); |
| if (v == null) |
| return new UnicodeSet(); |
| return new UnicodeSet(v); |
| } |
| |
| // private static String[][] SpecialCases = { |
| // { "zh_Hani", "zh_Hans_CN"}, |
| // { "si_Sinh", "si_Sinh_LK"}, |
| // { "ii", "ii_CN"}, // Sichuan Yi (Yi) |
| // { "iu", "iu_CA"}, // Inuktitut (Unified Canadian Aboriginal Syllabics) |
| // { "und", "en"}, // English default |
| // }; |
| |
| static void showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent) { |
| Set<String> errors = new LinkedHashSet<String>(); |
| Map<String, String> oldDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents( |
| ConvertLanguageData.supplementalData.getDefaultContentLocales(), new TreeMap<String, String>(), errors); |
| if (!errors.isEmpty()) { |
| System.out.println(CollectionUtilities.join(errors, "\n")); |
| errors.clear(); |
| } |
| Map<String, String> newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent, |
| new TreeMap<String, String>(), errors); |
| if (!errors.isEmpty()) { |
| System.out.println("Default Content errors: " + CollectionUtilities.join(errors, "\n")); |
| errors.clear(); |
| } |
| Set<String> changes = compareMapsAndFixNew("*WARNING* Default Content: ", oldDefaultContent, newDefaultContent, |
| "ar", "ar_001"); |
| System.out.println(CollectionUtilities.join(changes, "\n")); |
| defaultLocaleContent.clear(); |
| defaultLocaleContent.addAll(newDefaultContent.values()); |
| newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent, |
| new TreeMap<String, String>(), errors); |
| if (!errors.isEmpty()) { |
| System.out.println("***New Errors: " + CollectionUtilities.join(errors, "\n")); |
| } |
| } |
| |
| private static Set<String> compareMapsAndFixNew(String title, |
| Map<String, String> oldContent, |
| Map<String, String> newContent, String... allowedOverrideValues) { |
| Map<String, String> allowedOverrideValuesTest = new HashMap<String, String>(); |
| for (int i = 0; i < allowedOverrideValues.length; i += 2) { |
| allowedOverrideValuesTest.put(allowedOverrideValues[i], allowedOverrideValues[i + 1]); |
| } |
| Set<String> changes = new TreeSet<String>(); |
| for (String parent : Builder.with(new TreeSet<String>()).addAll(newContent.keySet()) |
| .addAll(oldContent.keySet()).get()) { |
| String oldValue = oldContent.get(parent); |
| String newValue = newContent.get(parent); |
| String overrideValue = allowedOverrideValuesTest.get(parent); |
| if (overrideValue != null) { |
| newContent.put(parent, overrideValue); |
| newValue = overrideValue; |
| } |
| if (CldrUtility.equals(oldValue, newValue)) { |
| continue; |
| } |
| String message; |
| if (oldValue == null) { |
| message = "Adding " + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " |
| + ConvertLanguageData.getLanguageCodeAndName(newValue); |
| newContent.put(parent, newValue); |
| } else if (newValue == null) { |
| if (SUPPRESS_CHANGES) { |
| message = "Suppressing removal of " |
| + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " |
| + ConvertLanguageData.getLanguageCodeAndName(oldValue); |
| newContent.put(parent, oldValue); |
| } else { |
| message = "Removing " |
| + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " |
| + ConvertLanguageData.getLanguageCodeAndName(oldValue); |
| newContent.remove(oldValue); |
| } |
| } else { |
| if (SUPPRESS_CHANGES) { |
| message = "Suppressing change of " |
| + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " |
| + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to " |
| + ConvertLanguageData.getLanguageCodeAndName(newValue); |
| newContent.remove(newValue); |
| newContent.put(parent, oldValue); |
| } else { |
| message = "Changing " |
| + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " |
| + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to " |
| + ConvertLanguageData.getLanguageCodeAndName(newValue); |
| newContent.remove(oldValue); |
| newContent.put(parent, newValue); |
| } |
| } |
| changes.add(title + message); |
| } |
| return changes; |
| } |
| |
| public static class LocaleStringComparator implements Comparator<String> { |
| LanguageTagParser ltp0 = new LanguageTagParser(); |
| LanguageTagParser ltp1 = new LanguageTagParser(); |
| |
| public int compare(String arg0, String arg1) { |
| ltp0.set(arg0); |
| ltp1.set(arg1); |
| String s0 = ltp0.getLanguage(); |
| String s1 = ltp1.getLanguage(); |
| int result = s0.compareTo(s1); |
| if (result != 0) { |
| return s0.equals("und") ? 1 |
| : s1.equals("und") ? -1 |
| : result; |
| } |
| s0 = ltp0.getScript(); |
| s1 = ltp1.getScript(); |
| result = s0.compareTo(s1); |
| if (result != 0) { |
| return result; |
| } |
| s0 = ltp0.getRegion(); |
| s1 = ltp1.getRegion(); |
| result = s0.compareTo(s1); |
| if (result != 0) { |
| return result; |
| } |
| return arg0.compareTo(arg1); // just in case |
| } |
| |
| } |
| } |