blob: 50baa2e935fc1b50776ac7bc93444ca5253d6995 [file] [log] [blame]
package org.unicode.cldr.unittest;
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.VersionInfo;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.unicode.cldr.draft.ScriptMetadata;
import org.unicode.cldr.draft.ScriptMetadata.Info;
import org.unicode.cldr.tool.LikelySubtags;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.ExemplarType;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRLocale;
import org.unicode.cldr.util.CalculatedCoverageLevels;
import org.unicode.cldr.util.ChainedMap;
import org.unicode.cldr.util.ChainedMap.M3;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Containment;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.Level;
import org.unicode.cldr.util.ScriptToExemplars;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;
public class LikelySubtagsTest extends TestFmwk {
/** Shared validity data, used to look up the status of language, script, and region codes. */
private static final Validity VALIDITY = Validity.getInstance();
/** When true, TestCompleteness prints the collected language, script, and region sets. */
private boolean DEBUG = false;
/** True when the SHOW_EXEMPLARS system property is defined (for example via -DSHOW_EXEMPLARS). */
private static boolean SHOW_EXEMPLARS = System.getProperty("SHOW_EXEMPLARS") != null;
// NOTE: the declarations below depend on each other, so their order matters for static init.
private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO =
CLDR_CONFIG.getSupplementalDataInfo();
/** The likely-subtags mapping from the supplemental data; keys are source tags, values targets. */
static final Map<String, String> likely = SUPPLEMENTAL_DATA_INFO.getLikelySubtags();
/** Maximizer instance used by checkAdding. */
static final LikelySubtags LIKELY = new LikelySubtags();
/** Command-line entry point: hands the arguments to this test class's {@code run} method. */
public static void main(String[] args) {
    final LikelySubtagsTest test = new LikelySubtagsTest();
    test.run(args);
}
/**
 * Collects the subtags that occur in the likely-subtags data (both keys and values) and builds
 * the language/script/region combinations that {@code TestCompleteness} probes.
 */
static class Tags {
    final Set<String> languages = new TreeSet<>();
    final Set<String> scripts = new TreeSet<>();
    final Set<String> regions = new TreeSet<>();
    // The four sets below are only filled from mapping keys (source == true in add()).
    final Set<String> scriptRegion = new TreeSet<>();
    final Set<String> languageScript = new TreeSet<>();
    final Set<String> languageRegion = new TreeSet<>();
    final Set<String> all = new TreeSet<>();
    /** language -> script -> region; the Boolean value is always TRUE (used as a set). */
    final ChainedMap.M4<String, String, String, Boolean> languageToScriptToRegions =
            ChainedMap.of(
                    new TreeMap<String, Object>(),
                    new TreeMap<String, Object>(),
                    new TreeMap<String, Object>(),
                    Boolean.class);
    /** language -> region; the Boolean value is always TRUE (used as a set). */
    final ChainedMap.M3<String, String, Boolean> languageToRegions =
            ChainedMap.of(
                    new TreeMap<String, Object>(),
                    new TreeMap<String, Object>(),
                    Boolean.class);

    /**
     * Reads every entry of {@code likely}, records its subtags, and then, for each language
     * other than "und", adds combinations with a script and a region that were not seen for
     * that language.
     *
     * @throws IllegalArgumentException if no unseen non-empty script or region can be found
     *     for some language
     */
    public Tags() {
        final LanguageTagParser ltp = new LanguageTagParser();
        for (Entry<String, String> entry : likely.entrySet()) {
            add(ltp.set(entry.getKey()), true);
            add(ltp.set(entry.getValue()), false);
        }
        // add unfamiliar script, unfamiliar region
        for (String lang : languageToScriptToRegions.keySet()) {
            if (lang.equals("und")) {
                continue;
            }
            M3<String, String, Boolean> scriptToRegion = languageToScriptToRegions.get(lang);
            final Set<String> scriptsFor = scriptToRegion.keySet();
            final Set<String> regionsFor = languageToRegions.get(lang).keySet();
            String firstScriptNotIn = getNonEmptyNotIn(scripts, scriptsFor);
            String firstRegionNotIn = getNonEmptyNotIn(regions, regionsFor);
            languageToScriptToRegions.put(
                    lang, firstScriptNotIn, firstRegionNotIn, Boolean.TRUE);
            // clone for safety before iterating: the puts below add entries for this language
            for (String script : new HashSet<>(scriptsFor)) {
                languageToScriptToRegions.put(lang, script, firstRegionNotIn, Boolean.TRUE);
            }
            for (String region : new HashSet<>(regionsFor)) {
                languageToScriptToRegions.put(lang, firstScriptNotIn, region, Boolean.TRUE);
            }
        }
    }

    /**
     * Returns the first element of {@code a} (in iteration order) that is not contained in
     * {@code b} and whose string form is not empty.
     *
     * @throws IllegalArgumentException if there is no such element
     */
    private static <T> T getNonEmptyNotIn(Iterable<T> a, Set<T> b) {
        for (T x : a) {
            if (!b.contains(x) && !x.toString().isEmpty()) {
                return x;
            }
        }
        // Include both collections: this runs inside a static initializer, where a bare
        // exception would surface only as an opaque ExceptionInInitializerError.
        throw new IllegalArgumentException(
                "No non-empty element of " + a + " is missing from " + b);
    }

    /**
     * Records the subtags of the tag currently held by {@code ltp}.
     *
     * @param ltp parser already set to the tag to record
     * @param source true when the tag is a mapping key; only keys are classified into the
     *     {@code all}, {@code scriptRegion}, {@code languageScript}, {@code languageRegion}
     *     sets
     */
    void add(LanguageTagParser ltp, boolean source) {
        String sourceLanguage = ltp.getLanguage();
        String sourceScript = ltp.getScript();
        String sourceRegion = ltp.getRegion();
        languageToScriptToRegions.put(sourceLanguage, sourceScript, sourceRegion, Boolean.TRUE);
        languageToScriptToRegions.put(sourceLanguage, sourceScript, "", Boolean.TRUE);
        languageToScriptToRegions.put(sourceLanguage, "", "", Boolean.TRUE);
        languageToRegions.put(sourceLanguage, "", Boolean.TRUE);
        final boolean regionIsCountry = StandardCodes.isCountry(sourceRegion);
        if (regionIsCountry) {
            languageToScriptToRegions.put(sourceLanguage, "", sourceRegion, Boolean.TRUE);
            languageToRegions.put(sourceLanguage, sourceRegion, Boolean.TRUE);
        }
        // capture all cases of 2 items
        if (source) {
            if (!sourceScript.isEmpty() && !sourceRegion.isEmpty()) {
                if (!sourceLanguage.equals("und")) {
                    all.add(ltp.toString());
                } else {
                    scriptRegion.add(ltp.toString());
                }
            } else if (!sourceLanguage.equals("und")) {
                if (!sourceScript.isEmpty()) {
                    languageScript.add(ltp.toString());
                } else if (!sourceRegion.isEmpty()) {
                    languageRegion.add(ltp.toString());
                }
            }
        }
        languages.add(sourceLanguage);
        scripts.add(sourceScript);
        // Only countries (or the empty region) are kept as candidate regions.
        if (regionIsCountry || sourceRegion.isEmpty()) {
            regions.add(sourceRegion);
        }
    }
}
/** Subtag combinations derived from the likely-subtags data; iterated by TestCompleteness. */
static final Tags TAGS = new Tags();
/** Scratch parser that checkAdding sets to the maximized form of its source tag. */
final LanguageTagParser maxLtp = new LanguageTagParser();
/** Scratch parser that checkAdding sets to the source tag and then modifies for each probe. */
final LanguageTagParser sourceLtp = new LanguageTagParser();
/**
 * Checks that if {@code source} maximizes to some tag Y, then filling any missing field of
 * {@code source} (an "und" language, an empty script, an empty region) with the matching
 * field of Y still maximizes to Y. Mismatches are reported through {@code assertEquals}.
 *
 * <p>Example: given {@code und_AF => fa_Arab_AF}, all of {@code und_Arab_AF}, {@code fa_AF}
 * and {@code fa_Arab_AF} must also maximize to {@code fa_Arab_AF}.
 *
 * @param source the locale tag to check
 * @return false only when {@code source} cannot be maximized and contains no underscore, in
 *     which case the caller should skip the rest of that language; true otherwise
 */
public boolean checkAdding(String source) {
    String max = LIKELY.maximize(source);
    if (!assertNotEquals("Maximize " + source, null, max)) {
        return source.contains("_");
    }
    sourceLtp.set(source);
    // Regions that are not countries are not probed.
    if (!sourceLtp.getRegion().isEmpty() && !StandardCodes.isCountry(sourceLtp.getRegion())) {
        return true;
    }
    maxLtp.set(max);
    // Bits of i select which fields to fill in: 1 = language, 2 = script, 4 = region.
    for (int i = 1; i < 8; ++i) {
        // Start every combination from the unmodified source. Restoring only at the end of
        // the loop body is not enough: a 'continue' below can fire after an earlier field
        // was already replaced, which would leak that change into the next combination.
        sourceLtp.set(source);
        if ((i & 1) != 0) {
            if (!sourceLtp.getLanguage().equals("und")) continue;
            sourceLtp.setLanguage(maxLtp.getLanguage());
        }
        if ((i & 2) != 0) {
            if (!sourceLtp.getScript().isEmpty()) continue;
            sourceLtp.setScript(maxLtp.getScript());
        }
        if ((i & 4) != 0) {
            if (!sourceLtp.getRegion().isEmpty()) continue;
            sourceLtp.setRegion(maxLtp.getRegion());
        }
        String test = sourceLtp.toString();
        final String maximize = LIKELY.maximize(test);
        // Only call assertEquals on a mismatch: max(source) = max, but max(test) != max.
        if (!max.equals(maximize)) {
            assertEquals(
                    String.format(
                            "checkAdding: max(%s)->%s, however max(%s)->", source, max, test),
                    max,
                    maximize);
        }
    }
    return true;
}
/**
 * Runs {@code checkAdding} on every language/script/region combination collected in {@code
 * TAGS}, except the bare tag "und". When {@code checkAdding} returns false, the remaining
 * combinations for that language are skipped.
 */
public void TestCompleteness() {
    if (DEBUG) {
        System.out.println(TAGS.languages.size() + "\t" + TAGS.languages);
        System.out.println(TAGS.scripts.size() + "\t" + TAGS.scripts);
        System.out.println(TAGS.regions.size() + "\t" + TAGS.regions);
    }
    final LanguageTagParser parser = new LanguageTagParser();
    nextLanguage:
    for (Entry<String, Map<String, Map<String, Boolean>>> byLanguage :
            TAGS.languageToScriptToRegions) {
        parser.set(byLanguage.getKey()); // clears script, region
        final Map<String, Map<String, Boolean>> scriptsToRegions = byLanguage.getValue();
        for (Entry<String, Map<String, Boolean>> byScript : scriptsToRegions.entrySet()) {
            parser.setScript(byScript.getKey());
            for (String region : byScript.getValue().keySet()) {
                parser.setRegion(region);
                final String testTag = parser.toString();
                if (testTag.equals("und") || checkAdding(testTag)) {
                    continue;
                }
                checkAdding(testTag); // for debugging
                continue nextLanguage;
            }
        }
    }
}
/**
 * Script codes exempted by the script tests: TestForMissingScriptMetadata does not report a
 * likely language of "und" for them, and TestMissingInfoForScript skips them entirely.
 */
static Set<String> exceptions =
new HashSet<>(
Arrays.asList(
"Zyyy", "Zinh", "Zzzz", "Brai",
"Cpmn")); // scripts with no default language
/**
 * Verifies that a likely-subtags mapping never changes a field its source already specifies:
 * a language other than "und", a non-empty script, and a non-empty region that {@code
 * Containment.isLeaf} accepts must all reappear unchanged in the target.
 */
public void TestStability() {
    final LanguageTagParser parser = new LanguageTagParser();
    for (Entry<String, String> mapping : likely.entrySet()) {
        // Fields of the source tag (the map key).
        parser.set(mapping.getKey());
        final String fromLanguage = parser.getLanguage();
        final String fromScript = parser.getScript();
        final String fromRegion = parser.getRegion();

        // Fields of the target tag (the map value).
        parser.set(mapping.getValue());
        final String toLanguage = parser.getLanguage();
        final String toScript = parser.getScript();
        final String toRegion = parser.getRegion();

        // "und" counts as an unspecified language.
        final boolean languageGiven = !fromLanguage.equals("und") && !fromLanguage.isEmpty();
        if (languageGiven) {
            assertEquals("language", fromLanguage, toLanguage);
        }
        if (!fromScript.isEmpty()) {
            assertEquals("script", fromScript, toScript);
        }
        if (!fromRegion.isEmpty() && Containment.isLeaf(fromRegion)) {
            assertEquals("region", fromRegion, toRegion);
        }
    }
}
/**
 * Walks all code points U+0000..U+10FFFF one script at a time and reports an error for each
 * script that has no {@code ScriptMetadata} entry, or whose likely language is "und" while
 * the script is not listed in {@code exceptions}. Afterwards, warns about metadata scripts
 * (other than the known combination scripts) for which no code point was encountered.
 */
public void TestForMissingScriptMetadata() {
    TreeSet<String> metadataScripts = new TreeSet<>(ScriptMetadata.getScripts());
    UnicodeSet current = new UnicodeSet(0, 0x10FFFF);
    UnicodeSet toRemove = new UnicodeSet();
    while (!current.isEmpty()) {
        int ch = current.charAt(0);
        int script = UScript.getScript(ch);
        String shortName = UScript.getShortName(script);
        // Drop every code point of this script before any 'continue', so the loop always
        // makes progress. Previously a script without metadata was never removed from
        // 'current', which made the loop spin forever on the same code point.
        toRemove.applyIntPropertyValue(UProperty.SCRIPT, script);
        current.removeAll(toRemove);
        Info i = ScriptMetadata.getInfo(shortName);
        if (i == null) {
            errln("Script Metadata is missing: " + shortName);
            continue;
        }
        if (i.likelyLanguage.equals("und") && !exceptions.contains(shortName)) {
            errln("Script has no likely language: " + shortName);
        }
        metadataScripts.remove(shortName);
    }
    // remove "combo" scripts
    metadataScripts.removeAll(
            Arrays.asList("Hans", "Hant", "Hanb", "Jamo", "Jpan", "Kore"));
    if (!metadataScripts.isEmpty()) {
        // Warning, not error, so that we can add scripts to the script metadata
        // and later update to the Unicode version that has characters for those scripts.
        warnln("Script Metadata for characters not in Unicode: " + metadataScripts);
    }
}
/**
 * For every available base language (no underscore, not "root"), reports an error when it
 * has no likely-subtags entry or no English name. A missing English name is tolerated, under
 * known issue CLDR-15663, for languages whose effective coverage level is below BASIC or
 * unknown.
 */
public void TestMissingInfoForLanguage() {
    CLDRFile english = CLDR_CONFIG.getEnglish().getUnresolved();
    CalculatedCoverageLevels ccl = CalculatedCoverageLevels.getInstance();
    for (String language : CLDR_CONFIG.getCldrFactory().getAvailableLanguages()) {
        if (language.contains("_") || language.equals("root")) {
            continue;
        }
        String likelyExpansion = likely.get(language);
        if (likelyExpansion == null) {
            errln("Missing likely subtags for: " + language);
        } else {
            // Log the expansion found for this language, not the whole likely-subtags map.
            logln("Likely subtags for " + language + ":\t " + likelyExpansion);
        }
        String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language);
        String englishName = english.getStringValue(path);
        if (englishName == null) {
            Level covLevel = ccl.getEffectiveCoverageLevel(language);
            if (covLevel == null || !covLevel.isAtLeast(Level.BASIC)) {
                // https://unicode-org.atlassian.net/browse/CLDR-15663
                if (logKnownIssue(
                        "CLDR-15663",
                        "English translation should not be required for sub-basic language name")) {
                    continue; // skip error
                }
            }
            errln("Missing English translation for: " + language + " which is at " + covLevel);
        }
    }
}
/**
 * For every good available territory code, logs whether likely subtags exist for {@code
 * und_<region>} and reports an error when the region has no English name. Missing likely
 * subtags are only logged here, never reported as errors.
 */
public void TestMissingInfoForRegion() {
    CLDRFile english = CLDR_CONFIG.getEnglish();
    for (String region : StandardCodes.make().getGoodAvailableCodes("territory")) {
        String likelyExpansion = likely.get("und_" + region);
        if (likelyExpansion == null) {
            if (SUPPLEMENTAL_DATA_INFO.getContained(region) == null) { // not container
                String likelyTag = LikelySubtags.maximize("und_" + region, likely);
                if (likelyTag == null) { // || !likelyTag.startsWith("en_Latn_")
                    logln(
                            "Missing likely subtags for region: "
                                    + region
                                    + "\t"
                                    + english.getName("territory", region));
                }
            } else { // container
                logln(
                        "Missing likely subtags for macroregion (fix to exclude regions having 'en'): "
                                + region
                                + "\t"
                                + english.getName("territory", region));
            }
        } else {
            // Log the expansion found for this region, not the whole likely-subtags map.
            logln("Likely subtags for region: " + region + ":\t " + likelyExpansion);
        }
        String path = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, region);
        String englishName = english.getStringValue(path);
        if (englishName == null) {
            errln("Missing English translation for: " + region);
        }
    }
}
// Typically historical scripts that don't need to be in likely subtags;
// TestMissingInfoForScript does not report a missing und_<script> entry for these.
static final Set<String> KNOWN_SCRIPTS_WITHOUT_LIKELY_SUBTAGS =
ImmutableSet.of("Hatr", "Cpmn", "Ougr");
/**
 * For each script in the script metadata (apart from the exempted ones), checks the {@code
 * und_<script>} entry of the likely subtags:
 *
 * <ul>
 *   <li>a missing entry is an error when the script's age is not newer than ICU's Unicode
 *       version, and only a warning otherwise; scripts in {@code
 *       KNOWN_SCRIPTS_WITHOUT_LIKELY_SUBTAGS} are not reported at all;
 *   <li>an existing entry must start with {@code <likelyLanguage>_<script>_} from the script
 *       metadata, or be one of a few explicitly accepted expansions.
 * </ul>
 */
public void TestMissingInfoForScript() {
    final VersionInfo icuUnicodeVersion = UCharacter.getUnicodeVersion();
    final Set<String> acceptedExpansions =
            Set.of("zh_Hans_CN", "hnj_Hmnp_US", "hnj_Hmng_LA", "iu_Cans_CA");
    for (String script : new TreeSet<>(ScriptMetadata.getScripts())) {
        // we minimize away und_X, when the code puts in en...US
        final boolean exempt =
                exceptions.contains(script) || script.equals("Latn") || script.equals("Dsrt");
        if (exempt) {
            continue;
        }
        final Info info = ScriptMetadata.getInfo(script);
        final String originCountry = info.originCountry;
        final String undScript = "und_" + script;
        final String langScript = info.likelyLanguage + "_" + script + "_";
        final String likelyExpansion = likely.get(undScript);

        if (likelyExpansion == null) {
            if (KNOWN_SCRIPTS_WITHOUT_LIKELY_SUBTAGS.contains(script)) {
                continue;
            }
            final String msg =
                    "likelySubtags.xml missing language for script (und_"
                            + script
                            + "). Script Metadata suggests that it should be something like:\t "
                            + showOverride(script, originCountry, langScript);
            if (info.age.compareTo(icuUnicodeVersion) > 0) {
                // Warning: Missing data for a script in a future Unicode version.
                warnln(msg);
            } else {
                // Error: Missing data for a script in ICU's Unicode version.
                errln(msg);
            }
            continue;
        }

        final boolean expansionOk =
                acceptedExpansions.contains(likelyExpansion)
                        || likelyExpansion.startsWith(langScript);
        if (expansionOk) {
            logln("OK: " + undScript + " => " + likelyExpansion);
            continue;
        }
        // (An older logKnownIssue("Cldrbug:7181", ...) exemption for Tfng and Brah used to
        // sit here; it is no longer active.)
        errln(
                "likelySubtags.xml has wrong language for script (und_"
                        + script
                        + "). Should not be "
                        + likelyExpansion
                        + ", but Script Metadata suggests something like:\t "
                        + showOverride(script, originCountry, langScript));
    }
    /*
     * und_Bopo => zh_Bopo_TW und_Copt => cop_Copt_EG // fix 002 und_Dsrt => en_Dsrt_US // fix
     * US
     */
}
/**
 * Formats a suggested likely-subtags override as a brace-enclosed pair of quoted strings
 * followed by a comma: the source is {@code und_<script>} and the target is {@code
 * langScript} immediately followed by {@code originCountry}.
 *
 * @param script the script code
 * @param originCountry the region code appended to {@code langScript}
 * @param langScript the target prefix (callers in this class pass {@code
 *     <language>_<script>_})
 * @return the formatted suggestion
 */
public String showOverride(String script, String originCountry, String langScript) {
    return String.format("{\"und_%s\", \"%s%s\"},", script, langScript, originCountry);
}
/**
* Test two issues:
*
* <ul>
* <li>That the script of the locale's exemplars matches the script derived from the locale's
* identifier (ltp.getResolvedScript()).
* <li>That, for each resolved script shared by several locales, the exemplar characters that
* are not common to all of those locales (from main, plus aux when the main-derived data is
* small) match what ScriptToExemplars.getExemplars returns for that script.
* </ul>
*
* Written as one test, to avoid the overhead of iterating over all locales twice.
*/
public void testGetResolvedScriptVsExemplars() {
Factory factory = CLDR_CONFIG.getCldrFactory();
LanguageTagParser ltp = new LanguageTagParser();
// resolved script -> cleaned main / auxiliary exemplar sets of the locales using that script
Multimap<String, UnicodeSet> scriptToMains = TreeMultimap.create();
Multimap<String, UnicodeSet> scriptToAuxes = TreeMultimap.create();
// accumulates every character rejected by checkSet, across all locales
UnicodeSet collectedBad = new UnicodeSet();
for (String locale : factory.getAvailable()) {
if ("root".equals(locale)) {
continue;
}
CLDRFile cldrFile = factory.make(locale, true);
UnicodeSet main = cldrFile.getRawExemplarSet(ExemplarType.main, WinningChoice.WINNING);
main = checkSet("main", locale, main, collectedBad);
UnicodeSet aux =
cldrFile.getRawExemplarSet(ExemplarType.auxiliary, WinningChoice.WINNING);
aux = checkSet("aux", locale, aux, collectedBad);
// Take the script of the first main exemplar whose script value is above
// UScript.INHERITED.
String script = null;
int uScript = 0;
for (String s : main) {
uScript = UScript.getScript(s.codePointAt(0));
if (uScript > UScript.INHERITED) {
script = UScript.getShortName(uScript);
break;
}
}
if (script == null) {
errln("No script for " + locale);
continue;
}
String ltpScript = ltp.set(locale).getResolvedScript();
// Han and Hangul exemplars are remapped to the script code expected for the language
// (Jpan for ja, Kore for ko, the explicit script or Hant/Hans for yue/zh).
switch (uScript) {
case UScript.HAN:
switch (ltp.getLanguage()) {
case "ja":
script = "Jpan";
break;
case "yue":
script = ltp.getScript();
if (script.isEmpty()) {
script = "Hant";
}
break;
case "zh":
script = ltp.getScript();
if (script.isEmpty()) {
script = "Hans";
}
break;
}
break;
case UScript.HANGUL:
switch (ltp.getLanguage()) {
case "ko":
script = "Kore";
break;
}
}
if (!assertEquals(locale, script, ltpScript)) {
ltp.getResolvedScript(); // for debugging
}
scriptToMains.put(ltpScript, main.freeze());
if (!aux.isEmpty()) {
scriptToAuxes.put(ltpScript, aux.freeze());
}
}
if (!collectedBad.isEmpty()) {
warnln(
"Locales have "
+ collectedBad.size()
+ " unexpected characters in main and/or aux:\t"
+ collectedBad.toPattern(false)
+ "\n Use -DSHOW_EXEMPLARS for details");
}
// now check that ScriptToExemplars.getExemplars matches the data
Set<String> problemScripts = new LinkedHashSet<>();
Map<String, UnicodeSet> expected = new TreeMap<>();
for (Entry<String, Collection<UnicodeSet>> entry : scriptToMains.asMap().entrySet()) {
String script = entry.getKey();
Collection<UnicodeSet> mains = entry.getValue();
Collection<UnicodeSet> auxes = scriptToAuxes.get(script);
UnicodeSet flattened;
// Scripts with at most one main set and at most one aux set are not checked.
if (mains.size() <= 1 && auxes.size() <= 1) {
continue;
} else {
UnicodeMap<Integer> counts = new UnicodeMap<>();
getCounts(mains, counts);
flattened = getUncommon(counts, mains.size());
// When counts.size() is below 32, the aux sets are counted in as well and the result
// is recomputed.
// NOTE(review): the recomputation still compares against mains.size(), although the
// counts then include the aux sets - confirm this is intended.
if (counts.size() < 32) {
getCounts(auxes, counts);
flattened = getUncommon(counts, mains.size());
}
}
expected.put(script, flattened.freeze());
}
for (Entry<String, UnicodeSet> entry : expected.entrySet()) {
String script = entry.getKey();
UnicodeSet flattened = entry.getValue();
// now compare to what we get from the cached file, to make sure the latter is up to
// date
if (!assertEquals(
script,
flattened.toPattern(false),
ScriptToExemplars.getExemplars(script).toPattern(false))) {
problemScripts.add(script);
}
}
if (!problemScripts.isEmpty()) {
warnln(
"Adjust the data in scriptToExemplars.txt. Use -DSHOW_EXEMPLARS to get a fresh copy, or reset to expected value for: "
+ problemScripts);
// With SHOW_EXEMPLARS set, print every non-empty expected entry as
// "script ;<TAB>size ;<TAB>pattern".
if (SHOW_EXEMPLARS) {
for (Entry<String, UnicodeSet> entry : expected.entrySet()) {
String script = entry.getKey();
UnicodeSet flattened = entry.getValue();
if (!flattened.isEmpty()) {
System.out.println(
script
+ " ;\t"
+ flattened.size()
+ " ;\t"
+ flattened.toPattern(false));
}
}
}
}
}
/**
 * Characters accepted in main/aux exemplar sets by checkSet: the properties L, M, and Cf, plus
 * the middle dot (·). Exemplar items containing anything else are treated as unexpected.
 */
static final UnicodeSet MAIN_AUX_EXPECTED = new UnicodeSet("[\\p{L}\\p{M}\\p{Cf}·]").freeze();
/**
 * Separates the unexpected items of an exemplar set from the rest. An item is unexpected
 * when {@code MAIN_AUX_EXPECTED} does not contain all of it.
 *
 * @param title label used in the warning ("main" or "aux" at the call sites)
 * @param locale locale the exemplar set belongs to, used in the warning
 * @param main the exemplar set to check; it is not modified
 * @param collected receives all unexpected items that were found
 * @return a copy of {@code main} without the unexpected items, passed through {@code
 *     CldrUtility.flatten}
 */
private UnicodeSet checkSet(
        String title, String locale, UnicodeSet main, UnicodeSet collected) {
    final UnicodeSet unexpected = new UnicodeSet();
    for (String item : main) {
        if (MAIN_AUX_EXPECTED.containsAll(item)) {
            continue;
        }
        unexpected.add(item);
    }
    if (!unexpected.isEmpty()) {
        // Per-locale details are only shown on request; the caller warns once overall.
        if (SHOW_EXEMPLARS) {
            final String where = "\t" + title + "\tLocale\t" + locale;
            final String what =
                    "\thas "
                            + unexpected.size()
                            + " unexpected exemplar characters:\t"
                            + unexpected.toPattern(false);
            warnln(where + what);
        }
        collected.addAll(unexpected);
    }
    final UnicodeSet cleaned = new UnicodeSet(main);
    cleaned.removeAll(unexpected);
    return CldrUtility.flatten(cleaned);
}
/**
 * Collects the items whose count differs from {@code size}; an item counted {@code size}
 * times is common to all locales and is left out. The candidate items come from flattening
 * the complete key set of {@code counts}.
 *
 * @param counts occurrence count per item
 * @param size the count that marks an item as common to all locales
 * @return the frozen set of items that are not common to all
 */
private UnicodeSet getUncommon(UnicodeMap<Integer> counts, int size) {
    final UnicodeSet uncommon = new UnicodeSet();
    // we flatten against the whole set
    for (String item : CldrUtility.flatten(counts.keySet())) {
        final int seen = counts.get(item);
        if (seen == size) {
            continue;
        }
        uncommon.add(item);
    }
    return uncommon.freeze();
}
/**
 * Adds one to the count of each item for every set in {@code usets} that contains it; an
 * item not yet present in {@code counts} starts at 1.
 *
 * @param usets the sets whose items are counted
 * @param counts the running tally, updated in place
 */
private void getCounts(Collection<UnicodeSet> usets, UnicodeMap<Integer> counts) {
    for (UnicodeSet exemplars : usets) {
        for (String item : exemplars) {
            final Integer previous = counts.get(item);
            counts.put(item, previous == null ? 1 : previous + 1);
        }
    }
}
/**
 * Checks that the likely subtags contain an {@code und_<script>} entry for every script used
 * by an available CLDR locale, and an {@code und_<region>} entry for every regular region
 * and every region used by an available locale. For each missing region entry a suggested
 * {@code <likelySubtag>} line, based on the language with the largest literate population in
 * that region, is printed to standard output.
 */
public void testUndAllScriptsAndRegions() {
    Set<String> regions = new TreeSet<>();
    Set<String> scripts = new TreeSet<>();
    Set<String> regularCountries =
            VALIDITY.getStatusToCodes(LstrType.region).get(Status.regular);
    // Macroregions are currently not required; to include them, use
    // Validity.getInstance().getStatusToCodes(LstrType.region).get(Status.macroregion)
    Set<String> macroRegions = Set.of();
    regions.addAll(Sets.union(regularCountries, macroRegions));
    // for Scripts, just test the ones in CLDR
    for (String localeString : CLDR_CONFIG.getCldrFactory().getAvailable()) {
        if (localeString.equals("root")) {
            continue;
        }
        CLDRLocale cLocale = CLDRLocale.getInstance(localeString);
        final String script = cLocale.getScript();
        if (script.equals("Dsrt")) {
            continue; // toy script
        }
        final String country = cLocale.getCountry();
        if (!country.isEmpty() && !country.equals("001")) {
            regions.add(country);
        }
        if (!script.isEmpty()) {
            scripts.add(script);
        }
    }
    for (String script : scripts) {
        // For Latn the entry that is looked up is plain "und" rather than "und_Latn".
        final String key = script.equals("Latn") ? "und" : "und_" + script;
        assertTrue("contains und_" + script, likely.containsKey(key));
    }
    LanguageTagParser ltp = new LanguageTagParser();
    Set<String> possibleFixes = new TreeSet<>();
    for (String region : regions) {
        final String undRegion = "und_" + region;
        if (region.equals("150") && likely.containsKey("und")) {
            continue; // skip
        }
        if (assertTrue("contains und_" + region, likely.containsKey(undRegion))) {
            continue;
        }
        // Entry is missing: suggest the language with the largest literate population.
        Set<String> languages =
                SUPPLEMENTAL_DATA_INFO.getLanguagesForTerritoryWithPopulationData(region);
        double biggest = -1;
        String biggestLang = null;
        for (String language : languages) {
            PopulationData popData =
                    SUPPLEMENTAL_DATA_INFO.getLanguageAndTerritoryPopulationData(
                            language, region);
            if (popData.getLiteratePopulation() > biggest) {
                biggest = popData.getLiteratePopulation();
                biggestLang = language;
            }
        }
        if (biggestLang != null) {
            ltp.set(biggestLang);
            if (ltp.getScript().isEmpty()) {
                // Use the language's own likely expansion to supply the script, when there
                // is one; otherwise keep the tag as parsed instead of passing null on.
                String biggestMax = likely.get(biggestLang);
                if (biggestMax != null) {
                    ltp.set(biggestMax);
                }
            }
            ltp.setRegion(region);
            possibleFixes.add(
                    "<likelySubtag from=\"" + undRegion + "\" to=\"" + ltp + "\"/>");
        }
    }
    // Only print when there is something to suggest.
    if (!possibleFixes.isEmpty()) {
        System.out.println("\t\t" + Joiner.on("\n\t\t").join(possibleFixes));
    }
}
/**
 * Checks that every target of the likely subtags uses only language, script, and region codes
 * whose validity status is "regular". Offending codes are grouped by their actual status and
 * reported as warnings; a code for which no status can be found at all is reported as an
 * error. Targets whose language is one of a few known legacy codes are skipped entirely.
 */
public void testToAttributeValidityStatus() {
    Set<String> okLanguages = VALIDITY.getStatusToCodes(LstrType.language).get(Status.regular);
    Set<String> okScripts = VALIDITY.getStatusToCodes(LstrType.script).get(Status.regular);
    Set<String> okRegions = VALIDITY.getStatusToCodes(LstrType.region).get(Status.regular);
    Multimap<String, String> badFieldsToLocales = TreeMultimap.create();
    Set<String> knownExceptions = Set.of("in", "iw", "ji", "jw", "mo", "tl");
    for (String s : likely.values()) {
        CLDRLocale cLocale = CLDRLocale.getInstance(s);
        final String language = cLocale.getLanguage();
        final String script = cLocale.getScript();
        final String region = cLocale.getCountry();
        if (!okLanguages.contains(language)) {
            if (knownExceptions.contains(language)) {
                continue; // skips the script and region checks for this target as well
            }
            badFieldsToLocales.put(language, s);
        }
        if (!okScripts.contains(script)) {
            badFieldsToLocales.put(script, s);
        }
        if (!okRegions.contains(region)) {
            badFieldsToLocales.put(region, s);
        }
    }
    if (!badFieldsToLocales.isEmpty()) {
        Multimap<Status, String> statusToExamples = TreeMultimap.create();
        for (String field : badFieldsToLocales.keySet()) {
            // Look the code up as a language first, then as a script, then as a region.
            Status status = VALIDITY.getCodeToStatus(LstrType.language).get(field);
            if (status == null) {
                status = VALIDITY.getCodeToStatus(LstrType.script).get(field);
            }
            if (status == null) {
                status = VALIDITY.getCodeToStatus(LstrType.region).get(field);
            }
            if (status == null) {
                // A naturally ordered TreeMultimap rejects null keys, so report the code
                // here instead of letting the put below fail with a NullPointerException.
                errln(
                        "No validity status found for '"
                                + field
                                + "' used in "
                                + badFieldsToLocales.get(field));
                continue;
            }
            statusToExamples.put(status, field);
        }
        for (Entry<Status, Collection<String>> entry : statusToExamples.asMap().entrySet()) {
            warnln("Bad status=" + entry.getKey() + " for " + entry.getValue());
        }
    }
}
/**
 * Looks for mapping lines in likelySubtags.xml that could be dropped without changing any
 * result. <br>
 * For example, given the mappings below, #2 and #3 are superfluous because the algorithm
 * produces them anyway:
 *
 * <ol>
 *   <li>ll => ll_Sss1_R1
 *   <li>ll_Sss2 => ll_Sss2_RR
 *   <li>ll_R2 => ll_Ssss_R2
 * </ol>
 *
 * whereas these are not superfluous:
 *
 * <ol>
 *   <li>ll_Sss2 => ll_Sss2_R3
 *   <li>ll_R2 => ll_Sss3_R2
 * </ol>
 *
 * The check works per language: it builds probe tags from all scripts and regions mentioned
 * for that language (plus "Egyp", "AQ", and the empty subtag), and a key is considered
 * omittable when removing it leaves the maximization of every probe unchanged. At most one
 * omittable key is reported per language.
 */
public void testSuperfluous() {
    final Map<String, String> origins = SUPPLEMENTAL_DATA_INFO.getLikelyOrigins();
    final LanguageTagParser parser = new LanguageTagParser();

    // Group the mappings by the language of their key, ignoring "und".
    final TreeMap<String, TreeMap<String, String>> langToLikelySubset = new TreeMap<>();
    for (Entry<String, String> mapping : likely.entrySet()) {
        final String lang = parser.set(mapping.getKey()).getLanguage();
        if (!lang.equals("und")) {
            langToLikelySubset
                    .computeIfAbsent(lang, unused -> new TreeMap<>())
                    .put(mapping.getKey(), mapping.getValue());
        }
    }

    boolean headerPending = true;
    for (Entry<String, TreeMap<String, String>> langAndMap : langToLikelySubset.entrySet()) {
        final Map<String, String> goldenMap = ImmutableMap.copyOf(langAndMap.getValue());
        if (goldenMap.size() == 1) {
            continue;
        }

        // Gather the scripts and regions to probe with.
        final Set<String> scriptPool = new TreeSet<>(Arrays.asList("Egyp", ""));
        final Set<String> regionPool = new TreeSet<>(Arrays.asList("AQ", ""));
        for (String tag : Sets.union(goldenMap.keySet(), new TreeSet<>(goldenMap.values()))) {
            scriptPool.add(parser.set(tag).getScript());
            regionPool.add(parser.getRegion());
        }
        final Set<String> scripts = ImmutableSet.copyOf(scriptPool);
        final Set<String> regions = ImmutableSet.copyOf(regionPool);

        // Build every language_script_region combination as probe data.
        final TreeSet<String> probeData = new TreeSet<>();
        parser.setLanguage(langAndMap.getKey());
        for (String script : scripts) {
            parser.setScript(script);
            for (String region : regions) {
                parser.setRegion(region);
                probeData.add(parser.toString());
            }
        }

        // Find the first key (other than the bare language) whose omission changes nothing.
        String omittableKey = null;
        for (String candidate : goldenMap.keySet()) {
            if (!candidate.contains("_")) {
                continue;
            }
            final TreeMap<String, String> mapWithOmittedKey = new TreeMap<>(goldenMap);
            mapWithOmittedKey.remove(candidate);
            final boolean makesADifference =
                    probeData.stream()
                            .anyMatch(
                                    probe ->
                                            !Objects.equal(
                                                    LikelySubtags.maximize(probe, goldenMap),
                                                    LikelySubtags.maximize(
                                                            probe, mapWithOmittedKey)));
            if (!makesADifference) {
                omittableKey = candidate;
                break;
            }
        }
        // NOTE: there may be more than one, but it is sufficient to find one.
        if (omittableKey == null) {
            continue;
        }

        final String origin = origins.get(omittableKey);
        if (origin != null) { // only check the non-sil for now
            logKnownIssue("CLDR-17084", "Remove superfluous lines in likelySubtags.txt");
            continue;
        }
        if (headerPending) {
            warnln("\tMaps\tKey to omit\tvalue\torigin");
            headerPending = false;
        }
        // origin is always null at this point, so the origin column is left empty.
        final String report =
                "\t"
                        + goldenMap
                        + "\t"
                        + omittableKey
                        + "\t"
                        + goldenMap.get(omittableKey)
                        + "\t"
                        + ""
                        + "\t";
        assertFalse(report, true);
    }
}
}