blob: 051b0ced82499e5aaa4a910f6ade18d103079ae1 [file] [log] [blame]
package org.unicode.cldr.unittest;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.unicode.cldr.test.CoverageLevel2;
import org.unicode.cldr.tool.MinimizeRegex;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRLocale;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.Level;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.Organization;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;
public class TestCLDRLocaleCoverage extends TestFmwkPlus {
// StandardCodes instance used below to look up per-organization locale coverage.
private static StandardCodes sc = StandardCodes.make();
// CLDR configuration object; the supplemental data, English file and factory used in this class come from it.
private static final CLDRConfig CLDRCONFIG = CLDRConfig.getInstance();
// Supplemental data (used below for territory/language population data).
private static final SupplementalDataInfo SDI = CLDRCONFIG.getSupplementalDataInfo();
// English CLDR file, used to attach readable names to locale codes in messages.
private static final CLDRFile ENGLISH = CLDRCONFIG.getEnglish();
public static void main(String[] args) {
new TestCLDRLocaleCoverage().run(args);
/**
 * Cross-checks the set of language codes whose names should be translated
 * (languages parsed from the available main locales plus a hand-maintained list of additions)
 * against the language validity data and against the coverage levels assigned to
 * language-name paths, emitting warnings and log output along the way.
 *
 * NOTE(review): several statements look incomplete in this view — nothing visibly populates
 * mainLocales, localesForNames, coverageLocales or official1MSet, and several blocks are not
 * visibly closed. Confirm against the full source before relying on these comments.
 */
public void TestLanguageNameCoverage() {
// mainLocales has the locales in common/main, which is basically the set in attributeValueValidity.xml $language..
// We add in additionsToTranslate below the set in attributeValueValidity.xml $languageExceptions
// (both sets are included in SDI.getCLDRLanguageCodes() but we do not use that until later).
Set<String> additionsToTranslate = ImmutableSortedSet.of("zxx", "mul",
"ab", "ace", "ada", "ady", "ain", "ale", "alt", "an", "anp", "arn", "arp", "ars", "atj", "av", "awa", "ay",
"ba", "ban", "bho", "bi", "bin", "bla", "bug", "byn",
"cay", "ch", "chk", "chm", "cho", "chp", "chy", "clc", "co", "crg", "crj", "crk", "crl", "crm", "crr", "csw", "cv",
"dak", "dar", "dgr", "dv", "dzg",
"efi", "eka",
"fj", "fon", "frc",
"gaa", "gez", "gil", "gn", "gor", "gwi",
"hai", "hax", "hil", "hmn", "ht", "hup", "hur", "hz",
"iba", "ibb", "ikt", "ilo", "inh", "io", "iu",
"kac", "kaj", "kbd", "kcg", "kfo", "kha", "kj", "kmb", "kpe", "kr", "krc", "krl", "kru", "kum", "kv", "kwk",
"la", "lad", "lez", "li", "lil", "lou", "loz", "lsm", "lua", "lun", "lus",
"mad", "mag", "mak", "mdf", "men", "mh", "mic", "min", "moe", "moh", "mos", "mus", "mwl", "myv",
"na", "nap", "new", "ng", "nia", "niu", "nog", "nqo", "nr", "nso", "nv", "ny",
"oc", "ojb", "ojc", "ojs", "ojw", "oka",
"pag", "pam", "pap", "pau", "pqm",
"rap", "rar", "rhg", "rup",
"sad", "sba", "scn", "sco", "shn", "slh", "sm", "snk", "srn", "ss", "st", "str", "suk", "swb", "syr",
"tce", "tem", "tet", "tgx", "tht", "tig", "tlh", "tli", "tn", "tpi", "trv", "ts", "ttm", "tum", "tvl", "ty", "tyv",
"udm", "umb",
"wa", "wal", "war", "wuu",
"zun", "zza" );
// Warn with one line per added code, each followed by its English name in parentheses.
// NOTE(review): the argument of join(...) below looks truncated in this view (the lambda has no source/parameter).
warnln("Locale names added for translation; revisit each release:\n"
+ Joiner.on("\n")
.join( -> x + "\t(" + ENGLISH.getName(x) + ")").collect(Collectors.toList())));
// Language code -> validity status, then inverted into status -> sorted codes.
Map<String, Status> validity = Validity.getInstance().getCodeToStatus(LstrType.language);
Multimap<Status, String> statusToLang = Multimaps.invertFrom(Multimaps.forMap(validity), TreeMultimap.create());
Set<String> regular = (Set<String>) statusToLang.get(Status.regular);
// The "regular" codes plus the three codes und, zxx and mul.
Set<String> regularPlus = ImmutableSet.<String>builder().addAll(regular).add("und").add("zxx").add("mul").build();
Set<String> valid = validity.keySet();
Factory factory = CLDRCONFIG.getCldrFactory();
Set<String> mainLocales = new LinkedHashSet<>();
LanguageTagParser ltp = new LanguageTagParser();
// Reduce each available locale to its language subtag; "root" is represented as "und".
for (String locale : factory.getAvailableLanguages()) {
String language = ltp.set(locale).getLanguage();
if (language.equals("root")) language = "und";
mainLocales = ImmutableSet.copyOf(mainLocales);
Set<String> localesForNames = new TreeSet<>();
localesForNames = ImmutableSet.copyOf(localesForNames);
// NOTE(review): the title mentions mainLocales, but the collection actually checked is localesForNames.
assertContains("regularPlus.containsAll(mainLocales)", regularPlus, localesForNames);
// Group every valid language code by the coverage level that the "und" leveler assigns to its language-name path.
CoverageLevel2 coverageLeveler = CoverageLevel2.getInstance("und");
Multimap<Level, String> levelToLanguage = TreeMultimap.create();
for (String locale : valid) {
String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, locale);
Level level = coverageLeveler.getLevel(path);
levelToLanguage.put(level, locale);
Set<String> coverageLocales = new TreeSet<>();
for (Level level : Level.values()) {
// NOTE(review): the body of this COMPREHENSIVE check is not visible here — presumably that level is skipped; confirm.
if (level == Level.COMPREHENSIVE) {
//assertContains("mainLocales.containsAll(coverage:" + level + ")", localesForNames, levelToLanguage.get(level));
if (logKnownIssue("CLDR-15888", "modern coverage not yet updated for bgc, raj")) {
// If this fails, it is because of a mismatch between coverage and the getCLDRLanguageCodes.
// Usually a problem with coverage.
boolean showRegex = !assertContains("localesForNames.containsAll(coverageLocales)", localesForNames, coverageLocales);
showRegex |= !assertContains("coverageLocales.containsAll(localesForNames) - add to %language80 or lower under coverageLevels.xml?", coverageLocales, localesForNames);
// NOTE(review): "|| true" makes this condition always true, so the regexes below are always printed.
if (showRegex || true) {
String simplePattern = MinimizeRegex.simplePattern(localesForNames);
warnln("Plain Regex for coverage:\n" + simplePattern);
warnln("Compact Regex for coverage:\n" + MinimizeRegex.compressWith(localesForNames, new UnicodeSet("[a-z]")));
// Look for languages returned by getOfficial1M() that are not in localesForNames.
Map<String,Integer> official1M = getOfficial1M();
Set<String> official1MSet = new TreeSet<>();
for (String locale : official1M.keySet()) {
if (!localesForNames.contains(locale)) {
warnln("Official with 1M+ speakers, need investigation of literacy: " + official1MSet);
// assertContains("sdiLocales contains oldModernLocales", sdiLocales, oldModernLocales);
// assertContains("oldModernLocales contains sdiLocales", oldModernLocales, sdiLocales);
// Verbose output: each code with its English name, then the three sets grouped by first letter.
for (String locale : localesForNames) {
logln("\n" + locale + "\t" + ENGLISH.getName(locale));
logln("\nmainLocales:" + composeList(mainLocales, "\n\t", new StringBuilder()));
logln("\nadditionsToTranslate:" + composeList(additionsToTranslate, "\n\t", new StringBuilder()));
// NOTE(review): the label says oldModernLocales, but the list printed is coverageLocales.
logln("\noldModernLocales:" + composeList(coverageLocales, "\n\t", new StringBuilder()));
/**
 * Sums the literate population per language code across all territories that have
 * population data, and returns the languages whose total is at least 1,000,000.
 * Any part of the language code from the first '_' onward is dropped before summing.
 *
 * NOTE(review): the body of the OfficialStatus.unknown check is not visible in this view —
 * presumably entries with unknown status are meant to be skipped; confirm.
 *
 * @return an immutable copy of a map ordered by language code, from language code to
 *         summed literate population (narrowed to int)
 */
private Map<String,Integer> getOfficial1M() {
Counter<String> counter = new Counter<>();
for (String region : SDI.getTerritoriesWithPopulationData()) {
for (String language : SDI.getLanguagesForTerritoryWithPopulationData(region)) {
PopulationData popData = SDI.getLanguageAndTerritoryPopulationData(language, region);
OfficialStatus status = popData.getOfficialStatus();
if (status == OfficialStatus.unknown) {
// we only care about names, so drop scripts
int underbar = language.indexOf('_');
if (underbar >= 0) {
language = language.substring(0, underbar);
// The per-territory value is narrowed to int before being added to the counter.
counter.add(language, (int) popData.getLiteratePopulation());
// Keep only languages whose summed literate population reaches one million.
Map<String,Integer> result = new TreeMap<>();
for (String language : counter.keySet()) {
long litPop = counter.get(language);
if (litPop >= 1_000_000) {
result.put(language, (int)litPop);
return ImmutableMap.copyOf(result);
static final StringBuilder composeList(Iterable<String> source, String separator, StringBuilder result) {
String prefix = null;
for (String item : source) {
if (prefix == null || !item.startsWith(prefix)) {
prefix = item.substring(0,1); // only ascii
} else {
result.append(' ');
return result;
private boolean assertContains(String title, Collection<String> set, Collection<String> subset) {
boolean result = set.containsAll(subset);
if (!result) {
Set<String> temp = new LinkedHashSet<>(subset);
Set<String> temp2 = new TreeSet<>();
for (String locale : temp) {
temp2.add(locale + "\t" + ENGLISH.getName(locale));
errln(title + ": Missing:\t" + temp.size() + "\n\t" + Joiner.on("\n\t").join(temp2));
return result;
* Test whether there are any locales for the organization CLDR
public void TestCLDROrganizationPresence() {
Set<String> cldrLocales = sc.getLocaleCoverageLocales(
Organization.cldr, EnumSet.of(Level.MODERN));
assertNotNull("Expected CLDR modern locales not to be null",
assertTrue("Expected locales for CLDR, but found none.",
cldrLocales != null && !cldrLocales.isEmpty());
* Tests that cldr+special is a superset of the TC locales, with the right levels
public void TestCldrSuperset() {
Map<String, Level> apple = sc.getLocalesToLevelsFor(;
Map<String, Level> google = sc.getLocalesToLevelsFor(;
Map<String, Level> microsoft = sc.getLocalesToLevelsFor(;
Map<String, Level> special = sc.getLocalesToLevelsFor(Organization.special);
Map<String, Level> cldr = sc.getLocalesToLevelsFor(Organization.cldr);
// check that the cldr locales (+ special) have the max level of the TC locales
for (Entry<String, Level> entry : cldr.entrySet()) {
String locale = entry.getKey();
Level cldrLevel = entry.getValue();
Level appleLevel = apple.get(locale);
Level googleLevel = google.get(locale);
Level microsoftLevel = microsoft.get(locale);
Level specialLevel = special.get(locale);
// check the 8 vote count
int count = getLevelCount(appleLevel)
+ getLevelCount(googleLevel)
+ getLevelCount(microsoftLevel)
int defaultVotes = SupplementalDataInfo.getInstance().getRequiredVotes(CLDRLocale.getInstance(locale), null);
assertEquals("8 votes for " + locale + " at " + cldrLevel, count > 2 && cldrLevel.compareTo(Level.MODERN) >= 0, defaultVotes == 8);
// check the max level
Level maxLevel = Level.max(appleLevel, googleLevel, microsoftLevel, specialLevel);
assertEquals("cldr level = max for " + locale + " (" + ENGLISH.getName(locale) + ")", cldrLevel, maxLevel);
// check that the cldr locales include all of the other locale's
checkCldrContains("cldr", cldr, "apple", apple);
checkCldrContains("cldr", cldr, "google", google);
checkCldrContains("cldr", cldr, "microsoft", microsoft);
checkCldrContains("cldr", cldr, "special", apple);
// check that special doesn't overlap with TC, except for generated locales
checkDisjoint("special", special, "apple", apple);
checkDisjoint("special", special, "google", google);
checkDisjoint("special", special, "microsoft", microsoft);
private int getLevelCount(Level appleLevel) {
return appleLevel == null ? 0
: appleLevel.compareTo(Level.MODERN) >= 0 ? 1 : 0;
// The "*" wildcard key; checkCldrContains removes it from an organization's locale keys before comparing.
private static final Set<String> ANY_LOCALE_SET = ImmutableSet.of("*");
// Locales that checkDisjoint tolerates appearing in both of the maps it compares.
private static final Set<String> LOCALE_CONTAINMENT_EXCEPTIONS = ImmutableSet.of(
"sr_Latn", // auto-generated
"hi", "sr", "yue" // these are inserted by Locales.txt processing TODO don't add to special
private void checkCldrContains(String firstName, Map<String, Level> first, String otherName, Map<String, Level> other) {
assertEquals(firstName + " ⊇ " + otherName, Collections.emptySet(), Sets.difference(Sets.difference(other.keySet(), ANY_LOCALE_SET), first.keySet()));
private void checkDisjoint(String firstName, Map<String, Level> first, String otherName, Map<String, Level> other) {
assertEquals(firstName + " ⩃ " + otherName, Collections.emptySet(), Sets.difference(Sets.intersection(other.keySet(), first.keySet()), LOCALE_CONTAINMENT_EXCEPTIONS));
public void TestParentCoverage() {
for (Organization organization : sc.getLocaleCoverageOrganizations()) {
if (organization == Organization.special) {
final Map<String, Level> localesToLevels = sc.getLocalesToLevelsFor(organization);
for (Entry<String, Level> localeAndLevel : localesToLevels.entrySet()) {
String originalLevel = localeAndLevel.getKey();
Level level = localeAndLevel.getValue();
String locale = originalLevel;
while (true) {
String parent = LocaleIDParser.getParent(locale);
if (parent == null || parent.equals("root")) {
if (!parent.equals("en_001")) { // en_001 is generated later from en_GB
Level parentLevel = localesToLevels.get(parent);
+ "; locale=" + originalLevel
+ "; level=" + level
+ "; parent=" + parent
+ "; level=" + parentLevel,
parentLevel != null && parentLevel.compareTo(level) >= 0);
locale = parent;