// Source blob: e5743ed4c63d6f58060c5ad2b84f209a1e8a394c
package org.unicode.cldr.test;
import com.google.common.base.Joiner;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.util.ULocale;
import java.util.Collections;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import org.unicode.cldr.draft.ScriptMetadata;
import org.unicode.cldr.draft.ScriptMetadata.Info;
import org.unicode.cldr.draft.ScriptMetadata.Trinary;
import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
import org.unicode.cldr.tool.LikelySubtags;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRURLS;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.PathStarrer;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.RegexLookup;
import org.unicode.cldr.util.SpecialLocales;
public class CheckConsistentCasing extends FactoryCheckCLDR {
// Turns on the "Paths skipped" diagnostics in getSamples. Read via CldrUtility.getProperty;
// presumably the second argument (false) is the default when "DEBUG" is unset — confirm.
private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false);
// In getSamples, one casing must occur at least this many times as often as the opposite
// casing before a category is classified as that casing.
private static final double MIN_FACTOR = 2.5;
// remember to add this class to the list in CheckCLDR.getCheckAll
// to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.*
// -t.*Currencies.*
// NOTE(review): uLocale and breaker are not assigned or read anywhere in the code visible
// here; they are package-visible, so confirm there are no external users before removing.
ULocale uLocale = null;
BreakIterator breaker = null;
// ID of the locale currently being checked; set in handleSetCldrFileToCheck.
private String locale;
// Provider of per-locale casing data; created in the constructor.
CasingInfo casingInfo;
// True when a non-empty casing map was loaded for the current locale.
private boolean hasCasingInfo;
/**
 * Creates the check and the {@link CasingInfo} instance it reads casing data from.
 *
 * @param factory passed to the superclass and used to build the {@code CasingInfo}
 */
public CheckConsistentCasing(Factory factory) {
    super(factory);
    this.casingInfo = new CasingInfo(factory);
}
/**
 * Prepares this check for a new file: records the locale ID and loads the expected casing per
 * category into {@code types}.
 *
 * <p>Casing data is requested only when the locale's likely script is reported as having
 * case. In every other situation, and when loading throws, an empty map is used. When nothing
 * was loaded and the locale is not a scratch locale, a warning is appended to {@code
 * possibleErrors}.
 *
 * @param cldrFileToCheck the file about to be checked; when null nothing is done
 * @param options passed through to the superclass
 * @param possibleErrors receives a warning when no casing info could be loaded
 * @return this
 */
@Override
public CheckCLDR handleSetCldrFileToCheck(
        CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors) {
    if (cldrFileToCheck == null) {
        return this;
    }
    super.handleSetCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
    locale = cldrFileToCheck.getLocaleID();

    // Two-step lookup (locale -> likely script -> script info), because
    // ScriptMetadata.getInfo() returns null in some instances when given the locale directly.
    final String likelyScript = new LikelySubtags().getLikelyScript(locale);
    final Info scriptInfo = ScriptMetadata.getInfo(likelyScript);
    final boolean scriptHasCase = scriptInfo != null && scriptInfo.hasCase == Trinary.YES;

    // Start from an empty map so that "no case" and "loading failed" look the same downstream.
    Map<Category, CasingTypeAndErrFlag> loaded = Collections.emptyMap();
    if (scriptHasCase) {
        try {
            loaded = casingInfo.getLocaleCasing(locale);
        } catch (Exception ignored) {
            // Best effort: keep the empty map; the warning below reports the missing data.
            loaded = Collections.emptyMap();
        }
    }
    types = loaded;

    // The loader may hand back null, so treat null and empty alike.
    final boolean nothingLoaded = types == null || types.isEmpty();
    if (nothingLoaded && !SpecialLocales.isScratchLocale(locale)) {
        final CheckStatus warning =
                new CheckStatus()
                        .setCause(this)
                        .setMainType(CheckStatus.warningType)
                        .setSubtype(Subtype.incorrectCasing)
                        .setMessage("Could not load casing info for {0}", locale);
        possibleErrors.add(warning);
    }
    hasCasingInfo = !nothingLoaded;
    return this;
}
// If you don't need any file initialization or postprocessing, you only need this one routine
/**
 * Checks the casing of a single value against the casing expected for its category.
 *
 * <p>Only values that come from the locale being checked itself (not inherited), are
 * non-empty, and whose path maps to a {@link Category} are examined.
 *
 * @param path the distinguishing path of the value
 * @param fullPath the full path; a null full path means the path is skipped
 * @param value the value to examine
 * @param options passed through to the casing check
 * @param result receives any warning or error produced
 * @return this
 */
@Override
public CheckCLDR handleCheck(
        String path, String fullPath, String value, Options options, List<CheckStatus> result) {
    // Quick rejects first: they help performance because most paths are not of interest.
    if (fullPath == null) {
        return this; // skip paths that we don't have
    }
    if (!accept(result)) {
        return this; // causes hasCasingInfo to be calculated
    }
    if (!hasCasingInfo) {
        return this;
    }

    final String sourceLocale = getCldrFileToCheck().getSourceLocaleID(path, null);
    if (!sourceLocale.equals(locale) || value == null || value.isEmpty()) {
        return this;
    }

    final Category bucket = getCategory(path);
    if (bucket != null) {
        checkConsistentCasing(bucket, path, fullPath, value, options, result);
    }
    return this;
}
// Matcher for a numbered placeholder such as {0}. In the code visible here it is referenced
// only from commented-out logic in CasingType.from.
// NOTE(review): java.util.regex.Matcher instances are not safe for concurrent use; confirm
// that nothing else shares this static instance across threads.
static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher("");
/** Classification of a string by the case of its first meaningful character. */
public enum CasingType {
    titlecase,
    lowercase,
    other;

    /**
     * Determines the casing of {@code s} from its first meaningful character.
     *
     * <p>Code points are scanned from the start. A lowercase letter yields {@code lowercase};
     * an uppercase or titlecase letter yields {@code titlecase}; any other letter, number or
     * symbol yields {@code other}. Everything else (whitespace, punctuation, etc.) is skipped.
     *
     * @param s the string to classify; may be null
     * @return the casing type, or {@code other} for a null or empty string or one with no
     *     meaningful character
     */
    public static CasingType from(String s) {
        if (s == null || s.isEmpty()) {
            return other;
        }
        final int length = s.length();
        int index = 0;
        while (index < length) {
            final int codePoint = s.codePointAt(index);
            index += Character.charCount(codePoint);
            // Placeholders such as {0} used to be skipped here, but it works better to have
            // them be 'other', so they are classified like any other characters.
            switch (UCharacter.getType(codePoint)) {
                case UCharacter.LOWERCASE_LETTER:
                    return lowercase;
                case UCharacter.UPPERCASE_LETTER:
                case UCharacter.TITLECASE_LETTER:
                    return titlecase;
                // for other letters / numbers / symbols, return other
                case UCharacter.OTHER_LETTER:
                case UCharacter.DECIMAL_DIGIT_NUMBER:
                case UCharacter.LETTER_NUMBER:
                case UCharacter.OTHER_NUMBER:
                case UCharacter.MATH_SYMBOL:
                case UCharacter.CURRENCY_SYMBOL:
                case UCharacter.MODIFIER_SYMBOL:
                case UCharacter.OTHER_SYMBOL:
                    return other;
                default:
                    // ignore everything else (whitespace, punctuation, etc) and keep going
                    break;
            }
        }
        return other;
    }

    /**
     * Tells whether this casing is compatible with another one.
     *
     * @param otherType the casing to compare with; may be null
     * @return true if {@code otherType} is null, if either side is {@code other}, or if both
     *     are identical
     */
    public boolean worksWith(CasingType otherType) {
        if (otherType == null || otherType == this) {
            return true;
        }
        return this == CasingType.other || otherType == CasingType.other;
    }
}
/**
 * Pairs an expected {@link CasingType} with a flag telling whether a mismatch is reported as
 * an error ({@code true}) or as a warning ({@code false}).
 */
public enum CasingTypeAndErrFlag {
    titlecase_mismatchWarn(CasingType.titlecase, false),
    titlecase_mismatchErr(CasingType.titlecase, true),
    lowercase_mismatchWarn(CasingType.lowercase, false),
    lowercase_mismatchErr(CasingType.lowercase, true),
    other_mismatchWarn(CasingType.other, false),
    other_mismatchErr(CasingType.other, true);

    /** The casing expected for values of a category. */
    private final CasingType expectedCasing;

    /** When true, a mismatch is forced to be an error instead of a warning. */
    private final boolean mismatchIsError;

    CasingTypeAndErrFlag(CasingType expectedCasing, boolean mismatchIsError) {
        this.expectedCasing = expectedCasing;
        this.mismatchIsError = mismatchIsError;
    }

    /** Returns the expected casing. */
    public CasingType type() {
        return expectedCasing;
    }

    /** Returns true when a mismatch must be reported as an error rather than a warning. */
    public boolean flag() {
        return mismatchIsError;
    }
}
/**
 * Maps path patterns to the {@link Category} whose casing rules apply to matching paths.
 *
 * <p>NOTE(review): within each group the more specific pattern is registered before the more
 * general one (e.g. narrow, then format, then the bare element), which suggests the lookup
 * returns the first registered match — confirm against RegexLookup before reordering.
 */
static final RegexLookup<Category> pathToBucket =
        new RegexLookup<Category>()
                .add("//ldml/localeDisplayNames/languages/language", Category.language)
                .add("//ldml/localeDisplayNames/scripts/script", Category.script)
                .add("//ldml/localeDisplayNames/territories/territory", Category.territory)
                .add("//ldml/localeDisplayNames/variants/variant", Category.variant)
                .add("//ldml/localeDisplayNames/keys/key", Category.key)
                .add("//ldml/localeDisplayNames/types/type", Category.keyValue)
                .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow)
                .add(
                        "//ldml/dates/calendars/calendar.*/months.*format",
                        Category.month_format_except_narrow)
                .add(
                        "//ldml/dates/calendars/calendar.*/months",
                        Category.month_standalone_except_narrow)
                .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow)
                .add(
                        "//ldml/dates/calendars/calendar.*/days.*format",
                        Category.day_format_except_narrow)
                .add(
                        "//ldml/dates/calendars/calendar.*/days",
                        Category.day_standalone_except_narrow)
                .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow)
                .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr)
                .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name)
                .add(
                        "//ldml/dates/calendars/calendar.*/quarters.*narrow",
                        Category.quarter_narrow)
                .add(
                        "//ldml/dates/calendars/calendar.*/quarters.*abbreviated",
                        Category.quarter_abbreviated)
                .add(
                        "//ldml/dates/calendars/calendar.*/quarters.*format",
                        Category.quarter_format_wide)
                .add(
                        "//ldml/dates/calendars/calendar.*/quarters",
                        Category.quarter_standalone_wide)
                .add("//ldml/.*/relative", Category.relative)
                .add("//ldml/dates/fields", Category.calendar_field)
                .add(
                        "//ldml/dates/timeZoneNames/zone.*/exemplarCity",
                        Category.zone_exemplarCity)
                .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short)
                .add("//ldml/dates/timeZoneNames/zone", Category.zone_long)
                .add(
                        "//ldml/dates/timeZoneNames/metazone.*/commonlyUsed",
                        Category.NOT_USED) // just to remove them from the other cases
                // Short metazone names get their own category, mirroring zone_short above;
                // previously they were bucketed as metazone_long, leaving metazone_short unused.
                .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_short)
                .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long)
                .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol)
                .add(
                        "//ldml/numbers/currencies/currency.*/displayName.*@count",
                        Category.currencyName_count)
                .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName)
                .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative)
                .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern)
        // Candidate paths that are not (all) bucketed yet:
        // ldml/localeDisplayNames/keys/key[@type=".*"]
        // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"]
        // ldml/localeDisplayNames/transformNames/transformName[@type=".*"]
        ;
// Expected casing (with mismatch severity) per category for the locale being checked.
// Replaced in handleSetCldrFileToCheck and read in checkConsistentCasing; it may be set to
// null by the loader, hence the null checks at the points of use.
Map<Category, CasingTypeAndErrFlag> types = new EnumMap<>(Category.class);
/**
 * Buckets of paths whose values are expected to share the same casing. Paths are assigned to
 * a bucket through {@code pathToBucket}.
 *
 * <p>The declaration order determines ordinals and therefore the iteration order of {@code
 * values()} and of EnumMaps keyed by this enum; do not reorder casually.
 */
public enum Category {
language,
script,
territory,
variant,
keyValue,
month_narrow,
month_format_except_narrow,
month_standalone_except_narrow,
day_narrow,
day_format_except_narrow,
day_standalone_except_narrow,
era_narrow,
era_abbr,
era_name,
quarter_narrow,
quarter_abbreviated,
quarter_format_wide,
quarter_standalone_wide,
calendar_field,
zone_exemplarCity,
zone_short,
zone_long,
// Bucket for paths that are matched only to keep them out of the other buckets;
// getSamples does not produce a result for it.
NOT_USED,
metazone_short,
metazone_long,
symbol,
currencyName_count,
currencyName,
relative,
unit_pattern,
key;
}
/**
 * Looks up the casing category for a path via {@code pathToBucket}.
 *
 * <p>Example paths:
 *
 * <pre>
 * //ldml/numbers/currencies/currency[@type="ADP"]/displayName
 * //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"]
 * //ldml/numbers/currencies/currency[@type="BYB"]/symbol
 * </pre>
 *
 * @param path the path to classify
 * @return whatever the lookup returns for the path; callers treat null as "no category"
 */
static Category getCategory(String path) {
    final Category bucket = pathToBucket.get(path);
    return bucket;
}
/**
 * Calculates casing information using data from the specified CLDRFile.
 *
 * <p>For every path that has a category, is its own winning path, and (unless the file is
 * root) does not come from "root" or "code-fallback", the casing of the non-empty value is
 * tallied per category. Each category is then assigned its predominant casing.
 *
 * @param resolved the resolved CLDRFile to calculate casing information from
 * @return a map from every category except {@code NOT_USED} to its predominant casing
 */
public static Map<Category, CasingType> getSamples(CLDRFile resolved) {
    // One tally per category, keyed by enum for type safety.
    final Map<Category, Counter<CasingType>> tallies = new EnumMap<>(Category.class);
    for (Category each : Category.values()) {
        tallies.put(each, new Counter<CasingType>());
    }

    final PathStarrer pathStarrer = new PathStarrer();
    final boolean rootLocale = "root".equals(resolved.getLocaleID());
    // Only collected when debugging: starred forms of paths that have no category.
    final Set<String> skippedPaths = DEBUG ? new TreeSet<String>() : null;

    for (String path : resolved) {
        if (!rootLocale) {
            // Inherited fallback values say nothing about this locale's own casing.
            final String sourceLocale = resolved.getSourceLocaleID(path, null);
            if (sourceLocale.equals("root") || sourceLocale.equals("code-fallback")) {
                continue;
            }
        }
        if (!resolved.getWinningPath(path).equals(path)) {
            continue;
        }
        final Category bucket = getCategory(path);
        if (bucket == null) {
            if (DEBUG) {
                skippedPaths.add(pathStarrer.set(path));
            }
            continue;
        }
        final String value = resolved.getStringValue(path);
        if (value != null && value.length() > 0) {
            tallies.get(bucket).add(CasingType.from(value), 1);
        }
    }

    final Map<Category, CasingType> predominant = new EnumMap<>(Category.class);
    for (Category each : Category.values()) {
        if (each != Category.NOT_USED) {
            final Counter<CasingType> tally = tallies.get(each);
            predominant.put(
                    each,
                    dominantCasing(
                            tally.getCount(CasingType.lowercase),
                            tally.getCount(CasingType.titlecase),
                            tally.getCount(CasingType.other)));
        }
    }

    if (DEBUG && !skippedPaths.isEmpty()) {
        System.out.println("Paths skipped:\n" + Joiner.on("\n").join(skippedPaths));
    }
    return predominant;
}

/**
 * Picks lowercase or titlecase when it outnumbers the opposite casing by at least MIN_FACTOR
 * and is at least as frequent as 'other'; otherwise (or with no cased values) picks other.
 */
private static CasingType dominantCasing(long lowerCount, long upperCount, long otherCount) {
    if (lowerCount + upperCount == 0) {
        return CasingType.other;
    }
    if (lowerCount >= upperCount * MIN_FACTOR && lowerCount >= otherCount) {
        return CasingType.lowercase;
    }
    if (upperCount >= lowerCount * MIN_FACTOR && upperCount >= otherCount) {
        return CasingType.titlecase;
    }
    return CasingType.other;
}
// Message pattern for a casing mismatch. As filled in by checkConsistentCasing:
// {0} = the value, {1} = the casing found for the value, {2} = the category,
// {3} = the casing expected for the category.
private static final String CASE_WARNING =
"The first letter of 〈{0}〉 is {1}, which differs from what is expected "
+ "for the {2} category: that almost all values be {3}.\n\n"
+ "For guidance, see "
+ CLDRURLS.CAPITALIZATION_URL
+ ". "
+ "If this warning is wrong, please file a ticket at "
+ CLDRURLS.CLDR_NEWTICKET_URL
+ ".";
/**
 * Compares the casing of {@code value} with the casing expected for {@code category} and
 * appends a warning or error to {@code result} when the two are incompatible.
 *
 * <p>A category without an entry in {@code types} is treated as expecting "other" with
 * warning severity. Nothing is done when {@code types} is null.
 *
 * @param category the category of the path
 * @param path the distinguishing path (not used here)
 * @param fullPath the full path (not used here)
 * @param value the value whose casing is examined
 * @param options the check options (not used here)
 * @param result receives the status when the casing does not fit
 */
private void checkConsistentCasing(
        Category category,
        String path,
        String fullPath,
        String value,
        Options options,
        List<CheckStatus> result) {
    if (types == null) {
        return; // no casing data at all; avoids an NPE
    }
    final CasingType actual = CasingType.from(value);
    CasingTypeAndErrFlag expected = types.get(category);
    if (expected == null) {
        expected = CasingTypeAndErrFlag.other_mismatchWarn;
    }
    if (actual.worksWith(expected.type())) {
        return;
    }
    // The per-category flag decides whether the mismatch is an error or only a warning.
    final CheckStatus status =
            new CheckStatus()
                    .setCause(this)
                    .setMainType(
                            expected.flag() ? CheckStatus.errorType : CheckStatus.warningType)
                    .setSubtype(Subtype.incorrectCasing)
                    // the message is a MessageFormat pattern followed by its arguments
                    .setMessage(CASE_WARNING, value, actual, category, expected.type());
    result.add(status);
}
}