blob: 40e25d09cc2f3b95719fe976a7d47626841c13c7 [file] [log] [blame]
package org.unicode.cldr.tool;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.Row.R2;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.unicode.cldr.tool.Option.Options;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.DraftStatus;
import org.unicode.cldr.util.CLDRFile.Status;
import org.unicode.cldr.util.CLDRLocale;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.ChainedMap;
import org.unicode.cldr.util.ChainedMap.M3;
import org.unicode.cldr.util.ChainedMap.M4;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.DtdData;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.LanguageTagCanonicalizer;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.Level;
import org.unicode.cldr.util.LocaleNames;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.PathHeader;
import org.unicode.cldr.util.PathHeader.Factory;
import org.unicode.cldr.util.PathHeader.PageId;
import org.unicode.cldr.util.PathHeader.SectionId;
import org.unicode.cldr.util.PathHeader.SurveyToolStatus;
import org.unicode.cldr.util.PathStarrer;
import org.unicode.cldr.util.RegexUtilities;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.SupplementalDataInfo.LengthFirstComparator;
import org.unicode.cldr.util.XMLFileReader;
import org.unicode.cldr.util.XPathParts;
public class ShowStarredCoverage {
static final CLDRConfig config = CLDRConfig.getInstance();
enum MyOptions {
language(".*", "it", "language to gather coverage data for"),
tag(".*", null, "gather data on language tags"),
dtdTypes(".*", "ldml", "dtdTypes, comma separated."),
pathRegex(".*", null, "If present, only matching paths"),
maxAttributes("\\d+", null, "At most this number of attributes"),
verbose(".*", "", "Verbose mode"),
// filter(".*", "en_001", "locale ancestor"),
;
// BOILERPLATE TO COPY
final Option option;
private MyOptions(String argumentPattern, String defaultArgument, String helpText) {
option = new Option(this, argumentPattern, defaultArgument, helpText);
}
static Options myOptions = new Options();
static {
for (MyOptions option : MyOptions.values()) {
myOptions.add(option, option.option);
}
}
private static Set<String> parse(String[] args, boolean showArguments) {
return myOptions.parse(MyOptions.values()[0], args, true);
}
}
static final PathStarrer pathStarrer = new PathStarrer().setSubstitutionPattern("*");
static final Factory phf = PathHeader.getFactory(config.getEnglish());
static final SupplementalDataInfo sdi = config.getSupplementalDataInfo();
static final CLDRFile ENGLISH =
config.getCommonAndSeedAndMainAndAnnotationsFactory().make("en", true);
static boolean verbose = false;
static Matcher pathRegex = null;
static int maxAttributes = 10;
static final Set<String> ALLOWED = ImmutableSet.of("main", "annotations");
public static void main(String[] args) {
MyOptions.parse(args, true);
verbose = MyOptions.verbose.option.doesOccur();
if (MyOptions.pathRegex.option.doesOccur()) {
pathRegex = Pattern.compile(MyOptions.pathRegex.option.getValue()).matcher("");
}
if (MyOptions.maxAttributes.option.doesOccur()) {
maxAttributes = Integer.parseInt(MyOptions.maxAttributes.option.getValue());
}
if (MyOptions.tag.option.doesOccur()) {
new LanguageTagCollector().getLanguageTags();
return;
}
Set<DtdType> dtdTypes = EnumSet.noneOf(DtdType.class);
String[] values = MyOptions.dtdTypes.option.getValue().split("[, ]+");
for (String value : values) {
dtdTypes.add(DtdType.valueOf(value));
}
final String fileLocale = MyOptions.language.option.getValue();
M3<Level, PathHeader, Boolean> levelToPathHeaders =
ChainedMap.of(
new TreeMap<Level, Object>(),
new TreeMap<PathHeader, Object>(),
Boolean.class);
if (verbose) {
System.out.println("№\tLevel\tStarredPath\tPH Status\t?\tAttributes");
}
for (DtdType dtdType : DtdType.values()) {
if (dtdType.getStatus() != DtdType.DtdStatus.active) continue;
if (dtdTypes != null && !dtdTypes.contains(dtdType)) {
continue;
}
for (String dir : dtdType.directories) {
if (!ALLOWED.contains(dir)) {
continue;
}
if (dtdType == DtdType.ldml) {
doLdml(dir, fileLocale, levelToPathHeaders);
} else {
doNonLdml(dtdType, dir, fileLocale, levelToPathHeaders);
}
}
}
if (!verbose) {
System.out.println("№\tLevel\tSection|Page\tStarredPath");
for (Entry<Level, Map<PathHeader, Boolean>> levelAndPathHeader : levelToPathHeaders) {
Level level = levelAndPathHeader.getKey();
Map<PathHeader, Boolean> pathHeaders2 = levelAndPathHeader.getValue();
Counter<String> codeCount = new Counter<>();
for (PathHeader ph : pathHeaders2.keySet()) {
codeCount.add(condense(ph), 1);
}
showResults("code count", level, codeCount);
}
}
}
static final Set<PageId> MainDateTimePages =
EnumSet.of(PageId.Fields, PageId.Gregorian, PageId.Generic);
private static String condense(PathHeader ph) {
// TODO Auto-generated method stub
String starredPath = pathStarrer.set(ph.getOriginalPath()).toString();
starredPath = starredPath.replace("[@alt=\"*\"]", ""); // collapse alts
SectionId sectionId = ph.getSectionId();
PageId pageId = ph.getPageId();
String category = sectionId + "|" + pageId;
switch (sectionId) {
case Core_Data:
category = sectionId.toString();
break;
case Currencies:
category =
sectionId
+ " — "
+ (starredPath.contains("@count")
? "long name"
: starredPath.contains("/symbol") ? "symbol" : "name");
break;
case DateTime:
category =
sectionId
+ " — "
+ (starredPath.contains("/displayName")
? "field labels"
: starredPath.contains("/interval")
? "intervals"
: pageId == PageId.Fields ? "relative" : "basic");
category += MainDateTimePages.contains(pageId) ? "" : " (non-greg)";
break;
case Locale_Display_Names:
category =
"Names — "
+ (starredPath.contains("/subdivision")
? "Country subdivisions"
: pageId == PageId.Territories
? "Continents & Sub~"
: pageId.toString().startsWith("Territor")
? "Countries"
: pageId.toString());
break;
case Numbers:
category =
pageId == PageId.Compact_Decimal_Formatting
? (starredPath.contains("currency") ? "Currency" : "Number")
+ " Formats — compact"
: sectionId.toString();
break;
case Misc:
category =
starredPath.contains("/annotation")
? "Emoji " + (starredPath.contains("@type") ? "names" : "keywords")
: starredPath.contains("/characterLabel")
? "Character Labels"
: PageId.LinguisticElements.toString();
break;
case Timezones:
category = sectionId.toString();
break;
case Units:
category = sectionId.toString();
break;
}
return stripParens(category + "\t" + starredPath);
}
static final Pattern PARENS = Pattern.compile("\\s*\\(.*\\)");
private static String stripParens(String label) {
if (label.contains("(")) {
String newLabel = PARENS.matcher(label).replaceAll("");
if (label.equals(newLabel)) {
System.out.println(
RegexUtilities.showMismatch(
Pattern.compile(".*" + PARENS.toString() + ".*"), label));
}
return newLabel;
} else {
return label;
}
}
private static void showResults(String title, Level level, Counter<String> counts) {
for (String key : counts.keySet()) {
long results = counts.get(key);
System.out.println(results + "\t" + level + "\t" + key);
}
}
private static void doNonLdml(
DtdType dtdType,
String dir,
String fileLocale,
M3<Level, PathHeader, Boolean> levelToPathHeaders) {
Matcher localeMatch = Pattern.compile("\\b" + fileLocale + "\\b").matcher("");
// Not keyed by locale, need to dig into data for that.
for (String file : new File(CLDRPaths.COMMON_DIRECTORY + dir).list()) {
if (!file.endsWith(".xml")) {
continue;
}
if (file.startsWith("plural")) {
int debug = 0;
}
List<Pair<String, String>> contents1;
try {
contents1 =
XMLFileReader.loadPathValues(
CLDRPaths.COMMON_DIRECTORY + dir + "/" + file,
new ArrayList<Pair<String, String>>(),
true);
} catch (Exception e) {
return;
}
DtdData dtdData = DtdData.getInstance(dtdType);
Multimap<String, String> extras = TreeMultimap.create();
for (Pair<String, String> s : contents1) {
String path = s.getFirst();
if (pathRegex != null && !pathRegex.reset(path).matches()) {
continue;
}
if (path.contains("it")) {
int debug = 0;
}
String value = s.getSecond();
XPathParts pathPlain = XPathParts.getFrozenInstance(path);
if (dtdData.isMetadata(pathPlain)) {
continue;
}
Set<String> pathForValues = dtdData.getRegularizedPaths(pathPlain, extras);
if (pathForValues != null) {
for (String pathForValue : pathForValues) {
if (!localeMatch.reset(pathForValue).find()
&& !localeMatch.reset(value).find()) {
continue;
}
PathHeader pathHeader = phf.fromPath(pathForValue);
levelToPathHeaders.put(Level.UNDETERMINED, pathHeader, true);
Splitter splitter = DtdData.getValueSplitter(pathPlain);
for (String line : splitter.split(value)) {
// special case # in transforms
if (isComment(pathPlain, line)) {
continue;
}
}
}
}
for (Entry<String, Collection<String>> entry : extras.asMap().entrySet()) {
final String extraPath = entry.getKey();
for (String value2 : entry.getValue()) {
if (!localeMatch.reset(extraPath).find()
&& !localeMatch.reset(value2).find()) {
continue;
}
final PathHeader pathHeaderExtra = phf.fromPath(extraPath);
levelToPathHeaders.put(Level.UNDETERMINED, pathHeaderExtra, true);
// final Collection<String> extraValue =
// entry.getValue();
// if (isExtraSplit(extraPath)) {
// for (String items : extraValue) {
// results.putAll(pathHeaderExtra,
// DtdData.SPACE_SPLITTER.splitToList(items));
// }
// } else {
// results.putAll(pathHeaderExtra, extraValue);
// }
}
}
}
}
}
static boolean isExtraSplit(String extraPath) {
if (extraPath.endsWith("/_type")
&& extraPath.startsWith("//supplementalData/metaZones/mapTimezones")) {
return true;
}
return false;
}
public static boolean isComment(XPathParts pathPlain, String line) {
if (pathPlain.contains("transform")) {
if (line.startsWith("#")) {
return true;
}
}
return false;
}
private static void doLdml(
String dir, String fileLocale, M3<Level, PathHeader, Boolean> levelToPathHeaders) {
Status status = new Status();
boolean isMain = "main".equals(dir);
System.out.println("directory:\t" + dir);
final org.unicode.cldr.util.Factory cldrFactory =
org.unicode.cldr.util.Factory.make(
CLDRPaths.COMMON_DIRECTORY + dir, fileLocale, DraftStatus.unconfirmed);
CLDRFile file;
try {
file =
cldrFactory.make(
fileLocale,
isMain); // bug, resolving source doesn't work without directory
} catch (Exception e) {
System.out.println(Level.UNDETERMINED + "\tNo file " + dir + "/" + fileLocale + ".xml");
return;
}
M4<Level, String, String, Boolean> levelToData =
ChainedMap.of(
new TreeMap<Level, Object>(),
new TreeMap<String, Object>(),
new TreeMap<String, Object>(),
Boolean.class);
Counter<Level> counter = new Counter<>();
TreeSet<PathHeader> pathHeaders = new TreeSet<>();
for (String path : file) {
if (path.endsWith("/alias") || path.startsWith("//ldml/identity")) {
continue;
}
if (pathRegex != null && !pathRegex.reset(path).matches()) {
continue;
}
String locale = file.getSourceLocaleID(path, status);
if (!path.equals(status.pathWhereFound)) {
// path is aliased, skip
continue;
}
if (config.getSupplementalDataInfo().isDeprecated(DtdType.ldml, path)) {
continue;
}
PathHeader ph = phf.fromPath(path);
CLDRLocale loc = CLDRLocale.getInstance(fileLocale);
int requiredVotes = sdi.getRequiredVotes(loc, ph);
Level level =
config.getSupplementalDataInfo()
.getCoverageLevel(
path, fileLocale); // isMain ? ... : Level.UNDETERMINED;
// if (level.compareTo(Level.MODERN) > 0) {
// continue;
// }
levelToPathHeaders.put(level, ph, true);
pathHeaders.add(ph);
SurveyToolStatus stStatus = ph.getSurveyToolStatus();
String starred = pathStarrer.set(path);
String attributes = Joiner.on("|").join(pathStarrer.getAttributes());
levelToData.put(
level,
starred + "|" + stStatus + "|" + requiredVotes,
attributes,
Boolean.TRUE);
counter.add(level, 1);
}
if (verbose) {
// for (Level level : Level.values()) {
// System.out.println(counter.get(level) + "\t" + level);
// }
for (Entry<Level, Map<String, Map<String, Boolean>>> entry : levelToData) {
Level level = entry.getKey();
for (Entry<String, Map<String, Boolean>> entry2 : entry.getValue().entrySet()) {
String[] starredStatus = entry2.getKey().split("\\|");
Map<String, Boolean> attributes = entry2.getValue();
int count = attributes.size();
if (count < 1) {
count = 1;
}
if (maxAttributes > 0) {
List<String> sampleList = new ArrayList<>(attributes.keySet());
String suffix = "";
if (sampleList.size() > maxAttributes) {
sampleList = sampleList.subList(0, maxAttributes);
suffix = ", …";
}
List<String> engList =
sampleList.stream()
.map(x -> x + "[" + getEnglish(starredStatus[0], x) + "]")
.collect(Collectors.toList());
String samples = Joiner.on(", ").join(engList) + suffix;
System.out.println(
count
+ "\t"
+ level
+ "\t"
+ starredStatus[0]
+ "\t"
+ starredStatus[1]
+ "\t"
+ starredStatus[2]
+ "\t"
+ samples);
}
}
}
}
// for (Entry<Level, Map<PathHeader, Boolean>> levelAndPathHeader :
// levelToPathHeaders) {
// Level level = levelAndPathHeader.getKey();
// Map<PathHeader, Boolean> pathHeaders2 = levelAndPathHeader.getValue();
// Builder<String, String> pageCount = ImmutableMultimap.builder();
// for (PathHeader ph : pathHeaders2.keySet()) {
// pageCount.put(ph.getSectionId() + "\t" + ph.getPageId(), ph.getHeader() +
// " : " + ph.getCode());
// }
// showResults("header+code count", level, pageCount.build());
// }
}
static final Splitter SPLIT_SLASH = Splitter.on('|');
static final Splitter SPLIT_STAR = Splitter.on('*');
static String getEnglish(String path, String value) {
Iterator<String> pathParts = SPLIT_STAR.split(path).iterator();
Iterator<String> values = SPLIT_SLASH.split(value).iterator();
StringBuilder pathBuffer = new StringBuilder();
boolean added = false;
do {
added = false;
if (pathParts.hasNext()) {
pathBuffer.append(pathParts.next());
added = true;
}
if (values.hasNext()) {
pathBuffer.append(values.next());
added = true;
}
} while (added);
String result = ENGLISH.getStringValue(pathBuffer.toString());
if (result != null && result.length() > 15) {
result = result.substring(0, 15) + '…';
}
return result;
}
static class LanguageTagCollector {
private static final CLDRConfig CldrConfig = CLDRConfig.getInstance();
enum Source {
main,
canon,
supp,
seed,
exemplars,
keyboards,
alias
}
LanguageTagParser ltp = new LanguageTagParser();
LanguageTagCanonicalizer ltc = new LanguageTagCanonicalizer();
Relation<String, Source> languageTags =
Relation.of(new TreeMap(new LengthFirstComparator()), TreeSet.class);
final SupplementalDataInfo supp = CldrConfig.getSupplementalDataInfo();
final Map<String, R2<List<String>, String>> languageFix =
supp.getLocaleAliasInfo().get("language");
private void getLanguageTags() {
Map<String, String> likely = supp.getLikelySubtags();
for (Entry<String, String> entry : likely.entrySet()) {
addLanguage(entry.getKey(), Source.canon);
}
for (String entry : supp.getLanguagesForTerritoriesPopulationData()) {
addLanguage(entry, Source.supp);
}
for (String entry : supp.getLanguages()) {
addLanguage(entry, Source.supp);
}
for (String entry : supp.getBasicLanguageDataLanguages()) {
addLanguage(entry, Source.supp);
}
for (Entry<String, R2<List<String>, String>> entry : languageFix.entrySet()) {
final String lang = entry.getKey();
if (!lang.contains("_")) {
addLanguage(lang, Source.alias);
}
}
// just use filenames
File base = CldrConfig.getCldrBaseDirectory();
// System.out.println(base);
// just do main, exemplars/main, seed/main, keyboards/.*
addFiles(base, "common/main", Source.main);
addFiles(base, "exemplars/main", Source.exemplars);
addFiles(base, "seed/main", Source.seed);
addFiles(base, "keyboards", Source.keyboards);
Set<String> badLines = new LinkedHashSet();
for (Entry<String, Set<Source>> entry : languageTags.keyValuesSet()) {
final String written = entry.getKey();
final String name = getName(written);
Set<Source> source = entry.getValue();
if (source.contains(Source.alias) && source.size() > 1) {
badLines.add(
written
+ "\t"
+ name
+ "\t"
+ languageFix.get(written).get0()
+ "\t"
+ Joiner.on(" ").join(source));
source = Collections.singleton(Source.alias);
}
System.out.println(written + "\t" + name + "\t" + Joiner.on(" ").join(source));
}
for (String s : badLines) {
System.out.println("BAD:\t" + s);
}
}
public String getName(final String written) {
String result = CldrConfig.getEnglish().getName(written);
if (result.equals(written)) {
R2<List<String>, String> alias = languageFix.get(written);
if (alias != null) {
result = CldrConfig.getEnglish().getName(alias.get0().get(0));
}
}
return result;
}
private void addFiles(File base, String name, Source source) {
addFiles(new File(base, name), source);
}
private void addFiles(File base, Source source) {
if (!base.isDirectory()) {
return;
}
for (File file : base.listFiles()) {
if (file.isDirectory()) {
addFiles(file, source);
continue;
}
String fileName = file.getName();
if (!fileName.endsWith(".xml") || fileName.startsWith("_")) {
continue;
}
addLanguage(fileName.substring(0, fileName.length() - 4), source);
}
}
private void addLanguage(String key, Source source) {
if (key.startsWith(LocaleNames.UND) || key.startsWith(LocaleNames.ROOT)) {
languageTags.put(LocaleNames.UND, source);
return;
}
ltp.set(key);
languageTags.put(ltp.getLanguage(), source);
}
}
}