// blob: 2b74d69756899e1deebee8d2773f0f3dca11deae [file] [log] [blame]
package org.unicode.cldr.test;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.DraftStatus;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.LogicalGrouping;
import org.unicode.cldr.util.With;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMultiset;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multiset.Entry;
import com.google.common.collect.TreeMultiset;
import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
/**
 * CLDR check that hands every non-optional path of a top-level locale to a
 * {@link LogicalGroupChecker}. It also hosts a few static helpers: rendering invisible
 * characters, normalizing spaces, and a cheap character-count {@link Fingerprint}.
 *
 * <p>Remember to add this class to the list in CheckCLDR.getCheckAll.
 * To run just this test, on just locales starting with 'nl', use CheckCLDR with
 * -fnl.* -t.*LogicalGroupings.*
 */
public class CheckLogicalGroupings extends FactoryCheckCLDR {

    /**
     * Minimum draft status, currently {@link DraftStatus#contributed}. Restricting to contributed
     * or higher can help to reduce the error count when you have a lot of new data.
     *
     * <p>NOTE(review): the name is misspelled ("MIMIMUM"); it is left as is because the constant is
     * package-visible and not referenced inside this class, so it is presumably read elsewhere —
     * confirm before renaming.
     */
    static final DraftStatus MIMIMUM_DRAFT_STATUS = DraftStatus.contributed;

    /**
     * The FINAL_TESTING and VETTING phases. Not referenced inside this class; judging by the name,
     * these are presumably the phases in which findings are reported as errors — confirm against
     * the code that reads it.
     */
    static final Set<Phase> PHASES_CAUSE_ERROR = ImmutableSet.of(Phase.FINAL_TESTING, Phase.VETTING);

    /**
     * Transliterator that rewrites every character matched by the set in its rule (everything in
     * C, Z, whitespace or Default_Ignorable_Code_Point, except U+0020) using ICU's hex/perl
     * transform.
     */
    static final Transliterator SHOW_INVISIBLES = Transliterator.createFromRules(
        "show",
        "([[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]-[\\u0020]]) > &hex/perl($1);",
        Transliterator.FORWARD);

    /** Frozen map sending every character in [[:Z:][:S:][:whitespace:]] to a single space. */
    private static final UnicodeMap<String> OTHER_SPACES = new UnicodeMap<String>()
        .putAll(new UnicodeSet("[[:Z:][:S:][:whitespace:]]"), " ")
        .freeze();

    /** Fingerprints already built by {@link Fingerprint#make(String)}, keyed by source string. */
    private static final ConcurrentHashMap<String, Fingerprint> FINGERPRINT_CACHE = new ConcurrentHashMap<>();

    /**
     * Creates the check.
     *
     * @param factory passed straight through to the superclass constructor
     */
    public CheckLogicalGroupings(Factory factory) {
        super(factory);
    }

    /**
     * Delegates to the superclass, then marks the test as skipped unless the locale is at the
     * top level, i.e. its parent is either absent or "root". So root, fr, sr_Latn, ... are
     * tested, while fr_CA, sr_Latn_RS, etc. are skipped.
     *
     * <p>TODO: could simplify some of the code later, since non-topLevel locales are skipped.
     *
     * <p>NOTE: we could have a weaker test: skip if all of the items are either inherited, or
     * aliased *including votes for inherited (3 up arrows)*.
     *
     * @param cldrFileToCheck the file whose locale ID decides whether the test is skipped
     * @param options forwarded to the superclass
     * @param possibleErrors forwarded to the superclass
     * @return this check
     */
    @Override
    public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
        List<CheckStatus> possibleErrors) {
        super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
        final String parentLocale = LocaleIDParser.getParent(cldrFileToCheck.getLocaleID());
        // Skip exactly when there is a parent and that parent is something other than root.
        final boolean hasNonRootParent = parentLocale != null && !parentLocale.equals("root");
        setSkipTest(hasNonRootParent);
        return this;
    }

    /**
     * Runs a {@link LogicalGroupChecker} for the path, unless
     * {@link LogicalGrouping#isOptional} reports the path as optional for the current file.
     *
     * @param path the path being checked
     * @param fullPath not used by this method
     * @param value the value at the path, handed to the checker
     * @param options not used by this method
     * @param result list handed to the checker
     * @return this check
     */
    @Override
    public CheckCLDR handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result) {
        final boolean optionalPath = LogicalGrouping.isOptional(getCldrFileToCheck(), path);
        if (!optionalPath) {
            final LogicalGroupChecker checker = new LogicalGroupChecker(this, path, value, result);
            checker.run();
        }
        return this;
    }

    /**
     * Applies {@link #SHOW_INVISIBLES} to the given text.
     *
     * @param value text to transform
     * @return the transformed text
     */
    public static String showInvisibles(String value) {
        return SHOW_INVISIBLES.transform(value);
    }

    /**
     * Converts a single code point to a string and applies {@link #SHOW_INVISIBLES} to it.
     *
     * @param codePoint the code point to render
     * @return the transformed text
     */
    public static String showInvisibles(int codePoint) {
        final String asString = With.fromCodePoint(codePoint);
        return showInvisibles(asString);
    }

    /**
     * Renders a multiset as {@code {a, b⨱2, ...}}: each distinct element is passed through
     * {@code showInvisibles} (as a code point when it is an {@code Integer}, otherwise via its
     * {@code toString()}), and elements occurring more than once get a '⨱' plus their count.
     *
     * @param codePointCounts the multiset to render
     * @return the rendered text, wrapped in braces
     */
    public static String showInvisibles(Multiset<?> codePointCounts) {
        final StringBuilder out = new StringBuilder();
        out.append('{');
        for (Entry<?> countedItem : codePointCounts.entrySet()) {
            // Anything beyond the opening brace means something was already written.
            if (out.length() > 1) {
                out.append(", ");
            }
            final Object item = countedItem.getElement();
            final String shown;
            if (item instanceof Integer) {
                final int codePoint = (Integer) item;
                shown = showInvisibles(codePoint);
            } else {
                shown = showInvisibles(item.toString());
            }
            out.append(shown);
            final int occurrences = countedItem.getCount();
            if (occurrences > 1) {
                out.append('⨱').append(occurrences);
            }
        }
        out.append('}');
        return out.toString();
    }

    /**
     * Applies the {@link #OTHER_SPACES} map to the given text.
     *
     * @param pathValue text to transform
     * @return the result of {@code OTHER_SPACES.transform(pathValue)}
     */
    public static String cleanSpaces(String pathValue) {
        return OTHER_SPACES.transform(pathValue);
    }

    /**
     * Use cheap distance metric for testing differences; just the number of characters of each
     * kind. Instances are obtained through {@link #make(String)}.
     */
    public static class Fingerprint {
        /** Immutable count of each code point of the source string, built in sorted order. */
        private final Multiset<Integer> codePointCounts;

        /** Counts the code points of {@code value}. */
        private Fingerprint(String value) {
            final Multiset<Integer> tally = TreeMultiset.create();
            for (int codePoint : With.codePointArray(value)) {
                tally.add(codePoint);
            }
            this.codePointCounts = ImmutableMultiset.copyOf(tally);
        }

        /**
         * Returns the largest pairwise distance among the given fingerprints, or 0 when there
         * are fewer than two of them.
         *
         * @param fingerprintsIn the fingerprints to compare; the set removes duplicates
         * @return the maximum of {@link #getDistanceTo} over all unordered pairs
         */
        public static int maxDistanceBetween(Set<Fingerprint> fingerprintsIn) {
            final List<Fingerprint> ordered = ImmutableList.copyOf(fingerprintsIn);
            int widest = 0;
            // Each unordered pair is visited once: (n x (n-1))/2 comparisons. Quadratic, but
            // most sets are small.
            for (int i = 1; i < ordered.size(); ++i) {
                final Fingerprint later = ordered.get(i);
                for (int j = 0; j < i; ++j) {
                    final Fingerprint earlier = ordered.get(j);
                    widest = Math.max(widest, later.getDistanceTo(earlier));
                }
            }
            return widest;
        }

        /**
         * Sums, over every code point present in either fingerprint, the absolute difference
         * between the two counts.
         *
         * @param that the fingerprint to compare against
         * @return the summed count differences
         */
        public int getDistanceTo(Fingerprint that) {
            final Set<Integer> eitherSide = new HashSet<>(codePointCounts.elementSet());
            eitherSide.addAll(that.codePointCounts.elementSet());
            int total = 0;
            for (Integer codePoint : eitherSide) {
                final int mine = codePointCounts.count(codePoint);
                final int theirs = that.codePointCounts.count(codePoint);
                total += Math.abs(mine - theirs);
            }
            return total;
        }

        /**
         * @return the size of the underlying multiset, which counts every occurrence
         */
        public int size() {
            return codePointCounts.size();
        }

        /**
         * Returns the fingerprint for {@code value}, building and caching it on first request.
         *
         * @param value the string to fingerprint
         * @return the cached or newly created fingerprint
         */
        public static Fingerprint make(String value) {
            return FINGERPRINT_CACHE.computeIfAbsent(value, source -> new Fingerprint(source));
        }

        /** Renders the code point counts via {@code showInvisibles(Multiset)}. */
        @Override
        public String toString() {
            return CheckLogicalGroupings.showInvisibles(codePointCounts);
        }
    }
}