| package org.unicode.cldr.test; |
| |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UCharacterDirection; |
| import com.ibm.icu.lang.UProperty; |
| import com.ibm.icu.lang.UScript; |
| import com.ibm.icu.text.Collator; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.text.UnicodeSetIterator; |
| import com.ibm.icu.util.ULocale; |
| import java.util.BitSet; |
| import java.util.Comparator; |
| import java.util.List; |
| import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype; |
| import org.unicode.cldr.util.CLDRConfig; |
| import org.unicode.cldr.util.CLDRFile; |
| import org.unicode.cldr.util.ComparatorUtilities; |
| import org.unicode.cldr.util.Factory; |
| import org.unicode.cldr.util.SimpleUnicodeSetFormatter; |
| import org.unicode.cldr.util.SupplementalDataInfo; |
| import org.unicode.cldr.util.UnicodeSetPrettyPrinter; |
| import org.unicode.cldr.util.XPathParts; |
| |
| public class CheckExemplars extends FactoryCheckCLDR { |
| |
| public static final boolean USE_PUNCTUATION = false; |
| private static final boolean SUPPRESS_AUX_EMPTY_CHECK = true; |
| private static final String[] QUOTE_ELEMENTS = { |
| "quotationStart", "quotationEnd", |
| "alternateQuotationStart", "alternateQuotationEnd" |
| }; |
| static final SupplementalDataInfo SUP = CLDRConfig.getInstance().getSupplementalDataInfo(); |
| |
| Collator col; |
| boolean isRoot; |
| SimpleUnicodeSetFormatter displayFormatter; |
| UnicodeSetPrettyPrinter rawFormatter; |
| |
| static final UnicodeSet HangulSyllables = |
| new UnicodeSet("[[:Hangul_Syllable_Type=LVT:][:Hangul_Syllable_Type=LV:]]").freeze(); |
| |
/**
 * Frozen set that is subtracted when {@link #UAllowedInExemplars} and {@link #ALLOWED_IN_AUX} are
 * built (members listed in {@code SPECIAL_ALLOW} are added back afterwards). Which pattern is used
 * depends on {@link #USE_PUNCTUATION}.
 */
public static final UnicodeSet AlwaysOK;

static {
    final String alwaysOkPattern;
    if (USE_PUNCTUATION) {
        alwaysOkPattern = "[\\u0020\\u00A0]";
    } else {
        alwaysOkPattern =
                "[[[:Nd:][:script=common:][:script=inherited:]-[:Default_Ignorable_Code_Point:]-[:C:] - [_]] [\u05BE \u05F3 \u066A-\u066C]"
                        + "[[؉][་ །༌][ཱ]{য়}য়]"
                        // TODO Fix this Hack
                        + "-[❮❯]]"; // [\\u200c-\\u200f]
        // [:script=common:][:script=inherited:]
    }
    AlwaysOK = new UnicodeSet(alwaysOkPattern);
    AlwaysOK.freeze();
}
| // TODO Fix some of these characters |
| private static final UnicodeSet SPECIAL_ALLOW = |
| new UnicodeSet( |
| "[\u061C\\u200E\\u200F\\u200c\\u200d" |
| + "[\u064B\u064E-\u0651\u0670][:Nd:][\u0951\u0952][\u064B-\u0652\u0654-\u0657\u0670][\u0A66-\u0A6F][\u0ED0-\u0ED9][\u064B-\u0652][\\u02BB\\u02BC][\u0CE6-\u0CEF][\u0966-\u096F]" |
| + "[:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] ]" // restore |
| // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] |
| ) |
| .freeze(); // add RLM, LRM [\u200C\u200D] |
| |
| public static final UnicodeSet UAllowedInExemplars = |
| new UnicodeSet( |
| "[[:assigned:]-[:Z:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] |
| .removeAll(AlwaysOK) // this will remove some |
| // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we |
| // restore them |
| // in SPECIAL_ALLOW |
| .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D] |
| .freeze(); |
| |
| public static final UnicodeSet UAllowedInNumbers = |
| new UnicodeSet( |
| "[\u00A0\u202F[:N:][:P:][:Sm:][:Letter_Number:][:Numeric_Type=Numeric:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] |
| .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D] |
| .freeze(); |
| |
| public static final UnicodeSet AllowedInExemplars = |
| new UnicodeSet(UAllowedInExemplars) |
| .removeAll(new UnicodeSet("[[:Uppercase:]-[\u0130]]")) |
| .freeze(); |
| |
| public static final UnicodeSet ALLOWED_IN_PUNCTUATION = |
| new UnicodeSet("[[:P:][:S:]-[:Sc:]]").freeze(); |
| |
| public static final UnicodeSet ALLOWED_IN_AUX = |
| new UnicodeSet(AllowedInExemplars) |
| .addAll(ALLOWED_IN_PUNCTUATION) |
| .removeAll(AlwaysOK) // this will remove some |
| // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we |
| // restore them |
| // in SPECIAL_ALLOW |
| .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D] |
| .freeze(); |
| |
| public enum ExemplarType { |
| main(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", true), |
| punctuation(ALLOWED_IN_PUNCTUATION, "punctuation", false), |
| auxiliary(ALLOWED_IN_AUX, "(specific-script - uppercase - invisibles + \u0130)", true), |
| index(UAllowedInExemplars, "(specific-script - invisibles)", false), |
| numbers(UAllowedInNumbers, "(specific-script - invisibles)", false), |
| // currencySymbol(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", |
| // false) |
| ; |
| |
| public final UnicodeSet allowed; |
| public final UnicodeSet toRemove; |
| public final String message; |
| public final boolean convertUppercase; |
| |
| ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase) { |
| if (!allowed.isFrozen()) { |
| throw new IllegalArgumentException("Internal Error"); |
| } |
| this.allowed = allowed; |
| this.message = message; |
| this.toRemove = new UnicodeSet(allowed).complement().freeze(); |
| this.convertUppercase = convertUppercase; |
| } |
| } |
| |
/**
 * Creates the check, handing the factory to the {@code FactoryCheckCLDR} superclass.
 *
 * @param factory passed unchanged to the superclass constructor
 */
public CheckExemplars(Factory factory) {
    super(factory);
}
| |
| // Allowed[:script=common:][:script=inherited:][:alphabetic=false:] |
| |
| @Override |
| public CheckCLDR handleSetCldrFileToCheck( |
| CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors) { |
| if (cldrFileToCheck == null) return this; |
| super.handleSetCldrFileToCheck(cldrFileToCheck, options, possibleErrors); |
| String locale = cldrFileToCheck.getLocaleID(); |
| isRoot = cldrFileToCheck.getLocaleID().equals("root"); |
| col = ComparatorUtilities.getIcuCollator(new ULocale(locale), Collator.IDENTICAL); |
| Collator spaceCol = |
| ComparatorUtilities.getIcuCollator(new ULocale(locale), Collator.PRIMARY); |
| displayFormatter = new SimpleUnicodeSetFormatter((Comparator) col); |
| rawFormatter = UnicodeSetPrettyPrinter.from((Comparator) col, (Comparator) spaceCol); |
| |
| // check for auxiliary anyway |
| if (!SUPPRESS_AUX_EMPTY_CHECK) { |
| UnicodeSet auxiliarySet = |
| getResolvedCldrFileToCheck() |
| .getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING); |
| |
| if (auxiliarySet == null) { |
| possibleErrors.add( |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.warningType) |
| .setSubtype(Subtype.missingAuxiliaryExemplars) |
| .setMessage( |
| "Most languages allow <i>some<i> auxiliary characters, so review this.")); |
| } |
| } |
| return this; |
| } |
| |
| @Override |
| public CheckCLDR handleCheck( |
| String path, String fullPath, String value, Options options, List<CheckStatus> result) { |
| if (fullPath == null) return this; // skip paths that we don't have |
| if (path.indexOf("/exemplarCharacters") < 0) { |
| if (path.contains("parseLenient")) { |
| checkParse(path, fullPath, value, options, result); |
| } |
| return this; |
| } |
| if (!accept(result)) return this; |
| XPathParts oparts = XPathParts.getFrozenInstance(path); |
| final String exemplarString = oparts.findAttributeValue("exemplarCharacters", "type"); |
| ExemplarType type = |
| exemplarString == null ? ExemplarType.main : ExemplarType.valueOf(exemplarString); |
| checkExemplar(value, result, type); |
| |
| // check relation to auxiliary set |
| try { |
| UnicodeSet mainSet = |
| getResolvedCldrFileToCheck().getExemplarSet("", CLDRFile.WinningChoice.WINNING); |
| if (type == ExemplarType.auxiliary) { |
| UnicodeSet auxiliarySet = SimpleUnicodeSetFormatter.parseLenient(value); |
| |
| UnicodeSet combined = new UnicodeSet(mainSet).addAll(auxiliarySet); |
| checkMixedScripts("main+auxiliary", combined, result); |
| |
| if (auxiliarySet.containsSome(mainSet)) { |
| UnicodeSet overlap = |
| new UnicodeSet(mainSet) |
| .retainAll(auxiliarySet) |
| .removeAll(HangulSyllables); |
| if (overlap.size() != 0) { |
| String fixedExemplar1 = rawFormatter.format(overlap); |
| result.add( |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.errorType) |
| .setSubtype(Subtype.auxiliaryExemplarsOverlap) |
| .setMessage( |
| "Auxiliary characters also exist in main: \u200E{0}\u200E", |
| new Object[] {fixedExemplar1})); |
| } |
| } |
| } else if (type == ExemplarType.punctuation) { |
| // Check that the punctuation exemplar characters include quotation marks. |
| UnicodeSet punctuationSet = SimpleUnicodeSetFormatter.parseLenient(value); |
| UnicodeSet quoteSet = new UnicodeSet(); |
| for (String element : QUOTE_ELEMENTS) { |
| quoteSet.add( |
| getResolvedCldrFileToCheck() |
| .getWinningValue("//ldml/delimiters/" + element)); |
| } |
| if (!punctuationSet.containsAll(quoteSet)) { |
| quoteSet.removeAll(punctuationSet); |
| // go ahead and list the characters separately, with space between, for clarity. |
| StringBuilder characters = new StringBuilder(); |
| for (String item : quoteSet) { |
| if (characters.length() != 0) { |
| characters.append(" "); |
| } |
| characters.append(item); |
| } |
| // String characters = quoteSet.toPattern(false); |
| CheckStatus message = |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.warningType) |
| .setSubtype(Subtype.missingPunctuationCharacters) |
| .setMessage( |
| "Punctuation exemplar characters are missing quotation marks for this locale: {0}", |
| characters); |
| result.add(message); |
| } |
| } else if (type == ExemplarType.index) { |
| // Check that the index exemplar characters are in case-completed union of main and |
| // auxiliary exemplars |
| UnicodeSet auxiliarySet = |
| getResolvedCldrFileToCheck() |
| .getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING); |
| if (auxiliarySet == null) { |
| auxiliarySet = new UnicodeSet(); |
| } |
| UnicodeSet mainAndAuxAllCase = |
| new UnicodeSet(mainSet) |
| .addAll(auxiliarySet) |
| .closeOver(UnicodeSet.ADD_CASE_MAPPINGS); |
| UnicodeSet indexBadChars = |
| SimpleUnicodeSetFormatter.parseLenient(value).removeAll(mainAndAuxAllCase); |
| |
| if (!indexBadChars.isEmpty()) { |
| CheckStatus message = |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.warningType) |
| .setSubtype(Subtype.charactersNotInMainOrAuxiliaryExemplars) |
| .setMessage( |
| "Index exemplars include characters not in main or auxiliary exemplars: {0}", |
| indexBadChars.toPattern(false)); |
| result.add(message); |
| } |
| } |
| |
| // check for consistency with RTL |
| |
| Boolean localeIsRTL = false; |
| String charOrientation = |
| getResolvedCldrFileToCheck() |
| .getStringValue("//ldml/layout/orientation/characterOrder"); |
| if (charOrientation.equals("right-to-left")) { |
| localeIsRTL = true; |
| } |
| |
| UnicodeSetIterator mi = new UnicodeSetIterator(mainSet); |
| while (mi.next()) { |
| if (mi.codepoint != UnicodeSetIterator.IS_STRING |
| && (UCharacter.getDirection(mi.codepoint) |
| == UCharacterDirection.RIGHT_TO_LEFT |
| || UCharacter.getDirection(mi.codepoint) |
| == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) |
| && !localeIsRTL) { |
| result.add( |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.errorType) |
| .setSubtype(Subtype.orientationDisagreesWithExemplars) |
| .setMessage( |
| "Main exemplar set contains RTL characters, but orientation of this locale is not RTL.")); |
| break; |
| } |
| } |
| |
| } catch (Exception e) { |
| } // if these didn't parse, checkExemplar will be called anyway at some point |
| return this; |
| } |
| |
| private void checkParse( |
| String path, String fullPath, String value, Options options, List<CheckStatus> result) { |
| try { |
| XPathParts oparts = XPathParts.getFrozenInstance(path); |
| // only thing we do is make sure that the sample is in the value |
| UnicodeSet us = SimpleUnicodeSetFormatter.parseLenient(value); |
| String sampleValue = oparts.getAttributeValue(-1, "sample"); |
| if (!us.contains(sampleValue)) { |
| CheckStatus message = |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.errorType) |
| .setSubtype(Subtype.badParseLenient) |
| .setMessage( |
| "ParseLenient sample not in value: {0} ∌ {1}", |
| us, sampleValue); |
| result.add(message); |
| } |
| } catch (IllegalArgumentException e) { |
| /* |
| * new UnicodeSet(value) throws IllegalArgumentException if, for example, value is null or value = "?". |
| * This can happen during cldr-unittest TestAll. |
| * path = //ldml/characters/parseLenients[@scope="general"][@level="lenient"]/parseLenient[@sample="’"] |
| * or |
| * path = //ldml/characters/parseLenients[@scope="date"][@level="lenient"]/parseLenient[@sample="-"] |
| */ |
| CheckStatus message = |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.errorType) |
| .setSubtype(Subtype.badParseLenient) |
| .setMessage( |
| e.toString() |
| + (e.getMessage() == null |
| ? "" |
| : ": " + e.getMessage())); |
| result.add(message); |
| } |
| } |
| |
| static final BitSet Japn = new BitSet(); |
| static final BitSet Kore = new BitSet(); |
| |
| static { |
| Japn.set(UScript.HAN); |
| Japn.set(UScript.HIRAGANA); |
| Japn.set(UScript.KATAKANA); |
| Kore.set(UScript.HAN); |
| Kore.set(UScript.HANGUL); |
| } |
| |
| private void checkMixedScripts(String title, UnicodeSet set, List<CheckStatus> result) { |
| BitSet s = new BitSet(); |
| for (String item : set) { |
| int script = UScript.getScript(item.codePointAt(0)); |
| if (script != UScript.COMMON && script != UScript.INHERITED) { |
| s.set(script); |
| } |
| } |
| final int cardinality = s.cardinality(); |
| if (cardinality < 2) { |
| return; |
| } |
| if (cardinality == 2 && title.equals("currencySymbol") && s.get(UScript.LATIN)) { |
| return; // allow 2 scripts in exemplars for currencies. |
| } |
| // allowable combinations |
| if (s.equals(Japn) || s.equals(Kore)) { |
| return; |
| } |
| StringBuilder scripts = new StringBuilder(); |
| for (int i = s.nextSetBit(0); i >= 0; i = s.nextSetBit(i + 1)) { |
| if (scripts.length() != 0) { |
| scripts.append(", "); |
| } |
| scripts.append(UScript.getName(i)); |
| UnicodeSet inSet = |
| new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, i).retainAll(set); |
| int count = 0; |
| scripts.append(" ("); |
| for (String cp : inSet) { |
| if (count != 0) { |
| scripts.append(","); |
| } |
| scripts.append(cp); |
| count++; |
| if (count > 3) { |
| scripts.append('\u2026'); |
| break; |
| } |
| } |
| scripts.append(")"); |
| } |
| result.add( |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.errorType) |
| .setSubtype(Subtype.illegalExemplarSet) |
| .setMessage( |
| "{0} exemplars contain multiple scripts: {1}", |
| new Object[] {title, scripts})); |
| return; |
| } |
| |
| private void checkExemplar(String v, List<CheckStatus> result, ExemplarType exemplarType) { |
| if (v == null) return; |
| final UnicodeSet exemplar1; |
| try { |
| exemplar1 = SimpleUnicodeSetFormatter.parseLenient(v).freeze(); |
| } catch (Exception e) { |
| result.add( |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.errorType) |
| .setSubtype(Subtype.illegalExemplarSet) |
| .setMessage(e.getMessage())); |
| return; |
| } |
| |
| // check for mixed scripts |
| |
| checkMixedScripts(exemplarType.toString(), exemplar1, result); |
| |
| // check that the formatting is correct |
| |
| String fixedExemplar1 = rawFormatter.format(exemplar1); |
| UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar1); |
| if (!doubleCheck.equals(exemplar1)) { |
| result.add( |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.errorType) |
| .setSubtype(Subtype.internalUnicodeSetFormattingError) |
| .setMessage( |
| "Internal Error: formatting not working for {0}", |
| new Object[] {exemplar1})); |
| } |
| // else if (!v.equals(fixedExemplar1)) { |
| // result.add(new CheckStatus().setCause(this).setType(CheckStatus.warningType) |
| // .setMessage("Better formatting would be \u200E{0}\u200E", new Object[]{fixedExemplar1})); |
| // } |
| |
| // now check that only allowed characters are in the set |
| |
| if (!exemplarType.allowed.containsAll(exemplar1)) { |
| UnicodeSet remainder0 = new UnicodeSet(exemplar1).removeAll(exemplarType.allowed); |
| |
| // we do allow for punctuation & combining marks in strings |
| UnicodeSet remainder = new UnicodeSet(); |
| for (String s : remainder0) { |
| if (Character.codePointCount(s, 0, s.length()) == 1) { |
| remainder.add(s); |
| } else { |
| // just check normalization |
| } |
| } |
| |
| // after a first check, we check again in case we flattened |
| |
| if (remainder.size() != 0) { |
| fixedExemplar1 = displayFormatter.format(exemplar1); |
| result.add( |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.errorType) |
| .setSubtype(Subtype.illegalCharactersInExemplars) |
| .setMessage( |
| "Should be limited to " |
| + exemplarType.message |
| + "; thus not contain: \u200E{0}\u200E", |
| new Object[] {remainder})); |
| } |
| } |
| |
| // now check for empty |
| |
| if (!isRoot && exemplar1.size() == 0) { |
| switch (exemplarType) { |
| // case currencySymbol: // ok if empty |
| // break; |
| case auxiliary: |
| result.add( |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.warningType) |
| .setSubtype(Subtype.missingAuxiliaryExemplars) |
| .setMessage( |
| "Most languages allow <i>some<i> auxiliary characters, so review this.")); |
| break; |
| case index: |
| case punctuation: |
| case main: |
| result.add( |
| new CheckStatus() |
| .setCause(this) |
| .setMainType(CheckStatus.errorType) |
| .setSubtype(Subtype.missingMainExemplars) |
| .setMessage( |
| "Exemplar set (" |
| + exemplarType |
| + ") must not be empty -- that would imply that this language uses no " |
| + (exemplarType == ExemplarType.punctuation |
| ? "punctuation" |
| : "letters") |
| + "!")); |
| break; |
| } |
| } |
| } |
| } |