// Source: tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckExemplars.java (platform/external/cldr, Git at Google)

 package org.unicode.cldr.test;

 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UCharacterDirection;
 import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.lang.UScript;
 import com.ibm.icu.text.Collator;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UnicodeSetIterator;
 import com.ibm.icu.util.ULocale;
 import java.util.BitSet;
 import java.util.Comparator;
 import java.util.List;
 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
 import org.unicode.cldr.util.CLDRConfig;
 import org.unicode.cldr.util.CLDRFile;
 import org.unicode.cldr.util.ComparatorUtilities;
 import org.unicode.cldr.util.Factory;
 import org.unicode.cldr.util.SimpleUnicodeSetFormatter;
 import org.unicode.cldr.util.SupplementalDataInfo;
 import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
 import org.unicode.cldr.util.XPathParts;

 public class CheckExemplars extends FactoryCheckCLDR {

     /** Selects which pattern the static initializer uses to build {@link #AlwaysOK}. */
     public static final boolean USE_PUNCTUATION = false;
     // While true, handleSetCldrFileToCheck does not warn about a missing auxiliary set.
     private static final boolean SUPPRESS_AUX_EMPTY_CHECK = true;
     // Element names appended to "//ldml/delimiters/" by handleCheck to look up the locale's
     // quotation marks.
     private static final String[] QUOTE_ELEMENTS = {
         "quotationStart", "quotationEnd",
         "alternateQuotationStart", "alternateQuotationEnd"
     };
     // Supplemental data obtained from CLDRConfig; not referenced by the rest of this class.
     static final SupplementalDataInfo SUP = CLDRConfig.getInstance().getSupplementalDataInfo();

     // Per-locale state; all four fields are assigned by handleSetCldrFileToCheck.
     Collator col; // collator requested with Collator.IDENTICAL for the locale being checked
     boolean isRoot; // true when the locale ID equals "root"
     SimpleUnicodeSetFormatter displayFormatter;
     UnicodeSetPrettyPrinter rawFormatter;

     // Hangul syllables of type LV and LVT; handleCheck removes these from the main/auxiliary
     // overlap before reporting an error.
     static final UnicodeSet HangulSyllables =
             new UnicodeSet("[[:Hangul_Syllable_Type=LVT:][:Hangul_Syllable_Type=LV:]]").freeze();

     /**
      * Characters that are removed from the allowed exemplar sets declared below
      * (UAllowedInExemplars and ALLOWED_IN_AUX). Assigned exactly once by the static initializer
      * that follows and then frozen.
      */
     public static final UnicodeSet AlwaysOK;

     static {
         if (USE_PUNCTUATION) {
             // Only SPACE and NO-BREAK SPACE.
             AlwaysOK = new UnicodeSet("[\\u0020\\u00A0]");
         } else {
             // Decimal digits plus Common/Inherited-script characters, less default-ignorable
             // code points, [:C:] and '_'; then a few hard-coded additions; finally less the two
             // ornamental angle brackets. NOTE: the literals contain invisible characters, so
             // edit them only with an editor that shows such characters.
             AlwaysOK =
                     new UnicodeSet(
                             "[[[:Nd:][:script=common:][:script=inherited:]-[:Default_Ignorable_Code_Point:]-[:C:] - [_]] [\u05BE \u05F3 \u066A-\u066C]"
                                     + "[[؉][་ །༌][ཱ]‎‎{য়}য়]"
                                     + // TODO Fix this Hack
                                     "-[❮❯]]"); // [\\u200c-\\u200f]
             // [:script=common:][:script=inherited:]
         }
         AlwaysOK.freeze();
     }
     // TODO Fix some of these characters
     // Characters that are added back into UAllowedInExemplars, UAllowedInNumbers and
     // ALLOWED_IN_AUX: bidi/joiner controls (U+061C, U+200E, U+200F, U+200C, U+200D), several
     // Arabic and Devanagari marks, digits, U+02BB/U+02BC, and the Katakana / ALetter / MidLetter
     // word-break classes. NOTE: the literals contain invisible characters between the bracketed
     // groups, so edit them only with an editor that shows such characters.
     private static final UnicodeSet SPECIAL_ALLOW =
             new UnicodeSet(
                             "[\u061C\\u200E\\u200F\\u200c\\u200d"
                                     + "‎‎‎[\u064B\u064E-\u0651\u0670]‎[:Nd:]‎[\u0951\u0952]‎[\u064B-\u0652\u0654-\u0657\u0670]‎[\u0A66-\u0A6F][\u0ED0-\u0ED9][\u064B-\u0652]‎[\\u02BB\\u02BC][\u0CE6-\u0CEF]‎‎[\u0966-\u096F]"
                                     + "‎‎‎[:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] ]" // restore
                             // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
                             )
                     .freeze(); // includes RLM, LRM, ZWNJ and ZWJ

     /**
      * Case-insensitive base set: every assigned code point except separators ([:Z:]), minus
      * {@link #AlwaysOK}, plus SPECIAL_ALLOW. Used as-is for the {@code index} exemplar type.
      */
     public static final UnicodeSet UAllowedInExemplars =
             new UnicodeSet(
                             "[[:assigned:]-[:Z:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
                     .removeAll(AlwaysOK) // this will remove some
                     // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we
                     // restore them
                     // in SPECIAL_ALLOW
                     .addAll(SPECIAL_ALLOW) // re-adds RLM, LRM, ZWNJ, ZWJ among others
                     .freeze();

     /**
      * Allowed content of the {@code numbers} exemplar type: NBSP, NARROW NBSP, numbers,
      * punctuation, math symbols and numeric characters, plus SPECIAL_ALLOW.
      */
     public static final UnicodeSet UAllowedInNumbers =
             new UnicodeSet(
                             "[\u00A0\u202F[:N:][:P:][:Sm:][:Letter_Number:][:Numeric_Type=Numeric:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
                     .addAll(SPECIAL_ALLOW) // re-adds RLM, LRM, ZWNJ, ZWJ among others
                     .freeze();

     /**
      * {@link #UAllowedInExemplars} without uppercase characters, except that U+0130 (capital I
      * with dot above) stays allowed. Used for the {@code main} exemplar type.
      */
     public static final UnicodeSet AllowedInExemplars =
             new UnicodeSet(UAllowedInExemplars)
                     .removeAll(new UnicodeSet("[[:Uppercase:]-[\u0130]]"))
                     .freeze();

     /** Punctuation and symbols, excluding currency symbols ([:Sc:]). */
     public static final UnicodeSet ALLOWED_IN_PUNCTUATION =
             new UnicodeSet("[[:P:][:S:]-[:Sc:]]").freeze();

     /**
      * Allowed content of the {@code auxiliary} exemplar type: {@link #AllowedInExemplars} plus
      * {@link #ALLOWED_IN_PUNCTUATION}, minus {@link #AlwaysOK}, plus SPECIAL_ALLOW. The order of
      * the remove/add steps matters because the sets overlap.
      */
     public static final UnicodeSet ALLOWED_IN_AUX =
             new UnicodeSet(AllowedInExemplars)
                     .addAll(ALLOWED_IN_PUNCTUATION)
                     .removeAll(AlwaysOK) // this will remove some
                     // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we
                     // restore them
                     // in SPECIAL_ALLOW
                     .addAll(SPECIAL_ALLOW) // re-adds RLM, LRM, ZWNJ, ZWJ among others
                     .freeze();

     /**
      * The exemplar-set kinds this check knows about. Each constant carries the frozen set of
      * characters permitted for that kind, the complement of that set, a short description used
      * in error messages, and a flag stored as {@code convertUppercase}.
      */
     public enum ExemplarType {
         main(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", true),
         punctuation(ALLOWED_IN_PUNCTUATION, "punctuation", false),
         auxiliary(ALLOWED_IN_AUX, "(specific-script - uppercase - invisibles + \u0130)", true),
         index(UAllowedInExemplars, "(specific-script - invisibles)", false),
         numbers(UAllowedInNumbers, "(specific-script - invisibles)", false);
         // Disabled constant, kept for reference:
         // currencySymbol(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)",
         // false)

         /** Characters permitted in this kind of exemplar set (frozen). */
         public final UnicodeSet allowed;
         /** Everything that is not in {@link #allowed} (frozen). */
         public final UnicodeSet toRemove;
         /** Human-readable description of {@link #allowed}. */
         public final String message;
         /** Flag supplied by the constant's declaration; not read inside this enum. */
         public final boolean convertUppercase;

         /**
          * @param permitted frozen set of allowed characters
          * @param description text describing {@code permitted}
          * @param foldUppercase value stored in {@link #convertUppercase}
          * @throws IllegalArgumentException if {@code permitted} is not frozen
          */
         ExemplarType(UnicodeSet permitted, String description, boolean foldUppercase) {
             if (!permitted.isFrozen()) {
                 throw new IllegalArgumentException("Internal Error");
             }
             // Build the complement on a private copy so the shared frozen set is untouched.
             UnicodeSet inverse = new UnicodeSet(permitted);
             inverse.complement();

             allowed = permitted;
             toRemove = inverse.freeze();
             message = description;
             convertUppercase = foldUppercase;
         }
     }

     /**
      * Creates the check.
      *
      * @param factory handed unchanged to the FactoryCheckCLDR constructor
      */
     public CheckExemplars(Factory factory) {
         super(factory);
     }

     // Allowed[:script=common:][:script=inherited:][:alphabetic=false:]

     /**
      * Prepares this check for a new file: records whether the locale is root and builds the
      * locale's collators and the two UnicodeSet formatters used by the other methods.
      *
      * @param cldrFileToCheck file about to be checked; when null nothing is done
      * @param options passed through to the superclass
      * @param possibleErrors receives file-level statuses (only the auxiliary warning, and only
      *     when SUPPRESS_AUX_EMPTY_CHECK is false)
      * @return this check
      */
     @Override
     public CheckCLDR handleSetCldrFileToCheck(
             CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors) {
         if (cldrFileToCheck == null) {
             return this;
         }
         super.handleSetCldrFileToCheck(cldrFileToCheck, options, possibleErrors);

         // Fetch the locale ID and build the ULocale once; both were previously computed twice.
         final String locale = cldrFileToCheck.getLocaleID();
         final ULocale uLocale = new ULocale(locale);
         isRoot = "root".equals(locale);
         col = ComparatorUtilities.getIcuCollator(uLocale, Collator.IDENTICAL);
         Collator spaceCol = ComparatorUtilities.getIcuCollator(uLocale, Collator.PRIMARY);
         // The raw Comparator casts are kept as in the original call sites.
         displayFormatter = new SimpleUnicodeSetFormatter((Comparator) col);
         rawFormatter = UnicodeSetPrettyPrinter.from((Comparator) col, (Comparator) spaceCol);

         // check for auxiliary anyway
         if (!SUPPRESS_AUX_EMPTY_CHECK) {
             UnicodeSet auxiliarySet =
                     getResolvedCldrFileToCheck()
                             .getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING);

             if (auxiliarySet == null) {
                 // The closing tag was previously written as "<i>", leaving the markup unbalanced.
                 possibleErrors.add(
                         new CheckStatus()
                                 .setCause(this)
                                 .setMainType(CheckStatus.warningType)
                                 .setSubtype(Subtype.missingAuxiliaryExemplars)
                                 .setMessage(
                                         "Most languages allow <i>some</i> auxiliary characters, so review this."));
             }
         }
         return this;
     }

     /**
      * Checks a single path/value pair.
      *
      * <p>Paths containing "parseLenient" are delegated to checkParse; paths that do not contain
      * "/exemplarCharacters" are otherwise ignored. For exemplar paths the value itself is
      * validated by checkExemplar, then compared with related data: auxiliary against main,
      * punctuation against the locale's quotation marks, index against main+auxiliary, and the
      * main set against the locale's character orientation.
      *
      * @param path distinguishing path of the item
      * @param fullPath full path; when null the item is skipped
      * @param value the exemplar (or parseLenient) value
      * @param options check options, forwarded to checkParse
      * @param result list that receives any errors or warnings
      * @return this check
      */
     @Override
     public CheckCLDR handleCheck(
             String path, String fullPath, String value, Options options, List<CheckStatus> result) {
         if (fullPath == null) {
             return this; // skip paths that we don't have
         }
         if (!path.contains("/exemplarCharacters")) {
             if (path.contains("parseLenient")) {
                 checkParse(path, fullPath, value, options, result);
             }
             return this;
         }
         if (!accept(result)) {
             return this;
         }
         XPathParts oparts = XPathParts.getFrozenInstance(path);
         final String exemplarString = oparts.findAttributeValue("exemplarCharacters", "type");
         // A missing type attribute denotes the main exemplar set.
         final ExemplarType type =
                 exemplarString == null ? ExemplarType.main : ExemplarType.valueOf(exemplarString);
         checkExemplar(value, result, type);

         try {
             final UnicodeSet mainSet =
                     getResolvedCldrFileToCheck().getExemplarSet("", CLDRFile.WinningChoice.WINNING);
             switch (type) {
                 case auxiliary:
                     checkAuxiliaryAgainstMain(value, mainSet, result);
                     break;
                 case punctuation:
                     checkPunctuationIncludesQuotes(value, result);
                     break;
                 case index:
                     checkIndexWithinMainAndAuxiliary(value, mainSet, result);
                     break;
                 default:
                     break; // main and numbers have no cross-set check
             }
             checkOrientationAgainstMain(mainSet, result);
         } catch (Exception ignored) {
             // Deliberately best-effort: if the value does not parse, checkExemplar (called
             // above) has already reported it, so there is nothing more to say here.
         }
         return this;
     }

     /** Reports mixed scripts across main+auxiliary and characters present in both sets. */
     private void checkAuxiliaryAgainstMain(
             String value, UnicodeSet mainSet, List<CheckStatus> result) {
         if (mainSet == null) {
             return; // nothing to compare with (previously an NPE swallowed by the caller)
         }
         UnicodeSet auxiliarySet = SimpleUnicodeSetFormatter.parseLenient(value);

         UnicodeSet combined = new UnicodeSet(mainSet).addAll(auxiliarySet);
         checkMixedScripts("main+auxiliary", combined, result);

         // Hangul syllables are exempt from the overlap error.
         UnicodeSet overlap =
                 new UnicodeSet(mainSet).retainAll(auxiliarySet).removeAll(HangulSyllables);
         if (overlap.isEmpty()) {
             return;
         }
         String fixedExemplar1 = rawFormatter.format(overlap);
         result.add(
                 new CheckStatus()
                         .setCause(this)
                         .setMainType(CheckStatus.errorType)
                         .setSubtype(Subtype.auxiliaryExemplarsOverlap)
                         .setMessage(
                                 "Auxiliary characters also exist in main: \u200E{0}\u200E",
                                 new Object[] {fixedExemplar1}));
     }

     /** Warns when the punctuation exemplars lack any of the locale's quotation marks. */
     private void checkPunctuationIncludesQuotes(String value, List<CheckStatus> result) {
         UnicodeSet punctuationSet = SimpleUnicodeSetFormatter.parseLenient(value);
         UnicodeSet missingQuotes = new UnicodeSet();
         for (String element : QUOTE_ELEMENTS) {
             String quote =
                     getResolvedCldrFileToCheck().getWinningValue("//ldml/delimiters/" + element);
             // A delimiter without a value used to raise an NPE that silently cancelled this
             // warning and the orientation check that follows it.
             if (quote != null && !quote.isEmpty()) {
                 missingQuotes.add(quote);
             }
         }
         missingQuotes.removeAll(punctuationSet);
         if (missingQuotes.isEmpty()) {
             return;
         }
         // go ahead and list the characters separately, with space between, for clarity.
         String characters = String.join(" ", missingQuotes);
         result.add(
                 new CheckStatus()
                         .setCause(this)
                         .setMainType(CheckStatus.warningType)
                         .setSubtype(Subtype.missingPunctuationCharacters)
                         .setMessage(
                                 "Punctuation exemplar characters are missing quotation marks for this locale: {0}",
                                 characters));
     }

     /**
      * Warns when the index exemplars contain characters outside the case-completed union of the
      * main and auxiliary exemplars.
      */
     private void checkIndexWithinMainAndAuxiliary(
             String value, UnicodeSet mainSet, List<CheckStatus> result) {
         if (mainSet == null) {
             return; // nothing to compare with (previously an NPE swallowed by the caller)
         }
         UnicodeSet auxiliarySet =
                 getResolvedCldrFileToCheck()
                         .getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING);
         if (auxiliarySet == null) {
             auxiliarySet = new UnicodeSet();
         }
         UnicodeSet mainAndAuxAllCase =
                 new UnicodeSet(mainSet)
                         .addAll(auxiliarySet)
                         .closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
         UnicodeSet indexBadChars =
                 SimpleUnicodeSetFormatter.parseLenient(value).removeAll(mainAndAuxAllCase);
         if (indexBadChars.isEmpty()) {
             return;
         }
         result.add(
                 new CheckStatus()
                         .setCause(this)
                         .setMainType(CheckStatus.warningType)
                         .setSubtype(Subtype.charactersNotInMainOrAuxiliaryExemplars)
                         .setMessage(
                                 "Index exemplars include characters not in main or auxiliary exemplars: {0}",
                                 indexBadChars.toPattern(false)));
     }

     /**
      * Reports an error when the main exemplars contain a right-to-left character although the
      * locale's character order is not "right-to-left". At most one error is added.
      */
     private void checkOrientationAgainstMain(UnicodeSet mainSet, List<CheckStatus> result) {
         if (mainSet == null) {
             return;
         }
         String charOrientation =
                 getResolvedCldrFileToCheck()
                         .getStringValue("//ldml/layout/orientation/characterOrder");
         // An unknown orientation is skipped explicitly (it used to be skipped through a
         // swallowed NPE); an RTL locale can never disagree with RTL exemplars.
         if (charOrientation == null || charOrientation.equals("right-to-left")) {
             return;
         }
         UnicodeSetIterator mi = new UnicodeSetIterator(mainSet);
         while (mi.next()) {
             if (mi.codepoint == UnicodeSetIterator.IS_STRING) {
                 continue; // multi-character strings are not inspected
             }
             final int direction = UCharacter.getDirection(mi.codepoint);
             if (direction == UCharacterDirection.RIGHT_TO_LEFT
                     || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) {
                 result.add(
                         new CheckStatus()
                                 .setCause(this)
                                 .setMainType(CheckStatus.errorType)
                                 .setSubtype(Subtype.orientationDisagreesWithExemplars)
                                 .setMessage(
                                         "Main exemplar set contains RTL characters, but orientation of this locale is not RTL."));
                 return;
             }
         }
     }

     /**
      * Verifies that the {@code sample} attribute of a parseLenient element is contained in the
      * element's value (parsed leniently as a UnicodeSet). Adds a badParseLenient error when it
      * is not, or when the value cannot be parsed.
      *
      * @param path distinguishing path; its last element carries the {@code sample} attribute
      * @param fullPath unused here
      * @param value the parseLenient value
      * @param options unused here
      * @param result list that receives any error
      */
     private void checkParse(
             String path, String fullPath, String value, Options options, List<CheckStatus> result) {
         try {
             XPathParts oparts = XPathParts.getFrozenInstance(path);
             // only thing we do is make sure that the sample is in the value
             UnicodeSet us = SimpleUnicodeSetFormatter.parseLenient(value);
             String sampleValue = oparts.getAttributeValue(-1, "sample");
             if (sampleValue == null) {
                 // Without a sample there is nothing to verify; UnicodeSet.contains would throw
                 // a NullPointerException that the catch below does not handle.
                 return;
             }
             if (!us.contains(sampleValue)) {
                 CheckStatus message =
                         new CheckStatus()
                                 .setCause(this)
                                 .setMainType(CheckStatus.errorType)
                                 .setSubtype(Subtype.badParseLenient)
                                 .setMessage(
                                         "ParseLenient sample not in value: {0} ∌ {1}",
                                         us, sampleValue);
                 result.add(message);
             }
         } catch (IllegalArgumentException e) {
             /*
              * Parsing the value throws IllegalArgumentException for malformed input such as
              * value = "?". This can happen during cldr-unittest TestAll, e.g. for
              * path = //ldml/characters/parseLenients[@scope="general"][@level="lenient"]/parseLenient[@sample="’"]
              * or
              * path = //ldml/characters/parseLenients[@scope="date"][@level="lenient"]/parseLenient[@sample="-"]
              */
             // Throwable.toString() already appends the message, so appending getMessage() again
             // (as was done before) printed the message twice.
             CheckStatus message =
                     new CheckStatus()
                             .setCause(this)
                             .setMainType(CheckStatus.errorType)
                             .setSubtype(Subtype.badParseLenient)
                             .setMessage(e.toString());
             result.add(message);
         }
     }

     // Script combinations that checkMixedScripts accepts even though they contain more than
     // one script. Bits are indexed by UScript code.
     static final BitSet Japn = new BitSet(); // Han + Hiragana + Katakana
     static final BitSet Kore = new BitSet(); // Han + Hangul

     static {
         Japn.set(UScript.HAN);
         Japn.set(UScript.HIRAGANA);
         Japn.set(UScript.KATAKANA);
         Kore.set(UScript.HAN);
         Kore.set(UScript.HANGUL);
     }

     /**
      * Adds an illegalExemplarSet error when {@code set} mixes scripts. Common and Inherited
      * characters are ignored, only the first code point of each element is classified, and the
      * Japn / Kore combinations are accepted.
      *
      * @param title label used at the start of the error message
      * @param set exemplar set to inspect
      * @param result list that receives the error, if any
      */
     private void checkMixedScripts(String title, UnicodeSet set, List<CheckStatus> result) {
         BitSet s = new BitSet();
         for (String item : set) {
             if (item.isEmpty()) {
                 continue; // an empty-string element has no code point to classify
             }
             int script = UScript.getScript(item.codePointAt(0));
             if (script != UScript.COMMON && script != UScript.INHERITED) {
                 s.set(script);
             }
         }
         final int cardinality = s.cardinality();
         if (cardinality < 2) {
             return;
         }
         if (cardinality == 2 && title.equals("currencySymbol") && s.get(UScript.LATIN)) {
             return; // allow 2 scripts in exemplars for currencies.
         }
         // allowable combinations
         if (s.equals(Japn) || s.equals(Kore)) {
             return;
         }

         // Build "Script (a,b,c,d…), Script (…)" with at most maxSamples characters per script.
         final int maxSamples = 4;
         StringBuilder scripts = new StringBuilder();
         for (int i = s.nextSetBit(0); i >= 0; i = s.nextSetBit(i + 1)) {
             if (scripts.length() != 0) {
                 scripts.append(", ");
             }
             scripts.append(UScript.getName(i));
             UnicodeSet inSet =
                     new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, i).retainAll(set);
             int shown = 0;
             scripts.append(" (");
             for (String cp : inSet) {
                 if (shown == maxSamples) {
                     // Reached only when a further element exists, so the ellipsis is no longer
                     // appended when the script has exactly maxSamples characters.
                     scripts.append('\u2026');
                     break;
                 }
                 if (shown != 0) {
                     scripts.append(",");
                 }
                 scripts.append(cp);
                 shown++;
             }
             scripts.append(")");
         }
         result.add(
                 new CheckStatus()
                         .setCause(this)
                         .setMainType(CheckStatus.errorType)
                         .setSubtype(Subtype.illegalExemplarSet)
                         .setMessage(
                                 "{0} exemplars contain multiple scripts: {1}",
                                 new Object[] {title, scripts}));
     }

     /**
      * Validates one exemplar value on its own: it must parse, must not mix scripts, must
      * survive a format/parse round trip, must contain only characters allowed for its type,
      * and (outside root) must not be empty for the main, auxiliary, index and punctuation types.
      *
      * @param v exemplar value; null is ignored
      * @param result list that receives any errors or warnings
      * @param exemplarType kind of exemplar set {@code v} belongs to
      */
     private void checkExemplar(String v, List<CheckStatus> result, ExemplarType exemplarType) {
         if (v == null) {
             return;
         }
         final UnicodeSet exemplar1;
         try {
             exemplar1 = SimpleUnicodeSetFormatter.parseLenient(v).freeze();
         } catch (Exception e) {
             // Some exceptions carry no message; fall back to toString() so the status never
             // gets a null message.
             final String detail = e.getMessage() != null ? e.getMessage() : e.toString();
             result.add(
                     new CheckStatus()
                             .setCause(this)
                             .setMainType(CheckStatus.errorType)
                             .setSubtype(Subtype.illegalExemplarSet)
                             .setMessage(detail));
             return;
         }

         // check for mixed scripts

         checkMixedScripts(exemplarType.toString(), exemplar1, result);

         // check that the formatting is correct: formatting and re-parsing must give the same set

         final String fixedExemplar1 = rawFormatter.format(exemplar1);
         boolean roundTrips;
         try {
             roundTrips = new UnicodeSet(fixedExemplar1).equals(exemplar1);
         } catch (IllegalArgumentException e) {
             // A formatted pattern that cannot be parsed back is a formatting failure too;
             // report it instead of letting the exception escape.
             roundTrips = false;
         }
         if (!roundTrips) {
             result.add(
                     new CheckStatus()
                             .setCause(this)
                             .setMainType(CheckStatus.errorType)
                             .setSubtype(Subtype.internalUnicodeSetFormattingError)
                             .setMessage(
                                     "Internal Error: formatting not working for {0}",
                                     new Object[] {exemplar1}));
         }

         // now check that only allowed characters are in the set

         if (!exemplarType.allowed.containsAll(exemplar1)) {
             // we do allow for punctuation & combining marks in strings, so only elements that
             // consist of a single code point count as violations
             UnicodeSet remainder = new UnicodeSet();
             for (String s : new UnicodeSet(exemplar1).removeAll(exemplarType.allowed)) {
                 if (Character.codePointCount(s, 0, s.length()) == 1) {
                     remainder.add(s);
                 }
             }

             if (!remainder.isEmpty()) {
                 result.add(
                         new CheckStatus()
                                 .setCause(this)
                                 .setMainType(CheckStatus.errorType)
                                 .setSubtype(Subtype.illegalCharactersInExemplars)
                                 .setMessage(
                                         "Should be limited to "
                                                 + exemplarType.message
                                                 + "; thus not contain: \u200E{0}\u200E",
                                         new Object[] {remainder}));
             }
         }

         // now check for empty

         if (!isRoot && exemplar1.size() == 0) {
             switch (exemplarType) {
                 case auxiliary:
                     // The closing tag was previously written as "<i>", leaving the markup
                     // unbalanced.
                     result.add(
                             new CheckStatus()
                                     .setCause(this)
                                     .setMainType(CheckStatus.warningType)
                                     .setSubtype(Subtype.missingAuxiliaryExemplars)
                                     .setMessage(
                                             "Most languages allow <i>some</i> auxiliary characters, so review this."));
                     break;
                 case index:
                 case punctuation:
                 case main:
                     result.add(
                             new CheckStatus()
                                     .setCause(this)
                                     .setMainType(CheckStatus.errorType)
                                     .setSubtype(Subtype.missingMainExemplars)
                                     .setMessage(
                                             "Exemplar set ("
                                                     + exemplarType
                                                     + ") must not be empty -- that would imply that this language uses no "
                                                     + (exemplarType == ExemplarType.punctuation
                                                             ? "punctuation"
                                                             : "letters")
                                                     + "!"));
                     break;
                 default:
                     break; // remaining types (numbers) are not checked for emptiness
             }
         }
     }
 }
	package org.unicode.cldr.test;

	import com.ibm.icu.lang.UCharacter;
	import com.ibm.icu.lang.UCharacterDirection;
	import com.ibm.icu.lang.UProperty;
	import com.ibm.icu.lang.UScript;
	import com.ibm.icu.text.Collator;
	import com.ibm.icu.text.UnicodeSet;
	import com.ibm.icu.text.UnicodeSetIterator;
	import com.ibm.icu.util.ULocale;
	import java.util.BitSet;
	import java.util.Comparator;
	import java.util.List;
	import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
	import org.unicode.cldr.util.CLDRConfig;
	import org.unicode.cldr.util.CLDRFile;
	import org.unicode.cldr.util.ComparatorUtilities;
	import org.unicode.cldr.util.Factory;
	import org.unicode.cldr.util.SimpleUnicodeSetFormatter;
	import org.unicode.cldr.util.SupplementalDataInfo;
	import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
	import org.unicode.cldr.util.XPathParts;

	public class CheckExemplars extends FactoryCheckCLDR {

	/** Selects which pattern the static initializer uses to build {@link #AlwaysOK}. */
	public static final boolean USE_PUNCTUATION = false;
	// While true, handleSetCldrFileToCheck does not warn about a missing auxiliary set.
	private static final boolean SUPPRESS_AUX_EMPTY_CHECK = true;
	// Element names appended to "//ldml/delimiters/" by handleCheck to look up the locale's
	// quotation marks.
	private static final String[] QUOTE_ELEMENTS = {
	"quotationStart", "quotationEnd",
	"alternateQuotationStart", "alternateQuotationEnd"
	};
	// Supplemental data obtained from CLDRConfig.
	static final SupplementalDataInfo SUP = CLDRConfig.getInstance().getSupplementalDataInfo();

	// Per-locale state; all four fields are assigned by handleSetCldrFileToCheck.
	Collator col;
	boolean isRoot;
	SimpleUnicodeSetFormatter displayFormatter;
	UnicodeSetPrettyPrinter rawFormatter;

	// Hangul syllables of type LV and LVT; handleCheck removes these from the main/auxiliary
	// overlap before reporting an error.
	static final UnicodeSet HangulSyllables =
	new UnicodeSet("[[:Hangul_Syllable_Type=LVT:][:Hangul_Syllable_Type=LV:]]").freeze();

	/**
	 * Characters that are removed from the allowed exemplar sets declared below
	 * (UAllowedInExemplars and ALLOWED_IN_AUX). Assigned exactly once by the static initializer
	 * that follows and then frozen.
	 */
	public static final UnicodeSet AlwaysOK;

	static {
	if (USE_PUNCTUATION) {
	// Only SPACE and NO-BREAK SPACE.
	AlwaysOK = new UnicodeSet("[\\u0020\\u00A0]");
	} else {
	// Decimal digits plus Common/Inherited-script characters, less default-ignorable code
	// points, [:C:] and '_'; then a few hard-coded additions; finally less the two ornamental
	// angle brackets. NOTE: the literals contain invisible characters.
	AlwaysOK =
	new UnicodeSet(
	"[[[:Nd:][:script=common:][:script=inherited:]-[:Default_Ignorable_Code_Point:]-[:C:] - [_]] [\u05BE \u05F3 \u066A-\u066C]"
	+ "[[؉][་ །༌][ཱ]‎‎{য়}য়]"
	+ // TODO Fix this Hack
	"-[❮❯]]"); // [\\u200c-\\u200f]
	// [:script=common:][:script=inherited:]
	}
	AlwaysOK.freeze();
	}
	// TODO Fix some of these characters
	// Characters that are added back into UAllowedInExemplars, UAllowedInNumbers and
	// ALLOWED_IN_AUX: bidi/joiner controls (U+061C, U+200E, U+200F, U+200C, U+200D), several
	// Arabic and Devanagari marks, digits, U+02BB/U+02BC, and the Katakana / ALetter / MidLetter
	// word-break classes. NOTE: the literals contain invisible characters.
	private static final UnicodeSet SPECIAL_ALLOW =
	new UnicodeSet(
	"[\u061C\\u200E\\u200F\\u200c\\u200d"
	+ "‎‎‎[\u064B\u064E-\u0651\u0670]‎[:Nd:]‎[\u0951\u0952]‎[\u064B-\u0652\u0654-\u0657\u0670]‎[\u0A66-\u0A6F][\u0ED0-\u0ED9][\u064B-\u0652]‎[\\u02BB\\u02BC][\u0CE6-\u0CEF]‎‎[\u0966-\u096F]"
	+ "‎‎‎[:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] ]" // restore
	// [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
	)
	.freeze(); // includes RLM, LRM, ZWNJ and ZWJ

	/**
	 * Case-insensitive base set: every assigned code point except separators ([:Z:]), minus
	 * {@link #AlwaysOK}, plus SPECIAL_ALLOW. Used as-is for the {@code index} exemplar type.
	 */
	public static final UnicodeSet UAllowedInExemplars =
	new UnicodeSet(
	"[[:assigned:]-[:Z:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
	.removeAll(AlwaysOK) // this will remove some
	// [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we
	// restore them
	// in SPECIAL_ALLOW
	.addAll(SPECIAL_ALLOW) // re-adds RLM, LRM, ZWNJ, ZWJ among others
	.freeze();

	/**
	 * Allowed content of the {@code numbers} exemplar type: NBSP, NARROW NBSP, numbers,
	 * punctuation, math symbols and numeric characters, plus SPECIAL_ALLOW.
	 */
	public static final UnicodeSet UAllowedInNumbers =
	new UnicodeSet(
	"[\u00A0\u202F[:N:][:P:][:Sm:][:Letter_Number:][:Numeric_Type=Numeric:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
	.addAll(SPECIAL_ALLOW) // re-adds RLM, LRM, ZWNJ, ZWJ among others
	.freeze();

	/**
	 * {@link #UAllowedInExemplars} without uppercase characters, except that U+0130 (capital I
	 * with dot above) stays allowed. Used for the {@code main} exemplar type.
	 */
	public static final UnicodeSet AllowedInExemplars =
	new UnicodeSet(UAllowedInExemplars)
	.removeAll(new UnicodeSet("[[:Uppercase:]-[\u0130]]"))
	.freeze();

	/** Punctuation and symbols, excluding currency symbols ([:Sc:]). */
	public static final UnicodeSet ALLOWED_IN_PUNCTUATION =
	new UnicodeSet("[[:P:][:S:]-[:Sc:]]").freeze();

	/**
	 * Allowed content of the {@code auxiliary} exemplar type: {@link #AllowedInExemplars} plus
	 * {@link #ALLOWED_IN_PUNCTUATION}, minus {@link #AlwaysOK}, plus SPECIAL_ALLOW. The order of
	 * the remove/add steps matters because the sets overlap.
	 */
	public static final UnicodeSet ALLOWED_IN_AUX =
	new UnicodeSet(AllowedInExemplars)
	.addAll(ALLOWED_IN_PUNCTUATION)
	.removeAll(AlwaysOK) // this will remove some
	// [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we
	// restore them
	// in SPECIAL_ALLOW
	.addAll(SPECIAL_ALLOW) // re-adds RLM, LRM, ZWNJ, ZWJ among others
	.freeze();

	/**
	 * The exemplar-set kinds this check knows about. Each constant carries the frozen set of
	 * characters permitted for that kind, the complement of that set, a short description used
	 * in error messages, and a flag stored as {@code convertUppercase}.
	 */
	public enum ExemplarType {
	    main(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", true),
	    punctuation(ALLOWED_IN_PUNCTUATION, "punctuation", false),
	    auxiliary(ALLOWED_IN_AUX, "(specific-script - uppercase - invisibles + \u0130)", true),
	    index(UAllowedInExemplars, "(specific-script - invisibles)", false),
	    numbers(UAllowedInNumbers, "(specific-script - invisibles)", false);
	    // Disabled constant, kept for reference:
	    // currencySymbol(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)",
	    // false)

	    /** Characters permitted in this kind of exemplar set (frozen). */
	    public final UnicodeSet allowed;
	    /** Everything that is not in {@link #allowed} (frozen). */
	    public final UnicodeSet toRemove;
	    /** Human-readable description of {@link #allowed}. */
	    public final String message;
	    /** Flag supplied by the constant's declaration; not read inside this enum. */
	    public final boolean convertUppercase;

	    /**
	     * @param permitted frozen set of allowed characters
	     * @param description text describing {@code permitted}
	     * @param foldUppercase value stored in {@link #convertUppercase}
	     * @throws IllegalArgumentException if {@code permitted} is not frozen
	     */
	    ExemplarType(UnicodeSet permitted, String description, boolean foldUppercase) {
	        if (!permitted.isFrozen()) {
	            throw new IllegalArgumentException("Internal Error");
	        }
	        // Build the complement on a private copy so the shared frozen set is untouched.
	        UnicodeSet inverse = new UnicodeSet(permitted);
	        inverse.complement();

	        allowed = permitted;
	        toRemove = inverse.freeze();
	        message = description;
	        convertUppercase = foldUppercase;
	    }
	}

	/**
	 * Creates the check.
	 *
	 * @param factory handed unchanged to the FactoryCheckCLDR constructor
	 */
	public CheckExemplars(Factory factory) {
	super(factory);
	}

	// Allowed[:script=common:][:script=inherited:][:alphabetic=false:]

	/**
	 * Prepares this check for a new file: records whether the locale is root and builds the
	 * locale's collators and the two UnicodeSet formatters used by the other methods.
	 *
	 * @param cldrFileToCheck file about to be checked; when null nothing is done
	 * @param options passed through to the superclass
	 * @param possibleErrors receives file-level statuses (only the auxiliary warning, and only
	 *     when SUPPRESS_AUX_EMPTY_CHECK is false)
	 * @return this check
	 */
	@Override
	public CheckCLDR handleSetCldrFileToCheck(
	        CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors) {
	    if (cldrFileToCheck == null) {
	        return this;
	    }
	    super.handleSetCldrFileToCheck(cldrFileToCheck, options, possibleErrors);

	    // Fetch the locale ID and build the ULocale once; both were previously computed twice.
	    final String locale = cldrFileToCheck.getLocaleID();
	    final ULocale uLocale = new ULocale(locale);
	    isRoot = "root".equals(locale);
	    col = ComparatorUtilities.getIcuCollator(uLocale, Collator.IDENTICAL);
	    Collator spaceCol = ComparatorUtilities.getIcuCollator(uLocale, Collator.PRIMARY);
	    // The raw Comparator casts are kept as in the original call sites.
	    displayFormatter = new SimpleUnicodeSetFormatter((Comparator) col);
	    rawFormatter = UnicodeSetPrettyPrinter.from((Comparator) col, (Comparator) spaceCol);

	    // check for auxiliary anyway
	    if (!SUPPRESS_AUX_EMPTY_CHECK) {
	        UnicodeSet auxiliarySet =
	                getResolvedCldrFileToCheck()
	                        .getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING);

	        if (auxiliarySet == null) {
	            // The closing tag was previously written as "<i>", leaving the markup unbalanced.
	            possibleErrors.add(
	                    new CheckStatus()
	                            .setCause(this)
	                            .setMainType(CheckStatus.warningType)
	                            .setSubtype(Subtype.missingAuxiliaryExemplars)
	                            .setMessage(
	                                    "Most languages allow <i>some</i> auxiliary characters, so review this."));
	        }
	    }
	    return this;
	}

	@Override
	public CheckCLDR handleCheck(
	String path, String fullPath, String value, Options options, List<CheckStatus> result) {
	if (fullPath == null) return this; // skip paths that we don't have
	if (path.indexOf("/exemplarCharacters") < 0) {
	if (path.contains("parseLenient")) {
	checkParse(path, fullPath, value, options, result);
	}
	return this;
	}
	if (!accept(result)) return this;
	XPathParts oparts = XPathParts.getFrozenInstance(path);
	final String exemplarString = oparts.findAttributeValue("exemplarCharacters", "type");
	ExemplarType type =
	exemplarString == null ? ExemplarType.main : ExemplarType.valueOf(exemplarString);
	checkExemplar(value, result, type);

	// check relation to auxiliary set
	try {
	UnicodeSet mainSet =
	getResolvedCldrFileToCheck().getExemplarSet("", CLDRFile.WinningChoice.WINNING);
	if (type == ExemplarType.auxiliary) {
	UnicodeSet auxiliarySet = SimpleUnicodeSetFormatter.parseLenient(value);

	UnicodeSet combined = new UnicodeSet(mainSet).addAll(auxiliarySet);
	checkMixedScripts("main+auxiliary", combined, result);

	if (auxiliarySet.containsSome(mainSet)) {
	UnicodeSet overlap =
	new UnicodeSet(mainSet)
	.retainAll(auxiliarySet)
	.removeAll(HangulSyllables);
	if (overlap.size() != 0) {
	String fixedExemplar1 = rawFormatter.format(overlap);
	result.add(
	new CheckStatus()
	.setCause(this)
	.setMainType(CheckStatus.errorType)
	.setSubtype(Subtype.auxiliaryExemplarsOverlap)
	.setMessage(
	"Auxiliary characters also exist in main: \u200E{0}\u200E",
	new Object[] {fixedExemplar1}));
	}
	}
	} else if (type == ExemplarType.punctuation) {
	// Check that the punctuation exemplar characters include quotation marks.
	UnicodeSet punctuationSet = SimpleUnicodeSetFormatter.parseLenient(value);
	UnicodeSet quoteSet = new UnicodeSet();
	for (String element : QUOTE_ELEMENTS) {
	quoteSet.add(
	getResolvedCldrFileToCheck()
	.getWinningValue("//ldml/delimiters/" + element));
	}
	if (!punctuationSet.containsAll(quoteSet)) {
	quoteSet.removeAll(punctuationSet);
	// go ahead and list the characters separately, with space between, for clarity.
	StringBuilder characters = new StringBuilder();
	for (String item : quoteSet) {
	if (characters.length() != 0) {
	characters.append(" ");
	}
	characters.append(item);
	}
	// String characters = quoteSet.toPattern(false);
	CheckStatus message =
	new CheckStatus()
	.setCause(this)
	.setMainType(CheckStatus.warningType)
	.setSubtype(Subtype.missingPunctuationCharacters)
	.setMessage(
	"Punctuation exemplar characters are missing quotation marks for this locale: {0}",
	characters);
	result.add(message);
	}
	} else if (type == ExemplarType.index) {
	// Check that the index exemplar characters are in case-completed union of main and
	// auxiliary exemplars
	UnicodeSet auxiliarySet =
	getResolvedCldrFileToCheck()
	.getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING);
	if (auxiliarySet == null) {
	auxiliarySet = new UnicodeSet();
	}
	UnicodeSet mainAndAuxAllCase =
	new UnicodeSet(mainSet)
	.addAll(auxiliarySet)
	.closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
	UnicodeSet indexBadChars =
	SimpleUnicodeSetFormatter.parseLenient(value).removeAll(mainAndAuxAllCase);

	if (!indexBadChars.isEmpty()) {
	CheckStatus message =
	new CheckStatus()
	.setCause(this)
	.setMainType(CheckStatus.warningType)
	.setSubtype(Subtype.charactersNotInMainOrAuxiliaryExemplars)
	.setMessage(
	"Index exemplars include characters not in main or auxiliary exemplars: {0}",
	indexBadChars.toPattern(false));
	result.add(message);
	}
	}

	// check for consistency with RTL

	Boolean localeIsRTL = false;
	String charOrientation =
	getResolvedCldrFileToCheck()
	.getStringValue("//ldml/layout/orientation/characterOrder");
	if (charOrientation.equals("right-to-left")) {
	localeIsRTL = true;
	}

	UnicodeSetIterator mi = new UnicodeSetIterator(mainSet);
	while (mi.next()) {
	if (mi.codepoint != UnicodeSetIterator.IS_STRING
	&& (UCharacter.getDirection(mi.codepoint)
	== UCharacterDirection.RIGHT_TO_LEFT
	\|\| UCharacter.getDirection(mi.codepoint)
	== UCharacterDirection.RIGHT_TO_LEFT_ARABIC)
	&& !localeIsRTL) {
	result.add(
	new CheckStatus()
	.setCause(this)
	.setMainType(CheckStatus.errorType)
	.setSubtype(Subtype.orientationDisagreesWithExemplars)
	.setMessage(
	"Main exemplar set contains RTL characters, but orientation of this locale is not RTL."));
	break;
	}
	}

	} catch (Exception e) {
	} // if these didn't parse, checkExemplar will be called anyway at some point
	return this;
	}

	/**
	 * Checks a parseLenient item: the value, parsed as a set, must contain the string given in
	 * the {@code sample} attribute of the last path element. A {@code badParseLenient} error is
	 * added when it does not, or when parsing throws an {@link IllegalArgumentException}.
	 *
	 * @param path the path of the parseLenient element; its {@code sample} attribute is read
	 * @param fullPath unused here
	 * @param value the set pattern to parse
	 * @param options unused here
	 * @param result the list that any resulting {@link CheckStatus} is appended to
	 */
	private void checkParse(
	        String path, String fullPath, String value, Options options, List<CheckStatus> result) {
	    try {
	        XPathParts oparts = XPathParts.getFrozenInstance(path);
	        // only thing we do is make sure that the sample is in the value
	        UnicodeSet us = SimpleUnicodeSetFormatter.parseLenient(value);
	        String sampleValue = oparts.getAttributeValue(-1, "sample");
	        if (!us.contains(sampleValue)) {
	            CheckStatus message =
	                    new CheckStatus()
	                            .setCause(this)
	                            .setMainType(CheckStatus.errorType)
	                            .setSubtype(Subtype.badParseLenient)
	                            .setMessage(
	                                    "ParseLenient sample not in value: {0} ∌ {1}",
	                                    us, sampleValue);
	            result.add(message);
	        }
	    } catch (IllegalArgumentException e) {
	        /*
	         * Parsing the value can throw IllegalArgumentException if, for example, value is null or value = "?".
	         * This can happen during cldr-unittest TestAll.
	         * path = //ldml/characters/parseLenients[@scope="general"][@level="lenient"]/parseLenient[@sample="’"]
	         * or
	         * path = //ldml/characters/parseLenients[@scope="date"][@level="lenient"]/parseLenient[@sample="-"]
	         */
	        // Throwable.toString() already yields "<class name>: <message>" (or just the class
	        // name when there is no message), so the message must not be appended a second time.
	        CheckStatus message =
	                new CheckStatus()
	                        .setCause(this)
	                        .setMainType(CheckStatus.errorType)
	                        .setSubtype(Subtype.badParseLenient)
	                        .setMessage(e.toString());
	        result.add(message);
	    }
	}

	/** Script combination Han + Hiragana + Katakana; accepted as-is by checkMixedScripts. */
	static final BitSet Japn = new BitSet();

	/** Script combination Han + Hangul; accepted as-is by checkMixedScripts. */
	static final BitSet Kore = new BitSet();

	static {
	    // Fill each set from the list of script codes that make up the combination.
	    for (int scriptCode : new int[] {UScript.HAN, UScript.HIRAGANA, UScript.KATAKANA}) {
	        Japn.set(scriptCode);
	    }
	    for (int scriptCode : new int[] {UScript.HAN, UScript.HANGUL}) {
	        Kore.set(scriptCode);
	    }
	}

	private void checkMixedScripts(String title, UnicodeSet set, List<CheckStatus> result) {
	BitSet s = new BitSet();
	for (String item : set) {
	int script = UScript.getScript(item.codePointAt(0));
	if (script != UScript.COMMON && script != UScript.INHERITED) {
	s.set(script);
	}
	}
	final int cardinality = s.cardinality();
	if (cardinality < 2) {
	return;
	}
	if (cardinality == 2 && title.equals("currencySymbol") && s.get(UScript.LATIN)) {
	return; // allow 2 scripts in exemplars for currencies.
	}
	// allowable combinations
	if (s.equals(Japn) \|\| s.equals(Kore)) {
	return;
	}
	StringBuilder scripts = new StringBuilder();
	for (int i = s.nextSetBit(0); i >= 0; i = s.nextSetBit(i + 1)) {
	if (scripts.length() != 0) {
	scripts.append(", ");
	}
	scripts.append(UScript.getName(i));
	UnicodeSet inSet =
	new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, i).retainAll(set);
	int count = 0;
	scripts.append(" (");
	for (String cp : inSet) {
	if (count != 0) {
	scripts.append(",");
	}
	scripts.append(cp);
	count++;
	if (count > 3) {
	scripts.append('\u2026');
	break;
	}
	}
	scripts.append(")");
	}
	result.add(
	new CheckStatus()
	.setCause(this)
	.setMainType(CheckStatus.errorType)
	.setSubtype(Subtype.illegalExemplarSet)
	.setMessage(
	"{0} exemplars contain multiple scripts: {1}",
	new Object[] {title, scripts}));
	return;
	}

	/**
	 * Validates one exemplar set value and appends any problems to {@code result}.
	 *
	 * <p>In order: the value must parse (otherwise an {@code illegalExemplarSet} error is added
	 * and nothing else is checked); the set is checked for mixed scripts; the raw formatter's
	 * output must parse back to the same set; single-code-point elements outside the type's
	 * allowed set are reported; and, except for root, an empty set yields a warning (auxiliary)
	 * or an error (index, punctuation, main).
	 *
	 * @param v the exemplar set in string form; null is silently ignored
	 * @param result the list that any resulting {@link CheckStatus} items are appended to
	 * @param exemplarType the kind of exemplar set, supplying the allowed characters and message
	 */
	private void checkExemplar(String v, List<CheckStatus> result, ExemplarType exemplarType) {
	    if (v == null) return;
	    final UnicodeSet exemplar1;
	    try {
	        exemplar1 = SimpleUnicodeSetFormatter.parseLenient(v).freeze();
	    } catch (Exception e) {
	        // Exceptions such as NullPointerException may carry no message; fall back to
	        // toString() so that a null message is never handed to setMessage.
	        final String detail = e.getMessage() != null ? e.getMessage() : e.toString();
	        result.add(
	                new CheckStatus()
	                        .setCause(this)
	                        .setMainType(CheckStatus.errorType)
	                        .setSubtype(Subtype.illegalExemplarSet)
	                        .setMessage(detail));
	        return;
	    }

	    // check for mixed scripts

	    checkMixedScripts(exemplarType.toString(), exemplar1, result);

	    // check that the formatting is correct: format, re-parse, and compare (round trip)

	    final String fixedExemplar1 = rawFormatter.format(exemplar1);
	    UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar1);
	    if (!doubleCheck.equals(exemplar1)) {
	        result.add(
	                new CheckStatus()
	                        .setCause(this)
	                        .setMainType(CheckStatus.errorType)
	                        .setSubtype(Subtype.internalUnicodeSetFormattingError)
	                        .setMessage(
	                                "Internal Error: formatting not working for {0}",
	                                new Object[] {exemplar1}));
	    }

	    // now check that only allowed characters are in the set

	    if (!exemplarType.allowed.containsAll(exemplar1)) {
	        UnicodeSet remainder0 = new UnicodeSet(exemplar1).removeAll(exemplarType.allowed);

	        // we do allow for punctuation & combining marks in strings, so only elements that
	        // consist of a single code point are reported
	        UnicodeSet remainder = new UnicodeSet();
	        for (String s : remainder0) {
	            if (Character.codePointCount(s, 0, s.length()) == 1) {
	                remainder.add(s);
	            }
	        }

	        if (remainder.size() != 0) {
	            result.add(
	                    new CheckStatus()
	                            .setCause(this)
	                            .setMainType(CheckStatus.errorType)
	                            .setSubtype(Subtype.illegalCharactersInExemplars)
	                            .setMessage(
	                                    "Should be limited to "
	                                            + exemplarType.message
	                                            + "; thus not contain: \u200E{0}\u200E",
	                                    new Object[] {remainder}));
	        }
	    }

	    // now check for empty

	    if (!isRoot && exemplar1.size() == 0) {
	        switch (exemplarType) {
	            case auxiliary:
	                result.add(
	                        new CheckStatus()
	                                .setCause(this)
	                                .setMainType(CheckStatus.warningType)
	                                .setSubtype(Subtype.missingAuxiliaryExemplars)
	                                .setMessage(
	                                        "Most languages allow <i>some</i> auxiliary characters, so review this."));
	                break;
	            case index:
	            case punctuation:
	            case main:
	                result.add(
	                        new CheckStatus()
	                                .setCause(this)
	                                .setMainType(CheckStatus.errorType)
	                                .setSubtype(Subtype.missingMainExemplars)
	                                .setMessage(
	                                        "Exemplar set ("
	                                                + exemplarType
	                                                + ") must not be empty -- that would imply that this language uses no "
	                                                + (exemplarType == ExemplarType.punctuation
	                                                        ? "punctuation"
	                                                        : "letters")
	                                                + "!"));
	                break;
	        }
	    }
	}
	}