| package org.unicode.cldr.tool; |
| |
| import java.io.File; |
| import java.io.PrintWriter; |
| import java.util.Map; |
| import java.util.concurrent.ConcurrentHashMap; |
| |
| import org.unicode.cldr.draft.FileUtilities; |
| import org.unicode.cldr.test.DisplayAndInputProcessor; |
| import org.unicode.cldr.util.CLDRFile; |
| import org.unicode.cldr.util.CLDRPaths; |
| import org.unicode.cldr.util.CLDRTransforms; |
| import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID; |
| import org.unicode.cldr.util.CldrUtility; |
| import org.unicode.cldr.util.DtdType; |
| import org.unicode.cldr.util.Factory; |
| import org.unicode.cldr.util.LocaleIDParser; |
| import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException; |
| import org.unicode.cldr.util.SimpleXMLSource; |
| import org.unicode.cldr.util.XMLSource; |
| |
| import com.ibm.icu.text.Normalizer; |
| import com.ibm.icu.text.Transliterator; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.util.ICUUncheckedIOException; |
| |
| /** |
| * Transforms the contents of a CLDRFile. |
| * |
| * @author jchye |
| */ |
| public class CLDRFileTransformer { |
| public enum PolicyIfExisting { |
| RETAIN, // Do not transliterate if existing output has locale content |
| DISCARD, // Replace existing output locale content |
| MINIMIZE // RETAIN, plus drop values if translit is a no-op. |
| } |
| |
| /** |
| * Contains all supported locale-to-locale conversions along with information |
| * needed to convert each locale. Each enum value is named after the locale that results |
| * from the conversion. |
| */ |
| public enum LocaleTransform { |
| sr_Latn("sr", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // |
| sr_Latn_BA("sr_Cyrl_BA", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // |
| sr_Latn_ME("sr_Cyrl_ME", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // |
| sr_Latn_XK("sr_Cyrl_XK", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // |
| ha_NE("ha", "ha-ha_NE.xml", Transliterator.FORWARD, "[y Y ƴ Ƴ ʼ]", PolicyIfExisting.DISCARD), // |
| yo_BJ("yo", "yo-yo_BJ.xml", Transliterator.FORWARD, "[ẹ ọ ṣ Ẹ Ọ Ṣ]", PolicyIfExisting.DISCARD), // |
| de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), // |
| yue_Hans("yue", "Simplified-Traditional.xml", Transliterator.REVERSE, "[:script=Hant:]", PolicyIfExisting.RETAIN), // |
| // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD), |
| // Needs work to fix currency symbols, handle Māori. See http://unicode.org/cldr/trac/ticket/9516#comment:6 |
| ; |
| |
| private final String inputLocale; |
| private final String transformFilename; |
| private final int direction; |
| private final UnicodeSet inputChars; |
| private final PolicyIfExisting policy; |
| |
| /** |
| * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)} instead |
| */ |
| @Deprecated |
| private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern) { |
| this(inputLocale, transformFilename, direction, inputCharPattern, PolicyIfExisting.DISCARD); |
| } |
| |
| private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy) { |
| this.inputLocale = inputLocale; |
| this.transformFilename = transformFilename; |
| this.direction = direction; |
| this.inputChars = new UnicodeSet(inputCharPattern); |
| this.policy = policy; |
| } |
| |
| /** |
| * @return the policy for existing content |
| */ |
| public PolicyIfExisting getPolicyIfExisting() { |
| return policy; |
| } |
| |
| /** |
| * @return the locale that used for conversion |
| */ |
| public String getInputLocale() { |
| return inputLocale; |
| } |
| |
| /** |
| * @return the locale that used for conversion |
| */ |
| public String getOutputLocale() { |
| return this.toString(); |
| } |
| |
| /** |
| * @return the filename of the transform used to make the conversion |
| */ |
| public String getTransformFilename() { |
| return transformFilename; |
| } |
| |
| /** |
| * @return the direction of the transformation |
| */ |
| public int getDirection() { |
| return direction; |
| } |
| |
| /** |
| * @return the set of characters in the input locale that should have been removed after |
| * transformation, used for internal debugging |
| */ |
| private UnicodeSet getInputChars() { |
| return inputChars; |
| } |
| } |
| |
| private UnicodeSet unconverted = new UnicodeSet(); |
| private Factory factory; |
| /* |
| * The transliterators map exists, and is static, to avoid wasting a lot of time creating |
| * a new Transliterator more often than necessary. (An alternative to "static" here might be to |
| * create only one CLDRFileTransformer, maybe as a member of ExampleGenerator.) |
| * Use ConcurrentHashMap rather than HashMap to avoid concurrency problems. |
| * Reference: https://unicode.org/cldr/trac/ticket/11657 |
| */ |
| private static Map<LocaleTransform, Transliterator> transliterators = new ConcurrentHashMap<>(); |
| private String transformDir; |
| |
| /** |
| * @param factory |
| * the factory to get locale data from |
| * @param transformDir |
| * the directory containing the transform files |
| */ |
| public CLDRFileTransformer(Factory factory, String transformDir) { |
| this.factory = factory; |
| this.transformDir = transformDir; |
| } |
| |
| public Transliterator loadTransliterator(LocaleTransform localeTransform) { |
| if (transliterators.containsKey(localeTransform)) { |
| return transliterators.get(localeTransform); |
| } |
| Transliterator transliterator; |
| if (localeTransform.getTransformFilename().contains(".xml")) { |
| ParsedTransformID directionInfo = new ParsedTransformID(); |
| String ruleString = CLDRTransforms.getIcuRulesFromXmlFile(transformDir, localeTransform.getTransformFilename(), directionInfo); |
| transliterator = Transliterator.createFromRules(directionInfo.getId(), |
| ruleString, localeTransform.getDirection()); |
| } else { |
| transliterator = Transliterator.getInstance(localeTransform.getTransformFilename()); |
| } |
| transliterators.put(localeTransform, transliterator); |
| return transliterator; |
| } |
| |
| /** |
| * NOTE: This method does not currently handle nested transliterators. |
| * |
| * @param input |
| * @return null if the input file was missing, or if there is no new output file. |
| */ |
| public CLDRFile transform(LocaleTransform localeTransform) { |
| Transliterator transliterator = loadTransliterator(localeTransform); |
| CLDRFile input; |
| final String inputLocale = localeTransform.getInputLocale(); |
| try { |
| input = factory.make(inputLocale, false); |
| } catch (ICUUncheckedIOException e1) { |
| return null; // input file is missing (or otherwise unavailable) |
| } |
| boolean hadOutput = true; |
| CLDRFile output; |
| try { |
| output = factory.make(localeTransform.getOutputLocale(), false); |
| } catch (NoSourceDirectoryException e) { |
| // if we can't open the file, then just make a new one. |
| XMLSource dataSource = new SimpleXMLSource(localeTransform.getOutputLocale()); |
| output = new CLDRFile(dataSource); |
| hadOutput = false; |
| } |
| String outputParentString = LocaleIDParser.getParent(localeTransform.getOutputLocale()); |
| CLDRFile outputParent = factory.make(outputParentString, true); |
| |
| outputParent = factory.make(inputLocale, false); |
| XMLSource outputSource = new SimpleXMLSource(localeTransform.toString()); |
| DisplayAndInputProcessor daip = new DisplayAndInputProcessor(output, true); |
| for (String xpath : input) { |
| String value = input.getStringValue(xpath); |
| if (CldrUtility.INHERITANCE_MARKER.equals(value)) { |
| final String foundIn = input.getSourceLocaleID(xpath, null); |
| // Include these only when they are actually present in this file |
| if (!foundIn.equals(inputLocale)) { |
| // inheritance marker came from somewhere else, ignore it |
| continue; |
| } |
| } |
| if (value == null) { |
| continue; |
| } |
| String fullPath = input.getFullXPath(xpath); |
| String oldValue = output.getStringValue(xpath); |
| String parentValue = outputParent.getStringValue(xpath); |
| value = transformValue(transliterator, localeTransform, xpath, value, oldValue, parentValue); |
| if (value != null) { |
| // check again |
| if (CldrUtility.INHERITANCE_MARKER.equals(value)) { |
| final String foundIn = input.getSourceLocaleID(xpath, null); |
| // Include these only when they are actually present in this file |
| if (!foundIn.equals(inputLocale)) { |
| // inheritance marker came from somewhere else, ignore it |
| continue; |
| } |
| } |
| value = daip.processInput(xpath, value, null); |
| outputSource.putValueAtPath(fullPath, value); |
| } |
| } |
| if (!outputSource.iterator().hasNext()) { // empty new output |
| if (!hadOutput) { |
| return null; // don't add file if nothing to add |
| } |
| } |
| return new CLDRFile(outputSource); |
| } |
| |
| /** |
| * Transforms a CLDRFile value into another form. |
| * @param parentValue |
| */ |
| private String transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value, |
| String oldValue, String parentValue) { |
| |
| // allows us to change only new values |
| switch (localeTransform.policy) { |
| case RETAIN: |
| case MINIMIZE: |
| if (oldValue != null) { |
| return oldValue; |
| } |
| break; |
| default: |
| } |
| |
| UnicodeSet chars = localeTransform.getInputChars(); |
| String transliterated; |
| |
| // TODO: Don't transform dates/patterns. |
| // For now, don't try to transliterate the exemplar characters - use the ones from the original locale. |
| // In the future, we can probably control this better with a config file - similar to CLDRModify's config file. |
| if (path.contains("exemplarCharacters")) { |
| if (oldValue != null) { |
| transliterated = oldValue; |
| } else { |
| transliterated = value; |
| } |
| } else { |
| transliterated = transliterator.transliterate(value); |
| transliterated = Normalizer.compose(transliterated, false); |
| } |
| if (localeTransform.policy == PolicyIfExisting.MINIMIZE) { |
| if (transliterated.equals(value)) { |
| return null; |
| } |
| } |
| |
| if (chars.containsSome(transliterated)) { |
| unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated)); |
| } |
| return transliterated; |
| } |
| |
| public static void main(String[] args) throws Exception { |
| for (String dir : DtdType.ldml.directories) { |
| if (dir.equals("casing") // skip, field contents are keywords, not localizable content |
| || dir.equals("collation") // skip, field contents are complex, and can't be simply remapped |
| || dir.equals("annotationsDerived") // skip, derived later |
| ) { |
| continue; |
| } |
| System.out.println("\nDirectory: " + dir); |
| final String sourceDirectory = CLDRPaths.COMMON_DIRECTORY + dir + "/"; |
| Factory factory = Factory.make(sourceDirectory, ".*"); |
| |
| CLDRFileTransformer transformer = new CLDRFileTransformer(factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator); |
| for (LocaleTransform localeTransform : LocaleTransform.values()) { |
| CLDRFile output = transformer.transform(localeTransform); |
| if (output == null) { |
| System.out.println("SKIPPING missing file: " + dir + "/" + localeTransform.inputLocale + ".xml"); |
| continue; |
| } |
| String outputDir = CLDRPaths.GEN_DIRECTORY + "common/" + dir + File.separator; |
| String outputFile = output.getLocaleID() + ".xml"; |
| |
| try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile)) { |
| System.out.println("Generating locale file: " + outputDir + outputFile); |
| if (!transformer.unconverted.isEmpty()) { |
| System.out.println("Untransformed characters: " + transformer.unconverted); |
| transformer.unconverted.clear(); |
| } |
| output.write(out); |
| } |
| } |
| } |
| } |
| } |