// blob: 9990b715c43ba7c801391b175879a320fb665990 [file] [log] [blame]
// Copyright 2011-2017 Google Inc. All Rights Reserved.
package org.unicode.cldr.tool;
import java.io.File;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.SimpleXMLSource;
import org.unicode.cldr.util.XMLSource;
/**
* Generates pseudolocalized contents of a CLDRFile.
*
* @author viarheichyk@google.com (Igor Viarheichyk)
*/
public class CLDRFilePseudolocalizer {
/**
 * Matches a numeric placeholder: an opening brace, one or more digits, a closing brace.
 * transformValue passes it to createMessage for non-pattern values, where the matched
 * placeholders are the parts that are left untouched.
 */
private static final Pattern NUMERIC_PLACEHOLDER = Pattern.compile("\\{\\d+\\}");
/**
 * Matches the shortest run of text enclosed in a pair of single quotes (quotes included).
 * transformValue passes it to createMessage for pattern values, where the matched quoted
 * text is the only part that is pseudolocalized.
 */
private static final Pattern QUOTED_TEXT = Pattern.compile("'.*?'");
// Android patch (b/37077221) begin.
/** Path segment appended after the "main" directory when generateAndSave builds the output directory. */
private static final String PSEUDOLOCALES_DIRECTORY = ".";
// Android patch (b/37077221) end.
/** Locale whose data generate() reads as the input of pseudolocalization. */
private static final String ORIGINAL_LOCALE = "en";
// Android patch (b/37512961) begin.
/** Path whose value generate() sets to "latn" in the output. */
private static final String NUMBERS_PATH = "//ldml/numbers/defaultNumberingSystem";
// Android patch (b/37512961) end.
/** Path of the exemplar characters that generate() reads from the input locale. */
private static final String EXEMPLAR_PATH = "//ldml/characters/exemplarCharacters";
/** Path of the auxiliary exemplar set that generate() writes the merged exemplars to. */
private static final String EXEMPLAR_AUX_PATH = "//ldml/characters/exemplarCharacters[@type=\"auxiliary\"]";
/** String.format template for a territory display name path; %s receives the territory code. */
private static final String TERRITORY_PATTERN = "//ldml/localeDisplayNames/territories/territory[@type=\"%s\"]";
/**
 * Path substrings that disable pseudolocalization: transformValue returns the value unchanged
 * when the path contains any of these.
 */
private static final String[] EXCLUDE_LIST = { "/exemplarCharacters", "/delimiters",
"/contextTransforms", "/numbers",
"/units", // [ and ] are not allowed in units
"narrow", "localeDisplayPattern", "timeZoneNames/fallbackFormat", // Expansion limits
};
/**
 * Path substrings that mark a value as a pattern: transformValue then pseudolocalizes only
 * the single-quoted text of the value.
 */
private static final String[] PATTERN_LIST = { "/pattern", "FormatItem", "hourFormat" };
/**
 * Base pseudolocalizer that leaves text untouched: it contributes an empty prefix and suffix
 * and returns every fragment as is. Subclasses override the hooks they need.
 */
private static class Pseudolocalizer {
    /** Flag last stored through {@link #setPattern(boolean)}; false until then. */
    private boolean pattern = false;

    public Pseudolocalizer() {
    }

    /** Returns the text to place before a message; empty in this implementation. */
    public String start() {
        return "";
    }

    /** Returns the transformed form of a localizable fragment; the input itself here. */
    public String fragment(String text) {
        return text;
    }

    /** Returns the text to place after a message; empty in this implementation. */
    public String end() {
        return "";
    }

    /** Returns the flag last stored through {@link #setPattern(boolean)}. */
    public boolean getPattern() {
        return pattern;
    }

    /** Stores the pattern flag that {@link #getPattern()} reports. */
    protected void setPattern(boolean pattern) {
        this.pattern = pattern;
    }
}
private static class PseudolocalizerXA extends Pseudolocalizer {
private static final String[] NUMBERS = {
"one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
"ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
"seventeen", "eighteen", "nineteen", "twenty", "twentyone", "twentytwo",
"twentythree", "twentyfour", "twentyfive", "twentysix", "twentyseven",
"twentyeight", "twentynine", "thirty", "thirtyone", "thirtytwo",
"thirtythree", "thirtyfour", "thirtyfive", "thirtysix", "thirtyseven",
"thirtyeight", "thirtynine", "forty"
};
private static final Map<Integer, String> REPLACEMENTS = buildReplacementsTable();
private int charCount = 0;
private static Map<Integer, String> buildReplacementsTable() {
Map<Integer, String> table = new HashMap<Integer, String>();
table.put((int) ' ', "\u2003");
table.put((int) '!', "\u00a1");
table.put((int) '"', "\u2033");
table.put((int) '#', "\u266f");
table.put((int) '$', "\u20ac");
table.put((int) '%', "\u2030");
table.put((int) '&', "\u214b");
table.put((int) '*', "\u204e");
table.put((int) '+', "\u207a");
table.put((int) ',', "\u060c");
table.put((int) '-', "\u2010");
table.put((int) '.', "\u00b7");
table.put((int) '/', "\u2044");
table.put((int) '0', "\u24ea");
table.put((int) '1', "\u2460");
table.put((int) '2', "\u2461");
table.put((int) '3', "\u2462");
table.put((int) '4', "\u2463");
table.put((int) '5', "\u2464");
table.put((int) '6', "\u2465");
table.put((int) '7', "\u2466");
table.put((int) '8', "\u2467");
table.put((int) '9', "\u2468");
table.put((int) ':', "\u2236");
table.put((int) ';', "\u204f");
table.put((int) '<', "\u2264");
table.put((int) '=', "\u2242");
table.put((int) '>', "\u2265");
table.put((int) '?', "\u00bf");
table.put((int) '@', "\u055e");
table.put((int) 'A', "\u00c5");
table.put((int) 'B', "\u0181");
table.put((int) 'C', "\u00c7");
table.put((int) 'D', "\u00d0");
table.put((int) 'E', "\u00c9");
table.put((int) 'F', "\u0191");
table.put((int) 'G', "\u011c");
table.put((int) 'H', "\u0124");
table.put((int) 'I', "\u00ce");
table.put((int) 'J', "\u0134");
table.put((int) 'K', "\u0136");
table.put((int) 'L', "\u013b");
table.put((int) 'M', "\u1e40");
table.put((int) 'N', "\u00d1");
table.put((int) 'O', "\u00d6");
table.put((int) 'P', "\u00de");
table.put((int) 'Q', "\u01ea");
table.put((int) 'R', "\u0154");
table.put((int) 'S', "\u0160");
table.put((int) 'T', "\u0162");
table.put((int) 'U', "\u00db");
table.put((int) 'V', "\u1e7c");
table.put((int) 'W', "\u0174");
table.put((int) 'X', "\u1e8a");
table.put((int) 'Y', "\u00dd");
table.put((int) 'Z', "\u017d");
table.put((int) '[', "\u2045");
table.put((int) '\\', "\u2216");
table.put((int) ']', "\u2046");
table.put((int) '^', "\u02c4");
table.put((int) '_', "\u203f");
table.put((int) '`', "\u2035");
table.put((int) 'a', "\u00e5");
table.put((int) 'b', "\u0180");
table.put((int) 'c', "\u00e7");
table.put((int) 'd', "\u00f0");
table.put((int) 'e', "\u00e9");
table.put((int) 'f', "\u0192");
table.put((int) 'g', "\u011d");
table.put((int) 'h', "\u0125");
table.put((int) 'i', "\u00ee");
table.put((int) 'j', "\u0135");
table.put((int) 'k', "\u0137");
table.put((int) 'l', "\u013c");
table.put((int) 'm', "\u0271");
table.put((int) 'n', "\u00f1");
table.put((int) 'o', "\u00f6");
table.put((int) 'p', "\u00fe");
table.put((int) 'q', "\u01eb");
table.put((int) 'r', "\u0155");
table.put((int) 's', "\u0161");
table.put((int) 't', "\u0163");
table.put((int) 'u', "\u00fb");
table.put((int) 'v', "\u1e7d");
table.put((int) 'w', "\u0175");
table.put((int) 'x', "\u1e8b");
table.put((int) 'y', "\u00fd");
table.put((int) 'z', "\u017e");
table.put((int) '|', "\u00a6");
table.put((int) '~', "\u02de");
return table;
}
public String start() {
charCount = 0;
return "[";
}
public String end() {
StringBuilder expansionText = new StringBuilder();
int expansion = (charCount + 1) / 2;
int wordIndex = 0;
while (expansion > 0) {
String word = NUMBERS[wordIndex++ % NUMBERS.length];
expansionText.append(' ');
// Protect expansion strings with single quotes for patterns.
if (getPattern()) {
expansionText.append('\'');
}
expansionText.append(word);
if (getPattern()) {
expansionText.append('\'');
}
expansion -= word.length() + 1;
}
expansionText.append(']');
return expansionText.toString();
}
public String fragment(String text) {
StringBuilder buf = new StringBuilder();
int index = 0;
while (index < text.length()) {
int codePoint = text.codePointAt(index);
charCount++;
index += Character.charCount(codePoint);
String replacement = REPLACEMENTS.get(codePoint);
if (replacement != null) {
buf.append(replacement);
} else {
buf.appendCodePoint(codePoint);
}
}
return buf.toString();
}
}
private static class PseudolocalizerXB extends Pseudolocalizer {
/** Right-to-left override character. */
private static final String RLO = "\u202e";
// Android patch (b/37512961) begin.
/** Arabic letter mark character. */
private static final String ALM = "\u061C";
/** Pop direction formatting character. */
private static final String PDF = "\u202c";
/** Prefix to add before each LTR word */
private static final String BIDI_PREFIX = ALM + RLO;
/** Postfix to add after each LTR word */
private static final String BIDI_POSTFIX = PDF + ALM;
// Android patch (b/37512961) end.
public String fragment(String text) {
StringBuilder output = new StringBuilder();
boolean wrapping = false;
for (int index = 0; index < text.length();) {
int codePoint = text.codePointAt(index);
index += Character.charCount(codePoint);
byte directionality = Character.getDirectionality(codePoint);
boolean needsWrap = (directionality == Character.DIRECTIONALITY_LEFT_TO_RIGHT);
if (needsWrap != wrapping) {
wrapping = needsWrap;
output.append(wrapping ? BIDI_PREFIX : BIDI_POSTFIX);
}
output.appendCodePoint(codePoint);
}
if (wrapping) {
output.append(BIDI_POSTFIX);
}
return output.toString();
}
}
/** Name of the locale that generate() creates the output source for. */
private final String outputLocale;
/** Strategy that supplies the message prefix, suffix and fragment transformation. */
private final Pseudolocalizer pseudolocalizer;
/**
 * Constructs a new CLDRFilePseudolocalizer.
 *
 * @param outputLocale
 *            name of target locale
 * @param pseudolocalizer
 *            pseudolocalization strategy used to generate target locale data
 */
public CLDRFilePseudolocalizer(String outputLocale, Pseudolocalizer pseudolocalizer) {
    this.outputLocale = outputLocale;
    this.pseudolocalizer = pseudolocalizer;
}
/**
 * Creates a generator for the "en_XA" locale that uses the PseudolocalizerXA strategy.
 *
 * @return a new generator instance
 */
public static CLDRFilePseudolocalizer createInstanceXA() {
    Pseudolocalizer accented = new PseudolocalizerXA();
    return new CLDRFilePseudolocalizer("en_XA", accented);
}
/**
 * Creates a generator for the "ar_XB" locale that uses the PseudolocalizerXB strategy.
 *
 * @return a new generator instance
 */
public static CLDRFilePseudolocalizer createInstanceXB() {
    Pseudolocalizer fakeBidi = new PseudolocalizerXB();
    return new CLDRFilePseudolocalizer("ar_XB", fakeBidi);
}
/**
 * Pseudolocalizes one CLDRFile value according to the path it is stored at.
 *
 * @param path
 *            path of the value; tested by substring against EXCLUDE_LIST and PATTERN_LIST
 * @param value
 *            original value
 * @return the value itself for excluded paths, otherwise the pseudolocalized value
 */
private String transformValue(String path, String value) {
    if (containsOneOf(path, EXCLUDE_LIST)) {
        return value;
    }
    // In a pattern only the quoted text is localizable; elsewhere everything except the
    // numeric placeholders is.
    final boolean isPattern = containsOneOf(path, PATTERN_LIST);
    final Pattern delimiter = isPattern ? QUOTED_TEXT : NUMERIC_PLACEHOLDER;
    return createMessage(value, delimiter, isPattern);
}
/**
 * Tells whether the string contains at least one of the given substrings.
 *
 * @param string
 *            text to search in
 * @param substrings
 *            candidates to look for
 * @return true as soon as one candidate is found, false if none is
 */
private boolean containsOneOf(String string, String[] substrings) {
    for (int i = 0; i < substrings.length; i++) {
        if (string.indexOf(substrings[i]) >= 0) {
            return true;
        }
    }
    return false;
}
/**
 * Returns the fragment transformed by the pseudolocalizer when it is localizable, or the
 * fragment itself when it is not.
 */
private String pseudolocalizeFragment(String text, boolean localizable) {
    if (!localizable) {
        return text;
    }
    return pseudolocalizer.fragment(text);
}
/**
 * Creates a message that can contain localizable and non-localizable parts.
 *
 * <p>The text is split at the matches of {@code pattern}. Matches are treated as localizable
 * when {@code matchIsLocalizable} is true and the text between them as non-localizable, and
 * the other way round when it is false. The result is enclosed between the pseudolocalizer's
 * start and end strings.
 *
 * @param text
 *            original value
 * @param pattern
 *            expression that delimits the parts of the value
 * @param matchIsLocalizable
 *            whether the matched parts (true) or the unmatched parts (false) are localizable;
 *            also stored as the pseudolocalizer's pattern flag
 * @return the assembled message
 */
private String createMessage(String text, Pattern pattern,
        boolean matchIsLocalizable) {
    // A method-local buffer needs no synchronization, so StringBuilder is sufficient.
    StringBuilder message = new StringBuilder(pseudolocalizer.start());
    Matcher matcher = pattern.matcher(text);
    pseudolocalizer.setPattern(matchIsLocalizable);
    int consumed = 0;
    while (matcher.find()) {
        // Text between the previous match and this one.
        if (matcher.start() > consumed) {
            message.append(pseudolocalizeFragment(
                text.substring(consumed, matcher.start()), !matchIsLocalizable));
        }
        message.append(pseudolocalizeFragment(matcher.group(), matchIsLocalizable));
        consumed = matcher.end();
    }
    // Text after the last match.
    if (consumed < text.length()) {
        message.append(pseudolocalizeFragment(text.substring(consumed), !matchIsLocalizable));
    }
    message.append(pseudolocalizer.end());
    return message.toString();
}
/**
 * Adds pseudolocale characters to an exemplarCharacters value.
 *
 * <p>The result starts with the given value minus its last character, then receives every
 * char of the pseudolocalized value that is not a closing bracket and is not yet present,
 * and finally a closing bracket. Alphabetic chars are added literally; all other chars are
 * added as a backslash-u escape with at least four uppercase hex digits.
 *
 * @param value
 *            exemplar set to extend; must be non-empty, and its last character is dropped
 *            and replaced by the closing bracket appended at the end
 * @return the extended exemplar set
 */
private String mergeExemplars(String value) {
    final char closingBracket = ']';
    String pseudolocalized = createMessage(value, NUMERIC_PLACEHOLDER, false);
    // A method-local buffer needs no synchronization, so StringBuilder is sufficient.
    StringBuilder result = new StringBuilder(value.substring(0, value.length() - 1));
    for (int i = 0; i < pseudolocalized.length(); i++) {
        char c = pseudolocalized.charAt(i);
        if (c == closingBracket) {
            continue;
        }
        String literal = String.valueOf(c);
        String chunk = Character.isAlphabetic(c) ? literal : String.format("\\u%04X", (int) c);
        // Skip chars already present either in escaped or in literal form.
        if (result.indexOf(chunk) == -1 && result.indexOf(literal) == -1) {
            result.append(' ');
            result.append(chunk);
        }
    }
    result.append(closingBracket);
    return result.toString();
}
/**
 * Builds the pseudolocalized CLDRFile.
 *
 * <p>Every non-empty value of the source locale is passed through transformValue and stored
 * in the output only when the transformation changed it. Afterwards the merged exemplar
 * characters, the XA and XB territories and the default numbering system are added.
 *
 * @return a CLDRFile wrapping the generated output source
 */
public CLDRFile generate() {
    Factory factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
    // Load the source locale. NOTE(review): the earlier comment here said inherited data is
    // resolved, yet the flag passed is false; confirm what the flag means.
    CLDRFile input = factory.make(ORIGINAL_LOCALE, false);
    XMLSource outputSource = new SimpleXMLSource(outputLocale);
    for (String xpath : input) {
        String fullPath = input.getFullXPath(xpath);
        String value = input.getStringValue(xpath);
        if (value.isEmpty()) {
            continue;
        }
        String newValue = transformValue(xpath, value);
        // Values that pseudolocalization leaves unchanged are not written to the output.
        if (newValue.equals(value)) {
            continue;
        }
        outputSource.putValueAtPath(fullPath, newValue);
    }
    // The auxiliary exemplar set receives the source exemplars plus their pseudolocalized forms.
    String sourceExemplars = input.getStringValue(EXEMPLAR_PATH);
    outputSource.putValueAtPath(EXEMPLAR_AUX_PATH, mergeExemplars(sourceExemplars));
    // Display names for the pseudolocale territories.
    addTerritory(outputSource, "XA");
    addTerritory(outputSource, "XB");
    // Android patch (b/37512961) begin.
    // Pseudolocales use the "latn" numbering system.
    outputSource.putValueAtPath(NUMBERS_PATH, "latn");
    // Android patch (b/37512961) end.
    return new CLDRFile(outputSource);
}
/**
 * Stores a display name for a territory in the output source. The name is the territory
 * code enclosed in square brackets.
 *
 * @param outputSource
 *            source that receives the entry
 * @param territory
 *            territory code inserted into both the path and the display name
 */
private void addTerritory(XMLSource outputSource, String territory) {
    String displayName = String.format("[%s]", territory);
    String territoryPath = String.format(TERRITORY_PATTERN, territory);
    outputSource.putValueAtPath(territoryPath, displayName);
}
/**
 * Generates the CLDRFile object and saves it into an .xml file named after its locale ID.
 *
 * @return the output directory concatenated with the output file name
 * @throws Exception
 *             if generating or writing the file fails
 */
public String generateAndSave() throws Exception {
    CLDRFile output = generate();
    String outputDir = CLDRPaths.GEN_DIRECTORY + "main" + File.separator + PSEUDOLOCALES_DIRECTORY + File.separator;
    String outputFile = output.getLocaleID() + ".xml";
    // try-with-resources closes the writer even when writing throws.
    try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile)) {
        output.write(out);
    }
    return outputDir + outputFile;
}
/**
 * Generates both pseudolocale files and prints the name of each written file to stdout.
 *
 * @param args
 *            unused
 */
public static void main(String[] args) throws Exception {
    // en_XA: accents, brackets and expansion.
    String accentedFile = createInstanceXA().generateAndSave();
    System.out.println(accentedFile);
    // ar_XB: fake bidi.
    String fakeBidiFile = createInstanceXB().generateAndSave();
    System.out.println(fakeBidiFile);
}
}