blob: a147f921c926f301c3f779b8b61345c02bc77b9f [file] [log] [blame]
package org.unicode.cldr.icu;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.FileCopier;
import org.unicode.cldr.util.PatternCache;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.util.Calendar;
/**
* Writes an IcuData object to a text file.
*
* @author jchye
*/
public class IcuTextWriter {
/**
* The default tab indent (actually spaces)
*/
private static final String TAB = " ";
// List of characters to escape in UnicodeSets.
private static final Pattern UNICODESET_ESCAPE = PatternCache.get("\\\\[\\\\\\[\\]\\{\\}\\-&:^=]");
// Only escape \ and " from other strings.
private static final Pattern STRING_ESCAPE = PatternCache.get("(?!')\\\\\\\\(?!')");
private static final Pattern QUOTE_ESCAPE = PatternCache.get("\\\\?\"");
private static String headerText;
/**
* ICU paths have a simple comparison, alphabetical within a level. We do
* have to catch the / so that it is lower than everything.
*/
public static final Comparator<String> PATH_COMPARATOR = new Comparator<String>() {
@Override
public int compare(String arg0, String arg1) {
int min = Math.min(arg0.length(), arg1.length());
for (int i = 0; i < min; ++i) {
int ch0 = arg0.charAt(i);
int ch1 = arg1.charAt(i);
int diff = ch0 - ch1;
if (diff == 0) {
continue;
}
if (ch0 == '/') {
return -1;
} else if (ch1 == '/') {
return 1;
}
// make * greater than everything, because of languageMatch
// while it is a pain to have it be unordered, this fix is sufficient to put all the *'s after anything else
if (ch0 == '*') {
return 1;
} else if (ch1 == '*') {
return -1;
}
return diff;
}
return arg0.length() - arg1.length();
}
};
private static String getHeader() {
if (headerText != null) {
return headerText;
}
try (StringWriter stringWriter = new StringWriter();) {
FileCopier.copy(NewLdml2IcuConverter.class, "ldml2icu_header.txt", stringWriter);
headerText = stringWriter.toString();
headerText = headerText.replace("%year%", String.valueOf(Calendar.getInstance().get(Calendar.YEAR)));
return headerText;
} catch (IOException ioe) {
throw new IllegalArgumentException(ioe);
}
}
/**
* Write a file in ICU format. LDML2ICUConverter currently has some
* funny formatting in a few cases; don't try to match everything.
*
* @param icuData
* the icu data structure to be written
* @param dirPath
* the directory to write the file to
* @param hasSpecial
* true if a special file was used to create the ICU data
*/
public static void writeToFile(IcuData icuData, String dirPath) throws IOException {
String name = icuData.getName();
PrintWriter out = FileUtilities.openUTF8Writer(dirPath, name + ".txt");
out.write('\uFEFF');
// Append the header.
String header = getHeader().replace("%source%", icuData.getSourceFile());
out.print(header);
if (icuData.getFileComment() != null) {
out.println("/**");
out.append(" * ").append(icuData.getFileComment()).println();
out.println(" */");
}
// Write the ICU data to file.
out.append(name);
if (!icuData.hasFallback()) out.append(":table(nofallback)");
List<String> sortedPaths = new ArrayList<String>(icuData.keySet());
Collections.sort(sortedPaths, PATH_COMPARATOR);
String[] lastLabels = new String[] {};
boolean wasSingular = false;
for (String path : sortedPaths) {
// Write values to file.
String[] labels = path.split("/", -1); // Don't discard trailing slashes.
int common = getCommon(lastLabels, labels);
for (int i = lastLabels.length - 1; i > common; --i) {
if (wasSingular) {
wasSingular = false;
} else {
out.append(Utility.repeat(TAB, i));
}
out.println("}");
}
for (int i = common + 1; i < labels.length; ++i) {
final String pad = Utility.repeat(TAB, i);
out.append(pad);
String label = labels[i];
if (!label.startsWith("<") && !label.endsWith(">")) {
out.append(label);
}
out.append('{');
if (i != labels.length - 1) {
out.println();
}
}
List<String[]> values = icuData.get(path);
try {
wasSingular = appendValues(name, path, values, labels.length, out);
} catch (NullPointerException npe) {
System.err.println("Null value encountered in " + path);
}
out.flush();
lastLabels = labels;
}
// Add last closing braces.
for (int i = lastLabels.length - 1; i > 0; --i) {
if (wasSingular) {
wasSingular = false;
} else {
out.append(Utility.repeat(TAB, i));
}
out.println("}");
}
out.println("}");
out.close();
}
/**
* Inserts padding and values between braces.
* @param name
* @param rbPath
* @param values
* @param numTabs
* @param out
* @return
*/
private static boolean appendValues(String name, String rbPath, List<String[]> values, int numTabs,
PrintWriter out) {
String[] firstArray;
boolean wasSingular = false;
boolean quote = !IcuData.isIntRbPath(rbPath);
boolean isSequence = rbPath.endsWith("/Sequence");
if (values.size() == 1 && !mustBeArray(true, name, rbPath)) {
if ((firstArray = values.get(0)).length == 1 && !mustBeArray(false, name, rbPath)) {
String value = firstArray[0];
if (quote) {
value = quoteInside(value);
}
int maxWidth = 84 - Math.min(4, numTabs) * TAB.length();
if (value.length() <= maxWidth) {
// Single value for path: don't add newlines.
appendValue(value, quote, out);
wasSingular = true;
} else {
// Value too long to fit in one line, so wrap.
final String pad = Utility.repeat(TAB, numTabs);
out.println();
int end;
for (int i = 0; i < value.length(); i = end) {
end = goodBreak(value, i + maxWidth);
String part = value.substring(i, end);
out.append(pad);
appendValue(part, quote, out).println();
}
}
} else {
// Only one array for the rbPath, so don't add an extra set of braces.
final String pad = Utility.repeat(TAB, numTabs);
out.println();
appendArray(pad, firstArray, quote, isSequence, out);
}
} else {
final String pad = Utility.repeat(TAB, numTabs);
out.println();
for (String[] valueArray : values) {
if (valueArray.length == 1) {
// Single-value array: print normally.
appendArray(pad, valueArray, quote, isSequence, out);
} else {
// Enclose this array in braces to separate it from other
// values.
out.append(pad).println("{");
appendArray(pad + TAB, valueArray, quote, isSequence, out);
out.append(pad).println("}");
}
}
}
return wasSingular;
}
/**
* Wrapper for a hack to determine if the given rb path should always
* present its values as an array. This hack is required for an ICU data test to pass.
*
* @param topValues
* @param name
* @param rbPath
* @return
*/
private static boolean mustBeArray(boolean topValues, String name, String rbPath) {
// TODO(jchye): Add this as an option to the locale file instead of hardcoding.
// System.out.println(name + "\t" + rbPath);
if (topValues) {
return (rbPath.startsWith("/rules/set")
&& name.equals("pluralRanges"));
}
return rbPath.equals("/LocaleScript")
|| (rbPath.contains("/eras/") && !rbPath.endsWith(":alias") && !rbPath.endsWith("/named"))
|| rbPath.startsWith("/calendarPreferenceData")
|| rbPath.startsWith("/metazoneInfo");
}
private static PrintWriter appendArray(String padding, String[] valueArray,
boolean quote, boolean isSequence, PrintWriter out) {
for (String value : valueArray) {
out.append(padding);
appendValue(quoteInside(value), quote, out);
if (!isSequence) {
out.print(",");
}
out.println();
}
return out;
}
private static PrintWriter appendValue(String value, boolean quote, PrintWriter out) {
if (quote) {
return out.append('"').append(value).append('"');
} else {
return out.append(value);
}
}
/**
* Can a string be broken here? If not, backup until we can.
*
* @param quoted
* @param end
* @return
*/
private static int goodBreak(String quoted, int end) {
if (end > quoted.length()) {
return quoted.length();
}
// Don't break escaped Unicode characters.
// Need to handle both e.g. \u4E00 and \U00020000
for (int i = end - 1; i > end - 10;) {
char current = quoted.charAt(i--);
if (!Character.toString(current).matches("[0-9A-Fa-f]")) {
if ((current == 'u' || current == 'U') && i > end - 10 && quoted.charAt(i) == '\\') {
return i;
}
break;
}
}
while (end > 0) {
char ch = quoted.charAt(end - 1);
if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) {
break;
}
--end;
}
return end;
}
/**
* Fix characters inside strings.
*
* @param item
* @return
*/
private static String quoteInside(String item) {
// Unicode-escape all quotes.
item = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022");
// Double up on backslashes, ignoring Unicode-escaped characters.
Pattern pattern = item.startsWith("[") && item.endsWith("]") ? UNICODESET_ESCAPE : STRING_ESCAPE;
Matcher matcher = pattern.matcher(item);
if (!matcher.find()) {
return item;
}
StringBuffer buffer = new StringBuffer();
int start = 0;
do {
buffer.append(item.substring(start, matcher.start()));
int punctuationChar = item.codePointAt(matcher.end() - 1);
buffer.append("\\");
if (punctuationChar == '\\') {
buffer.append('\\');
}
buffer.append(matcher.group());
start = matcher.end();
} while (matcher.find());
buffer.append(item.substring(start));
return buffer.toString();
}
/**
* find the initial labels (from a path) that are identical.
*
* @param item
* @return
*/
private static int getCommon(String[] lastLabels, String[] labels) {
int min = Math.min(lastLabels.length, labels.length);
int i;
for (i = 0; i < min; ++i) {
if (!lastLabels[i].equals(labels[i])) {
return i - 1;
}
}
return i - 1;
}
}