| /* |
| ********************************************************************** |
| * Copyright (c) 2002-2012, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Author: Mark Davis |
| ********************************************************************** |
| */ |
| package org.unicode.cldr.icu; |
| |
| import java.io.BufferedReader; |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.PrintStream; |
| import java.io.PrintWriter; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.TreeMap; |
| import java.util.TreeSet; |
| import java.util.regex.Matcher; |
| |
| import org.unicode.cldr.draft.FileUtilities; |
| import org.unicode.cldr.util.CLDRFile; |
| import org.unicode.cldr.util.CLDRPaths; |
| import org.unicode.cldr.util.CldrUtility; |
| import org.unicode.cldr.util.PatternCache; |
| import org.unicode.cldr.util.SimpleFactory; |
| |
| import com.ibm.icu.impl.ICUData; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UProperty; |
| import com.ibm.icu.text.Collator; |
| import com.ibm.icu.text.RuleBasedCollator; |
| import com.ibm.icu.text.Transliterator; |
| import com.ibm.icu.util.ULocale; |
| import com.ibm.icu.util.UResourceBundle; |
| |
| /** |
| * Extract ICU transform data and convert to CLDR format.<br> |
| * With the option -Dfile=xxxx, will convert a single file. For example:<br> |
| * |
| * <pre> |
| * -DSHOW_FILES -Dfile=c:/downloads/zh_Hans-zh_Hant.txt |
| * </pre> |
| * |
| * The option -Dtarget=yyy will specify an output directory; otherwise it is Utility.GEN_DIRECTORY + "/translit/gen/" |
| * |
| * @author markdavis |
| * |
| */ |
| public class ExtractICUData { |
| public static void main(String[] args) throws Exception { |
| String file = CldrUtility.getProperty("file", null); |
| if (file != null) { |
| String targetDirectory = CldrUtility.getProperty("target", CLDRPaths.GEN_DIRECTORY + "/translit/gen/"); |
| convertFile(file, targetDirectory); |
| } else { |
| generateTransliterators(); |
| } |
| System.out.println("Done"); |
| } |
| |
| static Set<String> skipLines = new HashSet<String>(Arrays.asList(new String[] { |
| "#--------------------------------------------------------------------", |
| "# Copyright (c) 1999-2005, International Business Machines", |
| "# Copyright (c) 1999-2004, International Business Machines", |
| "# Corporation and others. All Rights Reserved.", |
| "#--------------------------------------------------------------------" |
| })); |
| static Set<String> skipFiles = new HashSet<String>(Arrays.asList(new String[] { |
| // "Any_Accents", |
| "el", |
| "en", |
| "root" |
| })); |
| |
| static void generateTransliterators() throws IOException { |
| Matcher fileFilter = PatternCache.get(".*").matcher(""); |
| |
| CLDRFile accumulatedItems = SimpleFactory.makeSupplemental("allItems"); |
| getTranslitIndex(accumulatedItems); |
| |
| File translitSource = new File("C:\\cvsdata\\icu\\icu\\source\\data\\translit\\"); |
| System.out.println("Source: " + translitSource.getCanonicalPath()); |
| File[] fileArray = translitSource.listFiles(); |
| List<Object> list = new ArrayList<Object>(Arrays.asList(fileArray)); |
| |
| // List<String> extras = Arrays.asList(new String[] { |
| // "Arabic_Latin.txt", |
| // "CanadianAboriginal_Latin.txt", |
| // "Cyrillic_Latin.txt", |
| // "Georgian_Latin.txt", |
| // // "Khmer_Latin.txt", "Lao_Latin.txt", "Tibetan_Latin.txt" |
| // "Latin_Armenian.txt", |
| // "Latin_Ethiopic.txt", |
| // "Syriac_Latin.txt", "Thaana_Latin.txt", }); |
| // list.addAll(extras); |
| |
| String[] attributesOut = new String[1]; |
| for (Object file : list) { |
| String fileName = (file instanceof File) ? ((File) file).getName() : (String) file; |
| // if (file instanceof File && extras.contains(fileName)) { |
| // System.out.println("Skipping old version: " + fileName); |
| // } |
| if (!fileName.endsWith(".txt")) continue; |
| String coreName = fileName.substring(0, fileName.length() - 4); |
| if (skipFiles.contains(coreName)) continue; |
| String id = fixTransID(coreName, attributesOut); |
| String outName = id.replace('/', '-'); |
| String attributes = attributesOut[0]; |
| attributes += "[@direction=\"both\"]"; |
| |
| System.out.println(coreName + "\t=>\t" + outName + " => " + attributes); |
| |
| if (!fileFilter.reset(fileName).matches()) continue; |
| |
| BufferedReader input; |
| if (file instanceof File) { |
| input = FileUtilities.openUTF8Reader(((File) file).getParent() + File.separator, fileName); |
| } else { |
| input = CldrUtility.getUTF8Data(fileName); |
| } |
| { |
| CLDRFile outFile = SimpleFactory.makeSupplemental(fileName); |
| int count = 0; |
| String prefixBase = "//supplementalData[@version=\"" + CLDRFile.GEN_VERSION + "\"]/transforms/transform" |
| + attributes; |
| String rulePrefix = prefixBase + "/tRule[@_q=\""; |
| String commentPrefix = prefixBase + "/comment[@_q=\""; |
| |
| StringBuffer accumulatedLines = new StringBuffer(); |
| while (true) { |
| String line = input.readLine(); |
| if (line == null) break; |
| if (line.startsWith("\uFEFF")) line = line.substring(1); // remove BOM |
| line = line.trim(); |
| if (skipLines.contains(line)) continue; |
| if (line.length() == 0) continue; |
| String fixedLine = fixTransRule(line); |
| // if (accumulatedLines.length() == 0) |
| accumulatedLines.append("\n\t\t"); |
| accumulatedLines.append(fixedLine); |
| String prefix = (line.startsWith("#")) ? commentPrefix : rulePrefix; |
| addInTwo(outFile, accumulatedItems, prefix + (++count) + "\"]", fixedLine); |
| } |
| |
| PrintWriter pw = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY + "/translit/gen/", outName + ".xml"); |
| outFile.write(pw); |
| pw.close(); |
| } |
| } |
| PrintWriter pw = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY + "/translit/gen/", "All" + ".xml"); |
| accumulatedItems.write(pw); |
| pw.close(); |
| } |
| |
| static void convertFile(String fileName, String targetDirectory) throws IOException { |
| // Get the ID |
| String coreName = new File(fileName).getName(); |
| if (coreName.endsWith(".txt")) { |
| coreName = coreName.substring(0, coreName.length() - 4); |
| } |
| String[] attributesOut = new String[1]; |
| attributesOut[0] = ""; |
| String id = fixTransID(coreName, attributesOut); |
| String outName = id.replace('/', '-'); |
| String attributes = attributesOut[0]; |
| attributes += "[@direction=\"both\"]"; |
| |
| System.out.println(coreName + "\t=>\t" + outName + " => " + attributes); |
| |
| BufferedReader input = FileUtilities.openUTF8Reader("", fileName); |
| CLDRFile outFile = SimpleFactory.makeSupplemental(coreName); |
| int count = 0; |
| String prefixBase = "//supplementalData[@version=\"" + CLDRFile.GEN_VERSION + "\"]/transforms/transform" |
| + attributes; |
| String rulePrefix = prefixBase + "/tRule[@_q=\""; |
| String commentPrefix = prefixBase + "/comment[@_q=\""; |
| |
| StringBuffer accumulatedLines = new StringBuffer(); |
| while (true) { |
| String line = input.readLine(); |
| if (line == null) break; |
| if (line.startsWith("\uFEFF")) line = line.substring(1); // remove BOM |
| line = line.trim(); |
| if (skipLines.contains(line)) continue; |
| if (line.length() == 0) continue; |
| String fixedLine = fixTransRule(line); |
| // if (accumulatedLines.length() == 0) |
| accumulatedLines.append("\n\t\t"); |
| accumulatedLines.append(fixedLine); |
| String prefix = (line.startsWith("#")) ? commentPrefix : rulePrefix; |
| addInTwo(outFile, null, prefix + (++count) + "\"]", fixedLine); |
| } |
| |
| PrintWriter pw = FileUtilities.openUTF8Writer(targetDirectory, outName + ".xml"); |
| outFile.write(pw); |
| pw.close(); |
| |
| } |
| |
| private static void addInTwo(CLDRFile outFile, CLDRFile accumulatedItems, String path, String value) { |
| // System.out.println("Adding: " + path + "\t\t" + value); |
| outFile.add(path, value); |
| if (accumulatedItems != null) { |
| accumulatedItems.add(path, value); |
| } |
| } |
| |
| private static String fixTransRule(String line) { |
| int hashPos = line.indexOf('#'); |
| // quick hack to separate comment, and check for quoted '#' |
| if (hashPos >= 0 && line.indexOf('\'', hashPos) < 0) { |
| String core = line.substring(0, hashPos).trim(); |
| String comment = line.substring(hashPos + 1).trim(); |
| if (comment.length() != 0) { |
| comment = "# " + comment; |
| } else if (core.length() == 0) { |
| return "#"; |
| } |
| line = (core.length() == 0 ? "" : core + " ") + comment; |
| } |
| // fixedLine = fixedLine.replaceAll("<>", "\u2194"); |
| // fixedLine = fixedLine.replaceAll("<", "\u2190"); |
| // fixedLine = fixedLine.replaceAll(">", "\u2192"); |
| // fixedLine = fixedLine.replaceAll("&", "\u00A7"); |
| String fixedLine = fixLine.transliterate(line); |
| return fixedLine; |
| } |
| |
| static String fixLineRules = |
| "'<>' > '\u2194';" + |
| "'<' > '\u2190';" + |
| "'>' > '\u2192';" + |
| "'&' > '\u00A7';" + |
| "('\\u00'[0-7][0-9A-Fa-f]) > $1;" + // leave ASCII alone |
| "('\\u'[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]) > |@&hex-any/java($1);" + |
| "([[:whitespace:][:Default_Ignorable_Code_Point:][:C:]-[\\u0020\\u200E\\0009]]) > &any-hex/java($1);" |
| |
| ; |
| static Transliterator fixLine = Transliterator.createFromRules("foo", fixLineRules, Transliterator.FORWARD); |
| |
| private static final String INDEX = "index", |
| RB_RULE_BASED_IDS = "RuleBasedTransliteratorIDs"; |
| |
| private static void getTranslitIndex(CLDRFile accumulatedItems) throws IOException { |
| |
| UResourceBundle bundle, transIDs, colBund; |
| bundle = UResourceBundle.getBundleInstance(ICUData.ICU_TRANSLIT_BASE_NAME, INDEX); |
| transIDs = bundle.get(RB_RULE_BASED_IDS); |
| |
| String[] attributesOut = new String[1]; |
| int count = 0; |
| |
| int maxRows = transIDs.getSize(); |
| for (int row = 0; row < maxRows; row++) { |
| colBund = transIDs.get(row); |
| String ID = colBund.getKey(); |
| UResourceBundle res = colBund.get(0); |
| String type = res.getKey(); |
| if (type.equals("file") || type.equals("internal")) { |
| // // Rest of line is <resource>:<encoding>:<direction> |
| // // pos colon c2 |
| // String resString = res.getString("resource"); |
| // String direction = res.getString("direction"); |
| // result.add(Arrays.asList(new Object[]{ID, |
| // resString, // resource |
| // "UTF-16", // encoding |
| // direction, |
| // type})); |
| } else if (type.equals("alias")) { |
| CLDRFile outFile = SimpleFactory.makeSupplemental("transformAliases"); |
| // 'alias'; row[2]=createInstance argument |
| ID = fixTransID(ID, attributesOut); |
| String outName = ID.replace('/', '-'); |
| String attributes = attributesOut[0]; |
| attributes += "[@direction=\"forward\"]"; |
| System.out.println(ID + " => " + attributes); |
| String prefix = "//supplementalData[@version=\"" + CLDRFile.GEN_VERSION + "\"]/transforms/transform" |
| + attributes + "/tRule[@_q=\""; |
| String resString = res.getString(); |
| if (!instanceMatcher.reset(resString).matches()) { |
| System.out.println("Doesn't match id: " + resString); |
| } else { |
| String filter = instanceMatcher.group(1); |
| if (filter != null) { |
| filter = fixTransRule(filter); |
| outFile.add(prefix + (++count) + "\"]", "::" + filter + ";"); |
| accumulatedItems.add(prefix + (++count) + "\"]", "::" + filter + ";"); |
| } |
| String rest = instanceMatcher.group(2); |
| String[] pieces = rest.split(";"); |
| for (int i = 0; i < pieces.length; ++i) { |
| String piece = pieces[i].trim(); |
| if (piece.length() == 0) continue; |
| piece = fixTransID(piece, null); |
| outFile.add(prefix + (++count) + "\"]", "::" + piece + ";"); |
| accumulatedItems.add(prefix + (++count) + "\"]", "::" + piece + ";"); |
| } |
| } |
| PrintWriter pw = FileUtilities.openUTF8Writer( |
| CLDRPaths.GEN_DIRECTORY + "/translit/gen/", outName + ".xml"); |
| outFile.write(pw); |
| pw.close(); |
| } else { |
| // Unknown type |
| throw new RuntimeException("Unknown type: " + type); |
| } |
| } |
| } |
| |
| private static String fixTransID(String id, String[] attributesOut) { |
| if (!idMatcher.reset(id).matches()) { |
| System.out.println("Doesn't match id:: " + id); |
| } else { |
| String source = fixTransIDPart(idMatcher.group(1)); |
| String target = fixTransIDPart(idMatcher.group(2)); |
| String variant = fixTransIDPart(idMatcher.group(3)); |
| |
| if (attributesOut != null) { |
| attributesOut[0] = "[@source=\"" + source + "\"]" |
| + "[@target=\"" + target + "\"]" |
| + (variant == null ? "" : "[@variant=\"" + variant + "\"]"); |
| if (privateFiles.reset(id).matches()) attributesOut[0] += "[@visibility=\"internal\"]"; |
| } |
| |
| if (target == null) |
| target = ""; |
| else |
| target = "-" + target; |
| if (variant == null) |
| variant = ""; |
| else |
| variant = "/" + variant; |
| id = source + target + variant; |
| } |
| return id; |
| } |
| |
| static String idPattern = "\\s*(\\p{L}+)(?:[_-](\\p{L}+))?(?:\\[_/](\\p{L}+))?"; |
| static Matcher idMatcher = PatternCache.get(idPattern).matcher(""); |
| static Matcher instanceMatcher = PatternCache.get("\\s*(\\[.*\\]\\s*)?(.*)").matcher(""); |
| |
| // private static String fixTransName(String name, String[] attributesOut, String separator) { |
| // String[] pieces = name.split(separator); |
| // String source = fixTransIDPart(pieces[0]); |
| // String target = fixTransIDPart(pieces[1]); |
| // String variant = null; |
| // if (pieces.length > 2) { |
| // variant = pieces[2].toUpperCase(); |
| // } |
| // attributesOut[0] = "[@source=\"" + source + "\"]" |
| // + "[@target=\"" + target + "\"]" |
| // + (variant == null ? "" : "[@variant=\"" + variant + "\"]"); |
| // if (privateFiles.reset(name).matches()) attributesOut[0] += "[@visibility=\"internal\"]"; |
| // return source + (target == null ? "" : "-") + target + (variant == null ? "" : "/" + variant); |
| // } |
| |
| static Matcher privateFiles = PatternCache.get(".*(Spacedhan|InterIndic|ThaiLogical|ThaiSemi).*").matcher(""); |
| static Matcher allowNames = PatternCache.get("(Fullwidth|Halfwidth|NumericPinyin|Publishing)").matcher(""); |
| |
| static Set<String> collectedNames = new TreeSet<String>(); |
| |
| private static String fixTransIDPart(String name) { |
| if (name == null) return name; |
| try { |
| UCharacter.getPropertyValueEnum(UProperty.SCRIPT, name); |
| } catch (IllegalArgumentException e) { |
| collectedNames.add(name); |
| } |
| |
| if (name.equals("Tone")) return "Pinyin"; |
| if (name.equals("Digit")) return "NumericPinyin"; |
| if (name.equals("Jamo")) return "ConjoiningJamo"; |
| if (name.equals("LowerLatin")) return "Latin"; |
| |
| return name; |
| } |
| |
| static void testProps() { |
| int[][] ranges = { { UProperty.BINARY_START, UProperty.BINARY_LIMIT }, |
| { UProperty.INT_START, UProperty.INT_LIMIT }, |
| { UProperty.DOUBLE_START, UProperty.DOUBLE_START }, |
| { UProperty.STRING_START, UProperty.STRING_LIMIT }, |
| }; |
| Collator col = Collator.getInstance(ULocale.ROOT); |
| ((RuleBasedCollator) col).setNumericCollation(true); |
| Map<String, Set<String>> alpha = new TreeMap<String, Set<String>>(col); |
| |
| for (int range = 0; range < ranges.length; ++range) { |
| for (int propIndex = ranges[range][0]; propIndex < ranges[range][1]; ++propIndex) { |
| String propName = UCharacter.getPropertyName(propIndex, UProperty.NameChoice.LONG); |
| String shortPropName = UCharacter.getPropertyName(propIndex, UProperty.NameChoice.SHORT); |
| propName = getName(propIndex, propName, shortPropName); |
| Set<String> valueOrder = new TreeSet<String>(col); |
| alpha.put(propName, valueOrder); |
| switch (range) { |
| case 0: |
| valueOrder.add("[binary]"); |
| break; |
| case 2: |
| valueOrder.add("[double]"); |
| break; |
| case 3: |
| valueOrder.add("[string]"); |
| break; |
| case 1: |
| for (int valueIndex = 0; valueIndex < 256; ++valueIndex) { |
| try { |
| String valueName = UCharacter.getPropertyValueName(propIndex, valueIndex, |
| UProperty.NameChoice.LONG); |
| String shortValueName = UCharacter.getPropertyValueName(propIndex, valueIndex, |
| UProperty.NameChoice.SHORT); |
| valueName = getName(valueIndex, valueName, shortValueName); |
| valueOrder.add(valueName); |
| } catch (RuntimeException e) { |
| // just skip |
| } |
| } |
| break; |
| } |
| } |
| } |
| PrintStream out = System.out; |
| |
| for (Iterator<String> it = alpha.keySet().iterator(); it.hasNext();) { |
| String propName = it.next(); |
| Set<String> values = alpha.get(propName); |
| out.println("<tr><td>" + propName + "</td>"); |
| out.println("<td><table>"); |
| for (Iterator<String> it2 = values.iterator(); it2.hasNext();) { |
| String propValue = it2.next(); |
| System.out.println("<tr><td>" + propValue + "</td></tr>"); |
| } |
| out.println("</table></td></tr>"); |
| } |
| Collator c = Collator.getInstance(ULocale.ENGLISH); |
| ((RuleBasedCollator) c).setNumericCollation(true); |
| |
| // int enumValue = UCharacter.getIntPropertyValue(codePoint, propEnum); |
| // return UCharacter.getPropertyValueName(propEnum,enumValue, (int)nameChoice); |
| |
| } |
| |
| private static String getName(int index, String valueName, String shortValueName) { |
| if (valueName == null) { |
| if (shortValueName == null) return String.valueOf(index); |
| return shortValueName; |
| } |
| if (shortValueName == null) return valueName; |
| if (valueName.equals(shortValueName)) return valueName; |
| return valueName + "\u00A0(" + shortValueName + ")"; |
| } |
| } |