blob: 6f8bbf80b9f4298f9f1e1fbf3644a889d92f9507 [file] [log] [blame]
/*
* Created on May 19, 2005
* Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others.
* For terms of use, see http://www.unicode.org/terms_of_use.html
*/
package org.unicode.cldr.tool;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.ArrayComparator;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.Log;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.TransliteratorUtilities;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.ULocale;
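/**
 * Tool that tallies the languages and territories covered by the available CLDR locales and
 * writes the totals, together with the native names, to an HTML log file.
 */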
class GenerateStatistics {
// Enables the hard-coded workarounds in this class (the "sh" removal, the Bosnian name
// overrides and the title-case touch-ups).
static final boolean HACK = true;
// The "en" file loaded by generateSize; source of the English display names.
static CLDRFile english;
// Locale file factory; assigned by generateSize before any other method uses it.
static Factory factory;
// Shared, mutable locale-ID parser: set(...) replaces the previously parsed ID, so the
// getters must be read before the next set(...) call.
static LanguageTagParser ltp = new LanguageTagParser();
// English collator used to order the code and name sets built by this class.
static Collator col = Collator.getInstance(ULocale.ENGLISH);
// When true, fixedTitleCase returns its input unchanged.
static boolean notitlecase = true;
/**
 * Tallies the languages and territories covered by the locales available under
 * {@code sourceDir} and writes the result as {@code test_generation_log.html} in
 * {@code logDir}.
 *
 * <p>Side effects: assigns the static {@code factory} and {@code english} fields and prints
 * one progress line per processed locale to {@code System.out}.
 *
 * @param sourceDir     directory handed to {@code Factory.make} as the locale source
 * @param logDir        directory in which the HTML log file is created
 * @param match         filter handed to {@code Factory.make} to select the locales
 *                      (presumably a locale-ID regular expression; confirm against Factory)
 * @param transliterate passed through to {@code showSet}; when true, native names containing
 *                      non-Latin characters also get a Latin transliteration in their tooltip
 * @throws IOException if the log file cannot be opened
 */
public static void generateSize(String sourceDir, String logDir, String match, boolean transliterate)
    throws IOException {
    factory = Factory.make(sourceDir, match);
    ToolUtilities.registerExtraTransliterators();
    PrintWriter logHtml = FileUtilities.openUTF8Writer(logDir, "test_generation_log.html");
    // Everything after opening the writer runs inside try/finally so that the file handle is
    // released even when loading a locale or formatting the output throws.
    try {
        //String dir = logDir + "main" + File.separator;
        // DraftChecker dc = new DraftChecker(dir);
        english = factory.make("en", true);

        Set<String> languages = new TreeSet<String>(col);
        Set<String> countries = new TreeSet<String>(col);
        Set<String> draftLanguages = new TreeSet<String>(col);
        Set<String> draftCountries = new TreeSet<String>(col);
        Set<Object> nativeLanguages = new TreeSet<Object>();
        Set<Object> nativeCountries = new TreeSet<Object>();
        Set<Object> draftNativeLanguages = new TreeSet<Object>();
        Set<Object> draftNativeCountries = new TreeSet<Object>();
        int localeCount = 0;
        int draftLocaleCount = 0;

        Set<String> contents = removeSingleLanguagesWhereWeHaveScripts(factory.getAvailable());
        for (String localeID : contents) {
            if (CLDRFile.isSupplementalName(localeID)) {
                continue;
            }
            if (localeID.equals("root")) {
                continue; // skip root
            }
            System.out.println("Collecting info for:\t" + localeID.replace("_", "\t"));
            // Draft detection is currently switched off; every locale is counted as non-draft.
            boolean draft = false; // dc.isDraft(localeName);
            if (draft) {
                draftLocaleCount++;
                addCounts(localeID, true, draftLanguages,
                    draftCountries, draftNativeLanguages,
                    draftNativeCountries);
            } else {
                localeCount++;
                addCounts(localeID, false, languages,
                    countries, nativeLanguages, nativeCountries);
            }
            // Disabled debugging output, kept for reference.
            if (false) {
                Log.logln(draft + ", " + localeCount + ", "
                    + languages.size() + ", " + countries.size() + ", "
                    + draftLocaleCount + ", " + draftLanguages.size()
                    + ", " + draftCountries.size());
            }
        }

        // A language that has at least one non-draft locale is not reported as draft.
        draftLanguages.removeAll(languages);
        for (Object nativeLanguage : nativeLanguages) {
            draftNativeLanguages.remove(nativeLanguage);
        }

        logHtml.println("<html><head>");
        logHtml
            .println("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
        logHtml.println("</head><body>");
        logHtml.println("<p><b>Locales (" + localeCount + "):</b>");
        logHtml.println("<p><b>Languages (" + languages.size() + "):</b>");
        logHtml.println(showSet(nativeLanguages, transliterate, true));
        logHtml.println("<p><b>Territories (" + countries.size() + "):</b>");
        logHtml.println(showSet(nativeCountries, transliterate, false));
        logHtml.println("<p><b>Draft locales (" + draftLocaleCount + "):</b>");
        logHtml.println("<p><b>Draft languages (" + draftLanguages.size()
            + "):</b>");
        logHtml.println(showSet(draftNativeLanguages, transliterate, true));
        logHtml.println("<p><b>Draft countries (" + draftCountries.size()
            + "):</b>");
        logHtml.println(showSet(draftNativeCountries, transliterate, false));
        logHtml.println(CldrUtility.ANALYTICS);
        logHtml.println("</body></html>");
    } finally {
        logHtml.close();
    }
}
/**
 * Returns a writable, sorted copy of {@code contents} with two kinds of locale IDs removed:
 * <ol>
 * <li>IDs whose language, or non-empty region, is not among the "good available" codes of
 * {@code StandardCodes}; each such removal is reported on {@code System.out};</li>
 * <li>IDs without a script (language-script equals the bare language) whose language also
 * occurs with a script in some other ID. Under {@code HACK}, "sh" is always treated as such a
 * language.</li>
 * </ol>
 * Supplemental names are left untouched.
 *
 * @param contents the available locale IDs; not modified
 * @return the filtered copy
 */
private static Set<String> removeSingleLanguagesWhereWeHaveScripts(Set<String> contents) {
    StandardCodes sc = StandardCodes.make();
    Set<String> remaining = new TreeSet<String>(contents); // make writable
    if (false && HACK) {
        remaining.add("bs_Latn");
        remaining.add("bs_Cyrl");
        remaining.add("bs_Latn_BA");
        remaining.add("bs_Cyrl_BA");
    }

    // Pass 1: drop IDs with unknown codes and note every language that appears with a script.
    Set<String> languagesWithScripts = new HashSet<String>();
    if (HACK) {
        languagesWithScripts.add("sh");
    }
    Iterator<String> scan = remaining.iterator();
    while (scan.hasNext()) {
        String localeID = scan.next();
        if (CLDRFile.isSupplementalName(localeID)) {
            continue;
        }
        ltp.set(localeID);
        String lang = ltp.getLanguage();
        String territory = ltp.getRegion();
        boolean knownLanguage = sc.getGoodAvailableCodes("language").contains(lang);
        if (!knownLanguage) {
            System.out.println("Odd language, removing: " + localeID);
            scan.remove();
            continue;
        }
        // The territory list is only consulted when the ID actually has a region.
        if (territory.length() != 0 && !sc.getGoodAvailableCodes("territory").contains(territory)) {
            System.out.println("Odd territory, removing: " + localeID);
            scan.remove();
            continue;
        }
        if (!lang.equals(ltp.getLanguageScript())) {
            languagesWithScripts.add(lang);
        }
    }

    // Pass 2: for those languages, drop every ID that has no script of its own.
    Iterator<String> prune = remaining.iterator();
    while (prune.hasNext()) {
        String localeID = prune.next();
        if (CLDRFile.isSupplementalName(localeID)) {
            continue;
        }
        ltp.set(localeID);
        String lang = ltp.getLanguage();
        if (languagesWithScripts.contains(lang) && lang.equals(ltp.getLanguageScript())) {
            prune.remove();
        }
    }
    return remaining;
}
// Every character whose script is not Latin, Common or Inherited; showSet uses it to decide
// whether a name needs transliteration and whether a transliteration succeeded.
static final UnicodeSet NON_LATIN = new UnicodeSet("[^[:latin:][:common:][:inherited:]]");
/** Characters with bidi class R or AL; names containing them get wrapped in U+200E marks. */
private static final UnicodeSet BIDI_R = new UnicodeSet(
    "[[:Bidi_Class=R:][:Bidi_Class=AL:]]");

/**
 * Renders a set of {@link LanguageList} entries as a single "; "-separated HTML fragment.
 *
 * <p>Entries are grouped by English name (in English collation order). Each group is printed
 * as the English name, followed by the distinct native names when they differ from the
 * English one, wrapped in a {@code <span>} whose {@code title} holds the code of the group
 * and, optionally, Latin transliterations of the native names.
 *
 * @param nativeCountries the entries to show; every element must be a {@code LanguageList}
 *                        (despite the name, this is also used for language entries)
 * @param transliterate   when true, native names containing non-Latin characters are
 *                        transliterated with {@code toLatin} and added to the tooltip,
 *                        except for entries whose language part is "ja"
 * @param isLanguage      true to use the language part of the entry's locale as the tooltip
 *                        code, false to use the two-letter region suffix
 * @return the HTML fragment, empty if the set is empty
 */
private static String showSet(Set<?> nativeCountries, boolean transliterate,
    boolean isLanguage) {
    // collect multiples by English name
    Map<String, Set<LanguageList>> byEnglishName =
        new TreeMap<String, Set<LanguageList>>(LanguageList.col);
    for (Object element : nativeCountries) {
        LanguageList llist = (LanguageList) element;
        Set<LanguageList> group = byEnglishName.get(llist.getEnglishName());
        if (group == null) {
            group = new TreeSet<LanguageList>();
            byEnglishName.put(llist.getEnglishName(), group);
        }
        group.add(llist);
    }

    StringBuilder result = new StringBuilder();
    // Both sets are reused for every group and cleared at the start of each one.
    Set<String> titleSet = new TreeSet<String>(col);
    Set<String> qualifierSet = new TreeSet<String>(col);
    for (Map.Entry<String, Set<LanguageList>> entry : byEnglishName.entrySet()) {
        String englishName = entry.getKey();
        Set<LanguageList> group = entry.getValue();
        if (result.length() != 0) {
            result.append("; ");
        }
        // After the inner loop this holds the code of the last entry in the group.
        String code = "";
        boolean needQualifier = group.size() != 1;
        titleSet.clear();
        qualifierSet.clear();
        for (LanguageList llist : group) {
            String localName = llist.getLocalName();
            String locale = llist.getLocale();
            // Split off a trailing "_XX" region; anything else is treated as all language.
            String lang = locale, country = "";
            if (locale.length() > 3
                && locale.charAt(locale.length() - 3) == '_') {
                lang = locale.substring(0, locale.length() - 3);
                country = locale.substring(locale.length() - 2);
            }
            // Keep right-to-left names from reordering the surrounding punctuation.
            if (BIDI_R.containsSome(localName)) {
                localName = '\u200E' + localName + '\u200E';
            }
            if (isLanguage) {
                code = lang;
            } else {
                code = country;
            }
            if (!localName.equalsIgnoreCase(englishName)) {
                needQualifier = true;
                qualifierSet.add(localName);
                if (transliterate && NON_LATIN.containsSome(localName)
                    && !lang.equals("ja")) {
                    String transName = localName;
                    try {
                        transName = fixedTitleCase("en",
                            toLatin.transliterate(localName));
                    } catch (RuntimeException e) {
                        // Best effort: report and fall back to the untransliterated name.
                        System.out.println("\t" + e.getMessage());
                    }
                    if (NON_LATIN.containsSome(transName)) {
                        Log.logln("Can't transliterate " + localName
                            + ": " + transName);
                    } else {
                        titleSet.add(transName);
                    }
                }
            }
        }
        String title = code + (titleSet.isEmpty() ? "" : ": " + titleSet.toString());
        String before = "", after = "";
        if (title.length() != 0) {
            // NOTE(review): the attribute is single-quoted; confirm that toHTML escapes
            // apostrophes, otherwise a name containing one would end the attribute early.
            before = "<span title=\'"
                + TransliteratorUtilities.toHTML.transliterate(title) + "'>";
            after = "</span>";
        }
        String qualifiers = qualifierSet.toString();
        if (!needQualifier || qualifierSet.isEmpty()) {
            qualifiers = "";
        } else {
            qualifiers = " " + qualifiers; // qualifiers = " (" + qualifiers + ")";
        }
        // Drop the comma from names of the form "X, China" so it does not read as a separator.
        if (englishName.endsWith(", China")) {
            englishName = englishName.substring(0, englishName.length()
                - ", China".length())
                + " China";
        }
        result.append(before)
            .append(
                TransliteratorUtilities.toHTML.transliterate(englishName
                    + qualifiers))
            .append(after);
    }
    return result.toString();
}
/**
 * Records the language and, if present, the region of one locale in the given sets.
 *
 * <p>Despite the "draft" prefix of the parameter names, the caller passes either the draft or
 * the non-draft sets. A warning is logged whenever a native display name is identical to the
 * English one for a locale whose language is not "en".
 *
 * @param localeID             the locale to record
 * @param isDraft              only used to prefix logged warnings with "D"
 * @param draftLanguages       receives the language code
 * @param draftCountries       receives the region code, if the locale has one
 * @param draftNativeLanguages receives a {@code LanguageList} keyed by language-script
 * @param draftNativeCountries receives a {@code LanguageList} keyed by the full locale ID,
 *                             if the locale has a region
 */
private static void addCounts(String localeID, boolean isDraft, Set<String> draftLanguages, Set<String> draftCountries,
    Set<Object> draftNativeLanguages, Set<Object> draftNativeCountries) {
    // Read everything from the shared parser up front, before any other code can reset it.
    ltp.set(localeID);
    final String lang = ltp.getLanguage();
    final String langScript = ltp.getLanguageScript();
    final String country = ltp.getRegion();
    final boolean isEnglish = lang.equals("en");
    final String warningPrefix = (isDraft ? "D" : "") + "\tWarning: in " + localeID + ", display name for ";

    // Disabled alias filtering, kept for reference:
    // if ((country.equals("TW") || country.equals("HK") || country.equals("MO")) && lang.equals("zh")) return;
    // if (lang.equals("zh_Hans") || lang.equals("sr_Cyrl") || lang.equals("sh")) return;

    // Language part.
    draftLanguages.add(lang);
    String nativeLanguageName = getFixedLanguageName(localeID, langScript);
    String englishLanguageName = english.getName(langScript);
    if (!isEnglish && nativeLanguageName.equals(englishLanguageName)) {
        Log.logln(warningPrefix + lang + " equals English: " + nativeLanguageName);
    }
    draftNativeLanguages.add(
        new LanguageList(langScript, englishLanguageName, fixedTitleCase("en", nativeLanguageName)));

    // Region part, only for locales that have one.
    if (country.equals("")) {
        return;
    }
    draftCountries.add(country);
    String nativeCountryName = getFixedDisplayCountry(localeID, country);
    String englishCountryName = getFixedDisplayCountry("en", country);
    if (!isEnglish && nativeCountryName.equals(englishCountryName)) {
        Log.logln(warningPrefix + country + " equals English: " + nativeCountryName);
    }
    draftNativeCountries.add(
        new LanguageList(localeID, englishCountryName, fixedTitleCase("en", nativeCountryName)));
}
/**
 * One displayable entry: a locale (or language-script) code with its English and native
 * names. Entries are ordered by {@link #comp} applied to the array
 * {@code {englishName, locale, localName}}.
 */
private static class LanguageList implements Comparable<Object> {
    // Slot positions inside the contents array.
    private static final int ENGLISH_NAME = 0;
    private static final int LOCALE = 1;
    private static final int LOCAL_NAME = 2;

    Object[] contents;
    static Collator col = Collator.getInstance(ULocale.ENGLISH);
    // English collation for the first two slots; the third slot is given no collator
    // (presumably ArrayComparator then falls back to natural ordering; confirm).
    static Comparator<Object[]> comp = new ArrayComparator(new Collator[] { col, col, null });

    LanguageList(String locale, String englishName, String localName) {
        Object[] slots = new Object[3];
        slots[ENGLISH_NAME] = englishName;
        slots[LOCALE] = locale;
        slots[LOCAL_NAME] = localName;
        contents = slots;
    }

    String getEnglishName() {
        return (String) contents[ENGLISH_NAME];
    }

    String getLocale() {
        return (String) contents[LOCALE];
    }

    String getLocalName() {
        return (String) contents[LOCAL_NAME];
    }

    /** Compares slot by slot; {@code o} must be a {@code LanguageList}. */
    public int compareTo(Object o) {
        LanguageList other = (LanguageList) o;
        return comp.compare(contents, other.contents);
    }
}
/**
 * Title-cases {@code in} for the given locale, unless title-casing is switched off via
 * {@code notitlecase}, in which case {@code in} is returned unchanged.
 *
 * <p>Under {@code HACK}, the abbreviations "U.s." and "S.a.r." produced by title-casing are
 * restored to "U.S." and "S.A.R.".
 *
 * @param localeID locale whose casing rules are applied
 * @param in       the text to title-case
 * @return the title-cased text, or {@code in} itself when title-casing is disabled
 */
static String fixedTitleCase(String localeID, String in) {
    if (!notitlecase) {
        ULocale casingLocale = new ULocale(localeID);
        String titled = UCharacter.toTitleCase(casingLocale, in, null);
        if (HACK) {
            titled = GenerateCldrTests.replace(titled, "U.s.", "U.S.");
            titled = GenerateCldrTests.replace(titled, "S.a.r.", "S.A.R.");
        }
        return titled;
    }
    return in;
}
/*
* static void addMapSet(Map m, Object key, Object value, Comparator com) {
* Set valueSet = (Set) m.get(key);
* if (valueSet == null) {
* valueSet = new TreeSet(com);
* m.put(key, valueSet);
* }
* valueSet.add(value);
* }
*/
/**
 * Returns the display name of {@code lang} as given by the locale {@code localeID}.
 *
 * <p>Under {@code HACK}, a Bosnian language code ("bs" or "bs_...") looked up in a Bosnian
 * locale always yields the fixed name "Bosanski".
 *
 * @param localeID locale whose data supplies the name
 * @param lang     language or language-script code to name
 * @return the display name from the locale's file, or the hard-coded override
 */
private static String getFixedLanguageName(String localeID, String lang) {
    if (HACK
        && (localeID.equals("bs") || localeID.startsWith("bs_"))
        && (lang.equals("bs") || lang.startsWith("bs_"))) {
        return "Bosanski";
    }
    return factory.make(localeID, true).getName(lang);
}
/**
 * Returns the display name of the territory {@code country} as given by the locale
 * {@code localeID}.
 *
 * <p>Under {@code HACK}, territory "BA" looked up in a Bosnian locale ("bs" or "bs_...")
 * always yields a fixed Cyrillic name. The {@code fixCountryNames} substitution below is
 * currently disabled.
 *
 * @param localeID locale whose data supplies the name
 * @param country  territory code to name
 * @return the display name from the locale's file, or the hard-coded override
 */
private static String getFixedDisplayCountry(String localeID, String country) {
    if (HACK
        && (localeID.equals("bs") || localeID.startsWith("bs_"))
        && country.equals("BA")) {
        return "\u0411\u043E\u0441\u043D\u0430 \u0438 \u0425\u0435\u0440\u0446\u0435\u0433\u043E\u0432\u0438\u043D\u0430";
    }
    String name = factory.make(localeID, true).getName("territory", country);
    // Switched off: would replace outdated names using fixCountryNames.
    if (false && HACK) {
        String replacement = fixCountryNames.get(name);
        if (replacement != null) {
            return replacement;
        }
    }
    return name;
}
// Replacement display names, keyed by the name to be replaced. Within this file the map is
// only read from a disabled branch of getFixedDisplayCountry.
static Map<String, String> fixCountryNames = new HashMap<String, String>();
static {
// Cyrillic "Yugoslavia" -> Cyrillic "Serbia and Montenegro"
fixCountryNames.put("\u0408\u0443\u0433\u043E\u0441\u043B\u0430\u0432\u0438\u0458\u0430",
"\u0421\u0440\u0431\u0438\u0458\u0430 \u0438 \u0426\u0440\u043D\u0430 \u0413\u043E\u0440\u0430");
// Latin-script and English equivalents of the same replacement
fixCountryNames.put("Jugoslavija", "Srbija i Crna Gora");
fixCountryNames.put("Yugoslavia", "Serbia and Montenegro");
}
// "any-latin" transliterator; showSet uses it to produce Latin-script forms of native names.
public static final Transliterator toLatin = Transliterator.getInstance("any-latin");
/**
 * Decides whether a locale's XML file is marked as draft by looking at the first line that
 * contains {@code <ldml}, and caches the answer per locale name.
 */
public static class DraftChecker {
    /** Directory prefix of the locale XML files. */
    String dir;
    /** Locale name to {@link #TRUE} or {@link #FALSE}; absent means "not yet checked". */
    Map<String, Object> cache = new HashMap<String, Object>();
    /** Sentinel cache values, compared by identity. */
    Object TRUE = new Object();
    Object FALSE = new Object();

    /**
     * @param dir directory containing the {@code <locale>.xml} files
     */
    public DraftChecker(String dir) {
        this.dir = dir;
    }

    /**
     * Reports whether the {@code <ldml} line of {@code localeName + ".xml"} contains the text
     * "draft".
     *
     * @param localeName locale whose file is inspected
     * @return true if the {@code <ldml} line mentions "draft"
     * @throws ICUUncheckedIOException  if the file cannot be opened or read
     * @throws IllegalArgumentException if the file ends before any {@code <ldml} line is found
     */
    public boolean isDraft(String localeName) {
        Object check = cache.get(localeName);
        if (check != null) {
            return check == TRUE;
        }
        try {
            BufferedReader reader = FileUtilities.openUTF8Reader(dir, localeName + ".xml");
            // Close the reader on every path, including read errors and the
            // "no <ldml line" failure below.
            try {
                while (true) {
                    String line = reader.readLine();
                    if (line == null) {
                        throw new IllegalArgumentException("Internal Error: should never get here.");
                    }
                    if (line.indexOf("<ldml") >= 0) {
                        check = line.indexOf("draft") >= 0 ? TRUE : FALSE;
                        break;
                    }
                }
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            throw new ICUUncheckedIOException("Failure on " + localeName + ": " + dir + localeName + ".xml", e);
        }
        cache.put(localeName, check);
        return check == TRUE;
    }
}
}