// blob: 681dc72f8fc76e67ee03df2a7ea27081ffd724c5 [file] [log] [blame]
package org.unicode.cldr.draft;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Counter2;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.text.NumberFormat;
/**
 * Command-line tool that writes CLDR supplemental population data to
 * {@code generatedLanguageData.txt} as three tab-separated sections, each
 * introduced by an {@code @sheet:} line:
 * <ol>
 * <li>per-territory GDP, literate population, population and literacy;</li>
 * <li>per language/territory pair: names, official status and literate population;</li>
 * <li>per language: the territories where its status compares at or above
 * {@link OfficialStatus#official_minority}, or {@code none}.</li>
 * </ol>
 */
public class GenerateLanguageData {
    /** Maximum number of region codes listed per language before the list is elided. */
    private static final int MAX_LISTED_REGIONS = 5;

    SupplementalDataInfo info = SupplementalDataInfo.getInstance(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY);
    Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
    CLDRFile english = cldrFactory.make("en", true);
    // The four fields below are not read anywhere in this class; they are kept
    // because they are package-visible and their initializers run at construction.
    CLDRFile root = cldrFactory.make("root", true);
    StandardCodes sc = StandardCodes.make();
    NumberFormat nf = NumberFormat.getInstance();
    NumberFormat pf = NumberFormat.getPercentInstance();

    /**
     * Entry point: generates the language data file.
     *
     * @param args ignored
     * @throws IOException if the output file cannot be opened or written
     */
    public static void main(String[] args) throws IOException {
        new GenerateLanguageData().run();
    }

    /**
     * Opens the output file under {@code CLDRPaths.GEN_DIRECTORY + "langData/"}
     * and writes the three sheets in order.
     *
     * @throws IOException if opening the writer fails, or if the writer reports
     *         an error after all sheets have been written
     */
    private void run() throws IOException {
        try (PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY + "langData/", "generatedLanguageData.txt")) {
            writeTerritorySheet(out);
            out.flush();
            Map<String, Counter2<String>> langToCountriesOfficial = writeLanguageSheet(out);
            out.flush();
            writeLangCountriesSheet(out, langToCountriesOfficial);
            // PrintWriter never throws on write failure; surface it explicitly so a
            // truncated file is not silently accepted.
            if (out.checkError()) {
                throw new IOException("Error while writing generatedLanguageData.txt");
            }
        }
    }

    /**
     * Writes one row per territory that has population data: code, GDP, literate
     * population, population, and literacy expressed as a fraction (percent / 100).
     */
    private void writeTerritorySheet(PrintWriter out) {
        // NOTE(review): "County" looks like a typo for "Country"; left unchanged
        // because consumers of the file may match on this exact sheet title.
        out.println("\n@sheet:CLDR County Data");
        out.println("code\tgdp\tlit-pop\tpopulation\tliteracy");
        for (String territory : info.getTerritoriesWithPopulationData()) {
            PopulationData terrData = info.getPopulationDataForTerritory(territory);
            out.println(territory
                + "\t" + terrData.getGdp()
                + "\t" + terrData.getLiteratePopulation()
                + "\t" + terrData.getPopulation()
                + "\t" + (terrData.getLiteratePopulationPercent() / 100));
        }
    }

    /**
     * Writes one row per (language, territory) pair that has population data and
     * collects, per base language, the literate population of every territory in
     * which the language's status compares at or above
     * {@link OfficialStatus#official_minority}.
     *
     * @return map from base language code to a counter of territory code to
     *         literate population, sorted by language code
     */
    private Map<String, Counter2<String>> writeLanguageSheet(PrintWriter out) {
        out.println("\n@sheet:CLDR Language Data");
        out.println("LC\tName\tCC\tName\tStatus\tLitPop");
        Map<String, Counter2<String>> langToCountriesOfficial = new TreeMap<>();
        LanguageTagParser ltp = new LanguageTagParser();
        for (String languageCode : info.getLanguages()) {
            String languageName = english.getName(languageCode);
            // Fold script variants into the bare language, except for scripts whose
            // code starts with "Han", which stay distinct.
            String baseLanguage = languageCode;
            ltp.set(languageCode);
            String script = ltp.getScript();
            if (script.length() != 0 && !script.startsWith("Han")) {
                baseLanguage = ltp.getLanguage();
            }
            Set<String> territories = info.getTerritoriesForPopulationData(languageCode);
            if (territories == null) {
                continue;
            }
            for (String territory : territories) {
                String territoryName = english.getName(CLDRFile.TERRITORY_NAME, territory);
                PopulationData data = info.getLanguageAndTerritoryPopulationData(languageCode, territory);
                double literatePopulationLangRegion = data.getLiteratePopulation();
                OfficialStatus status = data.getOfficialStatus();
                // Enum comparison follows declaration order of OfficialStatus.
                if (status.compareTo(OfficialStatus.official_minority) >= 0) {
                    Counter2<String> counter = langToCountriesOfficial.get(baseLanguage);
                    if (counter == null) {
                        counter = new Counter2<>();
                        langToCountriesOfficial.put(baseLanguage, counter);
                    }
                    counter.add(territory, literatePopulationLangRegion);
                }
                out.println(fixLang(languageCode)
                    + "\t" + languageName
                    + "\t" + territory
                    + "\t" + territoryName
                    + "\t" + status
                    + "\t" + literatePopulationLangRegion);
            }
        }
        return langToCountriesOfficial;
    }

    /**
     * Writes one row per language: the first {@value #MAX_LISTED_REGIONS} regions
     * from the counter's sorted key set (followed by an ellipsis when there are
     * more), or {@code none} for languages with no entry in the map.
     *
     * @param langToCountriesOfficial result of {@link #writeLanguageSheet(PrintWriter)}
     */
    private void writeLangCountriesSheet(PrintWriter out, Map<String, Counter2<String>> langToCountriesOfficial) {
        out.println("\n@sheet:CLDR Lang-Countries");
        out.println("LangCode\tRegionCode");
        // NOTE(review): the map is keyed by base language, but this set holds full
        // codes, so a script-qualified code that was folded into its base language
        // is still reported as "none" below - confirm whether that is intended.
        Set<String> missing = new TreeSet<>(info.getLanguages());
        for (Entry<String, Counter2<String>> entry : langToCountriesOfficial.entrySet()) {
            String lang = entry.getKey();
            // presumably ordered by descending count (argument `false`) - verify against Counter2
            ArrayList<String> top = new ArrayList<>(entry.getValue().getKeysetSortedByCount(false));
            String regions;
            if (top.size() <= MAX_LISTED_REGIONS) {
                regions = CollectionUtilities.join(top, ", ");
            } else {
                regions = CollectionUtilities.join(top.subList(0, MAX_LISTED_REGIONS), ", ") + ", …";
            }
            out.println(fixLang(lang) + "\t" + regions);
            missing.remove(lang);
        }
        for (String lang : missing) {
            out.println(fixLang(lang) + "\tnone");
        }
    }

    /** Converts an underscore-separated locale code to hyphen-separated form. */
    private String fixLang(String key) {
        return key.replace('_', '-');
    }
}