blob: a1b1e99f0b1f66489d86e31a07db9226aed5d925 [file] [log] [blame]
package org.unicode.cldr.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.util.ICUUncheckedIOException;
public class Iso639Data {
static Map<String, String> toAlpha3;
static Map<String, String> fromAlpha3;
static Map<String, String> toBiblio3;
static Map<String, String> fromBiblio3;
static Relation<String, String> toNames;
static Relation<String, String> toRetirements;
static Map<String, Scope> toScope;
static Map<String, List<String>> toHeirarchy;
static Map<String, Type> toType;
static Map<String, String> encompassed_macro;
static Relation<String, String> macro_encompassed;
static Map<String, Source> toSource;
private static String version;
/**
* <h3><a NAME="I">Individual</a> languages</h3>
* <p>
* Judgments regarding when two varieties are considered to be the same or different languages are based on a number
* of factors, including linguistic similarity, intelligibility, a common literature, the views of speakers
* concerning the relationship between language and identity, and other factors.
* </p>
* <h3><a NAME="M">Macrolanguages</a></h3>
* <p>
* In various parts of the world, there are clusters of closely-related language varieties that, based on the
* criteria discussed above, can be considered distinct individual languages, yet in certain usage contexts a single
* language identity for all is needed.
* </p>
* <p>
* Macrolanguages are distinguished from language collections in that the individual languages that correspond to a
* macrolanguage must be very closely related, and there must be some domain in which only a single language
* identity is recognized.
* </p>
*
* <h3><a NAME="C">Collections</a> of languages</h3>
* <p>
* A collective language code element is an identifier that represents a group of individual languages that are not
* deemed to be one language in any usage context.
* </p>
* </p> <h3><a NAME="R">Private Use</a></h3>
* <p>
* Identifiers <tt>qaa</tt> through <tt>qtz</tt> are reserved for local use, to be used in cases in which there is
* no suitable existing code in ISO 639. There are no constraints as to scope of denotation. These identifiers may
* only be used locally, and may not be used in interchange without a private agreement.
* </p>
* <h3><a NAME="S">Special situations</a></h3>
* <p>
* A few code elements are defined for other special situations.
* </p>
* For more information, see http://www.sil.org/iso639-3/scope.asp
* <p>
* Note that the casing on these enum values is chosen to match standard usage.
* </p>
*/
public enum Scope {
Individual, Macrolanguage, Special, Collection, PrivateUse, Unknown;
public static Scope fromString(String input) {
input = input.replace("-", "");
for (Scope item : Scope.values()) {
if (item.toString().equalsIgnoreCase(input)) {
return item;
}
}
return Scope.valueOf(input); // to get exception
}
};
/**
* <h3><a NAME="L"></a>Living languages</h3>
* <p>
* A language is listed as <i>living</i> when there are people still living who learned it as a first language.
* </p>
* <h3><a NAME="E"></a>Extinct languages</h3>
*
* <p>
* A language is listed as <i>extinct</i> if it has gone extinct in recent times. (e.g. in the last few centuries).
* </p>
* <h3><a NAME="A"></a>Ancient languages</h3>
* <p>
* A language is listed as <i>ancient</i> if it went extinct in ancient times (e.g. more than a millennium ago).
* </p>
* <h3><a NAME="H"></a>Historic languages</h3>
* <p>
* A language is listed as <i>historic</i> when it is considered to be distinct from any modern languages that are
* descended from it; for instance, Old English and Middle English.
* </p>
*
* <h3><a NAME="C"></a>Constructed languages</h3>
* <p>
* Artificial languages are those like Esperanto: it excludes programming languages.
* </p>
* <p>
* Note that the casing on these enum values is chosen to match standard usage. <i>For more information, see
* http://www.sil.org/iso639-3/scope.asp</i>
* </p>
*/
public enum Type {
Ancient, Constructed, Extinct, Historical, Living, Special, Collection, Unknown
};
/**
* This indicates the source of the language subtag.
*
* @author markdavis
*
*/
public enum Source {
ISO_639_1, ISO_639_2, ISO_639_3, BCP47, CLDR
};
public static String getVersion() {
return version;
}
public static Source getSource(String languageSubtag) {
if (toAlpha3 == null) {
getData();
}
if (!isValid(languageSubtag)) {
return null;
}
Source result = toSource.get(languageSubtag);
if (result == null)
return Source.ISO_639_3;
return result;
}
public static String toAlpha3(String languageSubtag) {
if (toAlpha3 == null) {
getData();
}
if (!isValid(languageSubtag)) {
return null;
}
return toAlpha3.get(languageSubtag);
}
public static String fromAlpha3(String alpha3) {
if (fromAlpha3 == null) {
getData();
}
String alpha2 = fromAlpha3.get(alpha3);
if (alpha2 != null) {
return alpha2;
}
// it only exists if it has a name
if (isValid(alpha3)) {
return alpha3;
}
return null;
}
private static boolean isValid(String alpha3) {
return toNames.containsKey(alpha3);
}
public static String fromBiblio3(String biblio3) {
if (toNames == null) {
getData();
}
String result = fromBiblio3.get(biblio3);
if (result != null) {
return result;
}
return fromAlpha3(biblio3);
}
public static String toBiblio3(String languageTag) {
if (toNames == null) {
getData();
}
String result = toBiblio3.get(languageTag);
if (result != null) {
return result;
}
return toAlpha3(languageTag);
}
public static Set<String> hasBiblio3() {
return toBiblio3.keySet();
}
public static Set<String> getNames(String languageSubtag) {
if (toNames == null) {
getData();
}
return toNames.getAll(languageSubtag);
}
public static Scope getScope(String languageSubtag) {
if (toScope == null) {
getData();
}
if (!isValid(languageSubtag))
return Scope.Unknown;
Scope result = toScope.get(languageSubtag);
if (result != null)
return result;
return Scope.Individual;
}
/**
* Returns the ISO 639-5 heirarchy if available, otherwise null.
*/
public static List<String> getHeirarchy(String languageSubtag) {
if (toHeirarchy == null) {
getData();
}
return toHeirarchy.get(languageSubtag);
}
public static Type getType(String languageSubtag) {
if (toAlpha3 == null) {
getData();
}
if (!isValid(languageSubtag))
return Type.Unknown;
Type result = toType.get(languageSubtag);
if (result != null)
return result;
return Type.Living;
}
/**
* Id char(3) NOT NULL, -- The three-letter 639-3 identifier Part2B char(3)
* NULL, -- Equivalent 639-2 identifier of the bibliographic applications code
* set, if there is one Part2T char(3) NULL, -- Equivalent 639-2 identifier of
* the terminology applications code set, if there is one Part1 char(2) NULL, --
* Equivalent 639-1 identifier, if there is one Scope char(1) NOT NULL, --
* I(ndividual), M(acrolanguage), S(pecial) Type char(1) NOT NULL, --
* A(ncient), C(onstructed), -- E(xtinct), H(istorical), L(iving), S(pecial)
* Ref_Name varchar(150) NOT NULL) -- Reference language name
*
* @throws IOException
*/
enum IsoColumn {
Id, Part2B, Part2T, Part1, Scope, Type, Ref_Name
};
/**
* Id char(3) NOT NULL, -- The three-letter 639-3 identifier Print_Name
* varchar(75) NOT NULL, -- One of the names associated with this identifier
* Inverted_Name varchar(75) NOT NULL) -- The inverted form of this Print_Name
* form
*/
enum IsoNamesColumn {
Id, Print_Name, Inverted_Name
};
private static void getData() {
try {
BufferedReader in = CldrUtility.getUTF8Data("iso-639-3-version.tab");
version = in.readLine().trim();
in.close();
in = CldrUtility.getUTF8Data("iso-639-3.tab");
Pattern tabs = PatternCache.get("\\t");
toAlpha3 = new HashMap<String, String>();
fromAlpha3 = new HashMap<String, String>();
toBiblio3 = new HashMap<String, String>();
fromBiblio3 = new HashMap<String, String>();
toScope = new HashMap<String, Scope>();
toType = new HashMap<String, Type>();
toNames = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
toRetirements = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
macro_encompassed = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
encompassed_macro = new HashMap<String, String>();
toSource = new HashMap<String, Source>();
toSource.put("sh", Source.ISO_639_1); // add deprecated language
while (true) {
String line = in.readLine();
if (line == null)
break;
if (line.startsWith("\uFEFF"))
line = line.substring(1);
String[] parts = tabs.split(line);
String alpha3 = parts[IsoColumn.Id.ordinal()];
if (alpha3.equals("Id"))
continue;
String languageSubtag = alpha3;
if (parts[IsoColumn.Part1.ordinal()].length() != 0) { // parts.length >
// IsoColumn.Part1.ordinal()
// &&
languageSubtag = parts[IsoColumn.Part1.ordinal()];
toAlpha3.put(languageSubtag, alpha3);
fromAlpha3.put(alpha3, languageSubtag);
}
if (parts[IsoColumn.Part2B.ordinal()].length() != 0) { // parts.length >
// IsoColumn.Part1.ordinal()
// &&
String biblio = parts[IsoColumn.Part2B.ordinal()];
if (!biblio.equals(alpha3)) {
toBiblio3.put(languageSubtag, biblio);
fromBiblio3.put(biblio, languageSubtag);
}
}
toNames.put(languageSubtag, parts[IsoColumn.Ref_Name.ordinal()]);
Scope scope = findMatchToPrefix(parts[IsoColumn.Scope.ordinal()], Scope.values());
if (scope != Scope.Individual)
toScope.put(languageSubtag, scope);
Type type = findMatchToPrefix(parts[IsoColumn.Type.ordinal()], Type.values());
if (type != Type.Living)
toType.put(languageSubtag, type);
}
// System.out.println("Size:\t" + toNames.size());
in.close();
// Id Ref_Name Ret_Reason Change_To Ret_Remedy Effective
in = CldrUtility.getUTF8Data("iso-639-3_Retirements.tab");
while (true) {
String line = in.readLine();
if (line == null)
break;
if (line.startsWith("\uFEFF"))
line = line.substring(1);
String[] parts = tabs.split(line);
String alpha3 = parts[0];
if (alpha3.equals("Id"))
continue;
toNames.put(alpha3, parts[1]);
toRetirements.put(alpha3, line);
// skip inverted name for now
}
// System.out.println("Size:\t" + toNames.size());
in.close();
// Id Print_Name Inverted_Name
in = CldrUtility.getUTF8Data("iso-639-3-macrolanguages.tab");
while (true) {
String line = in.readLine();
if (line == null)
break;
if (line.startsWith("\uFEFF"))
line = line.substring(1);
String[] parts = tabs.split(line);
String prefix = parts[0];
if (prefix.equals("M_Id"))
continue;
prefix = fromAlpha3(prefix);
String suffix = fromAlpha3(parts[1]);
if (suffix == null || prefix == null) {
throw new IllegalArgumentException();
}
encompassed_macro.put(suffix, prefix);
macro_encompassed.put(prefix, suffix);
// skip inverted name for now
}
// System.out.println("Size:\t" + toNames.size());
in.close();
// Id Print_Name Inverted_Name
in = CldrUtility.getUTF8Data("iso-639-3_Name_Index.tab");
while (true) {
String line = in.readLine();
if (line == null)
break;
if (line.startsWith("\uFEFF"))
line = line.substring(1);
String[] parts = tabs.split(line);
String alpha3 = parts[IsoColumn.Id.ordinal()];
if (alpha3.equals("Id"))
continue;
String languageSubTag = fromAlpha3(alpha3);
toNames.put(languageSubTag, parts[IsoNamesColumn.Print_Name.ordinal()]);
// skip inverted name for now
}
// System.out.println("Size:\t" + toNames.size());
in.close();
in = CldrUtility.getUTF8Data("ISO-639-2_values_8bits.txt");
// An alpha-3 (bibliographic) code,
// an alpha-3 (terminologic) code (when given),
// an alpha-2 code (when given),
// an English name,
// and a French name of a language are all separated by pipe (|)
// characters.
while (true) {
String line = in.readLine();
if (line == null)
break;
if (line.startsWith("\uFEFF"))
line = line.substring(1);
String[] parts = line.split("\\s*\\|\\s*");
String alpha3 = parts[0];
if (alpha3.equals("qaa-qtz")) {
for (char second = 'a'; second <= 't'; ++second) {
for (char third = 'a'; third <= 'z'; ++third) {
String languageSubtag = (("q" + second) + third);
toScope.put(languageSubtag, Scope.PrivateUse);
toType.put(languageSubtag, Type.Special);
toNames.put(languageSubtag, "private-use");
toSource.put(languageSubtag, Source.ISO_639_2);
}
}
continue;
}
if (parts[1].length() != 0)
alpha3 = parts[1];
String languageSubtag = parts[2];
if (languageSubtag.length() == 0) {
languageSubtag = alpha3;
}
String[] english = parts[3].split(";");
toSource.put(languageSubtag, languageSubtag.length() == 2 ? Source.ISO_639_1 : Source.ISO_639_2);
if (!isValid(languageSubtag)) {
// we don't have it already,
// System.out.println("Adding2: " + alpha3 + "\t" + languageSubtag + "\t" + Arrays.asList(english));
if (languageSubtag.length() == 2) {
toAlpha3.put(languageSubtag, alpha3);
fromAlpha3.put(alpha3, languageSubtag);
}
toScope.put(languageSubtag, Scope.Collection);
toType.put(languageSubtag, Type.Special);
toNames.putAll(languageSubtag, Arrays.asList(english));
}
// skip inverted name for now
}
in.close();
Map<String, String> toHeirarchyTemp = new TreeMap<String, String>();
in = CldrUtility.getUTF8Data("external/Iso639-5.html");
String lastCode = null;
int column = 0;
boolean lastAttributeIsScope = false;
boolean lastElementIsTD = false;
boolean hadPop = true;
// if the table level is 1 (we are in the main table), then we look for <td>...</td><td>...</td>. That means
// that we have column 1 and column 2.
SimpleHtmlParser simple = new SimpleHtmlParser().setReader(in);
StringBuilder result = new StringBuilder();
main: while (true) {
SimpleHtmlParser.Type x = simple.next(result);
// System.out.println(column + "\t" + x + "\t" + result);
switch (x) {
case ELEMENT_START:
hadPop = false;
lastElementIsTD = false;
break;
case ELEMENT:
if (SimpleHtmlParser.equals("tr", result)) {
column = 0;
} else if (SimpleHtmlParser.equals("td", result)) {
lastElementIsTD = true;
}
break;
case ELEMENT_POP:
hadPop = true;
break;
case ELEMENT_END:
// if we get a POP and a TD, and we have column > 0, we increment
if (lastElementIsTD && hadPop && column > 0) {
++column;
}
break;
case ELEMENT_CONTENT:
/*
* <th scope="col">Identifier<br />Indicatif</th>
* <th scope="col">English name<br />Nom anglais</th>
* <th scope="col">French name<br />Nom français</th>
* <th scope="col">639-2</th>
* <th scope="col">Hierarchy<br />Hiérarchie</th>
* <th scope="col">Notes<br />Notes</th>
*
* <td scope="row">apa</td>
* <td>Apache languages</td>
* <td>apaches, langues</td>
* <td>language group<br />groupe de langues</td>
* <td>nai : xnd : ath : apa</td>
* <td>
* <br />
* </td>
*/
switch (column) {
case 1:
lastCode = result.toString();
break;
case 5:
String old = toHeirarchyTemp.get(lastCode);
toHeirarchyTemp.put(lastCode, old == null || old.length() == 0 ? result.toString().trim()
: old + " " + result.toString().trim());
break;
case 2:
break;
case 3:
break;
case 4:
break;
case 0:
break;
default:
break;
}
break;
case ATTRIBUTE:
lastAttributeIsScope = SimpleHtmlParser.equals("scope", result);
break;
case ATTRIBUTE_CONTENT:
if (lastAttributeIsScope && SimpleHtmlParser.equals("row", result)) {
column = 1;
}
break;
case QUOTE:
break;
case DONE:
break main;
}
}
in.close();
Pattern SPLIT_HEIRARCHY = PatternCache.get("\\s*:\\s*");
toHeirarchy = new TreeMap<String, List<String>>();
// for (String code : toHeirarchyTemp.keySet()) {
// System.out.println(code + " => " + toHeirarchyTemp.get(code));
// }
for (String code : toHeirarchyTemp.keySet()) {
String valueString = toHeirarchyTemp.get(code);
String[] values = SPLIT_HEIRARCHY.split(valueString);
for (String value : values) {
if (toScope.get(value) == null && toHeirarchyTemp.get(value) == null) {
throw new IllegalArgumentException("Unexpected value in heirarchy:\t" + value + "\t" + code
+ "\t" + valueString);
}
}
toHeirarchy.put(code, Arrays.asList(values));
}
// System.out.println("Size:\t" + toNames.size());
// make data unmodifiable, just to prevent mistakes
toAlpha3 = Collections.unmodifiableMap(toAlpha3);
fromAlpha3 = Collections.unmodifiableMap(fromAlpha3);
toBiblio3 = Collections.unmodifiableMap(toBiblio3);
fromBiblio3 = Collections.unmodifiableMap(fromBiblio3);
toScope = Collections.unmodifiableMap(toScope);
toType = Collections.unmodifiableMap(toType);
toHeirarchy = Collections.unmodifiableMap(toHeirarchy);
toNames.freeze();
toRetirements.freeze();
macro_encompassed.freeze();
} catch (IOException e) {
throw new ICUUncheckedIOException("Cannot parse file", e);
}
}
public static <T> T findMatchToPrefix(String prefix, T[] values) {
for (T x : values) {
if (x.toString().startsWith(prefix)) {
return x;
}
}
throw new IllegalArgumentException("Prefix <" + prefix + "> not found in " + Arrays.asList(values));
}
public static Set<String> getAvailable() {
if (toAlpha3 == null) {
getData();
}
return toNames.keySet();
}
public static String getMacroForEncompassed(String suffix) {
String prefix = encompassed_macro.get(suffix);
if (prefix != null)
return prefix;
if (suffix.equals("sgn"))
return null;
Set<String> names = toNames.getAll(suffix);
if (names == null)
return null;
for (String name : names) {
if (name.contains("Sign Language"))
return "sgn";
}
return null;
}
public static Set<String> getEncompassedForMacro(String prefix) {
return macro_encompassed.getAll(prefix);
}
public static Set<String> getMacros() {
return macro_encompassed.keySet();
}
public static Set<String> getEncompassed() {
return encompassed_macro.keySet();
}
}