blob: ee9081d6399fe4ace963fe03ebf19851880d0891 [file] [log] [blame]
/*
**********************************************************************
* Copyright (c) 2002-2011, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Mark Davis
**********************************************************************
*/
package org.unicode.cldr.util;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.unicode.cldr.tool.LikelySubtags;
import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.text.UnicodeSet;
public class LanguageTagParser {
/**
* @return Returns the language, or "" if none.
*/
public String getLanguage() {
return language;
}
/**
* @return Returns the script, or "" if none.
*/
public String getScript() {
return script;
}
/**
* @return Returns the region, or "" if none.
*/
public String getRegion() {
return region;
}
/**
* @return Returns the variants.
*/
public List<String> getVariants() {
return ImmutableList.copyOf(variants);
}
/**
* @return Returns the grandfathered flag
*/
public boolean isGrandfathered() {
return grandfathered;
}
/**
* @return Returns the extensions.
*/
@Deprecated
public Map<String, String> getExtensions() {
return OutputOption.ICU.convert(extensions);
}
/**
* @return Returns the localeExtensions.
*/
@Deprecated
public Map<String, String> getLocaleExtensions() {
return OutputOption.ICU.convert(localeExtensions);
}
/**
* @return Returns the extensions.
*/
public Map<String, List<String>> getExtensionsDetailed() {
return ImmutableMap.copyOf(extensions);
}
/**
* @return Returns the localeExtensions.
*/
public Map<String, List<String>> getLocaleExtensionsDetailed() {
return ImmutableMap.copyOf(localeExtensions);
}
/**
* @return Returns the original, preparsed language tag
*/
public String getOriginal() {
return original;
}
/**
* @return Returns the language-script (or language) part of a tag.
*/
public String getLanguageScript() {
if (script.length() != 0) return language + "_" + script;
return language;
}
/**
* @param in
* Collection of language tag strings
* @return Returns each of the language-script tags in the collection.
*/
public static Set<String> getLanguageScript(Collection<String> in) {
return getLanguageAndScript(in, null);
}
/**
* @param in
* Collection of language tag strings
* @return Returns each of the language-script tags in the collection.
*/
public static Set<String> getLanguageAndScript(Collection<String> in, Set<String> output) {
if (output == null) output = new TreeSet<String>();
LanguageTagParser lparser = new LanguageTagParser();
for (Iterator<String> it = in.iterator(); it.hasNext();) {
output.add(lparser.set(it.next()).getLanguageScript());
}
return output;
}
// private fields
private String original;
private boolean grandfathered = false;
private String language;
private String script;
private String region;
private Set<String> variants = new TreeSet<String>();
private Map<String, List<String>> extensions = new TreeMap<String, List<String>>(); // use tree map
private Map<String, List<String>> localeExtensions = new TreeMap<String, List<String>>();
private static final UnicodeSet ALPHA = new UnicodeSet("[a-zA-Z]").freeze();
private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze();
private static final UnicodeSet ALPHANUM = new UnicodeSet("[0-9a-zA-Z]").freeze();
private static final UnicodeSet EXTENSION_VALUE = new UnicodeSet("[0-9a-zA-Z/_]").freeze();
private static final UnicodeSet X = new UnicodeSet("[xX]").freeze();
private static final UnicodeSet ALPHA_MINUS_X = new UnicodeSet(ALPHA).removeAll(X).freeze();
private static StandardCodes standardCodes = StandardCodes.make();
private static final Set<String> grandfatheredCodes = standardCodes.getAvailableCodes("grandfathered");
private static final String separator = "-_"; // '-' alone for 3066bis language tags
private static final UnicodeSet SEPARATORS = new UnicodeSet().addAll(separator).freeze();
private static final Splitter SPLIT_BAR = Splitter.on(CharMatcher.anyOf(separator));
private static final Splitter SPLIT_COLON = Splitter.on(';');
private static final Splitter SPLIT_EQUAL = Splitter.on('=');
private static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
private static final Relation<R2<String, String>, String> BCP47_ALIASES = SDI.getBcp47Aliases();
/**
* Parses out a language tag, setting a number of fields that can subsequently be retrieved.
* If a private-use field is found, it is returned as the last extension.<br>
* This only checks for well-formedness (syntax), not for validity (subtags in registry). For the latter, see
* isValid.
*
* @param languageTag
* @return
*/
public LanguageTagParser set(String languageTag) {
if (languageTag.length() == 0) {
throw new IllegalArgumentException("Language tag cannot be empty");
}
languageTag = languageTag.toLowerCase(Locale.ROOT);
// clear everything out
language = region = script = "";
grandfathered = false;
variants.clear();
extensions.clear();
localeExtensions.clear();
original = languageTag;
int localeExtensionsPosition = languageTag.indexOf('@');
if (localeExtensionsPosition >= 0) {
final String localeExtensionsString = languageTag.substring(localeExtensionsPosition + 1);
for (String keyValue : SPLIT_COLON.split(localeExtensionsString)) {
final Iterator<String> keyValuePair = SPLIT_EQUAL.split(keyValue).iterator();
final String key = keyValuePair.next();
final String value = keyValuePair.next();
if (keyValuePair.hasNext() || !ALPHANUM.containsAll(key) || !EXTENSION_VALUE.containsAll(value)) {
throwError(keyValue, "Invalid key/value pair");
}
localeExtensions.put(key, SPLIT_BAR.splitToList(value));
}
languageTag = languageTag.substring(0, localeExtensionsPosition);
}
// first test for grandfathered
if (grandfatheredCodes.contains(languageTag)) {
language = languageTag;
grandfathered = true;
return this;
}
// each time we fetch a token, we check for length from 1..8, and all alphanum
StringTokenizer st = new StringTokenizer(languageTag, separator);
String subtag;
try {
subtag = getSubtag(st);
} catch (Exception e1) {
throw new IllegalArgumentException("Illegal language tag: " + languageTag, e1);
}
// check for private use (x-...) and return if so
if (subtag.equalsIgnoreCase("x")) {
getExtension(subtag, st, 1);
return this;
}
// check that language subtag is valid
if (!ALPHA.containsAll(subtag) || subtag.length() < 2) {
throwError(subtag, "Invalid language subtag");
}
try { // The try block is to catch the out-of-tokens case. Easier than checking each time.
language = subtag;
subtag = getSubtag(st); // prepare for next
// check for script, 4 letters
if (subtag.length() == 4 && ALPHA.containsAll(subtag)) {
script = subtag;
script = script.substring(0, 1).toUpperCase(Locale.ROOT)
+ script.substring(1);
subtag = getSubtag(st); // prepare for next
}
// check for region, 2 letters or 3 digits
if (subtag.length() == 2 && ALPHA.containsAll(subtag)
|| subtag.length() == 3 && DIGIT.containsAll(subtag)) {
region = subtag.toUpperCase(Locale.ENGLISH);
subtag = getSubtag(st); // prepare for next
}
// get variants: length > 4 or len=4 & starts with digit
while (isValidVariant(subtag)) {
variants.add(subtag);
subtag = getSubtag(st); // prepare for next
}
// get extensions: singleton '-' subtag (2-8 long)
while (subtag.length() == 1 && ALPHA_MINUS_X.contains(subtag)) {
subtag = getExtension(subtag, st, 2);
if (subtag == null) return this; // done
}
if (subtag.equalsIgnoreCase("x")) {
getExtension(subtag, st, 1);
return this;
}
// if we make it to this point, then we have an error
throwError(subtag, "Illegal subtag");
} catch (NoSuchElementException e) {
// this exception just means we ran out of tokens. That's ok, so we just return.
}
return this;
}
private boolean isValidVariant(String subtag) {
return subtag != null && ALPHANUM.containsAll(subtag)
&& (subtag.length() > 4 || subtag.length() == 4 && DIGIT.contains(subtag.charAt(0)));
}
/**
*
* @return true iff the language tag validates
*/
public boolean isValid() {
if (grandfathered) return true; // don't need further checking, since we already did so when parsing
if (!validates(language, "language")) return false;
if (!validates(script, "script")) return false;
if (!validates(region, "territory")) return false;
for (Iterator<String> it = variants.iterator(); it.hasNext();) {
if (!validates(it.next(), "variant")) return false;
}
return true; // passed the gauntlet
}
public enum Status {
WELL_FORMED, VALID, CANONICAL, MINIMAL
}
public Status getStatus(Set<String> errors) {
errors.clear();
if (!isValid()) {
return Status.WELL_FORMED;
// TODO, check the bcp47 extension codes also
}
Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo();
Map<String, Map<String, String>> languageInfo = StandardCodes.getLStreg().get("language");
if (aliasInfo.get("language").containsKey(language)) {
errors.add("Non-canonical language: " + language);
}
Map<String, String> lstrInfo = languageInfo.get(language);
if (lstrInfo != null) {
String scope = lstrInfo.get("Scope");
if ("collection".equals(scope)) {
errors.add("Collection language: " + language);
}
}
if (aliasInfo.get("script").containsKey(script)) {
errors.add("Non-canonical script: " + script);
}
if (aliasInfo.get("territory").containsKey(region)) {
errors.add("Non-canonical region: " + region);
}
if (!errors.isEmpty()) {
return Status.VALID;
}
String tag = language + (script.isEmpty() ? "" : "_" + script) + (region.isEmpty() ? "" : "_" + region);
String minimized = LikelySubtags.minimize(tag, SDI.getLikelySubtags(), false);
if (minimized == null) {
errors.add("No minimal data for:" + tag);
if (script.isEmpty() && region.isEmpty()) {
return Status.MINIMAL;
} else {
return Status.CANONICAL;
}
}
if (!tag.equals(minimized)) {
errors.add("Not minimal:" + tag + "-->" + minimized);
return Status.CANONICAL;
}
return Status.MINIMAL;
}
/**
* @param subtag
* @param type
* @return true if the subtag is empty, or if it is in the registry
*/
private boolean validates(String subtag, String type) {
return subtag.length() == 0 || standardCodes.getAvailableCodes(type).contains(subtag);
}
/**
* Internal method
*
* @param minLength
* TODO
*/
private String getExtension(String subtag, StringTokenizer st, int minLength) {
final String key = subtag;
if (extensions.containsKey(key)) {
throwError(subtag, "Can't have two extensions with the same key");
}
if (!st.hasMoreElements()) {
throwError(subtag, "Private Use / Extension requires subsequent subtag");
}
ImmutableList.Builder<String> result = ImmutableList.builder();
try {
while (st.hasMoreElements()) {
subtag = getSubtag(st);
if (subtag.length() < minLength) {
return subtag;
}
result.add(subtag);
}
return null;
} finally {
extensions.put(key, result.build());
}
}
/**
* Internal method
*/
private String getSubtag(StringTokenizer st) {
String result = st.nextToken();
if (result.length() < 1 || result.length() > 8) {
throwError(result, "Illegal length (must be 1..8)");
}
if (!ALPHANUM.containsAll(result)) {
throwError(result, "Illegal characters (" + new UnicodeSet().addAll(result).removeAll(ALPHANUM) + ")");
}
return result;
}
/**
* Internal method
*/
private void throwError(String subtag, String errorText) {
throw new IllegalArgumentException(errorText + ": " + subtag + " in " + original);
}
public LanguageTagParser setRegion(String region) {
this.region = region;
return this;
}
public LanguageTagParser setScript(String script) {
this.script = script;
return this;
}
public enum OutputOption {
ICU('_'),
BCP47('-');
final char separator;
final Joiner joiner;
private OutputOption(char separator) {
this.separator = separator;
joiner = Joiner.on(separator);
}
public Map<String, String> convert(Map<String, List<String>> mapToList) {
if (mapToList.isEmpty()) {
return Collections.emptyMap();
}
ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
for (Entry<String, List<String>> entry : mapToList.entrySet()) {
builder.put(entry.getKey(), joiner.join(entry.getValue()));
}
return builder.build();
}
}
public String toString() {
return toString(OutputOption.ICU);
}
public String toString(OutputOption oo) {
StringBuilder result = new StringBuilder(language); // optimize for the simple cases
if (this.script.length() != 0) result.append(oo.separator).append(script);
if (this.region.length() != 0) result.append(oo.separator).append(region);
if (this.variants.size() != 0) {
for (String variant : variants) {
result.append(oo.separator).append(oo != OutputOption.ICU ? variant : variant.toUpperCase(Locale.ROOT));
}
}
if (this.extensions.size() != 0) {
for (Entry<String, List<String>> extension : extensions.entrySet()) {
String key = extension.getKey();
String value = oo.joiner.join(extension.getValue());
result.append(oo.separator).append(key)
.append(oo.separator).append(value);
}
}
if (this.localeExtensions.size() != 0) {
if (oo == OutputOption.BCP47) {
throw new IllegalArgumentException("Cannot represent as BCP47 without canonicalizing first");
}
result.append('@');
for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
String key = extension.getKey();
String value = oo.joiner.join(extension.getValue());
result.append(oo != OutputOption.ICU ? key : key.toUpperCase(Locale.ROOT))
.append('=').append(oo != OutputOption.ICU ? value : value.toUpperCase(Locale.ROOT));
}
}
return result.toString();
}
/**
* Return just the language, script, and region (no variants or extensions)
* @return
*/
public String toLSR() {
String result = language; // optimize for the simple cases
if (this.script.length() != 0) result += "_" + script;
if (this.region.length() != 0) result += "_" + region;
return result;
}
public enum Fields {
LANGUAGE, SCRIPT, REGION, VARIANTS
};
public static Set<Fields> LANGUAGE_SCRIPT = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT));
public static Set<Fields> LANGUAGE_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.REGION));
public static Set<Fields> LANGUAGE_SCRIPT_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE,
Fields.SCRIPT, Fields.REGION));
public String toString(Set<Fields> selection) {
String result = language;
if (selection.contains(Fields.SCRIPT) && script.length() != 0) result += "_" + script;
if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + region;
if (selection.contains(Fields.VARIANTS) && variants.size() != 0) {
for (String variant : (Collection<String>) variants) {
result += "_" + variant;
}
}
return result;
}
public LanguageTagParser setLanguage(String language) {
if (SEPARATORS.containsSome(language)) {
String oldScript = script;
String oldRegion = region;
Set<String> oldVariants = variants;
set(language);
if (script.length() == 0) {
script = oldScript;
}
if (region.length() == 0) {
region = oldRegion;
}
if (oldVariants.size() != 0) {
variants = oldVariants;
}
} else {
this.language = language;
}
return this;
}
public LanguageTagParser setLocaleExtensions(Map<String, String> localeExtensions) {
this.localeExtensions = expandMap(localeExtensions, 1, Integer.MAX_VALUE);
return this;
}
public LanguageTagParser setVariants(Collection<String> newVariants) {
for (String variant : newVariants) {
if (!isValidVariant(variant)) {
throw new IllegalArgumentException("Illegal variant: " + variant);
}
}
variants.clear();
variants.addAll(newVariants);
return this;
}
static final Pattern EXTENSION_PATTERN = PatternCache.get("([0-9a-zA-Z]{2,8}(-[0-9a-zA-Z]{2,8})*)?");
public LanguageTagParser setExtensions(Map<String, String> newExtensions) {
this.extensions = expandMap(newExtensions, 2, 8);
return this;
}
public static String getSimpleParent(String s) {
int lastBar = s.lastIndexOf('_');
return lastBar >= 0 ? s.substring(0, lastBar) : "";
}
private Map<String, List<String>> expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength) {
if (newLocaleExtensions.isEmpty()) {
return Collections.emptyMap();
}
ImmutableMap.Builder<String, List<String>> result = ImmutableMap.builder();
for (Entry<String, String> entry : newLocaleExtensions.entrySet()) {
result.put(entry.getKey(), split(entry.getValue(), minLength, maxLength));
}
return result.build();
}
private List<String> split(String value, int minLength, int maxLength) {
List<String> values = SPLIT_BAR.splitToList(value);
for (String s : values) {
if (s.length() < minLength || s.length() > maxLength) {
throw new IllegalArgumentException("Illegal subtag length for: " + s);
}
if (!ALPHANUM.contains(s)) {
throw new IllegalArgumentException("Illegal locale character in: " + s);
}
}
return values;
}
}