| /* |
| ********************************************************************** |
| * Copyright (c) 2002-2011, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Author: Mark Davis |
| ********************************************************************** |
| */ |
| package org.unicode.cldr.util; |
| |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.EnumSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.NoSuchElementException; |
| import java.util.Set; |
| import java.util.StringTokenizer; |
| import java.util.TreeMap; |
| import java.util.TreeSet; |
| import java.util.regex.Pattern; |
| |
| import org.unicode.cldr.tool.LikelySubtags; |
| |
| import com.google.common.base.CharMatcher; |
| import com.google.common.base.Joiner; |
| import com.google.common.base.Splitter; |
| import com.google.common.collect.ImmutableList; |
| import com.google.common.collect.ImmutableMap; |
| import com.ibm.icu.impl.Relation; |
| import com.ibm.icu.impl.Row.R2; |
| import com.ibm.icu.text.UnicodeSet; |
| |
| public class LanguageTagParser { |
| /** |
| * @return Returns the language, or "" if none. |
| */ |
| public String getLanguage() { |
| return language; |
| } |
| |
| /** |
| * @return Returns the script, or "" if none. |
| */ |
| public String getScript() { |
| return script; |
| } |
| |
| /** |
| * @return Returns the region, or "" if none. |
| */ |
| public String getRegion() { |
| return region; |
| } |
| |
| /** |
| * @return Returns the variants. |
| */ |
| public List<String> getVariants() { |
| return ImmutableList.copyOf(variants); |
| } |
| |
| /** |
| * @return Returns the grandfathered flag |
| */ |
| public boolean isGrandfathered() { |
| return grandfathered; |
| } |
| |
| /** |
| * @return Returns the extensions. |
| */ |
| @Deprecated |
| public Map<String, String> getExtensions() { |
| return OutputOption.ICU.convert(extensions); |
| } |
| |
| /** |
| * @return Returns the localeExtensions. |
| */ |
| @Deprecated |
| public Map<String, String> getLocaleExtensions() { |
| return OutputOption.ICU.convert(localeExtensions); |
| } |
| |
| /** |
| * @return Returns the extensions. |
| */ |
| public Map<String, List<String>> getExtensionsDetailed() { |
| return ImmutableMap.copyOf(extensions); |
| } |
| |
| /** |
| * @return Returns the localeExtensions. |
| */ |
| public Map<String, List<String>> getLocaleExtensionsDetailed() { |
| return ImmutableMap.copyOf(localeExtensions); |
| } |
| |
| /** |
| * @return Returns the original, preparsed language tag |
| */ |
| public String getOriginal() { |
| return original; |
| } |
| |
| /** |
| * @return Returns the language-script (or language) part of a tag. |
| */ |
| public String getLanguageScript() { |
| if (script.length() != 0) return language + "_" + script; |
| return language; |
| } |
| |
| /** |
| * @param in |
| * Collection of language tag strings |
| * @return Returns each of the language-script tags in the collection. |
| */ |
| public static Set<String> getLanguageScript(Collection<String> in) { |
| return getLanguageAndScript(in, null); |
| } |
| |
| /** |
| * @param in |
| * Collection of language tag strings |
| * @return Returns each of the language-script tags in the collection. |
| */ |
| public static Set<String> getLanguageAndScript(Collection<String> in, Set<String> output) { |
| if (output == null) output = new TreeSet<String>(); |
| LanguageTagParser lparser = new LanguageTagParser(); |
| for (Iterator<String> it = in.iterator(); it.hasNext();) { |
| output.add(lparser.set(it.next()).getLanguageScript()); |
| } |
| return output; |
| } |
| |
| // private fields |
| |
| private String original; |
| private boolean grandfathered = false; |
| private String language; |
| private String script; |
| private String region; |
| private Set<String> variants = new TreeSet<String>(); |
| private Map<String, List<String>> extensions = new TreeMap<String, List<String>>(); // use tree map |
| private Map<String, List<String>> localeExtensions = new TreeMap<String, List<String>>(); |
| |
| private static final UnicodeSet ALPHA = new UnicodeSet("[a-zA-Z]").freeze(); |
| private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze(); |
| private static final UnicodeSet ALPHANUM = new UnicodeSet("[0-9a-zA-Z]").freeze(); |
| private static final UnicodeSet EXTENSION_VALUE = new UnicodeSet("[0-9a-zA-Z/_]").freeze(); |
| private static final UnicodeSet X = new UnicodeSet("[xX]").freeze(); |
| private static final UnicodeSet ALPHA_MINUS_X = new UnicodeSet(ALPHA).removeAll(X).freeze(); |
| private static StandardCodes standardCodes = StandardCodes.make(); |
| private static final Set<String> grandfatheredCodes = standardCodes.getAvailableCodes("grandfathered"); |
| private static final String separator = "-_"; // '-' alone for 3066bis language tags |
| private static final UnicodeSet SEPARATORS = new UnicodeSet().addAll(separator).freeze(); |
| private static final Splitter SPLIT_BAR = Splitter.on(CharMatcher.anyOf(separator)); |
| private static final Splitter SPLIT_COLON = Splitter.on(';'); |
| private static final Splitter SPLIT_EQUAL = Splitter.on('='); |
| private static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance(); |
| private static final Relation<R2<String, String>, String> BCP47_ALIASES = SDI.getBcp47Aliases(); |
| |
| /** |
| * Parses out a language tag, setting a number of fields that can subsequently be retrieved. |
| * If a private-use field is found, it is returned as the last extension.<br> |
| * This only checks for well-formedness (syntax), not for validity (subtags in registry). For the latter, see |
| * isValid. |
| * |
| * @param languageTag |
| * @return |
| */ |
| public LanguageTagParser set(String languageTag) { |
| if (languageTag.length() == 0) { |
| throw new IllegalArgumentException("Language tag cannot be empty"); |
| } |
| languageTag = languageTag.toLowerCase(Locale.ROOT); |
| |
| // clear everything out |
| language = region = script = ""; |
| grandfathered = false; |
| variants.clear(); |
| extensions.clear(); |
| localeExtensions.clear(); |
| original = languageTag; |
| int localeExtensionsPosition = languageTag.indexOf('@'); |
| if (localeExtensionsPosition >= 0) { |
| final String localeExtensionsString = languageTag.substring(localeExtensionsPosition + 1); |
| for (String keyValue : SPLIT_COLON.split(localeExtensionsString)) { |
| final Iterator<String> keyValuePair = SPLIT_EQUAL.split(keyValue).iterator(); |
| final String key = keyValuePair.next(); |
| final String value = keyValuePair.next(); |
| if (keyValuePair.hasNext() || !ALPHANUM.containsAll(key) || !EXTENSION_VALUE.containsAll(value)) { |
| throwError(keyValue, "Invalid key/value pair"); |
| } |
| localeExtensions.put(key, SPLIT_BAR.splitToList(value)); |
| } |
| languageTag = languageTag.substring(0, localeExtensionsPosition); |
| } |
| |
| // first test for grandfathered |
| if (grandfatheredCodes.contains(languageTag)) { |
| language = languageTag; |
| grandfathered = true; |
| return this; |
| } |
| |
| // each time we fetch a token, we check for length from 1..8, and all alphanum |
| StringTokenizer st = new StringTokenizer(languageTag, separator); |
| String subtag; |
| try { |
| subtag = getSubtag(st); |
| } catch (Exception e1) { |
| throw new IllegalArgumentException("Illegal language tag: " + languageTag, e1); |
| } |
| |
| // check for private use (x-...) and return if so |
| if (subtag.equalsIgnoreCase("x")) { |
| getExtension(subtag, st, 1); |
| return this; |
| } |
| |
| // check that language subtag is valid |
| if (!ALPHA.containsAll(subtag) || subtag.length() < 2) { |
| throwError(subtag, "Invalid language subtag"); |
| } |
| try { // The try block is to catch the out-of-tokens case. Easier than checking each time. |
| language = subtag; |
| subtag = getSubtag(st); // prepare for next |
| |
| // check for script, 4 letters |
| if (subtag.length() == 4 && ALPHA.containsAll(subtag)) { |
| script = subtag; |
| script = script.substring(0, 1).toUpperCase(Locale.ROOT) |
| + script.substring(1); |
| subtag = getSubtag(st); // prepare for next |
| } |
| |
| // check for region, 2 letters or 3 digits |
| if (subtag.length() == 2 && ALPHA.containsAll(subtag) |
| || subtag.length() == 3 && DIGIT.containsAll(subtag)) { |
| region = subtag.toUpperCase(Locale.ENGLISH); |
| subtag = getSubtag(st); // prepare for next |
| } |
| |
| // get variants: length > 4 or len=4 & starts with digit |
| while (isValidVariant(subtag)) { |
| variants.add(subtag); |
| subtag = getSubtag(st); // prepare for next |
| } |
| |
| // get extensions: singleton '-' subtag (2-8 long) |
| while (subtag.length() == 1 && ALPHA_MINUS_X.contains(subtag)) { |
| subtag = getExtension(subtag, st, 2); |
| if (subtag == null) return this; // done |
| } |
| |
| if (subtag.equalsIgnoreCase("x")) { |
| getExtension(subtag, st, 1); |
| return this; |
| } |
| |
| // if we make it to this point, then we have an error |
| throwError(subtag, "Illegal subtag"); |
| |
| } catch (NoSuchElementException e) { |
| // this exception just means we ran out of tokens. That's ok, so we just return. |
| } |
| return this; |
| } |
| |
| private boolean isValidVariant(String subtag) { |
| return subtag != null && ALPHANUM.containsAll(subtag) |
| && (subtag.length() > 4 || subtag.length() == 4 && DIGIT.contains(subtag.charAt(0))); |
| } |
| |
| /** |
| * |
| * @return true iff the language tag validates |
| */ |
| public boolean isValid() { |
| if (grandfathered) return true; // don't need further checking, since we already did so when parsing |
| if (!validates(language, "language")) return false; |
| if (!validates(script, "script")) return false; |
| if (!validates(region, "territory")) return false; |
| for (Iterator<String> it = variants.iterator(); it.hasNext();) { |
| if (!validates(it.next(), "variant")) return false; |
| } |
| return true; // passed the gauntlet |
| } |
| |
/**
 * Result levels reported by getStatus, listed from weakest to strongest.
 */
public enum Status {
    WELL_FORMED,
    VALID,
    CANONICAL,
    MINIMAL
}
| |
| public Status getStatus(Set<String> errors) { |
| errors.clear(); |
| if (!isValid()) { |
| return Status.WELL_FORMED; |
| // TODO, check the bcp47 extension codes also |
| } |
| Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo(); |
| Map<String, Map<String, String>> languageInfo = StandardCodes.getLStreg().get("language"); |
| |
| if (aliasInfo.get("language").containsKey(language)) { |
| errors.add("Non-canonical language: " + language); |
| } |
| Map<String, String> lstrInfo = languageInfo.get(language); |
| if (lstrInfo != null) { |
| String scope = lstrInfo.get("Scope"); |
| if ("collection".equals(scope)) { |
| errors.add("Collection language: " + language); |
| } |
| } |
| if (aliasInfo.get("script").containsKey(script)) { |
| errors.add("Non-canonical script: " + script); |
| } |
| if (aliasInfo.get("territory").containsKey(region)) { |
| errors.add("Non-canonical region: " + region); |
| } |
| if (!errors.isEmpty()) { |
| return Status.VALID; |
| } |
| String tag = language + (script.isEmpty() ? "" : "_" + script) + (region.isEmpty() ? "" : "_" + region); |
| String minimized = LikelySubtags.minimize(tag, SDI.getLikelySubtags(), false); |
| if (minimized == null) { |
| errors.add("No minimal data for:" + tag); |
| if (script.isEmpty() && region.isEmpty()) { |
| return Status.MINIMAL; |
| } else { |
| return Status.CANONICAL; |
| } |
| } |
| if (!tag.equals(minimized)) { |
| errors.add("Not minimal:" + tag + "-->" + minimized); |
| return Status.CANONICAL; |
| } |
| return Status.MINIMAL; |
| } |
| |
| /** |
| * @param subtag |
| * @param type |
| * @return true if the subtag is empty, or if it is in the registry |
| */ |
| private boolean validates(String subtag, String type) { |
| return subtag.length() == 0 || standardCodes.getAvailableCodes(type).contains(subtag); |
| } |
| |
| /** |
| * Internal method |
| * |
| * @param minLength |
| * TODO |
| */ |
| private String getExtension(String subtag, StringTokenizer st, int minLength) { |
| final String key = subtag; |
| if (extensions.containsKey(key)) { |
| throwError(subtag, "Can't have two extensions with the same key"); |
| } |
| if (!st.hasMoreElements()) { |
| throwError(subtag, "Private Use / Extension requires subsequent subtag"); |
| } |
| ImmutableList.Builder<String> result = ImmutableList.builder(); |
| try { |
| while (st.hasMoreElements()) { |
| subtag = getSubtag(st); |
| if (subtag.length() < minLength) { |
| return subtag; |
| } |
| result.add(subtag); |
| } |
| return null; |
| } finally { |
| extensions.put(key, result.build()); |
| } |
| } |
| |
| /** |
| * Internal method |
| */ |
| private String getSubtag(StringTokenizer st) { |
| String result = st.nextToken(); |
| if (result.length() < 1 || result.length() > 8) { |
| throwError(result, "Illegal length (must be 1..8)"); |
| } |
| if (!ALPHANUM.containsAll(result)) { |
| throwError(result, "Illegal characters (" + new UnicodeSet().addAll(result).removeAll(ALPHANUM) + ")"); |
| } |
| return result; |
| } |
| |
| /** |
| * Internal method |
| */ |
| private void throwError(String subtag, String errorText) { |
| throw new IllegalArgumentException(errorText + ": " + subtag + " in " + original); |
| } |
| |
| public LanguageTagParser setRegion(String region) { |
| this.region = region; |
| return this; |
| } |
| |
| public LanguageTagParser setScript(String script) { |
| this.script = script; |
| return this; |
| } |
| |
| public enum OutputOption { |
| ICU('_'), |
| BCP47('-'); |
| final char separator; |
| final Joiner joiner; |
| private OutputOption(char separator) { |
| this.separator = separator; |
| joiner = Joiner.on(separator); |
| } |
| public Map<String, String> convert(Map<String, List<String>> mapToList) { |
| if (mapToList.isEmpty()) { |
| return Collections.emptyMap(); |
| } |
| ImmutableMap.Builder<String, String> builder = ImmutableMap.builder(); |
| for (Entry<String, List<String>> entry : mapToList.entrySet()) { |
| builder.put(entry.getKey(), joiner.join(entry.getValue())); |
| } |
| return builder.build(); |
| } |
| } |
| |
| public String toString() { |
| return toString(OutputOption.ICU); |
| } |
| |
| public String toString(OutputOption oo) { |
| StringBuilder result = new StringBuilder(language); // optimize for the simple cases |
| if (this.script.length() != 0) result.append(oo.separator).append(script); |
| if (this.region.length() != 0) result.append(oo.separator).append(region); |
| if (this.variants.size() != 0) { |
| for (String variant : variants) { |
| result.append(oo.separator).append(oo != OutputOption.ICU ? variant : variant.toUpperCase(Locale.ROOT)); |
| } |
| } |
| if (this.extensions.size() != 0) { |
| for (Entry<String, List<String>> extension : extensions.entrySet()) { |
| String key = extension.getKey(); |
| String value = oo.joiner.join(extension.getValue()); |
| result.append(oo.separator).append(key) |
| .append(oo.separator).append(value); |
| } |
| } |
| if (this.localeExtensions.size() != 0) { |
| if (oo == OutputOption.BCP47) { |
| throw new IllegalArgumentException("Cannot represent as BCP47 without canonicalizing first"); |
| } |
| result.append('@'); |
| for (Entry<String, List<String>> extension : localeExtensions.entrySet()) { |
| String key = extension.getKey(); |
| String value = oo.joiner.join(extension.getValue()); |
| result.append(oo != OutputOption.ICU ? key : key.toUpperCase(Locale.ROOT)) |
| .append('=').append(oo != OutputOption.ICU ? value : value.toUpperCase(Locale.ROOT)); |
| } |
| } |
| return result.toString(); |
| } |
| |
| /** |
| * Return just the language, script, and region (no variants or extensions) |
| * @return |
| */ |
| public String toLSR() { |
| String result = language; // optimize for the simple cases |
| if (this.script.length() != 0) result += "_" + script; |
| if (this.region.length() != 0) result += "_" + region; |
| return result; |
| } |
| |
/**
 * Parts of a tag that can be selected for output by toString(Set).
 */
public enum Fields {
    LANGUAGE,
    SCRIPT,
    REGION,
    VARIANTS
}
| |
| public static Set<Fields> LANGUAGE_SCRIPT = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT)); |
| public static Set<Fields> LANGUAGE_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.REGION)); |
| public static Set<Fields> LANGUAGE_SCRIPT_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, |
| Fields.SCRIPT, Fields.REGION)); |
| |
| public String toString(Set<Fields> selection) { |
| String result = language; |
| if (selection.contains(Fields.SCRIPT) && script.length() != 0) result += "_" + script; |
| if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + region; |
| if (selection.contains(Fields.VARIANTS) && variants.size() != 0) { |
| for (String variant : (Collection<String>) variants) { |
| result += "_" + variant; |
| } |
| } |
| return result; |
| } |
| |
| public LanguageTagParser setLanguage(String language) { |
| if (SEPARATORS.containsSome(language)) { |
| String oldScript = script; |
| String oldRegion = region; |
| Set<String> oldVariants = variants; |
| set(language); |
| if (script.length() == 0) { |
| script = oldScript; |
| } |
| if (region.length() == 0) { |
| region = oldRegion; |
| } |
| if (oldVariants.size() != 0) { |
| variants = oldVariants; |
| } |
| } else { |
| this.language = language; |
| } |
| return this; |
| } |
| |
| public LanguageTagParser setLocaleExtensions(Map<String, String> localeExtensions) { |
| this.localeExtensions = expandMap(localeExtensions, 1, Integer.MAX_VALUE); |
| return this; |
| } |
| |
| public LanguageTagParser setVariants(Collection<String> newVariants) { |
| for (String variant : newVariants) { |
| if (!isValidVariant(variant)) { |
| throw new IllegalArgumentException("Illegal variant: " + variant); |
| } |
| } |
| variants.clear(); |
| variants.addAll(newVariants); |
| return this; |
| } |
| |
| static final Pattern EXTENSION_PATTERN = PatternCache.get("([0-9a-zA-Z]{2,8}(-[0-9a-zA-Z]{2,8})*)?"); |
| |
| public LanguageTagParser setExtensions(Map<String, String> newExtensions) { |
| this.extensions = expandMap(newExtensions, 2, 8); |
| return this; |
| } |
| |
| public static String getSimpleParent(String s) { |
| int lastBar = s.lastIndexOf('_'); |
| return lastBar >= 0 ? s.substring(0, lastBar) : ""; |
| } |
| |
| private Map<String, List<String>> expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength) { |
| if (newLocaleExtensions.isEmpty()) { |
| return Collections.emptyMap(); |
| } |
| ImmutableMap.Builder<String, List<String>> result = ImmutableMap.builder(); |
| for (Entry<String, String> entry : newLocaleExtensions.entrySet()) { |
| result.put(entry.getKey(), split(entry.getValue(), minLength, maxLength)); |
| } |
| return result.build(); |
| } |
| |
| private List<String> split(String value, int minLength, int maxLength) { |
| List<String> values = SPLIT_BAR.splitToList(value); |
| for (String s : values) { |
| if (s.length() < minLength || s.length() > maxLength) { |
| throw new IllegalArgumentException("Illegal subtag length for: " + s); |
| } |
| if (!ALPHANUM.contains(s)) { |
| throw new IllegalArgumentException("Illegal locale character in: " + s); |
| } |
| } |
| return values; |
| } |
| } |