| // © 2017 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html#License |
| package com.ibm.icu.impl.number.parse; |
| |
| import java.util.EnumMap; |
| import java.util.Map; |
| |
| import com.ibm.icu.text.UnicodeSet; |
| |
| /** |
| * This class statically initializes UnicodeSets useful for number parsing. Microbenchmarks show this to |
| * bring a very sizeable performance boost. |
| * |
| * IMPORTANT ASSUMPTION: All of the sets contain code points (no strings) and they are all case-folded. |
| * If this assumption were ever broken, logic in classes such as SymbolMatcher would need to be updated |
| * in order to return well-formed sets upon calls to getLeadCodePoints(). |
| * |
| * @author sffc |
| */ |
| public class UnicodeSetStaticCache { |
| public static enum Key { |
| // Ignorables |
| BIDI, |
| WHITESPACE, |
| DEFAULT_IGNORABLES, |
| STRICT_IGNORABLES, |
| |
| // Separators |
| // Notes: |
| // - COMMA is a superset of STRICT_COMMA |
| // - PERIOD is a superset of SCRICT_PERIOD |
| // - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS |
| // - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and OTHER_GRP_SEPARATORS |
| COMMA, |
| PERIOD, |
| STRICT_COMMA, |
| STRICT_PERIOD, |
| OTHER_GROUPING_SEPARATORS, |
| ALL_SEPARATORS, |
| STRICT_ALL_SEPARATORS, |
| |
| // Symbols |
| // TODO: NaN? |
| MINUS_SIGN, |
| PLUS_SIGN, |
| PERCENT_SIGN, |
| PERMILLE_SIGN, |
| INFINITY, |
| |
| // Other |
| DIGITS, |
| NAN_LEAD, |
| SCIENTIFIC_LEAD, |
| CWCF, // TODO: Check if this is being used and remove it if not. |
| |
| // Combined Separators with Digits (for lead code points) |
| DIGITS_OR_ALL_SEPARATORS, |
| DIGITS_OR_STRICT_ALL_SEPARATORS, |
| }; |
| |
| private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<Key, UnicodeSet>(Key.class); |
| |
| public static UnicodeSet get(Key key) { |
| return unicodeSets.get(key); |
| } |
| |
| public static Key chooseFrom(String str, Key key1) { |
| return get(key1).contains(str) ? key1 : null; |
| } |
| |
| public static Key chooseFrom(String str, Key key1, Key key2) { |
| return get(key1).contains(str) ? key1 : chooseFrom(str, key2); |
| } |
| |
| public static Key chooseFrom(String str, Key key1, Key key2, Key key3) { |
| return get(key1).contains(str) ? key1 : chooseFrom(str, key2, key3); |
| } |
| |
| private static UnicodeSet computeUnion(Key k1, Key k2) { |
| return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze(); |
| } |
| |
| private static UnicodeSet computeUnion(Key k1, Key k2, Key k3) { |
| return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze(); |
| } |
| |
| static { |
| // BiDi characters are skipped over and ignored at any point in the string, even in strict mode. |
| unicodeSets.put(Key.BIDI, new UnicodeSet("[[\\u200E\\u200F\\u061C]]").freeze()); |
| |
| // This set was decided after discussion with icu-design@. See ticket #13309. |
| // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). |
| unicodeSets.put(Key.WHITESPACE, new UnicodeSet("[[:Zs:][\\u0009]]").freeze()); |
| |
| unicodeSets.put(Key.DEFAULT_IGNORABLES, computeUnion(Key.BIDI, Key.WHITESPACE)); |
| unicodeSets.put(Key.STRICT_IGNORABLES, get(Key.BIDI)); |
| |
| // TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while. |
| unicodeSets.put(Key.COMMA, new UnicodeSet("[,،٫、︐︑﹐﹑,、]").freeze()); |
| unicodeSets.put(Key.STRICT_COMMA, new UnicodeSet("[,٫︐﹐,]").freeze()); |
| unicodeSets.put(Key.PERIOD, new UnicodeSet("[.․。︒﹒.。]").freeze()); |
| unicodeSets.put(Key.STRICT_PERIOD, new UnicodeSet("[.․﹒.。]").freeze()); |
| unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS, |
| new UnicodeSet("['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]").freeze()); |
| unicodeSets.put(Key.ALL_SEPARATORS, |
| computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS)); |
| unicodeSets.put(Key.STRICT_ALL_SEPARATORS, |
| computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS)); |
| |
| unicodeSets.put(Key.MINUS_SIGN, new UnicodeSet("[-⁻₋−➖﹣-]").freeze()); |
| unicodeSets.put(Key.PLUS_SIGN, new UnicodeSet("[+⁺₊➕﬩﹢+]").freeze()); |
| |
| // TODO: Fill in the next three sets. |
| unicodeSets.put(Key.PERCENT_SIGN, new UnicodeSet("[%٪]").freeze()); |
| unicodeSets.put(Key.PERMILLE_SIGN, new UnicodeSet("[‰؉]").freeze()); |
| unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze()); |
| |
| unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze()); |
| unicodeSets.put(Key.NAN_LEAD, |
| new UnicodeSet("[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]") |
| .freeze()); |
| unicodeSets.put(Key.SCIENTIFIC_LEAD, new UnicodeSet("[Ee×·е\u0627]").freeze()); |
| unicodeSets.put(Key.CWCF, new UnicodeSet("[:CWCF:]").freeze()); |
| |
| unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS)); |
| unicodeSets.put(Key.DIGITS_OR_STRICT_ALL_SEPARATORS, |
| computeUnion(Key.DIGITS, Key.STRICT_ALL_SEPARATORS)); |
| } |
| } |