| /* GENERATED SOURCE. DO NOT MODIFY. */ |
| // © 2017 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html#License |
| package android.icu.impl.number; |
| |
| import java.math.BigDecimal; |
| import java.math.MathContext; |
| import java.text.ParseException; |
| import java.text.ParsePosition; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.Set; |
| import java.util.concurrent.ConcurrentHashMap; |
| |
| import android.icu.impl.StandardPlural; |
| import android.icu.impl.TextTrieMap; |
| import android.icu.lang.UCharacter; |
| import android.icu.text.CurrencyPluralInfo; |
| import android.icu.text.DecimalFormatSymbols; |
| import android.icu.text.NumberFormat; |
| import android.icu.text.UnicodeSet; |
| import android.icu.util.Currency; |
| import android.icu.util.Currency.CurrencyStringInfo; |
| import android.icu.util.CurrencyAmount; |
| import android.icu.util.ULocale; |
| |
| /** |
| * A parser designed to convert an arbitrary human-generated string to its best representation as a |
| * number: a long, a BigInteger, or a BigDecimal. |
| * |
| * <p>The parser may traverse multiple parse paths in the same string if there is ambiguity. For |
| * example, the string "12,345.67" has two main interpretations: it could be "12.345" in a locale |
| * that uses '.' as the grouping separator, or it could be "12345.67" in a locale that uses ',' as |
| * the grouping separator. Since the second option has a longer parse path (consumes more of the |
| * input string), the parser will accept the second option. |
| * @hide Only a subset of ICU is exposed in Android |
| */ |
| public class Parse { |
| |
/**
 * Selects the rule set that the parser applies when reading a string.
 */
public enum ParseMode {
  /**
   * Use lenient mode to accept malformed user input. Heuristics are applied in an attempt to
   * parse through typographical errors in the string.
   */
  LENIENT,

  /**
   * Use strict mode to require well-formed input. Compared with lenient mode:
   *
   * <ul>
   *   <li>Grouping widths must match the grouping settings. For example, "12,3,45" will fail if
   *       the grouping width is 3, as in the pattern "#,##0".
   *   <li>A complete prefix and suffix must be present. For example, with the pattern
   *       "{#};(#)", the strings "{123}" and "(123)" match, whereas "{123", "123}", and "123"
   *       all fail. (Lenient mode accepts the latter three.)
   *   <li>Whitespace may not appear at arbitrary places in the string. Lenient mode tolerates
   *       arbitrary whitespace before and after prefixes and exponent separators.
   *   <li>Leading grouping separators, as in ",123", are rejected.
   *   <li>Minus and plus signs are accepted only where the pattern specifies them. In lenient
   *       mode a plus or minus sign can always precede a number.
   *   <li>Fewer characters are recognized as a decimal or grouping separator.
   *   <li><strong>If currency parsing is enabled,</strong> a currency must appear only where
   *       either the current pattern string or a valid pattern string for the current locale
   *       places it. For example, with the pattern "¤0.00", "$1.23" matches but "1.23$" does
   *       not.
   * </ul>
   */
  STRICT,

  /**
   * Use fast mode in applications that do not need prefixes and suffixes to match.
   *
   * <p>Besides ignoring prefixes and suffixes, fast mode makes these optimizations:
   *
   * <ul>
   *   <li>Digit strings from {@link DecimalFormatSymbols} are ignored; only the code point's
   *       Unicode digit property is consulted. Behavior is unchanged unless custom digit
   *       strings are in use.
   *   <li>A "greedy" strategy replaces the traversal of multiple possible parse paths, so fast
   *       mode might reject strings that lenient or strict mode would accept. Because prefix
   *       and suffix strings are ignored, this matters only when custom symbols are in use.
   * </ul>
   */
  FAST
}
| |
/**
 * The available strategies for telling grouping separators apart from decimal separators while
 * parsing.
 */
public enum GroupingMode {
  /**
   * The more lenient strategy: decimal equivalents are accepted as decimals, and if that fails,
   * every equivalence class (periods, commas, and whitespace-like) is accepted as grouping.
   *
   * <p>For example, if the formatter's current locale is <em>fr-FR</em>, then "1.234" will parse
   * as 1234, even though <em>fr-FR</em> does not use a period as the grouping separator.
   */
  DEFAULT,

  /**
   * The stricter strategy: decimal equivalents are accepted as decimals and grouping equivalents
   * are accepted as grouping.
   *
   * <p>For example, if the formatter's current locale is <em>fr-FR</em>, then "1.234" will fail
   * to parse since <em>fr-FR</em> does not use a period as the grouping separator.
   */
  RESTRICTED
}
| |
| /** |
| * @see Parse#parse(String, ParsePosition, ParseMode, boolean, boolean, DecimalFormatProperties, |
| * DecimalFormatSymbols) |
| */ |
| private static enum StateName { |
| BEFORE_PREFIX, |
| AFTER_PREFIX, |
| AFTER_INTEGER_DIGIT, |
| AFTER_FRACTION_DIGIT, |
| AFTER_EXPONENT_SEPARATOR, |
| AFTER_EXPONENT_DIGIT, |
| BEFORE_SUFFIX, |
| BEFORE_SUFFIX_SEEN_EXPONENT, |
| AFTER_SUFFIX, |
| INSIDE_CURRENCY, |
| INSIDE_DIGIT, |
| INSIDE_STRING, |
| INSIDE_AFFIX_PATTERN; |
| } |
| |
| // This set was decided after discussion with icu-design@. See ticket #13309. |
| // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). |
| private static final UnicodeSet UNISET_WHITESPACE = |
| new UnicodeSet("[[:Zs:][\\u0009]]").freeze(); |
| |
| // BiDi characters are skipped over and ignored at any point in the string, even in strict mode. |
| private static final UnicodeSet UNISET_BIDI = |
| new UnicodeSet("[[\\u200E\\u200F\\u061C]]").freeze(); |
| |
| // TODO: Re-generate these sets from the database. They probably haven't been updated in a while. |
| private static final UnicodeSet UNISET_PERIOD_LIKE = |
| new UnicodeSet("[.\\u2024\\u3002\\uFE12\\uFE52\\uFF0E\\uFF61]").freeze(); |
| private static final UnicodeSet UNISET_STRICT_PERIOD_LIKE = |
| new UnicodeSet("[.\\u2024\\uFE52\\uFF0E\\uFF61]").freeze(); |
| private static final UnicodeSet UNISET_COMMA_LIKE = |
| new UnicodeSet("[,\\u060C\\u066B\\u3001\\uFE10\\uFE11\\uFE50\\uFE51\\uFF0C\\uFF64]").freeze(); |
| private static final UnicodeSet UNISET_STRICT_COMMA_LIKE = |
| new UnicodeSet("[,\\u066B\\uFE10\\uFE50\\uFF0C]").freeze(); |
| private static final UnicodeSet UNISET_OTHER_GROUPING_SEPARATORS = |
| new UnicodeSet( |
| "[\\ '\\u00A0\\u066C\\u2000-\\u200A\\u2018\\u2019\\u202F\\u205F\\u3000\\uFF07]") |
| .freeze(); |
| |
| // For parse return value calculation. |
| private static final BigDecimal MIN_LONG_AS_BIG_DECIMAL = new BigDecimal(Long.MIN_VALUE); |
| private static final BigDecimal MAX_LONG_AS_BIG_DECIMAL = new BigDecimal(Long.MAX_VALUE); |
| |
| private enum SeparatorType { |
| COMMA_LIKE, |
| PERIOD_LIKE, |
| OTHER_GROUPING, |
| UNKNOWN; |
| |
| static SeparatorType fromCp(int cp, ParseMode mode) { |
| if (mode == ParseMode.FAST) { |
| return SeparatorType.UNKNOWN; |
| } else if (mode == ParseMode.STRICT) { |
| if (UNISET_STRICT_COMMA_LIKE.contains(cp)) return COMMA_LIKE; |
| if (UNISET_STRICT_PERIOD_LIKE.contains(cp)) return PERIOD_LIKE; |
| if (UNISET_OTHER_GROUPING_SEPARATORS.contains(cp)) return OTHER_GROUPING; |
| return UNKNOWN; |
| } else { |
| if (UNISET_COMMA_LIKE.contains(cp)) return COMMA_LIKE; |
| if (UNISET_PERIOD_LIKE.contains(cp)) return PERIOD_LIKE; |
| if (UNISET_OTHER_GROUPING_SEPARATORS.contains(cp)) return OTHER_GROUPING; |
| return UNKNOWN; |
| } |
| } |
| } |
| |
| private static enum DigitType { |
| INTEGER, |
| FRACTION, |
| EXPONENT |
| } |
| |
| /** |
| * Holds a snapshot in time of a single parse path. This includes the digits seen so far, the |
| * current state name, and other properties like the grouping separator used on this parse path, |
| * details about the exponent and negative signs, etc. |
| */ |
| private static class StateItem { |
| // Parser state: |
| // The "trailingChars" is used to keep track of how many characters from the end of the string |
| // are ignorable and should be removed from the parse position should this item be accepted. |
| // The "score" is used to help rank two otherwise equivalent parse paths. Currently, the only |
| // function giving points to the score is prefix/suffix. |
| StateName name; |
| int trailingCount; |
| int score; |
| |
| // Numerical value: |
| DecimalQuantity_DualStorageBCD fq = new DecimalQuantity_DualStorageBCD(); |
| int numDigits; |
| int trailingZeros; |
| int exponent; |
| |
| // Other items that we've seen: |
| int groupingCp; |
| long groupingWidths; |
| String isoCode; |
| boolean sawNegative; |
| boolean sawNegativeExponent; |
| boolean sawCurrency; |
| boolean sawNaN; |
| boolean sawInfinity; |
| AffixHolder affix; |
| boolean sawPrefix; |
| boolean sawSuffix; |
| boolean sawDecimalPoint; |
| boolean sawExponentDigit; |
| |
| // Data for intermediate parsing steps: |
| StateName returnTo1; |
| StateName returnTo2; |
| // For string literals: |
| CharSequence currentString; |
| int currentOffset; |
| boolean currentTrailing; |
| // For affix patterns: |
| CharSequence currentAffixPattern; |
| long currentStepwiseParserTag; |
| // For currency: |
| TextTrieMap<CurrencyStringInfo>.ParseState currentCurrencyTrieState; |
| // For multi-code-point digits: |
| TextTrieMap<Byte>.ParseState currentDigitTrieState; |
| DigitType currentDigitType; |
| |
| // Identification for path tracing: |
| final char id; |
| String path; |
| |
| StateItem(char _id) { |
| id = _id; |
| } |
| |
| /** |
| * Clears the instance so that it can be re-used. |
| * |
| * @return Myself, for chaining. |
| */ |
| StateItem clear() { |
| // Parser state: |
| name = StateName.BEFORE_PREFIX; |
| trailingCount = 0; |
| score = 0; |
| |
| // Numerical value: |
| fq.clear(); |
| numDigits = 0; |
| trailingZeros = 0; |
| exponent = 0; |
| |
| // Other items we've seen: |
| groupingCp = -1; |
| groupingWidths = 0L; |
| isoCode = null; |
| sawNegative = false; |
| sawNegativeExponent = false; |
| sawCurrency = false; |
| sawNaN = false; |
| sawInfinity = false; |
| affix = null; |
| sawPrefix = false; |
| sawSuffix = false; |
| sawDecimalPoint = false; |
| sawExponentDigit = false; |
| |
| // Data for intermediate parsing steps: |
| returnTo1 = null; |
| returnTo2 = null; |
| currentString = null; |
| currentOffset = 0; |
| currentTrailing = false; |
| currentAffixPattern = null; |
| currentStepwiseParserTag = 0L; |
| currentCurrencyTrieState = null; |
| currentDigitTrieState = null; |
| currentDigitType = null; |
| |
| // Identification for path tracing: |
| // id is constant and is not cleared |
| path = ""; |
| |
| return this; |
| } |
| |
| /** |
| * Sets the internal value of this instance equal to another instance. |
| * |
| * <p>newName and cpOrN1 are required as parameters to this function because every time a code |
| * point is consumed and a state item is copied, both of the corresponding fields should be |
| * updated; it would be an error if they weren't updated. |
| * |
| * @param other The instance to copy from. |
| * @param newName The state name that the new copy should take on. |
| * @param trailing If positive, record this code point as trailing; if negative, reset the |
| * trailing count to zero. |
| * @return Myself, for chaining. |
| */ |
| StateItem copyFrom(StateItem other, StateName newName, int trailing) { |
| // Parser state: |
| name = newName; |
| score = other.score; |
| |
| // Either reset trailingCount or add the width of the current code point. |
| trailingCount = (trailing < 0) ? 0 : other.trailingCount + Character.charCount(trailing); |
| |
| // Numerical value: |
| fq.copyFrom(other.fq); |
| numDigits = other.numDigits; |
| trailingZeros = other.trailingZeros; |
| exponent = other.exponent; |
| |
| // Other items we've seen: |
| groupingCp = other.groupingCp; |
| groupingWidths = other.groupingWidths; |
| isoCode = other.isoCode; |
| sawNegative = other.sawNegative; |
| sawNegativeExponent = other.sawNegativeExponent; |
| sawCurrency = other.sawCurrency; |
| sawNaN = other.sawNaN; |
| sawInfinity = other.sawInfinity; |
| affix = other.affix; |
| sawPrefix = other.sawPrefix; |
| sawSuffix = other.sawSuffix; |
| sawDecimalPoint = other.sawDecimalPoint; |
| sawExponentDigit = other.sawExponentDigit; |
| |
| // Data for intermediate parsing steps: |
| returnTo1 = other.returnTo1; |
| returnTo2 = other.returnTo2; |
| currentString = other.currentString; |
| currentOffset = other.currentOffset; |
| currentTrailing = other.currentTrailing; |
| currentAffixPattern = other.currentAffixPattern; |
| currentStepwiseParserTag = other.currentStepwiseParserTag; |
| currentCurrencyTrieState = other.currentCurrencyTrieState; |
| currentDigitTrieState = other.currentDigitTrieState; |
| currentDigitType = other.currentDigitType; |
| |
| // Record source node if debugging |
| if (DEBUGGING) { |
| path = other.path + other.id; |
| } |
| |
| return this; |
| } |
| |
| /** |
| * Adds a digit to the internal representation of this instance. |
| * |
| * @param digit The digit that was read from the string. |
| * @param type Whether the digit occured after the decimal point. |
| */ |
| void appendDigit(byte digit, DigitType type) { |
| if (type == DigitType.EXPONENT) { |
| sawExponentDigit = true; |
| int newExponent = exponent * 10 + digit; |
| if (newExponent < exponent) { |
| // overflow |
| exponent = Integer.MAX_VALUE; |
| } else { |
| exponent = newExponent; |
| } |
| } else { |
| numDigits++; |
| if (type == DigitType.FRACTION && digit == 0) { |
| trailingZeros++; |
| } else if (type == DigitType.FRACTION) { |
| fq.appendDigit(digit, trailingZeros, false); |
| trailingZeros = 0; |
| } else { |
| fq.appendDigit(digit, 0, true); |
| } |
| } |
| } |
| |
| /** @return Whether or not this item contains a valid number. */ |
| public boolean hasNumber() { |
| return numDigits > 0 || sawNaN || sawInfinity; |
| } |
| |
| /** |
| * Converts the internal digits from this instance into a Number, preferring a Long, then a |
| * BigInteger, then a BigDecimal. A Double is used for NaN, infinity, and -0.0. |
| * |
| * @return The Number. Never null. |
| */ |
| Number toNumber(DecimalFormatProperties properties) { |
| // Check for NaN, infinity, and -0.0 |
| if (sawNaN) { |
| return Double.NaN; |
| } |
| if (sawInfinity) { |
| if (sawNegative) { |
| return Double.NEGATIVE_INFINITY; |
| } else { |
| return Double.POSITIVE_INFINITY; |
| } |
| } |
| if (fq.isZero() && sawNegative) { |
| return -0.0; |
| } |
| |
| // Check for exponent overflow |
| boolean forceBigDecimal = properties.getParseToBigDecimal(); |
| if (exponent == Integer.MAX_VALUE) { |
| if (sawNegativeExponent && sawNegative) { |
| return -0.0; |
| } else if (sawNegativeExponent) { |
| return 0.0; |
| } else if (sawNegative) { |
| return Double.NEGATIVE_INFINITY; |
| } else { |
| return Double.POSITIVE_INFINITY; |
| } |
| } else if (exponent > 1000) { |
| // BigDecimals can handle huge values better than BigIntegers. |
| forceBigDecimal = true; |
| } |
| |
| // Multipliers must be applied in reverse. |
| BigDecimal multiplier = properties.getMultiplier(); |
| if (properties.getMagnitudeMultiplier() != 0) { |
| if (multiplier == null) multiplier = BigDecimal.ONE; |
| multiplier = multiplier.scaleByPowerOfTen(properties.getMagnitudeMultiplier()); |
| } |
| int delta = (sawNegativeExponent ? -1 : 1) * exponent; |
| |
| // We need to use a math context in order to prevent non-terminating decimal expansions. |
| // This is only used when dividing by the multiplier. |
| MathContext mc = RoundingUtils.getMathContextOr34Digits(properties); |
| |
| // Construct the output number. |
| // This is the only step during fast-mode parsing that incurs object creations. |
| BigDecimal result = fq.toBigDecimal(); |
| if (sawNegative) result = result.negate(); |
| result = result.scaleByPowerOfTen(delta); |
| if (multiplier != null) { |
| result = result.divide(multiplier, mc); |
| } |
| result = result.stripTrailingZeros(); |
| if (forceBigDecimal || result.scale() > 0) { |
| return result; |
| } else if (result.compareTo(MIN_LONG_AS_BIG_DECIMAL) >= 0 |
| && result.compareTo(MAX_LONG_AS_BIG_DECIMAL) <= 0) { |
| return result.longValueExact(); |
| } else { |
| return result.toBigIntegerExact(); |
| } |
| } |
| |
| /** |
| * Converts the internal digits to a number, and also associates the number with the parsed |
| * currency. |
| * |
| * @return The CurrencyAmount. Never null. |
| */ |
| public CurrencyAmount toCurrencyAmount(DecimalFormatProperties properties) { |
| assert isoCode != null; |
| Number number = toNumber(properties); |
| Currency currency = Currency.getInstance(isoCode); |
| return new CurrencyAmount(number, currency); |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("["); |
| sb.append(path); |
| sb.append("] "); |
| sb.append(name.name()); |
| if (name == StateName.INSIDE_STRING) { |
| sb.append("{"); |
| sb.append(currentString); |
| sb.append(":"); |
| sb.append(currentOffset); |
| sb.append("}"); |
| } |
| if (name == StateName.INSIDE_AFFIX_PATTERN) { |
| sb.append("{"); |
| sb.append(currentAffixPattern); |
| sb.append(":"); |
| sb.append(AffixUtils.getOffset(currentStepwiseParserTag) - 1); |
| sb.append("}"); |
| } |
| sb.append(" "); |
| sb.append(fq.toBigDecimal()); |
| sb.append(" grouping:"); |
| sb.append(groupingCp == -1 ? new char[] {'?'} : Character.toChars(groupingCp)); |
| sb.append(" widths:"); |
| sb.append(Long.toHexString(groupingWidths)); |
| sb.append(" seen:"); |
| sb.append(sawNegative ? 1 : 0); |
| sb.append(sawNegativeExponent ? 1 : 0); |
| sb.append(sawNaN ? 1 : 0); |
| sb.append(sawInfinity ? 1 : 0); |
| sb.append(sawPrefix ? 1 : 0); |
| sb.append(sawSuffix ? 1 : 0); |
| sb.append(sawDecimalPoint ? 1 : 0); |
| sb.append(" trailing:"); |
| sb.append(trailingCount); |
| sb.append(" score:"); |
| sb.append(score); |
| sb.append(" affix:"); |
| sb.append(affix); |
| sb.append(" currency:"); |
| sb.append(isoCode); |
| return sb.toString(); |
| } |
| } |
| |
| /** |
| * Holds an ordered list of {@link StateItem} and other metadata about the string to be parsed. |
| * There are two internal arrays of {@link StateItem}, which are swapped back and forth in order |
| * to avoid object creations. The items in one array can be populated at the same time that items |
| * in the other array are being read from. |
| */ |
| private static class ParserState { |
| |
| // Basic ParserStateItem lists: |
| StateItem[] items = new StateItem[16]; |
| StateItem[] prevItems = new StateItem[16]; |
| int length; |
| int prevLength; |
| |
| // Properties and Symbols memory: |
| DecimalFormatProperties properties; |
| DecimalFormatSymbols symbols; |
| ParseMode mode; |
| boolean caseSensitive; |
| boolean parseCurrency; |
| GroupingMode groupingMode; |
| |
| // Other pre-computed fields: |
| int decimalCp1; |
| int decimalCp2; |
| int groupingCp1; |
| int groupingCp2; |
| SeparatorType decimalType1; |
| SeparatorType decimalType2; |
| SeparatorType groupingType1; |
| SeparatorType groupingType2; |
| |
| TextTrieMap<Byte> digitTrie; |
| Set<AffixHolder> affixHolders = new HashSet<AffixHolder>(); |
| |
| ParserState() { |
| for (int i = 0; i < items.length; i++) { |
| items[i] = new StateItem((char) ('A' + i)); |
| prevItems[i] = new StateItem((char) ('A' + i)); |
| } |
| } |
| |
| /** |
| * Clears the internal state in order to prepare for parsing a new string. |
| * |
| * @return Myself, for chaining. |
| */ |
| ParserState clear() { |
| length = 0; |
| prevLength = 0; |
| digitTrie = null; |
| affixHolders.clear(); |
| return this; |
| } |
| |
| /** |
| * Swaps the internal arrays of {@link StateItem}. Sets the length of the primary list to zero, |
| * so that it can be appended to. |
| */ |
| void swap() { |
| StateItem[] temp = prevItems; |
| prevItems = items; |
| items = temp; |
| prevLength = length; |
| length = 0; |
| } |
| |
| /** |
| * Swaps the internal arrays of {@link StateItem}. Sets the length of the primary list to the |
| * length of the previous list, so that it can be read from. |
| */ |
| void swapBack() { |
| StateItem[] temp = prevItems; |
| prevItems = items; |
| items = temp; |
| length = prevLength; |
| prevLength = 0; |
| } |
| |
| /** |
| * Gets the next available {@link StateItem} from the primary list for writing. This method |
| * should be thought of like a list append method, except that there are no object creations |
| * taking place. |
| * |
| * <p>It is the caller's responsibility to call either {@link StateItem#clear} or {@link |
| * StateItem#copyFrom} on the returned object. |
| * |
| * @return A dirty {@link StateItem}. |
| */ |
| StateItem getNext() { |
| if (length >= items.length) { |
| // TODO: What to do here? Expand the array? |
| // This case is rare and would happen only with specially designed input. |
| // For now, just overwrite the last entry. |
| length = items.length - 1; |
| } |
| StateItem item = items[length]; |
| length++; |
| return item; |
| } |
| |
| /** @return The index of the last inserted StateItem via a call to {@link #getNext}. */ |
| public int lastInsertedIndex() { |
| assert length > 0; |
| return length - 1; |
| } |
| |
| /** |
| * Gets a {@link StateItem} from the primary list. Assumes that the item has already been added |
| * via a call to {@link #getNext}. |
| * |
| * @param i The index of the item to get. |
| * @return The item. |
| */ |
| public StateItem getItem(int i) { |
| assert i >= 0 && i < length; |
| return items[i]; |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("<ParseState mode:"); |
| sb.append(mode); |
| sb.append(" caseSensitive:"); |
| sb.append(caseSensitive); |
| sb.append(" parseCurrency:"); |
| sb.append(parseCurrency); |
| sb.append(" groupingMode:"); |
| sb.append(groupingMode); |
| sb.append(" decimalCps:"); |
| sb.append((char) decimalCp1); |
| sb.append((char) decimalCp2); |
| sb.append(" groupingCps:"); |
| sb.append((char) groupingCp1); |
| sb.append((char) groupingCp2); |
| sb.append(" affixes:"); |
| sb.append(affixHolders); |
| sb.append(">"); |
| return sb.toString(); |
| } |
| } |
| |
| /** |
| * A wrapper for affixes. Affixes can be string-based or pattern-based, and they can come from |
| * several sources, including the property bag and the locale paterns from CLDR data. |
| */ |
| private static class AffixHolder { |
| final String p; // prefix |
| final String s; // suffix |
| final boolean strings; |
| final boolean negative; |
| |
| static final AffixHolder EMPTY_POSITIVE = new AffixHolder("", "", true, false); |
| static final AffixHolder EMPTY_NEGATIVE = new AffixHolder("", "", true, true); |
| |
| static void addToState(ParserState state, DecimalFormatProperties properties) { |
| AffixHolder pp = fromPropertiesPositivePattern(properties); |
| AffixHolder np = fromPropertiesNegativePattern(properties); |
| AffixHolder ps = fromPropertiesPositiveString(properties); |
| AffixHolder ns = fromPropertiesNegativeString(properties); |
| if (pp != null) state.affixHolders.add(pp); |
| if (ps != null) state.affixHolders.add(ps); |
| if (np != null) state.affixHolders.add(np); |
| if (ns != null) state.affixHolders.add(ns); |
| } |
| |
| static AffixHolder fromPropertiesPositivePattern(DecimalFormatProperties properties) { |
| String ppp = properties.getPositivePrefixPattern(); |
| String psp = properties.getPositiveSuffixPattern(); |
| if (properties.getSignAlwaysShown()) { |
| // TODO: This logic is somewhat duplicated from MurkyModifier. |
| boolean foundSign = false; |
| String npp = properties.getNegativePrefixPattern(); |
| String nsp = properties.getNegativeSuffixPattern(); |
| if (AffixUtils.containsType(npp, AffixUtils.TYPE_MINUS_SIGN)) { |
| foundSign = true; |
| ppp = AffixUtils.replaceType(npp, AffixUtils.TYPE_MINUS_SIGN, '+'); |
| } |
| if (AffixUtils.containsType(nsp, AffixUtils.TYPE_MINUS_SIGN)) { |
| foundSign = true; |
| psp = AffixUtils.replaceType(nsp, AffixUtils.TYPE_MINUS_SIGN, '+'); |
| } |
| if (!foundSign) { |
| ppp = "+" + ppp; |
| } |
| } |
| return getInstance(ppp, psp, false, false); |
| } |
| |
| static AffixHolder fromPropertiesNegativePattern(DecimalFormatProperties properties) { |
| String npp = properties.getNegativePrefixPattern(); |
| String nsp = properties.getNegativeSuffixPattern(); |
| if (npp == null && nsp == null) { |
| npp = properties.getPositivePrefixPattern(); |
| nsp = properties.getPositiveSuffixPattern(); |
| if (npp == null) { |
| npp = "-"; |
| } else { |
| npp = "-" + npp; |
| } |
| } |
| return getInstance(npp, nsp, false, true); |
| } |
| |
| static AffixHolder fromPropertiesPositiveString(DecimalFormatProperties properties) { |
| String pp = properties.getPositivePrefix(); |
| String ps = properties.getPositiveSuffix(); |
| if (pp == null && ps == null) return null; |
| return getInstance(pp, ps, true, false); |
| } |
| |
| static AffixHolder fromPropertiesNegativeString(DecimalFormatProperties properties) { |
| String np = properties.getNegativePrefix(); |
| String ns = properties.getNegativeSuffix(); |
| if (np == null && ns == null) return null; |
| return getInstance(np, ns, true, true); |
| } |
| |
| static AffixHolder getInstance(String p, String s, boolean strings, boolean negative) { |
| if (p == null && s == null) return negative ? EMPTY_NEGATIVE : EMPTY_POSITIVE; |
| if (p == null) p = ""; |
| if (s == null) s = ""; |
| if (p.length() == 0 && s.length() == 0) return negative ? EMPTY_NEGATIVE : EMPTY_POSITIVE; |
| return new AffixHolder(p, s, strings, negative); |
| } |
| |
| AffixHolder(String pp, String sp, boolean strings, boolean negative) { |
| this.p = pp; |
| this.s = sp; |
| this.strings = strings; |
| this.negative = negative; |
| } |
| |
| @Override |
| public boolean equals(Object other) { |
| if (other == null) return false; |
| if (this == other) return true; |
| if (!(other instanceof AffixHolder)) return false; |
| AffixHolder _other = (AffixHolder) other; |
| if (!p.equals(_other.p)) return false; |
| if (!s.equals(_other.s)) return false; |
| if (strings != _other.strings) return false; |
| if (negative != _other.negative) return false; |
| return true; |
| } |
| |
| @Override |
| public int hashCode() { |
| return p.hashCode() ^ s.hashCode(); |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("{"); |
| sb.append(p); |
| sb.append("|"); |
| sb.append(s); |
| sb.append("|"); |
| sb.append(strings ? 'S' : 'P'); |
| sb.append("}"); |
| return sb.toString(); |
| } |
| } |
| |
| /** |
| * A class that holds information about all currency affix patterns for the locale. This allows |
| * the parser to accept currencies in any format that are valid for the locale. |
| */ |
| private static class CurrencyAffixPatterns { |
| private final Set<AffixHolder> set = new HashSet<AffixHolder>(); |
| |
| private static final ConcurrentHashMap<ULocale, CurrencyAffixPatterns> currencyAffixPatterns = |
| new ConcurrentHashMap<ULocale, CurrencyAffixPatterns>(); |
| |
| static void addToState(ULocale uloc, ParserState state) { |
| CurrencyAffixPatterns value = currencyAffixPatterns.get(uloc); |
| if (value == null) { |
| // There can be multiple threads computing the same CurrencyAffixPatterns simultaneously, |
| // but that scenario is harmless. |
| CurrencyAffixPatterns newValue = new CurrencyAffixPatterns(uloc); |
| currencyAffixPatterns.putIfAbsent(uloc, newValue); |
| value = currencyAffixPatterns.get(uloc); |
| } |
| state.affixHolders.addAll(value.set); |
| } |
| |
| private CurrencyAffixPatterns(ULocale uloc) { |
| // Get the basic currency pattern. |
| String pattern = NumberFormat.getPatternForStyle(uloc, NumberFormat.CURRENCYSTYLE); |
| addPattern(pattern); |
| |
| // Get the currency plural patterns. |
| // TODO: Update this after CurrencyPluralInfo is replaced. |
| CurrencyPluralInfo pluralInfo = CurrencyPluralInfo.getInstance(uloc); |
| for (StandardPlural plural : StandardPlural.VALUES) { |
| pattern = pluralInfo.getCurrencyPluralPattern(plural.getKeyword()); |
| addPattern(pattern); |
| } |
| } |
| |
| private static final ThreadLocal<DecimalFormatProperties> threadLocalProperties = |
| new ThreadLocal<DecimalFormatProperties>() { |
| @Override |
| protected DecimalFormatProperties initialValue() { |
| return new DecimalFormatProperties(); |
| } |
| }; |
| |
| private void addPattern(String pattern) { |
| DecimalFormatProperties properties = threadLocalProperties.get(); |
| try { |
| PatternStringParser.parseToExistingProperties(pattern, properties); |
| } catch (IllegalArgumentException e) { |
| // This should only happen if there is a bug in CLDR data. Fail silently. |
| } |
| set.add(AffixHolder.fromPropertiesPositivePattern(properties)); |
| set.add(AffixHolder.fromPropertiesNegativePattern(properties)); |
| } |
| } |
| |
| /** |
| * Makes a {@link TextTrieMap} for parsing digit strings. A trie is required only if the digit |
| * strings are longer than one code point. In order for this to be the case, the user would have |
| * needed to specify custom multi-character digits, like "(0)". |
| * |
| * @param digitStrings The list of digit strings from DecimalFormatSymbols. |
| * @return A trie, or null if a trie is not required. |
| */ |
| static TextTrieMap<Byte> makeDigitTrie(String[] digitStrings) { |
| boolean requiresTrie = false; |
| for (int i = 0; i < 10; i++) { |
| String str = digitStrings[i]; |
| if (Character.charCount(Character.codePointAt(str, 0)) != str.length()) { |
| requiresTrie = true; |
| break; |
| } |
| } |
| if (!requiresTrie) return null; |
| |
| // TODO: Consider caching the tries so they don't need to be re-created run to run. |
| // (Low-priority since multi-character digits are rare in practice) |
| TextTrieMap<Byte> trieMap = new TextTrieMap<Byte>(false); |
| for (int i = 0; i < 10; i++) { |
| trieMap.put(digitStrings[i], (byte) i); |
| } |
| return trieMap; |
| } |
| |
| protected static final ThreadLocal<ParserState> threadLocalParseState = |
| new ThreadLocal<ParserState>() { |
| @Override |
| protected ParserState initialValue() { |
| return new ParserState(); |
| } |
| }; |
| |
| protected static final ThreadLocal<ParsePosition> threadLocalParsePosition = |
| new ThreadLocal<ParsePosition>() { |
| @Override |
| protected ParsePosition initialValue() { |
| return new ParsePosition(0); |
| } |
| }; |
| |
| /** |
| * @deprecated This API is ICU internal only. TODO: Remove this set from ScientificNumberFormat. |
| * @hide draft / provisional / internal are hidden on Android |
| */ |
| @Deprecated |
| public static final UnicodeSet UNISET_PLUS = |
| new UnicodeSet( |
| 0x002B, 0x002B, 0x207A, 0x207A, 0x208A, 0x208A, 0x2795, 0x2795, 0xFB29, 0xFB29, |
| 0xFE62, 0xFE62, 0xFF0B, 0xFF0B) |
| .freeze(); |
| |
| /** |
| * @deprecated This API is ICU internal only. TODO: Remove this set from ScientificNumberFormat. |
| * @hide draft / provisional / internal are hidden on Android |
| */ |
| @Deprecated |
| public static final UnicodeSet UNISET_MINUS = |
| new UnicodeSet( |
| 0x002D, 0x002D, 0x207B, 0x207B, 0x208B, 0x208B, 0x2212, 0x2212, 0x2796, 0x2796, |
| 0xFE63, 0xFE63, 0xFF0D, 0xFF0D) |
| .freeze(); |
| |
| public static Number parse(String input, DecimalFormatProperties properties, DecimalFormatSymbols symbols) { |
| ParsePosition ppos = threadLocalParsePosition.get(); |
| ppos.setIndex(0); |
| return parse(input, ppos, properties, symbols); |
| } |
| |
| // TODO: DELETE ME once debugging is finished |
| public static volatile boolean DEBUGGING = false; |
| |
| /** |
| * Implements an iterative parser that maintains a lists of possible states at each code point in |
| * the string. At each code point in the string, the list of possible states is updated based on |
| * the states coming from the previous code point. The parser stops when it reaches the end of the |
| * string or when there are no possible parse paths remaining in the string. |
| * |
| * <p>TODO: This API is not fully flushed out. Right now this is internal-only. |
| * |
| * @param input The string to parse. |
| * @param ppos A {@link ParsePosition} to hold the index at which parsing stopped. |
| * @param properties A property bag, used only for determining the prefix/suffix strings and the |
| * padding character. |
| * @param symbols A {@link DecimalFormatSymbols} object, used for determining locale-specific |
| * symbols for grouping/decimal separators, digit strings, and prefix/suffix substitutions. |
| * @return A Number matching the parser's best interpretation of the string. |
| */ |
| public static Number parse( |
| CharSequence input, |
| ParsePosition ppos, |
| DecimalFormatProperties properties, |
| DecimalFormatSymbols symbols) { |
| StateItem best = _parse(input, ppos, false, properties, symbols); |
| return (best == null) ? null : best.toNumber(properties); |
| } |
| |
| public static CurrencyAmount parseCurrency( |
| String input, DecimalFormatProperties properties, DecimalFormatSymbols symbols) throws ParseException { |
| return parseCurrency(input, null, properties, symbols); |
| } |
| |
| public static CurrencyAmount parseCurrency( |
| CharSequence input, ParsePosition ppos, DecimalFormatProperties properties, DecimalFormatSymbols symbols) |
| throws ParseException { |
| if (ppos == null) { |
| ppos = threadLocalParsePosition.get(); |
| ppos.setIndex(0); |
| ppos.setErrorIndex(-1); |
| } |
| StateItem best = _parse(input, ppos, true, properties, symbols); |
| return (best == null) ? null : best.toCurrencyAmount(properties); |
| } |
| |
| private static StateItem _parse( |
| CharSequence input, |
| ParsePosition ppos, |
| boolean parseCurrency, |
| DecimalFormatProperties properties, |
| DecimalFormatSymbols symbols) { |
| |
| if (input == null || ppos == null || properties == null || symbols == null) { |
| throw new IllegalArgumentException("All arguments are required for parse."); |
| } |
| |
| ParseMode mode = properties.getParseMode(); |
| if (mode == null) mode = ParseMode.LENIENT; |
| boolean integerOnly = properties.getParseIntegerOnly(); |
| boolean ignoreExponent = properties.getParseNoExponent(); |
| boolean ignoreGrouping = properties.getGroupingSize() <= 0; |
| |
| // Set up the initial state |
| ParserState state = threadLocalParseState.get().clear(); |
| state.properties = properties; |
| state.symbols = symbols; |
| state.mode = mode; |
| state.parseCurrency = parseCurrency; |
| state.groupingMode = properties.getParseGroupingMode(); |
| if (state.groupingMode == null) state.groupingMode = GroupingMode.DEFAULT; |
| state.caseSensitive = properties.getParseCaseSensitive(); |
| state.decimalCp1 = Character.codePointAt(symbols.getDecimalSeparatorString(), 0); |
| state.decimalCp2 = Character.codePointAt(symbols.getMonetaryDecimalSeparatorString(), 0); |
| state.groupingCp1 = Character.codePointAt(symbols.getGroupingSeparatorString(), 0); |
| state.groupingCp2 = Character.codePointAt(symbols.getMonetaryGroupingSeparatorString(), 0); |
| state.decimalType1 = SeparatorType.fromCp(state.decimalCp1, mode); |
| state.decimalType2 = SeparatorType.fromCp(state.decimalCp2, mode); |
| state.groupingType1 = SeparatorType.fromCp(state.groupingCp1, mode); |
| state.groupingType2 = SeparatorType.fromCp(state.groupingCp2, mode); |
| StateItem initialStateItem = state.getNext().clear(); |
| initialStateItem.name = StateName.BEFORE_PREFIX; |
| |
| if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { |
| state.digitTrie = makeDigitTrie(symbols.getDigitStringsLocal()); |
| AffixHolder.addToState(state, properties); |
| if (parseCurrency) { |
| CurrencyAffixPatterns.addToState(symbols.getULocale(), state); |
| } |
| } |
| |
| if (DEBUGGING) { |
| System.out.println("Parsing: " + input); |
| System.out.println(properties); |
| System.out.println(state); |
| } |
| |
| // Start walking through the string, one codepoint at a time. Backtracking is not allowed. This |
| // is to enforce linear runtime and prevent cases that could result in an infinite loop. |
| int offset = ppos.getIndex(); |
| for (; offset < input.length(); ) { |
| int cp = Character.codePointAt(input, offset); |
| state.swap(); |
| for (int i = 0; i < state.prevLength; i++) { |
| StateItem item = state.prevItems[i]; |
| if (DEBUGGING) { |
| System.out.println(":" + offset + item.id + " " + item); |
| } |
| |
| // In the switch statement below, if you see a line like: |
| // if (state.length > 0 && mode == ParseMode.FAST) break; |
| // it is used for accelerating the fast parse mode. The check is performed only in the |
| // states BEFORE_PREFIX, AFTER_INTEGER_DIGIT, and AFTER_FRACTION_DIGIT, which are the |
| // most common states. |
| |
| switch (item.name) { |
| case BEFORE_PREFIX: |
| // Beginning of string |
| if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { |
| acceptMinusOrPlusSign(cp, StateName.BEFORE_PREFIX, state, item, false); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| } |
| acceptIntegerDigit(cp, StateName.AFTER_INTEGER_DIGIT, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| acceptBidi(cp, StateName.BEFORE_PREFIX, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| acceptWhitespace(cp, StateName.BEFORE_PREFIX, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| acceptPadding(cp, StateName.BEFORE_PREFIX, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| acceptNan(cp, StateName.BEFORE_SUFFIX, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| acceptInfinity(cp, StateName.BEFORE_SUFFIX, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| if (!integerOnly) { |
| acceptDecimalPoint(cp, StateName.AFTER_FRACTION_DIGIT, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| } |
| if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { |
| acceptPrefix(cp, StateName.AFTER_PREFIX, state, item); |
| } |
| if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { |
| if (!ignoreGrouping) { |
| acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| } |
| if (parseCurrency) { |
| acceptCurrency(cp, StateName.BEFORE_PREFIX, state, item); |
| } |
| } |
| break; |
| |
| case AFTER_PREFIX: |
| // Prefix is consumed |
| acceptBidi(cp, StateName.AFTER_PREFIX, state, item); |
| acceptPadding(cp, StateName.AFTER_PREFIX, state, item); |
| acceptNan(cp, StateName.BEFORE_SUFFIX, state, item); |
| acceptInfinity(cp, StateName.BEFORE_SUFFIX, state, item); |
| acceptIntegerDigit(cp, StateName.AFTER_INTEGER_DIGIT, state, item); |
| if (!integerOnly) { |
| acceptDecimalPoint(cp, StateName.AFTER_FRACTION_DIGIT, state, item); |
| } |
| if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { |
| acceptWhitespace(cp, StateName.AFTER_PREFIX, state, item); |
| if (!ignoreGrouping) { |
| acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item); |
| } |
| if (parseCurrency) { |
| acceptCurrency(cp, StateName.AFTER_PREFIX, state, item); |
| } |
| } |
| break; |
| |
| case AFTER_INTEGER_DIGIT: |
| // Previous character was an integer digit (or grouping/whitespace) |
| acceptIntegerDigit(cp, StateName.AFTER_INTEGER_DIGIT, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| if (!integerOnly) { |
| acceptDecimalPoint(cp, StateName.AFTER_FRACTION_DIGIT, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| } |
| if (!ignoreGrouping) { |
| acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| } |
| acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| if (!ignoreExponent) { |
| acceptExponentSeparator(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| } |
| if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { |
| acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item); |
| } |
| if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { |
| acceptWhitespace(cp, StateName.BEFORE_SUFFIX, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| if (parseCurrency) { |
| acceptCurrency(cp, StateName.BEFORE_SUFFIX, state, item); |
| } |
| } |
| break; |
| |
| case AFTER_FRACTION_DIGIT: |
| // We encountered a decimal point |
| acceptFractionDigit(cp, StateName.AFTER_FRACTION_DIGIT, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| if (!ignoreExponent) { |
| acceptExponentSeparator(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| } |
| if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { |
| acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item); |
| } |
| if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { |
| acceptWhitespace(cp, StateName.BEFORE_SUFFIX, state, item); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false); |
| if (state.length > 0 && mode == ParseMode.FAST) break; |
| if (parseCurrency) { |
| acceptCurrency(cp, StateName.BEFORE_SUFFIX, state, item); |
| } |
| } |
| break; |
| |
| case AFTER_EXPONENT_SEPARATOR: |
| acceptBidi(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item); |
| acceptMinusOrPlusSign(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item, true); |
| acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item); |
| break; |
| |
| case AFTER_EXPONENT_DIGIT: |
| acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); |
| acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); |
| acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item); |
| if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { |
| acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item); |
| } |
| if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { |
| acceptWhitespace(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); |
| // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false); |
| if (parseCurrency) { |
| acceptCurrency(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); |
| } |
| } |
| break; |
| |
| case BEFORE_SUFFIX: |
| // Accept whitespace, suffixes, and exponent separators |
| acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item); |
| acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item); |
| if (!ignoreExponent) { |
| acceptExponentSeparator(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item); |
| } |
| if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { |
| acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item); |
| } |
| if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { |
| acceptWhitespace(cp, StateName.BEFORE_SUFFIX, state, item); |
| // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false); |
| if (parseCurrency) { |
| acceptCurrency(cp, StateName.BEFORE_SUFFIX, state, item); |
| } |
| } |
| break; |
| |
| case BEFORE_SUFFIX_SEEN_EXPONENT: |
| // Accept whitespace and suffixes but not exponent separators |
| acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); |
| acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); |
| if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { |
| acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item); |
| } |
| if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { |
| acceptWhitespace(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); |
| // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item, false); |
| if (parseCurrency) { |
| acceptCurrency(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); |
| } |
| } |
| break; |
| |
| case AFTER_SUFFIX: |
| if ((mode == ParseMode.LENIENT || mode == ParseMode.FAST) && parseCurrency) { |
| // Continue traversing in case there is a currency symbol to consume |
| acceptBidi(cp, StateName.AFTER_SUFFIX, state, item); |
| acceptPadding(cp, StateName.AFTER_SUFFIX, state, item); |
| acceptWhitespace(cp, StateName.AFTER_SUFFIX, state, item); |
| // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.AFTER_SUFFIX, state, item, false); |
| if (parseCurrency) { |
| acceptCurrency(cp, StateName.AFTER_SUFFIX, state, item); |
| } |
| } |
| // Otherwise, do not accept any more characters. |
| break; |
| |
| case INSIDE_CURRENCY: |
| acceptCurrencyOffset(cp, state, item); |
| break; |
| |
| case INSIDE_DIGIT: |
| acceptDigitTrieOffset(cp, state, item); |
| break; |
| |
| case INSIDE_STRING: |
| acceptStringOffset(cp, state, item); |
| break; |
| |
| case INSIDE_AFFIX_PATTERN: |
| acceptAffixPatternOffset(cp, state, item); |
| break; |
| } |
| } |
| |
| if (state.length == 0) { |
| // No parse paths continue past this point. We have found the longest parsable string |
| // from the input. Restore previous state without the offset and break. |
| state.swapBack(); |
| break; |
| } |
| |
| offset += Character.charCount(cp); |
| } |
| |
| // Post-processing |
| if (state.length == 0) { |
| if (DEBUGGING) { |
| System.out.println("No matches found"); |
| System.out.println("- - - - - - - - - -"); |
| } |
| return null; |
| } else { |
| |
| // Loop through the candidates. "continue" skips a candidate as invalid. |
| StateItem best = null; |
| outer: |
| for (int i = 0; i < state.length; i++) { |
| StateItem item = state.items[i]; |
| |
| if (DEBUGGING) { |
| System.out.println(":end " + item); |
| } |
| |
| // Check that at least one digit was read. |
| if (!item.hasNumber()) { |
| if (DEBUGGING) System.out.println("-> rejected due to no number value"); |
| continue; |
| } |
| |
| if (mode == ParseMode.STRICT) { |
| // Perform extra checks for strict mode. |
| // We require that the affixes match. |
| boolean sawPrefix = item.sawPrefix || (item.affix != null && item.affix.p.isEmpty()); |
| boolean sawSuffix = item.sawSuffix || (item.affix != null && item.affix.s.isEmpty()); |
| boolean hasEmptyAffix = |
| state.affixHolders.contains(AffixHolder.EMPTY_POSITIVE) |
| || state.affixHolders.contains(AffixHolder.EMPTY_NEGATIVE); |
| if (sawPrefix && sawSuffix) { |
| // OK |
| } else if (!sawPrefix && !sawSuffix && hasEmptyAffix) { |
| // OK |
| } else { |
| // Has a prefix or suffix that doesn't match |
| if (DEBUGGING) System.out.println("-> rejected due to mismatched prefix/suffix"); |
| continue; |
| } |
| |
| // Check for scientific notation. |
| if (properties.getMinimumExponentDigits() > 0 && !item.sawExponentDigit) { |
| if (DEBUGGING) System.out.println("-> reject due to lack of exponent"); |
| continue; |
| } |
| |
| // Check that grouping sizes are valid. |
| int grouping1 = properties.getGroupingSize(); |
| int grouping2 = properties.getSecondaryGroupingSize(); |
| grouping1 = grouping1 > 0 ? grouping1 : grouping2; |
| grouping2 = grouping2 > 0 ? grouping2 : grouping1; |
| long groupingWidths = item.groupingWidths; |
| int numGroupingRegions = 16 - Long.numberOfLeadingZeros(groupingWidths) / 4; |
| // If the last grouping is zero, accept strings like "1," but reject string like "1,.23" |
| // Strip off multiple last-groupings to handle cases like "123,," or "123 " |
| while (numGroupingRegions > 1 && (groupingWidths & 0xf) == 0) { |
| if (item.sawDecimalPoint) { |
| if (DEBUGGING) System.out.println("-> rejected due to decimal point after grouping"); |
| continue outer; |
| } else { |
| groupingWidths >>>= 4; |
| numGroupingRegions--; |
| } |
| } |
| if (grouping1 <= 0) { |
| // OK (no grouping data available) |
| } else if (numGroupingRegions <= 1) { |
| // OK (no grouping digits) |
| } else if ((groupingWidths & 0xf) != grouping1) { |
| // First grouping size is invalid |
| if (DEBUGGING) System.out.println("-> rejected due to first grouping violation"); |
| continue; |
| } else if (((groupingWidths >>> ((numGroupingRegions - 1) * 4)) & 0xf) > grouping2) { |
| // String like "1234,567" where the highest grouping is too large |
| if (DEBUGGING) System.out.println("-> rejected due to final grouping violation"); |
| continue; |
| } else { |
| for (int j = 1; j < numGroupingRegions - 1; j++) { |
| if (((groupingWidths >>> (j * 4)) & 0xf) != grouping2) { |
| // A grouping size somewhere in the middle is invalid |
| if (DEBUGGING) System.out.println("-> rejected due to inner grouping violation"); |
| continue outer; |
| } |
| } |
| } |
| } |
| |
| // Optionally require that the presence of a decimal point matches the pattern. |
| if (properties.getDecimalPatternMatchRequired() |
| && item.sawDecimalPoint |
| != (properties.getDecimalSeparatorAlwaysShown() |
| || properties.getMaximumFractionDigits() != 0)) { |
| if (DEBUGGING) System.out.println("-> rejected due to decimal point violation"); |
| continue; |
| } |
| |
| // When parsing currencies, require that a currency symbol was found. |
| if (parseCurrency && !item.sawCurrency) { |
| if (DEBUGGING) System.out.println("-> rejected due to lack of currency"); |
| continue; |
| } |
| |
| // If we get here, then this candidate is acceptable. |
| // Use the earliest candidate in the list, or the one with the highest score, or the |
| // one with the fewest trailing digits. |
| if (best == null) { |
| best = item; |
| } else if (item.score > best.score) { |
| best = item; |
| } else if (item.trailingCount < best.trailingCount) { |
| best = item; |
| } |
| } |
| |
| if (DEBUGGING) { |
| System.out.println("- - - - - - - - - -"); |
| } |
| |
| if (best != null) { |
| ppos.setIndex(offset - best.trailingCount); |
| return best; |
| } else { |
| ppos.setErrorIndex(offset); |
| return null; |
| } |
| } |
| } |
| |
| /** |
| * If <code>cp</code> is whitespace (as determined by the unicode set {@link #UNISET_WHITESPACE}), |
| * copies <code>item</code> to the new list in <code>state</code> and sets its state name to |
| * <code>nextName</code>. |
| * |
| * @param cp The code point to check. |
| * @param nextName The new state name if the check passes. |
| * @param state The state object to update. |
| * @param item The old state leading into the code point. |
| */ |
| private static void acceptWhitespace( |
| int cp, StateName nextName, ParserState state, StateItem item) { |
| if (UNISET_WHITESPACE.contains(cp)) { |
| state.getNext().copyFrom(item, nextName, cp); |
| } |
| } |
| |
| /** |
| * If <code>cp</code> is a bidi control character (as determined by the unicode set {@link |
| * #UNISET_BIDI}), copies <code>item</code> to the new list in <code>state</code> and sets its |
| * state name to <code>nextName</code>. |
| * |
| * @param cp The code point to check. |
| * @param nextName The new state name if the check passes. |
| * @param state The state object to update. |
| * @param item The old state leading into the code point. |
| */ |
| private static void acceptBidi(int cp, StateName nextName, ParserState state, StateItem item) { |
| if (UNISET_BIDI.contains(cp)) { |
| state.getNext().copyFrom(item, nextName, cp); |
| } |
| } |
| |
| /** |
| * If <code>cp</code> is a padding character (as determined by {@link ParserState#paddingCp}), |
| * copies <code>item</code> to the new list in <code>state</code> and sets its state name to |
| * <code>nextName</code>. |
| * |
| * @param cp The code point to check. |
| * @param nextName The new state name if the check passes. |
| * @param state The state object to update. |
| * @param item The old state leading into the code point. |
| */ |
| private static void acceptPadding(int cp, StateName nextName, ParserState state, StateItem item) { |
| CharSequence padding = state.properties.getPadString(); |
| if (padding == null || padding.length() == 0) return; |
| int referenceCp = Character.codePointAt(padding, 0); |
| if (cp == referenceCp) { |
| state.getNext().copyFrom(item, nextName, cp); |
| } |
| } |
| |
| private static void acceptIntegerDigit( |
| int cp, StateName nextName, ParserState state, StateItem item) { |
| acceptDigitHelper(cp, nextName, state, item, DigitType.INTEGER); |
| } |
| |
| private static void acceptFractionDigit( |
| int cp, StateName nextName, ParserState state, StateItem item) { |
| acceptDigitHelper(cp, nextName, state, item, DigitType.FRACTION); |
| } |
| |
| private static void acceptExponentDigit( |
| int cp, StateName nextName, ParserState state, StateItem item) { |
| acceptDigitHelper(cp, nextName, state, item, DigitType.EXPONENT); |
| } |
| |
| /** |
| * If <code>cp</code> is a digit character (as determined by either {@link UCharacter#digit} or |
| * {@link ParserState#digitCps}), copies <code>item</code> to the new list in <code>state</code> |
| * and sets its state name to one determined by <code>type</code>. Also copies the digit into a |
| * field in the new item determined by <code>type</code>. |
| * |
| * @param cp The code point to check. |
| * @param nextName The state to set if a digit is accepted. |
| * @param state The state object to update. |
| * @param item The old state leading into the code point. |
| * @param type The digit type, which determines the next state and the field into which to insert |
| * the digit. |
| */ |
| private static void acceptDigitHelper( |
| int cp, StateName nextName, ParserState state, StateItem item, DigitType type) { |
| // Check the Unicode digit character property |
| byte digit = (byte) UCharacter.digit(cp, 10); |
| StateItem next = null; |
| |
| // Look for the digit: |
| if (digit >= 0) { |
| // Code point is a number |
| next = state.getNext().copyFrom(item, nextName, -1); |
| } |
| |
| // Do not perform the expensive string manipulations in fast mode. |
| if (digit < 0 && (state.mode == ParseMode.LENIENT || state.mode == ParseMode.STRICT)) { |
| if (state.digitTrie == null) { |
| // Check custom digits, all of which are at most one code point |
| for (byte d = 0; d < 10; d++) { |
| int referenceCp = Character.codePointAt(state.symbols.getDigitStringsLocal()[d], 0); |
| if (cp == referenceCp) { |
| digit = d; |
| next = state.getNext().copyFrom(item, nextName, -1); |
| } |
| } |
| } else { |
| // Custom digits have more than one code point |
| acceptDigitTrie(cp, nextName, state, item, type); |
| } |
| } |
| |
| // Save state |
| recordDigit(next, digit, type); |
| } |
| |
| /** |
| * Helper function for {@link acceptDigit} and {@link acceptDigitTrie} to save a complete digit in |
| * a state item and update grouping widths. |
| * |
| * @param next The new StateItem |
| * @param digit The digit to record |
| * @param type The type of the digit to record (INTEGER, FRACTION, or EXPONENT) |
| */ |
| private static void recordDigit(StateItem next, byte digit, DigitType type) { |
| if (next == null) return; |
| next.appendDigit(digit, type); |
| if (type == DigitType.INTEGER && (next.groupingWidths & 0xf) < 15) { |
| next.groupingWidths++; |
| } |
| } |
| |
| /** |
| * If <code>cp</code> is a sign (as determined by the unicode sets {@link #UNISET_PLUS} and {@link |
| * #UNISET_MINUS}), copies <code>item</code> to the new list in <code>state</code>. Loops back to |
| * the same state name. |
| * |
| * @param cp The code point to check. |
| * @param state The state object to update. |
| * @param item The old state leading into the code point. |
| */ |
| private static void acceptMinusOrPlusSign( |
| int cp, StateName nextName, ParserState state, StateItem item, boolean exponent) { |
| acceptMinusSign(cp, nextName, null, state, item, exponent); |
| acceptPlusSign(cp, nextName, null, state, item, exponent); |
| } |
| |
| private static long acceptMinusSign( |
| int cp, |
| StateName returnTo1, |
| StateName returnTo2, |
| ParserState state, |
| StateItem item, |
| boolean exponent) { |
| if (UNISET_MINUS.contains(cp)) { |
| StateItem next = state.getNext().copyFrom(item, returnTo1, -1); |
| next.returnTo1 = returnTo2; |
| if (exponent) { |
| next.sawNegativeExponent = true; |
| } else { |
| next.sawNegative = true; |
| } |
| return 1L << state.lastInsertedIndex(); |
| } else { |
| return 0L; |
| } |
| } |
| |
| private static long acceptPlusSign( |
| int cp, |
| StateName returnTo1, |
| StateName returnTo2, |
| ParserState state, |
| StateItem item, |
| boolean exponent) { |
| if (UNISET_PLUS.contains(cp)) { |
| StateItem next = state.getNext().copyFrom(item, returnTo1, -1); |
| next.returnTo1 = returnTo2; |
| return 1L << state.lastInsertedIndex(); |
| } else { |
| return 0L; |
| } |
| } |
| |
| /** |
| * If <code>cp</code> is a grouping separator (as determined by the unicode set {@link |
| * #UNISET_GROUPING}), copies <code>item</code> to the new list in <code>state</code> and loops |
| * back to the same state. Also accepts if <code>cp</code> is the locale-specific grouping |
| * separator in {@link ParserState#groupingCp}, in which case the {@link |
| * StateItem#usesLocaleSymbols} flag is also set. |
| * |
| * @param cp The code point to check. |
| * @param state The state object to update. |
| * @param item The old state leading into the code point. |
| */ |
| private static void acceptGrouping( |
| int cp, StateName nextName, ParserState state, StateItem item) { |
| // Do not accept mixed grouping separators in the same string. |
| if (item.groupingCp == -1) { |
| // First time seeing a grouping separator. |
| SeparatorType cpType = SeparatorType.fromCp(cp, state.mode); |
| |
| // Always accept if exactly the same as the locale grouping separator. |
| if (cp != state.groupingCp1 && cp != state.groupingCp2) { |
| // Reject if not in one of the three primary equivalence classes. |
| if (cpType == SeparatorType.UNKNOWN) { |
| return; |
| } |
| if (state.groupingMode == GroupingMode.RESTRICTED) { |
| // Reject if not in the same class as the locale grouping separator. |
| if (cpType != state.groupingType1 || cpType != state.groupingType2) { |
| return; |
| } |
| } else { |
| // Reject if in the same class as the decimal separator. |
| if (cpType == SeparatorType.COMMA_LIKE |
| && (state.decimalType1 == SeparatorType.COMMA_LIKE |
| || state.decimalType2 == SeparatorType.COMMA_LIKE)) { |
| return; |
| } |
| if (cpType == SeparatorType.PERIOD_LIKE |
| && (state.decimalType1 == SeparatorType.PERIOD_LIKE |
| || state.decimalType2 == SeparatorType.PERIOD_LIKE)) { |
| return; |
| } |
| } |
| } |
| |
| // A match was found. |
| StateItem next = state.getNext().copyFrom(item, nextName, cp); |
| next.groupingCp = cp; |
| next.groupingWidths <<= 4; |
| } else { |
| // Have already seen a grouping separator. |
| if (cp == item.groupingCp) { |
| StateItem next = state.getNext().copyFrom(item, nextName, cp); |
| next.groupingWidths <<= 4; |
| } |
| } |
| } |
| |
| /** |
| * If <code>cp</code> is a decimal (as determined by the unicode set {@link #UNISET_DECIMAL}), |
| * copies <code>item</code> to the new list in <code>state</code> and goes to {@link |
| * StateName#AFTER_FRACTION_DIGIT}. Also accepts if <code>cp</code> is the locale-specific decimal |
| * point in {@link ParserState#decimalCp}, in which case the {@link StateItem#usesLocaleSymbols} |
| * flag is also set. |
| * |
| * @param cp The code point to check. |
| * @param state The state object to update. |
| * @param item The old state leading into the code point. |
| */ |
| private static void acceptDecimalPoint( |
| int cp, StateName nextName, ParserState state, StateItem item) { |
| if (cp == item.groupingCp) { |
| // Don't accept a decimal point that is the same as the grouping separator |
| return; |
| } |
| |
| SeparatorType cpType = SeparatorType.fromCp(cp, state.mode); |
| |
| // We require that the decimal separator be in the same class as the locale. |
| if (cpType != state.decimalType1 && cpType != state.decimalType2) { |
| return; |
| } |
| |
| // If in UNKNOWN or OTHER, require an exact match. |
| if (cpType == SeparatorType.OTHER_GROUPING || cpType == SeparatorType.UNKNOWN) { |
| if (cp != state.decimalCp1 && cp != state.decimalCp2) { |
| return; |
| } |
| } |
| |
| // A match was found. |
| StateItem next = state.getNext().copyFrom(item, nextName, -1); |
| next.sawDecimalPoint = true; |
| } |
| |
| private static void acceptNan(int cp, StateName nextName, ParserState state, StateItem item) { |
| CharSequence nan = state.symbols.getNaN(); |
| long added = acceptString(cp, nextName, null, state, item, nan, 0, false); |
| |
| // Set state in the items that were added by the function call |
| for (int i = Long.numberOfTrailingZeros(added); (1L << i) <= added; i++) { |
| if (((1L << i) & added) != 0) { |
| state.getItem(i).sawNaN = true; |
| } |
| } |
| } |
| |
| private static void acceptInfinity( |
| int cp, StateName nextName, ParserState state, StateItem item) { |
| CharSequence inf = state.symbols.getInfinity(); |
| long added = acceptString(cp, nextName, null, state, item, inf, 0, false); |
| |
| // Set state in the items that were added by the function call |
| for (int i = Long.numberOfTrailingZeros(added); (1L << i) <= added; i++) { |
| if (((1L << i) & added) != 0) { |
| state.getItem(i).sawInfinity = true; |
| } |
| } |
| } |
| |
| private static void acceptExponentSeparator( |
| int cp, StateName nextName, ParserState state, StateItem item) { |
| CharSequence exp = state.symbols.getExponentSeparator(); |
| acceptString(cp, nextName, null, state, item, exp, 0, true); |
| } |
| |
| private static void acceptPrefix(int cp, StateName nextName, ParserState state, StateItem item) { |
| for (AffixHolder holder : state.affixHolders) { |
| acceptAffixHolder(cp, nextName, state, item, holder, true); |
| } |
| } |
| |
| private static void acceptSuffix(int cp, StateName nextName, ParserState state, StateItem item) { |
| if (item.affix != null) { |
| acceptAffixHolder(cp, nextName, state, item, item.affix, false); |
| } else { |
| for (AffixHolder holder : state.affixHolders) { |
| acceptAffixHolder(cp, nextName, state, item, holder, false); |
| } |
| } |
| } |
| |
| private static void acceptAffixHolder( |
| int cp, |
| StateName nextName, |
| ParserState state, |
| StateItem item, |
| AffixHolder holder, |
| boolean prefix) { |
| if (holder == null) return; |
| String str = prefix ? holder.p : holder.s; |
| long added; |
| if (holder.strings) { |
| added = acceptString(cp, nextName, null, state, item, str, 0, false); |
| } else { |
| added = |
| acceptAffixPattern(cp, nextName, state, item, str, AffixUtils.nextToken(0, str)); |
| } |
| // Record state in the added entries |
| for (int i = Long.numberOfTrailingZeros(added); (1L << i) <= added; i++) { |
| if (((1L << i) & added) != 0) { |
| StateItem next = state.getItem(i); |
| next.affix = holder; |
| if (prefix) next.sawPrefix = true; |
| if (!prefix) next.sawSuffix = true; |
| if (holder.negative) next.sawNegative = true; |
| // 10 point reward for consuming a prefix/suffix: |
| next.score += 10; |
| // 1 point reward for positive holders (if there is ambiguity, we want to favor positive): |
| if (!holder.negative) next.score += 1; |
| // 5 point reward for affix holders that have an empty prefix or suffix (we won't see them again): |
| if (!next.sawPrefix && holder.p.isEmpty()) next.score += 5; |
| if (!next.sawSuffix && holder.s.isEmpty()) next.score += 5; |
| } |
| } |
| } |
| |
| private static long acceptStringOffset(int cp, ParserState state, StateItem item) { |
| return acceptString( |
| cp, |
| item.returnTo1, |
| item.returnTo2, |
| state, |
| item, |
| item.currentString, |
| item.currentOffset, |
| item.currentTrailing); |
| } |
| |
| /** |
| * Accepts a code point if the code point is compatible with the string at the given offset. |
| * Handles runs of ignorable characters. |
| * |
| * <p>This method will add either one or two {@link StateItem} to the {@link ParserState}. |
| * |
| * @param cp The current code point, which will be checked for a match to the string. |
| * @param ret1 The state to return to after reaching the end of the string. |
| * @param ret2 The state to save in <code>returnTo1</code> after reaching the end of the string. |
| * Set to null if returning to the main state loop. |
| * @param trailing true if this string should be ignored for the purposes of recording trailing |
| * code points; false if it trailing count should be reset after reading the string. |
| * @param state The current {@link ParserState} |
| * @param item The current {@link StateItem} |
| * @param str The string against which to check for a match. |
| * @param offset The number of chars into the string. Initial value should be 0. |
| * @param trailing false if this string is strong and should reset trailing count to zero when it |
| * is fully consumed. |
| * @return A bitmask where the bits correspond to the items that were added. Set to 0L if no items |
| * were added. |
| */ |
| private static long acceptString( |
| int cp, |
| StateName ret1, |
| StateName ret2, |
| ParserState state, |
| StateItem item, |
| CharSequence str, |
| int offset, |
| boolean trailing) { |
| if (str == null || str.length() == 0) return 0L; |
| return acceptStringOrAffixPatternWithIgnorables( |
| cp, ret1, ret2, state, item, str, offset, trailing, true); |
| } |
| |
| private static long acceptStringNonIgnorable( |
| int cp, |
| StateName ret1, |
| StateName ret2, |
| ParserState state, |
| StateItem item, |
| CharSequence str, |
| boolean trailing, |
| int referenceCp, |
| long firstOffsetOrTag, |
| long nextOffsetOrTag) { |
| long added = 0L; |
| int firstOffset = (int) firstOffsetOrTag; |
| int nextOffset = (int) nextOffsetOrTag; |
| if (codePointEquals(referenceCp, cp, state)) { |
| if (firstOffset < str.length()) { |
| added |= acceptStringHelper(cp, ret1, ret2, state, item, str, firstOffset, trailing); |
| } |
| if (nextOffset >= str.length()) { |
| added |= acceptStringHelper(cp, ret1, ret2, state, item, str, nextOffset, trailing); |
| } |
| return added; |
| } else { |
| return 0L; |
| } |
| } |
| |
| /** |
| * Internal method that is used to step to the next code point of a string or exit the string if |
| * at the end. |
| * |
| * @param cp See {@link #acceptString} |
| * @param returnTo1 See {@link #acceptString} |
| * @param returnTo2 See {@link #acceptString} |
| * @param state See {@link #acceptString} |
| * @param item See {@link #acceptString} |
| * @param str See {@link #acceptString} |
| * @param newOffset The offset at which the next step should start. If past the end of the string, |
| * exit the string and return to the outer loop. |
| * @param trailing See {@link #acceptString} |
| * @return Bitmask containing one entry, the one that was added. |
| */ |
| private static long acceptStringHelper( |
| int cp, |
| StateName returnTo1, |
| StateName returnTo2, |
| ParserState state, |
| StateItem item, |
| CharSequence str, |
| int newOffset, |
| boolean trailing) { |
| StateItem next = state.getNext().copyFrom(item, null, cp); |
| next.score += 1; // reward for consuming a cp from string |
| if (newOffset < str.length()) { |
| // String has more code points. |
| next.name = StateName.INSIDE_STRING; |
| next.returnTo1 = returnTo1; |
| next.returnTo2 = returnTo2; |
| next.currentString = str; |
| next.currentOffset = newOffset; |
| next.currentTrailing = trailing; |
| } else { |
| // We've reached the end of the string. |
| next.name = returnTo1; |
| if (!trailing) next.trailingCount = 0; |
| next.returnTo1 = returnTo2; |
| next.returnTo2 = null; |
| } |
| return 1L << state.lastInsertedIndex(); |
| } |
| |
| private static long acceptAffixPatternOffset(int cp, ParserState state, StateItem item) { |
| return acceptAffixPattern( |
| cp, item.returnTo1, state, item, item.currentAffixPattern, item.currentStepwiseParserTag); |
| } |
| |
| /** |
| * Accepts a code point if the code point is compatible with the affix pattern at the offset |
| * encoded in the tag argument. |
| * |
| * @param cp The current code point, which will be checked for a match to the string. |
| * @param returnTo The state to return to after reaching the end of the string. |
| * @param state The current {@link ParserState} |
| * @param item The current {@link StateItem} |
| * @param str The string containing the affix pattern. |
| * @param tag The current state of the stepwise parser. Initial value should be 0L. |
| * @return A bitmask where the bits correspond to the items that were added. Set to 0L if no items |
| * were added. |
| */ |
| private static long acceptAffixPattern( |
| int cp, StateName ret1, ParserState state, StateItem item, CharSequence str, long tag) { |
| if (str == null || str.length() == 0) return 0L; |
| return acceptStringOrAffixPatternWithIgnorables( |
| cp, ret1, null, state, item, str, tag, false, false); |
| } |
| |
| private static long acceptAffixPatternNonIgnorable( |
| int cp, |
| StateName returnTo, |
| ParserState state, |
| StateItem item, |
| CharSequence str, |
| int typeOrCp, |
| long firstTag, |
| long nextTag) { |
| |
| // Convert from the returned tag to a code point, string, or currency to check |
| int resolvedCp = -1; |
| CharSequence resolvedStr = null; |
| boolean resolvedMinusSign = false; |
| boolean resolvedPlusSign = false; |
| boolean resolvedCurrency = false; |
| if (typeOrCp < 0) { |
| // Symbol |
| switch (typeOrCp) { |
| case AffixUtils.TYPE_MINUS_SIGN: |
| resolvedMinusSign = true; |
| break; |
| case AffixUtils.TYPE_PLUS_SIGN: |
| resolvedPlusSign = true; |
| break; |
| case AffixUtils.TYPE_PERCENT: |
| resolvedStr = state.symbols.getPercentString(); |
| if (resolvedStr.length() != 1 || resolvedStr.charAt(0) != '%') { |
| resolvedCp = '%'; // accept ASCII percent as well as locale percent |
| } |
| break; |
| case AffixUtils.TYPE_PERMILLE: |
| resolvedStr = state.symbols.getPerMillString(); |
| if (resolvedStr.length() != 1 || resolvedStr.charAt(0) != '‰') { |
| resolvedCp = '‰'; // accept ASCII permille as well as locale permille |
| } |
| break; |
| case AffixUtils.TYPE_CURRENCY_SINGLE: |
| case AffixUtils.TYPE_CURRENCY_DOUBLE: |
| case AffixUtils.TYPE_CURRENCY_TRIPLE: |
| case AffixUtils.TYPE_CURRENCY_QUAD: |
| case AffixUtils.TYPE_CURRENCY_QUINT: |
| case AffixUtils.TYPE_CURRENCY_OVERFLOW: |
| resolvedCurrency = true; |
| break; |
| default: |
| throw new AssertionError(); |
| } |
| } else { |
| resolvedCp = typeOrCp; |
| } |
| |
| long added = 0L; |
| if (resolvedCp >= 0 && codePointEquals(cp, resolvedCp, state)) { |
| if (firstTag >= 0) { |
| added |= acceptAffixPatternHelper(cp, returnTo, state, item, str, firstTag); |
| } |
| if (nextTag < 0) { |
| added |= acceptAffixPatternHelper(cp, returnTo, state, item, str, nextTag); |
| } |
| } |
| if (resolvedMinusSign) { |
| if (firstTag >= 0) { |
| added |= acceptMinusSign(cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item, false); |
| } |
| if (nextTag < 0) { |
| added |= acceptMinusSign(cp, returnTo, null, state, item, false); |
| } |
| if (added == 0L) { |
| // Also attempt to accept custom minus sign string |
| String mss = state.symbols.getMinusSignString(); |
| int mssCp = Character.codePointAt(mss, 0); |
| if (mss.length() != Character.charCount(mssCp) || !UNISET_MINUS.contains(mssCp)) { |
| resolvedStr = mss; |
| } |
| } |
| } |
| if (resolvedPlusSign) { |
| if (firstTag >= 0) { |
| added |= acceptPlusSign(cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item, false); |
| } |
| if (nextTag < 0) { |
| added |= acceptPlusSign(cp, returnTo, null, state, item, false); |
| } |
| if (added == 0L) { |
| // Also attempt to accept custom plus sign string |
| String pss = state.symbols.getPlusSignString(); |
| int pssCp = Character.codePointAt(pss, 0); |
| if (pss.length() != Character.charCount(pssCp) || !UNISET_MINUS.contains(pssCp)) { |
| resolvedStr = pss; |
| } |
| } |
| } |
| if (resolvedStr != null) { |
| if (firstTag >= 0) { |
| added |= |
| acceptString( |
| cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item, resolvedStr, 0, false); |
| } |
| if (nextTag < 0) { |
| added |= acceptString(cp, returnTo, null, state, item, resolvedStr, 0, false); |
| } |
| } |
| if (resolvedCurrency) { |
| if (firstTag >= 0) { |
| added |= acceptCurrency(cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item); |
| } |
| if (nextTag < 0) { |
| added |= acceptCurrency(cp, returnTo, null, state, item); |
| } |
| } |
| |
| // Set state in the items that were added by the function calls |
| for (int i = Long.numberOfTrailingZeros(added); (1L << i) <= added; i++) { |
| if (((1L << i) & added) != 0) { |
| state.getItem(i).currentAffixPattern = str; |
| state.getItem(i).currentStepwiseParserTag = firstTag; |
| } |
| } |
| return added; |
| } |
| |
| /** |
| * Internal method that is used to step to the next token of a affix pattern or exit the affix |
| * pattern if at the end. |
| * |
| * @param cp See {@link #acceptAffixPattern} |
| * @param returnTo1 See {@link #acceptAffixPattern} |
| * @param state See {@link #acceptAffixPattern} |
| * @param item See {@link #acceptAffixPattern} |
| * @param str See {@link #acceptAffixPattern} |
| * @param newOffset The tag corresponding to the next token in the affix pattern that should be |
| * recorded and consumed in a future call to {@link #acceptAffixPatternOffset}. |
| * @return Bitmask containing one entry, the one that was added. |
| */ |
| private static long acceptAffixPatternHelper( |
| int cp, |
| StateName returnTo, |
| ParserState state, |
| StateItem item, |
| CharSequence str, |
| long newTag) { |
| StateItem next = state.getNext().copyFrom(item, null, cp); |
| next.score += 1; // reward for consuming a cp from pattern |
| if (newTag >= 0) { |
| // Additional tokens in affix string. |
| next.name = StateName.INSIDE_AFFIX_PATTERN; |
| next.returnTo1 = returnTo; |
| next.currentAffixPattern = str; |
| next.currentStepwiseParserTag = newTag; |
| } else { |
| // Reached last token in affix string. |
| next.name = returnTo; |
| next.trailingCount = 0; |
| next.returnTo1 = null; |
| } |
| return 1L << state.lastInsertedIndex(); |
| } |
| |
| /** |
| * Consumes tokens from a string or affix pattern following ICU's rules for handling of whitespace |
| * and bidi control characters (collectively called "ignorables"). The methods {@link |
| * #acceptStringHelper}, {@link #acceptAffixPatternHelper}, {@link #acceptStringNonIgnorable}, and |
| * {@link #acceptAffixPatternNonIgnorable} will be called by this method to actually add parse |
| * paths. |
| * |
| * <p>In the "NonIgnorable" functions, two arguments are passed: firstOffsetOrTag and |
| * nextOffsetOrTag. These two arguments should add parse paths according to the following rules: |
| * |
| * <pre> |
| * if (firstOffsetOrTag is valid or inside string boundary) { |
| * // Add parse path going to firstOffsetOrTag |
| * } |
| * if (nextOffsetOrTag is invalid or beyond string boundary) { |
| * // Add parse path leaving the string |
| * } |
| * </pre> |
| * |
| * <p>Note that there may be multiple parse paths added by these lines. This is important in order |
| * to properly handle runs of ignorables. |
| * |
| * @param cp See {@link #acceptString} and {@link #acceptAffixPattern} |
| * @param ret1 See {@link #acceptString} and {@link #acceptAffixPattern} |
| * @param ret2 See {@link #acceptString} (affix pattern can pass null) |
| * @param state See {@link #acceptString} and {@link #acceptAffixPattern} |
| * @param item See {@link #acceptString} and {@link #acceptAffixPattern} |
| * @param str See {@link #acceptString} and {@link #acceptAffixPattern} |
| * @param offsetOrTag The current int offset for strings, or the current tag for affix patterns. |
| * @param trailing See {@link #acceptString} (affix patterns can pass false) |
| * @param isString true if the parameters correspond to a string; false if they correspond to an |
| * affix pattern. |
| * @return A bitmask containing the entries that were added. |
| */ |
| private static long acceptStringOrAffixPatternWithIgnorables( |
| int cp, |
| StateName ret1, |
| StateName ret2 /* String only */, |
| ParserState state, |
| StateItem item, |
| CharSequence str, |
| long offsetOrTag /* offset for string; tag for affix pattern */, |
| boolean trailing /* String only */, |
| boolean isString) { |
| |
| // Runs of ignorables (whitespace and bidi control marks) can occur at the beginning, middle, |
| // or end of the reference string, or a run across the entire string. |
| // |
| // - A run at the beginning or in the middle corresponds to a run of length *zero or more* |
| // in the input. |
| // - A run at the end need to be matched exactly. |
| // - A string that contains only ignorable characters also needs to be matched exactly. |
| // |
| // Because the behavior differs, we need logic here to determine which case we have. |
| |
| int typeOrCp = |
| isString |
| ? Character.codePointAt(str, (int) offsetOrTag) |
| : AffixUtils.getTypeOrCp(offsetOrTag); |
| |
| if (isIgnorable(typeOrCp, state)) { |
| // Look for the next nonignorable code point |
| int nextTypeOrCp = typeOrCp; |
| long prevOffsetOrTag; |
| long nextOffsetOrTag = offsetOrTag; |
| long firstOffsetOrTag = 0L; |
| while (true) { |
| prevOffsetOrTag = nextOffsetOrTag; |
| nextOffsetOrTag = |
| isString |
| ? nextOffsetOrTag + Character.charCount(nextTypeOrCp) |
| : AffixUtils.nextToken(nextOffsetOrTag, str); |
| if (firstOffsetOrTag == 0L) firstOffsetOrTag = nextOffsetOrTag; |
| if (isString ? nextOffsetOrTag >= str.length() : nextOffsetOrTag < 0) { |
| // Integer.MIN_VALUE is an invalid value for either a type or a cp; |
| // use it to indicate the end of the string. |
| nextTypeOrCp = Integer.MIN_VALUE; |
| break; |
| } |
| nextTypeOrCp = |
| isString |
| ? Character.codePointAt(str, (int) nextOffsetOrTag) |
| : AffixUtils.getTypeOrCp(nextOffsetOrTag); |
| if (!isIgnorable(nextTypeOrCp, state)) break; |
| } |
| |
| if (nextTypeOrCp == Integer.MIN_VALUE) { |
| // Run at end or string that contains only ignorable characters. |
| if (codePointEquals(cp, typeOrCp, state)) { |
| // Step forward and also exit the string if not at very end. |
| // RETURN |
| long added = 0L; |
| added |= |
| isString |
| ? acceptStringHelper( |
| cp, ret1, ret2, state, item, str, (int) firstOffsetOrTag, trailing) |
| : acceptAffixPatternHelper(cp, ret1, state, item, str, firstOffsetOrTag); |
| if (firstOffsetOrTag != nextOffsetOrTag) { |
| added |= |
| isString |
| ? acceptStringHelper( |
| cp, ret1, ret2, state, item, str, (int) nextOffsetOrTag, trailing) |
| : acceptAffixPatternHelper(cp, ret1, state, item, str, nextOffsetOrTag); |
| } |
| return added; |
| } else { |
| // Code point does not exactly match the run at end. |
| // RETURN |
| return 0L; |
| } |
| } else { |
| // Run at beginning or in middle. |
| if (isIgnorable(cp, state)) { |
| // Consume the ignorable. |
| // RETURN |
| return isString |
| ? acceptStringHelper( |
| cp, ret1, ret2, state, item, str, (int) prevOffsetOrTag, trailing) |
| : acceptAffixPatternHelper(cp, ret1, state, item, str, prevOffsetOrTag); |
| } else { |
| // Go to nonignorable cp. |
| // FALL THROUGH |
| } |
| } |
| |
| // Fall through to the nonignorable code point found above. |
| assert nextTypeOrCp != Integer.MIN_VALUE; |
| typeOrCp = nextTypeOrCp; |
| offsetOrTag = nextOffsetOrTag; |
| } |
| assert !isIgnorable(typeOrCp, state); |
| |
| // Look for the next nonignorable code point after this nonignorable code point |
| // to determine if we are at the end of the string. |
| int nextTypeOrCp = typeOrCp; |
| long nextOffsetOrTag = offsetOrTag; |
| long firstOffsetOrTag = 0L; |
| while (true) { |
| nextOffsetOrTag = |
| isString |
| ? nextOffsetOrTag + Character.charCount(nextTypeOrCp) |
| : AffixUtils.nextToken(nextOffsetOrTag, str); |
| if (firstOffsetOrTag == 0L) firstOffsetOrTag = nextOffsetOrTag; |
| if (isString ? nextOffsetOrTag >= str.length() : nextOffsetOrTag < 0) { |
| nextTypeOrCp = -1; |
| break; |
| } |
| nextTypeOrCp = |
| isString |
| ? Character.codePointAt(str, (int) nextOffsetOrTag) |
| : AffixUtils.getTypeOrCp(nextOffsetOrTag); |
| if (!isIgnorable(nextTypeOrCp, state)) break; |
| } |
| |
| // Nonignorable logic. |
| return isString |
| ? acceptStringNonIgnorable( |
| cp, ret1, ret2, state, item, str, trailing, typeOrCp, firstOffsetOrTag, nextOffsetOrTag) |
| : acceptAffixPatternNonIgnorable( |
| cp, ret1, state, item, str, typeOrCp, firstOffsetOrTag, nextOffsetOrTag); |
| } |
| |
| /** |
| * This method can add up to four items to the new list in <code>state</code>. |
| * |
| * <p>If <code>cp</code> is equal to any known ISO code or long name, copies <code>item</code> to |
| * the new list in <code>state</code> and sets its ISO code to the corresponding currency. |
| * |
| * <p>If <code>cp</code> is the first code point of any ISO code or long name having more them one |
| * code point in length, copies <code>item</code> to the new list in <code>state</code> along with |
| * an instance of {@link TextTrieMap.ParseState} for tracking the following code points. |
| * |
| * @param cp The code point to check. |
| * @param state The state object to update. |
| * @param item The old state leading into the code point. |
| */ |
| private static void acceptCurrency( |
| int cp, StateName nextName, ParserState state, StateItem item) { |
| acceptCurrency(cp, nextName, null, state, item); |
| } |
| |
| private static long acceptCurrency( |
| int cp, StateName returnTo1, StateName returnTo2, ParserState state, StateItem item) { |
| if (item.sawCurrency) return 0L; |
| long added = 0L; |
| |
| // Accept from local currency information |
| String str1, str2; |
| Currency currency = state.properties.getCurrency(); |
| if (currency != null) { |
| str1 = currency.getName(state.symbols.getULocale(), Currency.SYMBOL_NAME, null); |
| str2 = currency.getCurrencyCode(); |
| // TODO: Should we also accept long names? In currency mode, they are in the CLDR data. |
| } else { |
| currency = state.symbols.getCurrency(); |
| str1 = state.symbols.getCurrencySymbol(); |
| str2 = state.symbols.getInternationalCurrencySymbol(); |
| } |
| added |= acceptString(cp, returnTo1, returnTo2, state, item, str1, 0, false); |
| added |= acceptString(cp, returnTo1, returnTo2, state, item, str2, 0, false); |
| for (int i = Long.numberOfTrailingZeros(added); (1L << i) <= added; i++) { |
| if (((1L << i) & added) != 0) { |
| state.getItem(i).sawCurrency = true; |
| state.getItem(i).isoCode = str2; |
| } |
| } |
| |
| // Accept from CLDR data |
| if (state.parseCurrency) { |
| ULocale uloc = state.symbols.getULocale(); |
| TextTrieMap<Currency.CurrencyStringInfo>.ParseState trie1 = |
| Currency.openParseState(uloc, cp, Currency.LONG_NAME); |
| TextTrieMap<Currency.CurrencyStringInfo>.ParseState trie2 = |
| Currency.openParseState(uloc, cp, Currency.SYMBOL_NAME); |
| added |= acceptCurrencyHelper(cp, returnTo1, returnTo2, state, item, trie1); |
| added |= acceptCurrencyHelper(cp, returnTo1, returnTo2, state, item, trie2); |
| } |
| |
| return added; |
| } |
| |
| /** |
| * If <code>cp</code> is the next code point of any currency, copies <code>item</code> to the new |
| * list in <code>state</code> along with an instance of {@link TextTrieMap.ParseState} for |
| * tracking the following code points. |
| * |
| * <p>This method should only be called in a state following {@link #acceptCurrency}. |
| * |
| * @param cp The code point to check. |
| * @param state The state object to update. |
| * @param item The old state leading into the code point. |
| */ |
| private static void acceptCurrencyOffset(int cp, ParserState state, StateItem item) { |
| acceptCurrencyHelper( |
| cp, item.returnTo1, item.returnTo2, state, item, item.currentCurrencyTrieState); |
| } |
| |
| private static long acceptCurrencyHelper( |
| int cp, |
| StateName returnTo1, |
| StateName returnTo2, |
| ParserState state, |
| StateItem item, |
| TextTrieMap<Currency.CurrencyStringInfo>.ParseState trieState) { |
| if (trieState == null) return 0L; |
| trieState.accept(cp); |
| long added = 0L; |
| Iterator<Currency.CurrencyStringInfo> currentMatches = trieState.getCurrentMatches(); |
| if (currentMatches != null) { |
| // Match on current code point |
| // TODO: What should happen with multiple currency matches? |
| StateItem next = state.getNext().copyFrom(item, returnTo1, -1); |
| next.returnTo1 = returnTo2; |
| next.returnTo2 = null; |
| next.sawCurrency = true; |
| next.isoCode = currentMatches.next().getISOCode(); |
| added |= 1L << state.lastInsertedIndex(); |
| } |
| if (!trieState.atEnd()) { |
| // Prepare for matches on future code points |
| StateItem next = state.getNext().copyFrom(item, StateName.INSIDE_CURRENCY, -1); |
| next.returnTo1 = returnTo1; |
| next.returnTo2 = returnTo2; |
| next.currentCurrencyTrieState = trieState; |
| added |= 1L << state.lastInsertedIndex(); |
| } |
| return added; |
| } |
| |
| private static long acceptDigitTrie( |
| int cp, StateName nextName, ParserState state, StateItem item, DigitType type) { |
| assert state.digitTrie != null; |
| TextTrieMap<Byte>.ParseState trieState = state.digitTrie.openParseState(cp); |
| if (trieState == null) return 0L; |
| return acceptDigitTrieHelper(cp, nextName, state, item, type, trieState); |
| } |
| |
| private static void acceptDigitTrieOffset(int cp, ParserState state, StateItem item) { |
| acceptDigitTrieHelper( |
| cp, item.returnTo1, state, item, item.currentDigitType, item.currentDigitTrieState); |
| } |
| |
| private static long acceptDigitTrieHelper( |
| int cp, |
| StateName returnTo1, |
| ParserState state, |
| StateItem item, |
| DigitType type, |
| TextTrieMap<Byte>.ParseState trieState) { |
| if (trieState == null) return 0L; |
| trieState.accept(cp); |
| long added = 0L; |
| Iterator<Byte> currentMatches = trieState.getCurrentMatches(); |
| if (currentMatches != null) { |
| // Match on current code point |
| byte digit = currentMatches.next(); |
| StateItem next = state.getNext().copyFrom(item, returnTo1, -1); |
| next.returnTo1 = null; |
| recordDigit(next, digit, type); |
| added |= 1L << state.lastInsertedIndex(); |
| } |
| if (!trieState.atEnd()) { |
| // Prepare for matches on future code points |
| StateItem next = state.getNext().copyFrom(item, StateName.INSIDE_DIGIT, -1); |
| next.returnTo1 = returnTo1; |
| next.currentDigitTrieState = trieState; |
| next.currentDigitType = type; |
| added |= 1L << state.lastInsertedIndex(); |
| } |
| return added; |
| } |
| |
| /** |
| * Checks whether the two given code points are equal after applying case mapping as requested in |
| * the ParserState. |
| * |
| * @see #acceptString |
| * @see #acceptAffixPattern |
| */ |
| private static boolean codePointEquals(int cp1, int cp2, ParserState state) { |
| if (!state.caseSensitive) { |
| cp1 = UCharacter.foldCase(cp1, true); |
| cp2 = UCharacter.foldCase(cp2, true); |
| } |
| return cp1 == cp2; |
| } |
| |
| /** |
| * Checks whether the given code point is "ignorable" and should be skipped. BiDi control marks |
| * are always ignorable, and whitespace is ignorable in lenient mode. |
| * |
| * <p>Returns false if cp is negative. |
| * |
| * @param cp The code point to test. |
| * @param state The current {@link ParserState}, used for determining strict mode. |
| * @return true if cp is ignorable; false otherwise. |
| */ |
| private static boolean isIgnorable(int cp, ParserState state) { |
| if (cp < 0) return false; |
| if (UNISET_BIDI.contains(cp)) return true; |
| return state.mode == ParseMode.LENIENT && UNISET_WHITESPACE.contains(cp); |
| } |
| } |