// blob: 8fbaf28e9f070868d89f323f0fcc4aa8070d6dd1 [file] [log] [blame]
/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package android.icu.impl.number;
import java.math.BigDecimal;
import java.math.MathContext;
import java.text.ParseException;
import java.text.ParsePosition;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import android.icu.impl.StandardPlural;
import android.icu.impl.TextTrieMap;
import android.icu.lang.UCharacter;
import android.icu.text.CurrencyPluralInfo;
import android.icu.text.DecimalFormatSymbols;
import android.icu.text.NumberFormat;
import android.icu.text.UnicodeSet;
import android.icu.util.Currency;
import android.icu.util.Currency.CurrencyStringInfo;
import android.icu.util.CurrencyAmount;
import android.icu.util.ULocale;
/**
* A parser designed to convert an arbitrary human-generated string to its best representation as a
* number: a long, a BigInteger, or a BigDecimal.
*
* <p>The parser may traverse multiple parse paths in the same strings if there is ambiguity. For
* example, the string "12,345.67" has two main interpretations: it could be "12.345" in a locale
* that uses '.' as the grouping separator, or it could be "12345.67" in a locale that uses ',' as
* the grouping separator. Since the second option has a longer parse path (consumes more of the
* input string), the parser will accept the second option.
* @hide Only a subset of ICU is exposed in Android
*/
public class Parse {
/**
* Controls the set of rules for parsing a string.
*
* <p>When the property bag does not specify a mode, the parser falls back to {@link #LENIENT}.
*/
public static enum ParseMode {
/**
* Lenient mode should be used if you want to accept malformed user input. It will use
* heuristics to attempt to parse through typographical errors in the string.
*
* <p>This is the mode used when no parse mode has been set on the properties.
*/
LENIENT,
/**
* Strict mode should be used if you want to require that the input is well-formed. More
* specifically, it differs from lenient mode in the following ways:
*
* <ul>
* <li>Grouping widths must match the grouping settings. For example, "12,3,45" will fail if
* the grouping width is 3, as in the pattern "#,##0".
* <li>The string must contain a complete prefix and suffix. For example, if the pattern is
* "{#};(#)", then "{123}" or "(123)" would match, but "{123", "123}", and "123" would all
* fail. (The latter strings would be accepted in lenient mode.)
* <li>Whitespace may not appear at arbitrary places in the string. In lenient mode,
* whitespace is allowed to occur arbitrarily before and after prefixes and exponent
* separators.
* <li>Leading grouping separators are not allowed, as in ",123".
* <li>Minus and plus signs can only appear if specified in the pattern. In lenient mode, a
* plus or minus sign can always precede a number.
* <li>The set of characters that can be interpreted as a decimal or grouping separator is
* smaller.
* <li><strong>If currency parsing is enabled,</strong> currencies must only appear where
* specified in either the current pattern string or in a valid pattern string for the
* current locale. For example, if the pattern is "¤0.00", then "$1.23" would match, but
* "1.23$" would fail to match.
* </ul>
*/
STRICT,
/**
* Fast mode should be used in applications that don't require prefixes and suffixes to match.
*
* <p>In addition to ignoring prefixes and suffixes, fast mode performs the following
* optimizations:
*
* <ul>
* <li>Ignores digit strings from {@link DecimalFormatSymbols} and only uses the code point's
* Unicode digit property. If you are not using custom digit strings, this should not
* cause a change in behavior.
* <li>Instead of traversing multiple possible parse paths, a "greedy" parsing strategy is
* used, which might mean that fast mode won't accept strings that lenient or strict mode
* would accept. Since prefix and suffix strings are ignored, this is not an issue unless
* you are using custom symbols.
* </ul>
*
* <p>The digit trie and the affix holders are only built for lenient and strict mode; they are
* not set up in fast mode.
*/
FAST,
}
/**
* An enum containing the choices for strategy in parsing when choosing between grouping and
* decimal separators.
*
* <p>When the property bag does not specify a grouping mode, the parser uses {@link #DEFAULT}.
*/
public static enum GroupingMode {
/**
* Accept decimal equivalents as decimals, and if that fails, accept all equivalence classes
* (periods, commas, and whitespace-like) as grouping. This is a more lenient strategy.
*
* <p>For example, if the formatter's current locale is <em>fr-FR</em>, then "1.234" will parse
* as 1234, even though <em>fr-FR</em> does not use a period as the grouping separator.
*/
DEFAULT,
/**
* Accept decimal equivalents as decimals and grouping equivalents as grouping. This strategy is
* more strict.
*
* <p>For example, if the formatter's current locale is <em>fr-FR</em>, then "1.234" will fail
* to parse since <em>fr-FR</em> does not use a period as the grouping separator.
*/
RESTRICTED
}
/**
* The names of the states that a single parse path (a {@link StateItem}) can be in while the
* parser walks the input one code point at a time.
*
* <p>{@link #BEFORE_PREFIX} is the initial state: it is assigned by {@link StateItem#clear} and to
* the first state item created for a parse. The remaining names presumably describe the position
* of the path relative to the number's prefix, digits, exponent and suffix, or the kind of
* multi-code-point token currently being matched; see {@code _parse} for the transitions.
*/
private static enum StateName {
// Initial state of every parse path.
BEFORE_PREFIX,
AFTER_PREFIX,
AFTER_INTEGER_DIGIT,
AFTER_FRACTION_DIGIT,
AFTER_EXPONENT_SEPARATOR,
AFTER_EXPONENT_DIGIT,
BEFORE_SUFFIX,
BEFORE_SUFFIX_SEEN_EXPONENT,
AFTER_SUFFIX,
INSIDE_CURRENCY,
INSIDE_DIGIT,
// StateItem#toString prints currentString and currentOffset for items in this state.
INSIDE_STRING,
// StateItem#toString prints currentAffixPattern and the stepwise parser offset for this state.
INSIDE_AFFIX_PATTERN;
}
// This set was decided after discussion with icu-design@. See ticket #13309.
// Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
private static final UnicodeSet UNISET_WHITESPACE =
new UnicodeSet("[[:Zs:][\\u0009]]").freeze();
// BiDi characters are skipped over and ignored at any point in the string, even in strict mode.
private static final UnicodeSet UNISET_BIDI =
new UnicodeSet("[[\\u200E\\u200F\\u061C]]").freeze();
// TODO: Re-generate these sets from the database. They probably haven't been updated in a while.
// Period-like code points consulted by SeparatorType.fromCp in lenient mode.
private static final UnicodeSet UNISET_PERIOD_LIKE =
new UnicodeSet("[.\\u2024\\u3002\\uFE12\\uFE52\\uFF0E\\uFF61]").freeze();
// Smaller period-like set consulted by SeparatorType.fromCp in strict mode.
private static final UnicodeSet UNISET_STRICT_PERIOD_LIKE =
new UnicodeSet("[.\\u2024\\uFE52\\uFF0E\\uFF61]").freeze();
// Comma-like code points consulted by SeparatorType.fromCp in lenient mode.
private static final UnicodeSet UNISET_COMMA_LIKE =
new UnicodeSet("[,\\u060C\\u066B\\u3001\\uFE10\\uFE11\\uFE50\\uFE51\\uFF0C\\uFF64]").freeze();
// Smaller comma-like set consulted by SeparatorType.fromCp in strict mode.
private static final UnicodeSet UNISET_STRICT_COMMA_LIKE =
new UnicodeSet("[,\\u066B\\uFE10\\uFE50\\uFF0C]").freeze();
// Grouping separators that are neither comma-like nor period-like (space, apostrophes, and
// similar). SeparatorType.fromCp consults this same set in both strict and lenient mode.
private static final UnicodeSet UNISET_OTHER_GROUPING_SEPARATORS =
new UnicodeSet(
"[\\ '\\u00A0\\u066C\\u2000-\\u200A\\u2018\\u2019\\u202F\\u205F\\u3000\\uFF07]")
.freeze();
// For parse return value calculation.
// StateItem.toNumber compares its result against these bounds to decide whether it fits a long.
private static final BigDecimal MIN_LONG_AS_BIG_DECIMAL = new BigDecimal(Long.MIN_VALUE);
private static final BigDecimal MAX_LONG_AS_BIG_DECIMAL = new BigDecimal(Long.MAX_VALUE);
/**
 * The equivalence class that a separator code point belongs to. Used to classify the locale's
 * decimal and grouping separators before parsing starts.
 */
private enum SeparatorType {
  COMMA_LIKE,
  PERIOD_LIKE,
  OTHER_GROUPING,
  UNKNOWN;

  /**
   * Classifies a code point according to the separator sets that apply to the given parse mode.
   *
   * @param cp The code point to classify.
   * @param mode The parse mode. FAST never classifies (always {@link #UNKNOWN}); STRICT uses the
   *     smaller strict comma/period sets; any other value uses the lenient sets.
   * @return The first matching class, checked in the order comma-like, period-like, other
   *     grouping; {@link #UNKNOWN} if none match.
   */
  static SeparatorType fromCp(int cp, ParseMode mode) {
    if (mode == ParseMode.FAST) {
      return UNKNOWN;
    }

    // Strict and lenient parsing differ only in which comma/period sets are consulted.
    final boolean strict = (mode == ParseMode.STRICT);
    final UnicodeSet commaSet = strict ? UNISET_STRICT_COMMA_LIKE : UNISET_COMMA_LIKE;
    final UnicodeSet periodSet = strict ? UNISET_STRICT_PERIOD_LIKE : UNISET_PERIOD_LIKE;

    final SeparatorType result;
    if (commaSet.contains(cp)) {
      result = COMMA_LIKE;
    } else if (periodSet.contains(cp)) {
      result = PERIOD_LIKE;
    } else if (UNISET_OTHER_GROUPING_SEPARATORS.contains(cp)) {
      result = OTHER_GROUPING;
    } else {
      result = UNKNOWN;
    }
    return result;
  }
}
/**
* The position of a digit within the number being parsed; passed to StateItem.appendDigit to
* select how the digit is accumulated.
*/
private static enum DigitType {
// Appended to the decimal quantity as an integer digit.
INTEGER,
// Appended as a fraction digit; zeros are held back as trailing zeros until a nonzero digit.
FRACTION,
// Accumulated into the integer exponent instead of the decimal quantity.
EXPONENT
}
/**
 * Holds a snapshot in time of a single parse path. This includes the digits seen so far, the
 * current state name, and other properties like the grouping separator used on this parse path,
 * details about the exponent and negative signs, etc.
 *
 * <p>Instances are mutable and are re-used: a holder of a dirty instance must call either {@link
 * #clear} or {@link #copyFrom} before relying on its contents.
 */
private static class StateItem {
  // Parser state:
  // "trailingCount" keeps track of how many chars from the end of the consumed input are
  // ignorable and should be removed from the parse position should this item be accepted.
  // "score" is used to help rank two otherwise equivalent parse paths. Currently, the only
  // function giving points to the score is prefix/suffix.
  StateName name;
  int trailingCount;
  int score;

  // Numerical value:
  DecimalQuantity_DualStorageBCD fq = new DecimalQuantity_DualStorageBCD();
  int numDigits;
  // Fraction zeros seen since the last nonzero fraction digit; flushed by appendDigit().
  int trailingZeros;
  // Magnitude of the exponent. Integer.MAX_VALUE is a sentinel meaning "overflowed".
  int exponent;

  // Other items that we've seen:
  int groupingCp;
  long groupingWidths;
  String isoCode;
  boolean sawNegative;
  boolean sawNegativeExponent;
  boolean sawCurrency;
  boolean sawNaN;
  boolean sawInfinity;
  AffixHolder affix;
  boolean sawPrefix;
  boolean sawSuffix;
  boolean sawDecimalPoint;
  boolean sawExponentDigit;

  // Data for intermediate parsing steps:
  StateName returnTo1;
  StateName returnTo2;
  // For string literals:
  CharSequence currentString;
  int currentOffset;
  boolean currentTrailing;
  // For affix patterns:
  CharSequence currentAffixPattern;
  long currentStepwiseParserTag;
  // For currency:
  TextTrieMap<CurrencyStringInfo>.ParseState currentCurrencyTrieState;
  // For multi-code-point digits:
  TextTrieMap<Byte>.ParseState currentDigitTrieState;
  DigitType currentDigitType;

  // Identification for path tracing:
  final char id;
  String path;

  /**
   * Creates a dirty item; {@link #clear} or {@link #copyFrom} must be called before use.
   *
   * @param _id A one-character label used when tracing parse paths.
   */
  StateItem(char _id) {
    id = _id;
  }

  /**
   * Clears the instance so that it can be re-used. The state name is reset to {@link
   * StateName#BEFORE_PREFIX} and the grouping code point to -1 (none seen).
   *
   * @return Myself, for chaining.
   */
  StateItem clear() {
    // Parser state:
    name = StateName.BEFORE_PREFIX;
    trailingCount = 0;
    score = 0;

    // Numerical value:
    fq.clear();
    numDigits = 0;
    trailingZeros = 0;
    exponent = 0;

    // Other items we've seen:
    groupingCp = -1;
    groupingWidths = 0L;
    isoCode = null;
    sawNegative = false;
    sawNegativeExponent = false;
    sawCurrency = false;
    sawNaN = false;
    sawInfinity = false;
    affix = null;
    sawPrefix = false;
    sawSuffix = false;
    sawDecimalPoint = false;
    sawExponentDigit = false;

    // Data for intermediate parsing steps:
    returnTo1 = null;
    returnTo2 = null;
    currentString = null;
    currentOffset = 0;
    currentTrailing = false;
    currentAffixPattern = null;
    currentStepwiseParserTag = 0L;
    currentCurrencyTrieState = null;
    currentDigitTrieState = null;
    currentDigitType = null;

    // Identification for path tracing:
    // id is constant and is not cleared
    path = "";

    return this;
  }

  /**
   * Sets the internal value of this instance equal to another instance.
   *
   * <p>newName and trailing are required as parameters to this function because every time a code
   * point is consumed and a state item is copied, both of the corresponding fields should be
   * updated; it would be an error if they weren't updated.
   *
   * @param other The instance to copy from.
   * @param newName The state name that the new copy should take on.
   * @param trailing If non-negative, the code point to record as trailing (its UTF-16 width is
   *     added to the trailing count); if negative, reset the trailing count to zero.
   * @return Myself, for chaining.
   */
  StateItem copyFrom(StateItem other, StateName newName, int trailing) {
    // Parser state:
    name = newName;
    score = other.score;

    // Either reset trailingCount or add the width of the current code point.
    trailingCount = (trailing < 0) ? 0 : other.trailingCount + Character.charCount(trailing);

    // Numerical value:
    fq.copyFrom(other.fq);
    numDigits = other.numDigits;
    trailingZeros = other.trailingZeros;
    exponent = other.exponent;

    // Other items we've seen:
    groupingCp = other.groupingCp;
    groupingWidths = other.groupingWidths;
    isoCode = other.isoCode;
    sawNegative = other.sawNegative;
    sawNegativeExponent = other.sawNegativeExponent;
    sawCurrency = other.sawCurrency;
    sawNaN = other.sawNaN;
    sawInfinity = other.sawInfinity;
    affix = other.affix;
    sawPrefix = other.sawPrefix;
    sawSuffix = other.sawSuffix;
    sawDecimalPoint = other.sawDecimalPoint;
    sawExponentDigit = other.sawExponentDigit;

    // Data for intermediate parsing steps:
    returnTo1 = other.returnTo1;
    returnTo2 = other.returnTo2;
    currentString = other.currentString;
    currentOffset = other.currentOffset;
    currentTrailing = other.currentTrailing;
    currentAffixPattern = other.currentAffixPattern;
    currentStepwiseParserTag = other.currentStepwiseParserTag;
    currentCurrencyTrieState = other.currentCurrencyTrieState;
    currentDigitTrieState = other.currentDigitTrieState;
    currentDigitType = other.currentDigitType;

    // Record source node if debugging
    if (DEBUGGING) {
      path = other.path + other.id;
    }

    return this;
  }

  /**
   * Adds a digit to the internal representation of this instance.
   *
   * <p>Exponent digits are accumulated into {@link #exponent}, which saturates at {@code
   * Integer.MAX_VALUE} once the value no longer fits. Fraction zeros are counted but only written
   * to the quantity when a later nonzero fraction digit arrives.
   *
   * @param digit The digit that was read from the string.
   * @param type Whether the digit occurred in the integer part, the fraction part, or the
   *     exponent.
   */
  void appendDigit(byte digit, DigitType type) {
    if (type == DigitType.EXPONENT) {
      sawExponentDigit = true;
      // Accumulate in 64 bits. A 32-bit "exponent * 10 + digit" can wrap around to a positive
      // value that is still larger than the old exponent (500000000 * 10 wraps to 705032704), so
      // comparing the wrapped result against the old exponent does not reliably detect overflow.
      long newExponent = exponent * 10L + digit;
      if (newExponent >= Integer.MAX_VALUE) {
        // Saturate at the sentinel that toNumber() treats as exponent overflow.
        exponent = Integer.MAX_VALUE;
      } else {
        exponent = (int) newExponent;
      }
    } else {
      numDigits++;
      if (type == DigitType.FRACTION && digit == 0) {
        // Hold the zero back; it only matters if a nonzero fraction digit follows.
        trailingZeros++;
      } else if (type == DigitType.FRACTION) {
        fq.appendDigit(digit, trailingZeros, false);
        trailingZeros = 0;
      } else {
        fq.appendDigit(digit, 0, true);
      }
    }
  }

  /** @return Whether or not this item contains a valid number. */
  public boolean hasNumber() {
    return numDigits > 0 || sawNaN || sawInfinity;
  }

  /**
   * Converts the internal digits from this instance into a Number, preferring a Long, then a
   * BigInteger, then a BigDecimal. A Double is used for NaN, infinity, and -0.0, and also when
   * the exponent has overflowed (signed zero or signed infinity).
   *
   * @param properties The property bag; supplies the parse-to-BigDecimal flag, the multipliers
   *     (which are divided back out of the result) and the math context.
   * @return The Number. Never null.
   */
  Number toNumber(DecimalFormatProperties properties) {
    // Check for NaN, infinity, and -0.0
    if (sawNaN) {
      return Double.NaN;
    }
    if (sawInfinity) {
      if (sawNegative) {
        return Double.NEGATIVE_INFINITY;
      } else {
        return Double.POSITIVE_INFINITY;
      }
    }
    if (fq.isZero() && sawNegative) {
      return -0.0;
    }

    // Check for exponent overflow
    boolean forceBigDecimal = properties.getParseToBigDecimal();
    if (exponent == Integer.MAX_VALUE) {
      if (sawNegativeExponent && sawNegative) {
        return -0.0;
      } else if (sawNegativeExponent) {
        return 0.0;
      } else if (sawNegative) {
        return Double.NEGATIVE_INFINITY;
      } else {
        return Double.POSITIVE_INFINITY;
      }
    } else if (exponent > 1000) {
      // BigDecimals can handle huge values better than BigIntegers.
      forceBigDecimal = true;
    }

    // Multipliers must be applied in reverse.
    BigDecimal multiplier = properties.getMultiplier();
    if (properties.getMagnitudeMultiplier() != 0) {
      if (multiplier == null) multiplier = BigDecimal.ONE;
      multiplier = multiplier.scaleByPowerOfTen(properties.getMagnitudeMultiplier());
    }
    int delta = (sawNegativeExponent ? -1 : 1) * exponent;

    // We need to use a math context in order to prevent non-terminating decimal expansions.
    // This is only used when dividing by the multiplier.
    MathContext mc = RoundingUtils.getMathContextOr34Digits(properties);

    // Construct the output number.
    // This is the only step during fast-mode parsing that incurs object creations.
    BigDecimal result = fq.toBigDecimal();
    if (sawNegative) result = result.negate();
    result = result.scaleByPowerOfTen(delta);
    if (multiplier != null) {
      result = result.divide(multiplier, mc);
    }
    result = result.stripTrailingZeros();
    if (forceBigDecimal || result.scale() > 0) {
      return result;
    } else if (result.compareTo(MIN_LONG_AS_BIG_DECIMAL) >= 0
        && result.compareTo(MAX_LONG_AS_BIG_DECIMAL) <= 0) {
      return result.longValueExact();
    } else {
      return result.toBigIntegerExact();
    }
  }

  /**
   * Converts the internal digits to a number, and also associates the number with the parsed
   * currency. The ISO code must have been recorded on this item.
   *
   * @param properties The property bag, forwarded to {@link #toNumber}.
   * @return The CurrencyAmount. Never null.
   */
  public CurrencyAmount toCurrencyAmount(DecimalFormatProperties properties) {
    assert isoCode != null;
    Number number = toNumber(properties);
    Currency currency = Currency.getInstance(isoCode);
    return new CurrencyAmount(number, currency);
  }

  /** @return A one-line dump of this parse path, intended for debug tracing. */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("[");
    sb.append(path);
    sb.append("] ");
    sb.append(name.name());
    if (name == StateName.INSIDE_STRING) {
      sb.append("{");
      sb.append(currentString);
      sb.append(":");
      sb.append(currentOffset);
      sb.append("}");
    }
    if (name == StateName.INSIDE_AFFIX_PATTERN) {
      sb.append("{");
      sb.append(currentAffixPattern);
      sb.append(":");
      sb.append(AffixUtils.getOffset(currentStepwiseParserTag) - 1);
      sb.append("}");
    }
    sb.append(" ");
    sb.append(fq.toBigDecimal());
    sb.append(" grouping:");
    sb.append(groupingCp == -1 ? new char[] {'?'} : Character.toChars(groupingCp));
    sb.append(" widths:");
    sb.append(Long.toHexString(groupingWidths));
    sb.append(" seen:");
    sb.append(sawNegative ? 1 : 0);
    sb.append(sawNegativeExponent ? 1 : 0);
    sb.append(sawNaN ? 1 : 0);
    sb.append(sawInfinity ? 1 : 0);
    sb.append(sawPrefix ? 1 : 0);
    sb.append(sawSuffix ? 1 : 0);
    sb.append(sawDecimalPoint ? 1 : 0);
    sb.append(" trailing:");
    sb.append(trailingCount);
    sb.append(" score:");
    sb.append(score);
    sb.append(" affix:");
    sb.append(affix);
    sb.append(" currency:");
    sb.append(isoCode);
    return sb.toString();
  }
}
/**
 * Holds an ordered list of {@link StateItem} and other metadata about the string to be parsed.
 * There are two internal arrays of {@link StateItem}, which are swapped back and forth in order
 * to avoid object creations. The items in one array can be populated at the same time that items
 * in the other array are being read from.
 */
private static class ParserState {
  // Basic ParserStateItem lists:
  StateItem[] items = new StateItem[16];
  StateItem[] prevItems = new StateItem[16];
  int length;
  int prevLength;

  // Properties and Symbols memory:
  DecimalFormatProperties properties;
  DecimalFormatSymbols symbols;
  ParseMode mode;
  boolean caseSensitive;
  boolean parseCurrency;
  GroupingMode groupingMode;

  // Other pre-computed fields:
  // The "1" fields come from the regular decimal/grouping separator strings and the "2" fields
  // from the monetary ones; each holds the first code point of the corresponding string.
  int decimalCp1;
  int decimalCp2;
  int groupingCp1;
  int groupingCp2;
  SeparatorType decimalType1;
  SeparatorType decimalType2;
  SeparatorType groupingType1;
  SeparatorType groupingType2;
  TextTrieMap<Byte> digitTrie;
  Set<AffixHolder> affixHolders = new HashSet<AffixHolder>();

  /** Pre-allocates every {@link StateItem} in both arrays, labelled 'A', 'B', ... by index. */
  ParserState() {
    for (int i = 0; i < items.length; i++) {
      items[i] = new StateItem((char) ('A' + i));
      prevItems[i] = new StateItem((char) ('A' + i));
    }
  }

  /**
   * Clears the internal state in order to prepare for parsing a new string. Only the list
   * lengths, the digit trie and the affix holders are reset; the other fields keep their old
   * values until the caller overwrites them.
   *
   * @return Myself, for chaining.
   */
  ParserState clear() {
    length = 0;
    prevLength = 0;
    digitTrie = null;
    affixHolders.clear();
    return this;
  }

  /**
   * Swaps the internal arrays of {@link StateItem}. Sets the length of the primary list to zero,
   * so that it can be appended to.
   */
  void swap() {
    StateItem[] temp = prevItems;
    prevItems = items;
    items = temp;
    prevLength = length;
    length = 0;
  }

  /**
   * Swaps the internal arrays of {@link StateItem}. Sets the length of the primary list to the
   * length of the previous list, so that it can be read from.
   */
  void swapBack() {
    StateItem[] temp = prevItems;
    prevItems = items;
    items = temp;
    length = prevLength;
    prevLength = 0;
  }

  /**
   * Gets the next available {@link StateItem} from the primary list for writing. This method
   * should be thought of like a list append method, except that there are no object creations
   * taking place. When the list is already full, the last entry is handed out again and is
   * therefore overwritten.
   *
   * <p>It is the caller's responsibility to call either {@link StateItem#clear} or {@link
   * StateItem#copyFrom} on the returned object.
   *
   * @return A dirty {@link StateItem}.
   */
  StateItem getNext() {
    if (length >= items.length) {
      // TODO: What to do here? Expand the array?
      // This case is rare and would happen only with specially designed input.
      // For now, just overwrite the last entry.
      length = items.length - 1;
    }
    StateItem item = items[length];
    length++;
    return item;
  }

  /** @return The index of the last inserted StateItem via a call to {@link #getNext}. */
  public int lastInsertedIndex() {
    assert length > 0;
    return length - 1;
  }

  /**
   * Gets a {@link StateItem} from the primary list. Assumes that the item has already been added
   * via a call to {@link #getNext}.
   *
   * @param i The index of the item to get.
   * @return The item.
   */
  public StateItem getItem(int i) {
    assert i >= 0 && i < length;
    return items[i];
  }

  /** @return A one-line dump of the parser configuration, intended for debug tracing. */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("<ParseState mode:");
    sb.append(mode);
    sb.append(" caseSensitive:");
    sb.append(caseSensitive);
    sb.append(" parseCurrency:");
    sb.append(parseCurrency);
    sb.append(" groupingMode:");
    sb.append(groupingMode);
    // The separators are stored as code points, so append them as code points; a cast to char
    // would truncate any separator outside the Basic Multilingual Plane.
    sb.append(" decimalCps:");
    sb.appendCodePoint(decimalCp1);
    sb.appendCodePoint(decimalCp2);
    sb.append(" groupingCps:");
    sb.appendCodePoint(groupingCp1);
    sb.appendCodePoint(groupingCp2);
    sb.append(" affixes:");
    sb.append(affixHolders);
    sb.append(">");
    return sb.toString();
  }
}
/**
 * A wrapper for affixes. Affixes can be string-based or pattern-based, and they can come from
 * several sources, including the property bag and the locale patterns from CLDR data.
 *
 * <p>Instances are immutable. Two holders are equal when prefix, suffix, the string/pattern flag
 * and the negative flag all match.
 */
private static class AffixHolder {
  final String p; // prefix
  final String s; // suffix
  final boolean strings; // true: p and s are literal strings; false: they are affix patterns
  final boolean negative; // true: this affix pair marks a negative number

  static final AffixHolder EMPTY_POSITIVE = new AffixHolder("", "", true, false);
  static final AffixHolder EMPTY_NEGATIVE = new AffixHolder("", "", true, true);

  /**
   * Adds the affix holders derived from the property bag to the parser state: the positive and
   * negative pattern-based holders, plus the string-based overrides when they are set.
   *
   * @param state The parser state whose affix holder set is added to.
   * @param properties The property bag to read the affixes from.
   */
  static void addToState(ParserState state, DecimalFormatProperties properties) {
    AffixHolder pp = fromPropertiesPositivePattern(properties);
    AffixHolder np = fromPropertiesNegativePattern(properties);
    AffixHolder ps = fromPropertiesPositiveString(properties);
    AffixHolder ns = fromPropertiesNegativeString(properties);
    if (pp != null) state.affixHolders.add(pp);
    if (ps != null) state.affixHolders.add(ps);
    if (np != null) state.affixHolders.add(np);
    if (ns != null) state.affixHolders.add(ns);
  }

  /**
   * Builds the pattern-based holder for positive numbers.
   *
   * <p>When the sign is always shown, the positive affixes are derived from the negative ones by
   * replacing the minus sign with a plus sign; if neither negative affix contains a minus sign,
   * a plus sign is prepended to the positive prefix pattern.
   *
   * @param properties The property bag to read the affix patterns from.
   * @return The holder. Never null.
   */
  static AffixHolder fromPropertiesPositivePattern(DecimalFormatProperties properties) {
    String ppp = properties.getPositivePrefixPattern();
    String psp = properties.getPositiveSuffixPattern();
    if (properties.getSignAlwaysShown()) {
      // TODO: This logic is somewhat duplicated from MurkyModifier.
      boolean foundSign = false;
      String npp = properties.getNegativePrefixPattern();
      String nsp = properties.getNegativeSuffixPattern();
      if (AffixUtils.containsType(npp, AffixUtils.TYPE_MINUS_SIGN)) {
        foundSign = true;
        ppp = AffixUtils.replaceType(npp, AffixUtils.TYPE_MINUS_SIGN, '+');
      }
      if (AffixUtils.containsType(nsp, AffixUtils.TYPE_MINUS_SIGN)) {
        foundSign = true;
        psp = AffixUtils.replaceType(nsp, AffixUtils.TYPE_MINUS_SIGN, '+');
      }
      if (!foundSign) {
        // The positive prefix pattern may be unset. Guard against null so that the prefix
        // becomes "+" rather than the literal text "+null" (mirrors the negative pattern case).
        ppp = (ppp == null) ? "+" : "+" + ppp;
      }
    }
    return getInstance(ppp, psp, false, false);
  }

  /**
   * Builds the pattern-based holder for negative numbers. When no negative affix pattern is set
   * at all, the positive affix patterns are used with a minus sign prepended to the prefix.
   *
   * @param properties The property bag to read the affix patterns from.
   * @return The holder. Never null.
   */
  static AffixHolder fromPropertiesNegativePattern(DecimalFormatProperties properties) {
    String npp = properties.getNegativePrefixPattern();
    String nsp = properties.getNegativeSuffixPattern();
    if (npp == null && nsp == null) {
      npp = properties.getPositivePrefixPattern();
      nsp = properties.getPositiveSuffixPattern();
      if (npp == null) {
        npp = "-";
      } else {
        npp = "-" + npp;
      }
    }
    return getInstance(npp, nsp, false, true);
  }

  /**
   * Builds the string-based holder for positive numbers.
   *
   * @param properties The property bag to read the affix strings from.
   * @return The holder, or null if neither the positive prefix nor the positive suffix is set.
   */
  static AffixHolder fromPropertiesPositiveString(DecimalFormatProperties properties) {
    String pp = properties.getPositivePrefix();
    String ps = properties.getPositiveSuffix();
    if (pp == null && ps == null) return null;
    return getInstance(pp, ps, true, false);
  }

  /**
   * Builds the string-based holder for negative numbers.
   *
   * @param properties The property bag to read the affix strings from.
   * @return The holder, or null if neither the negative prefix nor the negative suffix is set.
   */
  static AffixHolder fromPropertiesNegativeString(DecimalFormatProperties properties) {
    String np = properties.getNegativePrefix();
    String ns = properties.getNegativeSuffix();
    if (np == null && ns == null) return null;
    return getInstance(np, ns, true, true);
  }

  /**
   * Returns a holder for the given affixes, treating null as the empty string. When both affixes
   * are empty, the shared {@link #EMPTY_POSITIVE} or {@link #EMPTY_NEGATIVE} instance is
   * returned regardless of the {@code strings} argument.
   *
   * @param p The prefix; may be null.
   * @param s The suffix; may be null.
   * @param strings Whether the affixes are literal strings (true) or affix patterns (false).
   * @param negative Whether the affixes mark a negative number.
   * @return The holder. Never null.
   */
  static AffixHolder getInstance(String p, String s, boolean strings, boolean negative) {
    if (p == null && s == null) return negative ? EMPTY_NEGATIVE : EMPTY_POSITIVE;
    if (p == null) p = "";
    if (s == null) s = "";
    if (p.length() == 0 && s.length() == 0) return negative ? EMPTY_NEGATIVE : EMPTY_POSITIVE;
    return new AffixHolder(p, s, strings, negative);
  }

  /**
   * @param pp The prefix; must not be null.
   * @param sp The suffix; must not be null.
   * @param strings Whether the affixes are literal strings (true) or affix patterns (false).
   * @param negative Whether the affixes mark a negative number.
   */
  AffixHolder(String pp, String sp, boolean strings, boolean negative) {
    this.p = pp;
    this.s = sp;
    this.strings = strings;
    this.negative = negative;
  }

  @Override
  public boolean equals(Object other) {
    if (other == null) return false;
    if (this == other) return true;
    if (!(other instanceof AffixHolder)) return false;
    AffixHolder _other = (AffixHolder) other;
    if (!p.equals(_other.p)) return false;
    if (!s.equals(_other.s)) return false;
    if (strings != _other.strings) return false;
    if (negative != _other.negative) return false;
    return true;
  }

  /** Hashes on prefix and suffix only; the two flags are compared in {@link #equals}. */
  @Override
  public int hashCode() {
    return p.hashCode() ^ s.hashCode();
  }

  /** @return "{prefix|suffix|S}" for string-based holders, "{prefix|suffix|P}" for patterns. */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("{");
    sb.append(p);
    sb.append("|");
    sb.append(s);
    sb.append("|");
    sb.append(strings ? 'S' : 'P');
    sb.append("}");
    return sb.toString();
  }
}
/**
 * Collects the affixes of every currency pattern that is valid for a locale, so that the parser
 * can accept a currency amount written in any of those formats. One instance is computed per
 * locale and cached.
 */
private static class CurrencyAffixPatterns {
  /** Cache of computed instances, keyed by locale. */
  private static final ConcurrentHashMap<ULocale, CurrencyAffixPatterns> currencyAffixPatterns =
      new ConcurrentHashMap<ULocale, CurrencyAffixPatterns>();

  /** Per-thread scratch property bag that each pattern string is parsed into. */
  private static final ThreadLocal<DecimalFormatProperties> threadLocalProperties =
      new ThreadLocal<DecimalFormatProperties>() {
        @Override
        protected DecimalFormatProperties initialValue() {
          return new DecimalFormatProperties();
        }
      };

  /** The positive and negative affix holders of every pattern added so far. */
  private final Set<AffixHolder> set = new HashSet<AffixHolder>();

  /**
   * Adds all currency affix holders for the locale to the parser state, computing and caching
   * them on first use.
   *
   * @param uloc The locale whose currency patterns should be accepted.
   * @param state The parser state whose affix holder set is added to.
   */
  static void addToState(ULocale uloc, ParserState state) {
    CurrencyAffixPatterns cached = currencyAffixPatterns.get(uloc);
    if (cached == null) {
      // Several threads may build the same entry at once; that is harmless, because only the
      // first one to register wins and everybody ends up using the registered instance.
      CurrencyAffixPatterns fresh = new CurrencyAffixPatterns(uloc);
      CurrencyAffixPatterns winner = currencyAffixPatterns.putIfAbsent(uloc, fresh);
      cached = (winner != null) ? winner : fresh;
    }
    state.affixHolders.addAll(cached.set);
  }

  /**
   * Gathers the affixes of the locale's basic currency pattern and of its currency plural
   * patterns.
   *
   * @param uloc The locale to load the patterns for.
   */
  private CurrencyAffixPatterns(ULocale uloc) {
    // The basic currency pattern.
    addPattern(NumberFormat.getPatternForStyle(uloc, NumberFormat.CURRENCYSTYLE));

    // The currency plural patterns, one per plural form.
    // TODO: Update this after CurrencyPluralInfo is replaced.
    CurrencyPluralInfo plurals = CurrencyPluralInfo.getInstance(uloc);
    for (StandardPlural form : StandardPlural.VALUES) {
      addPattern(plurals.getCurrencyPluralPattern(form.getKeyword()));
    }
  }

  /**
   * Parses a pattern string into the scratch property bag and records its positive and negative
   * affix holders. If the pattern cannot be parsed, the holders are still derived from whatever
   * the scratch property bag contains at that point.
   *
   * @param pattern The pattern string to add.
   */
  private void addPattern(String pattern) {
    DecimalFormatProperties scratch = threadLocalProperties.get();
    try {
      PatternStringParser.parseToExistingProperties(pattern, scratch);
    } catch (IllegalArgumentException e) {
      // This should only happen if there is a bug in CLDR data. Fail silently.
    }
    AffixHolder positive = AffixHolder.fromPropertiesPositivePattern(scratch);
    set.add(positive);
    AffixHolder negative = AffixHolder.fromPropertiesNegativePattern(scratch);
    set.add(negative);
  }
}
/**
 * Builds a {@link TextTrieMap} that maps each of the ten digit strings to its numeric value. A
 * trie is only needed when at least one digit string is more than a single code point long,
 * which requires the user to have configured custom multi-character digits such as "(0)".
 *
 * @param digitStrings The digit strings from DecimalFormatSymbols; entries 0 through 9 are read.
 * @return The trie, or null if every digit string is exactly one code point.
 */
static TextTrieMap<Byte> makeDigitTrie(String[] digitStrings) {
  // Scan until the first digit string that is not exactly one code point wide.
  boolean allSingleCodePoint = true;
  for (int d = 0; d < 10 && allSingleCodePoint; d++) {
    String digitString = digitStrings[d];
    int firstCodePointWidth = Character.charCount(Character.codePointAt(digitString, 0));
    allSingleCodePoint = (firstCodePointWidth == digitString.length());
  }
  if (allSingleCodePoint) {
    return null;
  }

  // TODO: Consider caching the tries so they don't need to be re-created run to run.
  // (Low-priority since multi-character digits are rare in practice)
  TextTrieMap<Byte> digitTrie = new TextTrieMap<Byte>(false);
  for (byte value = 0; value < 10; value++) {
    digitTrie.put(digitStrings[value], value);
  }
  return digitTrie;
}
// Per-thread, re-usable parser state. _parse() fetches the current thread's instance and calls
// clear() on it before every parse, so no ParserState is allocated per call.
protected static final ThreadLocal<ParserState> threadLocalParseState =
new ThreadLocal<ParserState>() {
@Override
protected ParserState initialValue() {
return new ParserState();
}
};
// Per-thread scratch ParsePosition, used by the parse overloads when the caller does not supply
// a position of its own. Callers reset the index before use.
protected static final ThreadLocal<ParsePosition> threadLocalParsePosition =
new ThreadLocal<ParsePosition>() {
@Override
protected ParsePosition initialValue() {
return new ParsePosition(0);
}
};
/**
* A frozen set containing U+002B (the ASCII plus sign) and six further code points.
*
* <p>NOTE(review): each value appears twice, presumably because this constructor takes inclusive
* (start, end) range pairs; confirm against the UnicodeSet documentation.
*
* @deprecated This API is ICU internal only. TODO: Remove this set from ScientificNumberFormat.
* @hide draft / provisional / internal are hidden on Android
*/
@Deprecated
public static final UnicodeSet UNISET_PLUS =
new UnicodeSet(
0x002B, 0x002B, 0x207A, 0x207A, 0x208A, 0x208A, 0x2795, 0x2795, 0xFB29, 0xFB29,
0xFE62, 0xFE62, 0xFF0B, 0xFF0B)
.freeze();
/**
* A frozen set containing U+002D (the ASCII hyphen-minus) and six further code points.
*
* <p>NOTE(review): each value appears twice, presumably because this constructor takes inclusive
* (start, end) range pairs; confirm against the UnicodeSet documentation.
*
* @deprecated This API is ICU internal only. TODO: Remove this set from ScientificNumberFormat.
* @hide draft / provisional / internal are hidden on Android
*/
@Deprecated
public static final UnicodeSet UNISET_MINUS =
new UnicodeSet(
0x002D, 0x002D, 0x207B, 0x207B, 0x208B, 0x208B, 0x2212, 0x2212, 0x2796, 0x2796,
0xFE63, 0xFE63, 0xFF0D, 0xFF0D)
.freeze();
/**
 * Parses a number from the start of the string, using the calling thread's scratch {@link
 * ParsePosition} reset to index 0. The position at which parsing stopped is not reported.
 *
 * @param input The string to parse.
 * @param properties The property bag that configures the parser.
 * @param symbols The locale-specific symbols to parse with.
 * @return The parsed number, or null if the position-based overload returns null.
 */
public static Number parse(String input, DecimalFormatProperties properties, DecimalFormatSymbols symbols) {
  final ParsePosition scratchPosition = threadLocalParsePosition.get();
  scratchPosition.setIndex(0);
  final Number parsed = parse(input, scratchPosition, properties, symbols);
  return parsed;
}
// TODO: DELETE ME once debugging is finished
// When true, StateItem.copyFrom records the path of source items it was copied from, and _parse
// prints the input, the properties, the parser state and each visited state item to System.out.
public static volatile boolean DEBUGGING = false;
/**
 * Implements an iterative parser that maintains a list of possible states at each code point in
 * the string. At each code point in the string, the list of possible states is updated based on
 * the states coming from the previous code point. The parser stops when it reaches the end of the
 * string or when there are no possible parse paths remaining in the string.
 *
 * <p>TODO: This API is not fully fleshed out. Right now this is internal-only.
 *
 * @param input The string to parse.
 * @param ppos A {@link ParsePosition} to hold the index at which parsing stopped. Parsing starts
 *     at the index stored in it.
 * @param properties A property bag, used only for determining the prefix/suffix strings and the
 *     padding character.
 * @param symbols A {@link DecimalFormatSymbols} object, used for determining locale-specific
 *     symbols for grouping/decimal separators, digit strings, and prefix/suffix substitutions.
 * @return A Number matching the parser's best interpretation of the string, or null if the
 *     parser did not settle on any parse path.
 * @throws IllegalArgumentException If any argument is null.
 */
public static Number parse(
    CharSequence input,
    ParsePosition ppos,
    DecimalFormatProperties properties,
    DecimalFormatSymbols symbols) {
  final StateItem winner = _parse(input, ppos, false, properties, symbols);
  if (winner == null) {
    return null;
  }
  return winner.toNumber(properties);
}
/**
 * Parses a currency amount from the start of the string. Delegates to the position-based
 * overload without a caller-supplied {@link ParsePosition}.
 *
 * @param input The string to parse.
 * @param properties The property bag that configures the parser.
 * @param symbols The locale-specific symbols to parse with.
 * @return The parsed amount, or null if the position-based overload returns null.
 * @throws ParseException Declared for callers; see the position-based overload.
 */
public static CurrencyAmount parseCurrency(
    String input, DecimalFormatProperties properties, DecimalFormatSymbols symbols) throws ParseException {
  // A null position makes the delegate fall back to the thread-local scratch position.
  final ParsePosition noPosition = null;
  return parseCurrency(input, noPosition, properties, symbols);
}
/**
 * Parses a number together with its currency.
 *
 * @param input The string to parse.
 * @param ppos Where to start parsing and where to report the stop index. May be null, in which
 *     case the calling thread's scratch position is used, reset to index 0 with no error index.
 * @param properties The property bag that configures the parser.
 * @param symbols The locale-specific symbols to parse with.
 * @return The parsed amount, or null if the parser did not settle on any parse path.
 * @throws ParseException Declared for callers; nothing in this method itself throws it.
 */
public static CurrencyAmount parseCurrency(
    CharSequence input, ParsePosition ppos, DecimalFormatProperties properties, DecimalFormatSymbols symbols)
    throws ParseException {
  ParsePosition position = ppos;
  if (position == null) {
    position = threadLocalParsePosition.get();
    position.setIndex(0);
    position.setErrorIndex(-1);
  }
  final StateItem winner = _parse(input, position, true, properties, symbols);
  if (winner == null) {
    return null;
  }
  return winner.toCurrencyAmount(properties);
}
private static StateItem _parse(
CharSequence input,
ParsePosition ppos,
boolean parseCurrency,
DecimalFormatProperties properties,
DecimalFormatSymbols symbols) {
if (input == null || ppos == null || properties == null || symbols == null) {
throw new IllegalArgumentException("All arguments are required for parse.");
}
ParseMode mode = properties.getParseMode();
if (mode == null) mode = ParseMode.LENIENT;
boolean integerOnly = properties.getParseIntegerOnly();
boolean ignoreExponent = properties.getParseNoExponent();
boolean ignoreGrouping = properties.getGroupingSize() <= 0;
// Set up the initial state
ParserState state = threadLocalParseState.get().clear();
state.properties = properties;
state.symbols = symbols;
state.mode = mode;
state.parseCurrency = parseCurrency;
state.groupingMode = properties.getParseGroupingMode();
if (state.groupingMode == null) state.groupingMode = GroupingMode.DEFAULT;
state.caseSensitive = properties.getParseCaseSensitive();
state.decimalCp1 = Character.codePointAt(symbols.getDecimalSeparatorString(), 0);
state.decimalCp2 = Character.codePointAt(symbols.getMonetaryDecimalSeparatorString(), 0);
state.groupingCp1 = Character.codePointAt(symbols.getGroupingSeparatorString(), 0);
state.groupingCp2 = Character.codePointAt(symbols.getMonetaryGroupingSeparatorString(), 0);
state.decimalType1 = SeparatorType.fromCp(state.decimalCp1, mode);
state.decimalType2 = SeparatorType.fromCp(state.decimalCp2, mode);
state.groupingType1 = SeparatorType.fromCp(state.groupingCp1, mode);
state.groupingType2 = SeparatorType.fromCp(state.groupingCp2, mode);
StateItem initialStateItem = state.getNext().clear();
initialStateItem.name = StateName.BEFORE_PREFIX;
if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
state.digitTrie = makeDigitTrie(symbols.getDigitStringsLocal());
AffixHolder.addToState(state, properties);
if (parseCurrency) {
CurrencyAffixPatterns.addToState(symbols.getULocale(), state);
}
}
if (DEBUGGING) {
System.out.println("Parsing: " + input);
System.out.println(properties);
System.out.println(state);
}
// Start walking through the string, one codepoint at a time. Backtracking is not allowed. This
// is to enforce linear runtime and prevent cases that could result in an infinite loop.
int offset = ppos.getIndex();
for (; offset < input.length(); ) {
int cp = Character.codePointAt(input, offset);
state.swap();
for (int i = 0; i < state.prevLength; i++) {
StateItem item = state.prevItems[i];
if (DEBUGGING) {
System.out.println(":" + offset + item.id + " " + item);
}
// In the switch statement below, if you see a line like:
// if (state.length > 0 && mode == ParseMode.FAST) break;
// it is used for accelerating the fast parse mode. The check is performed only in the
// states BEFORE_PREFIX, AFTER_INTEGER_DIGIT, and AFTER_FRACTION_DIGIT, which are the
// most common states.
switch (item.name) {
case BEFORE_PREFIX:
// Beginning of string
if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) {
acceptMinusOrPlusSign(cp, StateName.BEFORE_PREFIX, state, item, false);
if (state.length > 0 && mode == ParseMode.FAST) break;
}
acceptIntegerDigit(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptBidi(cp, StateName.BEFORE_PREFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptWhitespace(cp, StateName.BEFORE_PREFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptPadding(cp, StateName.BEFORE_PREFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptNan(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptInfinity(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
if (!integerOnly) {
acceptDecimalPoint(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
}
if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
acceptPrefix(cp, StateName.AFTER_PREFIX, state, item);
}
if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) {
if (!ignoreGrouping) {
acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
}
if (parseCurrency) {
acceptCurrency(cp, StateName.BEFORE_PREFIX, state, item);
}
}
break;
case AFTER_PREFIX:
// Prefix is consumed
acceptBidi(cp, StateName.AFTER_PREFIX, state, item);
acceptPadding(cp, StateName.AFTER_PREFIX, state, item);
acceptNan(cp, StateName.BEFORE_SUFFIX, state, item);
acceptInfinity(cp, StateName.BEFORE_SUFFIX, state, item);
acceptIntegerDigit(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
if (!integerOnly) {
acceptDecimalPoint(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
}
if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) {
acceptWhitespace(cp, StateName.AFTER_PREFIX, state, item);
if (!ignoreGrouping) {
acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
}
if (parseCurrency) {
acceptCurrency(cp, StateName.AFTER_PREFIX, state, item);
}
}
break;
case AFTER_INTEGER_DIGIT:
// Previous character was an integer digit (or grouping/whitespace)
acceptIntegerDigit(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
if (!integerOnly) {
acceptDecimalPoint(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
}
if (!ignoreGrouping) {
acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
}
acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
if (!ignoreExponent) {
acceptExponentSeparator(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
}
if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item);
}
if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) {
acceptWhitespace(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
// TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false);
if (state.length > 0 && mode == ParseMode.FAST) break;
if (parseCurrency) {
acceptCurrency(cp, StateName.BEFORE_SUFFIX, state, item);
}
}
break;
case AFTER_FRACTION_DIGIT:
// We encountered a decimal point
acceptFractionDigit(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
if (!ignoreExponent) {
acceptExponentSeparator(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
}
if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item);
}
if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) {
acceptWhitespace(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
// TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false);
if (state.length > 0 && mode == ParseMode.FAST) break;
if (parseCurrency) {
acceptCurrency(cp, StateName.BEFORE_SUFFIX, state, item);
}
}
break;
case AFTER_EXPONENT_SEPARATOR:
acceptBidi(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item);
acceptMinusOrPlusSign(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item, true);
acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
break;
case AFTER_EXPONENT_DIGIT:
acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item);
}
if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) {
acceptWhitespace(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
// TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false);
if (parseCurrency) {
acceptCurrency(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
}
}
break;
case BEFORE_SUFFIX:
// Accept whitespace, suffixes, and exponent separators
acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
if (!ignoreExponent) {
acceptExponentSeparator(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item);
}
if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item);
}
if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) {
acceptWhitespace(cp, StateName.BEFORE_SUFFIX, state, item);
// TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false);
if (parseCurrency) {
acceptCurrency(cp, StateName.BEFORE_SUFFIX, state, item);
}
}
break;
case BEFORE_SUFFIX_SEEN_EXPONENT:
// Accept whitespace and suffixes but not exponent separators
acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item);
}
if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) {
acceptWhitespace(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
// TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item, false);
if (parseCurrency) {
acceptCurrency(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
}
}
break;
case AFTER_SUFFIX:
if ((mode == ParseMode.LENIENT || mode == ParseMode.FAST) && parseCurrency) {
// Continue traversing in case there is a currency symbol to consume
acceptBidi(cp, StateName.AFTER_SUFFIX, state, item);
acceptPadding(cp, StateName.AFTER_SUFFIX, state, item);
acceptWhitespace(cp, StateName.AFTER_SUFFIX, state, item);
// TODO(sffc): acceptMinusOrPlusSign(cp, StateName.AFTER_SUFFIX, state, item, false);
if (parseCurrency) {
acceptCurrency(cp, StateName.AFTER_SUFFIX, state, item);
}
}
// Otherwise, do not accept any more characters.
break;
case INSIDE_CURRENCY:
acceptCurrencyOffset(cp, state, item);
break;
case INSIDE_DIGIT:
acceptDigitTrieOffset(cp, state, item);
break;
case INSIDE_STRING:
acceptStringOffset(cp, state, item);
break;
case INSIDE_AFFIX_PATTERN:
acceptAffixPatternOffset(cp, state, item);
break;
}
}
if (state.length == 0) {
// No parse paths continue past this point. We have found the longest parsable string
// from the input. Restore previous state without the offset and break.
state.swapBack();
break;
}
offset += Character.charCount(cp);
}
// Post-processing
if (state.length == 0) {
if (DEBUGGING) {
System.out.println("No matches found");
System.out.println("- - - - - - - - - -");
}
return null;
} else {
// Loop through the candidates. "continue" skips a candidate as invalid.
StateItem best = null;
outer:
for (int i = 0; i < state.length; i++) {
StateItem item = state.items[i];
if (DEBUGGING) {
System.out.println(":end " + item);
}
// Check that at least one digit was read.
if (!item.hasNumber()) {
if (DEBUGGING) System.out.println("-> rejected due to no number value");
continue;
}
if (mode == ParseMode.STRICT) {
// Perform extra checks for strict mode.
// We require that the affixes match.
boolean sawPrefix = item.sawPrefix || (item.affix != null && item.affix.p.isEmpty());
boolean sawSuffix = item.sawSuffix || (item.affix != null && item.affix.s.isEmpty());
boolean hasEmptyAffix =
state.affixHolders.contains(AffixHolder.EMPTY_POSITIVE)
|| state.affixHolders.contains(AffixHolder.EMPTY_NEGATIVE);
if (sawPrefix && sawSuffix) {
// OK
} else if (!sawPrefix && !sawSuffix && hasEmptyAffix) {
// OK
} else {
// Has a prefix or suffix that doesn't match
if (DEBUGGING) System.out.println("-> rejected due to mismatched prefix/suffix");
continue;
}
// Check for scientific notation.
if (properties.getMinimumExponentDigits() > 0 && !item.sawExponentDigit) {
if (DEBUGGING) System.out.println("-> reject due to lack of exponent");
continue;
}
// Check that grouping sizes are valid.
int grouping1 = properties.getGroupingSize();
int grouping2 = properties.getSecondaryGroupingSize();
grouping1 = grouping1 > 0 ? grouping1 : grouping2;
grouping2 = grouping2 > 0 ? grouping2 : grouping1;
long groupingWidths = item.groupingWidths;
int numGroupingRegions = 16 - Long.numberOfLeadingZeros(groupingWidths) / 4;
// If the last grouping is zero, accept strings like "1," but reject string like "1,.23"
// Strip off multiple last-groupings to handle cases like "123,," or "123 "
while (numGroupingRegions > 1 && (groupingWidths & 0xf) == 0) {
if (item.sawDecimalPoint) {
if (DEBUGGING) System.out.println("-> rejected due to decimal point after grouping");
continue outer;
} else {
groupingWidths >>>= 4;
numGroupingRegions--;
}
}
if (grouping1 <= 0) {
// OK (no grouping data available)
} else if (numGroupingRegions <= 1) {
// OK (no grouping digits)
} else if ((groupingWidths & 0xf) != grouping1) {
// First grouping size is invalid
if (DEBUGGING) System.out.println("-> rejected due to first grouping violation");
continue;
} else if (((groupingWidths >>> ((numGroupingRegions - 1) * 4)) & 0xf) > grouping2) {
// String like "1234,567" where the highest grouping is too large
if (DEBUGGING) System.out.println("-> rejected due to final grouping violation");
continue;
} else {
for (int j = 1; j < numGroupingRegions - 1; j++) {
if (((groupingWidths >>> (j * 4)) & 0xf) != grouping2) {
// A grouping size somewhere in the middle is invalid
if (DEBUGGING) System.out.println("-> rejected due to inner grouping violation");
continue outer;
}
}
}
}
// Optionally require that the presence of a decimal point matches the pattern.
if (properties.getDecimalPatternMatchRequired()
&& item.sawDecimalPoint
!= (properties.getDecimalSeparatorAlwaysShown()
|| properties.getMaximumFractionDigits() != 0)) {
if (DEBUGGING) System.out.println("-> rejected due to decimal point violation");
continue;
}
// When parsing currencies, require that a currency symbol was found.
if (parseCurrency && !item.sawCurrency) {
if (DEBUGGING) System.out.println("-> rejected due to lack of currency");
continue;
}
// If we get here, then this candidate is acceptable.
// Use the earliest candidate in the list, or the one with the highest score, or the
// one with the fewest trailing digits.
if (best == null) {
best = item;
} else if (item.score > best.score) {
best = item;
} else if (item.trailingCount < best.trailingCount) {
best = item;
}
}
if (DEBUGGING) {
System.out.println("- - - - - - - - - -");
}
if (best != null) {
ppos.setIndex(offset - best.trailingCount);
return best;
} else {
ppos.setErrorIndex(offset);
return null;
}
}
}
/**
 * Continues the parse path through a whitespace code point.
 *
 * <p>When <code>cp</code> is contained in {@link #UNISET_WHITESPACE}, the next free item of
 * <code>state</code> is populated from <code>item</code> with its state name set to
 * <code>nextName</code>; <code>cp</code> is forwarded to <code>copyFrom</code>. Otherwise nothing
 * is added.
 *
 * @param cp The code point to check.
 * @param nextName The new state name if the check passes.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptWhitespace(
    int cp, StateName nextName, ParserState state, StateItem item) {
  if (!UNISET_WHITESPACE.contains(cp)) {
    return;
  }
  state.getNext().copyFrom(item, nextName, cp);
}
/**
 * Continues the parse path through a bidi control code point.
 *
 * <p>When <code>cp</code> is contained in {@link #UNISET_BIDI}, the next free item of
 * <code>state</code> is populated from <code>item</code> with its state name set to
 * <code>nextName</code>; <code>cp</code> is forwarded to <code>copyFrom</code>. Otherwise nothing
 * is added.
 *
 * @param cp The code point to check.
 * @param nextName The new state name if the check passes.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptBidi(int cp, StateName nextName, ParserState state, StateItem item) {
  if (!UNISET_BIDI.contains(cp)) {
    return;
  }
  state.getNext().copyFrom(item, nextName, cp);
}
/**
 * Continues the parse path through a padding code point.
 *
 * <p>The reference padding character is the first code point of the pad string returned by
 * <code>state.properties.getPadString()</code>; any further code points of the pad string are
 * not consulted. If the pad string is null or empty, or <code>cp</code> differs from the
 * reference code point, nothing is added. Otherwise <code>item</code> is copied to the new list
 * in <code>state</code> with its state name set to <code>nextName</code>.
 *
 * @param cp The code point to check.
 * @param nextName The new state name if the check passes.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptPadding(int cp, StateName nextName, ParserState state, StateItem item) {
  final CharSequence padString = state.properties.getPadString();
  final boolean hasPadding = padString != null && padString.length() > 0;
  if (hasPadding && Character.codePointAt(padString, 0) == cp) {
    state.getNext().copyFrom(item, nextName, cp);
  }
}
/**
 * Attempts to accept <code>cp</code> as a digit of the integer part by delegating to
 * {@link #acceptDigitHelper} with {@link DigitType#INTEGER}.
 *
 * @param cp The code point to check.
 * @param nextName The state to set if a digit is accepted.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptIntegerDigit(
    int cp, StateName nextName, ParserState state, StateItem item) {
  final DigitType digitType = DigitType.INTEGER;
  acceptDigitHelper(cp, nextName, state, item, digitType);
}
/**
 * Attempts to accept <code>cp</code> as a digit of the fraction part by delegating to
 * {@link #acceptDigitHelper} with {@link DigitType#FRACTION}.
 *
 * @param cp The code point to check.
 * @param nextName The state to set if a digit is accepted.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptFractionDigit(
    int cp, StateName nextName, ParserState state, StateItem item) {
  final DigitType digitType = DigitType.FRACTION;
  acceptDigitHelper(cp, nextName, state, item, digitType);
}
/**
 * Attempts to accept <code>cp</code> as a digit of the exponent by delegating to
 * {@link #acceptDigitHelper} with {@link DigitType#EXPONENT}.
 *
 * @param cp The code point to check.
 * @param nextName The state to set if a digit is accepted.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptExponentDigit(
    int cp, StateName nextName, ParserState state, StateItem item) {
  final DigitType digitType = DigitType.EXPONENT;
  acceptDigitHelper(cp, nextName, state, item, digitType);
}
/**
 * Attempts to interpret <code>cp</code> as a decimal digit and, on success, records it.
 *
 * <p>Resolution order:
 *
 * <ol>
 *   <li>The Unicode digit property, via {@link UCharacter#digit(int, int)} with radix 10.
 *   <li>Only in LENIENT and STRICT modes, and only if step 1 failed: the locale digit strings
 *       from <code>state.symbols</code>. When <code>state.digitTrie</code> is null, the first
 *       code point of each of the ten digit strings is compared against <code>cp</code>;
 *       otherwise the lookup is handed to <code>acceptDigitTrie</code>.
 * </ol>
 *
 * <p>Each match copies <code>item</code> to the new list in <code>state</code> with state name
 * <code>nextName</code>. The digit is then stored through {@link #recordDigit}, which does
 * nothing when no item was produced here (in particular on the trie path, where this method
 * itself adds no item).
 *
 * @param cp The code point to check.
 * @param nextName The state to set if a digit is accepted.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 * @param type The digit type, which determines the field into which the digit is inserted.
 */
private static void acceptDigitHelper(
    int cp, StateName nextName, ParserState state, StateItem item, DigitType type) {
  // A negative value means the code point has no radix-10 digit value.
  byte digit = (byte) UCharacter.digit(cp, 10);
  StateItem next = null;

  if (digit >= 0) {
    // Code point carries the Unicode digit property.
    next = state.getNext().copyFrom(item, nextName, -1);
  } else if (state.mode == ParseMode.LENIENT || state.mode == ParseMode.STRICT) {
    // The locale-specific lookups below are skipped in fast mode because they are expensive.
    if (state.digitTrie != null) {
      // Custom digits have more than one code point
      acceptDigitTrie(cp, nextName, state, item, type);
    } else {
      // Custom digits are all at most one code point, so compare first code points directly.
      String[] localDigits = state.symbols.getDigitStringsLocal();
      for (byte value = 0; value < 10; value++) {
        if (Character.codePointAt(localDigits[value], 0) == cp) {
          digit = value;
          next = state.getNext().copyFrom(item, nextName, -1);
        }
      }
    }
  }

  // Save state
  recordDigit(next, digit, type);
}
/**
 * Helper function for {@link #acceptDigitHelper} and <code>acceptDigitTrie</code> that saves a
 * complete digit in a state item and updates grouping widths.
 *
 * <p>The low four bits of <code>groupingWidths</code> count the integer digits of the current
 * grouping region; the counter is incremented only while it is below 15 so that it never
 * carries into the neighboring four-bit field.
 *
 * @param next The new StateItem; if null, the call is a no-op.
 * @param digit The digit to record
 * @param type The type of the digit to record (INTEGER, FRACTION, or EXPONENT)
 */
private static void recordDigit(StateItem next, byte digit, DigitType type) {
  if (next != null) {
    next.appendDigit(digit, type);
    if (type == DigitType.INTEGER) {
      boolean widthSaturated = (next.groupingWidths & 0xf) >= 15;
      if (!widthSaturated) {
        next.groupingWidths += 1;
      }
    }
  }
}
/**
 * Tries <code>cp</code> first as a minus sign and then as a plus sign (as determined by the
 * unicode sets {@link #UNISET_MINUS} and {@link #UNISET_PLUS}). Each successful check copies
 * <code>item</code> to the new list in <code>state</code> with state name <code>nextName</code>.
 *
 * @param cp The code point to check.
 * @param nextName The state name to assign to any item that is added.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 * @param exponent Whether the sign belongs to the exponent rather than to the number itself;
 *     forwarded unchanged to {@link #acceptMinusSign} and {@link #acceptPlusSign}.
 */
private static void acceptMinusOrPlusSign(
    int cp, StateName nextName, ParserState state, StateItem item, boolean exponent) {
  // No secondary return state: control goes straight to nextName. The returned bitmasks are
  // not needed by callers of this method and are discarded.
  final StateName noSecondaryReturn = null;
  acceptMinusSign(cp, nextName, noSecondaryReturn, state, item, exponent);
  acceptPlusSign(cp, nextName, noSecondaryReturn, state, item, exponent);
}
/**
 * Accepts <code>cp</code> if it is a minus sign according to {@link #UNISET_MINUS}.
 *
 * <p>On a match, <code>item</code> is copied to the new list in <code>state</code> with state
 * name <code>returnTo1</code>, the copy's <code>returnTo1</code> field is set to
 * <code>returnTo2</code>, and the copy is flagged as negative: <code>sawNegativeExponent</code>
 * when <code>exponent</code> is true, <code>sawNegative</code> otherwise.
 *
 * @param cp The code point to check.
 * @param returnTo1 The state name for the added item.
 * @param returnTo2 The value to store in the added item's <code>returnTo1</code> field.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 * @param exponent true if the sign applies to the exponent.
 * @return A bitmask with the single bit of the added item's index set, or 0L if nothing was
 *     added.
 */
private static long acceptMinusSign(
    int cp,
    StateName returnTo1,
    StateName returnTo2,
    ParserState state,
    StateItem item,
    boolean exponent) {
  if (!UNISET_MINUS.contains(cp)) {
    return 0L;
  }
  StateItem added = state.getNext().copyFrom(item, returnTo1, -1);
  added.returnTo1 = returnTo2;
  if (exponent) {
    added.sawNegativeExponent = true;
  } else {
    added.sawNegative = true;
  }
  int index = state.lastInsertedIndex();
  return 1L << index;
}
/**
 * Accepts <code>cp</code> if it is a plus sign according to {@link #UNISET_PLUS}.
 *
 * <p>On a match, <code>item</code> is copied to the new list in <code>state</code> with state
 * name <code>returnTo1</code>, and the copy's <code>returnTo1</code> field is set to
 * <code>returnTo2</code>. No sign flag is recorded for a plus sign.
 *
 * @param cp The code point to check.
 * @param returnTo1 The state name for the added item.
 * @param returnTo2 The value to store in the added item's <code>returnTo1</code> field.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 * @param exponent Not read by this method; present so the signature parallels {@link
 *     #acceptMinusSign}.
 * @return A bitmask with the single bit of the added item's index set, or 0L if nothing was
 *     added.
 */
private static long acceptPlusSign(
    int cp,
    StateName returnTo1,
    StateName returnTo2,
    ParserState state,
    StateItem item,
    boolean exponent) {
  if (!UNISET_PLUS.contains(cp)) {
    return 0L;
  }
  StateItem added = state.getNext().copyFrom(item, returnTo1, -1);
  added.returnTo1 = returnTo2;
  int index = state.lastInsertedIndex();
  return 1L << index;
}
/**
 * Accepts <code>cp</code> as a grouping separator and opens a new grouping region.
 *
 * <p>If <code>item</code> has not yet seen a grouping separator (<code>item.groupingCp ==
 * -1</code>), <code>cp</code> is accepted when either:
 *
 * <ul>
 *   <li>it is exactly one of the locale grouping separators (<code>state.groupingCp1</code> or
 *       <code>state.groupingCp2</code>); or
 *   <li>its {@link SeparatorType} is not UNKNOWN and, depending on <code>state.groupingMode</code>:
 *       in RESTRICTED mode its type equals the type of at least one of the two locale grouping
 *       separators; in any other mode it is not COMMA_LIKE or PERIOD_LIKE while one of the
 *       locale decimal separators has that same type.
 * </ul>
 *
 * <p>If a grouping separator has already been seen, only that exact code point is accepted, so
 * separators cannot be mixed within one number.
 *
 * <p>On acceptance, <code>item</code> is copied to the new list in <code>state</code> with state
 * name <code>nextName</code>, and <code>groupingWidths</code> is shifted left by four bits to
 * start a fresh digit counter for the next grouping region.
 *
 * @param cp The code point to check.
 * @param nextName The state name for the added item.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptGrouping(
    int cp, StateName nextName, ParserState state, StateItem item) {
  // Do not accept mixed grouping separators in the same string.
  if (item.groupingCp == -1) {
    // First time seeing a grouping separator.
    SeparatorType cpType = SeparatorType.fromCp(cp, state.mode);
    // Always accept if exactly the same as the locale grouping separator.
    if (cp != state.groupingCp1 && cp != state.groupingCp2) {
      // Reject if not in one of the three primary equivalence classes.
      if (cpType == SeparatorType.UNKNOWN) {
        return;
      }
      if (state.groupingMode == GroupingMode.RESTRICTED) {
        // Reject if not in the same class as either locale grouping separator (regular or
        // monetary). Both comparisons must fail before rejecting; using "||" here would
        // reject every non-exact separator whenever the two locale classes differ.
        if (cpType != state.groupingType1 && cpType != state.groupingType2) {
          return;
        }
      } else {
        // Reject if in the same class as the decimal separator.
        if (cpType == SeparatorType.COMMA_LIKE
            && (state.decimalType1 == SeparatorType.COMMA_LIKE
                || state.decimalType2 == SeparatorType.COMMA_LIKE)) {
          return;
        }
        if (cpType == SeparatorType.PERIOD_LIKE
            && (state.decimalType1 == SeparatorType.PERIOD_LIKE
                || state.decimalType2 == SeparatorType.PERIOD_LIKE)) {
          return;
        }
      }
    }
    // A match was found. Remember the separator so later ones must be identical.
    StateItem next = state.getNext().copyFrom(item, nextName, cp);
    next.groupingCp = cp;
    next.groupingWidths <<= 4;
  } else {
    // Have already seen a grouping separator; only the identical code point is allowed.
    if (cp == item.groupingCp) {
      StateItem next = state.getNext().copyFrom(item, nextName, cp);
      next.groupingWidths <<= 4;
    }
  }
}
/**
 * Accepts <code>cp</code> as a decimal separator.
 *
 * <p>The code point is rejected when any of the following holds:
 *
 * <ul>
 *   <li>it equals the grouping separator already recorded on <code>item</code>;
 *   <li>its {@link SeparatorType} matches neither <code>state.decimalType1</code> nor
 *       <code>state.decimalType2</code>;
 *   <li>its type is OTHER_GROUPING or UNKNOWN and it is not exactly <code>state.decimalCp1</code>
 *       or <code>state.decimalCp2</code>.
 * </ul>
 *
 * <p>Otherwise <code>item</code> is copied to the new list in <code>state</code> with state name
 * <code>nextName</code> and its <code>sawDecimalPoint</code> flag is set.
 *
 * @param cp The code point to check.
 * @param nextName The state name for the added item.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptDecimalPoint(
    int cp, StateName nextName, ParserState state, StateItem item) {
  // Don't accept a decimal point that is the same as the grouping separator
  if (cp == item.groupingCp) {
    return;
  }

  SeparatorType cpType = SeparatorType.fromCp(cp, state.mode);

  // We require that the decimal separator be in the same class as the locale.
  boolean sameClassAsLocale = cpType == state.decimalType1 || cpType == state.decimalType2;
  if (!sameClassAsLocale) {
    return;
  }

  // If in UNKNOWN or OTHER, require an exact match.
  boolean needsExactMatch =
      cpType == SeparatorType.OTHER_GROUPING || cpType == SeparatorType.UNKNOWN;
  boolean isExactMatch = cp == state.decimalCp1 || cp == state.decimalCp2;
  if (needsExactMatch && !isExactMatch) {
    return;
  }

  // A match was found.
  state.getNext().copyFrom(item, nextName, -1).sawDecimalPoint = true;
}
/**
 * Attempts to match <code>cp</code> against the locale NaN string from
 * <code>state.symbols.getNaN()</code> via {@link #acceptString}, and sets the
 * <code>sawNaN</code> flag on every item that call added.
 *
 * @param cp The code point to check.
 * @param nextName The state to return to once the NaN string has been fully consumed.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptNan(int cp, StateName nextName, ParserState state, StateItem item) {
  CharSequence nan = state.symbols.getNaN();
  long added = acceptString(cp, nextName, null, state, item, nan, 0, false);
  // Set state in the items that were added by the function call. Visit each set bit by
  // clearing the lowest one per pass; a bound of the form "(1L << i) <= added" is a signed
  // comparison with a wrapping shift and misbehaves once bit 62 or 63 of the mask is set.
  for (long remaining = added; remaining != 0L; remaining &= remaining - 1) {
    state.getItem(Long.numberOfTrailingZeros(remaining)).sawNaN = true;
  }
}
/**
 * Attempts to match <code>cp</code> against the locale infinity string from
 * <code>state.symbols.getInfinity()</code> via {@link #acceptString}, and sets the
 * <code>sawInfinity</code> flag on every item that call added.
 *
 * @param cp The code point to check.
 * @param nextName The state to return to once the infinity string has been fully consumed.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptInfinity(
    int cp, StateName nextName, ParserState state, StateItem item) {
  CharSequence inf = state.symbols.getInfinity();
  long added = acceptString(cp, nextName, null, state, item, inf, 0, false);
  // Set state in the items that were added by the function call. Visit each set bit by
  // clearing the lowest one per pass; a bound of the form "(1L << i) <= added" is a signed
  // comparison with a wrapping shift and misbehaves once bit 62 or 63 of the mask is set.
  for (long remaining = added; remaining != 0L; remaining &= remaining - 1) {
    state.getItem(Long.numberOfTrailingZeros(remaining)).sawInfinity = true;
  }
}
/**
 * Attempts to match <code>cp</code> against the locale exponent separator string from
 * <code>state.symbols.getExponentSeparator()</code> via {@link #acceptString}. The string is
 * passed with <code>trailing</code> set to true, and the bitmask returned by
 * {@link #acceptString} is discarded.
 *
 * @param cp The code point to check.
 * @param nextName The state to return to once the separator has been fully consumed.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptExponentSeparator(
    int cp, StateName nextName, ParserState state, StateItem item) {
  final CharSequence separator = state.symbols.getExponentSeparator();
  final boolean trailing = true;
  acceptString(cp, nextName, null, state, item, separator, 0, trailing);
}
/**
 * Attempts to start consuming a prefix: <code>cp</code> is offered to the prefix of every affix
 * holder registered in <code>state.affixHolders</code>.
 *
 * @param cp The code point to check.
 * @param nextName The state to return to once a prefix has been fully consumed.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptPrefix(int cp, StateName nextName, ParserState state, StateItem item) {
  final boolean isPrefix = true;
  for (AffixHolder candidate : state.affixHolders) {
    acceptAffixHolder(cp, nextName, state, item, candidate, isPrefix);
  }
}
/**
 * Attempts to start consuming a suffix.
 *
 * <p>If <code>item</code> is already bound to an affix holder (<code>item.affix</code> is
 * non-null), only that holder's suffix is tried. Otherwise <code>cp</code> is offered to the
 * suffix of every affix holder registered in <code>state.affixHolders</code>.
 *
 * @param cp The code point to check.
 * @param nextName The state to return to once a suffix has been fully consumed.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptSuffix(int cp, StateName nextName, ParserState state, StateItem item) {
  final boolean isPrefix = false;
  if (item.affix == null) {
    // No holder chosen yet on this path: every registered holder is a candidate.
    for (AffixHolder candidate : state.affixHolders) {
      acceptAffixHolder(cp, nextName, state, item, candidate, isPrefix);
    }
    return;
  }
  acceptAffixHolder(cp, nextName, state, item, item.affix, isPrefix);
}
/**
 * Offers <code>cp</code> to the prefix or suffix of one affix holder and annotates every parse
 * path that results.
 *
 * <p>The affix text is matched with {@link #acceptString} when <code>holder.strings</code> is
 * true and with {@link #acceptAffixPattern} otherwise. Each item added by that call is bound to
 * <code>holder</code>, flagged with <code>sawPrefix</code> or <code>sawSuffix</code>, marked
 * negative if the holder is negative, and has its score adjusted.
 *
 * @param cp The code point to check.
 * @param nextName The state to return to once the affix has been fully consumed.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 * @param holder The affix holder to try; if null, the call is a no-op.
 * @param prefix true to match the holder's prefix (<code>holder.p</code>), false to match its
 *     suffix (<code>holder.s</code>).
 */
private static void acceptAffixHolder(
    int cp,
    StateName nextName,
    ParserState state,
    StateItem item,
    AffixHolder holder,
    boolean prefix) {
  if (holder == null) return;
  String str = prefix ? holder.p : holder.s;
  long added;
  if (holder.strings) {
    added = acceptString(cp, nextName, null, state, item, str, 0, false);
  } else {
    added =
        acceptAffixPattern(cp, nextName, state, item, str, AffixUtils.nextToken(0, str));
  }
  // Record state in the added entries. Visit each set bit by clearing the lowest one per
  // pass; a bound of the form "(1L << i) <= added" is a signed comparison with a wrapping
  // shift and misbehaves once bit 62 or 63 of the mask is set.
  for (long remaining = added; remaining != 0L; remaining &= remaining - 1) {
    StateItem next = state.getItem(Long.numberOfTrailingZeros(remaining));
    next.affix = holder;
    if (prefix) next.sawPrefix = true;
    if (!prefix) next.sawSuffix = true;
    if (holder.negative) next.sawNegative = true;
    // 10 point reward for consuming a prefix/suffix:
    next.score += 10;
    // 1 point reward for positive holders (if there is ambiguity, we want to favor positive):
    if (!holder.negative) next.score += 1;
    // 5 point reward for affix holders that have an empty prefix or suffix (we won't see them
    // again):
    if (!next.sawPrefix && holder.p.isEmpty()) next.score += 5;
    if (!next.sawSuffix && holder.s.isEmpty()) next.score += 5;
  }
}
/**
 * Resumes matching a string that is already in progress on <code>item</code>, using the
 * string, offset, return states, and trailing flag saved on the item.
 *
 * @param cp The current code point.
 * @param state The state object to update.
 * @param item The item whose saved string state is continued.
 * @return The bitmask returned by {@link #acceptString}.
 */
private static long acceptStringOffset(int cp, ParserState state, StateItem item) {
  final CharSequence pending = item.currentString;
  final int resumeAt = item.currentOffset;
  final boolean trailing = item.currentTrailing;
  return acceptString(cp, item.returnTo1, item.returnTo2, state, item, pending, resumeAt, trailing);
}
/**
 * Accepts a code point if the code point is compatible with the string at the given offset.
 * Handles runs of ignorable characters.
 *
 * <p>This method will add either one or two {@link StateItem} to the {@link ParserState}. A
 * null or empty string never matches and adds nothing.
 *
 * @param cp The current code point, which will be checked for a match to the string.
 * @param ret1 The state to return to after reaching the end of the string.
 * @param ret2 The state to save in <code>returnTo1</code> after reaching the end of the string.
 *     Set to null if returning to the main state loop.
 * @param state The current {@link ParserState}
 * @param item The current {@link StateItem}
 * @param str The string against which to check for a match.
 * @param offset The number of chars into the string. Initial value should be 0.
 * @param trailing true if this string should be ignored for the purposes of recording trailing
 *     code points; false if this string is strong and should reset the trailing count to zero
 *     when it is fully consumed.
 * @return A bitmask where the bits correspond to the items that were added. Set to 0L if no items
 *     were added.
 */
private static long acceptString(
    int cp,
    StateName ret1,
    StateName ret2,
    ParserState state,
    StateItem item,
    CharSequence str,
    int offset,
    boolean trailing) {
  final boolean nothingToMatch = str == null || str.length() == 0;
  if (nothingToMatch) {
    return 0L;
  }
  // The final argument selects plain-string matching rather than affix-pattern matching.
  final boolean isString = true;
  return acceptStringOrAffixPatternWithIgnorables(
      cp, ret1, ret2, state, item, str, offset, trailing, isString);
}
/**
 * Matches <code>cp</code> against one reference code point of a string and, on success, adds
 * the continuation path(s).
 *
 * <p>Two offsets are supplied, both narrowed to <code>int</code>. If the first offset is still
 * inside the string, a path continuing from it is added; if the second offset is at or past the
 * end of the string, a path that exits the string is added. Both may happen for the same code
 * point.
 *
 * @param cp See {@link #acceptString}
 * @param ret1 See {@link #acceptString}
 * @param ret2 See {@link #acceptString}
 * @param state See {@link #acceptString}
 * @param item See {@link #acceptString}
 * @param str See {@link #acceptString}
 * @param trailing See {@link #acceptString}
 * @param referenceCp The code point from the string that <code>cp</code> must equal, as decided
 *     by <code>codePointEquals</code>.
 * @param firstOffsetOrTag The first candidate offset into <code>str</code>.
 * @param nextOffsetOrTag The second candidate offset into <code>str</code>.
 * @return A bitmask where the bits correspond to the items that were added, or 0L if the code
 *     point did not match.
 */
private static long acceptStringNonIgnorable(
    int cp,
    StateName ret1,
    StateName ret2,
    ParserState state,
    StateItem item,
    CharSequence str,
    boolean trailing,
    int referenceCp,
    long firstOffsetOrTag,
    long nextOffsetOrTag) {
  if (!codePointEquals(referenceCp, cp, state)) {
    return 0L;
  }

  final int stayOffset = (int) firstOffsetOrTag;
  final int exitOffset = (int) nextOffsetOrTag;
  final int strLength = str.length();

  long result = 0L;
  if (stayOffset < strLength) {
    result |= acceptStringHelper(cp, ret1, ret2, state, item, str, stayOffset, trailing);
  }
  if (exitOffset >= strLength) {
    result |= acceptStringHelper(cp, ret1, ret2, state, item, str, exitOffset, trailing);
  }
  return result;
}
/**
 * Internal method that advances a string match by one step: it either parks the new item inside
 * the string at <code>newOffset</code>, or, when <code>newOffset</code> is at or past the end of
 * the string, leaves the string and hands control to <code>returnTo1</code>.
 *
 * <p>In both cases the new item's score is increased by one.
 *
 * @param cp See {@link #acceptString}
 * @param returnTo1 See {@link #acceptString}
 * @param returnTo2 See {@link #acceptString}
 * @param state See {@link #acceptString}
 * @param item See {@link #acceptString}
 * @param str See {@link #acceptString}
 * @param newOffset The offset at which the next step should start. If past the end of the string,
 *     exit the string and return to the outer loop.
 * @param trailing See {@link #acceptString}
 * @return Bitmask containing one entry, the one that was added.
 */
private static long acceptStringHelper(
    int cp,
    StateName returnTo1,
    StateName returnTo2,
    ParserState state,
    StateItem item,
    CharSequence str,
    int newOffset,
    boolean trailing) {
  StateItem added = state.getNext().copyFrom(item, null, cp);
  added.score += 1; // reward for consuming a cp from string

  final boolean stringExhausted = newOffset >= str.length();
  if (stringExhausted) {
    // We've reached the end of the string: pop the return states.
    added.name = returnTo1;
    if (!trailing) {
      added.trailingCount = 0;
    }
    added.returnTo1 = returnTo2;
    added.returnTo2 = null;
  } else {
    // String has more code points: remember where to resume.
    added.name = StateName.INSIDE_STRING;
    added.returnTo1 = returnTo1;
    added.returnTo2 = returnTo2;
    added.currentString = str;
    added.currentOffset = newOffset;
    added.currentTrailing = trailing;
  }

  return 1L << state.lastInsertedIndex();
}
/**
 * Resumes matching an affix pattern that is already in progress on <code>item</code>, using the
 * pattern, stepwise-parser tag, and return state saved on the item.
 *
 * @param cp The current code point.
 * @param state The state object to update.
 * @param item The item whose saved affix-pattern state is continued.
 * @return The bitmask returned by {@link #acceptAffixPattern}.
 */
private static long acceptAffixPatternOffset(int cp, ParserState state, StateItem item) {
  final CharSequence pattern = item.currentAffixPattern;
  final long resumeTag = item.currentStepwiseParserTag;
  return acceptAffixPattern(cp, item.returnTo1, state, item, pattern, resumeTag);
}
/**
 * Accepts a code point if the code point is compatible with the affix pattern at the offset
 * encoded in the tag argument. A null or empty pattern never matches and adds nothing.
 *
 * @param cp The current code point, which will be checked for a match to the pattern.
 * @param ret1 The state to return to after reaching the end of the pattern.
 * @param state The current {@link ParserState}
 * @param item The current {@link StateItem}
 * @param str The string containing the affix pattern.
 * @param tag The current state of the stepwise parser. Initial value should be 0L.
 * @return A bitmask where the bits correspond to the items that were added. Set to 0L if no items
 *     were added.
 */
private static long acceptAffixPattern(
    int cp, StateName ret1, ParserState state, StateItem item, CharSequence str, long tag) {
  final boolean nothingToMatch = str == null || str.length() == 0;
  if (nothingToMatch) {
    return 0L;
  }
  // No secondary return state, not trailing, and affix-pattern (not plain-string) matching.
  final StateName ret2 = null;
  final boolean trailing = false;
  final boolean isString = false;
  return acceptStringOrAffixPatternWithIgnorables(
      cp, ret1, ret2, state, item, str, tag, trailing, isString);
}
/**
 * Matches <code>cp</code> against a single token of an affix pattern and adds the continuation
 * path(s).
 *
 * <p>The token is first resolved into what must be matched: a literal code point, a sign, a
 * locale string (percent, per-mille, or a custom sign string), or a currency. For each kind that
 * matches, up to two paths are added: one that stays inside the affix pattern when
 * <code>firstTag</code> is non-negative, and one that leaves the pattern for
 * <code>returnTo</code> when <code>nextTag</code> is negative. Every added item then has its
 * <code>currentAffixPattern</code> set to <code>str</code> and its
 * <code>currentStepwiseParserTag</code> set to <code>firstTag</code>.
 *
 * @param cp The current code point.
 * @param returnTo The state to return to after reaching the end of the affix pattern.
 * @param state The current {@link ParserState}
 * @param item The current {@link StateItem}
 * @param str The string containing the affix pattern.
 * @param typeOrCp The token to match: a negative value is one of the <code>AffixUtils.TYPE_*</code>
 *     symbol constants, a non-negative value is a literal code point.
 * @param firstTag Stepwise-parser tag to record on added items; a non-negative value enables
 *     the path that remains inside the pattern.
 * @param nextTag Stepwise-parser tag for the following step; a negative value enables the path
 *     that exits the pattern.
 * @return A bitmask where the bits correspond to the items that were added. Set to 0L if no items
 *     were added.
 * @throws AssertionError if <code>typeOrCp</code> is negative but not a recognized symbol type.
 */
private static long acceptAffixPatternNonIgnorable(
    int cp,
    StateName returnTo,
    ParserState state,
    StateItem item,
    CharSequence str,
    int typeOrCp,
    long firstTag,
    long nextTag) {
  // Convert from the returned tag to a code point, string, or currency to check
  int resolvedCp = -1;
  CharSequence resolvedStr = null;
  boolean resolvedMinusSign = false;
  boolean resolvedPlusSign = false;
  boolean resolvedCurrency = false;
  if (typeOrCp < 0) {
    // Symbol
    switch (typeOrCp) {
      case AffixUtils.TYPE_MINUS_SIGN:
        resolvedMinusSign = true;
        break;
      case AffixUtils.TYPE_PLUS_SIGN:
        resolvedPlusSign = true;
        break;
      case AffixUtils.TYPE_PERCENT:
        resolvedStr = state.symbols.getPercentString();
        if (resolvedStr.length() != 1 || resolvedStr.charAt(0) != '%') {
          resolvedCp = '%'; // accept ASCII percent as well as locale percent
        }
        break;
      case AffixUtils.TYPE_PERMILLE:
        resolvedStr = state.symbols.getPerMillString();
        if (resolvedStr.length() != 1 || resolvedStr.charAt(0) != '‰') {
          resolvedCp = '‰'; // accept the standard per-mille sign as well as locale permille
        }
        break;
      case AffixUtils.TYPE_CURRENCY_SINGLE:
      case AffixUtils.TYPE_CURRENCY_DOUBLE:
      case AffixUtils.TYPE_CURRENCY_TRIPLE:
      case AffixUtils.TYPE_CURRENCY_QUAD:
      case AffixUtils.TYPE_CURRENCY_QUINT:
      case AffixUtils.TYPE_CURRENCY_OVERFLOW:
        resolvedCurrency = true;
        break;
      default:
        throw new AssertionError();
    }
  } else {
    resolvedCp = typeOrCp;
  }
  long added = 0L;
  if (resolvedCp >= 0 && codePointEquals(cp, resolvedCp, state)) {
    if (firstTag >= 0) {
      added |= acceptAffixPatternHelper(cp, returnTo, state, item, str, firstTag);
    }
    if (nextTag < 0) {
      added |= acceptAffixPatternHelper(cp, returnTo, state, item, str, nextTag);
    }
  }
  if (resolvedMinusSign) {
    if (firstTag >= 0) {
      added |= acceptMinusSign(cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item, false);
    }
    if (nextTag < 0) {
      added |= acceptMinusSign(cp, returnTo, null, state, item, false);
    }
    if (added == 0L) {
      // Also attempt to accept custom minus sign string. Skip it when it is a single code
      // point already covered by UNISET_MINUS, since that case was just tried above.
      String mss = state.symbols.getMinusSignString();
      int mssCp = Character.codePointAt(mss, 0);
      if (mss.length() != Character.charCount(mssCp) || !UNISET_MINUS.contains(mssCp)) {
        resolvedStr = mss;
      }
    }
  }
  if (resolvedPlusSign) {
    if (firstTag >= 0) {
      added |= acceptPlusSign(cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item, false);
    }
    if (nextTag < 0) {
      added |= acceptPlusSign(cp, returnTo, null, state, item, false);
    }
    if (added == 0L) {
      // Also attempt to accept custom plus sign string. Skip it when it is a single code
      // point already covered by UNISET_PLUS (the set acceptPlusSign consults), since that
      // case was just tried above.
      String pss = state.symbols.getPlusSignString();
      int pssCp = Character.codePointAt(pss, 0);
      if (pss.length() != Character.charCount(pssCp) || !UNISET_PLUS.contains(pssCp)) {
        resolvedStr = pss;
      }
    }
  }
  if (resolvedStr != null) {
    if (firstTag >= 0) {
      added |=
          acceptString(
              cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item, resolvedStr, 0, false);
    }
    if (nextTag < 0) {
      added |= acceptString(cp, returnTo, null, state, item, resolvedStr, 0, false);
    }
  }
  if (resolvedCurrency) {
    if (firstTag >= 0) {
      added |= acceptCurrency(cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item);
    }
    if (nextTag < 0) {
      added |= acceptCurrency(cp, returnTo, null, state, item);
    }
  }
  // Set state in the items that were added by the function calls. Visit each set bit by
  // clearing the lowest one per pass; a bound of the form "(1L << i) <= added" is a signed
  // comparison with a wrapping shift and misbehaves once bit 62 or 63 of the mask is set.
  for (long remaining = added; remaining != 0L; remaining &= remaining - 1) {
    StateItem addedItem = state.getItem(Long.numberOfTrailingZeros(remaining));
    addedItem.currentAffixPattern = str;
    addedItem.currentStepwiseParserTag = firstTag;
  }
  return added;
}
/**
 * Internal method that steps to the next token of an affix pattern, or leaves the affix pattern
 * when there is no further token.
 *
 * <p>A copy of {@code item} is appended to the new list in {@code state} and its score is raised
 * by one. If {@code newTag} is non-negative, the copy stays in {@code INSIDE_AFFIX_PATTERN} and
 * remembers {@code returnTo}, {@code str} and {@code newTag}; otherwise the copy moves to
 * {@code returnTo} with its trailing count reset and no pending return state.
 *
 * @param cp See {@link #acceptAffixPattern}
 * @param returnTo See {@link #acceptAffixPattern}
 * @param state See {@link #acceptAffixPattern}
 * @param item See {@link #acceptAffixPattern}
 * @param str See {@link #acceptAffixPattern}
 * @param newTag The tag corresponding to the next token in the affix pattern that should be
 *     recorded and consumed in a future call to {@link #acceptAffixPatternOffset}; negative when
 *     the pattern has no further token.
 * @return Bitmask containing one entry, the one that was added.
 */
private static long acceptAffixPatternHelper(
    int cp, StateName returnTo, ParserState state, StateItem item, CharSequence str, long newTag) {
  final StateItem successor = state.getNext().copyFrom(item, null, cp);
  successor.score++; // reward for consuming a cp from pattern

  final boolean patternExhausted = newTag < 0;
  if (patternExhausted) {
    // Reached last token in affix string: hand control back to the caller's state.
    successor.name = returnTo;
    successor.trailingCount = 0;
    successor.returnTo1 = null;
  } else {
    // Additional tokens in affix string: remember where to resume.
    successor.name = StateName.INSIDE_AFFIX_PATTERN;
    successor.returnTo1 = returnTo;
    successor.currentAffixPattern = str;
    successor.currentStepwiseParserTag = newTag;
  }

  return 1L << state.lastInsertedIndex();
}
/**
* Consumes tokens from a string or affix pattern following ICU's rules for handling of whitespace
* and bidi control characters (collectively called "ignorables"). The methods {@link
* #acceptStringHelper}, {@link #acceptAffixPatternHelper}, {@link #acceptStringNonIgnorable}, and
* {@link #acceptAffixPatternNonIgnorable} will be called by this method to actually add parse
* paths.
*
* <p>The method first looks at the token at <code>offsetOrTag</code>. If that token is ignorable,
* the run of ignorables it starts is scanned and handled (see the comments in the body); the
* method either returns from there or continues with the first nonignorable token after the run.
* For a nonignorable token, the method scans ahead to the next nonignorable token (or the end)
* and delegates to the matching "NonIgnorable" function.
*
* <p>In the "NonIgnorable" functions, two arguments are passed: firstOffsetOrTag and
* nextOffsetOrTag. These two arguments should add parse paths according to the following rules:
*
* <pre>
* if (firstOffsetOrTag is valid or inside string boundary) {
* // Add parse path going to firstOffsetOrTag
* }
* if (nextOffsetOrTag is invalid or beyond string boundary) {
* // Add parse path leaving the string
* }
* </pre>
*
* <p>Note that there may be multiple parse paths added by these lines. This is important in order
* to properly handle runs of ignorables.
*
* @param cp See {@link #acceptString} and {@link #acceptAffixPattern}
* @param ret1 See {@link #acceptString} and {@link #acceptAffixPattern}
* @param ret2 See {@link #acceptString} (affix pattern can pass null)
* @param state See {@link #acceptString} and {@link #acceptAffixPattern}
* @param item See {@link #acceptString} and {@link #acceptAffixPattern}
* @param str See {@link #acceptString} and {@link #acceptAffixPattern}
* @param offsetOrTag The current int offset for strings, or the current tag for affix patterns.
* @param trailing See {@link #acceptString} (affix patterns can pass false)
* @param isString true if the parameters correspond to a string; false if they correspond to an
* affix pattern.
* @return A bitmask containing the entries that were added.
*/
private static long acceptStringOrAffixPatternWithIgnorables(
int cp,
StateName ret1,
StateName ret2 /* String only */,
ParserState state,
StateItem item,
CharSequence str,
long offsetOrTag /* offset for string; tag for affix pattern */,
boolean trailing /* String only */,
boolean isString) {
// Runs of ignorables (whitespace and bidi control marks) can occur at the beginning, middle,
// or end of the reference string, or a run across the entire string.
//
// - A run at the beginning or in the middle corresponds to a run of length *zero or more*
// in the input.
// - A run at the end needs to be matched exactly.
// - A string that contains only ignorable characters also needs to be matched exactly.
//
// Because the behavior differs, we need logic here to determine which case we have.
// Current token: a code point for strings; a symbol type or code point for affix patterns.
int typeOrCp =
isString
? Character.codePointAt(str, (int) offsetOrTag)
: AffixUtils.getTypeOrCp(offsetOrTag);
if (isIgnorable(typeOrCp, state)) {
// Look for the next nonignorable code point
int nextTypeOrCp = typeOrCp;
// Position of the token visited just before nextOffsetOrTag; when the loop stops on a
// nonignorable token, this is the last ignorable token of the run.
long prevOffsetOrTag;
long nextOffsetOrTag = offsetOrTag;
// Position directly after the current token; 0L doubles as the "not yet recorded" marker.
long firstOffsetOrTag = 0L;
while (true) {
prevOffsetOrTag = nextOffsetOrTag;
// Advance by one token: by the code point's char count in a string, or via the affix
// pattern tokenizer.
nextOffsetOrTag =
isString
? nextOffsetOrTag + Character.charCount(nextTypeOrCp)
: AffixUtils.nextToken(nextOffsetOrTag, str);
if (firstOffsetOrTag == 0L) firstOffsetOrTag = nextOffsetOrTag;
// End reached: offset at or past the string length, or a negative tag for affix patterns.
if (isString ? nextOffsetOrTag >= str.length() : nextOffsetOrTag < 0) {
// Integer.MIN_VALUE is an invalid value for either a type or a cp;
// use it to indicate the end of the string.
nextTypeOrCp = Integer.MIN_VALUE;
break;
}
nextTypeOrCp =
isString
? Character.codePointAt(str, (int) nextOffsetOrTag)
: AffixUtils.getTypeOrCp(nextOffsetOrTag);
if (!isIgnorable(nextTypeOrCp, state)) break;
}
if (nextTypeOrCp == Integer.MIN_VALUE) {
// Run at end or string that contains only ignorable characters.
if (codePointEquals(cp, typeOrCp, state)) {
// Step forward and also exit the string if not at very end.
// RETURN
long added = 0L;
// Path 1: move to the position directly after the current ignorable.
added |=
isString
? acceptStringHelper(
cp, ret1, ret2, state, item, str, (int) firstOffsetOrTag, trailing)
: acceptAffixPatternHelper(cp, ret1, state, item, str, firstOffsetOrTag);
// Path 2: only when more ignorables follow, also add a path for the end position.
// When firstOffsetOrTag already is the end position, path 1 covers it.
if (firstOffsetOrTag != nextOffsetOrTag) {
added |=
isString
? acceptStringHelper(
cp, ret1, ret2, state, item, str, (int) nextOffsetOrTag, trailing)
: acceptAffixPatternHelper(cp, ret1, state, item, str, nextOffsetOrTag);
}
return added;
} else {
// Code point does not exactly match the run at end.
// RETURN
return 0L;
}
} else {
// Run at beginning or in middle.
if (isIgnorable(cp, state)) {
// Consume the ignorable. The added path resumes at prevOffsetOrTag, i.e. at the last
// ignorable token of the run rather than past it.
// RETURN
return isString
? acceptStringHelper(
cp, ret1, ret2, state, item, str, (int) prevOffsetOrTag, trailing)
: acceptAffixPatternHelper(cp, ret1, state, item, str, prevOffsetOrTag);
} else {
// Go to nonignorable cp.
// FALL THROUGH
}
}
// Fall through to the nonignorable code point found above.
assert nextTypeOrCp != Integer.MIN_VALUE;
typeOrCp = nextTypeOrCp;
offsetOrTag = nextOffsetOrTag;
}
// From here on, typeOrCp/offsetOrTag always describe a nonignorable token.
assert !isIgnorable(typeOrCp, state);
// Look for the next nonignorable code point after this nonignorable code point
// to determine if we are at the end of the string.
int nextTypeOrCp = typeOrCp;
long nextOffsetOrTag = offsetOrTag;
// As above: position directly after the current token, with 0L meaning "not yet recorded".
long firstOffsetOrTag = 0L;
while (true) {
nextOffsetOrTag =
isString
? nextOffsetOrTag + Character.charCount(nextTypeOrCp)
: AffixUtils.nextToken(nextOffsetOrTag, str);
if (firstOffsetOrTag == 0L) firstOffsetOrTag = nextOffsetOrTag;
if (isString ? nextOffsetOrTag >= str.length() : nextOffsetOrTag < 0) {
// End reached. nextTypeOrCp is not read after this loop; only the offsets/tags are
// handed on below.
nextTypeOrCp = -1;
break;
}
nextTypeOrCp =
isString
? Character.codePointAt(str, (int) nextOffsetOrTag)
: AffixUtils.getTypeOrCp(nextOffsetOrTag);
if (!isIgnorable(nextTypeOrCp, state)) break;
}
// Nonignorable logic. firstOffsetOrTag is the token directly after the current one (which may
// be an ignorable); nextOffsetOrTag is the next nonignorable token or the end position.
return isString
? acceptStringNonIgnorable(
cp, ret1, ret2, state, item, str, trailing, typeOrCp, firstOffsetOrTag, nextOffsetOrTag)
: acceptAffixPatternNonIgnorable(
cp, ret1, state, item, str, typeOrCp, firstOffsetOrTag, nextOffsetOrTag);
}
/**
 * Attempts to begin matching a currency at the given code point, adding items to the new list in
 * {@code state}.
 *
 * <p>If {@code cp} is equal to any known ISO code or long name, copies {@code item} to the new
 * list in {@code state} and sets its ISO code to the corresponding currency.
 *
 * <p>If {@code cp} is the first code point of any ISO code or long name having more than one code
 * point in length, copies {@code item} to the new list in {@code state} along with an instance
 * of {@link TextTrieMap.ParseState} for tracking the following code points.
 *
 * <p>This overload forwards to the five-argument {@code acceptCurrency} with no second return
 * state and discards the returned bitmask.
 *
 * @param cp The code point to check.
 * @param nextName The state passed on as the first return state.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptCurrency(
    int cp, StateName nextName, ParserState state, StateItem item) {
  final StateName noSecondReturnState = null;
  acceptCurrency(cp, nextName, noSecondReturnState, state, item);
}
/**
 * Attempts to begin matching a currency at the given code point and reports which items were
 * added to the new list in {@code state}.
 *
 * <p>Nothing is added if {@code item} has already seen a currency. Otherwise two strings are
 * tried first: when the properties carry a currency, its symbol name for the symbols' locale and
 * its currency code; if not, the currency symbol and international currency symbol of the
 * symbols object. Every item added by these two attempts is flagged as having seen a currency,
 * with its ISO code set to the second string.
 *
 * <p>If {@code state.parseCurrency} is set, parse states for the locale's long names and symbol
 * names are opened as well and handed to {@link #acceptCurrencyHelper}.
 *
 * @param cp The code point to check.
 * @param returnTo1 The first return state, forwarded to the string and trie matchers.
 * @param returnTo2 The second return state, forwarded to the string and trie matchers; may be
 *     null.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 * @return A bitmask containing the entries that were added.
 */
private static long acceptCurrency(
    int cp, StateName returnTo1, StateName returnTo2, ParserState state, StateItem item) {
  if (item.sawCurrency) return 0L;
  long added = 0L;

  // Accept from local currency information
  String str1, str2;
  Currency currency = state.properties.getCurrency();
  if (currency != null) {
    str1 = currency.getName(state.symbols.getULocale(), Currency.SYMBOL_NAME, null);
    str2 = currency.getCurrencyCode();
    // TODO: Should we also accept long names? In currency mode, they are in the CLDR data.
  } else {
    // No currency in the properties: fall back to the strings held by the symbols object.
    str1 = state.symbols.getCurrencySymbol();
    str2 = state.symbols.getInternationalCurrencySymbol();
  }
  added |= acceptString(cp, returnTo1, returnTo2, state, item, str1, 0, false);
  added |= acceptString(cp, returnTo1, returnTo2, state, item, str2, 0, false);

  // Mark the items added by the two string attempts. The set bits are visited one at a time by
  // clearing the lowest one on each pass; this is correct for every bit position, whereas a
  // "(1L << i) <= added" bound breaks once bit 62 or 63 is set.
  for (long pending = added; pending != 0L; pending &= pending - 1) {
    int index = Long.numberOfTrailingZeros(pending);
    state.getItem(index).sawCurrency = true;
    state.getItem(index).isoCode = str2;
  }

  // Accept from CLDR data
  if (state.parseCurrency) {
    ULocale uloc = state.symbols.getULocale();
    TextTrieMap<Currency.CurrencyStringInfo>.ParseState trie1 =
        Currency.openParseState(uloc, cp, Currency.LONG_NAME);
    TextTrieMap<Currency.CurrencyStringInfo>.ParseState trie2 =
        Currency.openParseState(uloc, cp, Currency.SYMBOL_NAME);
    // The helper flags its own items, so they are not part of the marking loop above.
    added |= acceptCurrencyHelper(cp, returnTo1, returnTo2, state, item, trie1);
    added |= acceptCurrencyHelper(cp, returnTo1, returnTo2, state, item, trie2);
  }
  return added;
}
/**
 * Continues a currency match that is already in progress: if {@code cp} is the next code point
 * of any currency, copies {@code item} to the new list in {@code state} along with an instance
 * of {@link TextTrieMap.ParseState} for tracking the following code points.
 *
 * <p>The return states and the trie state are read from {@code item} itself. This method should
 * only be called in a state following {@link #acceptCurrency}.
 *
 * @param cp The code point to check.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptCurrencyOffset(int cp, ParserState state, StateItem item) {
  final StateName firstReturn = item.returnTo1;
  final StateName secondReturn = item.returnTo2;
  acceptCurrencyHelper(cp, firstReturn, secondReturn, state, item, item.currentCurrencyTrieState);
}
/**
 * Feeds {@code cp} to a currency trie parse state and adds the resulting items to the new list in
 * {@code state}.
 *
 * <p>If the trie reports matches after consuming {@code cp}, one item is added that continues in
 * {@code returnTo1}, is flagged as having seen a currency, and takes its ISO code from the first
 * match. If the trie is not at its end, a second item is added in {@code INSIDE_CURRENCY} that
 * keeps the trie state for the following code points.
 *
 * @param cp The code point to check.
 * @param returnTo1 The state to continue in after a completed match.
 * @param returnTo2 The state that becomes the first return state of a completed match.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 * @param trieState The trie parse state to advance; if null, nothing is added.
 * @return A bitmask containing the entries that were added.
 */
private static long acceptCurrencyHelper(
    int cp,
    StateName returnTo1,
    StateName returnTo2,
    ParserState state,
    StateItem item,
    TextTrieMap<Currency.CurrencyStringInfo>.ParseState trieState) {
  if (trieState == null) {
    return 0L;
  }
  trieState.accept(cp);

  long resultMask = 0L;

  final Iterator<Currency.CurrencyStringInfo> matches = trieState.getCurrentMatches();
  if (matches != null) {
    // Match on current code point
    // TODO: What should happen with multiple currency matches?
    final StateItem completed = state.getNext().copyFrom(item, returnTo1, -1);
    completed.returnTo1 = returnTo2;
    completed.returnTo2 = null;
    completed.sawCurrency = true;
    completed.isoCode = matches.next().getISOCode();
    resultMask |= 1L << state.lastInsertedIndex();
  }

  final boolean moreInputPossible = !trieState.atEnd();
  if (moreInputPossible) {
    // Prepare for matches on future code points
    final StateItem pending = state.getNext().copyFrom(item, StateName.INSIDE_CURRENCY, -1);
    pending.returnTo1 = returnTo1;
    pending.returnTo2 = returnTo2;
    pending.currentCurrencyTrieState = trieState;
    resultMask |= 1L << state.lastInsertedIndex();
  }

  return resultMask;
}
/**
 * Starts a digit match through the digit trie of {@code state}. A parse state is opened for
 * {@code cp}; if the trie has none to offer, nothing is added, otherwise the work is handed to
 * {@link #acceptDigitTrieHelper}.
 *
 * @param cp The code point to check.
 * @param nextName The state to continue in after a completed digit.
 * @param state The state object to update; its digit trie must not be null.
 * @param item The old state leading into the code point.
 * @param type The digit type forwarded to the helper.
 * @return A bitmask containing the entries that were added.
 */
private static long acceptDigitTrie(
    int cp, StateName nextName, ParserState state, StateItem item, DigitType type) {
  assert state.digitTrie != null;
  final TextTrieMap<Byte>.ParseState opened = state.digitTrie.openParseState(cp);
  return (opened == null)
      ? 0L
      : acceptDigitTrieHelper(cp, nextName, state, item, type, opened);
}
/**
 * Continues a digit-trie match that is already in progress, using the return state, digit type
 * and trie state stored on {@code item}. The bitmask returned by the helper is discarded.
 *
 * @param cp The code point to check.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 */
private static void acceptDigitTrieOffset(int cp, ParserState state, StateItem item) {
  final StateName resumeAt = item.returnTo1;
  final DigitType pendingType = item.currentDigitType;
  acceptDigitTrieHelper(cp, resumeAt, state, item, pendingType, item.currentDigitTrieState);
}
/**
 * Feeds {@code cp} to a digit trie parse state and adds the resulting items to the new list in
 * {@code state}.
 *
 * <p>If the trie reports matches after consuming {@code cp}, the first matched digit is recorded
 * on a new item that continues in {@code returnTo1}. If the trie is not at its end, a second
 * item is added in {@code INSIDE_DIGIT} that keeps the trie state and digit type for the
 * following code points.
 *
 * @param cp The code point to check.
 * @param returnTo1 The state to continue in after a completed digit.
 * @param state The state object to update.
 * @param item The old state leading into the code point.
 * @param type The digit type to record with the digit.
 * @param trieState The trie parse state to advance; if null, nothing is added.
 * @return A bitmask containing the entries that were added.
 */
private static long acceptDigitTrieHelper(
    int cp,
    StateName returnTo1,
    ParserState state,
    StateItem item,
    DigitType type,
    TextTrieMap<Byte>.ParseState trieState) {
  if (trieState == null) {
    return 0L;
  }
  trieState.accept(cp);

  long resultMask = 0L;

  final Iterator<Byte> matches = trieState.getCurrentMatches();
  if (matches != null) {
    // Match on current code point
    final byte matchedDigit = matches.next();
    final StateItem completed = state.getNext().copyFrom(item, returnTo1, -1);
    completed.returnTo1 = null;
    recordDigit(completed, matchedDigit, type);
    resultMask |= 1L << state.lastInsertedIndex();
  }

  final boolean moreInputPossible = !trieState.atEnd();
  if (moreInputPossible) {
    // Prepare for matches on future code points
    final StateItem pending = state.getNext().copyFrom(item, StateName.INSIDE_DIGIT, -1);
    pending.returnTo1 = returnTo1;
    pending.currentDigitTrieState = trieState;
    pending.currentDigitType = type;
    resultMask |= 1L << state.lastInsertedIndex();
  }

  return resultMask;
}
/**
 * Compares two code points, honoring the case sensitivity requested in the ParserState: in
 * case-sensitive mode they must be identical; otherwise both are case-folded before the
 * comparison.
 *
 * @see #acceptString
 * @see #acceptAffixPattern
 */
private static boolean codePointEquals(int cp1, int cp2, ParserState state) {
  if (state.caseSensitive) {
    return cp1 == cp2;
  }
  final int folded1 = UCharacter.foldCase(cp1, true);
  final int folded2 = UCharacter.foldCase(cp2, true);
  return folded1 == folded2;
}
/**
 * Tells whether the given code point is "ignorable" and should be skipped. BiDi control marks are
 * ignorable in every mode; whitespace is ignorable only in lenient mode.
 *
 * <p>A negative {@code cp} is never ignorable.
 *
 * @param cp The code point to test.
 * @param state The current {@link ParserState}, used for determining strict mode.
 * @return true if cp is ignorable; false otherwise.
 */
private static boolean isIgnorable(int cp, ParserState state) {
  return cp >= 0
      && (UNISET_BIDI.contains(cp)
          || (state.mode == ParseMode.LENIENT && UNISET_WHITESPACE.contains(cp)));
}
}