// blob: fca9dfa4f8dc2b78ebaa6eb2d72497c6fc03c933 [file] [log] [blame]
package org.unicode.cldr.draft;
import java.text.FieldPosition;
import java.text.Format;
import java.text.ParsePosition;
import java.util.BitSet;
import java.util.Set;
import java.util.TreeSet;
import org.unicode.cldr.draft.PatternFixer.Target;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
/**
 * A {@link Format} that writes a {@link UnicodeSet} as a bracketed character class (wrapped in a
 * non-capturing group when the set also contains multi-character strings) and that parses
 * UnicodeSet patterns back into sets.
 *
 * <p>Prototype: the {@code target}, {@code options} and {@code extensions} settings are stored and
 * returned by the accessors below, but no method in this class reads them yet.
 */
public class UnicodeSetFormat extends Format {

    /**
     * Creates a format for the given target, leaving the options at 0.
     *
     * @param target the pattern dialect this format is meant to produce
     */
    public UnicodeSetFormat(Target target) {
        this(target, 0);
    }

    /**
     * Creates a format for the given target and pattern options.
     *
     * @param target the pattern dialect this format is meant to produce
     * @param patternOptions option bits, stored as-is
     */
    public UnicodeSetFormat(Target target, int patternOptions) {
        this.target = target;
        this.options = patternOptions;
    }

    // main methods

    /**
     * Appends a rendering of a {@link UnicodeSet} to {@code toAppendTo}.
     *
     * <p>Code points and ranges are written inside one bracketed class. If the set also contains
     * strings, the output becomes {@code (?:s1|s2|[...])}, with the strings in their natural
     * (sorted) order ahead of the class.
     *
     * <p>NOTE(review): a set without any code points still produces an empty "[]" (for example
     * "(?:ab|[])" for a strings-only set). java.util.regex rejects an empty class, so confirm
     * whether that case needs a different rendering for the Java target.
     *
     * @param obj the set to format; must be a {@link UnicodeSet}
     * @param toAppendTo buffer that receives the output
     * @param pos not used by this implementation
     * @return {@code toAppendTo}, for chaining
     * @throws IllegalArgumentException if {@code obj} is null or not a {@link UnicodeSet}
     */
    @Override
    // TODO clean up prototype
    public StringBuffer format(Object obj, StringBuffer toAppendTo, FieldPosition pos) {
        // java.text.Format documents IllegalArgumentException for objects it cannot format;
        // a blind cast would surface as ClassCastException (or a later NullPointerException).
        if (!(obj instanceof UnicodeSet)) {
            throw new IllegalArgumentException("Cannot format given Object as a UnicodeSet: "
                + (obj == null ? "null" : obj.getClass().getName()));
        }
        // API for Format calls for StringBuffer, but should update to StringBuilder
        final int startPos = toAppendTo.length();
        Set<String> strings = null;
        toAppendTo.append('[');
        for (UnicodeSetIterator it = new UnicodeSetIterator((UnicodeSet) obj); it.nextRange();) {
            if (it.codepoint == UnicodeSetIterator.IS_STRING) {
                // Strings cannot live inside the class; collect them for the alternation below.
                if (strings == null) {
                    strings = new TreeSet<String>();
                }
                strings.add(it.string);
                continue;
            }
            appendQuoted(toAppendTo, it.codepoint);
            if (it.codepointEnd != it.codepoint) {
                appendQuoted(toAppendTo.append('-'), it.codepointEnd);
            }
        }
        toAppendTo.append(']');
        if (strings != null) { // edge case
            // Build "(?:s1|s2|" and splice it in front of the class that was already appended.
            StringBuffer extras = new StringBuffer("(?:");
            for (String string : strings) {
                appendQuoted(extras, string).append('|');
            }
            toAppendTo.insert(startPos, extras);
            toAppendTo.append(')');
        }
        return toAppendTo;
    }

    // TODO optimize this to only quote what is needed for the particular target
    // and (possibly) the given location in the character class
    /**
     * Appends one code point, quoted for use inside a character class.
     *
     * <p>Class syntax characters get a leading backslash. Code points in {@code toQuote} are
     * written as four-digit hex escapes (an escaped surrogate pair above U+FFFF). Everything else
     * is appended literally.
     *
     * @param target buffer that receives the output
     * @param codePoint the code point to append
     * @return {@code target}, for chaining
     */
    private StringBuffer appendQuoted(StringBuffer target, int codePoint) {
        switch (codePoint) {
        case '[': // SET_OPEN:
        case ']': // SET_CLOSE:
        case '-': // HYPHEN:
        case '^': // COMPLEMENT:
        case '&': // INTERSECTION:
        case '\\': // BACKSLASH:
        case '{':
        case '}':
        case '$':
        case ':':
            target.append('\\');
            break;
        default:
            if (toQuote.contains(codePoint)) {
                if (codePoint > 0xFFFF) {
                    appendHexEscape(target, UTF16.getLeadSurrogate(codePoint));
                    appendHexEscape(target, UTF16.getTrailSurrogate(codePoint));
                } else {
                    appendHexEscape(target, codePoint);
                }
                return target;
            }
            break;
        }
        UTF16.append(target, codePoint);
        return target;
    }

    /** Appends a backslash, the letter u and the four-digit hex value of {@code codeUnit}. */
    private static void appendHexEscape(StringBuffer target, int codeUnit) {
        target.append("\\u").append(Utility.hex(codeUnit, 4));
    }

    /**
     * Appends a string quoted for use as one alternative of the {@code (?:...|...)} group that
     * {@link #format} builds, i.e. outside the bracketed class.
     *
     * <p>The string is walked by code point, not by UTF-16 code unit, so that a supplementary
     * code point contained in {@code toQuote} is recognised and escaped. Grouping, alternation,
     * wildcard and quantifier characters, which are only special outside a class, are
     * backslash-quoted here; all other code points go through the per-code-point quoting.
     *
     * @param target buffer that receives the output
     * @param string the string to append
     * @return {@code target}, for chaining
     */
    private StringBuffer appendQuoted(StringBuffer target, String string) {
        for (int i = 0; i < string.length();) {
            final int codePoint = string.codePointAt(i);
            i += Character.charCount(codePoint);
            switch (codePoint) {
            case '.':
            case '*':
            case '+':
            case '?':
            case '(':
            case ')':
            case '|':
                target.append('\\').append((char) codePoint);
                break;
            default:
                appendQuoted(target, codePoint);
                break;
            }
        }
        return target;
    }

    /**
     * Parses a UnicodeSet pattern by handing {@code pattern} and {@code pos} to the
     * {@link UnicodeSet} constructor, with no symbol table.
     *
     * <p>NOTE(review): presumably the constructor throws on a malformed pattern rather than
     * returning null as {@link Format#parseObject(String, ParsePosition)} describes. Confirm
     * before relying on the null-on-error convention.
     *
     * @param pattern text containing the pattern
     * @param pos parse position passed through to the constructor
     * @return the parsed set
     */
    @Override
    public final UnicodeSet parseObject(String pattern, ParsePosition pos) {
        return new UnicodeSet(pattern, pos, null);
    }

    // settings

    /** Returns the target this format was configured with. */
    public Target getTarget() {
        return target;
    }

    /**
     * Sets the target.
     *
     * @return this object, for chaining
     */
    public UnicodeSetFormat setTarget(Target target) {
        this.target = target;
        return this;
    }

    /** Returns the option bits this format was configured with. */
    public int getOptions() {
        return options;
    }

    /**
     * Sets the option bits.
     *
     * @return this object, for chaining
     */
    public UnicodeSetFormat setOptions(int options) {
        this.options = options;
        return this;
    }

    /** Returns the extensions array exactly as stored (no copy is made); may be null. */
    public Extension[] getExtensions() {
        return extensions;
    }

    /**
     * Stores the given extensions array (no copy is made).
     *
     * @return this object, for chaining
     */
    public UnicodeSetFormat setExtensions(Extension... extensions) {
        this.extensions = extensions;
        return this;
    }

    /** Hook for resolving variables and properties that appear in a pattern. */
    public abstract class Extension {
        /**
         * Is called every time an unquoted $ is found. Should parse out variables as appropriate
         * and report how far it got, together with the replacement string.
         *
         * @param pattern the pattern text being processed
         * @param pos on input, the position just before the dollar sign;
         *            on output, the end of the text to replace
         * @return the replacement string, or null if the text does not match a variable
         */
        public abstract String replaceVariable(String pattern, ParsePosition pos);

        /**
         * Resolves anything that looks like a property, e.g.: <br>
         * encountering \p{whitespace} or [:whitespace:] would call
         * getProperty("whitespace", "", false, result)<br>
         * while
         * \p{bidi_class=neutral} would call getProperty("bidi_class", "neutral",
         * false, result) and <br>
         * \p{name=/DOT/} would call
         * getProperty("name", "DOT", true, result) <br>
         * (for an example of the latter, see
         * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=\p{name=/WITH%20DOT%20ABOVE/}">the
         * UnicodeSet utility listing for name=/WITH DOT ABOVE/</a>).
         *
         * @param propertyName the property name as written in the pattern
         * @param propertyValue the property value as written in the pattern, or "" if none was given
         * @param regex
         *            Set to true if the property value is a regex "find" expression. In that case,
         *            the result should be the set of Unicode characters that match the regex.
         * @param result receives the matching characters
         * @return NOTE(review): presumably true when the property was resolved; confirm with
         *         implementers
         */
        public abstract boolean getProperty(String propertyName, String propertyValue, boolean regex, UnicodeSet result);
    }

    /**
     * Describes {@code original} as a union of property expressions plus whatever is left over.
     *
     * <p>Each listed property is tried in order. When {@code addOthers} is true, every remaining
     * property from {@code UProperty.INT_START} up to {@code UProperty.INT_LIMIT} is tried as
     * well. The result has the shape
     * {@code [ <property expressions> - <expandBlockIgnorables> <remainder>]}; the last two parts
     * are omitted when {@code expandBlockIgnorables} is null or nothing is left over.
     *
     * @param original the set to describe
     * @param addOthers whether to also try the int-valued properties that were not listed
     * @param expandBlockIgnorables characters that may be disregarded when deciding whether a
     *            whole block can stand in for part of {@code original}; may be null
     * @param properties property enum values to try first
     * @return the resulting pattern text
     */
    public String formatWithProperties(UnicodeSet original, boolean addOthers, UnicodeSet expandBlockIgnorables,
        int... properties) {
        UnicodeSet remainder = new UnicodeSet().addAll(original);
        Set<String> propSet = new TreeSet<String>();
        BitSet props = new BitSet();
        for (int property : properties) {
            reduceByProperty(original, expandBlockIgnorables, property, remainder, propSet);
            // Record the property enum itself (not its position in the argument list), because
            // the loop below looks properties up by enum value.
            props.set(property);
        }
        if (addOthers) {
            for (int i = UProperty.INT_START; i < UProperty.INT_LIMIT; ++i) {
                if (props.get(i)) {
                    continue; // already handled above
                }
                reduceByProperty(original, expandBlockIgnorables, i, remainder, propSet);
            }
        }
        StringBuilder result = new StringBuilder("[ ");
        for (String prop : propSet) {
            result.append(prop).append(" ");
        }
        if (expandBlockIgnorables != null) {
            result.append("- ").append(expandBlockIgnorables.toPattern(true));
        }
        if (remainder.size() > 0) {
            result.append(" ").append(remainder.toPattern(true));
        }
        result.append("]");
        return result.toString();
    }

    /** Property enum obtained by looking up the property name "block". */
    static final int blockEnum = UCharacter.getPropertyEnum("block");

    /**
     * Tries every value of one property. Whenever the characters having that value still overlap
     * {@code remainder} and are all contained in {@code original}, adds
     * {@code [:property=value:]} to {@code result} and removes those characters from
     * {@code remainder}.
     *
     * <p>For the block property, when {@code expandBlockIgnorables} is non-null, a block is also
     * accepted if more than 5 characters remain after the ignorables are disregarded and
     * {@code original} contains all of them.
     *
     * @param original the set being described (not modified)
     * @param expandBlockIgnorables characters to disregard for the block test; may be null
     * @param property the property enum to try
     * @param remainder characters not yet covered; updated in place
     * @param result receives the property expressions; updated in place
     */
    private void reduceByProperty(UnicodeSet original, UnicodeSet expandBlockIgnorables, int property,
        UnicodeSet remainder, Set<String> result) {
        final String propertyAlias = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT);
        // The value range does not depend on the loop variable, so fetch it once.
        final int minValue = UCharacter.getIntPropertyMinValue(property);
        final int maxValue = UCharacter.getIntPropertyMaxValue(property);
        final UnicodeSet valueChars = new UnicodeSet();
        for (int value = minValue; value <= maxValue; ++value) {
            String valueAlias = UCharacter.getPropertyValueName(property, value, UProperty.NameChoice.SHORT);
            if (valueAlias == null) {
                valueAlias = UCharacter.getPropertyValueName(property, value, UProperty.NameChoice.LONG);
            }
            if (valueAlias == null) {
                continue; // no name for this numeric value
            }
            valueChars.clear();
            valueChars.applyPropertyAlias(propertyAlias, valueAlias);
            if (!remainder.containsSome(valueChars)) {
                continue; // nothing left that this value could cover
            }
            boolean covered = original.containsAll(valueChars);
            if (!covered && property == blockEnum && expandBlockIgnorables != null) {
                UnicodeSet hasScript = new UnicodeSet(valueChars).removeAll(expandBlockIgnorables);
                if (hasScript.size() > 5 && original.containsAll(hasScript)) {
                    // Diagnostic output retained from the prototype.
                    System.out.println("Broadening to block: " + valueAlias);
                    covered = true;
                }
            }
            if (covered) {
                result.add("[:" + propertyAlias + '=' + valueAlias + ":]");
                remainder.removeAll(valueChars);
            }
        }
    }

    // ===== PRIVATES =====
    private static final long serialVersionUID = 1L;
    private Target target;
    private int options;
    private Extension[] extensions;
    /** Code points that are written as hex escapes instead of literally; frozen. */
    private static final UnicodeSet toQuote = (UnicodeSet) new UnicodeSet(
        "[[:Cn:][:Default_Ignorable_Code_Point:][:patternwhitespace:]]").freeze();
}