tools/java/org/unicode/cldr/draft/UnicodeSetFormat.java - platform/external/cldr - Git at Google

 package org.unicode.cldr.draft;

 import java.text.FieldPosition;
 import java.text.Format;
 import java.text.ParsePosition;
 import java.util.BitSet;
 import java.util.Set;
 import java.util.TreeSet;

 import org.unicode.cldr.draft.PatternFixer.Target;

 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UnicodeSetIterator;

 /**
  * A {@link Format} for {@link UnicodeSet}s: {@code format} writes a set as a bracketed
  * character-class pattern, and {@code parseObject} reads a UnicodeSet pattern back in.
  * <p>
  * The target, options and extensions are stored and exposed through accessors, but none of the
  * methods in this class reads them yet.
  */
 public class UnicodeSetFormat extends Format {

     /**
      * Creates a formatter for the given target; options keep their default value of 0.
      *
      * @param target the pattern target to remember
      */
     public UnicodeSetFormat(Target target) {
         this.target = target;
     }

     /**
      * Creates a formatter for the given target and pattern options.
      *
      * @param target the pattern target to remember
      * @param patternOptions option bits to remember
      */
     public UnicodeSetFormat(Target target, int patternOptions) {
         this.target = target;
         this.options = patternOptions;
     }

     // main methods

     /**
      * Appends a pattern for the given set to {@code toAppendTo}.
      * <p>
      * The code points of the set are written as a character class {@code [...]}, one entry per
      * range, with multi-code-point ranges written as {@code start-end}. If the set also contains
      * strings, the class is wrapped in a non-capturing alternation placed at the position where
      * this call started writing: {@code (?:string1|string2|[...])}, with the strings in sorted order.
      *
      * @param obj the set to format; it is cast to {@link UnicodeSet} without a prior type check
      * @param toAppendTo the buffer that receives the pattern
      * @param pos not used
      * @return {@code toAppendTo}
      */
     @Override
     // TODO clean up prototype
     public StringBuffer format(Object obj, StringBuffer toAppendTo, FieldPosition pos) {
         // API for Format calls for StringBuffer, but should update to StringBuilder
         int startPos = toAppendTo.length();
         Set<String> strings = null;
         toAppendTo.append('[');
         for (UnicodeSetIterator it = new UnicodeSetIterator((UnicodeSet) obj); it.nextRange();) {
             if (it.codepoint == UnicodeSetIterator.IS_STRING) {
                 // Strings cannot live inside a character class; collect them for the alternation.
                 if (strings == null) {
                     strings = new TreeSet<String>();
                 }
                 strings.add(it.string);
                 continue;
             }
             appendQuoted(toAppendTo, it.codepoint);
             if (it.codepointEnd != it.codepoint) {
                 appendQuoted(toAppendTo.append('-'), it.codepointEnd);
             }
         }
         toAppendTo.append(']');
         if (strings != null) { // edge case
             StringBuffer extras = new StringBuffer("(?:");
             for (String string : strings) {
                 appendQuoted(extras, string).append('|');
             }
             // Put the alternatives in front of the class written above, then close the group.
             toAppendTo.insert(startPos, extras);
             toAppendTo.append(')');
         }
         return toAppendTo;
     }

     // TODO optimize this to only quote what is needed for the particular target
     // and (possibly) the given location in the character class
     /**
      * Appends one code point, quoted if necessary.
      * <p>
      * Set syntax characters are preceded by a backslash. Code points contained in {@code toQuote}
      * are written as four-digit hexadecimal backslash-u escapes (two of them, lead then trail
      * surrogate, for code points above 0xFFFF). Everything else is appended unchanged.
      *
      * @param target the buffer to append to
      * @param codePoint the code point to write
      * @return {@code target}
      */
     private StringBuffer appendQuoted(StringBuffer target, int codePoint) {
         switch (codePoint) {
         case '[': // SET_OPEN:
         case ']': // SET_CLOSE:
         case '-': // HYPHEN:
         case '^': // COMPLEMENT:
         case '&': // INTERSECTION:
         case '\\': // BACKSLASH:
         case '{':
         case '}':
         case '$':
         case ':':
             // The character itself is appended after the switch.
             target.append('\\');
             break;
         default:
             if (toQuote.contains(codePoint)) {
                 if (codePoint > 0xFFFF) {
                     target.append("\\u");
                     target.append(Utility.hex(UTF16.getLeadSurrogate(codePoint), 4));
                     codePoint = UTF16.getTrailSurrogate(codePoint);
                 }
                 target.append("\\u");
                 target.append(Utility.hex(codePoint, 4));
                 return target;
             }
         }
         UTF16.append(target, codePoint);
         return target;
     }

     /**
      * Appends every UTF-16 code unit of {@code string} through
      * {@link #appendQuoted(StringBuffer, int)}.
      *
      * @param target the buffer to append to
      * @param string the text to write
      * @return {@code target}
      */
     private StringBuffer appendQuoted(StringBuffer target, String string) {
         for (int i = 0; i < string.length(); ++i) {
             appendQuoted(target, string.charAt(i));
             // don't worry about surrogates; this works in Java
             // for other Targets we may have to fix.
         }
         return target;
     }

     /**
      * Parses a UnicodeSet pattern by delegating to the {@link UnicodeSet} constructor, passing
      * {@code null} as its third argument.
      *
      * @param pattern the text containing the pattern
      * @param pos the parse position handed to the constructor
      * @return the newly constructed set
      */
     @Override
     public final UnicodeSet parseObject(String pattern, ParsePosition pos) {
         return new UnicodeSet(pattern, pos, null);
     }

     // settings

     /** Returns the target given at construction or by {@link #setTarget}. */
     public Target getTarget() {
         return target;
     }

     /**
      * Replaces the target.
      *
      * @return this object, for chaining
      */
     public UnicodeSetFormat setTarget(Target target) {
         this.target = target;
         return this;
     }

     /** Returns the option bits given at construction or by {@link #setOptions}. */
     public int getOptions() {
         return options;
     }

     /**
      * Replaces the option bits.
      *
      * @return this object, for chaining
      */
     public UnicodeSetFormat setOptions(int options) {
         this.options = options;
         return this;
     }

     /** Returns the stored extensions array itself (not a copy); null until set. */
     public Extension[] getExtensions() {
         return extensions;
     }

     /**
      * Stores the given extensions array as is (no copy is made).
      *
      * @return this object, for chaining
      */
     public UnicodeSetFormat setExtensions(Extension... extensions) {
         this.extensions = extensions;
         return this;
     }

     /** Hook for customizing how variables and properties in a pattern are resolved. */
     public abstract class Extension {

         /**
          * Is called every time an unquoted $ is found. Should parse out variables as appropriate
          * and return how far we got, and the replacement string.
          *
          * @param pattern
          *            the pattern being processed
          * @param pos
          *            on input should be set to the position just before the dollar sign.
          *            On output should be set to the end of the text to replace.
          * @return the replacement string, or null if the text doesn't match a variable
          */
         public abstract String replaceVariable(String pattern, ParsePosition pos);

         /**
          * Resolves anything that looks like a property, eg: <br>
          * encountering \p{whitespace} or [:whitespace:] would call
          * getProperty("whitespace", "", false, result)<br>
          * while
          * \p{bidi_class=neutral} would call getProperty("bidi_class", "neutral",
          * false, result) and <br>
          * \p{name=/DOT/} would call
          * getProperty("name", "DOT", true, result) (the regex form; presumably the
          * surrounding slashes are stripped - confirm against callers)<br>
          * (for an example of the latter, see
          * http://unicode.org/cldr/utility/list-unicodeset.jsp?a=\p name=/WITH%20DOT%20ABOVE/ )
          *
          * @param propertyName
          *            the property name as it appears in the pattern
          * @param propertyValue
          *            the property value, or "" when the pattern gives none
          * @param regex
          *            Set to true if the property value is a regex "find" expression. In that case,
          *            the result should be the set of Unicode characters that match the regex.
          * @param result
          *            the set that receives the matching characters
          * @return not specified here; presumably whether the property was resolved - confirm
          *         against implementations
          */
         public abstract boolean getProperty(String propertyName, String propertyValue, boolean regex, UnicodeSet result);

     }

     /**
      * Describes {@code original} as a union of property-value sets where possible.
      * <p>
      * Each requested property is tried in turn; every property value whose characters are fully
      * contained in {@code original} is emitted as {@code [:property=value:]} and removed from the
      * remainder. The result has the shape
      * {@code [ <properties...> - <expandBlockIgnorables> <remainder>]}, where the last two parts
      * are only present when {@code expandBlockIgnorables} is non-null and the remainder is
      * non-empty, respectively.
      *
      * @param original the set to describe; it is not modified
      * @param addOthers if true, also try every integer-valued property not already listed in
      *            {@code properties}
      * @param expandBlockIgnorables characters that may be ignored when deciding whether a whole
      *            block can be used; may be null to disable block broadening
      * @param properties property enum values to try first, in order
      * @return the pattern text
      */
     public String formatWithProperties(UnicodeSet original, boolean addOthers, UnicodeSet expandBlockIgnorables,
         int... properties) {
         UnicodeSet remainder = new UnicodeSet().addAll(original);
         Set<String> propSet = new TreeSet<String>();
         BitSet props = new BitSet();

         for (int property : properties) {
             reduceByProperty(original, expandBlockIgnorables, property, remainder, propSet);
             // Record the property enum itself (not its position in the argument list) so that
             // the addOthers pass below can recognize and skip it.
             props.set(property);
         }
         if (addOthers) {
             for (int i = UProperty.INT_START; i < UProperty.INT_LIMIT; ++i) {
                 if (props.get(i)) continue;
                 reduceByProperty(original, expandBlockIgnorables, i, remainder, propSet);
             }
         }
         StringBuilder result = new StringBuilder("[ ");
         for (String prop : propSet) {
             result.append(prop).append(" ");
         }
         if (expandBlockIgnorables != null) {
             result.append("- ").append(expandBlockIgnorables.toPattern(true));
         }
         if (remainder.size() > 0) {
             result.append(" ").append(remainder.toPattern(true));
         }
         result.append("]");
         return result.toString();
     }

     static final int blockEnum = UCharacter.getPropertyEnum("block");

     /**
      * Tries every value of one property against the remainder.
      * <p>
      * A value is accepted when its characters overlap {@code remainder} and are all contained in
      * {@code original}. For the block property, a value is also accepted when, after removing
      * {@code expandBlockIgnorables}, more than 5 characters are left and all of them are in
      * {@code original}. Accepted values are added to {@code result} as
      * {@code [:property=value:]} and their characters are removed from {@code remainder}.
      *
      * @param original the full set being described; not modified
      * @param expandBlockIgnorables characters to disregard for block broadening; may be null
      * @param property the property enum value to try
      * @param remainder the characters not yet covered; updated in place
      * @param result receives the accepted property expressions
      */
     private void reduceByProperty(UnicodeSet original, UnicodeSet expandBlockIgnorables, int property,
         UnicodeSet remainder, Set<String> result) {
         String propertyAlias = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT);
         UnicodeSet valueChars = new UnicodeSet();
         // The value range depends only on the property, so look it up once.
         int minValue = UCharacter.getIntPropertyMinValue(property);
         int maxValue = UCharacter.getIntPropertyMaxValue(property);
         for (int i = minValue; i <= maxValue; ++i) {
             String valueAlias = UCharacter.getPropertyValueName(property, i, UProperty.NameChoice.SHORT);
             if (valueAlias == null) {
                 valueAlias = UCharacter.getPropertyValueName(property, i, UProperty.NameChoice.LONG);
             }
             if (valueAlias == null) continue;

             valueChars.clear();
             valueChars.applyPropertyAlias(propertyAlias, valueAlias);
             if (remainder.containsSome(valueChars)) {
                 if (original.containsAll(valueChars)) {
                     result.add("[:" + propertyAlias + '=' + valueAlias + ":]");
                     remainder.removeAll(valueChars);
                 } else if (property == blockEnum && expandBlockIgnorables != null) {
                     UnicodeSet hasScript = new UnicodeSet(valueChars).removeAll(expandBlockIgnorables);
                     if (hasScript.size() > 5 && original.containsAll(hasScript)) {
                         System.out.println("Broadening to block: " + valueAlias);
                         result.add("[:" + propertyAlias + '=' + valueAlias + ":]");
                         remainder.removeAll(valueChars);
                     }
                 }
             }
         }
     }

     // ===== PRIVATES =====
     private static final long serialVersionUID = 1L;
     private Target target;
     private int options;
     private Extension[] extensions;
     // Code points that format() writes as hexadecimal escapes rather than literally.
     private static final UnicodeSet toQuote = (UnicodeSet) new UnicodeSet(
         "[[:Cn:][:Default_Ignorable_Code_Point:][:patternwhitespace:]]").freeze();

 }
	package org.unicode.cldr.draft;

	import java.text.FieldPosition;
	import java.text.Format;
	import java.text.ParsePosition;
	import java.util.BitSet;
	import java.util.Set;
	import java.util.TreeSet;

	import org.unicode.cldr.draft.PatternFixer.Target;

	import com.ibm.icu.impl.Utility;
	import com.ibm.icu.lang.UCharacter;
	import com.ibm.icu.lang.UProperty;
	import com.ibm.icu.text.UTF16;
	import com.ibm.icu.text.UnicodeSet;
	import com.ibm.icu.text.UnicodeSetIterator;

	public class UnicodeSetFormat extends Format {

	public UnicodeSetFormat(Target target) {
	this.target = target;
	}

	public UnicodeSetFormat(Target target, int patternOptions) {
	this.target = target;
	this.options = patternOptions;
	}

	// main methods

	@Override
	// TODO clean up prototype
	public StringBuffer format(Object obj, StringBuffer toAppendTo, FieldPosition pos) {
	// API for Format calls for StringBuffer, but should update to StringBuilder
	int startPos = toAppendTo.length();
	Set<String> strings = null;
	toAppendTo.append('[');
	for (UnicodeSetIterator it = new UnicodeSetIterator((UnicodeSet) obj); it.nextRange();) {
	if (it.codepoint == UnicodeSetIterator.IS_STRING) {
	if (strings == null) {
	strings = new TreeSet<String>();
	}
	strings.add(it.string);
	continue;
	}
	appendQuoted(toAppendTo, it.codepoint);
	if (it.codepointEnd != it.codepoint) {
	appendQuoted(toAppendTo.append('-'), it.codepointEnd);
	}
	}
	toAppendTo.append(']');
	if (strings != null) { // edge case
	StringBuffer extras = new StringBuffer("(?:");
	for (String string : strings) {
	appendQuoted(extras, string).append('\|');
	}
	toAppendTo.insert(startPos, extras);
	toAppendTo.append(')');
	}
	return toAppendTo;
	}

	// TODO optimize this to only quote what is needed for the particular target
	// and (possibly) the given location in the character class
	private StringBuffer appendQuoted(StringBuffer target, int codePoint) {
	switch (codePoint) {
	case '[': // SET_OPEN:
	case ']': // SET_CLOSE:
	case '-': // HYPHEN:
	case '^': // COMPLEMENT:
	case '&': // INTERSECTION:
	case '\\': // BACKSLASH:
	case '{':
	case '}':
	case '$':
	case ':':
	target.append('\\');
	break;
	default:
	if (toQuote.contains(codePoint)) {
	if (codePoint > 0xFFFF) {
	target.append("\\u");
	target.append(Utility.hex(UTF16.getLeadSurrogate(codePoint), 4));
	codePoint = UTF16.getTrailSurrogate(codePoint);
	}
	target.append("\\u");
	target.append(Utility.hex(codePoint, 4));
	return target;
	}
	}
	UTF16.append(target, codePoint);
	return target;
	}

	private StringBuffer appendQuoted(StringBuffer target, String string) {
	for (int i = 0; i < string.length(); ++i) {
	appendQuoted(target, string.charAt(i));
	// don't worry about surrogates; this works in Java
	// for other Targets we may have to fix.
	}
	return target;
	}

	@Override
	public final UnicodeSet parseObject(String pattern, ParsePosition pos) {
	return new UnicodeSet(pattern, pos, null);
	}

	// settings

	public Target getTarget() {
	return target;
	}

	public UnicodeSetFormat setTarget(Target target) {
	this.target = target;
	return this;
	}

	public int getOptions() {
	return options;
	}

	public UnicodeSetFormat setOptions(int options) {
	this.options = options;
	return this;
	}

	public Extension[] getExtensions() {
	return extensions;
	}

	public UnicodeSetFormat setExtensions(Extension... extensions) {
	this.extensions = extensions;
	return this;
	}

	public abstract class Extension {

	/**
	* Is called every time an unquoted $ is found. Should parse out variables as appropriate
	* and return how far we got, and the replacement string. Returns null if doesn't match a variable.
	*
	* @pos on input should be set to the position just before the dollar sign.
	* On output should be set to the end of the text to replace.
	*/
	public abstract String replaceVariable(String pattern, ParsePosition pos);

	/**
	* Resolves anything that looks like a property, eg: <br>
	* encountering \p{whitespace} or [:whitespace:] would call
	* getProperty("whitespace", "", false, result)<br>
	* while
	* \p{bidi_class=neutral} would call getProperty("bidi_class", "neutral",
	* false, result) and <br>
	* \p{name=/DOT/} would call
	* getProperty("bidi_class", "neutral", false, result) <br>
	* (for an example of the latter, see {@linkplain http
	* ://unicode.org/cldr/utility/list-unicodeset.jsp?a=\p name=/WITH%20DOT%20ABOVE/}
	*
	* @param regex
	* Set to true if the property value is a regex "find" expression. In that case,
	* the return value should be the set of Unicode characters that match the regex.
	*/
	public abstract boolean getProperty(String propertyName, String propertyValue, boolean regex, UnicodeSet result);

	}

	public String formatWithProperties(UnicodeSet original, boolean addOthers, UnicodeSet expandBlockIgnorables,
	int... properties) {
	UnicodeSet remainder = new UnicodeSet().addAll(original);
	Set<String> propSet = new TreeSet<String>();
	BitSet props = new BitSet();

	for (int i = 0; i < properties.length; ++i) {
	reduceByProperty(original, expandBlockIgnorables, properties[i], remainder, propSet);
	props.set(i);
	}
	if (addOthers) {
	for (int i = UProperty.INT_START; i < UProperty.INT_LIMIT; ++i) {
	if (props.get(i)) continue;
	reduceByProperty(original, expandBlockIgnorables, i, remainder, propSet);
	}
	}
	StringBuffer result = new StringBuffer("[ ");
	for (String prop : propSet) {
	result.append(prop).append(" ");
	}
	if (expandBlockIgnorables != null) {
	result.append("- ").append(expandBlockIgnorables.toPattern(true));
	}
	if (remainder.size() > 0) {
	result.append(" ").append(remainder.toPattern(true));
	}
	result.append("]");
	return result.toString();
	}

	static final int blockEnum = UCharacter.getPropertyEnum("block");

	private void reduceByProperty(UnicodeSet original, UnicodeSet expandBlockIgnorables, int property,
	UnicodeSet remainder, Set<String> result) {
	String propertyAlias = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT);
	UnicodeSet valueChars = new UnicodeSet();
	for (int i = UCharacter.getIntPropertyMinValue(property); i <= UCharacter.getIntPropertyMaxValue(property); ++i) {
	String valueAlias = UCharacter.getPropertyValueName(property, i, UProperty.NameChoice.SHORT);
	if (valueAlias == null) {
	valueAlias = UCharacter.getPropertyValueName(property, i, UProperty.NameChoice.LONG);
	}
	if (valueAlias == null) continue;

	valueChars.clear();
	valueChars.applyPropertyAlias(propertyAlias, valueAlias);
	if (remainder.containsSome(valueChars)) {
	if (original.containsAll(valueChars)) {
	result.add("[:" + propertyAlias + '=' + valueAlias + ":]");
	remainder.removeAll(valueChars);
	} else if (property == blockEnum && expandBlockIgnorables != null) {
	UnicodeSet hasScript = new UnicodeSet(valueChars).removeAll(expandBlockIgnorables);
	if (hasScript.size() > 5 && original.containsAll(hasScript)) {
	System.out.println("Broadening to block: " + valueAlias);
	result.add("[:" + propertyAlias + '=' + valueAlias + ":]");
	remainder.removeAll(valueChars);
	}
	}
	}
	}
	}

	// ===== PRIVATES =====
	private static final long serialVersionUID = 1L;
	private Target target;
	private int options;
	private Extension[] extensions;
	private static final UnicodeSet toQuote = (UnicodeSet) new UnicodeSet(
	"[[:Cn:][:Default_Ignorable_Code_Point:][:patternwhitespace:]]").freeze();

	}