blob: 110a3404e0dab875ae463a8369bb48d60ec28a11 [file] [log] [blame]
/*
**********************************************************************
* Copyright (c) 2006-2007, Google and others. All Rights Reserved.
**********************************************************************
* Author: Mark Davis
**********************************************************************
*/
package org.unicode.cldr.util;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import org.unicode.cldr.util.CharUtilities.CharSourceWrapper;
import org.unicode.cldr.util.Dictionary.Matcher;
import org.unicode.cldr.util.Dictionary.Matcher.Filter;
import org.unicode.cldr.util.Dictionary.Matcher.Status;
import org.unicode.cldr.util.SimpleDictionary.SimpleDictionaryBuilder;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.DateFormat;
import com.ibm.icu.text.SimpleDateFormat;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.TimeZone;
import com.ibm.icu.util.ULocale;
/**
* Should be in the package usertest, but it's a pain to rename files in CVS.
*
* @author markdavis
*
* @param <T>
*/
public class TestStateDictionaryBuilder<T> {
private static final boolean SHORT_TEST = true;
private static final boolean SHOW_CONTENTS = true;
private static final boolean CHECK_BOOLEAN = false;
private final boolean SHOW_STATES = true;
boolean SIMPLE_ONLY = false;
boolean TEST_AGAINST_SIMPLE = true;
Dictionary<T> stateDictionary;
Dictionary.Matcher<T> stateMatcher;
Dictionary<T> simpleDictionary;
Dictionary.Matcher<T> simpleMatcher;
Map<CharSequence, T> baseMapping = new TreeMap<CharSequence, T>();
final StateDictionaryBuilder<T> stateDictionaryBuilder = new StateDictionaryBuilder<T>();
final SimpleDictionaryBuilder<T> simpleDictionaryBuilder = new SimpleDictionaryBuilder<T>();
// TODO: convert to TestFramework
public static void main(String[] args) {
try {
new TestStateDictionaryBuilder<String>().test(args);
} finally {
System.out.println("DONE");
}
}
@SuppressWarnings({ "unchecked" })
public void test(String[] args) {
for (String arg : args) {
if (arg.equalsIgnoreCase("utf8")) {
stateDictionaryBuilder.setByteConverter(new Utf8StringByteConverter());
} else if (arg.equalsIgnoreCase("normal")) {
stateDictionaryBuilder.setByteConverter(new CompactStringByteConverter(false));
} else if (arg.equalsIgnoreCase("compact")) {
stateDictionaryBuilder.setByteConverter(new CompactStringByteConverter(true));
}
}
baseMapping.put("GMT+0000", (T) ("t"));
baseMapping.put("GMT+0100", (T) ("t"));
baseMapping.put("GMT+0200", (T) ("t"));
baseMapping.put("GMT+0300", (T) ("t"));
baseMapping.put("GMT+0307", (T) ("t"));
showDictionaryContents();
addToBoth("man", 1);
addToBoth("manner", 100);
addToBoth("many", 10);
addToBoth("any", 83);
showDictionaryContents();
baseMapping.put("man", (T) "Woman");
baseMapping.put("many", (T) "Few");
baseMapping.put("any", (T) "All");
showDictionaryContents();
for (Filter filter : Filter.values()) {
final String string = "many manners ma";
tryFind(string, new CharSourceWrapper<CharSequence>(string), stateDictionary, filter);
}
showWords("ma");
showWords("ma!");
showWords("!ma");
showWords("man");
showWords("man!");
showWords("mann");
showWords("mann!");
showWords("many");
showWords("many!");
compare();
addToBoth("m\u03B1nner", 1000);
showDictionaryContents();
showWords("m\u03B1");
compare();
// if (true) return;
// clear out
addToBoth("fish", 10);
showDictionaryContents();
showWords("a fisherman");
compare();
addToBoth("fisher", 13);
showDictionaryContents();
showWords("a fisherman");
compare();
addToBoth("her", 55);
showDictionaryContents();
showWords("a fisherman");
compare();
// clear out
// check some non-latin
String[] zoneIDs = TimeZone.getAvailableIDs();
SimpleDateFormat dt = (SimpleDateFormat) DateFormat.getDateInstance(DateFormat.LONG, new ULocale("hi"));
dt.applyPattern("vvvv");
for (String zoneID : zoneIDs) {
TimeZone zone = TimeZone.getTimeZone(zoneID);
dt.setTimeZone(zone);
String zoneName = dt.format(0);
addToBoth(zoneName, (T) (CHECK_BOOLEAN ? "t" : zoneID));
}
compare();
showDictionaryContents();
((StateDictionary<T>) stateDictionary).flatten();
if (SIMPLE_ONLY) {
testWithUnicodeNames();
((StateDictionary<T>) stateDictionary).flatten();
compare();
System.out.println();
showDictionaryContents();
}
}
static public <U> void tryFind(CharSequence originalText, CharSource charListText, Dictionary<U> dictionary,
Filter filter) {
System.out.println("Using dictionary: "
+ Dictionary.load(dictionary.getMapping(), new TreeMap<CharSequence, U>()));
System.out.println("Searching in: {" + originalText + "} with filter=" + filter);
// Dictionaries are immutable, so we create a Matcher to search/test text.
Matcher<U> matcher = dictionary.getMatcher();
matcher.setText(charListText);
while (true) {
Status status = matcher.find(filter);
String unique = ""; // only set if needed
if (status == Status.NONE) {
break;
} else if (status == Status.PARTIAL) {
// sets the match value to the "first" partial match
if (matcher.nextUniquePartial()) {
unique = "\tUnique";
} else {
unique = "\tNot Unique";
}
}
// Show results
System.out.println("{"
+ showBoth(charListText, 0, matcher.getOffset()) + "[["
+ showBoth(charListText, matcher.getOffset(), matcher.getMatchEnd())
+ "]]" + showBoth(charListText, matcher.getMatchEnd(), charListText.getKnownLength())
+ "}\t" + status + " \t{" + matcher.getMatchValue() + "}\t" + unique);
}
System.out.println();
}
static public CharSequence showBoth(CharSource source, int start, int end) {
if (source instanceof CharSourceWrapper) {
CharSourceWrapper new_name = (CharSourceWrapper) source;
return new_name.sourceSubSequence(start, end);
}
return source.subSequence(start, end);
}
private void showDictionaryContents() {
// build stuff to use from now on
simpleDictionary = simpleDictionaryBuilder.make(baseMapping);
simpleMatcher = simpleDictionary.getMatcher();
stateDictionary = stateDictionaryBuilder.make(baseMapping);
stateMatcher = stateDictionary.getMatcher();
baseMapping.clear();
// ((Dictionary.Builder) simpleDictionary).addMapping(string, i);
// ((Dictionary.Builder) stateDictionary).addMapping(string, i);
System.out.println("Dictionary: "
+ Dictionary.load(stateDictionary.getMapping(), new TreeMap<CharSequence, T>()));
System.out.println();
if (SHOW_STATES) {
System.out.println("States:" + CldrUtility.LINE_SEPARATOR + stateDictionary);
System.out.println();
}
if (SHOW_CONTENTS) {
System.out.println("Structure:" + CldrUtility.LINE_SEPARATOR + stateDictionary.debugShow());
System.out.println();
}
}
@SuppressWarnings("unchecked")
private void testWithUnicodeNames() {
UnicodeSet testSet = new UnicodeSet(
"[[:assigned:] - [:ideographic:] - [:Co:] - [:Cs:]]"); // &
// [\\u0000-\\u0FFF]
int count = 0;
Map<String, T> data = new TreeMap<String, T>();
for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) {
String name = UCharacter.getExtendedName(it.codepoint);
if (name == null) {
continue;
}
if ((++count & 0xFF) == 0) {
System.out.println(count + ":\t"
+ com.ibm.icu.impl.Utility.hex(it.codepoint) + "\t" + name);
}
data.put(name, (T) com.ibm.icu.impl.Utility.hex(it.codepoint, 4));
}
count = 0;
for (String item : data.keySet()) {
if (SHORT_TEST && count++ > 500) continue; //
addToBoth(item, data.get(item));
}
simpleDictionary = simpleDictionaryBuilder.make(baseMapping);
stateDictionary = stateDictionaryBuilder.make(baseMapping);
baseMapping.clear();
compare();
}
private void compare() {
System.out.println("Comparing results: ");
Map<CharSequence, T> dictionaryData = Dictionary.load(stateDictionary.getMapping(),
new HashMap<CharSequence, T>());
Map<CharSequence, T> simpleDictionaryData = Dictionary.load(simpleDictionary.getMapping(),
new HashMap<CharSequence, T>());
assert dictionaryData.equals(simpleDictionaryData) : showDifference(dictionaryData, simpleDictionaryData);
if (SHOW_STATES) {
System.out.println("Size: " + dictionaryData.size());
System.out.println("Rows: "
+ ((StateDictionary<T>) stateDictionary).getRowCount());
}
System.out.println("Checking values: state dictionary");
checkSimpleMatches(stateMatcher, dictionaryData);
System.out.println("Checking values: simple dictionary");
checkSimpleMatches(simpleMatcher, simpleDictionaryData);
int count = 0;
System.out.println("Cross-checking all values");
for (CharSequence myText : simpleDictionaryData.keySet()) {
if ((++count & 0xFF) == 0xFF) {
System.out.println(count + ":\t" + myText);
}
crossCheck(new CharSourceWrapper<CharSequence>(myText));
crossCheck("!" + myText);
crossCheck(myText + "!");
}
}
private String showDifference(Map<CharSequence, T> dictionaryData, Map<CharSequence, T> simpleDictionaryData) {
System.out.println(dictionaryData.size() + ", " + simpleDictionaryData.size());
Iterator<Entry<CharSequence, T>> it1 = dictionaryData.entrySet().iterator();
Iterator<Entry<CharSequence, T>> it2 = simpleDictionaryData.entrySet().iterator();
while (it1.hasNext() || it2.hasNext()) {
Entry<CharSequence, T> item1 = it1.hasNext() ? it1.next() : null;
Entry<CharSequence, T> item2 = it2.hasNext() ? it2.next() : null;
System.out.println(item1 + ", " + item2);
if (item1 == null || item2 == null || !item1.equals(item2)) {
return item1 + "!=" + item2;
}
}
return "no difference";
}
private void crossCheck(CharSequence myText) {
crossCheck(new CharSourceWrapper<CharSequence>(myText));
}
private void crossCheck(CharSource myText) {
stateMatcher.setText(myText); // set the text to operate on
simpleMatcher.setText(myText); // set the text to operate on
for (int i = 0; stateMatcher.getText().hasCharAt(i); ++i) {
stateMatcher.setOffset(i);
simpleMatcher.setOffset(i);
while (true) {
Status stateStatus = stateMatcher.next();
Status simpleStatus = simpleMatcher.next();
assert stateStatus == simpleStatus : showValues(stateStatus, simpleStatus);
final int stateEnd = stateMatcher.getMatchEnd();
final int simpleEnd = simpleMatcher.getMatchEnd();
assert stateEnd == simpleEnd : showValues(stateStatus, simpleStatus);
if (stateStatus == Status.PARTIAL) {
boolean stateUnique = stateMatcher.nextUniquePartial();
boolean simpleUnique = simpleMatcher.nextUniquePartial();
assert stateUnique == simpleUnique : showValues(stateStatus, simpleStatus);
}
// test this after checking PARTIAL
assert stateMatcher.getMatchValue() == simpleMatcher.getMatchValue() : showValues(stateStatus,
simpleStatus);
if (stateStatus != Status.MATCH) {
break;
}
}
}
}
private String showValues(Status stateStatus, Status simpleStatus) {
return CldrUtility.LINE_SEPARATOR + "TEXT:\t" + stateMatcher.text + CldrUtility.LINE_SEPARATOR + "STATE:\t"
+ showValues(stateStatus, stateMatcher) + CldrUtility.LINE_SEPARATOR + "SIMPLE:\t"
+ showValues(simpleStatus, simpleMatcher);
}
private String showValues(Status status, Matcher<T> matcher) {
boolean uniquePartial = status == Status.PARTIAL && matcher.nextUniquePartial(); // sets matchValue for PARTIAL
return String.format("\tOffsets: %s,%s\tStatus: %s\tString: \"%s\"\tValue: %s %s",
matcher.getOffset(),
matcher.getMatchEnd(),
status,
matcher.getMatchText(),
matcher.getMatchValue(),
status == Status.PARTIAL && uniquePartial ? "\tUNIQUE" : "");
}
/**
* Check that the words all match against themselves.
*
* @param matcher
* @param data
*/
private void checkSimpleMatches(Matcher<T> matcher, Map<CharSequence, T> data) {
int count = 0;
for (CharSequence myText : data.keySet()) {
if ((count++ & 0xFF) == 0xFF) {
System.out.println(count + ":\t" + myText);
}
matcher.setText(myText); // set the text to operate on
matcher.setOffset(0);
int matchEnd = -1;
T matchValue = null;
// find the longest match
while (true) {
Dictionary.Matcher.Status next1 = matcher.next();
if (next1 == Dictionary.Matcher.Status.MATCH) {
matchEnd = matcher.getMatchEnd();
matchValue = matcher.getMatchValue();
} else {
break;
}
}
assert matchEnd == myText.length() : "failed to find end of <" + myText + "> got instead " + matchEnd;
assert matchValue == data.get(myText);
}
}
@SuppressWarnings("unchecked")
private void addToBoth(CharSequence string, int i) {
baseMapping.put(string, (T) (i + "/" + string));
}
private void addToBoth(CharSequence string, T i) {
baseMapping.put(string, i);
// if (simpleDictionary.contains(string)) return;
// if (!stateDictionary.contains(string)) {
// stateDictionary.contains(string);
// }
// assert stateDictionary.contains(string);
}
private void showWords(String myText) {
System.out.format("Finding words in: \"%s\"" + CldrUtility.LINE_SEPARATOR, myText);
if (SIMPLE_ONLY) {
showWords("", simpleMatcher, myText);
} else {
Set<String> simpleResult = showWords("Simple", simpleMatcher, myText);
Set<String> stateResult = showWords("STATE", stateMatcher, myText);
if (!simpleResult.equals(stateResult)) {
// repeat, for debugging
System.out.println(" DIFFERENCE");
showWords("Simple", simpleMatcher, myText);
showWords("STATE", stateMatcher, myText);
Set<String> simpleMinusState = new LinkedHashSet<String>(simpleResult);
simpleMinusState.removeAll(stateResult);
System.out.println("Simple-State" + simpleMinusState);
Set<String> stateMinusSimple = new LinkedHashSet<String>(stateResult);
stateMinusSimple.removeAll(simpleResult);
System.out.println("State-Simple" + stateMinusSimple);
}
}
}
private Set<String> showWords(String title, Matcher<T> matcher, String myText) {
title = title.equals("") ? "" : "\tType: " + title;
// Walk through a strings and gather information about what we find
// according to the matcher
Set<String> result = new LinkedHashSet<String>();
// Set the text to operate on
matcher.setText(myText);
boolean uniquePartial = false;
for (int i = 0; matcher.hasCharAt(i); ++i) {
matcher.setOffset(i);
Status status;
// We might get multiple matches at each point, so walk through all of
// them. The last one might be a partial, so collect some extra
// information in that case.
do {
// Sets matchValue if there is a MATCH
status = matcher.next();
if (status == Status.PARTIAL) {
// Sets matchValue if the next() status was PARTIAL
uniquePartial = matcher.nextUniquePartial();
}
// Format all of the information
String info = String.format(
"\tOffsets: %s,%s\tStatus: %s\tString: \"%s\"\tValue: %s%s", //
matcher.getOffset(), matcher.getMatchEnd(), status, //
matcher.getMatchText(), matcher.getMatchValue(), //
status == Status.PARTIAL && uniquePartial ? "\tUNIQUE" : "");
result.add(info);
if (status != Status.NONE) {
// If there was a match or partial match, show what we got
System.out.println(title + info);
}
} while (status == Status.MATCH);
}
return result;
}
}