blob: a3c4eb5d434afd024d24118589c2f604cecc47a1 [file] [log] [blame]
package org.unicode.cldr.draft;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.CanonicalIterator;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UTF16.StringComparator;
import com.ibm.icu.text.UnicodeSet;
public class ShortestCanonicalForm {
static final UnicodeSet trailing = new UnicodeSet();
static final UnicodeSet leading = new UnicodeSet();
static final UnicodeSet leading3 = new UnicodeSet();
static StringComparator cpCompare = new UTF16.StringComparator(true, false, 0);
static final Map<Integer, Set<String>> leadToTrail = new TreeMap<Integer, Set<String>>();
static UnicodeSet skip = new UnicodeSet(
"[[:hangulsyllabletype=l:][:hangulsyllabletype=v:][:hangulsyllabletype=t:]]");
static Map<String, String> restoreExclusions = new TreeMap<String, String>();
static UnicodeSet breakingTrail = new UnicodeSet();
static {
for (int i = 0; i <= 0x10FFFF; ++i) {
final String nfd = Normalizer.normalize(i, Normalizer.NFD);
if (skip.containsAll(nfd)) {
continue;
}
final String nfc = Normalizer.normalize(i, Normalizer.NFC);
if (nfc.codePointCount(0, nfc.length()) > 1) {
restoreExclusions.put(nfc, UTF16.valueOf(i));
int first = nfc.codePointAt(0);
final String trailingString = nfc.substring(first > 0xFFFF ? 2 : 1);
breakingTrail.addAll(trailingString);
}
int nfdLen = nfd.codePointCount(0, nfd.length());
if (nfdLen > 1) {
int first = nfd.codePointAt(0);
leading.add(first);
final String trailingString = nfd.substring(first > 0xFFFF ? 2 : 1);
trailing.addAll(trailingString);
Set<String> trails = leadToTrail.get(first);
if (trails == null) {
leadToTrail.put(first, trails = new TreeSet<String>(cpCompare));
}
trails.add(trailingString);
if (UScript.getScript(i) != UScript.HANGUL
&& trailingString.codePointCount(0, trailingString.length()) > 1) {
leading3.add(first);
}
}
}
leading.freeze();
trailing.freeze();
for (int first : leadToTrail.keySet()) {
if (leading3.contains(first)) {
for (String trail : leadToTrail.get(first)) {
System.out.println(Utility.hex(first, 4) + "," + Utility.hex(trail, 4, ",") + "\t"
+ UTF16.valueOf(first) + trail);
}
}
}
System.out.println("Breaking Trail:\t" + breakingTrail);
System.out.println("Lead-only: " + new UnicodeSet(leading).removeAll(trailing));
System.out.println("Trail-only: " + new UnicodeSet(trailing).removeAll(leading));
System.out.println("Lead and trail: " + new UnicodeSet(leading).retainAll(trailing));
UnicodeSet blockers = new UnicodeSet();
for (int lead : leadToTrail.keySet()) {
String leadStr = UTF16.valueOf(lead);
System.out.println("Testing: " + leadStr);
Set<String> trails = leadToTrail.get(lead);
for (String trail1 : trails) {
for (String trail2 : trails) {
if (trail1.length() >= trail2.length()) {
continue;
}
for (Iterator<String> it = new SubstringIterator(trail1); it.hasNext();) {
String sub = it.next();
String trial = leadStr + sub + trail2;
String nfc = shortNFC(trial);
String shortest = getShortest(trial, nfc);
if (!shortest.equals(nfc)) {
final int blocker = nfc.codePointAt(0);
if (!blockers.contains(blocker)) {
blockers.add(blocker);
System.out.println("Adding blocker: " + Utility.hex(blocker, 4) + "\t"
+ UTF16.valueOf(blocker));
System.out.println("\tNFC: " + Utility.hex(nfc) + "\t" + nfc);
System.out.println("\tShort: " + Utility.hex(shortest) + "\t" + shortest);
}
}
}
}
}
}
System.out.println("Blockers: " + blockers);
}
private static String shortNFC(String trial) {
String result = Normalizer.normalize(trial, Normalizer.NFC);
result = result.replace("\u0308\u0301", "\u0344");
for (String exclusion : restoreExclusions.keySet()) {
// just simple case
result = result.replace(exclusion, restoreExclusions.get(exclusion));
}
return result;
}
static class SubstringIterator implements Iterator<String> {
String string;
int start;
int end;
public SubstringIterator(String s) {
string = s;
start = 0;
end = UCharacter.charCount(string.codePointAt(start));
}
public boolean hasNext() {
return start < string.length();
}
public String next() {
String result = string.substring(start, end);
if (end < string.length()) {
end += UCharacter.charCount(string.codePointAt(end));
} else {
start += UCharacter.charCount(string.codePointAt(start));
if (start < string.length()) {
end = start + UCharacter.charCount(string.codePointAt(start));
}
}
return result;
}
public void remove() {
throw new UnsupportedOperationException();
}
// boolean insideCodePoint(String s, int offset) {
// if (offset == 0 || offset == s.length()) {
// return false;
// }
// char before = s.charAt(offset - 1);
// char after = s.charAt(offset);
// return 0xD800 <= before && before <= 0xDBFF && 0xDC00 <= after && after <= 0xDFFF;
// }
}
private static String getShortest(String trial, String nfc) {
String best = nfc;
int bestLength = nfc.codePointCount(0, nfc.length());
CanonicalIterator ci = new CanonicalIterator(trial);
for (String s = ci.next(); s != null; s = ci.next()) {
final int currentLength = s.codePointCount(0, s.length());
if (currentLength < bestLength) {
bestLength = currentLength;
best = s;
}
}
return best;
}
CanonicalIterator ci = new CanonicalIterator("");
public String normalize(String input) {
// optimize later
StringBuffer b = new StringBuffer();
boolean inProblem = false;
int start = 0;
final int len = input.length();
// find chunks of NFD !NFD !NFD... and process them
int cp;
for (int i = 0; i < len; i += Character.charCount(cp)) {
cp = input.codePointAt(i);
boolean isNFD = trailing.contains(cp);
if (isNFD) {
if (inProblem) {
// continue;
} else {// reached end of problem
b.append(process(input, start, i));
start = i;
}
} else { // not NFD
inProblem = true;
}
}
if (start < len) {
b.append(process(input, start, len));
}
return b.toString();
}
// of the form NFD? !NFD !NFD...
private String process(String input, int start, int end) {
final String piece = input.substring(start, end);
String nfc = shortNFC(piece);
final int nfcLength = nfc.codePointCount(0, nfc.length());
if (nfcLength == 1) {
return piece;
}
ci.setSource(piece);
String best = null;
int bestLength = Integer.MAX_VALUE;
for (String trial = ci.next(); trial != null; trial = ci.next()) {
int len = trial.codePointCount(0, trial.length());
if (len < bestLength) {
best = trial;
bestLength = len;
}
}
if (nfcLength <= bestLength) {
return nfc;
}
return best;
}
public static void main(String[] args) {
ShortestCanonicalForm scf = new ShortestCanonicalForm();
for (int i = 0; i <= 0x10FFFF; ++i) {
String s = UTF16.valueOf(i);
final String nfc = shortNFC(s);
String shortest = scf.normalize(nfc);
if (!shortest.equals(nfc)) {
System.out.println("NFC not shortest: " + shortest + ", " + nfc);
}
}
}
}