// blob: db429ce5e5f93d9e83225644c1cd1fa05aee81c5 [file] [log] [blame]
package build.tools.generatecharacter;
import java.util.regex.*;
import java.util.*;
import java.io.*;
/**
 * Build-time tool that reads a Unicode script data file (lines of the form
 * {@code XXXX[..YYYY] ; ScriptName # comment}) and prints, on standard output,
 * Java source for a {@code UnicodeScript} enum together with its
 * {@code scriptStarts} / {@code scripts} lookup tables.
 *
 * <p>Before the generated source, the tool also prints one line per code point
 * ({@code code-point SCRIPT}) for every range read from the input.
 *
 * <p>Each range is held as an {@code int[3]}: {@code [0]} = first code point,
 * {@code [1]} = last code point (inclusive), {@code [2]} = script index, or
 * {@link #UNKNOWN} for a gap that belongs to no script.
 */
public class CharacterScript {

    /** Script index stored in gap ranges that belong to no script. */
    private static final int UNKNOWN = -1;

    /**
     * Matches one data line: a hex code point, an optional {@code ..}-separated
     * end code point, the script name, and a trailing {@code #} comment.
     */
    private static final Pattern RANGE_LINE = Pattern.compile(
        "(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s+;\\s+(\\w+)\\s+#.*");

    // generate the code needed for j.l.C.UnicodeScript

    /** Test-output hook; disabled (uncomment the body to enable). */
    static void fortest(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /** Writes formatted text to standard output. */
    static void print(String fmt, Object... o) {
        System.out.printf(fmt, o);
    }

    /** Debug-output hook; disabled (uncomment the body to enable). */
    static void debug(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /**
     * Entry point.
     *
     * @param args exactly one element: the path of the script data file.
     *             Any other argument count prints a usage line and exits
     *             with status 1.
     */
    public static void main(String args[]) {
        try {
            if (args.length != 1) {
                System.out.println("java CharacterScript script.txt out");
                System.exit(1);
            }

            Map<String,Integer> scriptIds = new HashMap<String,Integer>();
            List<int[]> ranges = readRanges(args[0], scriptIds);
            if (ranges.isEmpty()) {
                // Nothing to generate; say so instead of failing later on
                // an empty table.
                System.err.printf("CharacterScript: no script ranges found in <%s>%n",
                                  args[0]);
                return;
            }
            debug("-----------------%n");
            debug("Total scripts=%s%n", scriptIds.size());
            debug("-----------------%n%n");

            // Invert the name -> index map into an index -> name table.
            String[] names = new String[scriptIds.size()];
            for (Map.Entry<String,Integer> e : scriptIds.entrySet()) {
                names[e.getValue().intValue()] = e.getKey();
            }
            // Enum-constant spelling of every script name, computed once.
            String[] constants = new String[names.length];
            for (int i = 0; i < names.length; i++) {
                constants[i] = names[i].toUpperCase(Locale.ROOT);
            }

            // Per-code-point listing, in input order.
            for (int[] r : ranges) {
                for (int cp = r[0]; cp <= r[1]; cp++) {
                    print("%05X %s%n", cp, constants[r[2]]);
                }
            }

            // Order ranges by their first code point (stable sort).
            Collections.sort(ranges, new Comparator<int[]>() {
                public int compare(int[] a1, int[] a2) {
                    return a1[0] < a2[0] ? -1 : (a1[0] == a2[0] ? 0 : 1);
                }
            });

            List<int[]> table = consolidate(ranges);
            for (int[] a : table) {
                debug("0x%05x, 0x%05x %s%n", a[0], a[1], constantName(a, constants));
            }
            debug("--->total=%d%n", table.size());

            emit(table, names, constants);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the data file and returns its ranges in file order, merging
     * consecutive lines that name the same script and are contiguous.
     * Script indices are assigned in the order scripts are first recorded
     * and stored into {@code scriptIds}.
     *
     * @param path      file to read
     * @param scriptIds receives script name -> index
     * @return the ranges read; empty when the file has no data line
     * @throws IOException if the file cannot be read
     */
    private static List<int[]> readRanges(String path, Map<String,Integer> scriptIds)
        throws IOException
    {
        List<int[]> ranges = new ArrayList<int[]>();
        Matcher m = RANGE_LINE.matcher("");
        int prevS = -1;
        int prevE = -1;
        String prevN = null;          // null until the first data line is seen

        BufferedReader in = new BufferedReader(new FileReader(path));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.length() <= 1 || line.charAt(0) == '#') {
                    continue;
                }
                m.reset(line);
                if (!m.matches()) {
                    debug("Warning: Unrecognized line <%s>%n", line);
                    continue;
                }
                int start = Integer.parseInt(m.group(1), 16);
                int end = (m.group(2) == null) ? start
                                               : Integer.parseInt(m.group(2), 16);
                String name = m.group(3);
                if (name.equals(prevN) && start == prevE + 1) {
                    // Same script, directly adjacent: extend the pending range.
                    prevE = end;
                } else {
                    if (prevN != null) {
                        addRange(ranges, scriptIds, prevS, prevE, prevN);
                    }
                    debug("%x-%x\t%s%n", prevS, prevE, prevN);
                    prevS = start;
                    prevE = end;
                    prevN = name;
                }
            }
        } finally {
            in.close();
        }
        // Flush the range still pending when the file ends.
        if (prevN != null) {
            addRange(ranges, scriptIds, prevS, prevE, prevN);
            debug("%x-%x\t%s%n", prevS, prevE, prevN);
        }
        return ranges;
    }

    /** Appends one range, assigning the script the next free index if it is new. */
    private static void addRange(List<int[]> ranges, Map<String,Integer> scriptIds,
                                 int start, int end, String name)
    {
        Integer id = scriptIds.get(name);
        if (id == null) {
            id = Integer.valueOf(scriptIds.size());
            scriptIds.put(name, id);
        }
        ranges.add(new int[] { start, end, id.intValue() });
    }

    /**
     * Consolidation: there are lots of "reserved" code points embedded in
     * otherwise "sequential" blocks. To make the lookup table smaller, those
     * separated segments are combined, on the assumption that the lookup
     * implementation checks {@code Character.getType() != Character.UNASSIGNED}
     * first (returning UNKNOWN for unassigned).
     *
     * <p>For each gap between two sorted ranges: if the gap contains an assigned
     * code point it becomes its own {@link #UNKNOWN} range; otherwise the
     * previous range absorbs the gap (and the next range too, when both have
     * the same script). The input arrays are modified in place.
     *
     * @param sorted non-empty list of ranges ordered by first code point
     * @return the consolidated table
     */
    private static List<int[]> consolidate(List<int[]> sorted) {
        List<int[]> table = new ArrayList<int[]>();
        int[] last = sorted.get(0);
        table.add(last);
        for (int i = 1; i < sorted.size(); i++) {
            int[] cur = sorted.get(i);
            if (cur[0] != last[1] + 1) {
                if (hasAssigned(last[1] + 1, cur[0])) {
                    // surrogates only?
                    table.add(new int[] { last[1] + 1, cur[0] - 1, UNKNOWN });
                } else if (last[2] == cur[2]) {
                    // combine
                    last[1] = cur[1];
                    continue;
                } else {
                    // expand last
                    last[1] = cur[0] - 1;
                }
            }
            table.add(cur);
            last = cur;
        }
        return table;
    }

    /** Returns true if any code point in {@code [from, to)} is not UNASSIGNED. */
    private static boolean hasAssigned(int from, int to) {
        for (int cp = from; cp < to; cp++) {
            if (Character.getType(cp) != Character.UNASSIGNED) {
                debug("Warning: [%x] is ASSIGNED but in NON script%n", cp);
                return true;
            }
        }
        return false;
    }

    /** Enum-constant name for a range: its script's constant, or "UNKNOWN" for a gap. */
    private static String constantName(int[] range, String[] constants) {
        return (range[2] == UNKNOWN) ? "UNKNOWN" : constants[range[2]];
    }

    /**
     * Prints the generated {@code Scripts} class: the enum constants, then the
     * {@code scriptStarts} and {@code scripts} tables. When the last range stops
     * short of {@code Character.MAX_CODE_POINT}, a closing UNKNOWN entry is added
     * to both tables.
     */
    private static void emit(List<int[]> table, String[] names, String[] constants) {
        //////////////////OUTPUT//////////////////////////////////
        print("public class Scripts {%n%n");
        print(" public static enum UnicodeScript {%n");
        for (int i = 0; i < names.length; i++) {
            print(" /**%n * Unicode script \"%s\".%n */%n", names[i]);
            print(" %s,%n%n", constants[i]);
        }
        print(" /**%n * Unicode script \"Unknown\".%n */%n UNKNOWN;%n%n");

        int[] last = table.get(table.size() - 1);
        boolean needsTail = last[1] != Character.MAX_CODE_POINT;

        // lookup table
        print(" private static final int[] scriptStarts = {%n");
        for (int[] a : table) {
            String name = constantName(a, constants);
            if (a[0] < 0x10000) {
                print(" 0x%04X, // %04X..%04X; %s%n", a[0], a[0], a[1], name);
            } else {
                print(" 0x%05X, // %05X..%05X; %s%n", a[0], a[0], a[1], name);
            }
        }
        if (needsTail) {
            print(" 0x%05X // %05X..%06X; %s%n",
                  last[1] + 1, last[1] + 1, Character.MAX_CODE_POINT,
                  "UNKNOWN");
        }
        print("%n };%n%n");

        print(" private static final UnicodeScript[] scripts = {%n");
        for (int[] a : table) {
            print(" %s,%n", constantName(a, constants));
        }
        if (needsTail) {
            print(" UNKNOWN%n");
        }
        print(" };%n");
        print(" }%n");
        print("}%n");
    }
}