blob: 400176253a12d07a97d47009f38943df1dfda9aa [file] [log] [blame]
/*
* Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* This is a tool to generate categoryNames and categoryMap which are used in
* CharSet.java.
*/
package build.tools.generatebreakiteratordata;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.StringTokenizer;
class CharacterCategory {
/**
* A list of Unicode category names.
*/
static final String[] categoryNames = {
"Ll", /* Letter, Lowercase */
"Lu", /* Letter, Uppercase */
"Lt", /* Letter, Titlecase */
"Lo", /* Letter, Other */
"Lm", /* Letter, Modifier */
"Nd", /* Number, Decimal Digit */
"Nl", /* Number, Letter */
"No", /* Number, Other */
"Ps", /* Punctuation, Open */
"Pe", /* Punctuation, Close */
"Pi", /* Punctuation, Initial quote */
"Pf", /* Punctuation, Final quote */
"Pd", /* Punctuation, Dash */
"Pc", /* Punctuation, Connector */
"Po", /* Punctuation, Other */
"Sc", /* Symbol, Currency */
"Sm", /* Symbol, Math */
"So", /* Symbol, Other */
"Mn", /* Mark, Non-Spacing */
"Mc", /* Mark, Spacing Combining */
"Me", /* Mark, Enclosing */
"Zl", /* Separator, Line */
"Zp", /* Separator, Paragraph */
"Zs", /* Separator, Space */
"Cc", /* Other, Control */
"Cf", /* Other, Format */
"--", /* Dummy, ignored */
// Don't add anything after the Dummy entry!!
};
/**
* A array of Unicode code points for each category.
*/
private static int[][] categoryMap;
/**
* Generates CategoryMap for GenerateBreakIteratorData.
*/
static void makeCategoryMap(String filename) {
/* Overwrite specfile name */
specfile = filename;
/* Generate data in current format (1.5.0) */
generateNewData();
/* Copy generated data to cateogyMap */
categoryMap = new int[categoryNames.length-1][];
for (int i = 0; i < categoryNames.length-1; i++) {
int len = newListCount[BMP][i] + newListCount[nonBMP][i];
categoryMap[i] = new int[len];
System.arraycopy(newList[i], 0, categoryMap[i], 0, len);
}
}
/**
* Returns categoryMap for the given category.
*/
static int[] getCategoryMap(int category) {
return categoryMap[category];
}
/**
* Only used for debugging and generating a test program.
*/
public static void main(String[] args) {
/* Parses command-line options */
processArgs(args);
/* Generates data in current format (1.5.0) */
generateNewData();
/*
* Generates data in older format (1.4.X and earlier) and creates
* the old CategoryMap if "oldFilename" is not null.
*/
if (!oldDatafile.equals("")) {
generateOldData();
generateOldDatafile();
}
/* Displays summary of generated data */
showSummary();
/*
* Generates a test program which compares the new data and the return
* values of Character.getType().
* and the old data and the new data.
*/
generateTestProgram();
}
/**
* Spec (Unicode data file)
*/
private static String specfile = "UnicodeData.txt";
/**
* Output directory
*/
private static String outputDir = "";
/**
* Old data filename
*/
private static String oldDatafile = "";
/**
* Parses the specified arguments and sets up the variables.
*/
private static void processArgs(String[] args) {
for (int i = 0; i < args.length; i++) {
String arg =args[i];
if (arg.equals("-spec")) {
specfile = args[++i];
} else if (arg.equals("-old")) {
oldDatafile = args[++i];
} else if (arg.equals("-o")) {
outputDir = args[++i];
} else {
System.err.println("Usage: java CharacterCategory [-spec specfile]");
System.exit(1);
}
}
}
/**
* Displays summary of generated data
*/
private static void showSummary() {
int oldSum = 0;
int newSum = 0;
int oldSuppSum = 0;
int newSuppSum = 0;
for (int i = 0; i < categoryNames.length-1; i++) {
int newNum = newListCount[BMP][i] + newListCount[nonBMP][i];
if (oldTotalCount[i] != newNum) {
System.err.println("Error: The number of generated data is different between the new approach and the old approach.");
}
if (oldListCount[SURROGATE][i] != newListCount[nonBMP][i]) {
System.err.println("Error: The number of generated supplementarycharacters is different between the new approach and the old approach.");
}
System.out.println(" " + categoryNames[i] + ": " +
oldTotalCount[i] +
"(" + oldListCount[BEFORE][i] +
" + " + oldListCount[SURROGATE][i] +
" + " + oldListCount[AFTER][i] + ")" +
" --- " + newNum +
"(" + newListCount[BMP][i] +
" + " + newListCount[nonBMP][i] + ")");
oldSum += oldListCount[BEFORE][i] * 2 +
oldListCount[SURROGATE][i] * 4 +
oldListCount[AFTER][i] * 2;
newSum += newNum * 4 ;
oldSuppSum += oldListCount[SURROGATE][i] * 4;
newSuppSum += newListCount[nonBMP][i] * 4;
}
System.out.println("\nTotal buffer sizes are:\n " +
oldSum + "bytes(Including " + oldSuppSum +
"bytes for supplementary characters)\n " +
newSum + "bytes(Including " + newSuppSum +
"bytes for supplementary characters)");
if (!ignoredOld.toString().equals(ignoredNew.toString())) {
System.err.println("Ignored categories: Error: List mismatch: " +
ignoredOld + " vs. " + ignoredNew);
} else {
System.out.println("\nIgnored categories: " + ignoredOld);
System.out.println("Please confirm that they aren't used in BreakIteratorRules.");
}
}
private static final int HighSurrogate_CodeUnit_Start = 0xD800;
private static final int LowSurrogate_CodeUnit_Start = 0xDC00;
private static final int Supplementary_CodePoint_Start = 0x10000;
private static StringBuffer ignoredOld = new StringBuffer();
private static int[] oldTotalCount = new int[categoryNames.length];
private static int[][] oldListCount = new int[3][categoryNames.length];
private static int[][] oldListLen = new int[3][categoryNames.length];
private static StringBuffer[][] oldList = new StringBuffer[3][categoryNames.length];
private static final int BEFORE = 0;
private static final int SURROGATE = 1;
private static final int AFTER = 2;
/**
* Makes CategoryMap in ordler format which had been used by JDK 1.4.X and
* earlier versions.
*/
private static void generateOldData() {
/* Initialize arrays. */
for (int i = 0; i<categoryNames.length; i++) {
for (int j = BEFORE; j <= AFTER; j++) {
oldListCount[j][i] = 0;
oldList[j][i] = new StringBuffer();
oldListLen[j][i] = 17;
}
}
storeOldData();
if (oldTotalCount[categoryNames.length-1] != 1) {
System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
System.exit(1);
}
}
private static void storeOldData() {
try {
FileReader fin = new FileReader(specfile);
BufferedReader bin = new BufferedReader(fin);
String prevCode = "????";
String line;
int prevIndex = categoryNames.length - 1;
int prevCodeValue = -1;
int curCodeValue = 0;
boolean setFirst = false;
while ((line = bin.readLine()) != null) {
if (line.length() == 0) {
continue;
}
StringTokenizer st = new StringTokenizer(line, ";");
String code = st.nextToken();
char c = code.charAt(0);
if (c == '#' || c == '/') {
continue;
}
int i = Integer.valueOf(code, 16).intValue();
String characterName = st.nextToken();
String category = st.nextToken();
int index;
for (index = 0; index < categoryNames.length; index++) {
if (category.equals(categoryNames[index])) {
break;
}
}
if (index != categoryNames.length) {
curCodeValue = Integer.parseInt(code, 16);
if (prevIndex != index) {
appendOldChar(prevIndex, prevCodeValue, prevCode);
appendOldChar(index, curCodeValue, code);
prevIndex = index;
} else if (prevCodeValue != curCodeValue - 1) {
if (setFirst && characterName.endsWith(" Last>")) {
setFirst = false;
} else {
appendOldChar(prevIndex, prevCodeValue, prevCode);
appendOldChar(index, curCodeValue, code);
}
}
prevCodeValue = curCodeValue;
prevCode = code;
if (characterName.endsWith(" First>")) {
setFirst = true;
}
} else {
if (ignoredOld.indexOf(category) == -1) {
ignoredOld.append(category);
ignoredOld.append(' ');
}
}
}
appendOldChar(prevIndex, prevCodeValue, prevCode);
bin.close();
fin.close();
}
catch (Exception e) {
throw new InternalError(e.toString());
}
}
private static void appendOldChar(int index, int code, String s) {
int range;
if (code < HighSurrogate_CodeUnit_Start) {
range = BEFORE;
} else if (code < Supplementary_CodePoint_Start) {
range = AFTER;
} else {
range = SURROGATE;
}
if (oldListLen[range][index] > 64) {
oldList[range][index].append("\"\n + \"");
oldListLen[range][index] = 19;
}
if (code == 0x22 || code == 0x5c) {
oldList[range][index].append('\\');
oldList[range][index].append((char)code);
oldListLen[range][index] += 2;
} else if (code > 0x20 && code < 0x7F) {
oldList[range][index].append((char)code);
oldListLen[range][index] ++;
} else {
if (range == SURROGATE) {// Need to convert code point to code unit
oldList[range][index].append(toCodeUnit(code));
oldListLen[range][index] += 12;
} else {
oldList[range][index].append("\\u");
oldList[range][index].append(s);
oldListLen[range][index] += 6;
}
}
oldListCount[range][index] ++;
oldTotalCount[index]++;
}
private static String toCodeUnit(int i) {
StringBuffer sb = new StringBuffer();
sb.append("\\u");
sb.append(Integer.toString((i - Supplementary_CodePoint_Start) / 0x400 + HighSurrogate_CodeUnit_Start, 16).toUpperCase());
sb.append("\\u");
sb.append(Integer.toString(i % 0x400 + LowSurrogate_CodeUnit_Start, 16).toUpperCase());
return sb.toString();
}
private static int toCodePoint(String s) {
char c1 = s.charAt(0);
if (s.length() == 1 || !Character.isHighSurrogate(c1)) {
return (int)c1;
} else {
char c2 = s.charAt(1);
if (s.length() != 2 || !Character.isLowSurrogate(c2)) {
return -1;
}
return Character.toCodePoint(c1, c2);
}
}
private static StringBuffer ignoredNew = new StringBuffer();
private static int[] newTotalCount = new int[categoryNames.length];
private static int[][] newListCount = new int[2][categoryNames.length];
private static int[][] newList = new int[categoryNames.length][];
private static final int BMP = 0;
private static final int nonBMP = 1;
/**
* Makes CategoryMap in newer format which is used by JDK 1.5.0.
*/
private static void generateNewData() {
/* Initialize arrays. */
for (int i = 0; i<categoryNames.length; i++) {
newList[i] = new int[10];
}
storeNewData();
if (newListCount[BMP][categoryNames.length-1] != 1) {
System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
System.exit(1);
}
}
private static void storeNewData() {
try {
FileReader fin = new FileReader(specfile);
BufferedReader bin = new BufferedReader(fin);
String line;
int prevIndex = categoryNames.length - 1;
int prevCodeValue = -1;
int curCodeValue = 0;
boolean setFirst = false;
while ((line = bin.readLine()) != null) {
if (line.length() == 0) {
continue;
}
StringTokenizer st = new StringTokenizer(line, ";");
String code = st.nextToken();
char c = code.charAt(0);
if (c == '#' || c == '/') {
continue;
}
int i = Integer.valueOf(code, 16).intValue();
String characterName = st.nextToken();
String category = st.nextToken();
int index;
for (index = 0; index < categoryNames.length; index++) {
if (category.equals(categoryNames[index])) {
break;
}
}
if (index != categoryNames.length) {
curCodeValue = Integer.parseInt(code, 16);
if (prevIndex == index) {
if (setFirst) {
if (characterName.endsWith(" Last>")) {
setFirst = false;
} else {
System.err.println("*** Error 1 at " + code);
}
} else {
if (characterName.endsWith(" First>")) {
setFirst = true;
} else if (characterName.endsWith(" Last>")) {
System.err.println("*** Error 2 at " + code);
} else {
if (prevCodeValue != curCodeValue - 1) {
appendNewChar(prevIndex, prevCodeValue);
appendNewChar(index, curCodeValue);
}
}
}
} else {
if (setFirst) {
System.err.println("*** Error 3 at " + code);
} else if (characterName.endsWith(" First>")) {
setFirst = true;
} else if (characterName.endsWith(" Last>")) {
System.err.println("*** Error 4 at " + code);
}
appendNewChar(prevIndex, prevCodeValue);
appendNewChar(index, curCodeValue);
prevIndex = index;
}
prevCodeValue = curCodeValue;
} else {
if (ignoredNew.indexOf(category) == -1) {
ignoredNew.append(category);
ignoredNew.append(' ');
}
}
}
appendNewChar(prevIndex, prevCodeValue);
bin.close();
fin.close();
}
catch (Exception e) {
System.err.println("Error occurred on accessing " + specfile);
e.printStackTrace();
System.exit(1);
}
}
private static void appendNewChar(int index, int code) {
int bufLen = newList[index].length;
if (newTotalCount[index] == bufLen) {
int[] tmpBuf = new int[bufLen + 10];
System.arraycopy(newList[index], 0, tmpBuf, 0, bufLen);
newList[index] = tmpBuf;
}
newList[index][newTotalCount[index]++] = code;
if (code < 0x10000) {
newListCount[BMP][index]++;
} else {
newListCount[nonBMP][index]++;
}
}
/* Generates the old CategoryMap. */
private static void generateOldDatafile() {
try {
FileWriter fout = new FileWriter(oldDatafile);
BufferedWriter bout = new BufferedWriter(fout);
bout.write("\n //\n // The following String[][] can be used in CharSet.java as is.\n //\n\n private static final String[][] categoryMap = {\n");
for (int i = 0; i < categoryNames.length - 1; i++) {
if (oldTotalCount[i] != 0) {
bout.write(" { \"" + categoryNames[i] + "\",");
/* 0x0000-0xD7FF */
if (oldListCount[BEFORE][i] != 0) {
bout.write(" \"");
bout.write(oldList[BEFORE][i].toString() + "\"\n");
}
/* 0xD800-0xFFFF */
if (oldListCount[AFTER][i] != 0) {
if (oldListCount[BEFORE][i] != 0) {
bout.write(" + \"");
} else {
bout.write(" \"");
}
bout.write(oldList[AFTER][i].toString() + "\"\n");
}
/* 0xD800DC00(0x10000)-0xDBFF0xDFFFF(0x10FFFF) */
if (oldListCount[SURROGATE][i] != 0) {
if (oldListCount[BEFORE][i] != 0 || oldListCount[AFTER][i] != 0) {
bout.write(" + \"");
} else {
bout.write(" \"");
}
bout.write(oldList[SURROGATE][i].toString() + "\"\n");
}
bout.write(" },\n");
}
}
bout.write(" };\n\n");
bout.close();
fout.close();
}
catch (Exception e) {
System.err.println("Error occurred on accessing " + oldDatafile);
e.printStackTrace();
System.exit(1);
}
System.out.println("\n" + oldDatafile + " has been generated.");
}
/**
* Test program to be generated
*/
private static final String outfile = "CharacterCategoryTest.java";
/*
* Generates a test program which compare the generated date (newer one)
* with the return values of Characger.getType().
*/
private static void generateTestProgram() {
try {
FileWriter fout = new FileWriter(outfile);
BufferedWriter bout = new BufferedWriter(fout);
bout.write(collationMethod);
bout.write("\n //\n // The following arrays can be used in CharSet.java as is.\n //\n\n");
bout.write(" private static final String[] categoryNames = {");
for (int i = 0; i < categoryNames.length - 1; i++) {
if (i % 10 == 0) {
bout.write("\n ");
}
bout.write("\"" + categoryNames[i] + "\", ");
}
bout.write("\n };\n\n");
bout.write(" private static final int[][] categoryMap = {\n");
for (int i = 0; i < categoryNames.length - 1; i++) {
StringBuffer sb = new StringBuffer(" { /* Data for \"" + categoryNames[i] + "\" category */");
for (int j = 0; j < newTotalCount[i]; j++) {
if (j % 8 == 0) {
sb.append("\n ");
}
sb.append(" 0x");
sb.append(Integer.toString(newList[i][j], 16).toUpperCase());
sb.append(',');
}
sb.append("\n },\n");
bout.write(sb.toString());
}
bout.write(" };\n");
bout.write("\n}\n");
bout.close();
fout.close();
}
catch (Exception e) {
System.err.println("Error occurred on accessing " + outfile);
e.printStackTrace();
System.exit(1);
}
System.out.println("\n" + outfile + " has been generated.");
}
static String collationMethod =
"public class CharacterCategoryTest {\n\n" +
" static final int SIZE = 0x110000;\n" +
" static final String[] category = {\n" +
" \"Cn\", \"Lu\", \"Ll\", \"Lt\", \"Lm\", \"Lo\", \"Mn\", \"Me\",\n" +
" \"Mc\", \"Nd\", \"Nl\", \"No\", \"Zs\", \"Zl\", \"Zp\", \"Cc\",\n" +
" \"Cf\", \"\", \"Co\", \"Cs\", \"Pd\", \"Ps\", \"Pe\", \"Pc\",\n" +
" \"Po\", \"Sm\", \"Sc\", \"Sk\", \"So\", \"Pi\", \"Pf\"\n" +
" };\n\n" +
" public static void main(String[] args) {\n" +
" boolean err = false;\n" +
" byte[] b = new byte[SIZE];\n" +
" for (int i = 0; i < SIZE; i++) {\n" +
" b[i] = 0;\n" +
" }\n" +
" for (int i = 0; i < categoryMap.length; i++) {\n" +
" byte categoryNum = 0;\n" +
" String categoryName = categoryNames[i];\n" +
" for (int j = 0; j < category.length; j++) {\n" +
" if (categoryName.equals(category[j])) {\n" +
" categoryNum = (byte)j;\n" +
" break;\n" +
" }\n" +
" }\n" +
" int[] values = categoryMap[i];\n" +
" for (int j = 0; j < values.length;) {\n" +
" int firstChar = values[j++];\n" +
" int lastChar = values[j++];\n" +
" for (int k = firstChar; k <= lastChar; k++) {\n" +
" b[k] = categoryNum;\n" +
" }\n" +
" }\n" +
" }\n" +
" for (int i = 0; i < SIZE; i++) {\n" +
" int characterType = Character.getType(i);\n" +
" if (b[i] != characterType) {\n" +
" /* Co, Cs and Sk categories are ignored in CharacterCategory. */\n" +
" if (characterType == Character.PRIVATE_USE ||\n" +
" characterType == Character.SURROGATE ||\n" +
" characterType == Character.MODIFIER_SYMBOL) {\n" +
" continue;\n" +
" }\n" +
" err = true;\n" +
" System.err.println(\"Category conflict for a character(0x\" +\n" +
" Integer.toHexString(i) +\n" +
" \"). CharSet.categoryMap:\" +\n" +
" category[b[i]] +\n" +
" \" Character.getType():\" +\n" +
" category[characterType]);\n" +
" }\n" +
" }\n\n" +
" if (err) {\n" +
" throw new RuntimeException(\"Conflict occurred between Charset.categoryMap and Character.getType()\");\n" +
" }\n" +
" }\n";
}