make/src/classes/build/tools/generatebreakiteratordata/CharacterCategory.java - platform/external/jetbrains/jdk8u_jdk - Git at Google

 /*
  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.  Oracle designates this
  * particular file as subject to the "Classpath" exception as provided
  * by Oracle in the LICENSE file that accompanied this code.
  *
  * This code is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  * version 2 for more details (a copy is included in the LICENSE file that
  * accompanied this code).
  *
  * You should have received a copy of the GNU General Public License version
  * 2 along with this work; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  *
  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  * or visit www.oracle.com if you need additional information or have any
  * questions.
  */

 /**
  * This is a tool to generate categoryNames and categoryMap which are used in
  * CharSet.java.
  */

 package build.tools.generatebreakiteratordata;

 import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.FileReader;
 import java.io.FileWriter;
 import java.util.StringTokenizer;

 /**
  * Build-time tool that reads a UnicodeData-style specification file and
  * groups its code points into ranges per Unicode general category.
  *
  * <p>Each category's data is a flat list of {@code first, last} code point
  * pairs. {@link #makeCategoryMap(String)} builds the tables and
  * {@link #getCategoryMap(int)} hands them out; {@link #main(String[])} is a
  * debugging entry point that also writes a self-checking test program.
  */
 class CharacterCategory {

     /**
      * Unicode general category names handled by this tool. A category's
      * index in this array is the number accepted by
      * {@link #getCategoryMap(int)}. The trailing "--" entry is a dummy that
      * collects the placeholder value the parsers start from; it is left out
      * of every generated table.
      */
     static final String[] categoryNames = {
         "Ll",        /* Letter, Lowercase */
         "Lu",        /* Letter, Uppercase */
         "Lt",        /* Letter, Titlecase */
         "Lo",        /* Letter, Other */
         "Lm",        /* Letter, Modifier */
         "Nd",        /* Number, Decimal Digit */
         "Nl",        /* Number, Letter */
         "No",        /* Number, Other */
         "Ps",        /* Punctuation, Open */
         "Pe",        /* Punctuation, Close */
         "Pi",        /* Punctuation, Initial quote */
         "Pf",        /* Punctuation, Final quote */
         "Pd",        /* Punctuation, Dash */
         "Pc",        /* Punctuation, Connector */
         "Po",        /* Punctuation, Other */
         "Sc",        /* Symbol, Currency */
         "Sm",        /* Symbol, Math */
         "So",        /* Symbol, Other */
         "Mn",        /* Mark, Non-Spacing */
         "Mc",        /* Mark, Spacing Combining */
         "Me",        /* Mark, Enclosing */
         "Zl",        /* Separator, Line */
         "Zp",        /* Separator, Paragraph */
         "Zs",        /* Separator, Space */
         "Cc",        /* Other, Control */
         "Cf",        /* Other, Format */
         "--",        /* Dummy, ignored */
         // Don't add anything after the Dummy entry!!
     };

     /** Index of the dummy entry; real categories are 0 .. DUMMY - 1. */
     private static final int DUMMY = categoryNames.length - 1;

     /** Command-line synopsis printed when the arguments cannot be parsed. */
     private static final String USAGE =
         "Usage: java CharacterCategory [-spec specfile] [-old oldDatafile] [-o outputDir]";

     /** Initial capacity of each per-category code point buffer. */
     private static final int INITIAL_CAPACITY = 10;

     /**
      * An array of Unicode code points for each category, filled in by
      * {@link #makeCategoryMap(String)}.
      */
     private static int[][] categoryMap;

     /**
      * Spec (Unicode data file)
      */
     private static String specfile = "UnicodeData.txt";

     /**
      * Output directory. Set by the "-o" option; nothing in this class reads
      * it when choosing where to write files.
      */
     private static String outputDir = "";

     /**
      * Old data filename. Empty means "do not generate old-format data".
      */
     private static String oldDatafile = "";


     /**
      * Generates CategoryMap for GenerateBreakIteratorData.
      *
      * @param filename path of the Unicode data file to read; replaces the
      *                 default spec file name
      */
     static void makeCategoryMap(String filename) {
         /* Overwrite specfile name */
         specfile = filename;

         /* Generate data in current format (1.5.0) */
         generateNewData();

         /* Copy generated data to categoryMap, trimmed to the used length */
         int[][] map = new int[DUMMY][];
         for (int i = 0; i < DUMMY; i++) {
             int len = newListCount[BMP][i] + newListCount[NON_BMP][i];
             map[i] = new int[len];
             System.arraycopy(newList[i], 0, map[i], 0, len);
         }
         categoryMap = map;
     }

     /**
      * Returns categoryMap for the given category.
      *
      * @param category index into {@link #categoryNames} (dummy excluded)
      * @return the {@code first, last} code point pairs of that category
      */
     static int[] getCategoryMap(int category) {
         return categoryMap[category];
     }


     /**
      * Only used for debugging and generating a test program.
      *
      * @param args command-line options, see {@link #USAGE}
      */
     public static void main(String[] args) {
         /* Parses command-line options */
         processArgs(args);

         /* Generates data in current format (1.5.0) */
         generateNewData();

         /*
          * Generates data in older format (1.4.X and earlier) and creates
          * the old CategoryMap if an old data filename was given.
          */
         if (!oldDatafile.equals("")) {
             generateOldData();
             generateOldDatafile();
         }

         /* Displays summary of generated data */
         showSummary();

         /*
          * Generates a test program which compares the new data with the
          * return values of Character.getType().
          */
         generateTestProgram();
     }


     /**
      * Parses the specified arguments and sets up the variables. Every
      * recognised option takes exactly one value; an unknown option or a
      * missing value prints the usage text and exits with status 1.
      */
     private static void processArgs(String[] args) {
         for (int i = 0; i < args.length; i++) {
             String arg = args[i];
             boolean known = arg.equals("-spec")
                          || arg.equals("-old")
                          || arg.equals("-o");

             /* Reject a trailing option instead of indexing past the array. */
             if (!known || i + 1 >= args.length) {
                 System.err.println(USAGE);
                 System.exit(1);
                 return;
             }

             String value = args[++i];
             if (arg.equals("-spec")) {
                 specfile = value;
             } else if (arg.equals("-old")) {
                 oldDatafile = value;
             } else {
                 outputDir = value;
             }
         }
     }


     /**
      * Displays summary of generated data. The old-versus-new consistency
      * checks only run when old-format data was requested, because the old
      * counters are all zero otherwise and every comparison would fail.
      */
     private static void showSummary() {
         boolean oldDataGenerated = !oldDatafile.equals("");
         int oldSum = 0;
         int newSum = 0;
         int oldSuppSum = 0;
         int newSuppSum = 0;

         for (int i = 0; i < DUMMY; i++) {
             int newNum = newListCount[BMP][i] + newListCount[NON_BMP][i];

             if (oldDataGenerated) {
                 if (oldTotalCount[i] != newNum) {
                     System.err.println("Error: The number of generated data is different between the new approach and the old approach.");
                 }
                 if (oldListCount[SURROGATE][i] != newListCount[NON_BMP][i]) {
                     System.err.println("Error: The number of generated supplementarycharacters is different between the new approach and the old approach.");
                 }
             }

             System.out.println("    " + categoryNames[i] + ": " +
                                oldTotalCount[i] +
                                "(" + oldListCount[BEFORE][i] +
                                " + " + oldListCount[SURROGATE][i] +
                                " + " + oldListCount[AFTER][i] + ")" +
                                " --- " + newNum +
                                "(" + newListCount[BMP][i] +
                                " + " + newListCount[NON_BMP][i] + ")");

             /* Old format: 2 bytes per BMP char, 4 per surrogate pair. */
             oldSum += oldListCount[BEFORE][i] * 2 +
                       oldListCount[SURROGATE][i] * 4 +
                       oldListCount[AFTER][i] * 2;
             /* New format: every entry is a 4-byte int. */
             newSum += newNum * 4;
             oldSuppSum += oldListCount[SURROGATE][i] * 4;
             newSuppSum += newListCount[NON_BMP][i] * 4;
         }

         System.out.println("\nTotal buffer sizes are:\n    " +
                            oldSum + "bytes(Including " + oldSuppSum +
                            "bytes for supplementary characters)\n    " +
                            newSum + "bytes(Including " + newSuppSum +
                            "bytes for supplementary characters)");

         if (oldDataGenerated
                 && !ignoredOld.toString().equals(ignoredNew.toString())) {
             System.err.println("Ignored categories: Error: List mismatch: " +
                                 ignoredOld + " vs. " + ignoredNew);
         } else {
             /* Both lists are equal here whenever old data exists. */
             System.out.println("\nIgnored categories: " + ignoredNew);
             System.out.println("Please confirm that they aren't used in BreakIteratorRules.");
         }
     }


     private static final int HIGH_SURROGATE_START = 0xD800;
     private static final int LOW_SURROGATE_START  = 0xDC00;
     private static final int SUPPLEMENTARY_START  = 0x10000;


     /* ---- State for the old (1.4.X and earlier) String-based format ---- */

     private static final int BEFORE = 0;      /* 0x0000-0xD7FF */
     private static final int SURROGATE = 1;   /* 0x10000 and above */
     private static final int AFTER = 2;       /* 0xD800-0xFFFF */

     private static StringBuilder ignoredOld = new StringBuilder();
     private static int[] oldTotalCount = new int[categoryNames.length];
     private static int[][] oldListCount = new int[3][categoryNames.length];
     private static int[][] oldListLen = new int[3][categoryNames.length];
     private static StringBuilder[][] oldList = new StringBuilder[3][categoryNames.length];

     /**
      * Makes CategoryMap in the older format which was used by JDK 1.4.X and
      * earlier versions. All old-format state is reset first so the result
      * does not depend on earlier invocations.
      */
     private static void generateOldData() {
         /* Initialize arrays. */
         for (int i = 0; i < categoryNames.length; i++) {
             oldTotalCount[i] = 0;
             for (int j = BEFORE; j <= AFTER; j++) {
                 oldListCount[j][i] = 0;
                 oldList[j][i] = new StringBuilder();
                 /* Starting column of the first literal in the output. */
                 oldListLen[j][i] = 17;
             }
         }
         ignoredOld.setLength(0);

         storeOldData();

         /* The dummy category must hold exactly the initial placeholder. */
         if (oldTotalCount[DUMMY] != 1) {
             System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
             System.exit(1);
         }
     }

     /**
      * Reads the spec file and records range boundaries in the old format.
      *
      * @throws InternalError if the file cannot be read or parsed
      */
     private static void storeOldData() {
         try (BufferedReader bin = new BufferedReader(new FileReader(specfile))) {
             String prevCode = "????";
             String line;
             int prevIndex = DUMMY;
             int prevCodeValue = -1;
             boolean setFirst = false;

             while ((line = bin.readLine()) != null) {
                 if (line.length() == 0) {
                     continue;
                 }

                 StringTokenizer st = new StringTokenizer(line, ";");
                 String code = st.nextToken();

                 /* Skip comment lines. */
                 char c = code.charAt(0);
                 if (c == '#' || c == '/') {
                     continue;
                 }

                 /* Parsed for every data line so malformed input is reported. */
                 int curCodeValue = Integer.parseInt(code, 16);

                 String characterName = st.nextToken();
                 String category = st.nextToken();

                 int index = indexOfCategory(category);
                 if (index < 0) {
                     recordIgnored(ignoredOld, category);
                     continue;
                 }

                 if (prevIndex != index) {
                     /* Category changed: close the old range, open a new one. */
                     appendOldChar(prevIndex, prevCodeValue, prevCode);
                     appendOldChar(index, curCodeValue, code);
                     prevIndex = index;
                 } else if (prevCodeValue != curCodeValue - 1) {
                     if (setFirst && characterName.endsWith(" Last>")) {
                         /* "<..., First>" / "<..., Last>" pair is one range. */
                         setFirst = false;
                     } else {
                         appendOldChar(prevIndex, prevCodeValue, prevCode);
                         appendOldChar(index, curCodeValue, code);
                     }
                 }
                 prevCodeValue = curCodeValue;
                 prevCode = code;
                 if (characterName.endsWith(" First>")) {
                     setFirst = true;
                 }
             }
             /* Close the last open range. */
             appendOldChar(prevIndex, prevCodeValue, prevCode);
         }
         catch (Exception e) {
             InternalError error = new InternalError(e.toString());
             error.initCause(e);
             throw error;
         }
     }

     /**
      * Returns the index of {@code category} in {@link #categoryNames}
      * (the dummy entry included), or -1 if the name is not listed.
      */
     private static int indexOfCategory(String category) {
         for (int index = 0; index < categoryNames.length; index++) {
             if (category.equals(categoryNames[index])) {
                 return index;
             }
         }
         return -1;
     }

     /**
      * Adds {@code category} followed by a space to {@code ignored} unless
      * the buffer already contains that name.
      */
     private static void recordIgnored(StringBuilder ignored, String category) {
         if (ignored.indexOf(category) == -1) {
             ignored.append(category);
             ignored.append(' ');
         }
     }

     /**
      * Appends one range boundary to the old-format string for a category,
      * written as Java source text (escaped where necessary).
      *
      * @param index category index
      * @param code  code point value
      * @param s     the code point as the hexadecimal text read from the spec
      */
     private static void appendOldChar(int index, int code, String s) {
         int range;
         if (code < HIGH_SURROGATE_START) {
             range = BEFORE;
         } else if (code < SUPPLEMENTARY_START) {
             range = AFTER;
         } else {
             range = SURROGATE;
         }

         /* Wrap the generated string literal once the line gets long. */
         if (oldListLen[range][index] > 64) {
             oldList[range][index].append("\"\n                + \"");
             oldListLen[range][index] = 19;
         }

         if (code == 0x22 || code == 0x5c) {
             /* Double quote and backslash need a backslash escape. */
             oldList[range][index].append('\\');
             oldList[range][index].append((char)code);
             oldListLen[range][index] += 2;
         } else if (code > 0x20 && code < 0x7F) {
             oldList[range][index].append((char)code);
             oldListLen[range][index]++;
         } else {
             if (range == SURROGATE) {// Need to convert code point to code unit
                 oldList[range][index].append(toCodeUnit(code));
                 oldListLen[range][index] += 12;
             } else {
                 oldList[range][index].append("\\u");
                 oldList[range][index].append(s);
                 oldListLen[range][index] += 6;
             }
         }
         oldListCount[range][index]++;
         oldTotalCount[index]++;
     }

     /**
      * Returns the two {@code \}{@code uXXXX} escapes of the surrogate pair
      * for the supplementary code point {@code i}.
      */
     private static String toCodeUnit(int i) {
         StringBuilder sb = new StringBuilder();
         sb.append("\\u");
         sb.append(Integer.toString((i - SUPPLEMENTARY_START) / 0x400 + HIGH_SURROGATE_START, 16).toUpperCase());
         sb.append("\\u");
         /* 0x10000 is a multiple of 0x400, so no offset is needed here. */
         sb.append(Integer.toString(i % 0x400 + LOW_SURROGATE_START, 16).toUpperCase());
         return sb.toString();
     }

     /**
      * Returns the code point encoded by {@code s}: its first char, or the
      * combined value of a two-char surrogate pair. Returns -1 when a high
      * surrogate is not followed by exactly one low surrogate.
      */
     private static int toCodePoint(String s) {
         char c1 = s.charAt(0);

         if (s.length() == 1 || !Character.isHighSurrogate(c1)) {
             return (int)c1;
         } else {
             char c2 = s.charAt(1);
             if (s.length() != 2 || !Character.isLowSurrogate(c2)) {
                 return -1;
             }
             return Character.toCodePoint(c1, c2);
         }
     }


     /* ---- State for the new (1.5.0) int-based format ---- */

     private static final int BMP = 0;
     private static final int NON_BMP = 1;

     private static StringBuilder ignoredNew = new StringBuilder();
     private static int[] newTotalCount = new int[categoryNames.length];
     private static int[][] newListCount = new int[2][categoryNames.length];
     private static int[][] newList = new int[categoryNames.length][];

     /**
      * Makes CategoryMap in newer format which is used by JDK 1.5.0.
      *
      * <p>Buffers and counters are all cleared first. Without that a second
      * invocation would start from stale counts, trip the dummy-category
      * check below and terminate the VM.
      */
     private static void generateNewData() {
         /* Initialize arrays. */
         for (int i = 0; i < categoryNames.length; i++) {
             newList[i] = new int[INITIAL_CAPACITY];
             newTotalCount[i] = 0;
             newListCount[BMP][i] = 0;
             newListCount[NON_BMP][i] = 0;
         }
         ignoredNew.setLength(0);

         storeNewData();

         /* The dummy category must hold exactly the initial placeholder. */
         if (newListCount[BMP][DUMMY] != 1) {
             System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
             System.exit(1);
         }
     }

     /**
      * Reads the spec file and records range boundaries in the new format.
      * Any failure is reported on standard error and ends the VM with
      * status 1.
      */
     private static void storeNewData() {
         try (BufferedReader bin = new BufferedReader(new FileReader(specfile))) {
             String line;
             int prevIndex = DUMMY;
             int prevCodeValue = -1;
             boolean setFirst = false;

             while ((line = bin.readLine()) != null) {
                 if (line.length() == 0) {
                     continue;
                 }

                 StringTokenizer st = new StringTokenizer(line, ";");
                 String code = st.nextToken();

                 /* Skip comment lines. */
                 char c = code.charAt(0);
                 if (c == '#' || c == '/') {
                     continue;
                 }

                 /* Parsed for every data line so malformed input is reported. */
                 int curCodeValue = Integer.parseInt(code, 16);

                 String characterName = st.nextToken();
                 String category = st.nextToken();

                 int index = indexOfCategory(category);
                 if (index < 0) {
                     recordIgnored(ignoredNew, category);
                     continue;
                 }

                 boolean first = characterName.endsWith(" First>");
                 boolean last = characterName.endsWith(" Last>");

                 if (prevIndex == index) {
                     if (setFirst) {
                         if (last) {
                             /* End of a "<..., First>" / "<..., Last>" pair. */
                             setFirst = false;
                         } else {
                             System.err.println("*** Error 1 at " + code);
                         }
                     } else if (first) {
                         setFirst = true;
                     } else if (last) {
                         System.err.println("*** Error 2 at " + code);
                     } else if (prevCodeValue != curCodeValue - 1) {
                         /* Gap inside one category: end a range, start another. */
                         appendNewChar(prevIndex, prevCodeValue);
                         appendNewChar(index, curCodeValue);
                     }
                 } else {
                     if (setFirst) {
                         System.err.println("*** Error 3 at " + code);
                     } else if (first) {
                         setFirst = true;
                     } else if (last) {
                         System.err.println("*** Error 4 at " + code);
                     }
                     /* Category changed: close the old range, open a new one. */
                     appendNewChar(prevIndex, prevCodeValue);
                     appendNewChar(index, curCodeValue);
                     prevIndex = index;
                 }
                 prevCodeValue = curCodeValue;
             }
             /* Close the last open range. */
             appendNewChar(prevIndex, prevCodeValue);
         }
         catch (Exception e) {
             System.err.println("Error occurred on accessing " + specfile);
             e.printStackTrace();
             System.exit(1);
         }
     }

     /**
      * Appends one range boundary to a category's buffer, growing the buffer
      * when it is full, and updates the BMP / non-BMP counters.
      */
     private static void appendNewChar(int index, int code) {
         int bufLen = newList[index].length;
         if (newTotalCount[index] == bufLen) {
             int[] tmpBuf = new int[bufLen * 2];
             System.arraycopy(newList[index], 0, tmpBuf, 0, bufLen);
             newList[index] = tmpBuf;
         }

         newList[index][newTotalCount[index]++] = code;
         if (code < SUPPLEMENTARY_START) {
             newListCount[BMP][index]++;
         } else {
             newListCount[NON_BMP][index]++;
         }
     }


     /**
      * Generates the old CategoryMap as Java source text in the file named
      * by the "-old" option. Failures are reported on standard error and end
      * the VM with status 1.
      */
     private static void generateOldDatafile() {
         try (BufferedWriter bout = new BufferedWriter(new FileWriter(oldDatafile))) {
             bout.write("\n    //\n    // The following String[][] can be used in CharSet.java as is.\n    //\n\n    private static final String[][] categoryMap = {\n");
             for (int i = 0; i < DUMMY; i++) {
                 if (oldTotalCount[i] == 0) {
                     continue;
                 }
                 bout.write("        { \"" + categoryNames[i] + "\",");

                 /* Becomes true once a literal has been written for this row. */
                 boolean continued = false;

                 /* 0x0000-0xD7FF */
                 if (oldListCount[BEFORE][i] != 0) {
                     bout.write(" \"");
                     bout.write(oldList[BEFORE][i].toString() + "\"\n");
                     continued = true;
                 }

                 /* 0xD800-0xFFFF */
                 if (oldListCount[AFTER][i] != 0) {
                     bout.write(continued ? "                + \"" : " \"");
                     bout.write(oldList[AFTER][i].toString() + "\"\n");
                     continued = true;
                 }

                 /* 0xD800DC00(0x10000)-0xDBFF0xDFFFF(0x10FFFF) */
                 if (oldListCount[SURROGATE][i] != 0) {
                     bout.write(continued ? "                + \"" : " \"");
                     bout.write(oldList[SURROGATE][i].toString() + "\"\n");
                 }
                 bout.write("        },\n");
             }
             bout.write("    };\n\n");
         }
         catch (Exception e) {
             System.err.println("Error occurred on accessing " + oldDatafile);
             e.printStackTrace();
             System.exit(1);
         }

         System.out.println("\n" + oldDatafile + " has been generated.");
     }


     /**
      * Test program to be generated
      */
     private static final String outfile = "CharacterCategoryTest.java";

     /**
      * Generates a test program which compares the generated data (newer
      * one) with the return values of Character.getType(). Failures are
      * reported on standard error and end the VM with status 1.
      */
     private static void generateTestProgram() {
         try (BufferedWriter bout = new BufferedWriter(new FileWriter(outfile))) {
             bout.write(collationMethod);
             bout.write("\n    //\n    // The following arrays can be used in CharSet.java as is.\n    //\n\n");

             bout.write("    private static final String[] categoryNames = {");
             for (int i = 0; i < DUMMY; i++) {
                 if (i % 10 == 0) {
                     bout.write("\n        ");
                 }
                 bout.write("\"" + categoryNames[i] + "\", ");
             }
             bout.write("\n    };\n\n");

             bout.write("    private static final int[][] categoryMap = {\n");

             for (int i = 0; i < DUMMY; i++) {
                 StringBuilder sb = new StringBuilder("        { /*  Data for \"" + categoryNames[i] + "\" category */");

                 /* Eight values per output line. */
                 for (int j = 0; j < newTotalCount[i]; j++) {
                     if (j % 8 == 0) {
                         sb.append("\n        ");
                     }
                     sb.append(" 0x");
                     sb.append(Integer.toString(newList[i][j], 16).toUpperCase());
                     sb.append(',');
                 }
                 sb.append("\n        },\n");
                 bout.write(sb.toString());
             }

             bout.write("    };\n");

             bout.write("\n}\n");
         }
         catch (Exception e) {
             System.err.println("Error occurred on accessing " + outfile);
             e.printStackTrace();
             System.exit(1);
         }

         System.out.println("\n" + outfile + " has been generated.");
     }

     /**
      * Leading part of the generated test program: the class header and a
      * main method that checks the generated tables against
      * Character.getType(). The arrays and the closing brace are appended by
      * generateTestProgram().
      */
     static String collationMethod =
         "public class CharacterCategoryTest {\n\n" +
         "    static final int SIZE = 0x110000;\n" +
         "    static final String[] category = {\n" +
         "       \"Cn\", \"Lu\", \"Ll\", \"Lt\", \"Lm\", \"Lo\", \"Mn\", \"Me\",\n" +
         "       \"Mc\", \"Nd\", \"Nl\", \"No\", \"Zs\", \"Zl\", \"Zp\", \"Cc\",\n" +
         "       \"Cf\", \"\",   \"Co\", \"Cs\", \"Pd\", \"Ps\", \"Pe\", \"Pc\",\n" +
         "       \"Po\", \"Sm\", \"Sc\", \"Sk\", \"So\", \"Pi\", \"Pf\"\n" +
         "    };\n\n" +
         "    public static void main(String[] args) {\n" +
         "        boolean err = false;\n" +
         "        byte[] b = new byte[SIZE];\n" +
         "        for (int i = 0; i < SIZE; i++) {\n" +
         "            b[i] = 0;\n" +
         "        }\n" +
         "        for (int i = 0; i < categoryMap.length; i++) {\n" +
         "            byte categoryNum = 0;\n" +
         "            String categoryName = categoryNames[i];\n" +
         "            for (int j = 0; j < category.length; j++) {\n" +
         "                if (categoryName.equals(category[j])) {\n" +
         "                    categoryNum = (byte)j;\n" +
         "                    break;\n" +
         "                }\n" +
         "            }\n" +
         "            int[] values = categoryMap[i];\n" +
         "            for (int j = 0; j < values.length;) {\n" +
         "                int firstChar = values[j++];\n" +
         "                int lastChar = values[j++];\n" +
         "                for (int k = firstChar; k <= lastChar; k++) {\n" +
         "                    b[k] = categoryNum;\n" +
         "                }\n" +
         "            }\n" +
         "        }\n" +
         "        for (int i = 0; i < SIZE; i++) {\n" +
         "            int characterType = Character.getType(i);\n" +
         "            if (b[i] != characterType) {\n" +
         "                /* Co, Cs and Sk categories are ignored in CharacterCategory. */\n" +
         "                if (characterType == Character.PRIVATE_USE ||\n" +
         "                    characterType == Character.SURROGATE ||\n" +
         "                    characterType == Character.MODIFIER_SYMBOL) {\n" +
         "                    continue;\n" +
         "                }\n" +
         "                err = true;\n" +
         "                System.err.println(\"Category conflict for a character(0x\" +\n" +
         "                                   Integer.toHexString(i) +\n" +
         "                                   \"). CharSet.categoryMap:\" +\n" +
         "                                   category[b[i]] +\n" +
         "                                   \"  Character.getType():\" +\n" +
         "                                   category[characterType]);\n" +
         "            }\n" +
         "        }\n\n" +
         "        if (err) {\n" +
         "            throw new RuntimeException(\"Conflict occurred between Charset.categoryMap and Character.getType()\");\n" +
         "        }\n" +
         "    }\n";

 }
	/*
	* Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
	*
	* This code is free software; you can redistribute it and/or modify it
	* under the terms of the GNU General Public License version 2 only, as
	* published by the Free Software Foundation. Oracle designates this
	* particular file as subject to the "Classpath" exception as provided
	* by Oracle in the LICENSE file that accompanied this code.
	*
	* This code is distributed in the hope that it will be useful, but WITHOUT
	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	* version 2 for more details (a copy is included in the LICENSE file that
	* accompanied this code).
	*
	* You should have received a copy of the GNU General Public License version
	* 2 along with this work; if not, write to the Free Software Foundation,
	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
	*
	* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
	* or visit www.oracle.com if you need additional information or have any
	* questions.
	*/

	/**
	* This is a tool to generate categoryNames and categoryMap which are used in
	* CharSet.java.
	*/

	package build.tools.generatebreakiteratordata;

	import java.io.BufferedReader;
	import java.io.BufferedWriter;
	import java.io.FileReader;
	import java.io.FileWriter;
	import java.util.StringTokenizer;

	class CharacterCategory {

	/**
	* A list of Unicode category names.
	*/
	static final String[] categoryNames = {
	"Ll", /* Letter, Lowercase */
	"Lu", /* Letter, Uppercase */
	"Lt", /* Letter, Titlecase */
	"Lo", /* Letter, Other */
	"Lm", /* Letter, Modifier */
	"Nd", /* Number, Decimal Digit */
	"Nl", /* Number, Letter */
	"No", /* Number, Other */
	"Ps", /* Punctuation, Open */
	"Pe", /* Punctuation, Close */
	"Pi", /* Punctuation, Initial quote */
	"Pf", /* Punctuation, Final quote */
	"Pd", /* Punctuation, Dash */
	"Pc", /* Punctuation, Connector */
	"Po", /* Punctuation, Other */
	"Sc", /* Symbol, Currency */
	"Sm", /* Symbol, Math */
	"So", /* Symbol, Other */
	"Mn", /* Mark, Non-Spacing */
	"Mc", /* Mark, Spacing Combining */
	"Me", /* Mark, Enclosing */
	"Zl", /* Separator, Line */
	"Zp", /* Separator, Paragraph */
	"Zs", /* Separator, Space */
	"Cc", /* Other, Control */
	"Cf", /* Other, Format */
	"--", /* Dummy, ignored */
	// Don't add anything after the Dummy entry!!
	};

	/**
	* An array of Unicode code points for each category.
	*/
	private static int[][] categoryMap;


	/**
	* Generates CategoryMap for GenerateBreakIteratorData.
	*/
	static void makeCategoryMap(String filename) {
	    // The caller's file replaces the default spec file name.
	    specfile = filename;

	    // Build the per-category code point lists in the current format.
	    generateNewData();

	    // Publish a trimmed copy of each list; the trailing dummy category
	    // is left out.
	    final int realCategories = categoryNames.length - 1;
	    categoryMap = new int[realCategories][];
	    for (int cat = 0; cat < realCategories; cat++) {
	        int used = newListCount[BMP][cat] + newListCount[nonBMP][cat];
	        int[] trimmed = new int[used];
	        System.arraycopy(newList[cat], 0, trimmed, 0, used);
	        categoryMap[cat] = trimmed;
	    }
	}

	/**
	* Returns categoryMap for the given category.
	*/
	static int[] getCategoryMap(int category) {
	    // Hands out the stored array itself, not a copy.
	    int[] codePoints = categoryMap[category];
	    return codePoints;
	}


	/**
	* Only used for debugging and generating a test program.
	*/
	public static void main(String[] args) {
	    // Read the command-line options into the static settings.
	    processArgs(args);

	    // Always build the data in the current (1.5.0) format.
	    generateNewData();

	    // The old (1.4.X and earlier) format is only built and written out
	    // when an old data file name was supplied.
	    boolean wantOldData = !oldDatafile.isEmpty();
	    if (wantOldData) {
	        generateOldData();
	        generateOldDatafile();
	    }

	    // Print the per-category counts and buffer sizes.
	    showSummary();

	    // Write the test program that checks the new data against
	    // Character.getType().
	    generateTestProgram();
	}


	/**
	* Spec (Unicode data file)
	*/
	private static String specfile = "UnicodeData.txt";

	/**
	* Output directory
	*/
	private static String outputDir = "";

	/**
	* Old data filename
	*/
	private static String oldDatafile = "";

	/**
	* Parses the specified arguments and sets up the variables.
	*/
	private static void processArgs(String[] args) {
	    // Every recognised option ("-spec", "-old", "-o") takes exactly one
	    // value. An unknown option or a missing value prints the usage text
	    // and exits with status 1.
	    for (int i = 0; i < args.length; i++) {
	        String arg = args[i];
	        boolean known = arg.equals("-spec")
	                     || arg.equals("-old")
	                     || arg.equals("-o");

	        // Reject a trailing option instead of indexing past the array.
	        if (!known || i + 1 >= args.length) {
	            System.err.println("Usage: java CharacterCategory [-spec specfile] [-old oldDatafile] [-o outputDir]");
	            System.exit(1);
	            return;
	        }

	        String value = args[++i];
	        if (arg.equals("-spec")) {
	            specfile = value;
	        } else if (arg.equals("-old")) {
	            oldDatafile = value;
	        } else {
	            outputDir = value;
	        }
	    }
	}


	/**
	* Displays summary of generated data
	*/
	private static void showSummary() {
	    // Old-format data exists only when an old data file name was given
	    // (see main). Without it the old counters are all zero, so the
	    // consistency checks below would report errors for every category.
	    boolean oldDataGenerated = !oldDatafile.equals("");
	    int oldSum = 0;
	    int newSum = 0;
	    int oldSuppSum = 0;
	    int newSuppSum = 0;

	    // The last entry of categoryNames is the dummy and is not reported.
	    for (int i = 0; i < categoryNames.length-1; i++) {
	        int newNum = newListCount[BMP][i] + newListCount[nonBMP][i];

	        if (oldDataGenerated) {
	            if (oldTotalCount[i] != newNum) {
	                System.err.println("Error: The number of generated data is different between the new approach and the old approach.");
	            }
	            if (oldListCount[SURROGATE][i] != newListCount[nonBMP][i]) {
	                System.err.println("Error: The number of generated supplementarycharacters is different between the new approach and the old approach.");
	            }
	        }

	        System.out.println(" " + categoryNames[i] + ": " +
	                           oldTotalCount[i] +
	                           "(" + oldListCount[BEFORE][i] +
	                           " + " + oldListCount[SURROGATE][i] +
	                           " + " + oldListCount[AFTER][i] + ")" +
	                           " --- " + newNum +
	                           "(" + newListCount[BMP][i] +
	                           " + " + newListCount[nonBMP][i] + ")");

	        // Old format: 2 bytes per BMP char, 4 per surrogate pair.
	        oldSum += oldListCount[BEFORE][i] * 2 +
	                  oldListCount[SURROGATE][i] * 4 +
	                  oldListCount[AFTER][i] * 2;
	        // New format: every entry is a 4-byte int.
	        newSum += newNum * 4;
	        oldSuppSum += oldListCount[SURROGATE][i] * 4;
	        newSuppSum += newListCount[nonBMP][i] * 4;
	    }

	    System.out.println("\nTotal buffer sizes are:\n " +
	                       oldSum + "bytes(Including " + oldSuppSum +
	                       "bytes for supplementary characters)\n " +
	                       newSum + "bytes(Including " + newSuppSum +
	                       "bytes for supplementary characters)");

	    if (oldDataGenerated
	            && !ignoredOld.toString().equals(ignoredNew.toString())) {
	        System.err.println("Ignored categories: Error: List mismatch: " +
	                           ignoredOld + " vs. " + ignoredNew);
	    } else {
	        // Both lists are equal here whenever old data exists.
	        System.out.println("\nIgnored categories: " + ignoredNew);
	        System.out.println("Please confirm that they aren't used in BreakIteratorRules.");
	    }
	}


	/* First code unit of the high-surrogate range. */
	private static final int HighSurrogate_CodeUnit_Start = 0xD800;
	/* First code unit of the low-surrogate range. */
	private static final int LowSurrogate_CodeUnit_Start = 0xDC00;
	/* First supplementary code point (the first one above 0xFFFF). */
	private static final int Supplementary_CodePoint_Start = 0x10000;


	/* Space-separated category names found in the data file but not in categoryNames (old pass). */
	private static StringBuffer ignoredOld = new StringBuffer();
	/* Per category: number of entries appended by appendOldChar() over all ranges. */
	private static int[] oldTotalCount = new int[categoryNames.length];
	/* Per range (BEFORE/SURROGATE/AFTER) and category: number of entries appended. */
	private static int[][] oldListCount = new int[3][categoryNames.length];
	/*
	 * Per range and category: running length counter used by appendOldChar()
	 * to decide when to break the generated string literal onto a new line.
	 */
	private static int[][] oldListLen = new int[3][categoryNames.length];
	/* Per range and category: the escaped text of the old-format string literal. */
	private static StringBuffer[][] oldList = new StringBuffer[3][categoryNames.length];

	/* Range index for code values below 0xD800. */
	private static final int BEFORE = 0;
	/* Range index for code values of 0x10000 and above (written as surrogate pairs). */
	private static final int SURROGATE = 1;
	/* Range index for code values from 0xD800 up to 0xFFFF. */
	private static final int AFTER = 2;

	/**
	 * Makes CategoryMap in the older format which had been used by JDK 1.4.X
	 * and earlier versions.
	 *
	 * Resets the per-range buffers, lets storeOldData() fill them from the
	 * data file, and then checks that the last slot of categoryNames received
	 * exactly one entry. Any other count is reported on standard error and
	 * terminates the VM with exit status 1.
	 */
	private static void generateOldData() {
		/* Start every (range, category) buffer from a clean state. */
		for (int range = BEFORE; range <= AFTER; range++) {
			for (int category = 0; category < categoryNames.length; category++) {
				oldListCount[range][category] = 0;
				oldList[range][category] = new StringBuffer();
				oldListLen[range][category] = 17;
			}
		}

		storeOldData();

		final int lastSlot = categoryNames.length - 1;
		if (oldTotalCount[lastSlot] == 1) {
			return;
		}
		System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
		System.exit(1);
	}

	private static void storeOldData() {
	try {
	FileReader fin = new FileReader(specfile);
	BufferedReader bin = new BufferedReader(fin);

	String prevCode = "????";
	String line;
	int prevIndex = categoryNames.length - 1;
	int prevCodeValue = -1;
	int curCodeValue = 0;
	boolean setFirst = false;

	while ((line = bin.readLine()) != null) {
	if (line.length() == 0) {
	continue;
	}

	StringTokenizer st = new StringTokenizer(line, ";");
	String code = st.nextToken();

	char c = code.charAt(0);
	if (c == '#' \|\| c == '/') {
	continue;
	}

	int i = Integer.valueOf(code, 16).intValue();

	String characterName = st.nextToken();
	String category = st.nextToken();

	int index;
	for (index = 0; index < categoryNames.length; index++) {
	if (category.equals(categoryNames[index])) {
	break;
	}
	}

	if (index != categoryNames.length) {
	curCodeValue = Integer.parseInt(code, 16);
	if (prevIndex != index) {
	appendOldChar(prevIndex, prevCodeValue, prevCode);
	appendOldChar(index, curCodeValue, code);
	prevIndex = index;
	} else if (prevCodeValue != curCodeValue - 1) {
	if (setFirst && characterName.endsWith(" Last>")) {
	setFirst = false;
	} else {
	appendOldChar(prevIndex, prevCodeValue, prevCode);
	appendOldChar(index, curCodeValue, code);
	}
	}
	prevCodeValue = curCodeValue;
	prevCode = code;
	if (characterName.endsWith(" First>")) {
	setFirst = true;
	}
	} else {
	if (ignoredOld.indexOf(category) == -1) {
	ignoredOld.append(category);
	ignoredOld.append(' ');
	}
	}
	}
	appendOldChar(prevIndex, prevCodeValue, prevCode);

	bin.close();
	fin.close();
	}
	catch (Exception e) {
	throw new InternalError(e.toString());
	}
	}

	private static void appendOldChar(int index, int code, String s) {
	int range;
	if (code < HighSurrogate_CodeUnit_Start) {
	range = BEFORE;
	} else if (code < Supplementary_CodePoint_Start) {
	range = AFTER;
	} else {
	range = SURROGATE;
	}

	if (oldListLen[range][index] > 64) {
	oldList[range][index].append("\"\n + \"");
	oldListLen[range][index] = 19;
	}

	if (code == 0x22 \|\| code == 0x5c) {
	oldList[range][index].append('\\');
	oldList[range][index].append((char)code);
	oldListLen[range][index] += 2;
	} else if (code > 0x20 && code < 0x7F) {
	oldList[range][index].append((char)code);
	oldListLen[range][index] ++;
	} else {
	if (range == SURROGATE) {// Need to convert code point to code unit
	oldList[range][index].append(toCodeUnit(code));
	oldListLen[range][index] += 12;
	} else {
	oldList[range][index].append("\\u");
	oldList[range][index].append(s);
	oldListLen[range][index] += 6;
	}
	}
	oldListCount[range][index] ++;
	oldTotalCount[index]++;
	}

	/**
	 * Returns the given code point written as two escaped UTF-16 code units,
	 * each as "\\u" followed by upper-case hexadecimal digits.
	 *
	 * The only caller in this file, appendOldChar(), passes values of 0x10000
	 * and above.
	 *
	 * @param i the code point to convert
	 * @return the escaped high code unit followed by the escaped low code unit
	 */
	private static String toCodeUnit(int i) {
		final int high = (i - Supplementary_CodePoint_Start) / 0x400 + HighSurrogate_CodeUnit_Start;
		final int low = i % 0x400 + LowSurrogate_CodeUnit_Start;
		return "\\u" + Integer.toString(high, 16).toUpperCase()
			+ "\\u" + Integer.toString(low, 16).toUpperCase();
	}

	private static int toCodePoint(String s) {
	char c1 = s.charAt(0);

	if (s.length() == 1 \|\| !Character.isHighSurrogate(c1)) {
	return (int)c1;
	} else {
	char c2 = s.charAt(1);
	if (s.length() != 2 \|\| !Character.isLowSurrogate(c2)) {
	return -1;
	}
	return Character.toCodePoint(c1, c2);
	}
	}


	/* Space-separated category names found in the data file but not in categoryNames (new pass). */
	private static StringBuffer ignoredNew = new StringBuffer();
	/* Per category: number of values stored so far in newList[category]. */
	private static int[] newTotalCount = new int[categoryNames.length];
	/* Per plane (BMP/nonBMP) and category: number of values stored by appendNewChar(). */
	private static int[][] newListCount = new int[2][categoryNames.length];
	/*
	 * Per category: the code values stored by appendNewChar(). The generated
	 * test program reads them back two at a time as (firstChar, lastChar).
	 */
	private static int[][] newList = new int[categoryNames.length][];

	/* Plane index for code values below 0x10000. */
	private static final int BMP = 0;
	/* Plane index for code values of 0x10000 and above. */
	private static final int nonBMP = 1;

	/**
	 * Makes CategoryMap in the newer format which is used by JDK 1.5.0.
	 *
	 * Gives every category a fresh ten-element buffer, lets storeNewData()
	 * fill the buffers from the data file, and then checks that the last slot
	 * of categoryNames received exactly one BMP entry. Any other count is
	 * reported on standard error and terminates the VM with exit status 1.
	 */
	private static void generateNewData() {
		/* Allocate the initial buffers; appendNewChar() grows them on demand. */
		int category = 0;
		while (category < categoryNames.length) {
			newList[category] = new int[10];
			category++;
		}

		storeNewData();

		final int lastSlot = categoryNames.length - 1;
		if (newListCount[BMP][lastSlot] == 1) {
			return;
		}
		System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
		System.exit(1);
	}

	private static void storeNewData() {
	try {
	FileReader fin = new FileReader(specfile);
	BufferedReader bin = new BufferedReader(fin);

	String line;
	int prevIndex = categoryNames.length - 1;
	int prevCodeValue = -1;
	int curCodeValue = 0;
	boolean setFirst = false;

	while ((line = bin.readLine()) != null) {
	if (line.length() == 0) {
	continue;
	}

	StringTokenizer st = new StringTokenizer(line, ";");
	String code = st.nextToken();

	char c = code.charAt(0);
	if (c == '#' \|\| c == '/') {
	continue;
	}

	int i = Integer.valueOf(code, 16).intValue();

	String characterName = st.nextToken();
	String category = st.nextToken();

	int index;
	for (index = 0; index < categoryNames.length; index++) {
	if (category.equals(categoryNames[index])) {
	break;
	}
	}

	if (index != categoryNames.length) {
	curCodeValue = Integer.parseInt(code, 16);
	if (prevIndex == index) {
	if (setFirst) {
	if (characterName.endsWith(" Last>")) {
	setFirst = false;
	} else {
	System.err.println("*** Error 1 at " + code);
	}
	} else {
	if (characterName.endsWith(" First>")) {
	setFirst = true;
	} else if (characterName.endsWith(" Last>")) {
	System.err.println("*** Error 2 at " + code);
	} else {
	if (prevCodeValue != curCodeValue - 1) {
	appendNewChar(prevIndex, prevCodeValue);
	appendNewChar(index, curCodeValue);
	}
	}
	}
	} else {
	if (setFirst) {
	System.err.println("*** Error 3 at " + code);
	} else if (characterName.endsWith(" First>")) {
	setFirst = true;
	} else if (characterName.endsWith(" Last>")) {
	System.err.println("*** Error 4 at " + code);
	}
	appendNewChar(prevIndex, prevCodeValue);
	appendNewChar(index, curCodeValue);
	prevIndex = index;
	}
	prevCodeValue = curCodeValue;
	} else {
	if (ignoredNew.indexOf(category) == -1) {
	ignoredNew.append(category);
	ignoredNew.append(' ');
	}
	}
	}
	appendNewChar(prevIndex, prevCodeValue);

	bin.close();
	fin.close();
	}
	catch (Exception e) {
	System.err.println("Error occurred on accessing " + specfile);
	e.printStackTrace();
	System.exit(1);
	}
	}

	/**
	 * Stores one code value at the end of the given category's buffer,
	 * enlarging the buffer by ten elements when it is full, and updates the
	 * per-category and per-plane counters.
	 *
	 * @param index index of the category in categoryNames
	 * @param code  the code value to store; values below 0x10000 are counted
	 *              as BMP, all others as nonBMP
	 */
	private static void appendNewChar(int index, int code) {
		int[] values = newList[index];
		final int used = newTotalCount[index];

		if (used == values.length) {
			int[] grown = new int[values.length + 10];
			System.arraycopy(values, 0, grown, 0, values.length);
			values = grown;
			newList[index] = grown;
		}

		values[used] = code;
		newTotalCount[index] = used + 1;

		final int plane = (code < 0x10000) ? BMP : nonBMP;
		newListCount[plane][index]++;
	}


	/* Generates the old CategoryMap. */
	private static void generateOldDatafile() {
	try {
	FileWriter fout = new FileWriter(oldDatafile);
	BufferedWriter bout = new BufferedWriter(fout);

	bout.write("\n //\n // The following String[][] can be used in CharSet.java as is.\n //\n\n private static final String[][] categoryMap = {\n");
	for (int i = 0; i < categoryNames.length - 1; i++) {
	if (oldTotalCount[i] != 0) {
	bout.write(" { \"" + categoryNames[i] + "\",");

	/* 0x0000-0xD7FF */
	if (oldListCount[BEFORE][i] != 0) {
	bout.write(" \"");

	bout.write(oldList[BEFORE][i].toString() + "\"\n");
	}

	/* 0xD800-0xFFFF */
	if (oldListCount[AFTER][i] != 0) {
	if (oldListCount[BEFORE][i] != 0) {
	bout.write(" + \"");
	} else {
	bout.write(" \"");
	}
	bout.write(oldList[AFTER][i].toString() + "\"\n");
	}

	/* 0xD800DC00(0x10000)-0xDBFF0xDFFFF(0x10FFFF) */
	if (oldListCount[SURROGATE][i] != 0) {
	if (oldListCount[BEFORE][i] != 0 \|\| oldListCount[AFTER][i] != 0) {
	bout.write(" + \"");
	} else {
	bout.write(" \"");
	}
	bout.write(oldList[SURROGATE][i].toString() + "\"\n");
	}
	bout.write(" },\n");

	}
	}
	bout.write(" };\n\n");
	bout.close();
	fout.close();
	}
	catch (Exception e) {
	System.err.println("Error occurred on accessing " + oldDatafile);
	e.printStackTrace();
	System.exit(1);
	}

	System.out.println("\n" + oldDatafile + " has been generated.");
	}


	/**
	 * Name of the test program to be generated. generateTestProgram() opens
	 * a FileWriter on exactly this name.
	 */
	private static final String outfile = "CharacterCategoryTest.java";

	/*
	 * Generates a test program which compares the generated data (newer one)
	 * with the return values of Character.getType().
	 *
	 * The file named by outfile receives, in order: the text of
	 * collationMethod, a categoryNames array, a categoryMap array built from
	 * newList, and the closing brace of the class. The last slot of
	 * categoryNames is left out of both arrays. Any exception is reported on
	 * standard error and terminates the VM with exit status 1.
	 */
	private static void generateTestProgram() {
		final int limit = categoryNames.length - 1;

		try {
			FileWriter fout = new FileWriter(outfile);
			BufferedWriter bout = new BufferedWriter(fout);

			bout.write(collationMethod);
			bout.write("\n //\n // The following arrays can be used in CharSet.java as is.\n //\n\n");

			// Category names, ten per line.
			StringBuffer names = new StringBuffer(" private static final String[] categoryNames = {");
			for (int i = 0; i < limit; i++) {
				if (i % 10 == 0) {
					names.append("\n ");
				}
				names.append("\"").append(categoryNames[i]).append("\", ");
			}
			names.append("\n };\n\n");
			bout.write(names.toString());

			// One row per category, eight values per line.
			bout.write(" private static final int[][] categoryMap = {\n");
			for (int i = 0; i < limit; i++) {
				bout.write(" { /* Data for \"" + categoryNames[i] + "\" category */");

				final int count = newTotalCount[i];
				for (int j = 0; j < count; j++) {
					if (j % 8 == 0) {
						bout.write("\n ");
					}
					bout.write(" 0x" + Integer.toString(newList[i][j], 16).toUpperCase() + ",");
				}
				bout.write("\n },\n");
			}

			// Close the array, then the class opened by collationMethod.
			bout.write(" };\n\n}\n");

			bout.close();
			fout.close();
		}
		catch (Exception e) {
			System.err.println("Error occurred on accessing " + outfile);
			e.printStackTrace();
			System.exit(1);
		}

		System.out.println("\n" + outfile + " has been generated.");
	}

	static String collationMethod =
	"public class CharacterCategoryTest {\n\n" +
	" static final int SIZE = 0x110000;\n" +
	" static final String[] category = {\n" +
	" \"Cn\", \"Lu\", \"Ll\", \"Lt\", \"Lm\", \"Lo\", \"Mn\", \"Me\",\n" +
	" \"Mc\", \"Nd\", \"Nl\", \"No\", \"Zs\", \"Zl\", \"Zp\", \"Cc\",\n" +
	" \"Cf\", \"\", \"Co\", \"Cs\", \"Pd\", \"Ps\", \"Pe\", \"Pc\",\n" +
	" \"Po\", \"Sm\", \"Sc\", \"Sk\", \"So\", \"Pi\", \"Pf\"\n" +
	" };\n\n" +
	" public static void main(String[] args) {\n" +
	" boolean err = false;\n" +
	" byte[] b = new byte[SIZE];\n" +
	" for (int i = 0; i < SIZE; i++) {\n" +
	" b[i] = 0;\n" +
	" }\n" +
	" for (int i = 0; i < categoryMap.length; i++) {\n" +
	" byte categoryNum = 0;\n" +
	" String categoryName = categoryNames[i];\n" +
	" for (int j = 0; j < category.length; j++) {\n" +
	" if (categoryName.equals(category[j])) {\n" +
	" categoryNum = (byte)j;\n" +
	" break;\n" +
	" }\n" +
	" }\n" +
	" int[] values = categoryMap[i];\n" +
	" for (int j = 0; j < values.length;) {\n" +
	" int firstChar = values[j++];\n" +
	" int lastChar = values[j++];\n" +
	" for (int k = firstChar; k <= lastChar; k++) {\n" +
	" b[k] = categoryNum;\n" +
	" }\n" +
	" }\n" +
	" }\n" +
	" for (int i = 0; i < SIZE; i++) {\n" +
	" int characterType = Character.getType(i);\n" +
	" if (b[i] != characterType) {\n" +
	" /* Co, Cs and Sk categories are ignored in CharacterCategory. */\n" +
	" if (characterType == Character.PRIVATE_USE \|\|\n" +
	" characterType == Character.SURROGATE \|\|\n" +
	" characterType == Character.MODIFIER_SYMBOL) {\n" +
	" continue;\n" +
	" }\n" +
	" err = true;\n" +
	" System.err.println(\"Category conflict for a character(0x\" +\n" +
	" Integer.toHexString(i) +\n" +
	" \"). CharSet.categoryMap:\" +\n" +
	" category[b[i]] +\n" +
	" \" Character.getType():\" +\n" +
	" category[characterType]);\n" +
	" }\n" +
	" }\n\n" +
	" if (err) {\n" +
	" throw new RuntimeException(\"Conflict occurred between Charset.categoryMap and Character.getType()\");\n" +
	" }\n" +
	" }\n";

	}