// blob: 6e79846beeceaa214f3d0599ba95aafc7e1ed530 [file] [log] [blame]
/*
**********************************************************************
* Copyright (c) 2005-2011, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: John Emmons
**********************************************************************
*/
package org.unicode.cldr.posix;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.icu.SimpleConverter;
import com.ibm.icu.dev.tool.UOption;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
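/**
* Generates a POSIX-format charmap (character set description) file for a
* given charset, optionally restricted to a set of Unicode characters.
*
* @author John C. Emmons
*/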
public class GenerateCharmap {
private static final int
DESTDIR = 2,
UNICODESET = 3,
CHARSET = 4;
private static final UOption[] options = {
UOption.HELP_H(),
UOption.HELP_QUESTION_MARK(),
UOption.create("destdir", 'd', UOption.REQUIRES_ARG).setDefault("."),
UOption.create("unicodeset", 'u', UOption.REQUIRES_ARG).setDefault("[\\u0000-\\U0010FFFF]"),
UOption.create("charset", 'c', UOption.REQUIRES_ARG).setDefault("UTF-8"),
};
public static void main(String[] args) throws IOException {
UOption.parseArgs(args, options);
String codeset = options[CHARSET].value;
GenerateCharmap gp = new GenerateCharmap(new UnicodeSet(options[UNICODESET].value),
Charset.forName(codeset), codeset);
PrintWriter out = FileUtilities.openUTF8Writer(options[DESTDIR].value + File.separator, codeset + ".cm");
gp.write(out);
out.close();
}
public class CharmapLine implements Comparable<Object>
{
public String CharacterValue;
public String CharacterName;
public String CharacterAltName;
public CharmapLine(String Name, String AltName, String Value)
{
CharacterName = Name;
CharacterAltName = AltName;
CharacterValue = Value;
if (Name.equals(AltName))
CharacterAltName = "";
}
public int compareTo(Object o)
{
CharmapLine c = (CharmapLine) o;
return (CharacterValue.compareTo(c.CharacterValue));
}
}
UnicodeSet chars;
Charset cs;
String codeset;
public GenerateCharmap(UnicodeSet chars, Charset cs, String codeset) {
this.cs = cs;
if (cs != null && !cs.name().equals("UTF-8")) {
UnicodeSet csset = new SimpleConverter(cs).getCharset();
chars = new UnicodeSet(chars).retainAll(csset);
}
this.chars = chars;
this.codeset = codeset;
}
public void write(PrintWriter out) {
out.println("######################");
out.println("# POSIX charmap ");
out.println("# Generated automatically from the Unicode Character Database and Common Locale Data Repository");
out.println("# see http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html");
out.println("# charset:\t" + codeset);
out.println("######################");
out.println("#################################################################################################");
out.println("# Copyright 1991-2011 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in #");
out.println("# http://www.unicode.org/copyright.html. #");
out.println("# #");
out.println("# Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode #");
out.println("# data files and any associated documentation (the \"Data Files\") or Unicode software and any #");
out.println("# associated documentation (the \"Software\") to deal in the Data Files or Software without #");
out.println("# restriction, including without limitation the rights to use, copy, modify, merge, publish, #");
out.println("# distribute, and/or sell copies of the Data Files or Software, and to permit persons to whom #");
out.println("# the Data Files or Software are furnished to do so, provided that (a) the above copyright #");
out.println("# notice(s) and this permission notice appear with all copies of the Data Files or Software, #");
out.println("# (b) both the above copyright notice(s) and this permission notice appear in associated #");
out.println("# documentation, and (c) there is clear notice in each modified Data File or in the Software as #");
out.println("# well as in the documentation associated with the Data File(s) or Software that the data or #");
out.println("# software has been modified. #");
out.println("# #");
out.println("# THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR #");
out.println("# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A #");
out.println("# PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT #");
out.println("# HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR #");
out.println("# CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, #");
out.println("# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN #");
out.println("# CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE. #");
out.println("#################################################################################################");
out.println();
doCharmap(out, cs);
out.println("######################");
out.println();
}
/**
* @param out
*/
private void doCharmap(PrintWriter out, Charset cs)
{
// print character types, restricted to the charset
int LongestCharNameLength = 0;
int LongestCharValueLength = 0;
UnicodeSet us = new UnicodeSet("[^[:Noncharacter_Code_Point:][:Cn:][:Cs:]]").retainAll(chars);
List<CharmapLine> cml = new ArrayList<CharmapLine>();
CharmapLine current;
for (UnicodeSetIterator it = new UnicodeSetIterator(us); it.next();)
{
String Name = POSIXUtilities.POSIXCharFullName(it.getString());
String AltName = POSIXUtilities.POSIXCharName(it.getString());
String Value = getCodepointValue(it.getString(), cs);
current = new CharmapLine(Name, AltName, Value);
cml.add(current);
if (current.CharacterName.length() > LongestCharNameLength)
LongestCharNameLength = current.CharacterName.length();
if (current.CharacterValue.length() > LongestCharValueLength)
LongestCharValueLength = current.CharacterValue.length();
}
Collections.sort(cml);
out.print("<code_set_name> \"");
out.print(codeset);
out.println("\"");
out.println("<mb_cur_min> 1");
out.print("<mb_cur_max> ");
out.print(LongestCharValueLength / 4);
out.println();
out.println();
out.println("CHARMAP");
for (ListIterator<CharmapLine> li = cml.listIterator(); li.hasNext();)
{
current = li.next();
out.print(current.CharacterName);
for (int i = LongestCharNameLength + 1; i > current.CharacterName.length(); i--)
out.print(" ");
out.println(current.CharacterValue);
if (current.CharacterAltName.length() > 0)
{
out.print(current.CharacterAltName);
for (int i = LongestCharNameLength + 1; i > current.CharacterAltName.length(); i--)
out.print(" ");
out.println(current.CharacterValue);
}
}
out.println();
out.println("END CHARMAP");
out.println();
}
private String getCodepointValue(String cp, Charset cs)
{
StringBuffer result = new StringBuffer();
ByteBuffer bb = cs.encode(cp);
int i;
while (bb.hasRemaining())
{
result.append("\\x");
byte b = bb.get();
if (b < 0)
i = b + 256;
else
i = b;
result.append(Utility.hex(i, 2));
}
return result.toString();
}
}