blob: 4ead9e9e74638cde858b55ea31871a6c728ab26e [file] [log] [blame]
package org.unicode.cldr.tool;
import java.util.Arrays;
import java.util.Collection;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import org.unicode.cldr.util.Iso639Data;
import org.unicode.cldr.util.StandardCodes;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
public class GenerateBcp47Bits {
static final boolean QUICK = false; // true for debugging
/**
* Generate the data, throwing an exception if anything goes wrong.
*
* @param args
*/
public static void main(String[] args) {
// Get the data to compress
final Collection<String> languages;
final Collection<String> regions;
final long[] regionStaticTest;
final long[] languageStaticTest;
if (QUICK) {
languages = Arrays.asList("aa zz aaa abc zzz".split("\\s"));
languageStaticTest = null;
regions = Arrays.asList("AA AB ZZ 000 003 999".split("\\s"));
regionStaticTest = null;
} else {
languages = Iso639Data.getAvailable();
languageStaticTest = null; // javaTestLanguages;
regions = StandardCodes.make().getAvailableCodes("territory");
regionStaticTest = null; // javaTestRegions;
}
// Compress it
System.out.println("\t<bits>");
writeInfo("region", regions, new Bcp47StringBitTransform(Bcp47StringBitTransform.Type.REGION), regionStaticTest);
writeInfo("language", languages, new Bcp47StringBitTransform(Bcp47StringBitTransform.Type.LANGUAGE),
languageStaticTest);
System.out.println("\t</bits>");
}
/**
* Utility for writing compression. Work is done in writeBits
*
* @param element
* @param codes
* @param transform
* @param staticTest2
*/
private static void writeInfo(String element, Collection<String> codes, StringBitTransform transform,
long[] testStatic) {
System.out.println("<!-- " + codes + " -->");
System.out.println("\t\t<" + element + ">");
writeBits(codes, transform, "\t\t\t\t", testStatic);
System.out.println("\t\t</" + element + ">");
}
/**
* Write out the bits in the source, using the transform.
*
* @param source
* @param transform
* @param indent
* @param testStatic
*/
private static void writeBits(final Collection<String> source, StringBitTransform transform, String indent,
long[] testStatic) {
Set<String> codes = new TreeSet<String>(source);
// Transform into bits
Bits bits = new Bits(transform.getLimit());
if (QUICK) System.out.println(bits);
for (String code : codes) {
int bit = transform.toBit(code);
String verify = transform.fromBit(bit);
bits.set(bit);
// Verify that our transform and bitset worked
if (QUICK) {
System.out.println(bit + "\t" + bits);
}
if (!verify.equalsIgnoreCase(code)) {
bit = transform.toBit(code);
verify = transform.fromBit(bit);
throw new IllegalArgumentException("StringBitTransform failure with " + code);
}
if (!bits.get(bit)) {
throw new IllegalArgumentException("Bit failure with " + code);
}
}
// Verify that the sets are the same afterwards
if (QUICK) System.out.println(bits);
Set<String> verifySet = new TreeSet<String>();
for (int i = 0; i < transform.getLimit(); ++i) {
if (!bits.get(i)) continue;
String verify = transform.fromBit(i);
// if (!codes.contains(verify)) {
// bits.get(i);
// transform.fromBit(i);
// throw new IllegalArgumentException("Roundtrip failure with " + verify);
// }
verifySet.add(verify);
}
if (QUICK) System.out.println(verifySet);
if (!verifySet.equals(codes)) {
throw new IllegalArgumentException("Roundtrip failure with " + verifySet.toString());
}
// Actually write the result
final String stringForm = bits.toString(16, indent, 4);
System.out.println(stringForm);
// Verify that the string form of the bits roundtrips
Bits reversed = Bits.fromString(transform.getLimit(), stringForm, 16);
if (QUICK) System.out.println(reversed);
if (QUICK) System.out.println(reversed.toString(16, indent, 4));
if (!reversed.equals(bits)) {
//int diff = reversed.firstDifference(bits);
throw new IllegalArgumentException("Reversal failure" + bits + " != " + reversed);
}
// Verify that the static versions are the same
if (testStatic != null) {
Bits staticTest = Bits.fromInts(transform.getLimit(), testStatic);
if (!staticTest.equals(bits)) {
throw new IllegalArgumentException("Static failure");
}
}
}
/**
* Simple Bitset (java BitSet doesn't do enough).
* Note that bits are stored in order, eg 0 => 800..., 1 => 40...
*/
static class Bits {
private final long[] bits;
static final int SHIFT = 6;
static final int MASK = (1 << SHIFT) - 1;
Bits(int size) {
bits = new long[((size - 1) >> SHIFT) + 1];
}
public Bits set(int bit) {
final int index = bit >> SHIFT;
final int remainder = bit & MASK;
//int restore = (index << SHIFT) | remainder;
final long mask = 1L << remainder;
bits[index] |= mask;
return this;
}
public boolean get(int bit) {
final int index = bit >> SHIFT;
final int remainder = bit & MASK;
//int restore = (index << SHIFT) | remainder;
final long mask = 1L << remainder;
final long masked = bits[index] & mask;
return 0 != masked;
}
public String toString(int radix, String indent, int perLineMax) {
StringBuilder buffer = new StringBuilder();
buffer.append(indent);
int count = perLineMax;
// strip final zeros
int last = bits.length;
while (bits[--last] == 0) {
}
// write out up to those
for (int i = 0; i <= last; ++i) {
if (i != 0) {
if (--count == 0) {
buffer.append(",\n");
buffer.append(indent);
count = perLineMax;
} else {
buffer.append(", ");
}
}
buffer.append(Long.toString(bits[i], radix));
}
return buffer.toString();
}
static Bits fromString(int size, String value, int radix) {
String[] parts = value.trim().split(",\\s*|\\s+");
if (parts.length > size) {
throw new IllegalArgumentException("Too many integers");
}
Bits result = new Bits(size);
for (int i = 0; i < parts.length; ++i) {
result.bits[i] = Long.parseLong(parts[i], radix);
}
return result;
}
static Bits fromInts(int size, long[] ints) {
Bits result = new Bits(size);
if (ints.length > result.bits.length) {
throw new IllegalArgumentException("Too many integers");
}
for (int i = 0; i < ints.length; ++i) {
result.bits[i] = ints[i];
}
return result;
}
public boolean equals(Bits other) {
if (bits.length != other.bits.length) {
return false;
}
for (int i = 0; i < bits.length; ++i) {
if (bits[i] != other.bits[i]) {
return false;
}
}
return true;
}
public int firstDifference(Bits other) {
int limit = other.bits.length;
if (limit < bits.length) limit = bits.length;
limit = limit << SHIFT;
for (int i = 0; i < limit; ++i) {
if (get(i) != other.get(i)) {
return i;
}
}
return -1;
}
public String toString() {
StringBuilder result = new StringBuilder("[");
int limit = bits.length;
limit = limit << SHIFT;
for (int i = 0; i < limit; ++i) {
if (get(i)) {
if (result.length() > 1) {
result.append(",");
}
result.append(i);
}
}
result.append("]");
return result.toString();
}
}
/**
* Encapsulates a transformation between string and bit
*/
static interface StringBitTransform {
public int getLimit();
public int toBit(String string);
public String fromBit(int bit);
}
static class Bcp47StringBitTransform implements StringBitTransform {
enum Type {
LANGUAGE, REGION
};
Type type;
int limit;
Bcp47StringBitTransform(Type type) {
this.type = type;
limit = type == Type.LANGUAGE ? 26 * 26 + 26 * 26 * 26 : 26 * 26 + 1000;
}
public int getLimit() {
return limit;
}
static final UnicodeSet alpha = (UnicodeSet) new UnicodeSet("[a-z]").freeze();
static final UnicodeSet num = (UnicodeSet) new UnicodeSet("[0-9]").freeze();
public int toBit(String string) {
string = UCharacter.toLowerCase(ULocale.ENGLISH, string);
switch (string.length()) {
case 2:
if (!alpha.containsAll(string)) {
throw new IllegalArgumentException(string);
}
return (string.codePointAt(0) - 'a') * 26
+ (string.codePointAt(1) - 'a');
case 3:
if (alpha.containsAll(string)) {
return 26 * 26 +
(string.codePointAt(0) - 'a') * 26 * 26
+ (string.codePointAt(1) - 'a') * 26
+ (string.codePointAt(2) - 'a');
}
if (!num.containsAll(string)) {
throw new IllegalArgumentException(string);
}
return 26 * 26 + Integer.parseInt(string);
default:
throw new IllegalArgumentException(string);
}
}
public String fromBit(int bit) {
StringBuilder result = new StringBuilder();
if (bit < 0) {
throw new IllegalArgumentException(String.valueOf(bit));
}
if (bit < 26 * 26) {
result.appendCodePoint('a' + bit / 26)
.appendCodePoint('a' + bit % 26);
} else {
bit -= 26 * 26;
if (type == Type.LANGUAGE) {
if (bit >= 26 * 26 + 26 * 26 * 26) {
throw new IllegalArgumentException(String.valueOf(bit));
}
result.appendCodePoint('a' + bit / (26 * 26));
bit %= (26 * 26);
result.appendCodePoint('a' + bit / 26)
.appendCodePoint('a' + bit % 26);
} else {
if (bit >= 26 * 26 + 1000) {
throw new IllegalArgumentException(String.valueOf(bit));
}
result.append(bit / (10 * 10));
bit %= (10 * 10);
result.append(bit / 10)
.append(bit % 10);
}
}
if (type == Type.REGION) return result.toString().toUpperCase(Locale.ENGLISH);
return result.toString();
}
}
/**
* The following are for testing, using generated data.
*/
static long[] javaTestRegions = {};
static long[] javaTestLanguages = {};
}