blob: baf79f21449d8db695c1040bea443347b8460801 [file] [log] [blame]
/**
* Shim for doing compaction of strings
*/
package org.unicode.cldr.draft;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.unicode.cldr.draft.CharacterListCompressor.Interval;
import org.unicode.cldr.util.CldrUtility;
import com.ibm.icu.text.UTF16;
public class Compacter {
public static double totalOld;
public static double totalNew;
static boolean useCibus = true;
/**
* @param result
* output buffer for compacted form
* @param set2
* input is collection of strings, where each string is exactly one codepoint. The collection is read in
* Iterator order.
*/
public static String encodeString(Collection<String> set2) {
int size = 2; // size in bytes
for (String s : set2) {
int cp;
for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
cp = s.charAt(i);
if (cp < 0x80)
size += 1;
else if (cp < 0x800)
size += 2;
else if (cp < 0x10000)
size += 3;
else
size += 4;
}
size += 1; // for null byte;
}
List<Interval> intermediate = codePointsToIntervals(set2);
String result = CharacterListCompressor.base88EncodeList(intermediate);
totalOld += size;
totalNew += (result.length() + 2);
return result;
}
/**
* @param result
* output buffer for compacted form
* @param set2
* input is collection of strings, where each string is exactly one codepoint. The collection is read in
* Iterator order.
*/
public static List<String> decodeString(String encodedString) {
List<Interval> intermediate = CharacterListCompressor.base88DecodeList(encodedString);
List<String> result = new ArrayList<String>();
for (Interval interval : intermediate) {
for (int i = interval.first; i <= interval.last; ++i) {
result.add(UTF16.valueOf(i));
}
}
return result;
}
private static List<Interval> codePointsToIntervals(Collection<String> set2) {
List<Interval> result = new ArrayList<Interval>();
int first = -1;
int last = -1;
for (String item : set2) {
int cp = UTF16.charAt(item, 0);
if (first == -1) {
first = last = cp;
} else if (cp == last + 1) {
last = cp;
} else {
result.add(new Interval(first, last));
first = last = cp;
}
}
if (first != -1) {
result.add(new Interval(first, last));
}
return result;
}
/**
* @param result
* output buffer for compacted form
* @param set2
* input is collection of strings, where each string is exactly one codepoint. The collection is read in
* Iterator order.
*/
public static String appendCompacted(Collection<String> set2) {
if (Compacter.useCibus) {
return encodeString(set2);
}
String resultStr = getInternalRangeString(set2);
return resultStr;
}
public static String getInternalRangeString(Collection<String> set2) {
StringBuilder result = new StringBuilder();
int first = -1;
int last = -1;
for (String item : set2) {
int cp = UTF16.charAt(item, 0);
if (first == -1) {
first = last = cp;
} else if (cp == last + 1) {
last = cp;
} else {
appendRange(result, first, last);
first = last = cp;
}
}
if (first != -1) {
appendRange(result, first, last);
}
String resultStr = result.toString();
return resultStr;
}
private static void appendRange(StringBuilder result, int first, int last) {
result.append(UTF16.valueOf(first));
if (first != last) {
int delta = 0xE000 + last - first;
if (delta >= 0xF800) {
throw new IllegalArgumentException("Range too large: " + CldrUtility.toHex(first, true) + "-"
+ CldrUtility.toHex(last, true));
}
result.appendCodePoint(delta);
}
}
/**
* @param in
* compacted form
* @return result list of strings, where each string is exactly one codepoint. The order is exactly the same as the
* input to compaction,
* although the collection may be different.
*/
public static List<String> getFromCompacted(String in) {
if (Compacter.useCibus) {
return decodeString(in);
}
List<String> result = new ArrayList<String>();
int cp;
int first = 0;
for (int i = 0; i < in.length(); i += Character.charCount(cp)) {
cp = in.codePointAt(i);
if (0xE000 <= cp && cp < 0xF800) {
for (int j = first + 1; j <= first + cp - 0xE000; ++j) {
result.add(UTF16.valueOf(j));
}
} else {
result.add(UTF16.valueOf(cp));
}
first = cp;
}
return result;
}
/**
* @return the totalOld
*/
public static double getTotalOld() {
return totalOld;
}
/**
* @return the totalNew
*/
public static double getTotalNew() {
return totalNew;
}
}