blob: f098123a38f401e0894d71ab3559e7786436112f [file] [log] [blame]
/*
* Copyright (C) 2013 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.common.base;
import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;
import junit.framework.TestCase;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Random;
/**
* Unit tests for {@link Utf8}.
*
* @author Jon Perlow
* @author Martin Buchholz
* @author Clément Roux
*/
@GwtCompatible(emulated = true)
public class Utf8Test extends TestCase {
public void testEncodedLength_validStrings() {
assertEquals(0, Utf8.encodedLength(""));
assertEquals(11, Utf8.encodedLength("Hello world"));
assertEquals(8, Utf8.encodedLength("Résumé"));
assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare,"
+ "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
+ "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
+ "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
+ "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
+ "哈都拕人翻譯做好多話。"));
// A surrogate pair
assertEquals(4, Utf8.encodedLength(
newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
}
@GwtIncompatible("StringBuilder.appendCodePoint()")
public void testEncodedLength_validStrings2() {
HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>();
utf8Lengths.put(0x00, 1);
utf8Lengths.put(0x7f, 1);
utf8Lengths.put(0x80, 2);
utf8Lengths.put(0x7ff, 2);
utf8Lengths.put(0x800, 3);
utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT, 4);
utf8Lengths.put(Character.MAX_CODE_POINT, 4);
Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[]{});
StringBuilder sb = new StringBuilder();
Random rnd = new Random();
for (int trial = 0; trial < 100; trial++) {
sb.setLength(0);
int utf8Length = 0;
for (int i = 0; i < 6; i++) {
Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
sb.appendCodePoint(randomCodePoint);
utf8Length += utf8Lengths.get(randomCodePoint);
if (utf8Length != Utf8.encodedLength(sb)) {
StringBuilder repro = new StringBuilder();
for (int j = 0; j < sb.length(); j++) {
repro.append(" " + (int) sb.charAt(j)); // GWT compatible
}
assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
}
}
}
}
public void testEncodedLength_invalidStrings() {
testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
testEncodedLengthFails(
newString(
Character.MIN_HIGH_SURROGATE,
Character.MIN_HIGH_SURROGATE), 0);
}
private static void testEncodedLengthFails(String invalidString,
int invalidCodePointIndex) {
try {
Utf8.encodedLength(invalidString);
fail();
} catch (IllegalArgumentException expected) {
assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
expected.getMessage());
}
}
// 128 - [chars 0x0000 to 0x007f]
private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
0x007f - 0x0000 + 1;
// 128
private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
// 1920 [chars 0x0080 to 0x07FF]
private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
0x07FF - 0x0080 + 1;
// 18,304
private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
// Both bytes are one byte characters
(long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
// The possible number of two byte characters
TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
// 2048
private static final long THREE_BYTE_SURROGATES = 2 * 1024;
// 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
// 2,650,112
private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
// All one byte characters
(long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
// One two byte character and a one byte character
2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
// Three byte characters
THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
// 1,048,576 [chars 0x10000L to 0x10FFFF]
private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
0x10FFFF - 0x10000L + 1;
// 289,571,839
private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
// All one byte characters
(long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
// One and three byte characters
2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
// Two two byte characters
TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
// Permutations of one and two byte characters
3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
// Four byte characters
FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
/** Tests that round tripping of all two byte permutations work. */
@GwtIncompatible("java.nio.charset.Charset")
public void testIsWellFormed_1Byte() {
testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
}
/** Tests that round tripping of all two byte permutations work. */
@GwtIncompatible("java.nio.charset.Charset")
public void testIsWellFormed_2Bytes() {
testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
}
/** Tests that round tripping of all three byte permutations work. */
@GwtIncompatible("java.nio.charset.Charset")
public void testIsWellFormed_3Bytes() {
testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
}
/**
* Tests that round tripping of a sample of four byte permutations work.
* All permutations are prohibitively expensive to test for automated runs.
* This method tests specific four-byte cases.
*/
public void testIsWellFormed_4BytesSamples() {
// Valid 4 byte.
assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
// Bad trailing bytes
assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
// Special cases for byte2
assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
}
/** Tests some hard-coded test cases. */
public void testSomeSequences() {
// Empty
assertWellFormed();
// One-byte characters, including control characters
assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
// Two-byte characters
assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
// Three-byte characters
assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
// Four-byte characters
// "\u024B62\u024B62"
assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
// Mixed string
// "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
// Not a valid string
assertNotWellFormed(-1, 0, -1, 0);
}
public void testShardsHaveExpectedRoundTrippables() {
// A sanity check.
long actual = 0;
for (long expected : generateFourByteShardsExpectedRunnables()) {
actual += expected;
}
assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
}
private String newString(char... chars) {
return new String(chars);
}
private byte[] toByteArray(int... bytes) {
byte[] realBytes = new byte[bytes.length];
for (int i = 0; i < bytes.length; i++) {
realBytes[i] = (byte) bytes[i];
}
return realBytes;
}
private void assertWellFormed(int... bytes) {
assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
}
private void assertNotWellFormed(int... bytes) {
assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
}
private static long[] generateFourByteShardsExpectedRunnables() {
long[] expected = new long[128];
// 0-63 are all 5300224
for (int i = 0; i <= 63; i++) {
expected[i] = 5300224;
}
// 97-111 are all 2342912
for (int i = 97; i <= 111; i++) {
expected[i] = 2342912;
}
// 113-117 are all 1048576
for (int i = 113; i <= 117; i++) {
expected[i] = 1048576;
}
// One offs
expected[112] = 786432;
expected[118] = 786432;
expected[119] = 1048576;
expected[120] = 458752;
expected[121] = 524288;
expected[122] = 65536;
// Anything not assigned was the default 0.
return expected;
}
/**
* Helper to run the loop to test all the permutations for the number of bytes
* specified.
*
* @param numBytes the number of bytes in the byte array
* @param expectedCount the expected number of roundtrippable permutations
*/
@GwtIncompatible("java.nio.charset.Charset")
private static void testBytes(int numBytes, long expectedCount) {
testBytes(numBytes, expectedCount, 0, -1);
}
/**
* Helper to run the loop to test all the permutations for the number of bytes
* specified. This overload is useful for debugging to get the loop to start
* at a certain character.
*
* @param numBytes the number of bytes in the byte array
* @param expectedCount the expected number of roundtrippable permutations
* @param start the starting bytes encoded as a long as big-endian
* @param lim the limit of bytes to process encoded as a long as big-endian,
* or -1 to mean the max limit for numBytes
*/
@GwtIncompatible("java.nio.charset.Charset")
private static void testBytes(int numBytes, long expectedCount, long start,
long lim) {
byte[] bytes = new byte[numBytes];
if (lim == -1) {
lim = 1L << (numBytes * 8);
}
long countRoundTripped = 0;
for (long byteChar = start; byteChar < lim; byteChar++) {
long tmpByteChar = byteChar;
for (int i = 0; i < numBytes; i++) {
bytes[bytes.length - i - 1] = (byte) tmpByteChar;
tmpByteChar = tmpByteChar >> 8;
}
boolean isRoundTrippable = Utf8.isWellFormed(bytes);
assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
String s = new String(bytes, Charsets.UTF_8);
byte[] bytesReencoded = s.getBytes(Charsets.UTF_8);
boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
if (bytesEqual != isRoundTrippable) {
fail();
}
if (isRoundTrippable) {
countRoundTripped++;
}
}
assertEquals(expectedCount, countRoundTripped);
}
}