| /* |
| * Copyright (C) 2013 The Guava Authors |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
| * in compliance with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software distributed under the License |
| * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
| * or implied. See the License for the specific language governing permissions and limitations under |
| * the License. |
| */ |
| |
| package com.google.common.base; |
| |
| import static com.google.common.base.Preconditions.checkPositionIndexes; |
| import static java.lang.Character.MAX_SURROGATE; |
| import static java.lang.Character.MIN_SURROGATE; |
| |
| import com.google.common.annotations.Beta; |
| import com.google.common.annotations.GwtCompatible; |
| |
| /** |
| * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8} |
| * character encoding. UTF-8 is defined in section D92 of <a |
| * href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core |
| * Specification, Chapter 3</a>. |
| * |
| * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8 |
| * introduced in Unicode 3.1. One implication of this is that it rejects <a |
| * href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte sequences, |
| * even though the JDK decoder may accept them. |
| * |
| * @author Martin Buchholz |
| * @author Clément Roux |
| * @since 16.0 |
| */ |
| @Beta |
| @GwtCompatible(emulated = true) |
| public final class Utf8 { |
| /** |
| * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this |
| * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both |
| * time and space. |
| * |
| * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired |
| * surrogates) |
| */ |
| public static int encodedLength(CharSequence sequence) { |
| // Warning to maintainers: this implementation is highly optimized. |
| int utf16Length = sequence.length(); |
| int utf8Length = utf16Length; |
| int i = 0; |
| |
| // This loop optimizes for pure ASCII. |
| while (i < utf16Length && sequence.charAt(i) < 0x80) { |
| i++; |
| } |
| |
| // This loop optimizes for chars less than 0x800. |
| for (; i < utf16Length; i++) { |
| char c = sequence.charAt(i); |
| if (c < 0x800) { |
| utf8Length += ((0x7f - c) >>> 31); // branch free! |
| } else { |
| utf8Length += encodedLengthGeneral(sequence, i); |
| break; |
| } |
| } |
| |
| if (utf8Length < utf16Length) { |
| // Necessary and sufficient condition for overflow because of maximum 3x expansion |
| throw new IllegalArgumentException( |
| "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32))); |
| } |
| return utf8Length; |
| } |
| |
| private static int encodedLengthGeneral(CharSequence sequence, int start) { |
| int utf16Length = sequence.length(); |
| int utf8Length = 0; |
| for (int i = start; i < utf16Length; i++) { |
| char c = sequence.charAt(i); |
| if (c < 0x800) { |
| utf8Length += (0x7f - c) >>> 31; // branch free! |
| } else { |
| utf8Length += 2; |
| // jdk7+: if (Character.isSurrogate(c)) { |
| if (MIN_SURROGATE <= c && c <= MAX_SURROGATE) { |
| // Check that we have a well-formed surrogate pair. |
| if (Character.codePointAt(sequence, i) == c) { |
| throw new IllegalArgumentException(unpairedSurrogateMsg(i)); |
| } |
| i++; |
| } |
| } |
| } |
| return utf8Length; |
| } |
| |
| /** |
| * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to |
| * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be |
| * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte |
| * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered |
| * well-formed. |
| * |
| * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new |
| * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space. |
| */ |
| public static boolean isWellFormed(byte[] bytes) { |
| return isWellFormed(bytes, 0, bytes.length); |
| } |
| |
| /** |
| * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by |
| * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code |
| * isWellFormed(bytes)} is true. |
| * |
| * @param bytes the input buffer |
| * @param off the offset in the buffer of the first byte to read |
| * @param len the number of bytes to read from the buffer |
| */ |
| public static boolean isWellFormed(byte[] bytes, int off, int len) { |
| int end = off + len; |
| checkPositionIndexes(off, end, bytes.length); |
| // Look for the first non-ASCII character. |
| for (int i = off; i < end; i++) { |
| if (bytes[i] < 0) { |
| return isWellFormedSlowPath(bytes, i, end); |
| } |
| } |
| return true; |
| } |
| |
| private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) { |
| int index = off; |
| while (true) { |
| int byte1; |
| |
| // Optimize for interior runs of ASCII bytes. |
| do { |
| if (index >= end) { |
| return true; |
| } |
| } while ((byte1 = bytes[index++]) >= 0); |
| |
| if (byte1 < (byte) 0xE0) { |
| // Two-byte form. |
| if (index == end) { |
| return false; |
| } |
| // Simultaneously check for illegal trailing-byte in leading position |
| // and overlong 2-byte form. |
| if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) { |
| return false; |
| } |
| } else if (byte1 < (byte) 0xF0) { |
| // Three-byte form. |
| if (index + 1 >= end) { |
| return false; |
| } |
| int byte2 = bytes[index++]; |
| if (byte2 > (byte) 0xBF |
| // Overlong? 5 most significant bits must not all be zero. |
| || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) |
| // Check for illegal surrogate codepoints. |
| || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2) |
| // Third byte trailing-byte test. |
| || bytes[index++] > (byte) 0xBF) { |
| return false; |
| } |
| } else { |
| // Four-byte form. |
| if (index + 2 >= end) { |
| return false; |
| } |
| int byte2 = bytes[index++]; |
| if (byte2 > (byte) 0xBF |
| // Check that 1 <= plane <= 16. Tricky optimized form of: |
| // if (byte1 > (byte) 0xF4 |
| // || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 |
| // || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) |
| || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 |
| // Third byte trailing-byte test |
| || bytes[index++] > (byte) 0xBF |
| // Fourth byte trailing-byte test |
| || bytes[index++] > (byte) 0xBF) { |
| return false; |
| } |
| } |
| } |
| } |
| |
| private static String unpairedSurrogateMsg(int i) { |
| return "Unpaired surrogate at index " + i; |
| } |
| |
| private Utf8() {} |
| } |