blob: b7f95c525168dd5768ab8e5356a18872d59b602c [file] [log] [blame]
/*
* Copyright 2000-2014 JetBrains s.r.o.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.intellij.openapi.vfs;
import com.intellij.util.ArrayUtil;
import gnu.trove.THashMap;
import org.jetbrains.annotations.NonNls;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
/**
* <p>Utility class to guess the encoding of a given byte array.
* The guess is unfortunately not 100% reliable, especially for 8-bit charsets:
* it is not possible to know which 8-bit charset is used, so in that case
* we assume that the charset encountered is the same as the default standard charset.</p>
*
* <p>On the other hand, unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
* with a Byte Order Marker are easy to find. For UTF-8 files with no BOM, if the buffer
* is wide enough, it's easy to guess.</p>
*
* <p>Tested against a complicated UTF-8 file, Sun's implementation does not render bad UTF-8
* constructs as expected by the specification. But with a buffer wide enough, the method guessEncoding()
* did behave correctly and recognized the UTF-8 charset.</p>
*
* <p>A byte buffer of 4KB or 8KB is sufficient to guess the encoding.</p>
*
* <p>Usage:</p>
* <pre>
* // guess the encoding
* Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096, null); // null: fall back to the system default charset
*
* // create a reader with the charset we've just discovered
* FileInputStream fis = new FileInputStream(file);
* InputStreamReader isr = new InputStreamReader(fis, guessedCharset);
* BufferedReader br = new BufferedReader(isr);
*
* // read the file content
* String line;
* while ((line = br.readLine())!= null)
* {
* System.out.println(line);
* }
* </pre>
* <p>An interesting improvement would be to create a custom <code>InputStream</code> that has a
* method discovering the <code>Charset</code> of the underlying file. Thus, we would not have to
* read the beginning of the file twice: once for guessing the encoding, the second time for reading
* its content. Therefore, we could wrap this stream in an <code>InputStreamReader</code>.</p>
*
* <p>Date: 18 juil. 2002</p>
* @author Guillaume LAFORGE
*/
public class CharsetToolkit {
/** Canonical name of the UTF-8 charset. */
@NonNls public static final String UTF8 = "UTF-8";

// Charset instances resolved once at class-initialization time.
public static final Charset UTF8_CHARSET = Charset.forName(UTF8);
public static final Charset UTF_16LE_CHARSET = Charset.forName("UTF-16LE");
public static final Charset UTF_16BE_CHARSET = Charset.forName("UTF-16BE");
public static final Charset UTF_32BE_CHARSET = Charset.forName("UTF-32BE");
public static final Charset UTF_32LE_CHARSET = Charset.forName("UTF-32LE");
public static final Charset UTF_16_CHARSET = Charset.forName("UTF-16");

// The individual byte values that byte order marks are built from, as signed Java bytes.
private static final byte FF = (byte)0xFF;
private static final byte FE = (byte)0xFE;
private static final byte EF = (byte)0xEF;
private static final byte BB = (byte)0xBB;
private static final byte BF = (byte)0xBF;

/** Non-negative bytes with a value below this threshold are treated as a sign of binary content. */
private static final int BINARY_THRESHOLD = 9;

/** The bytes whose encoding is being guessed. */
private final byte[] buffer;
/** Charset reported when the content turns out to be in some 8-bit encoding. */
private final Charset defaultCharset;
/** When set, pure 7-bit content is reported as the default charset instead of US-ASCII. */
private boolean enforce8Bit;

/** UTF-8 byte order mark: EF BB BF. */
public static final byte[] UTF8_BOM = {EF, BB, BF};
/** UTF-16 little-endian byte order mark: FF FE. */
public static final byte[] UTF16LE_BOM = {FF, FE};
/** UTF-16 big-endian byte order mark: FE FF. */
public static final byte[] UTF16BE_BOM = {FE, FF};
/** UTF-32 big-endian byte order mark: 00 00 FE FF. */
public static final byte[] UTF32BE_BOM = {0, 0, FE, FF};
/** UTF-32 little-endian byte order mark: FF FE 00 00. */
public static final byte[] UTF32LE_BOM = {FF, FE, 0, 0};

/** Name of the system property consulted by {@link #getDefaultSystemCharset()}. */
@NonNls public static final String FILE_ENCODING_PROPERTY = "file.encoding";
/**
 * Charsets whose byte streams must start with a byte order mark, mapped to that mark.
 * Filled once by the static initializer below and only read afterwards.
 */
@NonNls private static final Map<Charset, byte[]> CHARSET_TO_MANDATORY_BOM = new THashMap<Charset, byte[]>(2);

static {
  final Map<Charset, byte[]> mandatory = CHARSET_TO_MANDATORY_BOM;
  // UTF-16 family
  mandatory.put(UTF_16LE_CHARSET, UTF16LE_BOM);
  mandatory.put(UTF_16BE_CHARSET, UTF16BE_BOM);
  // UTF-32 family
  mandatory.put(UTF_32BE_CHARSET, UTF32BE_BOM);
  mandatory.put(UTF_32LE_CHARSET, UTF32LE_BOM);
}
/**
 * Creates a toolkit that analyses the given bytes and falls back to the
 * system default charset (see {@link #getDefaultSystemCharset()}) for 8-bit content.
 *
 * @param buffer the bytes whose encoding should be guessed.
 */
public CharsetToolkit(@NotNull byte[] buffer) {
  // a null default makes the two-argument constructor look up the system default charset
  this(buffer, null);
}
/**
 * Creates a toolkit that analyses the given bytes with an explicit fallback charset.
 *
 * @param buffer         the bytes whose encoding should be guessed.
 * @param defaultCharset the charset to report when an 8-bit encoding is recognized;
 *                       when {@code null}, the system default charset is used instead.
 */
public CharsetToolkit(@NotNull byte[] buffer, Charset defaultCharset) {
  this.buffer = buffer;
  if (defaultCharset != null) {
    this.defaultCharset = defaultCharset;
  }
  else {
    this.defaultCharset = getDefaultSystemCharset();
  }
}
/**
 * Advances the given stream past a leading byte order mark, if it starts with one.
 * <p>Recognized marks: UTF-8 (EF BB BF), UTF-16 BE (FE FF), UTF-16 LE (FF FE),
 * UTF-32 BE (00 00 FE FF) and UTF-32 LE (FF FE 00 00). When no mark is found the stream
 * is reset to the position it had on entry.</p>
 *
 * @param stream a stream supporting mark/reset (checked by an assertion only).
 * @return the very same stream instance, positioned after the BOM if there was one.
 * @throws IOException if reading from or resetting the stream fails.
 */
@NotNull
public static InputStream inputStreamSkippingBOM(@NotNull InputStream stream) throws IOException {
  assert stream.markSupported() :stream;
  stream.mark(4);
  // stays true until a complete BOM has been consumed
  boolean rewind = true;
  try {
    final int first = stream.read();
    // end of stream (-1) or a byte that cannot start any known BOM
    if (first != 0xEF && first != 0xFF && first != 0xFE && first != 0) {
      return stream;
    }
    final int second = stream.read();
    if (first == 0xFF && second == 0xFE) {
      // A UTF-16 LE mark has been consumed. Move the mark here, so that the reset below
      // (when the next two bytes are not 00 00) leaves the stream right after FF FE.
      stream.mark(2);
      final boolean utf32le = stream.read() == 0 && stream.read() == 0;
      rewind = !utf32le;
    }
    else if (first == 0xFE && second == 0xFF) {
      // UTF-16 BE
      rewind = false;
    }
    else if (first == 0xEF && second == 0xBB) {
      // UTF-8 only if the third byte completes the mark
      rewind = stream.read() != 0xBF;
    }
    else if (first == 0 && second == 0) {
      // UTF-32 BE only if followed by FE FF
      final boolean utf32be = stream.read() == 0xFE && stream.read() == 0xFF;
      rewind = !utf32be;
    }
    return stream;
  }
  finally {
    if (rewind) stream.reset();
  }
}
/**
 * Controls what {@link #guessEncoding(int, Charset)} reports for content that contains
 * only 7-bit characters: the default charset when the flag is set, US-ASCII otherwise.
 * Such a file may well be (or later become) a file in the default 8-bit charset.
 *
 * @param enforce {@code true} to never report US-ASCII.
 */
public void setEnforce8Bit(boolean enforce) {
  this.enforce8Bit = enforce;
}
/**
 * Tells whether pure 7-bit content is reported as the default charset rather than US-ASCII.
 *
 * @return the current value of the flag set through {@link #setEnforce8Bit(boolean)}.
 */
public boolean getEnforce8Bit() {
  return this.enforce8Bit;
}
/**
 * Returns the fallback charset this toolkit was constructed with.
 *
 * @return the charset passed to the constructor, or the system default charset
 *         when none (or {@code null}) was passed.
 */
public Charset getDefaultCharset() {
  return this.defaultCharset;
}
/**
 * <p>Guesses the encoding of the buffer this toolkit was created for.</p>
 *
 * <p>If a Byte Order Marker is found at the beginning of the buffer, the charset implied
 * by this BOM is returned immediately.</p>
 *
 * <p>If there is no BOM, the content is inspected to discern whether it is UTF-8 or not.
 * If it is not UTF-8, the encoding is assumed to be the given default charset
 * (it might be any 8-bit charset, but usually an 8-bit charset is the default one).</p>
 *
 * <p>UTF-8 can be recognized thanks to the pattern of its multi-byte sequences:</p>
 * <pre>
 * UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0400 0000-7FFF FFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * </pre>
 * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
 *
 * @param guess_length   the maximum number of leading bytes of the buffer to inspect.
 * @param defaultCharset the charset to report for 8-bit content; note that this parameter
 *                       is used here instead of the charset given at construction time.
 * @return the charset recognized, or {@code null} when the content looks binary.
 */
public Charset guessEncoding(int guess_length, Charset defaultCharset) {
  // a BOM settles the question: the content is in one of the UTF-xx encodings
  final Charset fromBom = guessFromBOM();
  if (fromBom != null) {
    return fromBom;
  }

  switch (guessFromContent(guess_length)) {
    case SEVEN_BIT:
      // No byte with the high order bit set: the encoding is US-ASCII
      // (it might have been UTF-7, but that one is mostly used internally by mail systems).
      // The default charset is reported instead when the enforce8Bit flag is set.
      if (enforce8Bit) {
        return defaultCharset;
      }
      return Charset.forName("US-ASCII");
    case INVALID_UTF8:
      return defaultCharset;
    case VALID_UTF8:
      return UTF8_CHARSET;
    default:
      // BINARY: no charset can be named
      return null;
  }
}
/**
 * Decodes the given bytes into a string, guessing the encoding from the whole array
 * and leaving out a leading byte order mark.
 *
 * @param bytes          the raw content to decode.
 * @param defaultCharset the charset used for 8-bit content, and also when the content looks binary.
 * @return the decoded text.
 */
@NotNull
public static String bytesToString(@NotNull byte[] bytes, @NotNull final Charset defaultCharset) {
  final Charset guessed = new CharsetToolkit(bytes, defaultCharset).guessEncoding(bytes.length);
  // Binary content yields no guess. This is silly, but the method contract says to return something anyway.
  final Charset effective = guessed != null ? guessed : defaultCharset;
  final int offset = getBOMLength(bytes, effective);
  final ByteBuffer payload = ByteBuffer.wrap(bytes, offset, bytes.length - offset);
  return effective.decode(payload).toString();
}
/**
 * Outcome of the content scan done by {@link #guessFromContent(int)}.
 */
public enum GuessedEncoding {
  /** ASCII: no byte with the high order bit set and no binary-looking byte was seen. */
  SEVEN_BIT,
  /** UTF-8: bytes with the high order bit set were seen and all of them formed well-formed sequences. */
  VALID_UTF8,
  /** Invalid UTF: a byte sequence that is not a well-formed UTF-8 construct was seen. */
  INVALID_UTF8,
  /** Binary: a byte below the binary threshold was seen and no malformed sequence was found. */
  BINARY
}
/**
 * Classifies the first {@code guess_length} bytes of the buffer (or the whole buffer, if shorter).
 * <p>Scanning stops at the first byte sequence that is not a well-formed UTF-8 construct.
 * Continuation bytes that would lie beyond the inspected range are treated as missing,
 * so a sequence cut off by the end of the range counts as malformed.</p>
 *
 * @param guess_length the maximum number of leading bytes to inspect.
 * @return the classification of the inspected bytes.
 */
@NotNull
public GuessedEncoding guessFromContent(int guess_length) {
  // set once a byte with its most significant bit set is seen: the content is not US-ASCII then
  boolean sawHighBit = false;
  // cleared once a malformed UTF-8 sequence is seen
  boolean utf8Valid = true;
  // set once a byte below BINARY_THRESHOLD is seen
  boolean sawBinary = false;

  final int limit = Math.min(buffer.length, guess_length);
  for (int pos = 0; pos < limit; pos++) {
    final byte lead = buffer[pos];
    if (lead >= 0) {
      // plain 7-bit byte; very low codes hint at binary content
      if (lead < BINARY_THRESHOLD) {
        sawBinary = true;
      }
      continue;
    }

    // either an 8-bit encoding or UTF-8
    sawHighBit = true;

    // number of continuation bytes (10xxxxxx) the lead byte calls for
    final int expected;
    if (isTwoBytesSequence(lead)) {
      expected = 1;
    }
    else if (isThreeBytesSequence(lead)) {
      expected = 2;
    }
    else if (isFourBytesSequence(lead)) {
      expected = 3;
    }
    else if (isFiveBytesSequence(lead)) {
      expected = 4;
    }
    else if (isSixBytesSequence(lead)) {
      expected = 5;
    }
    else {
      // not a valid first byte of any sequence
      utf8Valid = false;
      break;
    }

    int matched = 0;
    while (matched < expected) {
      final int at = pos + matched + 1;
      // bytes outside the inspected range read as 0, which is not a continuation byte
      final byte next = at < limit ? buffer[at] : 0;
      if (!isContinuationChar(next)) {
        break;
      }
      matched++;
    }
    if (matched < expected) {
      utf8Valid = false;
      break;
    }
    // skip the continuation bytes just verified
    pos += expected;
  }

  if (!sawHighBit && !sawBinary) {
    return GuessedEncoding.SEVEN_BIT;
  }
  // neither UTF-8 nor US-ASCII
  if (!utf8Valid) {
    return GuessedEncoding.INVALID_UTF8;
  }
  // No invalid UTF-8 was encountered: assume UTF-8 unless binary-looking bytes were seen,
  // otherwise the file would not be human readable.
  return sawBinary ? GuessedEncoding.BINARY : GuessedEncoding.VALID_UTF8;
}
/**
 * Looks for a byte order mark at the beginning of this toolkit's buffer.
 *
 * @return the charset implied by the BOM, or {@code null} if the buffer does not start with a known BOM.
 */
@Nullable
public Charset guessFromBOM() {
  return guessFromBOM(this.buffer);
}
/**
 * Looks for a byte order mark at the beginning of the given bytes.
 * The UTF-32 marks are tested before the UTF-16 ones because the UTF-32 LE mark
 * (FF FE 00 00) starts with the UTF-16 LE mark (FF FE).
 *
 * @param buffer the bytes to inspect.
 * @return the charset implied by the BOM, or {@code null} if the bytes do not start with a known BOM.
 */
@Nullable
public static Charset guessFromBOM(@NotNull byte[] buffer) {
  final Charset detected;
  if (hasUTF8Bom(buffer)) {
    detected = UTF8_CHARSET;
  }
  else if (hasUTF32BEBom(buffer)) {
    detected = UTF_32BE_CHARSET;
  }
  else if (hasUTF32LEBom(buffer)) {
    detected = UTF_32LE_CHARSET;
  }
  else if (hasUTF16LEBom(buffer)) {
    detected = UTF_16LE_CHARSET;
  }
  else if (hasUTF16BEBom(buffer)) {
    detected = UTF_16BE_CHARSET;
  }
  else {
    detected = null;
  }
  return detected;
}
/**
 * Guesses the encoding of the buffer, reporting this toolkit's own default charset for 8-bit content.
 *
 * @param guess_length the maximum number of leading bytes of the buffer to inspect.
 * @return the charset recognized, or {@code null} when the content looks binary.
 */
public Charset guessEncoding(int guess_length) {
  return guessEncoding(guess_length, this.defaultCharset);
}
/**
 * Guesses the encoding of a file from its first {@code bufferLength} bytes.
 *
 * @param f              the file to inspect.
 * @param bufferLength   the maximum number of bytes to read from the beginning of the file.
 * @param defaultCharset the charset to report for 8-bit content; {@code null} means the system default charset.
 * @return the charset recognized, or {@code null} when the content looks binary.
 * @throws IOException if the file cannot be opened or read.
 */
public static Charset guessEncoding(@NotNull File f, int bufferLength, Charset defaultCharset) throws IOException {
  byte[] buffer = new byte[bufferLength];
  int read;
  FileInputStream fis = new FileInputStream(f);
  try {
    read = fis.read(buffer);
  }
  finally {
    fis.close();
  }
  // read() reports -1 for an empty file
  int available = Math.max(read, 0);
  if (available < buffer.length) {
    // Drop the zero-filled tail: BOM detection looks at the whole array, and the padding would
    // turn a short file starting with the UTF-16 LE mark (FF FE) into the UTF-32 LE mark (FF FE 00 00).
    buffer = Arrays.copyOf(buffer, available);
  }
  CharsetToolkit toolkit = new CharsetToolkit(buffer, defaultCharset);
  return toolkit.guessEncoding(available);
}
/**
 * A byte of the form 10xxxxxx is a continuation byte of a multi-byte character.
 *
 * @param b a byte.
 * @return true if it's a continuation byte.
 */
private static boolean isContinuationChar(byte b) {
  // top two bits must be 10
  return (b & 0xC0) == 0x80;
}
/**
 * A byte of the form 110xxxxx is the first byte of a two-bytes sequence character.
 *
 * @param b a byte.
 * @return true if it's the first byte of a two-bytes sequence.
 */
private static boolean isTwoBytesSequence(byte b) {
  // top three bits must be 110
  return (b & 0xE0) == 0xC0;
}
/**
 * A byte of the form 1110xxxx is the first byte of a three-bytes sequence character.
 *
 * @param b a byte.
 * @return true if it's the first byte of a three-bytes sequence.
 */
private static boolean isThreeBytesSequence(byte b) {
  // top four bits must be 1110
  return (b & 0xF0) == 0xE0;
}
/**
 * A byte of the form 11110xxx is the first byte of a four-bytes sequence character.
 *
 * @param b a byte.
 * @return true if it's the first byte of a four-bytes sequence.
 */
private static boolean isFourBytesSequence(byte b) {
  // top five bits must be 11110
  return (b & 0xF8) == 0xF0;
}
/**
 * A byte of the form 111110xx is the first byte of a five-bytes sequence character.
 *
 * @param b a byte.
 * @return true if it's the first byte of a five-bytes sequence.
 */
private static boolean isFiveBytesSequence(byte b) {
  // top six bits must be 111110
  return (b & 0xFC) == 0xF8;
}
/**
 * A byte of the form 1111110x is the first byte of a six-bytes sequence character.
 *
 * @param b a byte.
 * @return true if it's the first byte of a six-bytes sequence.
 */
private static boolean isSixBytesSequence(byte b) {
  // top seven bits must be 1111110
  return (b & 0xFE) == 0xFC;
}
/**
 * Looks up the charset named by the {@link #FILE_ENCODING_PROPERTY} system property.
 *
 * @return the default <code>Charset</code>, or {@code null} when the lookup fails for any reason
 *         (for example, the property is not set or does not name a supported charset).
 */
@Nullable
public static Charset getDefaultSystemCharset() {
  try {
    final String encodingName = System.getProperty(FILE_ENCODING_PROPERTY);
    return Charset.forName(encodingName);
  }
  catch (Exception ignored) {
    // Null is OK here.
    return null;
  }
}
/**
 * Checks for the UTF-8 Byte Order Marker (written by Microsoft's Notepad and other editors).
 *
 * @param bom a buffer.
 * @return true if the buffer starts with {@link #UTF8_BOM}.
 */
public static boolean hasUTF8Bom(@NotNull byte[] bom) {
  final boolean startsWithMark = ArrayUtil.startsWith(bom, UTF8_BOM);
  return startsWithMark;
}
/**
 * Checks for the UTF-16 little-endian Byte Order Marker.
 * Note that a buffer starting with the UTF-32 little-endian marker also passes this check,
 * because that marker begins with the same two bytes.
 *
 * @param bom a buffer.
 * @return true if the buffer starts with {@link #UTF16LE_BOM}.
 */
public static boolean hasUTF16LEBom(@NotNull byte[] bom) {
  final boolean startsWithMark = ArrayUtil.startsWith(bom, UTF16LE_BOM);
  return startsWithMark;
}
/**
 * Checks for the UTF-16 big-endian Byte Order Marker.
 *
 * @param bom a buffer.
 * @return true if the buffer starts with {@link #UTF16BE_BOM}.
 */
public static boolean hasUTF16BEBom(@NotNull byte[] bom) {
  final boolean startsWithMark = ArrayUtil.startsWith(bom, UTF16BE_BOM);
  return startsWithMark;
}
/**
 * Checks for the UTF-32 big-endian Byte Order Marker.
 *
 * @param bom a buffer.
 * @return true if the buffer starts with {@link #UTF32BE_BOM}.
 */
public static boolean hasUTF32BEBom(@NotNull byte[] bom) {
  final boolean startsWithMark = ArrayUtil.startsWith(bom, UTF32BE_BOM);
  return startsWithMark;
}
/**
 * Checks for the UTF-32 little-endian Byte Order Marker.
 *
 * @param bom a buffer.
 * @return true if the buffer starts with {@link #UTF32LE_BOM}.
 */
public static boolean hasUTF32LEBom(@NotNull byte[] bom) {
  final boolean startsWithMark = ArrayUtil.startsWith(bom, UTF32LE_BOM);
  return startsWithMark;
}
/**
 * Lists every <code>Charset</code> reported by {@link Charset#availableCharsets()}.
 *
 * @return a newly allocated array of <code>Charset</code>s.
 */
@NotNull
public static Charset[] getAvailableCharsets() {
  final Collection<Charset> available = Charset.availableCharsets().values();
  final Charset[] result = new Charset[available.size()];
  return available.toArray(result);
}
/**
 * Encodes the given string as UTF-8.
 *
 * @param s the string to encode.
 * @return the UTF-8 bytes of the string, without a byte order mark.
 */
@NotNull
public static byte[] getUtf8Bytes(@NotNull String s) {
  // The Charset overload skips the lookup by name and cannot throw UnsupportedEncodingException.
  return s.getBytes(UTF8_CHARSET);
}
/**
 * Computes how many leading bytes of {@code content} make up a byte order mark.
 * <p>The UTF-8 mark is only counted when {@code charset} is a UTF-8 charset; the UTF-32 and
 * UTF-16 marks are counted regardless of {@code charset}. The UTF-32 marks are tested before
 * the UTF-16 ones because the UTF-32 LE mark (FF FE 00 00) starts with the UTF-16 LE mark (FF FE).</p>
 *
 * @param content the bytes to inspect.
 * @param charset the charset the content is going to be decoded with; may be {@code null}.
 * @return the length of the leading BOM in bytes, or 0 if there is none.
 */
public static int getBOMLength(@NotNull byte[] content, Charset charset) {
  if (charset != null && charset.name().contains(UTF8) && hasUTF8Bom(content)) {
    return UTF8_BOM.length;
  }
  if (hasUTF32BEBom(content)) {
    return UTF32BE_BOM.length;
  }
  // Must come before the UTF-16 LE check, which would otherwise match the first half of this mark.
  if (hasUTF32LEBom(content)) {
    return UTF32LE_BOM.length;
  }
  if (hasUTF16LEBom(content)) {
    return UTF16LE_BOM.length;
  }
  if (hasUTF16BEBom(content)) {
    return UTF16BE_BOM.length;
  }
  return 0;
}
/**
 * Returns the mandatory byte order mark of the given charset.
 *
 * @param charset the charset to look up.
 * @return the result of {@link #getMandatoryBom(java.nio.charset.Charset)} for the charset.
 * @deprecated use {@link CharsetToolkit#getMandatoryBom(java.nio.charset.Charset)}
 */
@Deprecated
@Nullable
public static byte[] getBom(@NotNull Charset charset) {
  return getMandatoryBom(charset);
}
/**
 * Looks up the byte order mark that content in the given charset is required to start with.
 * Currently such marks are registered for the UTF-16xx and UTF-32xx families.
 * UTF-8, on the other hand, has the optional BOM {@link #UTF8_BOM}, which is never returned by this method.
 *
 * @param charset the charset to look up.
 * @return the BOM registered for the charset (the stored array itself, not a copy),
 *         or {@code null} if the charset has no mandatory BOM.
 */
@Nullable
public static byte[] getMandatoryBom(@NotNull Charset charset) {
  final byte[] registered = CHARSET_TO_MANDATORY_BOM.get(charset);
  return registered;
}
/**
 * Tells whether a byte sequence in the given charset is allowed to be prepended with the given BOM:
 * either the charset is UTF-8 and the BOM is {@link #UTF8_BOM}, or the BOM is the charset's mandatory one.
 *
 * @param charset the charset of the content.
 * @param bom     the candidate byte order mark.
 * @return true if the BOM is acceptable for the charset.
 */
public static boolean canHaveBom(@NotNull Charset charset, @NotNull byte[] bom) {
  // optional UTF-8 mark
  if (charset.equals(UTF8_CHARSET) && Arrays.equals(bom, UTF8_BOM)) {
    return true;
  }
  // mandatory mark of the UTF-16xx / UTF-32xx families
  return Arrays.equals(getMandatoryBom(charset), bom);
}
/**
 * Null-tolerant, non-throwing variant of {@link Charset#forName(String)}.
 *
 * @param name a charset name; may be {@code null}.
 * @return the charset with that name, or {@code null} if the name is {@code null},
 *         illegal, or names a charset that is not supported.
 */
@Nullable
public static Charset forName(@Nullable String name) {
  if (name == null) {
    return null;
  }
  try {
    return Charset.forName(name);
  }
  catch (IllegalCharsetNameException ignored) {
    // malformed name: report "no such charset"
    return null;
  }
  catch (UnsupportedCharsetException ignored) {
    // well-formed name that is not supported: report "no such charset"
    return null;
  }
}
}