| /* |
| * Copyright 2000-2014 JetBrains s.r.o. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package com.intellij.openapi.vfs; |
| |
| import com.intellij.util.ArrayUtil; |
| import gnu.trove.THashMap; |
| import org.jetbrains.annotations.NonNls; |
| import org.jetbrains.annotations.NotNull; |
| import org.jetbrains.annotations.Nullable; |
| |
| import java.io.*; |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.charset.Charset; |
| import java.nio.charset.IllegalCharsetNameException; |
| import java.nio.charset.UnsupportedCharsetException; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Map; |
| |
| /** |
| * <p>Utility class to guess the encoding of a given byte array. |
| * The guess is unfortunately not 100% sure. Especially for 8-bit charsets. |
| * It's not possible to know which 8-bit charset is used. |
| * We will then infer that the charset encountered is the same as the default standard charset.</p> |
| * |
| * <p>On the other hand, Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files |
| * with a Byte Order Marker are easy to find. For UTF-8 files with no BOM, if the buffer |
| * is wide enough, it's easy to guess.</p> |
| * |
| * <p>Tested against a complicated UTF-8 file, Sun's implementation does not render bad UTF-8 |
| * constructs as expected by the specification. But with a buffer wide enough, the method guessEncoding() |
| * did behave correctly and recognized the UTF-8 charset.</p> |
| * |
| * <p>A byte buffer of 4KB or 8KB is sufficient to guess the encoding.</p> |
| * |
| * <p>Usage:</p> |
| * <pre> |
| * // guess the encoding |
| * Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096, Charset.defaultCharset()); |
| * |
| * // create a reader with the charset we've just discovered |
| * FileInputStream fis = new FileInputStream(file); |
| * InputStreamReader isr = new InputStreamReader(fis, guessedCharset); |
| * BufferedReader br = new BufferedReader(isr); |
| * |
| * // read the file content |
| * String line; |
| * while ((line = br.readLine())!= null) |
| * { |
| * System.out.println(line); |
| * } |
| * </pre> |
| * <p>An interesting improvement would be to create a custom <code>InputStream</code> that has a |
| * method discovering the <code>Charset</code> of the underlying file. Thus, we would not have to |
| * read the beginning of the file twice: once for guessing the encoding, the second time for reading |
| * its content. Therefore, we could wrap this stream in an <code>InputStreamReader</code>.</p> |
| * |
| * <p>Date: 18 juil. 2002</p> |
| * @author Guillaume LAFORGE |
| */ |
| public class CharsetToolkit { |
| @NonNls public static final String UTF8 = "UTF-8"; |
| public static final Charset UTF8_CHARSET = Charset.forName(UTF8); |
| public static final Charset UTF_16LE_CHARSET = Charset.forName("UTF-16LE"); |
| public static final Charset UTF_16BE_CHARSET = Charset.forName("UTF-16BE"); |
| public static final Charset UTF_32BE_CHARSET = Charset.forName("UTF-32BE"); |
| public static final Charset UTF_32LE_CHARSET = Charset.forName("UTF-32LE"); |
| public static final Charset UTF_16_CHARSET = Charset.forName("UTF-16"); |
| private static final byte FF = (byte)0xff; |
| private static final byte FE = (byte)0xfe; |
| private static final byte EF = (byte)0xef; |
| private static final byte BB = (byte)0xbb; |
| private static final byte BF = (byte)0xbf; |
| private static final int BINARY_THRESHOLD = 9; // characters with codes below this considered to be binary |
| |
| private final byte[] buffer; |
| private final Charset defaultCharset; |
| private boolean enforce8Bit = false; |
| |
| public static final byte[] UTF8_BOM = {0xffffffef, 0xffffffbb, 0xffffffbf}; |
| public static final byte[] UTF16LE_BOM = {-1, -2, }; |
| public static final byte[] UTF16BE_BOM = {-2, -1, }; |
| public static final byte[] UTF32BE_BOM = {0, 0, -2, -1, }; |
| public static final byte[] UTF32LE_BOM = {-1, -2, 0, 0 }; |
| @NonNls public static final String FILE_ENCODING_PROPERTY = "file.encoding"; |
| |
| @NonNls private static final Map<Charset, byte[]> CHARSET_TO_MANDATORY_BOM = new THashMap<Charset, byte[]>(2); |
| static { |
| CHARSET_TO_MANDATORY_BOM.put(UTF_16LE_CHARSET, UTF16LE_BOM); |
| CHARSET_TO_MANDATORY_BOM.put(UTF_16BE_CHARSET, UTF16BE_BOM); |
| CHARSET_TO_MANDATORY_BOM.put(UTF_32BE_CHARSET, UTF32BE_BOM); |
| CHARSET_TO_MANDATORY_BOM.put(UTF_32LE_CHARSET, UTF32LE_BOM); |
| } |
| |
/**
 * Creates a toolkit for the given bytes. The charset returned by
 * {@link #getDefaultSystemCharset()} is used as the default charset.
 *
 * @param buffer the bytes whose encoding is to be guessed; the array is stored, not copied.
 */
public CharsetToolkit(@NotNull byte[] buffer) {
  // A null default makes the two-argument constructor fall back to the system charset.
  this(buffer, null);
}
| |
/**
 * Creates a toolkit for the given bytes with an explicit default charset.
 *
 * @param buffer         the bytes whose encoding is to be guessed; the array is stored, not copied.
 * @param defaultCharset the charset to report when the content looks like an 8-bit encoding;
 *                       when {@code null}, {@link #getDefaultSystemCharset()} is used instead.
 */
public CharsetToolkit(@NotNull byte[] buffer, Charset defaultCharset) {
  this.buffer = buffer;
  if (defaultCharset != null) {
    this.defaultCharset = defaultCharset;
  }
  else {
    this.defaultCharset = getDefaultSystemCharset();
  }
}
| |
| @NotNull |
| public static InputStream inputStreamSkippingBOM(@NotNull InputStream stream) throws IOException { |
| assert stream.markSupported() :stream; |
| stream.mark(4); |
| boolean mustReset = true; |
| try { |
| int ret = stream.read(); |
| if (ret == -1) { |
| return stream; // no bom |
| } |
| byte b0 = (byte)ret; |
| if (b0 != EF && b0 != FF && b0 != FE && b0 != 0) return stream; // no bom |
| |
| ret = stream.read(); |
| if (ret == -1) { |
| return stream; // no bom |
| } |
| byte b1 = (byte)ret; |
| if (b0 == FF && b1 == FE) { |
| stream.mark(2); |
| ret = stream.read(); |
| if (ret == -1) { |
| return stream; // utf-16 LE |
| } |
| byte b2 = (byte)ret; |
| if (b2 != 0) { |
| return stream; // utf-16 LE |
| } |
| ret = stream.read(); |
| if (ret == -1) { |
| return stream; |
| } |
| byte b3 = (byte)ret; |
| if (b3 != 0) { |
| return stream; // utf-16 LE |
| } |
| |
| // utf-32 LE |
| mustReset = false; |
| return stream; |
| } |
| if (b0 == FE && b1 == FF) { |
| mustReset = false; |
| return stream; // utf-16 BE |
| } |
| if (b0 == EF && b1 == BB) { |
| ret = stream.read(); |
| if (ret == -1) { |
| return stream; // no bom |
| } |
| byte b2 = (byte)ret; |
| if (b2 == BF) { |
| mustReset = false; |
| return stream; // utf-8 bom |
| } |
| |
| // no bom |
| return stream; |
| } |
| |
| if (b0 == 0 && b1 == 0) { |
| ret = stream.read(); |
| if (ret == -1) { |
| return stream; // no bom |
| } |
| byte b2 = (byte)ret; |
| if (b2 != FE) { |
| return stream; // no bom |
| } |
| ret = stream.read(); |
| if (ret == -1) { |
| return stream; // no bom |
| } |
| byte b3 = (byte)ret; |
| if (b3 != FF) { |
| return stream; // no bom |
| } |
| |
| mustReset = false; |
| return stream; // UTF-32 BE |
| } |
| |
| // no bom |
| return stream; |
| } |
| finally { |
| if (mustReset) stream.reset(); |
| } |
| } |
| |
| /** |
| * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII. |
| * It might be a file without any special character in the range 128-255, but that may be or become |
| * a file encoded with the default <code>charset</code> rather than US-ASCII. |
| * |
| * @param enforce a boolean specifying the use or not of US-ASCII. |
| */ |
| public void setEnforce8Bit(boolean enforce) { |
| enforce8Bit = enforce; |
| } |
| |
| /** |
| * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding. |
| * |
| * @return a boolean representing the flag of use of US-ASCII. |
| */ |
| public boolean getEnforce8Bit() { |
| return enforce8Bit; |
| } |
| |
| /** |
| * Retrieves the default Charset |
| */ |
| public Charset getDefaultCharset() { |
| return defaultCharset; |
| } |
| |
| /** |
| * <p>Guess the encoding of the provided buffer.</p> |
| * If Byte Order Markers are encountered at the beginning of the buffer, we immediately |
| * return the charset implied by this BOM. Otherwise, the file would not be a human |
| * readable text file.</p> |
| * |
| * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not. |
| * If it is not UTF-8, we assume the encoding is the default system encoding |
| * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p> |
| * |
| * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p> |
| * <pre> |
| * UCS-4 range (hex.) UTF-8 octet sequence (binary) |
| * 0000 0000-0000 007F 0xxxxxxx |
| * 0000 0080-0000 07FF 110xxxxx 10xxxxxx |
| * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx |
| * 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * 0400 0000-7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * </pre> |
| * <p>With UTF-8, 0xFE and 0xFF never appear.</p> |
| * |
| * @return the Charset recognized. |
| */ |
| public Charset guessEncoding(int guess_length, Charset defaultCharset) { |
| // if the file has a Byte Order Marker, we can assume the file is in UTF-xx |
| // otherwise, the file would not be human readable |
| Charset charset = guessFromBOM(); |
| if (charset != null) return charset; |
| GuessedEncoding encoding = guessFromContent(guess_length); |
| switch (encoding) { |
| case SEVEN_BIT: |
| // if no byte with an high order bit set, the encoding is US-ASCII |
| // (it might have been UTF-7, but this encoding is usually internally used only by mail systems) |
| // returns the default charset rather than US-ASCII if the enforce8Bit flag is set. |
| return enforce8Bit ? defaultCharset : Charset.forName("US-ASCII"); |
| case INVALID_UTF8: |
| return defaultCharset; |
| case VALID_UTF8: |
| return UTF8_CHARSET; |
| case BINARY: |
| break; |
| default: |
| break; |
| } |
| return null; |
| } |
| |
| @NotNull |
| public static String bytesToString(@NotNull byte[] bytes, @NotNull final Charset defaultCharset) { |
| Charset charset = new CharsetToolkit(bytes, defaultCharset).guessEncoding(bytes.length); |
| if (charset == null) charset = defaultCharset; // binary content. This is silly but method contract says to return something anyway |
| int bomLength = getBOMLength(bytes, charset); |
| final CharBuffer charBuffer = charset.decode(ByteBuffer.wrap(bytes, bomLength, bytes.length - bomLength)); |
| return charBuffer.toString(); |
| } |
| |
/** Classification of buffer content produced by {@link #guessFromContent(int)}. */
public enum GuessedEncoding {
  /** ASCII: no byte with the high-order bit set and no byte in the binary range was seen. */
  SEVEN_BIT,
  /** UTF-8: bytes with the high-order bit set were seen and all of them formed well-formed sequences. */
  VALID_UTF8,
  /** Invalid UTF: a byte with the high-order bit set was seen that is not part of a well-formed sequence. */
  INVALID_UTF8,
  /** Binary: a byte in the binary range was seen and no malformed sequence was found. */
  BINARY
}
| |
| @NotNull |
| public GuessedEncoding guessFromContent(int guess_length) { |
| // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding |
| // otherwise, the file is in US-ASCII |
| boolean highOrderBit = false; |
| |
| // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid |
| // if it's not the case, we can assume the encoding is the default encoding of the system |
| boolean validU8Char = true; |
| |
| // true if char bytes < BINARY_THRESHOLD occurred |
| boolean hasBinary = false; |
| |
| int length = Math.min(buffer.length, guess_length); |
| int i = 0; |
| while (i < length) { |
| byte b0 = buffer[i]; |
| byte b1 = i + 1 >= length ? 0 : buffer[i + 1]; |
| byte b2 = i + 2 >= length ? 0 : buffer[i + 2]; |
| byte b3 = i + 3 >= length ? 0 : buffer[i + 3]; |
| byte b4 = i + 4 >= length ? 0 : buffer[i + 4]; |
| byte b5 = i + 5 >= length ? 0 : buffer[i + 5]; |
| if (b0 < 0) { |
| // a high order bit was encountered, thus the encoding is not US-ASCII |
| // it may be either an 8-bit encoding or UTF-8 |
| highOrderBit = true; |
| // a two-bytes sequence was encountered |
| if (isTwoBytesSequence(b0)) { |
| // there must be one continuation byte of the form 10xxxxxx, |
| // otherwise the following characters is not a valid UTF-8 construct |
| if (!isContinuationChar(b1)) { |
| validU8Char = false; |
| } |
| else { |
| i++; |
| } |
| } |
| // a three-bytes sequence was encountered |
| else if (isThreeBytesSequence(b0)) { |
| // there must be two continuation bytes of the form 10xxxxxx, |
| // otherwise the following characters is not a valid UTF-8 construct |
| if (!(isContinuationChar(b1) && isContinuationChar(b2))) { |
| validU8Char = false; |
| } |
| else { |
| i += 2; |
| } |
| } |
| // a four-bytes sequence was encountered |
| else if (isFourBytesSequence(b0)) { |
| // there must be three continuation bytes of the form 10xxxxxx, |
| // otherwise the following characters is not a valid UTF-8 construct |
| if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3))) { |
| validU8Char = false; |
| } |
| else { |
| i += 3; |
| } |
| } |
| // a five-bytes sequence was encountered |
| else if (isFiveBytesSequence(b0)) { |
| // there must be four continuation bytes of the form 10xxxxxx, |
| // otherwise the following characters is not a valid UTF-8 construct |
| if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4))) { |
| validU8Char = false; |
| } |
| else { |
| i += 4; |
| } |
| } |
| // a six-bytes sequence was encountered |
| else if (isSixBytesSequence(b0)) { |
| // there must be five continuation bytes of the form 10xxxxxx, |
| // otherwise the following characters is not a valid UTF-8 construct |
| if (!(isContinuationChar(b1) && |
| isContinuationChar(b2) && |
| isContinuationChar(b3) && |
| isContinuationChar(b4) && |
| isContinuationChar(b5))) { |
| validU8Char = false; |
| } |
| else { |
| i += 5; |
| } |
| } |
| else { |
| validU8Char = false; |
| } |
| } |
| else if (b0 < BINARY_THRESHOLD) { |
| hasBinary = true; |
| } |
| if (!validU8Char) break; |
| i++; |
| } |
| |
| if (!highOrderBit && !hasBinary) { |
| return GuessedEncoding.SEVEN_BIT; |
| } |
| // finally, if it's not UTF-8 nor US-ASCII |
| if (!validU8Char) return GuessedEncoding.INVALID_UTF8; |
| if (hasBinary) return GuessedEncoding.BINARY; |
| |
| // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8, |
| // otherwise the file would not be human readable |
| return GuessedEncoding.VALID_UTF8; |
| } |
| |
| @Nullable |
| public Charset guessFromBOM() { |
| return guessFromBOM(buffer); |
| } |
| |
| @Nullable |
| public static Charset guessFromBOM(@NotNull byte[] buffer) { |
| if (hasUTF8Bom(buffer)) return UTF8_CHARSET; |
| if (hasUTF32BEBom(buffer)) return UTF_32BE_CHARSET; |
| if (hasUTF32LEBom(buffer)) return UTF_32LE_CHARSET; |
| if (hasUTF16LEBom(buffer)) return UTF_16LE_CHARSET; |
| if (hasUTF16BEBom(buffer)) return UTF_16BE_CHARSET; |
| |
| return null; |
| } |
| |
| public Charset guessEncoding(int guess_length) { |
| return guessEncoding(guess_length, defaultCharset); |
| } |
| |
| public static Charset guessEncoding(@NotNull File f, int bufferLength, Charset defaultCharset) throws IOException { |
| byte[] buffer = new byte[bufferLength]; |
| int read; |
| FileInputStream fis = new FileInputStream(f); |
| try { |
| read = fis.read(buffer); |
| } |
| finally { |
| fis.close(); |
| } |
| CharsetToolkit toolkit = new CharsetToolkit(buffer, defaultCharset); |
| return toolkit.guessEncoding(read); |
| } |
| |
| /** |
| * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character; |
| * |
| * @param b a byte. |
| * @return true if it's a continuation char. |
| */ |
| private static boolean isContinuationChar(byte b) { |
| return -128 <= b && b <= -65; |
| } |
| |
| /** |
| * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character. |
| * |
| * @param b a byte. |
| * @return true if it's the first byte of a two-bytes sequence. |
| */ |
| private static boolean isTwoBytesSequence(byte b) { |
| return -64 <= b && b <= -33; |
| } |
| |
| /** |
| * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character. |
| * |
| * @param b a byte. |
| * @return true if it's the first byte of a three-bytes sequence. |
| */ |
| private static boolean isThreeBytesSequence(byte b) { |
| return -32 <= b && b <= -17; |
| } |
| |
| /** |
| * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character. |
| * |
| * @param b a byte. |
| * @return true if it's the first byte of a four-bytes sequence. |
| */ |
| private static boolean isFourBytesSequence(byte b) { |
| return -16 <= b && b <= -9; |
| } |
| |
| /** |
| * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character. |
| * |
| * @param b a byte. |
| * @return true if it's the first byte of a five-bytes sequence. |
| */ |
| private static boolean isFiveBytesSequence(byte b) { |
| return -8 <= b && b <= -5; |
| } |
| |
| /** |
| * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character. |
| * |
| * @param b a byte. |
| * @return true if it's the first byte of a six-bytes sequence. |
| */ |
| private static boolean isSixBytesSequence(byte b) { |
| return -4 <= b && b <= -3; |
| } |
| |
| /** |
| * Retrieve the default charset of the system. |
| * |
| * @return the default <code>Charset</code>. |
| */ |
| @Nullable |
| public static Charset getDefaultSystemCharset() { |
| Charset charset = null; |
| try { |
| charset = Charset.forName(System.getProperty(FILE_ENCODING_PROPERTY)); |
| } catch (Exception ignored) { |
| // Null is OK here. |
| } |
| |
| return charset; |
| } |
| |
| /** |
| * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors). |
| * |
| * @param bom a buffer. |
| * @return true if the buffer has a BOM for UTF8. |
| */ |
| public static boolean hasUTF8Bom(@NotNull byte[] bom) { |
| return ArrayUtil.startsWith(bom, UTF8_BOM); |
| } |
| |
| /** |
| * Has a Byte Order Marker for UTF-16 Low Endian |
| * (ucs-2le, ucs-4le, and ucs-16le). |
| * |
| * @param bom a buffer. |
| * @return true if the buffer has a BOM for UTF-16 Low Endian. |
| */ |
| public static boolean hasUTF16LEBom(@NotNull byte[] bom) { |
| return ArrayUtil.startsWith(bom, UTF16LE_BOM); |
| } |
| |
| /** |
| * Has a Byte Order Marker for UTF-16 Big Endian |
| * (utf-16 and ucs-2). |
| * |
| * @param bom a buffer. |
| * @return true if the buffer has a BOM for UTF-16 Big Endian. |
| */ |
| public static boolean hasUTF16BEBom(@NotNull byte[] bom) { |
| return ArrayUtil.startsWith(bom, UTF16BE_BOM); |
| } |
| public static boolean hasUTF32BEBom(@NotNull byte[] bom) { |
| return ArrayUtil.startsWith(bom, UTF32BE_BOM); |
| } |
| public static boolean hasUTF32LEBom(@NotNull byte[] bom) { |
| return ArrayUtil.startsWith(bom, UTF32LE_BOM); |
| } |
| |
| /** |
| * Retrieves all the available <code>Charset</code>s on the platform, |
| * among which the default <code>charset</code>. |
| * |
| * @return an array of <code>Charset</code>s. |
| */ |
| @NotNull |
| public static Charset[] getAvailableCharsets() { |
| Collection<Charset> collection = Charset.availableCharsets().values(); |
| return collection.toArray(new Charset[collection.size()]); |
| } |
| |
| @NotNull |
| public static byte[] getUtf8Bytes(@NotNull String s) { |
| try { |
| return s.getBytes(UTF8); |
| } |
| catch (UnsupportedEncodingException e) { |
| throw new RuntimeException("UTF-8 must be supported", e); |
| } |
| } |
| |
| public static int getBOMLength(@NotNull byte[] content, Charset charset) { |
| if (charset != null && charset.name().contains(UTF8) && hasUTF8Bom(content)) { |
| return UTF8_BOM.length; |
| } |
| if (hasUTF32BEBom(content)) { |
| return UTF32BE_BOM.length; |
| } |
| if (hasUTF32BEBom(content)) { |
| return UTF32BE_BOM.length; |
| } |
| if (hasUTF16LEBom(content)) { |
| return UTF16LE_BOM.length; |
| } |
| if (hasUTF16BEBom(content)) { |
| return UTF16BE_BOM.length; |
| } |
| return 0; |
| } |
| |
| /** |
| * @deprecated use {@link CharsetToolkit#getMandatoryBom(java.nio.charset.Charset)} |
| */ |
| @Nullable |
| public static byte[] getBom(@NotNull Charset charset) { |
| return getMandatoryBom(charset); |
| } |
| |
| /** |
| * @return BOM which is associated with this charset and the charset must have this BOM, or null otherwise. |
| * Currently these are UTF-16xx and UTF-32xx families. |
| * UTF-8, on the other hand, might have BOM {@link #UTF8_BOM} which is optional, thus it will not returned in this method |
| */ |
| @Nullable |
| public static byte[] getMandatoryBom(@NotNull Charset charset) { |
| return CHARSET_TO_MANDATORY_BOM.get(charset); |
| } |
| |
| // byte sequence for this encoding is allowed to be prepended with this BOM |
| public static boolean canHaveBom(@NotNull Charset charset, @NotNull byte[] bom) { |
| return charset.equals(UTF8_CHARSET) && Arrays.equals(bom, UTF8_BOM) |
| || Arrays.equals(getMandatoryBom(charset), bom); |
| } |
| |
| @Nullable |
| public static Charset forName(@Nullable String name) { |
| Charset charset = null; |
| if (name != null) { |
| try { |
| charset = Charset.forName(name); |
| } |
| catch (IllegalCharsetNameException ignored) { |
| //ignore |
| } |
| catch(UnsupportedCharsetException ignored){ |
| //ignore |
| } |
| } |
| |
| return charset; |
| } |
| } |