| /* |
| * Copyright 2000-2014 JetBrains s.r.o. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package com.intellij.openapi.fileEditor.impl; |
| |
| import com.intellij.lang.properties.charset.Native2AsciiCharset; |
| import com.intellij.openapi.fileTypes.*; |
| import com.intellij.openapi.project.Project; |
| import com.intellij.openapi.util.Key; |
| import com.intellij.openapi.util.Pair; |
| import com.intellij.openapi.util.Trinity; |
| import com.intellij.openapi.util.text.StringUtil; |
| import com.intellij.openapi.vfs.CharsetToolkit; |
| import com.intellij.openapi.vfs.VirtualFile; |
| import com.intellij.openapi.vfs.encoding.EncodingRegistry; |
| import com.intellij.testFramework.LightVirtualFile; |
| import com.intellij.util.ArrayUtil; |
| import com.intellij.util.text.CharArrayUtil; |
| import org.jetbrains.annotations.Nls; |
| import org.jetbrains.annotations.NotNull; |
| import org.jetbrains.annotations.Nullable; |
| |
| import java.io.IOException; |
| import java.io.OutputStream; |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.charset.Charset; |
| import java.nio.charset.UnsupportedCharsetException; |
| |
| public final class LoadTextUtil { |
/** Reason recorded on a file when its charset was determined from a byte order mark. */
@Nls
private static final String AUTO_DETECTED_FROM_BOM = "auto-detected from BOM";

/** Holder of static helpers only; the private constructor prevents instantiation. */
private LoadTextUtil() {
  // no instances
}
| |
| @NotNull |
| private static Pair<CharSequence, String> convertLineSeparators(@NotNull CharBuffer buffer) { |
| int dst = 0; |
| char prev = ' '; |
| int crCount = 0; |
| int lfCount = 0; |
| int crlfCount = 0; |
| |
| final int length = buffer.length(); |
| final char[] bufferArray = CharArrayUtil.fromSequenceWithoutCopying(buffer); |
| |
| for (int src = 0; src < length; src++) { |
| char c = bufferArray != null ? bufferArray[src]:buffer.charAt(src); |
| switch (c) { |
| case '\r': |
| if(bufferArray != null) bufferArray[dst++] = '\n'; |
| else buffer.put(dst++, '\n'); |
| crCount++; |
| break; |
| case '\n': |
| if (prev == '\r') { |
| crCount--; |
| crlfCount++; |
| } |
| else { |
| if(bufferArray != null) bufferArray[dst++] = '\n'; |
| else buffer.put(dst++, '\n'); |
| lfCount++; |
| } |
| break; |
| default: |
| if(bufferArray != null) bufferArray[dst++] = c; |
| else buffer.put(dst++, c); |
| break; |
| } |
| prev = c; |
| } |
| |
| String detectedLineSeparator = null; |
| if (crlfCount > crCount && crlfCount > lfCount) { |
| detectedLineSeparator = "\r\n"; |
| } |
| else if (crCount > lfCount) { |
| detectedLineSeparator = "\r"; |
| } |
| else if (lfCount > 0) { |
| detectedLineSeparator = "\n"; |
| } |
| |
| CharSequence result; |
| if (buffer.length() == dst) { |
| result = buffer; |
| } |
| else { |
| // in Mac JDK CharBuffer.subSequence() signature differs from Oracle's |
| // more than that, the signature has changed between jd6 and jdk7, |
| // so use more generic CharSequence.subSequence() just in case |
| @SuppressWarnings("UnnecessaryLocalVariable") CharSequence seq = buffer; |
| result = seq.subSequence(0, dst); |
| } |
| return Pair.create(result, detectedLineSeparator); |
| } |
| |
| public static Charset detectCharset(@NotNull VirtualFile virtualFile, @NotNull byte[] content, @NotNull FileType fileType) { |
| Charset charset = null; |
| |
| Trinity<Charset,CharsetToolkit.GuessedEncoding, byte[]> guessed = guessFromContent(virtualFile, content, content.length); |
| if (guessed != null && guessed.first != null) { |
| charset = guessed.first; |
| } |
| else { |
| String charsetName = fileType.getCharset(virtualFile, content); |
| |
| if (charsetName == null) { |
| Charset specifiedExplicitly = EncodingRegistry.getInstance().getEncoding(virtualFile, true); |
| if (specifiedExplicitly != null) { |
| charset = specifiedExplicitly; |
| } |
| } |
| else { |
| charset = CharsetToolkit.forName(charsetName); |
| } |
| } |
| |
| charset = charset == null ? EncodingRegistry.getInstance().getDefaultCharset() : charset; |
| if (fileType.getName().equals("Properties") && EncodingRegistry.getInstance().isNative2AsciiForPropertiesFiles()) { |
| charset = Native2AsciiCharset.wrap(charset); |
| } |
| virtualFile.setCharset(charset); |
| return charset; |
| } |
| |
| @NotNull |
| public static Charset detectCharsetAndSetBOM(@NotNull VirtualFile virtualFile, @NotNull byte[] content) { |
| return doDetectCharsetAndSetBOM(virtualFile, content, true).getFirst(); |
| } |
| |
| @NotNull |
| private static Pair<Charset, byte[]> doDetectCharsetAndSetBOM(@NotNull VirtualFile virtualFile, @NotNull byte[] content, boolean saveBOM) { |
| return doDetectCharsetAndSetBOM(virtualFile, content, saveBOM, virtualFile.getFileType()); |
| } |
| @NotNull |
| private static Pair<Charset, byte[]> doDetectCharsetAndSetBOM(@NotNull VirtualFile virtualFile, @NotNull byte[] content, boolean saveBOM, @NotNull FileType fileType) { |
| Charset charset = virtualFile.isCharsetSet() ? virtualFile.getCharset() : detectCharset(virtualFile, content,fileType); |
| Pair<Charset,byte[]> bomAndCharset = getBOMAndCharset(content, charset); |
| final byte[] bom = bomAndCharset.second; |
| if (saveBOM && bom != null && bom.length != 0) { |
| virtualFile.setBOM(bom); |
| setCharsetWasDetectedFromBytes(virtualFile, AUTO_DETECTED_FROM_BOM); |
| } |
| return bomAndCharset; |
| } |
| |
| private static final boolean GUESS_UTF = Boolean.parseBoolean(System.getProperty("idea.guess.utf.encoding", "true")); |
| |
| @Nullable("null means no luck, otherwise it's tuple(guessed encoding, hint about content if was unable to guess, BOM)") |
| public static Trinity<Charset, CharsetToolkit.GuessedEncoding, byte[]> guessFromContent(@NotNull VirtualFile virtualFile, @NotNull byte[] content, int length) { |
| CharsetToolkit toolkit = GUESS_UTF ? new CharsetToolkit(content, EncodingRegistry.getInstance().getDefaultCharset()) : null; |
| String detectedFromBytes = null; |
| try { |
| if (GUESS_UTF) { |
| toolkit.setEnforce8Bit(true); |
| Charset charset = toolkit.guessFromBOM(); |
| if (charset != null) { |
| detectedFromBytes = AUTO_DETECTED_FROM_BOM; |
| byte[] bom = CharsetToolkit.getMandatoryBom(charset); |
| if (bom == null) bom = CharsetToolkit.UTF8_BOM; |
| return Trinity.create(charset, null, bom); |
| } |
| CharsetToolkit.GuessedEncoding guessed = toolkit.guessFromContent(length); |
| if (guessed == CharsetToolkit.GuessedEncoding.VALID_UTF8) { |
| detectedFromBytes = "auto-detected from bytes"; |
| return Trinity.create(CharsetToolkit.UTF8_CHARSET, guessed, null); //UTF detected, ignore all directives |
| } |
| if (guessed == CharsetToolkit.GuessedEncoding.SEVEN_BIT) { |
| return Trinity.create(null, guessed, null); |
| } |
| } |
| return null; |
| } |
| finally { |
| setCharsetWasDetectedFromBytes(virtualFile, detectedFromBytes); |
| } |
| } |
| |
| @NotNull |
| private static Pair<Charset,byte[]> getBOMAndCharset(@NotNull byte[] content, final Charset charset) { |
| if (charset != null && charset.name().contains(CharsetToolkit.UTF8) && CharsetToolkit.hasUTF8Bom(content)) { |
| return Pair.create(charset, CharsetToolkit.UTF8_BOM); |
| } |
| try { |
| Charset fromBOM = CharsetToolkit.guessFromBOM(content); |
| if (fromBOM != null) { |
| return Pair.create(fromBOM, CharsetToolkit.getMandatoryBom(fromBOM)); |
| } |
| } |
| catch (UnsupportedCharsetException ignore) { |
| } |
| |
| return Pair.create(charset, ArrayUtil.EMPTY_BYTE_ARRAY); |
| } |
| |
| public static void changeLineSeparators(@Nullable Project project, |
| @NotNull VirtualFile file, |
| @NotNull String newSeparator, |
| @NotNull Object requestor) throws IOException |
| { |
| CharSequence currentText = getTextByBinaryPresentation(file.contentsToByteArray(), file, true, false); |
| String currentSeparator = detectLineSeparator(file, false); |
| if (newSeparator.equals(currentSeparator)) { |
| return; |
| } |
| String newText = StringUtil.convertLineSeparators(currentText.toString(), newSeparator); |
| |
| file.setDetectedLineSeparator(newSeparator); |
| write(project, file, requestor, newText, -1); |
| } |
| |
| /** |
| * Overwrites file with text and sets modification stamp and time stamp to the specified values. |
| * <p/> |
| * Normally you should not use this method. |
| * |
| * @param requestor any object to control who called this method. Note that |
| * it is considered to be an external change if <code>requestor</code> is <code>null</code>. |
| * See {@link com.intellij.openapi.vfs.VirtualFileEvent#getRequestor} |
| * @param newModificationStamp new modification stamp or -1 if no special value should be set @return <code>Writer</code> |
| * @throws java.io.IOException if an I/O error occurs |
| * @see VirtualFile#getModificationStamp() |
| */ |
| public static void write(@Nullable Project project, |
| @NotNull VirtualFile virtualFile, |
| @NotNull Object requestor, |
| @NotNull String text, |
| long newModificationStamp) throws IOException { |
| Charset existing = virtualFile.getCharset(); |
| Pair<Charset, byte[]> chosen = charsetForWriting(project, virtualFile, text, existing); |
| Charset charset = chosen.first; |
| byte[] buffer = chosen.second; |
| if (charset != null) { |
| if (!charset.equals(existing)) { |
| virtualFile.setCharset(charset); |
| } |
| } |
| setDetectedFromBytesFlagBack(virtualFile, buffer); |
| |
| OutputStream outputStream = virtualFile.getOutputStream(requestor, newModificationStamp, -1); |
| try { |
| outputStream.write(buffer); |
| } |
| finally { |
| outputStream.close(); |
| } |
| } |
| |
| @NotNull |
| private static Pair<Charset, byte[]> charsetForWriting(@Nullable Project project, |
| @NotNull VirtualFile virtualFile, |
| @NotNull String text, |
| @Nullable Charset existing) { |
| Charset specified = extractCharsetFromFileContent(project, virtualFile, text); |
| Pair<Charset, byte[]> chosen = chooseMostlyHarmlessCharset(existing, specified, text); |
| Charset charset = chosen.first; |
| |
| // in case of "UTF-16", OutputStreamWriter sometimes adds BOM on it's own. |
| // see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6800103 |
| byte[] bom = virtualFile.getBOM(); |
| Charset fromBom = bom == null ? null : CharsetToolkit.guessFromBOM(bom); |
| if (fromBom != null && !fromBom.equals(charset)) { |
| chosen = Pair.create(fromBom, toBytes(text, fromBom)); |
| } |
| return chosen; |
| } |
| |
| public static void setDetectedFromBytesFlagBack(@NotNull VirtualFile virtualFile, @NotNull byte[] content) { |
| if (virtualFile.getBOM() == null) { |
| guessFromContent(virtualFile, content, content.length); |
| } |
| else { |
| // prevent file to be reloaded in other encoding after save with BOM |
| setCharsetWasDetectedFromBytes(virtualFile, AUTO_DETECTED_FROM_BOM); |
| } |
| } |
| |
| @NotNull |
| public static Pair<Charset, byte[]> chooseMostlyHarmlessCharset(Charset existing, Charset specified, @NotNull String text) { |
| try { |
| if (existing == null) return Pair.create(specified, toBytes(text, specified)); |
| if (specified == null || specified.equals(existing)) return Pair.create(specified, toBytes(text, existing)); |
| |
| byte[] out = isSupported(specified, text); |
| if (out != null) return Pair.create(specified, out); //if explicitly specified encoding is safe, return it |
| out = isSupported(existing, text); |
| if (out != null) return Pair.create(existing, out); //otherwise stick to the old encoding if it's ok |
| return Pair.create(specified, toBytes(text, specified)); //if both are bad there is no difference |
| } |
| catch (RuntimeException e) { |
| return Pair.create(Charset.defaultCharset(), toBytes(text, null)); //if both are bad and there is no hope, use the default charset |
| } |
| } |
| |
| @NotNull |
| private static byte[] toBytes(@NotNull String text, @Nullable Charset charset) throws RuntimeException { |
| //noinspection SSBasedInspection |
| return charset == null ? text.getBytes() : text.getBytes(charset); |
| } |
| |
| @Nullable("null means not supported, otherwise it is converted byte stream") |
| private static byte[] isSupported(@NotNull Charset charset, @NotNull String str) { |
| try { |
| if (!charset.canEncode()) return null; |
| byte[] bytes = str.getBytes(charset); |
| if (!str.equals(new String(bytes, charset))) { |
| return null; |
| } |
| |
| return bytes; |
| } |
| catch (Exception e) { |
| return null;//wow, some charsets throw NPE inside .getBytes() when unable to encode (JIS_X0212-1990) |
| } |
| } |
| |
| public static Charset extractCharsetFromFileContent(@Nullable Project project, @NotNull VirtualFile virtualFile, @NotNull String text) { |
| Charset charset = charsetFromContentOrNull(project, virtualFile, text); |
| if (charset == null) charset = virtualFile.getCharset(); |
| return charset; |
| } |
| |
| /** |
| * @deprecated use {@link #charsetFromContentOrNull(com.intellij.openapi.project.Project, com.intellij.openapi.vfs.VirtualFile, CharSequence)} |
| */ |
| @Nullable("returns null if cannot determine from content") |
| public static Charset charsetFromContentOrNull(@Nullable Project project, @NotNull VirtualFile virtualFile, @NotNull String text) { |
| return CharsetUtil.extractCharsetFromFileContent(project, virtualFile, virtualFile.getFileType(), text); |
| } |
| |
| @Nullable("returns null if cannot determine from content") |
| public static Charset charsetFromContentOrNull(@Nullable Project project, @NotNull VirtualFile virtualFile, @NotNull CharSequence text) { |
| return CharsetUtil.extractCharsetFromFileContent(project, virtualFile, virtualFile.getFileType(), text); |
| } |
| |
| @NotNull |
| public static CharSequence loadText(@NotNull VirtualFile file) { |
| if (file instanceof LightVirtualFile) { |
| return ((LightVirtualFile)file).getContent(); |
| } |
| |
| if (file.isDirectory()) { |
| throw new AssertionError("'" + file.getPresentableUrl() + "' is directory"); |
| } |
| final FileType fileType = file.getFileType(); |
| |
| if (fileType.isBinary()) { |
| final BinaryFileDecompiler decompiler = BinaryFileTypeDecompilers.INSTANCE.forFileType(fileType); |
| if (decompiler != null) { |
| CharSequence text = decompiler.decompile(file); |
| StringUtil.assertValidSeparators(text); |
| return text; |
| } |
| |
| throw new IllegalArgumentException("Attempt to load text for binary file, that doesn't have decompiler plugged in: "+file.getPresentableUrl()); |
| } |
| |
| try { |
| byte[] bytes = file.contentsToByteArray(); |
| return getTextByBinaryPresentation(bytes, file); |
| } |
| catch (IOException e) { |
| return ArrayUtil.EMPTY_CHAR_SEQUENCE; |
| } |
| } |
| |
| @NotNull |
| public static CharSequence getTextByBinaryPresentation(@NotNull final byte[] bytes, @NotNull VirtualFile virtualFile) { |
| return getTextByBinaryPresentation(bytes, virtualFile, true, true); |
| } |
| |
| @NotNull |
| public static CharSequence getTextByBinaryPresentation(@NotNull byte[] bytes, |
| @NotNull VirtualFile virtualFile, |
| boolean saveDetectedSeparators, |
| boolean saveBOM) { |
| return getTextByBinaryPresentation(bytes, virtualFile, saveDetectedSeparators, saveBOM, virtualFile.getFileType()); |
| } |
| @NotNull |
| public static CharSequence getTextByBinaryPresentation(@NotNull byte[] bytes, |
| @NotNull VirtualFile virtualFile, |
| boolean saveDetectedSeparators, |
| boolean saveBOM, @NotNull FileType fileType) { |
| Pair<Charset, byte[]> pair = doDetectCharsetAndSetBOM(virtualFile, bytes, saveBOM, fileType); |
| Charset charset = pair.getFirst(); |
| byte[] bom = pair.getSecond(); |
| int offset = bom == null ? 0 : bom.length; |
| |
| Pair<CharSequence, String> result = convertBytes(bytes, charset, offset); |
| if (saveDetectedSeparators) { |
| virtualFile.setDetectedLineSeparator(result.getSecond()); |
| } |
| return result.getFirst(); |
| } |
| |
| /** |
| * Get detected line separator, if the file never been loaded, is loaded if checkFile parameter is specified. |
| * |
| * @param file the file to check |
| * @param checkFile if the line separator was not detected before, try to detect it |
| * @return the detected line separator or null |
| */ |
| @Nullable |
| public static String detectLineSeparator(@NotNull VirtualFile file, boolean checkFile) { |
| String lineSeparator = getDetectedLineSeparator(file); |
| if (lineSeparator == null && checkFile) { |
| try { |
| getTextByBinaryPresentation(file.contentsToByteArray(), file); |
| lineSeparator = getDetectedLineSeparator(file); |
| } |
| catch (IOException e) { |
| // null will be returned |
| } |
| } |
| return lineSeparator; |
| } |
| |
| static String getDetectedLineSeparator(@NotNull VirtualFile file) { |
| return file.getDetectedLineSeparator(); |
| } |
| |
| @NotNull |
| public static CharSequence getTextByBinaryPresentation(@NotNull byte[] bytes, Charset charset) { |
| Pair<Charset, byte[]> pair = getBOMAndCharset(bytes, charset); |
| byte[] bom = pair.getSecond(); |
| int offset = bom == null ? 0 : bom.length; |
| |
| final Pair<CharSequence, String> result = convertBytes(bytes, charset, offset); |
| return result.getFirst(); |
| } |
| |
| // do not need to think about BOM here. it is processed outside |
| @NotNull |
| private static Pair<CharSequence, String> convertBytes(@NotNull byte[] bytes, Charset charset, final int startOffset) { |
| ByteBuffer byteBuffer = ByteBuffer.wrap(bytes, startOffset, bytes.length - startOffset); |
| |
| if (charset == null) { |
| charset = CharsetToolkit.getDefaultSystemCharset(); |
| } |
| if (charset == null) { |
| charset = Charset.forName("ISO-8859-1"); |
| } |
| CharBuffer charBuffer; |
| try { |
| charBuffer = charset.decode(byteBuffer); |
| } |
| catch (Exception e) { |
| // esoteric charsets can throw any kind of exception |
| charBuffer = CharBuffer.wrap(ArrayUtil.EMPTY_CHAR_ARRAY); |
| } |
| return convertLineSeparators(charBuffer); |
| } |
| |
| private static final Key<String> CHARSET_WAS_DETECTED_FROM_BYTES = Key.create("CHARSET_WAS_DETECTED_FROM_BYTES"); |
| @Nullable("null if was not detected, otherwise the reason it was") |
| public static String wasCharsetDetectedFromBytes(@NotNull VirtualFile virtualFile) { |
| return virtualFile.getUserData(CHARSET_WAS_DETECTED_FROM_BYTES); |
| } |
| |
| public static void setCharsetWasDetectedFromBytes(@NotNull VirtualFile virtualFile, |
| @Nullable("null if was not detected, otherwise the reason it was") String reason) { |
| virtualFile.putUserData(CHARSET_WAS_DETECTED_FROM_BYTES, reason); |
| } |
| } |