blob: f5f234abab4968169892d4ac3cf1728e115f4644 [file] [log] [blame]
/*
* Copyright 2000-2014 JetBrains s.r.o.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.intellij.openapi.fileEditor.impl;
import com.intellij.lang.properties.charset.Native2AsciiCharset;
import com.intellij.openapi.fileTypes.*;
import com.intellij.openapi.project.Project;
import com.intellij.openapi.util.Key;
import com.intellij.openapi.util.Pair;
import com.intellij.openapi.util.Trinity;
import com.intellij.openapi.util.text.StringUtil;
import com.intellij.openapi.vfs.CharsetToolkit;
import com.intellij.openapi.vfs.VirtualFile;
import com.intellij.openapi.vfs.encoding.EncodingRegistry;
import com.intellij.testFramework.LightVirtualFile;
import com.intellij.util.ArrayUtil;
import com.intellij.util.text.CharArrayUtil;
import org.jetbrains.annotations.Nls;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
public final class LoadTextUtil {
/** Reason text stored on a file (see {@link #setCharsetWasDetectedFromBytes}) when its charset was derived from a byte order mark. */
@Nls private static final String AUTO_DETECTED_FROM_BOM = "auto-detected from BOM";
/** Not instantiable: every member of this class is static. */
private LoadTextUtil() {
}
/**
 * Normalizes line separators in place: every {@code "\r\n"} pair and every lone {@code '\r'}
 * is collapsed to a single {@code '\n'}, while the occurrences of each separator kind are counted.
 *
 * @param buffer decoded text; its content is overwritten with the normalized characters
 * @return the normalized text paired with the prevailing original separator: {@code "\r\n"} when CRLF strictly
 *         outnumbers both lone CR and lone LF, otherwise {@code "\r"} when lone CR outnumbers lone LF,
 *         otherwise {@code "\n"} when at least one lone LF was seen, otherwise {@code null}
 */
@NotNull
private static Pair<CharSequence, String> convertLineSeparators(@NotNull CharBuffer buffer) {
  final int length = buffer.length();
  // may be null, in which case the CharBuffer accessors are used instead of direct array access
  final char[] chars = CharArrayUtil.fromSequenceWithoutCopying(buffer);

  int written = 0;
  int loneCr = 0;
  int loneLf = 0;
  int crLf = 0;
  char previous = ' ';
  for (int read = 0; read < length; read++) {
    final char current = chars == null ? buffer.charAt(read) : chars[read];
    if (current == '\n' && previous == '\r') {
      // second half of a CRLF pair: '\n' was already emitted for the '\r', so only the counters are corrected
      loneCr--;
      crLf++;
    }
    else {
      final char normalized;
      if (current == '\r') {
        normalized = '\n';
        loneCr++;
      }
      else {
        normalized = current;
        if (current == '\n') {
          loneLf++;
        }
      }
      if (chars == null) {
        buffer.put(written++, normalized);
      }
      else {
        chars[written++] = normalized;
      }
    }
    previous = current;
  }

  final String separator;
  if (crLf > loneCr && crLf > loneLf) {
    separator = "\r\n";
  }
  else if (loneCr > loneLf) {
    separator = "\r";
  }
  else if (loneLf > 0) {
    separator = "\n";
  }
  else {
    separator = null;
  }

  // in Mac JDK CharBuffer.subSequence() signature differs from Oracle's
  // more than that, the signature has changed between jdk6 and jdk7,
  // so go through the more generic CharSequence.subSequence() just in case
  @SuppressWarnings("UnnecessaryLocalVariable") final CharSequence whole = buffer;
  final CharSequence text = written == buffer.length() ? whole : whole.subSequence(0, written);
  return Pair.create(text, separator);
}
/**
 * Determines the charset of the file and stores it on the file via {@code setCharset}.
 * <p>
 * Sources are consulted in this order: the charset guessed from the content, the charset name reported by the
 * file type, the encoding returned by {@code EncodingRegistry.getEncoding(virtualFile, true)} (only when the file
 * type reports no name), and finally the registry's default charset. For a file type named "Properties" with
 * native2ascii enabled in the registry, the result is wrapped with {@link Native2AsciiCharset}.
 *
 * @param virtualFile file whose charset is detected and updated
 * @param content     raw bytes of the file
 * @param fileType    file type asked for a content-declared charset name
 * @return the charset that was stored on the file
 */
public static Charset detectCharset(@NotNull VirtualFile virtualFile, @NotNull byte[] content, @NotNull FileType fileType) {
  final Trinity<Charset, CharsetToolkit.GuessedEncoding, byte[]> guess = guessFromContent(virtualFile, content, content.length);
  Charset result = guess == null ? null : guess.first;
  if (result == null) {
    final String declaredName = fileType.getCharset(virtualFile, content);
    result = declaredName != null
             ? CharsetToolkit.forName(declaredName)
             : EncodingRegistry.getInstance().getEncoding(virtualFile, true);
  }
  if (result == null) {
    result = EncodingRegistry.getInstance().getDefaultCharset();
  }
  final boolean isProperties = fileType.getName().equals("Properties");
  if (isProperties && EncodingRegistry.getInstance().isNative2AsciiForPropertiesFiles()) {
    result = Native2AsciiCharset.wrap(result);
  }
  virtualFile.setCharset(result);
  return result;
}
/**
 * Resolves the charset for the given content and, when the content carries a non-empty BOM, stores that BOM on the file.
 *
 * @param virtualFile file being inspected and updated
 * @param content     raw bytes of the file
 * @return the resolved charset
 */
@NotNull
public static Charset detectCharsetAndSetBOM(@NotNull VirtualFile virtualFile, @NotNull byte[] content) {
  final Pair<Charset, byte[]> charsetAndBom = doDetectCharsetAndSetBOM(virtualFile, content, true);
  return charsetAndBom.getFirst();
}
/**
 * Same as the four-argument overload, using the file type reported by the file itself.
 */
@NotNull
private static Pair<Charset, byte[]> doDetectCharsetAndSetBOM(@NotNull VirtualFile virtualFile, @NotNull byte[] content, boolean saveBOM) {
  final FileType ownFileType = virtualFile.getFileType();
  return doDetectCharsetAndSetBOM(virtualFile, content, saveBOM, ownFileType);
}
/**
 * Resolves the charset and BOM for the content.
 * <p>
 * The charset already set on the file is used when present; otherwise it is detected from the content.
 * The pair is then refined by {@code getBOMAndCharset}.
 *
 * @param virtualFile file being inspected
 * @param content     raw bytes of the file
 * @param saveBOM     when true and a non-empty BOM was found, the BOM is stored on the file and the
 *                    "detected from BOM" reason is recorded
 * @param fileType    file type used for charset detection
 * @return the (charset, BOM) pair produced by {@code getBOMAndCharset}
 */
@NotNull
private static Pair<Charset, byte[]> doDetectCharsetAndSetBOM(@NotNull VirtualFile virtualFile, @NotNull byte[] content, boolean saveBOM, @NotNull FileType fileType) {
  final Charset charset;
  if (virtualFile.isCharsetSet()) {
    charset = virtualFile.getCharset();
  }
  else {
    charset = detectCharset(virtualFile, content, fileType);
  }
  final Pair<Charset, byte[]> charsetAndBom = getBOMAndCharset(content, charset);
  if (saveBOM) {
    final byte[] bom = charsetAndBom.second;
    if (bom != null && bom.length > 0) {
      virtualFile.setBOM(bom);
      setCharsetWasDetectedFromBytes(virtualFile, AUTO_DETECTED_FROM_BOM);
    }
  }
  return charsetAndBom;
}
/** Whether charset guessing from content is enabled; read once from the "idea.guess.utf.encoding" system property, defaulting to true. */
private static final boolean GUESS_UTF = Boolean.parseBoolean(System.getProperty("idea.guess.utf.encoding", "true"));
/**
 * Tries to guess the charset from the raw bytes: first from a BOM, then from the byte content itself.
 * <p>
 * As a side effect, the detection reason is always recorded on the file through
 * {@link #setCharsetWasDetectedFromBytes}; it is {@code null} unless a BOM or valid UTF-8 content was recognized.
 *
 * @param virtualFile file on which the detection reason is recorded
 * @param content     raw bytes to inspect
 * @param length      passed to {@code CharsetToolkit.guessFromContent}
 * @return {@code null} when guessing is disabled or nothing could be concluded; otherwise a tuple of
 *         (guessed charset or null, content hint or null, BOM or null)
 */
@Nullable("null means no luck, otherwise it's tuple(guessed encoding, hint about content if was unable to guess, BOM)")
public static Trinity<Charset, CharsetToolkit.GuessedEncoding, byte[]> guessFromContent(@NotNull VirtualFile virtualFile, @NotNull byte[] content, int length) {
  if (!GUESS_UTF) {
    // guessing is switched off: nothing was detected from bytes
    setCharsetWasDetectedFromBytes(virtualFile, null);
    return null;
  }
  final CharsetToolkit toolkit = new CharsetToolkit(content, EncodingRegistry.getInstance().getDefaultCharset());
  String reason = null;
  try {
    toolkit.setEnforce8Bit(true);
    final Charset fromBom = toolkit.guessFromBOM();
    if (fromBom != null) {
      reason = AUTO_DETECTED_FROM_BOM;
      byte[] bom = CharsetToolkit.getMandatoryBom(fromBom);
      if (bom == null) {
        bom = CharsetToolkit.UTF8_BOM;
      }
      return Trinity.create(fromBom, null, bom);
    }
    final CharsetToolkit.GuessedEncoding hint = toolkit.guessFromContent(length);
    if (hint == CharsetToolkit.GuessedEncoding.VALID_UTF8) {
      reason = "auto-detected from bytes";
      // UTF detected, ignore all directives
      return Trinity.create(CharsetToolkit.UTF8_CHARSET, hint, null);
    }
    if (hint == CharsetToolkit.GuessedEncoding.SEVEN_BIT) {
      return Trinity.create(null, hint, null);
    }
    return null;
  }
  finally {
    // runs on every exit path, including exceptions thrown by the toolkit
    setCharsetWasDetectedFromBytes(virtualFile, reason);
  }
}
/**
 * Reconciles the supplied charset with a BOM possibly present at the start of the content.
 *
 * @param content raw bytes to inspect
 * @param charset charset assumed so far; may be null
 * @return (charset, UTF-8 BOM) when the charset name contains the UTF-8 name and the content has a UTF-8 BOM;
 *         otherwise (charset guessed from the BOM, its mandatory BOM as reported by {@code CharsetToolkit}) when a
 *         BOM is recognized; otherwise (the supplied charset, empty array)
 */
@NotNull
private static Pair<Charset,byte[]> getBOMAndCharset(@NotNull byte[] content, final Charset charset) {
  final boolean utf8WithBom = charset != null
                              && charset.name().contains(CharsetToolkit.UTF8)
                              && CharsetToolkit.hasUTF8Bom(content);
  if (utf8WithBom) {
    return Pair.create(charset, CharsetToolkit.UTF8_BOM);
  }
  try {
    final Charset bomCharset = CharsetToolkit.guessFromBOM(content);
    if (bomCharset != null) {
      final byte[] mandatoryBom = CharsetToolkit.getMandatoryBom(bomCharset);
      return Pair.create(bomCharset, mandatoryBom);
    }
  }
  catch (UnsupportedCharsetException ignore) {
    // the charset implied by the BOM is not available here; fall back to the supplied charset
  }
  return Pair.create(charset, ArrayUtil.EMPTY_BYTE_ARRAY);
}
/**
 * Rewrites the file so that its lines are terminated by {@code newSeparator}.
 * Does nothing beyond reloading the text when the file already uses that separator.
 *
 * @param project      passed through to {@link #write}; may be null
 * @param file         file to convert
 * @param newSeparator separator the file should use from now on
 * @param requestor    passed through to {@link #write}
 * @throws IOException if reading or writing the file fails
 */
public static void changeLineSeparators(@Nullable Project project,
                                        @NotNull VirtualFile file,
                                        @NotNull String newSeparator,
                                        @NotNull Object requestor) throws IOException {
  // loading with saveDetectedSeparators=true stores the detected separator on the file; it is read back right below
  final byte[] rawContent = file.contentsToByteArray();
  final CharSequence loadedText = getTextByBinaryPresentation(rawContent, file, true, false);
  final String oldSeparator = detectLineSeparator(file, false);
  if (!newSeparator.equals(oldSeparator)) {
    final String converted = StringUtil.convertLineSeparators(loadedText.toString(), newSeparator);
    file.setDetectedLineSeparator(newSeparator);
    write(project, file, requestor, converted, -1);
  }
}
/**
 * Encodes the text and overwrites the file content with the resulting bytes.
 * <p/>
 * The charset and the encoded bytes are chosen by {@code charsetForWriting}. When the chosen charset is non-null
 * and differs from the file's current one, it is stored on the file before writing. The "detected from bytes"
 * flag is refreshed from the bytes about to be written.
 * <p/>
 * Normally you should not use this method.
 *
 * @param project              passed through to charset extraction; may be null
 * @param virtualFile          file to overwrite
 * @param requestor            any object to control who called this method; forwarded to
 *                             {@code VirtualFile.getOutputStream}.
 *                             See {@link com.intellij.openapi.vfs.VirtualFileEvent#getRequestor}
 * @param text                 new content of the file
 * @param newModificationStamp new modification stamp or -1 if no special value should be set;
 *                             the time stamp passed along is always -1
 * @throws java.io.IOException if an I/O error occurs
 * @see VirtualFile#getModificationStamp()
 */
public static void write(@Nullable Project project,
                         @NotNull VirtualFile virtualFile,
                         @NotNull Object requestor,
                         @NotNull String text,
                         long newModificationStamp) throws IOException {
  final Charset current = virtualFile.getCharset();
  final Pair<Charset, byte[]> charsetAndBytes = charsetForWriting(project, virtualFile, text, current);
  final Charset target = charsetAndBytes.first;
  final byte[] encoded = charsetAndBytes.second;
  if (target != null && !target.equals(current)) {
    virtualFile.setCharset(target);
  }
  setDetectedFromBytesFlagBack(virtualFile, encoded);
  final OutputStream stream = virtualFile.getOutputStream(requestor, newModificationStamp, -1);
  try {
    stream.write(encoded);
  }
  finally {
    stream.close();
  }
}
/**
 * Picks the charset to save the text in, together with the already encoded bytes.
 * <p>
 * Starts from the choice of {@link #chooseMostlyHarmlessCharset} between the existing charset and the one
 * extracted from the text. If the file has a BOM that implies a different charset, the BOM's charset wins
 * and the text is re-encoded with it.
 *
 * @param project     passed through to charset extraction; may be null
 * @param virtualFile file about to be written
 * @param text        text to encode
 * @param existing    charset currently associated with the file; may be null
 * @return (charset, encoded bytes)
 */
@NotNull
private static Pair<Charset, byte[]> charsetForWriting(@Nullable Project project,
                                                       @NotNull VirtualFile virtualFile,
                                                       @NotNull String text,
                                                       @Nullable Charset existing) {
  final Charset declared = extractCharsetFromFileContent(project, virtualFile, text);
  final Pair<Charset, byte[]> harmless = chooseMostlyHarmlessCharset(existing, declared, text);

  // in case of "UTF-16", OutputStreamWriter sometimes adds BOM on its own.
  // see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6800103
  final byte[] storedBom = virtualFile.getBOM();
  if (storedBom == null) {
    return harmless;
  }
  final Charset bomCharset = CharsetToolkit.guessFromBOM(storedBom);
  if (bomCharset == null || bomCharset.equals(harmless.first)) {
    return harmless;
  }
  return Pair.create(bomCharset, toBytes(text, bomCharset));
}
/**
 * Refreshes the "charset was detected from bytes" reason stored on the file.
 *
 * @param virtualFile file whose flag is updated
 * @param content     bytes used for guessing when the file has no BOM
 */
public static void setDetectedFromBytesFlagBack(@NotNull VirtualFile virtualFile, @NotNull byte[] content) {
  final boolean hasBom = virtualFile.getBOM() != null;
  if (hasBom) {
    // prevent file to be reloaded in other encoding after save with BOM
    setCharsetWasDetectedFromBytes(virtualFile, AUTO_DETECTED_FROM_BOM);
    return;
  }
  // called for its side effect only: guessFromContent records the detection reason on the file
  guessFromContent(virtualFile, content, content.length);
}
/**
 * Chooses between the charset the file currently has and the one specified for it, preferring whichever
 * can encode the text without loss, and returns the choice together with the encoded bytes.
 *
 * @param existing  charset currently associated with the file; may be null
 * @param specified charset requested for the text; may be null
 * @param text      text to encode
 * @return (chosen charset, encoded bytes). The charset is {@code specified} when there is no existing charset,
 *         when both are the same, when it round-trips the text, or when neither charset does; it is
 *         {@code existing} when only that one round-trips the text. If encoding throws a
 *         {@link RuntimeException}, the platform default charset and its encoding are returned.
 */
@NotNull
public static Pair<Charset, byte[]> chooseMostlyHarmlessCharset(Charset existing, Charset specified, @NotNull String text) {
  try {
    if (existing == null) {
      return Pair.create(specified, toBytes(text, specified));
    }
    final boolean nothingToChoose = specified == null || specified.equals(existing);
    if (nothingToChoose) {
      return Pair.create(specified, toBytes(text, existing));
    }
    // if explicitly specified encoding is safe, return it
    final byte[] asSpecified = isSupported(specified, text);
    if (asSpecified != null) {
      return Pair.create(specified, asSpecified);
    }
    // otherwise stick to the old encoding if it's ok
    final byte[] asExisting = isSupported(existing, text);
    if (asExisting != null) {
      return Pair.create(existing, asExisting);
    }
    // if both are bad there is no difference
    return Pair.create(specified, toBytes(text, specified));
  }
  catch (RuntimeException e) {
    // if both are bad and there is no hope, use the default charset
    return Pair.create(Charset.defaultCharset(), toBytes(text, null));
  }
}
/**
 * Encodes the text with the given charset, or with the platform default charset when none is given.
 *
 * @param text    text to encode
 * @param charset charset to use; null selects the platform default
 * @return the encoded bytes
 */
@NotNull
private static byte[] toBytes(@NotNull String text, @Nullable Charset charset) throws RuntimeException {
  if (charset != null) {
    return text.getBytes(charset);
  }
  //noinspection SSBasedInspection
  return text.getBytes();
}
/**
 * Checks whether the charset can encode the string without loss by encoding it and decoding it back.
 *
 * @param charset charset to try
 * @param str     text to encode
 * @return the encoded bytes when the round trip reproduces {@code str}; {@code null} when the charset cannot
 *         encode, the round trip differs, or any exception is thrown along the way
 */
@Nullable("null means not supported, otherwise it is converted byte stream")
private static byte[] isSupported(@NotNull Charset charset, @NotNull String str) {
  try {
    if (charset.canEncode()) {
      final byte[] encoded = str.getBytes(charset);
      final String roundTripped = new String(encoded, charset);
      if (str.equals(roundTripped)) {
        return encoded;
      }
    }
    return null;
  }
  catch (Exception e) {
    // wow, some charsets throw NPE inside .getBytes() when unable to encode (JIS_X0212-1990)
    return null;
  }
}
/**
 * Returns the charset determined from the text itself, falling back to the charset of the file
 * when the text does not determine one.
 *
 * @param project     passed through to {@code charsetFromContentOrNull}; may be null
 * @param virtualFile file the text belongs to
 * @param text        text to inspect
 * @return the charset from the content, or {@code virtualFile.getCharset()} if none was found
 */
public static Charset extractCharsetFromFileContent(@Nullable Project project, @NotNull VirtualFile virtualFile, @NotNull String text) {
  final Charset fromContent = charsetFromContentOrNull(project, virtualFile, text);
  return fromContent != null ? fromContent : virtualFile.getCharset();
}
/**
 * Determines the charset from the text by delegating to {@code CharsetUtil.extractCharsetFromFileContent}
 * with the file's own file type.
 *
 * @param project     passed through to the delegate; may be null
 * @param virtualFile file the text belongs to
 * @param text        text to inspect
 * @return the charset found, or null if it cannot be determined from the content
 * @deprecated use {@link #charsetFromContentOrNull(com.intellij.openapi.project.Project, com.intellij.openapi.vfs.VirtualFile, CharSequence)}
 */
// the annotation accompanies the Javadoc tag so that compilers and tools reading annotations also flag usages
@Deprecated
@Nullable("returns null if cannot determine from content")
public static Charset charsetFromContentOrNull(@Nullable Project project, @NotNull VirtualFile virtualFile, @NotNull String text) {
  return CharsetUtil.extractCharsetFromFileContent(project, virtualFile, virtualFile.getFileType(), text);
}
/**
 * Determines the charset from the text by delegating to {@code CharsetUtil.extractCharsetFromFileContent}
 * with the file's own file type.
 *
 * @param project     passed through to the delegate; may be null
 * @param virtualFile file the text belongs to
 * @param text        text to inspect
 * @return the charset found, or null if it cannot be determined from the content
 */
@Nullable("returns null if cannot determine from content")
public static Charset charsetFromContentOrNull(@Nullable Project project, @NotNull VirtualFile virtualFile, @NotNull CharSequence text) {
  final FileType ownFileType = virtualFile.getFileType();
  return CharsetUtil.extractCharsetFromFileContent(project, virtualFile, ownFileType, text);
}
/**
 * Loads the text of a file.
 * <ul>
 * <li>a {@link LightVirtualFile} returns its in-memory content;</li>
 * <li>a binary file type is decompiled by the decompiler registered for it;</li>
 * <li>any other file is read and decoded from its bytes; an {@link IOException} while doing so yields an empty sequence.</li>
 * </ul>
 *
 * @param file file to load
 * @return the text of the file
 * @throws AssertionError           if the file is a directory
 * @throws IllegalArgumentException if the file type is binary and no decompiler is registered for it
 */
@NotNull
public static CharSequence loadText(@NotNull VirtualFile file) {
  if (file instanceof LightVirtualFile) {
    final LightVirtualFile inMemory = (LightVirtualFile)file;
    return inMemory.getContent();
  }
  if (file.isDirectory()) {
    throw new AssertionError("'" + file.getPresentableUrl() + "' is directory");
  }

  final FileType type = file.getFileType();
  if (!type.isBinary()) {
    try {
      return getTextByBinaryPresentation(file.contentsToByteArray(), file);
    }
    catch (IOException e) {
      return ArrayUtil.EMPTY_CHAR_SEQUENCE;
    }
  }

  final BinaryFileDecompiler decompiler = BinaryFileTypeDecompilers.INSTANCE.forFileType(type);
  if (decompiler == null) {
    throw new IllegalArgumentException("Attempt to load text for binary file, that doesn't have decompiler plugged in: "+file.getPresentableUrl());
  }
  final CharSequence decompiled = decompiler.decompile(file);
  StringUtil.assertValidSeparators(decompiled);
  return decompiled;
}
/**
 * Decodes the bytes of the file to text, storing both the detected line separator and the detected BOM on the file.
 *
 * @param bytes       raw content
 * @param virtualFile file the content belongs to
 * @return the decoded text
 */
@NotNull
public static CharSequence getTextByBinaryPresentation(@NotNull final byte[] bytes, @NotNull VirtualFile virtualFile) {
  final boolean saveDetectedSeparators = true;
  final boolean saveBOM = true;
  return getTextByBinaryPresentation(bytes, virtualFile, saveDetectedSeparators, saveBOM);
}
/**
 * Decodes the bytes of the file to text, using the file type reported by the file itself.
 *
 * @param bytes                  raw content
 * @param virtualFile            file the content belongs to
 * @param saveDetectedSeparators whether to store the detected line separator on the file
 * @param saveBOM                whether to store a detected BOM on the file
 * @return the decoded text
 */
@NotNull
public static CharSequence getTextByBinaryPresentation(@NotNull byte[] bytes,
                                                       @NotNull VirtualFile virtualFile,
                                                       boolean saveDetectedSeparators,
                                                       boolean saveBOM) {
  final FileType ownFileType = virtualFile.getFileType();
  return getTextByBinaryPresentation(bytes, virtualFile, saveDetectedSeparators, saveBOM, ownFileType);
}
/**
 * Decodes the bytes of the file to text: resolves charset and BOM, skips the BOM bytes, decodes the rest
 * and converts the line separators.
 *
 * @param bytes                  raw content
 * @param virtualFile            file the content belongs to
 * @param saveDetectedSeparators whether to store the detected line separator on the file
 * @param saveBOM                whether to store a detected BOM on the file
 * @param fileType               file type used for charset detection
 * @return the decoded text
 */
@NotNull
public static CharSequence getTextByBinaryPresentation(@NotNull byte[] bytes,
                                                       @NotNull VirtualFile virtualFile,
                                                       boolean saveDetectedSeparators,
                                                       boolean saveBOM, @NotNull FileType fileType) {
  final Pair<Charset, byte[]> charsetAndBom = doDetectCharsetAndSetBOM(virtualFile, bytes, saveBOM, fileType);
  final byte[] bom = charsetAndBom.second;
  final int bomLength = bom != null ? bom.length : 0;
  final Pair<CharSequence, String> textAndSeparator = convertBytes(bytes, charsetAndBom.first, bomLength);
  if (saveDetectedSeparators) {
    virtualFile.setDetectedLineSeparator(textAndSeparator.second);
  }
  return textAndSeparator.first;
}
/**
 * Returns the line separator recorded for the file. If none is recorded yet and {@code checkFile} is set,
 * the file content is loaded so that the separator gets detected and recorded.
 *
 * @param file      the file to check
 * @param checkFile if the line separator was not detected before, try to detect it by loading the file
 * @return the detected line separator, or null if there is none or the file could not be read
 */
@Nullable
public static String detectLineSeparator(@NotNull VirtualFile file, boolean checkFile) {
  final String recorded = getDetectedLineSeparator(file);
  if (recorded != null || !checkFile) {
    return recorded;
  }
  try {
    // loading stores the detected separator on the file as a side effect
    getTextByBinaryPresentation(file.contentsToByteArray(), file);
    return getDetectedLineSeparator(file);
  }
  catch (IOException e) {
    return null;
  }
}
/**
 * Reads the line separator currently recorded on the file.
 *
 * @param file file to query
 * @return whatever {@code file.getDetectedLineSeparator()} reports
 */
static String getDetectedLineSeparator(@NotNull VirtualFile file) {
  final String recorded = file.getDetectedLineSeparator();
  return recorded;
}
/**
 * Decodes raw bytes to text and converts the line separators.
 * <p>
 * A BOM at the start of the content is taken into account: {@code getBOMAndCharset} may replace the supplied
 * charset with the one implied by the BOM, and the BOM bytes it reports are skipped before decoding.
 *
 * @param bytes   raw content
 * @param charset charset to decode with when the content has no BOM that says otherwise; may be null
 * @return the decoded text
 */
@NotNull
public static CharSequence getTextByBinaryPresentation(@NotNull byte[] bytes, Charset charset) {
  Pair<Charset, byte[]> pair = getBOMAndCharset(bytes, charset);
  byte[] bom = pair.getSecond();
  int offset = bom == null ? 0 : bom.length;
  // Decode with the charset resolved together with the BOM, not the caller's one: otherwise the BOM bytes
  // are skipped but the remaining bytes are interpreted in a charset the BOM contradicts.
  final Pair<CharSequence, String> result = convertBytes(bytes, pair.getFirst(), offset);
  return result.getFirst();
}
/**
 * Decodes the bytes starting at {@code startOffset} and converts the line separators of the result.
 * BOM handling is the caller's job: it is processed outside and expressed only through {@code startOffset}.
 *
 * @param bytes       raw content
 * @param charset     charset to decode with; null falls back to the default system charset, then to ISO-8859-1
 * @param startOffset index of the first byte to decode
 * @return (text with converted separators, detected separator); the text is empty if decoding throws
 */
@NotNull
private static Pair<CharSequence, String> convertBytes(@NotNull byte[] bytes, Charset charset, final int startOffset) {
  final ByteBuffer encoded = ByteBuffer.wrap(bytes, startOffset, bytes.length - startOffset);

  Charset effective = charset;
  if (effective == null) {
    effective = CharsetToolkit.getDefaultSystemCharset();
    if (effective == null) {
      effective = Charset.forName("ISO-8859-1");
    }
  }

  CharBuffer decoded;
  try {
    decoded = effective.decode(encoded);
  }
  catch (Exception e) {
    // esoteric charsets can throw any kind of exception
    decoded = CharBuffer.wrap(ArrayUtil.EMPTY_CHAR_ARRAY);
  }
  return convertLineSeparators(decoded);
}
/** User-data key under which the reason for a byte-based charset detection is stored on a file; absent (null) when nothing was detected. */
private static final Key<String> CHARSET_WAS_DETECTED_FROM_BYTES = Key.create("CHARSET_WAS_DETECTED_FROM_BYTES");
/**
 * Reads the reason recorded on the file for its charset having been detected from bytes.
 *
 * @param virtualFile file to query
 * @return the recorded reason, or null if none is recorded
 */
@Nullable("null if was not detected, otherwise the reason it was")
public static String wasCharsetDetectedFromBytes(@NotNull VirtualFile virtualFile) {
  final String reason = virtualFile.getUserData(CHARSET_WAS_DETECTED_FROM_BYTES);
  return reason;
}
/**
 * Records on the file why its charset was detected from bytes, or clears the record.
 *
 * @param virtualFile file to update
 * @param reason      the reason to store; null stores "not detected"
 */
public static void setCharsetWasDetectedFromBytes(
  @NotNull VirtualFile virtualFile,
  @Nullable("null if was not detected, otherwise the reason it was") String reason
) {
  virtualFile.putUserData(CHARSET_WAS_DETECTED_FROM_BYTES, reason);
}
}