blob: b90a9c93eb7e468a8b042bc88416269f45690cab [file] [log] [blame]
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.nio.cs.ext;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.MalformedInputException;
import sun.nio.cs.DelegatableDecoder;
import sun.nio.cs.HistoricallyNamedCharset;
import java.security.AccessController;
import java.security.PrivilegedAction;
import sun.nio.cs.*;
import static java.lang.Character.UnicodeBlock;
/**
 * Decode-only charset named "x-JISAutoDetect".
 *
 * <p>Its decoder copies any leading plain-ASCII bytes straight through and,
 * at the first byte that is not plain ASCII, probes the input with an
 * ISO-2022-JP decoder, an EUC-JP decoder and a Shift_JIS decoder (the last
 * two are selected by OS name). Whichever decoder is chosen then handles the
 * rest of the input until the decoder is reset.
 *
 * <p>No encoder is available: {@link #canEncode()} returns {@code false} and
 * {@link #newEncoder()} throws.
 */
public class JISAutoDetect extends Charset implements HistoricallyNamedCharset {

    // Bit-mask constants; no code in this class reads them.
    private static final int EUCJP_MASK = 0x01;
    private static final int SJIS2B_MASK = 0x02;
    private static final int SJIS1B_MASK = 0x04;
    private static final int EUCJP_KANA1_MASK = 0x08;
    private static final int EUCJP_KANA2_MASK = 0x10;

    /** Creates the charset with the aliases registered for "x-JISAutoDetect". */
    public JISAutoDetect() {
        super("x-JISAutoDetect", ExtendedCharsets.aliasesFor("x-JISAutoDetect"));
    }

    /**
     * Reports whether this charset contains {@code cs}.
     *
     * @param cs the charset to test
     * @return {@code true} if {@code cs} is named "US-ASCII" or is an instance
     *         of {@code SJIS}, {@code EUC_JP} or {@code ISO2022_JP}
     */
    @Override
    public boolean contains(Charset cs) {
        if (cs.name().equals("US-ASCII")) {
            return true;
        }
        return cs instanceof SJIS
            || cs instanceof EUC_JP
            || cs instanceof ISO2022_JP;
    }

    /**
     * @return always {@code false}; this charset only decodes
     */
    @Override
    public boolean canEncode() {
        return false;
    }

    /**
     * @return a fresh auto-detecting decoder bound to this charset
     */
    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    /**
     * Implements {@code HistoricallyNamedCharset}.
     *
     * @return the historical name, "JISAutoDetect"
     */
    public String historicalName() {
        return "JISAutoDetect";
    }

    /**
     * Encoding is not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public CharsetEncoder newEncoder() {
        throw new UnsupportedOperationException();
    }

    /**
     * Heuristic used to decide whether EUC-decoded text plausibly is
     * Japanese: answers {@code true} as soon as a second character in
     * U+3040..U+309F (hiragana) or a second character in U+FF65..U+FF9F
     * (halfwidth katakana) has been read. Better heuristics are possible.
     *
     * <p>The buffer is consumed up to the point where the answer is known.
     *
     * @param cb decoded characters to inspect
     * @return {@code true} if either counter reaches two
     */
    private static boolean looksLikeJapanese(CharBuffer cb) {
        int hiraganaSeen = 0;
        int halfwidthKanaSeen = 0;
        while (cb.hasRemaining()) {
            final char ch = cb.get();
            if (ch >= 0x3040 && ch <= 0x309f) {
                if (++hiraganaSeen > 1) {
                    return true;
                }
            } else if (ch >= 0xff65 && ch <= 0xff9f) {
                if (++halfwidthKanaSeen > 1) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Decoder that postpones the choice of the real decoder until it meets
     * the first byte that is not plain ASCII, then delegates to that choice.
     */
    private static class Decoder extends CharsetDecoder {

        // Declaration order matters: osName must be initialized before the
        // two charset names that are derived from it.
        private static final String osName = AccessController.doPrivileged(
            (PrivilegedAction<String>) () -> System.getProperty("os.name"));
        private static final String SJISName = getSJISName();
        private static final String EUCJPName = getEUCJPName();

        /** The decoder picked by detection, or {@code null} before detection. */
        private DelegatableDecoder detectedDecoder = null;

        /**
         * @param cs the owning charset; the decoder reports 0.5 average and
         *           1.0 maximum chars per byte
         */
        public Decoder(Charset cs) {
            super(cs, 0.5f, 1.0f);
        }

        /**
         * @return {@code true} for a 7-bit byte other than ESC (0x1b);
         *         ESC is excluded, presumably because it starts ISO-2022-JP
         *         escape sequences
         */
        private static boolean isPlainASCII(byte b) {
            if (b < 0) {
                return false;
            }
            return b != 0x1b;
        }

        /**
         * Transfers the leading run of plain-ASCII bytes from {@code src} to
         * {@code dst}, stopping at the first other byte or when either
         * buffer runs out, and advances {@code src} past the copied bytes.
         */
        private static void copyLeadingASCII(ByteBuffer src, CharBuffer dst) {
            final int begin = src.position();
            final int end = begin + Math.min(src.remaining(), dst.remaining());
            int index = begin;
            while (index < end) {
                final byte value = src.get(index);
                if (!isPlainASCII(value)) {
                    break;
                }
                dst.put((char) (value & 0xff));
                index++;
            }
            src.position(index);
        }

        /**
         * Commits to {@code decoder}: resets it, records it as the detected
         * decoder and lets it decode the real input.
         */
        private CoderResult decodeLoop(DelegatableDecoder decoder,
                                       ByteBuffer src, CharBuffer dst) {
            final CharsetDecoder chosen = (CharsetDecoder) decoder;
            chosen.reset();
            detectedDecoder = decoder;
            return decoder.decodeLoop(src, dst);
        }

        /**
         * Decodes {@code src} into {@code dst}. Until a decoder has been
         * detected, leading ASCII is copied and the remaining bytes are
         * trial-decoded on read-only views of {@code src} to pick one.
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) {
            // Detection already happened: just delegate.
            if (detectedDecoder != null) {
                return detectedDecoder.decodeLoop(src, dst);
            }

            copyLeadingASCII(src, dst);

            // Everything so far was ASCII.
            if (!src.hasRemaining()) {
                return CoderResult.UNDERFLOW;
            }
            // Report overflow only when more ASCII is pending and there is
            // no room left for it.
            if (!dst.hasRemaining() && isPlainASCII(src.get(src.position()))) {
                return CoderResult.OVERFLOW;
            }

            // Use double rather than float arithmetic; float would drop
            // low-order bits once src is larger than 2**24.
            final int scratchSize = (int) (src.limit() * (double) maxCharsPerByte());
            final CharBuffer scratch = CharBuffer.allocate(scratchSize);

            // ISO-2022-JP is probed first because it is unambiguous.
            final Charset isoCharset = Charset.forName("ISO-2022-JP");
            final DelegatableDecoder isoDecoder =
                (DelegatableDecoder) isoCharset.newDecoder();
            final ByteBuffer isoView = src.asReadOnlyBuffer();
            final CoderResult isoResult = isoDecoder.decodeLoop(isoView, scratch);
            if (!isoResult.isError()) {
                return decodeLoop(isoDecoder, src, dst);
            }

            // Otherwise the choice is between EUC and SJIS.
            final Charset eucCharset = Charset.forName(EUCJPName);
            final Charset sjisCharset = Charset.forName(SJISName);
            final DelegatableDecoder eucDecoder =
                (DelegatableDecoder) eucCharset.newDecoder();
            final DelegatableDecoder sjisDecoder =
                (DelegatableDecoder) sjisCharset.newDecoder();

            final ByteBuffer eucView = src.asReadOnlyBuffer();
            scratch.clear();
            final CoderResult eucResult = eucDecoder.decodeLoop(eucView, scratch);
            if (eucResult.isError()) {
                // EUC rejected the bytes, so SJIS it is.
                return decodeLoop(sjisDecoder, src, dst);
            }

            final ByteBuffer sjisView = src.asReadOnlyBuffer();
            final CharBuffer sjisScratch = CharBuffer.allocate(scratchSize);
            final CoderResult sjisResult = sjisDecoder.decodeLoop(sjisView, sjisScratch);
            if (sjisResult.isError()) {
                // SJIS rejected the bytes, so EUC it is.
                return decodeLoop(eucDecoder, src, dst);
            }

            // Both succeeded, so we have to guess. Prefer the decoder that
            // consumed more input, i.e. did not appear to stop mid-character.
            final int eucConsumedTo = eucView.position();
            final int sjisConsumedTo = sjisView.position();
            if (eucConsumedTo > sjisConsumedTo) {
                return decodeLoop(eucDecoder, src, dst);
            }
            if (eucConsumedTo < sjisConsumedTo) {
                return decodeLoop(sjisDecoder, src, dst);
            }

            // Neither trial advanced: the input ends inside the very first
            // character, so wait for more bytes.
            if (src.position() == eucConsumedTo) {
                return CoderResult.UNDERFLOW;
            }

            // Last resort: judge the EUC-decoded text by typical Japanese content.
            scratch.flip();
            return decodeLoop(looksLikeJapanese(scratch) ? eucDecoder : sjisDecoder,
                              src, dst);
        }

        /** Forgets the detected decoder so that detection starts over. */
        @Override
        protected void implReset() {
            detectedDecoder = null;
        }

        /**
         * Flushes through the detected decoder when there is one, otherwise
         * through the superclass implementation.
         */
        @Override
        protected CoderResult implFlush(CharBuffer out) {
            if (detectedDecoder == null) {
                return super.implFlush(out);
            }
            return detectedDecoder.implFlush(out);
        }

        /**
         * @return always {@code true}
         */
        @Override
        public boolean isAutoDetecting() {
            return true;
        }

        /**
         * @return {@code true} once a delegate decoder has been chosen
         */
        @Override
        public boolean isCharsetDetected() {
            return detectedDecoder != null;
        }

        /**
         * @return the charset of the chosen delegate decoder
         * @throws IllegalStateException if no decoder has been chosen yet
         */
        @Override
        public Charset detectedCharset() {
            if (detectedDecoder != null) {
                return ((CharsetDecoder) detectedDecoder).charset();
            }
            throw new IllegalStateException("charset not yet detected");
        }

        /** Tells whether the os.name value is exactly "Solaris" or "SunOS". */
        private static boolean isSolarisFamily() {
            return osName.equals("Solaris") || osName.equals("SunOS");
        }

        /**
         * Returned Shift_JIS Charset name is OS dependent: "PCK" on
         * Solaris/SunOS, "windows-31J" when the OS name starts with
         * "Windows", and "Shift_JIS" otherwise.
         */
        private static String getSJISName() {
            if (isSolarisFamily()) {
                return "PCK";
            }
            if (osName.startsWith("Windows")) {
                return "windows-31J";
            }
            return "Shift_JIS";
        }

        /**
         * Returned EUC-JP Charset name is OS dependent: "x-eucjp-open" on
         * Solaris/SunOS and "EUC_JP" otherwise.
         */
        private static String getEUCJPName() {
            if (isSolarisFamily()) {
                return "x-eucjp-open";
            }
            return "EUC_JP";
        }
    }
}