| /* |
| * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. |
| * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| * |
| * This code is free software; you can redistribute it and/or modify it |
| * under the terms of the GNU General Public License version 2 only, as |
| * published by the Free Software Foundation. Oracle designates this |
| * particular file as subject to the "Classpath" exception as provided |
| * by Oracle in the LICENSE file that accompanied this code. |
| * |
| * This code is distributed in the hope that it will be useful, but WITHOUT |
| * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| * version 2 for more details (a copy is included in the LICENSE file that |
| * accompanied this code). |
| * |
| * You should have received a copy of the GNU General Public License version |
| * 2 along with this work; if not, write to the Free Software Foundation, |
| * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| * |
| * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| * or visit www.oracle.com if you need additional information or have any |
| * questions. |
| */ |
| |
| package sun.nio.cs.ext; |
| |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.charset.Charset; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CharsetEncoder; |
| import java.nio.charset.CoderResult; |
| import java.nio.charset.CharacterCodingException; |
| import java.nio.charset.MalformedInputException; |
| import sun.nio.cs.DelegatableDecoder; |
| import sun.nio.cs.HistoricallyNamedCharset; |
| import java.security.AccessController; |
| import java.security.PrivilegedAction; |
| import sun.nio.cs.*; |
| import static java.lang.Character.UnicodeBlock; |
| |
| |
| public class JISAutoDetect |
| extends Charset |
| implements HistoricallyNamedCharset |
| { |
| |
| private final static int EUCJP_MASK = 0x01; |
| private final static int SJIS2B_MASK = 0x02; |
| private final static int SJIS1B_MASK = 0x04; |
| private final static int EUCJP_KANA1_MASK = 0x08; |
| private final static int EUCJP_KANA2_MASK = 0x10; |
| |
| public JISAutoDetect() { |
| super("x-JISAutoDetect", ExtendedCharsets.aliasesFor("x-JISAutoDetect")); |
| } |
| |
| public boolean contains(Charset cs) { |
| return ((cs.name().equals("US-ASCII")) |
| || (cs instanceof SJIS) |
| || (cs instanceof EUC_JP) |
| || (cs instanceof ISO2022_JP)); |
| } |
| |
| public boolean canEncode() { |
| return false; |
| } |
| |
| public CharsetDecoder newDecoder() { |
| return new Decoder(this); |
| } |
| |
| public String historicalName() { |
| return "JISAutoDetect"; |
| } |
| |
| public CharsetEncoder newEncoder() { |
| throw new UnsupportedOperationException(); |
| } |
| |
| // A heuristic algorithm for guessing if EUC-decoded text really |
| // might be Japanese text. Better heuristics are possible... |
| private static boolean looksLikeJapanese(CharBuffer cb) { |
| int hiragana = 0; // Fullwidth Hiragana |
| int katakana = 0; // Halfwidth Katakana |
| while (cb.hasRemaining()) { |
| char c = cb.get(); |
| if (0x3040 <= c && c <= 0x309f && ++hiragana > 1) return true; |
| if (0xff65 <= c && c <= 0xff9f && ++katakana > 1) return true; |
| } |
| return false; |
| } |
| |
| private static class Decoder extends CharsetDecoder { |
| private final static String osName = AccessController.doPrivileged( |
| (PrivilegedAction<String>) () -> System.getProperty("os.name")); |
| |
| private final static String SJISName = getSJISName(); |
| private final static String EUCJPName = getEUCJPName(); |
| private DelegatableDecoder detectedDecoder = null; |
| |
| public Decoder(Charset cs) { |
| super(cs, 0.5f, 1.0f); |
| } |
| |
| private static boolean isPlainASCII(byte b) { |
| return b >= 0 && b != 0x1b; |
| } |
| |
| private static void copyLeadingASCII(ByteBuffer src, CharBuffer dst) { |
| int start = src.position(); |
| int limit = start + Math.min(src.remaining(), dst.remaining()); |
| int p; |
| byte b; |
| for (p = start; p < limit && isPlainASCII(b = src.get(p)); p++) |
| dst.put((char)(b & 0xff)); |
| src.position(p); |
| } |
| |
| private CoderResult decodeLoop(DelegatableDecoder decoder, |
| ByteBuffer src, CharBuffer dst) { |
| ((CharsetDecoder)decoder).reset(); |
| detectedDecoder = decoder; |
| return detectedDecoder.decodeLoop(src, dst); |
| } |
| |
| protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) { |
| if (detectedDecoder == null) { |
| copyLeadingASCII(src, dst); |
| |
| // All ASCII? |
| if (! src.hasRemaining()) |
| return CoderResult.UNDERFLOW; |
| // Overflow only if there is still ascii but no out buffer. |
| if (!dst.hasRemaining() && |
| isPlainASCII(src.get(src.position()))) |
| return CoderResult.OVERFLOW; |
| |
| // We need to perform double, not float, arithmetic; otherwise |
| // we lose low order bits when src is larger than 2**24. |
| int cbufsiz = (int)(src.limit() * (double)maxCharsPerByte()); |
| CharBuffer sandbox = CharBuffer.allocate(cbufsiz); |
| |
| // First try ISO-2022-JP, since there is no ambiguity |
| Charset cs2022 = Charset.forName("ISO-2022-JP"); |
| DelegatableDecoder dd2022 |
| = (DelegatableDecoder) cs2022.newDecoder(); |
| ByteBuffer src2022 = src.asReadOnlyBuffer(); |
| CoderResult res2022 = dd2022.decodeLoop(src2022, sandbox); |
| if (! res2022.isError()) |
| return decodeLoop(dd2022, src, dst); |
| |
| // We must choose between EUC and SJIS |
| Charset csEUCJ = Charset.forName(EUCJPName); |
| Charset csSJIS = Charset.forName(SJISName); |
| |
| DelegatableDecoder ddEUCJ |
| = (DelegatableDecoder) csEUCJ.newDecoder(); |
| DelegatableDecoder ddSJIS |
| = (DelegatableDecoder) csSJIS.newDecoder(); |
| |
| ByteBuffer srcEUCJ = src.asReadOnlyBuffer(); |
| sandbox.clear(); |
| CoderResult resEUCJ = ddEUCJ.decodeLoop(srcEUCJ, sandbox); |
| // If EUC decoding fails, must be SJIS |
| if (resEUCJ.isError()) |
| return decodeLoop(ddSJIS, src, dst); |
| ByteBuffer srcSJIS = src.asReadOnlyBuffer(); |
| CharBuffer sandboxSJIS = CharBuffer.allocate(cbufsiz); |
| CoderResult resSJIS = ddSJIS.decodeLoop(srcSJIS, sandboxSJIS); |
| // If SJIS decoding fails, must be EUC |
| if (resSJIS.isError()) |
| return decodeLoop(ddEUCJ, src, dst); |
| |
| // From here on, we have some ambiguity, and must guess. |
| |
| // We prefer input that does not appear to end mid-character. |
| if (srcEUCJ.position() > srcSJIS.position()) |
| return decodeLoop(ddEUCJ, src, dst); |
| |
| if (srcEUCJ.position() < srcSJIS.position()) |
| return decodeLoop(ddSJIS, src, dst); |
| |
| // end-of-input is after the first byte of the first char? |
| if (src.position() == srcEUCJ.position()) |
| return CoderResult.UNDERFLOW; |
| |
| // Use heuristic knowledge of typical Japanese text |
| sandbox.flip(); |
| return decodeLoop(looksLikeJapanese(sandbox) ? ddEUCJ : ddSJIS, |
| src, dst); |
| } |
| |
| return detectedDecoder.decodeLoop(src, dst); |
| } |
| |
| protected void implReset() { |
| detectedDecoder = null; |
| } |
| |
| protected CoderResult implFlush(CharBuffer out) { |
| if (detectedDecoder != null) |
| return detectedDecoder.implFlush(out); |
| else |
| return super.implFlush(out); |
| } |
| |
| public boolean isAutoDetecting() { |
| return true; |
| } |
| |
| public boolean isCharsetDetected() { |
| return detectedDecoder != null; |
| } |
| |
| public Charset detectedCharset() { |
| if (detectedDecoder == null) |
| throw new IllegalStateException("charset not yet detected"); |
| return ((CharsetDecoder) detectedDecoder).charset(); |
| } |
| |
| |
| /** |
| * Returned Shift_JIS Charset name is OS dependent |
| */ |
| private static String getSJISName() { |
| if (osName.equals("Solaris") || osName.equals("SunOS")) |
| return("PCK"); |
| else if (osName.startsWith("Windows")) |
| return("windows-31J"); |
| else |
| return("Shift_JIS"); |
| } |
| |
| /** |
| * Returned EUC-JP Charset name is OS dependent |
| */ |
| |
| private static String getEUCJPName() { |
| if (osName.equals("Solaris") || osName.equals("SunOS")) |
| return("x-eucjp-open"); |
| else |
| return("EUC_JP"); |
| } |
| |
| } |
| } |