src/jdk.charsets/share/classes/sun/nio/cs/ext/JISAutoDetect.java - toolchain/jdk/jdk9_jdk - Git at Google

 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.  Oracle designates this
  * particular file as subject to the "Classpath" exception as provided
  * by Oracle in the LICENSE file that accompanied this code.
  *
  * This code is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  * version 2 for more details (a copy is included in the LICENSE file that
  * accompanied this code).
  *
  * You should have received a copy of the GNU General Public License version
  * 2 along with this work; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  *
  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  * or visit www.oracle.com if you need additional information or have any
  * questions.
  */

 package sun.nio.cs.ext;

 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.MalformedInputException;
 import sun.nio.cs.DelegatableDecoder;
 import sun.nio.cs.HistoricallyNamedCharset;
 import java.security.AccessController;
 import java.security.PrivilegedAction;
 import sun.nio.cs.*;
 import static java.lang.Character.UnicodeBlock;


 /**
  * Charset registered as "x-JISAutoDetect" that inspects the incoming
  * bytes and hands decoding over to an ISO-2022-JP, EUC-JP or Shift_JIS
  * decoder.  It is decode-only: {@link #canEncode()} returns false and
  * {@link #newEncoder()} throws.
  */
 public class JISAutoDetect
     extends Charset
     implements HistoricallyNamedCharset
 {

     private static final int EUCJP_MASK       = 0x01;
     private static final int SJIS2B_MASK      = 0x02;
     private static final int SJIS1B_MASK      = 0x04;
     private static final int EUCJP_KANA1_MASK = 0x08;
     private static final int EUCJP_KANA2_MASK = 0x10;

     /** Creates the charset under its canonical name with the registered aliases. */
     public JISAutoDetect() {
         super("x-JISAutoDetect", ExtendedCharsets.aliasesFor("x-JISAutoDetect"));
     }

     /**
      * Reports whether {@code cs} is US-ASCII (matched by name) or an
      * instance of one of the SJIS, EUC_JP or ISO2022_JP charset classes.
      */
     @Override
     public boolean contains(Charset cs) {
         if (cs.name().equals("US-ASCII")) {
             return true;
         }
         return cs instanceof SJIS
             || cs instanceof EUC_JP
             || cs instanceof ISO2022_JP;
     }

     /** Always false: this charset has no encoder. */
     @Override
     public boolean canEncode() {
         return false;
     }

     /** Returns a fresh auto-detecting decoder bound to this charset. */
     @Override
     public CharsetDecoder newDecoder() {
         return new Decoder(this);
     }

     /** Returns the historical name, "JISAutoDetect". */
     public String historicalName() {
         return "JISAutoDetect";
     }

     /**
      * Encoding is not supported.
      *
      * @throws UnsupportedOperationException always
      */
     @Override
     public CharsetEncoder newEncoder() {
         throw new UnsupportedOperationException();
     }

     /**
      * Heuristic used to judge whether EUC-decoded text plausibly is
      * Japanese: true as soon as a second character in U+3040..U+309F
      * (fullwidth hiragana) or a second character in U+FF65..U+FF9F
      * (halfwidth katakana) is read.  Consumes {@code cb} up to the point
      * where it stops.  Better heuristics are possible...
      */
     private static boolean looksLikeJapanese(CharBuffer cb) {
         int hiraganaSeen = 0;
         int katakanaSeen = 0;
         while (cb.hasRemaining()) {
             char ch = cb.get();
             if (ch >= 0x3040 && ch <= 0x309f) {
                 if (++hiraganaSeen > 1) {
                     return true;
                 }
             } else if (ch >= 0xff65 && ch <= 0xff9f) {
                 if (++katakanaSeen > 1) {
                     return true;
                 }
             }
         }
         return false;
     }

     /**
      * Decoder that picks a delegate decoder on the first non-ASCII input
      * and forwards all subsequent work to it until reset.
      */
     private static class Decoder extends CharsetDecoder {
         // Value of the "os.name" system property, read in a privileged action.
         private static final String osName = AccessController.doPrivileged(
             (PrivilegedAction<String>) () -> System.getProperty("os.name"));

         private static final String SJISName = getSJISName();
         private static final String EUCJPName = getEUCJPName();

         // Delegate chosen by detection; null until a choice has been made.
         private DelegatableDecoder detectedDecoder = null;

         public Decoder(Charset cs) {
             super(cs, 0.5f, 1.0f);
         }

         /** True for a non-negative byte other than 0x1b (ESC). */
         private static boolean isPlainASCII(byte b) {
             return b >= 0 && b != 0x1b;
         }

         /**
          * Copies the leading run of plain-ASCII bytes from {@code src} to
          * {@code dst}, bounded by the room left in either buffer, and
          * advances {@code src} past the bytes copied.
          */
         private static void copyLeadingASCII(ByteBuffer src, CharBuffer dst) {
             int start = src.position();
             int end = start + Math.min(src.remaining(), dst.remaining());
             int idx = start;
             while (idx < end) {
                 byte cur = src.get(idx);
                 if (!isPlainASCII(cur)) {
                     break;
                 }
                 dst.put((char)(cur & 0xff));
                 idx++;
             }
             src.position(idx);
         }

         /**
          * Resets {@code decoder}, records it as the detected delegate and
          * starts decoding the real input with it.
          */
         private CoderResult decodeLoop(DelegatableDecoder decoder,
                                        ByteBuffer src, CharBuffer dst) {
             ((CharsetDecoder)decoder).reset();
             detectedDecoder = decoder;
             return decoder.decodeLoop(src, dst);
         }

         /**
          * Decodes {@code src} into {@code dst}.  Until a delegate has been
          * chosen, leading ASCII is copied directly and the remaining bytes
          * are trial-decoded on read-only views of {@code src} to pick one.
          */
         @Override
         protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) {
             // Detection already happened: simply forward.
             if (detectedDecoder != null) {
                 return detectedDecoder.decodeLoop(src, dst);
             }

             copyLeadingASCII(src, dst);

             // All ASCII?
             if (!src.hasRemaining()) {
                 return CoderResult.UNDERFLOW;
             }
             // Overflow only if there is still ascii but no out buffer.
             if (!dst.hasRemaining() && isPlainASCII(src.get(src.position()))) {
                 return CoderResult.OVERFLOW;
             }

             // We need to perform double, not float, arithmetic; otherwise
             // we lose low order bits when src is larger than 2**24.
             int scratchSize = (int)(src.limit() * (double)maxCharsPerByte());
             CharBuffer scratch = CharBuffer.allocate(scratchSize);

             // First try ISO-2022-JP, since there is no ambiguity
             Charset isoCharset = Charset.forName("ISO-2022-JP");
             DelegatableDecoder isoDecoder
                 = (DelegatableDecoder) isoCharset.newDecoder();
             ByteBuffer isoView = src.asReadOnlyBuffer();
             if (!isoDecoder.decodeLoop(isoView, scratch).isError()) {
                 return decodeLoop(isoDecoder, src, dst);
             }

             // We must choose between EUC and SJIS
             Charset eucCharset = Charset.forName(EUCJPName);
             Charset sjisCharset = Charset.forName(SJISName);
             DelegatableDecoder eucDecoder
                 = (DelegatableDecoder) eucCharset.newDecoder();
             DelegatableDecoder sjisDecoder
                 = (DelegatableDecoder) sjisCharset.newDecoder();

             // If EUC decoding fails, must be SJIS
             ByteBuffer eucView = src.asReadOnlyBuffer();
             scratch.clear();
             if (eucDecoder.decodeLoop(eucView, scratch).isError()) {
                 return decodeLoop(sjisDecoder, src, dst);
             }

             // If SJIS decoding fails, must be EUC.  A separate buffer is
             // used so the EUC trial output in scratch stays available.
             ByteBuffer sjisView = src.asReadOnlyBuffer();
             CharBuffer sjisScratch = CharBuffer.allocate(scratchSize);
             if (sjisDecoder.decodeLoop(sjisView, sjisScratch).isError()) {
                 return decodeLoop(eucDecoder, src, dst);
             }

             // From here on, we have some ambiguity, and must guess.

             // We prefer input that does not appear to end mid-character,
             // i.e. the trial that consumed more bytes.
             int eucEnd = eucView.position();
             int sjisEnd = sjisView.position();
             if (eucEnd != sjisEnd) {
                 return decodeLoop(eucEnd > sjisEnd ? eucDecoder : sjisDecoder,
                                   src, dst);
             }

             // end-of-input is after the first byte of the first char?
             if (src.position() == eucEnd) {
                 return CoderResult.UNDERFLOW;
             }

             // Use heuristic knowledge of typical Japanese text
             scratch.flip();
             DelegatableDecoder chosen
                 = looksLikeJapanese(scratch) ? eucDecoder : sjisDecoder;
             return decodeLoop(chosen, src, dst);
         }

         /** Forgets the detected delegate so the next decode detects again. */
         @Override
         protected void implReset() {
             detectedDecoder = null;
         }

         /** Flushes through the delegate when one exists, else the default flush. */
         @Override
         protected CoderResult implFlush(CharBuffer out) {
             return (detectedDecoder != null)
                 ? detectedDecoder.implFlush(out)
                 : super.implFlush(out);
         }

         @Override
         public boolean isAutoDetecting() {
             return true;
         }

         @Override
         public boolean isCharsetDetected() {
             return detectedDecoder != null;
         }

         /**
          * Returns the charset of the delegate decoder.
          *
          * @throws IllegalStateException if no delegate has been chosen yet
          */
         @Override
         public Charset detectedCharset() {
             if (detectedDecoder == null) {
                 throw new IllegalStateException("charset not yet detected");
             }
             return ((CharsetDecoder) detectedDecoder).charset();
         }

         /** True when the "os.name" value is exactly "Solaris" or "SunOS". */
         private static boolean runningOnSolaris() {
             return osName.equals("Solaris") || osName.equals("SunOS");
         }

         /**
          * Returned Shift_JIS Charset name is OS dependent
          */
         private static String getSJISName() {
             if (runningOnSolaris()) {
                 return "PCK";
             }
             if (osName.startsWith("Windows")) {
                 return "windows-31J";
             }
             return "Shift_JIS";
         }

         /**
          * Returned EUC-JP Charset name is OS dependent
          */
         private static String getEUCJPName() {
             return runningOnSolaris() ? "x-eucjp-open" : "EUC_JP";
         }

     }
 }
	/*
	* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
	*
	* This code is free software; you can redistribute it and/or modify it
	* under the terms of the GNU General Public License version 2 only, as
	* published by the Free Software Foundation. Oracle designates this
	* particular file as subject to the "Classpath" exception as provided
	* by Oracle in the LICENSE file that accompanied this code.
	*
	* This code is distributed in the hope that it will be useful, but WITHOUT
	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	* version 2 for more details (a copy is included in the LICENSE file that
	* accompanied this code).
	*
	* You should have received a copy of the GNU General Public License version
	* 2 along with this work; if not, write to the Free Software Foundation,
	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
	*
	* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
	* or visit www.oracle.com if you need additional information or have any
	* questions.
	*/

	package sun.nio.cs.ext;

	import java.nio.ByteBuffer;
	import java.nio.CharBuffer;
	import java.nio.charset.Charset;
	import java.nio.charset.CharsetDecoder;
	import java.nio.charset.CharsetEncoder;
	import java.nio.charset.CoderResult;
	import java.nio.charset.CharacterCodingException;
	import java.nio.charset.MalformedInputException;
	import sun.nio.cs.DelegatableDecoder;
	import sun.nio.cs.HistoricallyNamedCharset;
	import java.security.AccessController;
	import java.security.PrivilegedAction;
	import sun.nio.cs.*;
	import static java.lang.Character.UnicodeBlock;


	public class JISAutoDetect
	extends Charset
	implements HistoricallyNamedCharset
	{

	private final static int EUCJP_MASK = 0x01;
	private final static int SJIS2B_MASK = 0x02;
	private final static int SJIS1B_MASK = 0x04;
	private final static int EUCJP_KANA1_MASK = 0x08;
	private final static int EUCJP_KANA2_MASK = 0x10;

	public JISAutoDetect() {
	super("x-JISAutoDetect", ExtendedCharsets.aliasesFor("x-JISAutoDetect"));
	}

	public boolean contains(Charset cs) {
	return ((cs.name().equals("US-ASCII"))
	\|\| (cs instanceof SJIS)
	\|\| (cs instanceof EUC_JP)
	\|\| (cs instanceof ISO2022_JP));
	}

	public boolean canEncode() {
	return false;
	}

	public CharsetDecoder newDecoder() {
	return new Decoder(this);
	}

	public String historicalName() {
	return "JISAutoDetect";
	}

	public CharsetEncoder newEncoder() {
	throw new UnsupportedOperationException();
	}

	// A heuristic algorithm for guessing if EUC-decoded text really
	// might be Japanese text. Better heuristics are possible...
	private static boolean looksLikeJapanese(CharBuffer cb) {
	int hiragana = 0; // Fullwidth Hiragana
	int katakana = 0; // Halfwidth Katakana
	while (cb.hasRemaining()) {
	char c = cb.get();
	if (0x3040 <= c && c <= 0x309f && ++hiragana > 1) return true;
	if (0xff65 <= c && c <= 0xff9f && ++katakana > 1) return true;
	}
	return false;
	}

	private static class Decoder extends CharsetDecoder {
	private final static String osName = AccessController.doPrivileged(
	(PrivilegedAction<String>) () -> System.getProperty("os.name"));

	private final static String SJISName = getSJISName();
	private final static String EUCJPName = getEUCJPName();
	private DelegatableDecoder detectedDecoder = null;

	public Decoder(Charset cs) {
	super(cs, 0.5f, 1.0f);
	}

	private static boolean isPlainASCII(byte b) {
	return b >= 0 && b != 0x1b;
	}

	private static void copyLeadingASCII(ByteBuffer src, CharBuffer dst) {
	int start = src.position();
	int limit = start + Math.min(src.remaining(), dst.remaining());
	int p;
	byte b;
	for (p = start; p < limit && isPlainASCII(b = src.get(p)); p++)
	dst.put((char)(b & 0xff));
	src.position(p);
	}

	private CoderResult decodeLoop(DelegatableDecoder decoder,
	ByteBuffer src, CharBuffer dst) {
	((CharsetDecoder)decoder).reset();
	detectedDecoder = decoder;
	return detectedDecoder.decodeLoop(src, dst);
	}

	protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) {
	if (detectedDecoder == null) {
	copyLeadingASCII(src, dst);

	// All ASCII?
	if (! src.hasRemaining())
	return CoderResult.UNDERFLOW;
	// Overflow only if there is still ascii but no out buffer.
	if (!dst.hasRemaining() &&
	isPlainASCII(src.get(src.position())))
	return CoderResult.OVERFLOW;

	// We need to perform double, not float, arithmetic; otherwise
	// we lose low order bits when src is larger than 2**24.
	int cbufsiz = (int)(src.limit() * (double)maxCharsPerByte());
	CharBuffer sandbox = CharBuffer.allocate(cbufsiz);

	// First try ISO-2022-JP, since there is no ambiguity
	Charset cs2022 = Charset.forName("ISO-2022-JP");
	DelegatableDecoder dd2022
	= (DelegatableDecoder) cs2022.newDecoder();
	ByteBuffer src2022 = src.asReadOnlyBuffer();
	CoderResult res2022 = dd2022.decodeLoop(src2022, sandbox);
	if (! res2022.isError())
	return decodeLoop(dd2022, src, dst);

	// We must choose between EUC and SJIS
	Charset csEUCJ = Charset.forName(EUCJPName);
	Charset csSJIS = Charset.forName(SJISName);

	DelegatableDecoder ddEUCJ
	= (DelegatableDecoder) csEUCJ.newDecoder();
	DelegatableDecoder ddSJIS
	= (DelegatableDecoder) csSJIS.newDecoder();

	ByteBuffer srcEUCJ = src.asReadOnlyBuffer();
	sandbox.clear();
	CoderResult resEUCJ = ddEUCJ.decodeLoop(srcEUCJ, sandbox);
	// If EUC decoding fails, must be SJIS
	if (resEUCJ.isError())
	return decodeLoop(ddSJIS, src, dst);
	ByteBuffer srcSJIS = src.asReadOnlyBuffer();
	CharBuffer sandboxSJIS = CharBuffer.allocate(cbufsiz);
	CoderResult resSJIS = ddSJIS.decodeLoop(srcSJIS, sandboxSJIS);
	// If SJIS decoding fails, must be EUC
	if (resSJIS.isError())
	return decodeLoop(ddEUCJ, src, dst);

	// From here on, we have some ambiguity, and must guess.

	// We prefer input that does not appear to end mid-character.
	if (srcEUCJ.position() > srcSJIS.position())
	return decodeLoop(ddEUCJ, src, dst);

	if (srcEUCJ.position() < srcSJIS.position())
	return decodeLoop(ddSJIS, src, dst);

	// end-of-input is after the first byte of the first char?
	if (src.position() == srcEUCJ.position())
	return CoderResult.UNDERFLOW;

	// Use heuristic knowledge of typical Japanese text
	sandbox.flip();
	return decodeLoop(looksLikeJapanese(sandbox) ? ddEUCJ : ddSJIS,
	src, dst);
	}

	return detectedDecoder.decodeLoop(src, dst);
	}

	protected void implReset() {
	detectedDecoder = null;
	}

	protected CoderResult implFlush(CharBuffer out) {
	if (detectedDecoder != null)
	return detectedDecoder.implFlush(out);
	else
	return super.implFlush(out);
	}

	public boolean isAutoDetecting() {
	return true;
	}

	public boolean isCharsetDetected() {
	return detectedDecoder != null;
	}

	public Charset detectedCharset() {
	if (detectedDecoder == null)
	throw new IllegalStateException("charset not yet detected");
	return ((CharsetDecoder) detectedDecoder).charset();
	}


	/**
	* Returned Shift_JIS Charset name is OS dependent
	*/
	private static String getSJISName() {
	if (osName.equals("Solaris") \|\| osName.equals("SunOS"))
	return("PCK");
	else if (osName.startsWith("Windows"))
	return("windows-31J");
	else
	return("Shift_JIS");
	}

	/**
	* Returned EUC-JP Charset name is OS dependent
	*/

	private static String getEUCJPName() {
	if (osName.equals("Solaris") \|\| osName.equals("SunOS"))
	return("x-eucjp-open");
	else
	return("EUC_JP");
	}

	}
	}