8207760: SAXException: Invalid UTF-16 surrogate detected: d83c ?
Summary: Properly handle unicode16 characters split across buffer chunks.
Reviewed-by: lancea, dfuchs
diff --git a/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java b/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java
index a68ec6d..633d0e9 100644
--- a/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java
+++ b/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java
@@ -1,6 +1,5 @@
 /*
- * reserved comment block
- * DO NOT REMOVE OR ALTER!
+ * Copyright (c) 2014, 2018, Oracle and/or its affiliates. All rights reserved.
  */
 /*
  * Copyright 2001-2004 The Apache Software Foundation.
@@ -43,6 +42,7 @@
  * because it is used from another package.
  *
  * @xsl.usage internal
+ * @LastModified: Sept 2018
  */
 public final class ToHTMLStream extends ToStream
 {
@@ -1021,7 +1021,7 @@
         String name,
         String value,
         ElemDesc elemDesc)
-        throws IOException
+        throws IOException, SAXException
     {
         writer.write(' ');
 
@@ -1345,7 +1345,7 @@
      */
     public void writeAttrString(
         final java.io.Writer writer, String string, String encoding)
-        throws IOException
+        throws IOException, SAXException
     {
         final int end = string.length();
         if (end > m_attrBuff.length)
@@ -1397,13 +1397,16 @@
                 }
                 else
                 {
-                    if (Encodings.isHighUTF16Surrogate(ch))
+                    if (Encodings.isHighUTF16Surrogate(ch) ||
+                            Encodings.isLowUTF16Surrogate(ch))
                     {
-
-                            writeUTF16Surrogate(ch, chars, i, end);
-                            i++; // two input characters processed
-                                 // this increments by one and the for()
-                                 // loop itself increments by another one.
+                        if (writeUTF16Surrogate(ch, chars, i, end) >= 0) {
+                            // move the index if the low surrogate is consumed
+                            // as writeUTF16Surrogate has written the pair
+                            if (Encodings.isHighUTF16Surrogate(ch)) {
+                                i++;
+                            }
+                        }
                     }
 
                     // The next is kind of a hack to keep from escaping in the case
diff --git a/src/com/sun/org/apache/xml/internal/serializer/ToStream.java b/src/com/sun/org/apache/xml/internal/serializer/ToStream.java
index 2301763..b4da7bb 100644
--- a/src/com/sun/org/apache/xml/internal/serializer/ToStream.java
+++ b/src/com/sun/org/apache/xml/internal/serializer/ToStream.java
@@ -1,6 +1,5 @@
 /*
- * reserved comment block
- * DO NOT REMOVE OR ALTER!
+ * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved.
  */
 /*
  * Copyright 2001-2004 The Apache Software Foundation.
@@ -51,6 +50,7 @@
  * serializers (xml, html, text ...) that write output to a stream.
  *
  * @xsl.usage internal
+ * @LastModified: Sept 2018
  */
 abstract public class ToStream extends SerializerBase
 {
@@ -200,6 +200,7 @@
      */
     private boolean m_expandDTDEntities = true;
 
+    private char m_highSurrogate = 0;
 
     /**
      * Default constructor
@@ -947,45 +948,46 @@
      * @param ch Character array.
      * @param i position Where the surrogate was detected.
      * @param end The end index of the significant characters.
-     * @return 0 if the pair of characters was written out as-is,
-     * the unicode code point of the character represented by
-     * the surrogate pair if an entity reference with that value
-     * was written out.
+     * @return the status of writing a surrogate pair.
+     *        -1 -- nothing is written
+     *         0 -- the pair is written as-is
+     *         code point -- the pair is written as an entity reference
      *
      * @throws IOException
      * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
      */
     protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
-        throws IOException
+        throws IOException, SAXException
     {
-        int codePoint = 0;
+        int status = -1;
         if (i + 1 >= end)
         {
-            throw new IOException(
-                Utils.messages.createMessage(
-                    MsgKey.ER_INVALID_UTF16_SURROGATE,
-                    new Object[] { Integer.toHexString((int) c)}));
+            m_highSurrogate = c;
+            return status;
         }
 
-        final char high = c;
-        final char low = ch[i+1];
+        char high, low;
+        if (m_highSurrogate == 0) {
+            high = c;
+            low = ch[i+1];
+            status = 0;
+        } else {
+            high = m_highSurrogate;
+            low = c;
+            m_highSurrogate = 0;
+        }
+
         if (!Encodings.isLowUTF16Surrogate(low)) {
-            throw new IOException(
-                Utils.messages.createMessage(
-                    MsgKey.ER_INVALID_UTF16_SURROGATE,
-                    new Object[] {
-                        Integer.toHexString((int) c)
-                            + " "
-                            + Integer.toHexString(low)}));
+            throwIOE(high, low);
         }
 
         final java.io.Writer writer = m_writer;
 
         // If we make it to here we have a valid high, low surrogate pair
-        if (m_encodingInfo.isInEncoding(c,low)) {
+        if (m_encodingInfo.isInEncoding(high,low)) {
             // If the character formed by the surrogate pair
             // is in the encoding, so just write it out
-            writer.write(ch,i,2);
+            writer.write(new char[]{high, low}, 0, 2);
         }
         else {
             // Don't know what to do with this char, it is
@@ -993,24 +995,16 @@
             // a surrogate pair, so write out as an entity ref
             final String encoding = getEncoding();
             if (encoding != null) {
-                /* The output encoding is known,
-                 * so somthing is wrong.
-                  */
-                codePoint = Encodings.toCodePoint(high, low);
-                // not in the encoding, so write out a character reference
-                writer.write('&');
-                writer.write('#');
-                writer.write(Integer.toString(codePoint));
-                writer.write(';');
+                status = writeCharRef(writer, high, low);
             } else {
                 /* The output encoding is not known,
                  * so just write it out as-is.
                  */
-                writer.write(ch, i, 2);
+                writer.write(new char[]{high, low}, 0, 2);
             }
         }
         // non-zero only if character reference was written out.
-        return codePoint;
+        return status;
     }
 
     /**
@@ -1100,32 +1094,7 @@
             }
             else if (isCData && (!escapingNotNeeded(c)))
             {
-                //                if (i != 0)
-                if (m_cdataTagOpen)
-                    closeCDATA();
-
-                // This needs to go into a function...
-                if (Encodings.isHighUTF16Surrogate(c))
-                {
-                    writeUTF16Surrogate(c, ch, i, end);
-                    i++ ; // process two input characters
-                }
-                else
-                {
-                    writer.write("&#");
-
-                    String intStr = Integer.toString((int) c);
-
-                    writer.write(intStr);
-                    writer.write(';');
-                }
-
-                //                if ((i != 0) && (i < (end - 1)))
-                //                if (!m_cdataTagOpen && (i < (end - 1)))
-                //                {
-                //                    writer.write(CDATA_DELIMITER_OPEN);
-                //                    m_cdataTagOpen = true;
-                //                }
+                i = handleEscaping(writer, c, ch, i, end);
             }
             else if (
                 isCData
@@ -1149,25 +1118,8 @@
                     }
                     writer.write(c);
                 }
-
-                // This needs to go into a function...
-                else if (Encodings.isHighUTF16Surrogate(c))
-                {
-                    if (m_cdataTagOpen)
-                        closeCDATA();
-                    writeUTF16Surrogate(c, ch, i, end);
-                    i++; // process two input characters
-                }
-                else
-                {
-                    if (m_cdataTagOpen)
-                        closeCDATA();
-                    writer.write("&#");
-
-                    String intStr = Integer.toString((int) c);
-
-                    writer.write(intStr);
-                    writer.write(';');
+                else {
+                    i = handleEscaping(writer, c, ch, i, end);
                 }
             }
         }
@@ -1175,6 +1127,38 @@
     }
 
     /**
+     * Handles escaping, writes either with a surrogate pair or a character
+     * reference.
+     *
+     * @param c the current char
+     * @param ch the character array
+     * @param i the current position
+     * @param end the end index of the array
+     * @return the next index
+     *
+     * @throws IOException
+     * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
+     */
+    private int handleEscaping(Writer writer, char c, char ch[], int i, int end)
+            throws IOException, SAXException {
+        if (Encodings.isHighUTF16Surrogate(c) || Encodings.isLowUTF16Surrogate(c))
+        {
+            if (writeUTF16Surrogate(c, ch, i, end) >= 0) {
+                // move the index if the low surrogate is consumed
+                // as writeUTF16Surrogate has written the pair
+                if (Encodings.isHighUTF16Surrogate(c)) {
+                    i++ ;
+                }
+            }
+        }
+        else
+        {
+            writeCharRef(writer, c);
+        }
+        return i;
+    }
+
+    /**
      * Ends an un-escaping section.
      *
      * @see #startNonEscaping
@@ -1242,7 +1226,7 @@
             }
             m_ispreserve = true;
 
-            if (shouldIndent())
+            if (!m_cdataTagOpen && shouldIndent())
                 indent();
 
             boolean writeCDataBrackets =
@@ -1564,7 +1548,7 @@
         int i,
         char ch,
         int lastDirty,
-        boolean fromTextNode) throws IOException
+        boolean fromTextNode) throws IOException, SAXException
     {
         int startClean = lastDirty + 1;
         // if we have some clean characters accumulated
@@ -1643,54 +1627,41 @@
         int len,
         boolean fromTextNode,
         boolean escLF)
-        throws IOException
+        throws IOException, SAXException
     {
 
         int pos = accumDefaultEntity(writer, ch, i, chars, len, fromTextNode, escLF);
 
         if (i == pos)
         {
+            if (m_highSurrogate != 0) {
+                if (!(Encodings.isLowUTF16Surrogate(ch))) {
+                    throwIOE(m_highSurrogate, ch);
+                }
+                writeCharRef(writer, m_highSurrogate, ch);
+                m_highSurrogate = 0;
+                return ++pos;
+            }
+
             if (Encodings.isHighUTF16Surrogate(ch))
             {
-
-                // Should be the UTF-16 low surrogate of the hig/low pair.
-                char next;
-                // Unicode code point formed from the high/low pair.
-                int codePoint = 0;
-
                 if (i + 1 >= len)
                 {
-                    throw new IOException(
-                        Utils.messages.createMessage(
-                            MsgKey.ER_INVALID_UTF16_SURROGATE,
-                            new Object[] { Integer.toHexString(ch)}));
-                    //"Invalid UTF-16 surrogate detected: "
-
-                    //+Integer.toHexString(ch)+ " ?");
+                    // save for the next read
+                    m_highSurrogate = ch;
+                    pos++;
                 }
                 else
                 {
-                    next = chars[++i];
+                    // the next should be the UTF-16 low surrogate of the hig/low pair.
+                    char next = chars[++i];
 
                     if (!(Encodings.isLowUTF16Surrogate(next)))
-                        throw new IOException(
-                            Utils.messages.createMessage(
-                                MsgKey
-                                    .ER_INVALID_UTF16_SURROGATE,
-                                new Object[] {
-                                    Integer.toHexString(ch)
-                                        + " "
-                                        + Integer.toHexString(next)}));
-                    //"Invalid UTF-16 surrogate detected: "
+                        throwIOE(ch, next);
 
-                    //+Integer.toHexString(ch)+" "+Integer.toHexString(next));
-                    codePoint = Encodings.toCodePoint(ch,next);
+                    writeCharRef(writer, ch, next);
+                    pos += 2; // count the two characters that went into writing out this entity
                 }
-
-                writer.write("&#");
-                writer.write(Integer.toString(codePoint));
-                writer.write(';');
-                pos += 2; // count the two characters that went into writing out this entity
             }
             else
             {
@@ -1702,18 +1673,14 @@
                 if (isCharacterInC0orC1Range(ch) ||
                         (XMLVERSION11.equals(getVersion()) && isNELorLSEPCharacter(ch)))
                 {
-                    writer.write("&#");
-                    writer.write(Integer.toString(ch));
-                    writer.write(';');
+                    writeCharRef(writer, ch);
                 }
                 else if ((!escapingNotNeeded(ch) ||
                     (  (fromTextNode && m_charInfo.isSpecialTextChar(ch))
                      || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch))))
-                && m_elemContext.m_currentElemDepth > 0)
+                     && m_elemContext.m_currentElemDepth > 0)
                 {
-                    writer.write("&#");
-                    writer.write(Integer.toString(ch));
-                    writer.write(';');
+                    writeCharRef(writer, ch);
                 }
                 else
                 {
@@ -1727,6 +1694,45 @@
     }
 
     /**
+     * Writes out a character reference.
+     * @param writer the writer
+     * @param c the character
+     * @throws IOException
+     */
+    private void writeCharRef(Writer writer, char c) throws IOException, SAXException {
+        if (m_cdataTagOpen)
+            closeCDATA();
+        writer.write("&#");
+        writer.write(Integer.toString(c));
+        writer.write(';');
+    }
+
+    /**
+     * Writes out a pair of surrogates as a character reference
+     * @param writer the writer
+     * @param high the high surrogate
+     * @param low the low surrogate
+     * @throws IOException
+     */
+    private int writeCharRef(Writer writer, char high, char low) throws IOException, SAXException {
+        if (m_cdataTagOpen)
+            closeCDATA();
+        // Unicode code point formed from the high/low pair.
+        int codePoint = Encodings.toCodePoint(high, low);
+        writer.write("&#");
+        writer.write(Integer.toString(codePoint));
+        writer.write(';');
+        return codePoint;
+    }
+
+    private void throwIOE(char ch, char next) throws IOException {
+        throw new IOException(Utils.messages.createMessage(
+                MsgKey.ER_INVALID_UTF16_SURROGATE,
+                new Object[] {Integer.toHexString(ch) + " "
+                        + Integer.toHexString(next)}));
+    }
+
+    /**
      * Receive notification of the beginning of an element, although this is a
      * SAX method additional namespace or attribute information can occur before
      * or after this call, that is associated with this element.
@@ -1962,7 +1968,7 @@
         Writer writer,
         String string,
         String encoding)
-        throws IOException
+        throws IOException, SAXException
     {
         final int len = string.length();
         if (len > m_attrBuff.length)
diff --git a/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java b/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java
index 9e39b89..aecf377 100644
--- a/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java
+++ b/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java
@@ -1,6 +1,5 @@
 /*
- * reserved comment block
- * DO NOT REMOVE OR ALTER!
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
  */
 /*
  * Copyright 2001-2004 The Apache Software Foundation.
@@ -35,6 +34,7 @@
  * This class converts SAX or SAX-like calls to a
  * serialized document for xsl:output method of "text".
  * @xsl.usage internal
+ * @LastModified: Sept 2018
  */
 public final class ToTextStream extends ToStream
 {
@@ -296,23 +296,32 @@
             } else if (m_encodingInfo.isInEncoding(c)) {
                 writer.write(c);
                 // one input char processed
-            } else if (Encodings.isHighUTF16Surrogate(c)) {
+            } else if (Encodings.isHighUTF16Surrogate(c) ||
+                       Encodings.isLowUTF16Surrogate(c)) {
                 final int codePoint = writeUTF16Surrogate(c, ch, i, end);
-                if (codePoint != 0) {
-                    // I think we can just emit the message,
-                    // not crash and burn.
-                    final String integralValue = Integer.toString(codePoint);
-                    final String msg = Utils.messages.createMessage(
-                        MsgKey.ER_ILLEGAL_CHARACTER,
-                        new Object[] { integralValue, encoding });
+                if (codePoint >= 0) {
+                    // move the index if the low surrogate is consumed
+                    // as writeUTF16Surrogate has written the pair
+                    if (Encodings.isHighUTF16Surrogate(c)) {
+                        i++;
+                    }
 
-                    //Older behavior was to throw the message,
-                    //but newer gentler behavior is to write a message to System.err
-                    //throw new SAXException(msg);
-                    System.err.println(msg);
+                    // printing to the console is not appropriate, but will leave
+                    // it as is for compatibility.
+                    if (codePoint >0) {
+                        // I think we can just emit the message,
+                        // not crash and burn.
+                        final String integralValue = Integer.toString(codePoint);
+                        final String msg = Utils.messages.createMessage(
+                            MsgKey.ER_ILLEGAL_CHARACTER,
+                            new Object[] { integralValue, encoding });
 
+                        //Older behavior was to throw the message,
+                        //but newer gentler behavior is to write a message to System.err
+                        //throw new SAXException(msg);
+                        System.err.println(msg);
+                    }
                 }
-                i++; // two input chars processed
             } else {
                 // Don't know what to do with this char, it is
                 // not in the encoding and not a high char in