| // Copyright (c) 2012, Mike Samuel |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions |
| // are met: |
| // |
| // Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // Redistributions in binary form must reproduce the above copyright |
| // notice, this list of conditions and the following disclaimer in the |
| // documentation and/or other materials provided with the distribution. |
| // Neither the name of the OWASP nor the names of its contributors may |
| // be used to endorse or promote products derived from this software |
| // without specific prior written permission. |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
| // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| // POSSIBILITY OF SUCH DAMAGE. |
| |
| package org.owasp.html; |
| |
| import java.io.IOException; |
| |
| import com.google.common.annotations.VisibleForTesting; |
| |
| /** Encoders and decoders for HTML. */ |
| final class Encoding { |
| |
| /** |
| * Decodes HTML entities to produce a string containing only valid |
| * Unicode scalar values. |
| */ |
| @VisibleForTesting |
| static String decodeHtml(String s) { |
| int firstAmp = s.indexOf('&'); |
| int safeLimit = longestPrefixOfGoodCodeunits(s); |
| if ((firstAmp & safeLimit) < 0) { return s; } |
| |
| StringBuilder sb; |
| { |
| int n = s.length(); |
| sb = new StringBuilder(n); |
| int pos = 0; |
| int amp = firstAmp; |
| while (amp >= 0) { |
| long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n); |
| int end = (int) (endAndCodepoint >>> 32); |
| int codepoint = (int) endAndCodepoint; |
| sb.append(s, pos, amp).appendCodePoint(codepoint); |
| pos = end; |
| amp = s.indexOf('&', end); |
| } |
| sb.append(s, pos, n); |
| } |
| |
| stripBannedCodeunits( |
| sb, |
| firstAmp < 0 |
| ? safeLimit : safeLimit < 0 |
| ? firstAmp : Math.min(firstAmp, safeLimit)); |
| |
| return sb.toString(); |
| } |
| |
| /** |
| * Returns the portion of its input that consists of XML safe chars. |
| * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a> |
| */ |
| @TCB |
| static String stripBannedCodeunits(String s) { |
| int safeLimit = longestPrefixOfGoodCodeunits(s); |
| if (safeLimit < 0) { return s; } |
| |
| StringBuilder sb = new StringBuilder(s); |
| stripBannedCodeunits(sb, safeLimit); |
| return sb.toString(); |
| } |
| |
| /** |
| * Leaves in the input buffer only code-units that comprise XML safe chars. |
| * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a> |
| */ |
| @TCB |
| static void stripBannedCodeunits(StringBuilder sb) { |
| stripBannedCodeunits(sb, 0); |
| } |
| |
| @TCB |
| private static void stripBannedCodeunits(StringBuilder sb, int start) { |
| int k = start; |
| for (int i = start, n = sb.length(); i < n; ++i) { |
| char ch = sb.charAt(i); |
| if (ch < 0x20) { |
| if (IS_BANNED_ASCII[ch]) { |
| continue; |
| } |
| } else if (0xd800 <= ch) { |
| if (ch <= 0xdfff) { |
| if (i+1 < n) { |
| char next = sb.charAt(i+1); |
| if (Character.isSurrogatePair(ch, next)) { |
| sb.setCharAt(k++, ch); |
| sb.setCharAt(k++, next); |
| ++i; |
| } |
| } |
| continue; |
| } else if ((ch & 0xfffe) == 0xfffe) { |
| continue; |
| } |
| } |
| sb.setCharAt(k++, ch); |
| } |
| sb.setLength(k); |
| } |
| |
| /** |
| * The number of code-units at the front of s that form code-points in the |
| * XML Character production. |
| * @return -1 if all of s is in the XML Character production. |
| */ |
| @TCB |
| private static int longestPrefixOfGoodCodeunits(String s) { |
| int n = s.length(), i; |
| for (i = 0; i < n; ++i) { |
| char ch = s.charAt(i); |
| if (ch < 0x20) { |
| if (IS_BANNED_ASCII[ch]) { |
| return i; |
| } |
| } else if (0xd800 <= ch) { |
| if (ch <= 0xdfff) { |
| if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) { |
| ++i; // Skip over low surrogate since we know it's ok. |
| } else { |
| return i; |
| } |
| } else if ((ch & 0xfffe) == 0xfffe) { |
| return i; |
| } |
| } |
| } |
| return -1; |
| } |
| |
| /** |
| * Writes the HTML equivalent of the given plain text to output. |
| * For example, {@code escapeHtmlOnto("1 < 2", w)}, |
| * is equivalent to {@code w.append("1 < 2")} but possibly with fewer |
| * smaller appends. |
| * Elides code-units that are not valid XML Characters. |
| * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a> |
| */ |
| @TCB |
| static void encodeHtmlOnto(String plainText, Appendable output) |
| throws IOException { |
| int n = plainText.length(); |
| int pos = 0; |
| for (int i = 0; i < n; ++i) { |
| char ch = plainText.charAt(i); |
| if (ch < REPLACEMENTS.length) { |
| String repl = REPLACEMENTS[ch]; |
| if (repl != null) { |
| output.append(plainText, pos, i).append(repl); |
| pos = i + 1; |
| } |
| } else if (((char) 0xd800) <= ch) { |
| if (ch <= ((char) 0xdfff)) { |
| char next; |
| if (i + 1 < n |
| && Character.isSurrogatePair( |
| ch, next = plainText.charAt(i + 1))) { |
| // Emit supplemental codepoints as entity so that they cannot |
| // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper |
| // and get involved in UTF-16/UCS-2 confusion. |
| int codepoint = Character.toCodePoint(ch, next); |
| output.append(plainText, pos, i); |
| appendNumericEntity(codepoint, output); |
| ++i; |
| pos = i + 1; |
| } else { |
| output.append(plainText, pos, i); |
| // Elide the orphaned surrogate. |
| pos = i + 1; |
| } |
| } else if (0xff00 <= ch) { |
| output.append(plainText, pos, i); |
| pos = i + 1; |
| // Is a control character or possible full-width version of a |
| // special character. |
| if ((ch & 0xfffe) == 0xfffe) { |
| // Elide since not an the XML Character. |
| } else { |
| appendNumericEntity(ch, output); |
| } |
| } |
| } |
| } |
| output.append(plainText, pos, n); |
| } |
| |
| @TCB |
| static void appendNumericEntity(int codepoint, Appendable output) |
| throws IOException { |
| if (codepoint < 100) { |
| // TODO: is this dead code due to REPLACEMENTS above. |
| output.append("&#"); |
| if (codepoint < 10) { |
| output.append((char) ('0' + codepoint)); |
| } else { |
| output.append((char) ('0' + (codepoint / 10))); |
| output.append((char) ('0' + (codepoint % 10))); |
| } |
| output.append(";"); |
| } else { |
| int nDigits = (codepoint < 0x1000 |
| ? codepoint < 0x100 ? 2 : 3 |
| : (codepoint < 0x10000 ? 4 |
| : codepoint < 0x100000 ? 5 : 6)); |
| output.append("&#x"); |
| for (int digit = nDigits; --digit >= 0;) { |
| int hexDigit = (codepoint >>> (digit << 2)) & 0xf; |
| output.append(HEX_NUMERAL[hexDigit]); |
| } |
| output.append(";"); |
| } |
| } |
| |
| private static final char[] HEX_NUMERAL = { |
| '0', '1', '2', '3', '4', '5', '6', '7', |
| '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', |
| }; |
| |
/**
 * Maps ASCII chars that need to be encoded to an equivalent HTML entity.
 * Indexed by char value.  A {@code null} entry means the char needs no
 * replacement; an empty string means the char is elided from the output.
 */
static final String[] REPLACEMENTS = new String[0x61];
static {
  for (int i = 0; i < ' '; ++i) {
    // We elide control characters so that we can ensure that our output is
    // in the intersection of valid HTML5 and XML.  According to
    // http://www.w3.org/TR/2008/REC-xml-20081126/#charsets
    // Char      ::=          #x9 | #xA | #xD | [#x20-#xD7FF]
    //             |          [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    if (i != '\t' && i != '\n' && i != '\r') {
      REPLACEMENTS[i] = "";  // Elide
    }
  }
  // "&#34;" is shorter than "&quot;"
  REPLACEMENTS['"']  = "&#" + ((int) '"')  + ";";  // Attribute delimiter.
  // The three HTML specials must map to their named entities; mapping them
  // to themselves would leave markup-significant characters unescaped.
  REPLACEMENTS['&']  = "&amp;";                    // HTML special.
  // We don't use &apos; since that is not in the intersection of HTML&XML.
  REPLACEMENTS['\''] = "&#" + ((int) '\'') + ";";  // Attribute delimiter.
  REPLACEMENTS['+']  = "&#" + ((int) '+')  + ";";  // UTF-7 special.
  REPLACEMENTS['<']  = "&lt;";                     // HTML special.
  REPLACEMENTS['=']  = "&#" + ((int) '=')  + ";";  // Special in attributes.
  REPLACEMENTS['>']  = "&gt;";                     // HTML special.
  REPLACEMENTS['@']  = "&#" + ((int) '@')  + ";";  // Conditional compilation.
  REPLACEMENTS['`']  = "&#" + ((int) '`')  + ";";  // Attribute delimiter.
}
| |
| /** |
| * {@code DECODES_TO_SELF[c]} is true iff the codepoint c decodes to itself in |
| * an HTML5 text node or properly quoted attribute value. |
| */ |
| private static boolean[] IS_BANNED_ASCII = new boolean[0x20]; |
| static { |
| for (int i = 0; i < IS_BANNED_ASCII.length; ++i) { |
| IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r'); |
| } |
| } |
| |
| } |