| /* Copyright (c) 2008 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.yaml.snakeyaml.external.com.google.gdata.util.common.base; |
| |
| /** |
| * A {@code UnicodeEscaper} that escapes some set of Java characters using the |
| * URI percent encoding scheme. The set of safe characters (those which remain |
| * unescaped) can be specified on construction. |
| * |
| * <p> |
| * For details on escaping URIs for use in web pages, see section 2.4 of <a |
| * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>. |
| * |
| * <p> |
| * In most cases this class should not need to be used directly. If you have no |
| * special requirements for escaping your URIs, you should use either |
| * {@link CharEscapers#uriEscaper()} or {@link CharEscapers#uriEscaper(boolean)}. |
| * |
| * <p> |
| * When encoding a String, the following rules apply: |
| * <ul> |
| * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" |
| * through "9" remain the same. |
| * <li>Any additionally specified safe characters remain the same. |
| * <li>If {@code plusForSpace} was specified, the space character " " is |
| * converted into a plus sign "+". |
| * <li>All other characters are converted into one or more bytes using UTF-8 |
| * encoding and each byte is then represented by the 3-character string "%XY", |
| * where "XY" is the two-digit, uppercase, hexadecimal representation of the |
| * byte value. |
| * </ul> |
| * |
| * <p> |
| * RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!", |
| * "~", "*", "'", "(" and ")". It goes on to state: |
| * |
| * <p> |
| * <i>Unreserved characters can be escaped without changing the semantics of the |
| * URI, but this should not be done unless the URI is being used in a context |
| * that does not allow the unescaped character to appear.</i> |
| * |
| * <p> |
| * For performance reasons the only currently supported character encoding of |
| * this class is UTF-8. |
| * |
| * <p> |
| * <b>Note</b>: This escaper produces uppercase hexidecimal sequences. From <a |
| * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br> |
| * <i>"URI producers and normalizers should use uppercase hexadecimal digits for |
| * all percent-encodings."</i> |
| * |
| * |
| */ |
| public class PercentEscaper extends UnicodeEscaper { |
| /** |
| * A string of safe characters that mimics the behavior of |
| * {@link java.net.URLEncoder}. |
| * |
| */ |
| public static final String SAFECHARS_URLENCODER = "-_.*"; |
| |
| /** |
| * A string of characters that do not need to be encoded when used in URI |
| * path segments, as specified in RFC 3986. Note that some of these |
| * characters do need to be escaped when used in other parts of the URI. |
| */ |
| public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;="; |
| |
| /** |
| * A string of characters that do not need to be encoded when used in URI |
| * query strings, as specified in RFC 3986. Note that some of these |
| * characters do need to be escaped when used in other parts of the URI. |
| */ |
| public static final String SAFEQUERYSTRINGCHARS_URLENCODER = "-_.!~*'()@:$,;/?:"; |
| |
| // In some uri escapers spaces are escaped to '+' |
| private static final char[] URI_ESCAPED_SPACE = { '+' }; |
| |
| private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray(); |
| |
| /** |
| * If true we should convert space to the {@code +} character. |
| */ |
| private final boolean plusForSpace; |
| |
| /** |
| * An array of flags where for any {@code char c} if {@code safeOctets[c]} |
| * is true then {@code c} should remain unmodified in the output. If |
| * {@code c > safeOctets.length} then it should be escaped. |
| */ |
| private final boolean[] safeOctets; |
| |
| /** |
| * Constructs a URI escaper with the specified safe characters and optional |
| * handling of the space character. |
| * |
| * @param safeChars |
| * a non null string specifying additional safe characters for |
| * this escaper (the ranges 0..9, a..z and A..Z are always safe |
| * and should not be specified here) |
| * @param plusForSpace |
| * true if ASCII space should be escaped to {@code +} rather than |
| * {@code %20} |
| * @throws IllegalArgumentException |
| * if any of the parameters were invalid |
| */ |
| public PercentEscaper(String safeChars, boolean plusForSpace) { |
| // Avoid any misunderstandings about the behavior of this escaper |
| if (safeChars.matches(".*[0-9A-Za-z].*")) { |
| throw new IllegalArgumentException( |
| "Alphanumeric characters are always 'safe' and should not be " |
| + "explicitly specified"); |
| } |
| // Avoid ambiguous parameters. Safe characters are never modified so if |
| // space is a safe character then setting plusForSpace is meaningless. |
| if (plusForSpace && safeChars.contains(" ")) { |
| throw new IllegalArgumentException( |
| "plusForSpace cannot be specified when space is a 'safe' character"); |
| } |
| if (safeChars.contains("%")) { |
| throw new IllegalArgumentException("The '%' character cannot be specified as 'safe'"); |
| } |
| this.plusForSpace = plusForSpace; |
| this.safeOctets = createSafeOctets(safeChars); |
| } |
| |
| /** |
| * Creates a boolean[] with entries corresponding to the character values |
| * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array |
| * is as small as is required to hold the given character information. |
| */ |
| private static boolean[] createSafeOctets(String safeChars) { |
| int maxChar = 'z'; |
| char[] safeCharArray = safeChars.toCharArray(); |
| for (char c : safeCharArray) { |
| maxChar = Math.max(c, maxChar); |
| } |
| boolean[] octets = new boolean[maxChar + 1]; |
| for (int c = '0'; c <= '9'; c++) { |
| octets[c] = true; |
| } |
| for (int c = 'A'; c <= 'Z'; c++) { |
| octets[c] = true; |
| } |
| for (int c = 'a'; c <= 'z'; c++) { |
| octets[c] = true; |
| } |
| for (char c : safeCharArray) { |
| octets[c] = true; |
| } |
| return octets; |
| } |
| |
| /* |
| * Overridden for performance. For unescaped strings this improved the |
| * performance of the uri escaper from ~760ns to ~400ns as measured by |
| * {@link CharEscapersBenchmark}. |
| */ |
| @Override |
| protected int nextEscapeIndex(CharSequence csq, int index, int end) { |
| for (; index < end; index++) { |
| char c = csq.charAt(index); |
| if (c >= safeOctets.length || !safeOctets[c]) { |
| break; |
| } |
| } |
| return index; |
| } |
| |
| /* |
| * Overridden for performance. For unescaped strings this improved the |
| * performance of the uri escaper from ~400ns to ~170ns as measured by |
| * {@link CharEscapersBenchmark}. |
| */ |
| @Override |
| public String escape(String s) { |
| int slen = s.length(); |
| for (int index = 0; index < slen; index++) { |
| char c = s.charAt(index); |
| if (c >= safeOctets.length || !safeOctets[c]) { |
| return escapeSlow(s, index); |
| } |
| } |
| return s; |
| } |
| |
| /** |
| * Escapes the given Unicode code point in UTF-8. |
| */ |
| @Override |
| protected char[] escape(int cp) { |
| // We should never get negative values here but if we do it will throw |
| // an |
| // IndexOutOfBoundsException, so at least it will get spotted. |
| if (cp < safeOctets.length && safeOctets[cp]) { |
| return null; |
| } else if (cp == ' ' && plusForSpace) { |
| return URI_ESCAPED_SPACE; |
| } else if (cp <= 0x7F) { |
| // Single byte UTF-8 characters |
| // Start with "%--" and fill in the blanks |
| char[] dest = new char[3]; |
| dest[0] = '%'; |
| dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; |
| dest[1] = UPPER_HEX_DIGITS[cp >>> 4]; |
| return dest; |
| } else if (cp <= 0x7ff) { |
| // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff] |
| // Start with "%--%--" and fill in the blanks |
| char[] dest = new char[6]; |
| dest[0] = '%'; |
| dest[3] = '%'; |
| dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; |
| cp >>>= 4; |
| dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; |
| cp >>>= 2; |
| dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; |
| cp >>>= 4; |
| dest[1] = UPPER_HEX_DIGITS[0xC | cp]; |
| return dest; |
| } else if (cp <= 0xffff) { |
| // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff] |
| // Start with "%E-%--%--" and fill in the blanks |
| char[] dest = new char[9]; |
| dest[0] = '%'; |
| dest[1] = 'E'; |
| dest[3] = '%'; |
| dest[6] = '%'; |
| dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; |
| cp >>>= 4; |
| dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; |
| cp >>>= 2; |
| dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; |
| cp >>>= 4; |
| dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; |
| cp >>>= 2; |
| dest[2] = UPPER_HEX_DIGITS[cp]; |
| return dest; |
| } else if (cp <= 0x10ffff) { |
| char[] dest = new char[12]; |
| // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff] |
| // Start with "%F-%--%--%--" and fill in the blanks |
| dest[0] = '%'; |
| dest[1] = 'F'; |
| dest[3] = '%'; |
| dest[6] = '%'; |
| dest[9] = '%'; |
| dest[11] = UPPER_HEX_DIGITS[cp & 0xF]; |
| cp >>>= 4; |
| dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; |
| cp >>>= 2; |
| dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; |
| cp >>>= 4; |
| dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; |
| cp >>>= 2; |
| dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; |
| cp >>>= 4; |
| dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; |
| cp >>>= 2; |
| dest[2] = UPPER_HEX_DIGITS[cp & 0x7]; |
| return dest; |
| } else { |
| // If this ever happens it is due to bug in UnicodeEscaper, not bad |
| // input. |
| throw new IllegalArgumentException("Invalid unicode character value " + cp); |
| } |
| } |
| } |