src/main/java/org/yaml/snakeyaml/external/com/google/gdata/util/common/base/PercentEscaper.java - platform/external/snakeyaml - Git at Google

 /* Copyright (c) 2008 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.yaml.snakeyaml.external.com.google.gdata.util.common.base;

 /**
  * A {@code UnicodeEscaper} that escapes some set of Java characters using the
  * URI percent encoding scheme. The set of safe characters (those which remain
  * unescaped) can be specified on construction.
  *
  * <p>
  * For details on escaping URIs for use in web pages, see section 2.4 of <a
  * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
  *
  * <p>
  * In most cases this class should not need to be used directly. If you have no
  * special requirements for escaping your URIs, you should use either
  * {@link CharEscapers#uriEscaper()} or {@link CharEscapers#uriEscaper(boolean)}.
  *
  * <p>
  * When encoding a String, the following rules apply:
  * <ul>
  * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
  * through "9" remain the same.
  * <li>Any additionally specified safe characters remain the same.
  * <li>If {@code plusForSpace} was specified, the space character " " is
  * converted into a plus sign "+".
  * <li>All other characters are converted into one or more bytes using UTF-8
  * encoding and each byte is then represented by the 3-character string "%XY",
  * where "XY" is the two-digit, uppercase, hexadecimal representation of the
  * byte value.
  * </ul>
  *
  * <p>
  * RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
  * "~", "*", "'", "(" and ")". It goes on to state:
  *
  * <p>
  * <i>Unreserved characters can be escaped without changing the semantics of the
  * URI, but this should not be done unless the URI is being used in a context
  * that does not allow the unescaped character to appear.</i>
  *
  * <p>
  * For performance reasons the only currently supported character encoding of
  * this class is UTF-8.
  *
  * <p>
  * <b>Note</b>: This escaper produces uppercase hexadecimal sequences. From <a
  * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
  * <i>"URI producers and normalizers should use uppercase hexadecimal digits for
  * all percent-encodings."</i>
  *
  *
  */
 public class PercentEscaper extends UnicodeEscaper {
     /**
      * A string of safe characters that mimics the behavior of
      * {@link java.net.URLEncoder}.
      */
     public static final String SAFECHARS_URLENCODER = "-_.*";

     /**
      * A string of characters that do not need to be encoded when used in URI
      * path segments, as specified in RFC 3986. Note that some of these
      * characters do need to be escaped when used in other parts of the URI.
      */
     public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";

     /**
      * A string of characters that do not need to be encoded when used in URI
      * query strings, as specified in RFC 3986. Note that some of these
      * characters do need to be escaped when used in other parts of the URI.
      */
     public static final String SAFEQUERYSTRINGCHARS_URLENCODER = "-_.!~*'()@:$,;/?:";

     // In some uri escapers spaces are escaped to '+'. This single array
     // instance is returned by escape(int) for every such space, so it must be
     // treated as read-only.
     private static final char[] URI_ESCAPED_SPACE = { '+' };

     // Lookup table mapping a nibble (0..15) to its uppercase hex digit.
     private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray();

     /**
      * If true we should convert space to the {@code +} character.
      */
     private final boolean plusForSpace;

     /**
      * An array of flags where for any {@code char c} if {@code safeOctets[c]}
      * is true then {@code c} should remain unmodified in the output. If
      * {@code c >= safeOctets.length} then it should be escaped.
      */
     private final boolean[] safeOctets;

     /**
      * Constructs a URI escaper with the specified safe characters and optional
      * handling of the space character.
      *
      * @param safeChars
      *            a non null string specifying additional safe characters for
      *            this escaper (the ranges 0..9, a..z and A..Z are always safe
      *            and should not be specified here)
      * @param plusForSpace
      *            true if ASCII space should be escaped to {@code +} rather than
      *            {@code %20}
      * @throws NullPointerException
      *             if {@code safeChars} is null
      * @throws IllegalArgumentException
      *             if {@code safeChars} contains an ASCII alphanumeric
      *             character or {@code '%'}, or contains a space while
      *             {@code plusForSpace} is true
      */
     public PercentEscaper(String safeChars, boolean plusForSpace) {
         if (safeChars == null) {
             throw new NullPointerException("safeChars");
         }
         // Avoid any misunderstandings about the behavior of this escaper.
         // A character scan is used instead of a ".*[0-9A-Za-z].*" regex
         // because '.' does not match line terminators, which would let an
         // alphanumeric slip through whenever safeChars contains e.g. '\n'.
         if (containsAsciiAlphanumeric(safeChars)) {
             throw new IllegalArgumentException(
                     "Alphanumeric characters are always 'safe' and should not be "
                             + "explicitly specified");
         }
         // Avoid ambiguous parameters. Safe characters are never modified so if
         // space is a safe character then setting plusForSpace is meaningless.
         if (plusForSpace && safeChars.contains(" ")) {
             throw new IllegalArgumentException(
                     "plusForSpace cannot be specified when space is a 'safe' character");
         }
         // '%' introduces every escape sequence, so leaving it unescaped would
         // make the output ambiguous.
         if (safeChars.contains("%")) {
             throw new IllegalArgumentException("The '%' character cannot be specified as 'safe'");
         }
         this.plusForSpace = plusForSpace;
         this.safeOctets = createSafeOctets(safeChars);
     }

     /**
      * Returns true if the given string contains at least one character in the
      * ranges 0-9, A-Z or a-z.
      */
     private static boolean containsAsciiAlphanumeric(String s) {
         for (int i = 0; i < s.length(); i++) {
             char c = s.charAt(i);
             if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
                 return true;
             }
         }
         return false;
     }

     /**
      * Creates a boolean[] with entries corresponding to the character values
      * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
      * is as small as is required to hold the given character information.
      */
     private static boolean[] createSafeOctets(String safeChars) {
         // 'z' is the largest always-safe character, so the table is never
         // shorter than 'z' + 1 entries.
         int maxChar = 'z';
         char[] safeCharArray = safeChars.toCharArray();
         for (char c : safeCharArray) {
             maxChar = Math.max(c, maxChar);
         }
         boolean[] octets = new boolean[maxChar + 1];
         for (int c = '0'; c <= '9'; c++) {
             octets[c] = true;
         }
         for (int c = 'A'; c <= 'Z'; c++) {
             octets[c] = true;
         }
         for (int c = 'a'; c <= 'z'; c++) {
             octets[c] = true;
         }
         for (char c : safeCharArray) {
             octets[c] = true;
         }
         return octets;
     }

     /*
      * Overridden for performance. For unescaped strings this improved the
      * performance of the uri escaper from ~760ns to ~400ns as measured by
      * {@link CharEscapersBenchmark}.
      *
      * Returns the index of the first char in [index, end) that is not flagged
      * safe, or end if every char in that range is safe.
      */
     @Override
     protected int nextEscapeIndex(CharSequence csq, int index, int end) {
         for (; index < end; index++) {
             char c = csq.charAt(index);
             if (c >= safeOctets.length || !safeOctets[c]) {
                 break;
             }
         }
         return index;
     }

     /*
      * Overridden for performance. For unescaped strings this improved the
      * performance of the uri escaper from ~400ns to ~170ns as measured by
      * {@link CharEscapersBenchmark}.
      *
      * Returns the input instance itself when every char is safe; otherwise
      * hands over to the inherited escapeSlow starting at the first unsafe
      * index.
      */
     @Override
     public String escape(String s) {
         int slen = s.length();
         for (int index = 0; index < slen; index++) {
             char c = s.charAt(index);
             if (c >= safeOctets.length || !safeOctets[c]) {
                 return escapeSlow(s, index);
             }
         }
         return s;
     }

     /**
      * Escapes the given Unicode code point in UTF-8.
      *
      * @param cp
      *            the code point to escape
      * @return {@code null} if the code point is safe and needs no escaping,
      *         a single {@code '+'} for a space when {@code plusForSpace} is
      *         set, otherwise one {@code %XY} triplet per UTF-8 byte
      * @throws IllegalArgumentException
      *             if {@code cp} is greater than {@code 0x10ffff}
      */
     @Override
     protected char[] escape(int cp) {
         // We should never get negative values here but if we do it will throw
         // an IndexOutOfBoundsException, so at least it will get spotted.
         if (cp < safeOctets.length && safeOctets[cp]) {
             return null;
         } else if (cp == ' ' && plusForSpace) {
             return URI_ESCAPED_SPACE;
         } else if (cp <= 0x7F) {
             // Single byte UTF-8 characters
             // Start with "%--" and fill in the blanks
             char[] dest = new char[3];
             dest[0] = '%';
             dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
             dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
             return dest;
         } else if (cp <= 0x7ff) {
             // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
             // Start with "%--%--" and fill in the blanks.
             // Digits are filled from the lowest bits upwards; each
             // continuation byte carries 6 payload bits under a "10" prefix,
             // hence the alternating 4-bit / 2-bit shifts.
             char[] dest = new char[6];
             dest[0] = '%';
             dest[3] = '%';
             dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
             cp >>>= 4;
             dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
             cp >>>= 2;
             dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
             cp >>>= 4;
             dest[1] = UPPER_HEX_DIGITS[0xC | cp];
             return dest;
         } else if (cp <= 0xffff) {
             // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
             // Start with "%E-%--%--" and fill in the blanks
             char[] dest = new char[9];
             dest[0] = '%';
             dest[1] = 'E';
             dest[3] = '%';
             dest[6] = '%';
             dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
             cp >>>= 4;
             dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
             cp >>>= 2;
             dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
             cp >>>= 4;
             dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
             cp >>>= 2;
             // Only the top 4 bits of the original 16-bit value remain here.
             dest[2] = UPPER_HEX_DIGITS[cp];
             return dest;
         } else if (cp <= 0x10ffff) {
             char[] dest = new char[12];
             // Four byte UTF-8 characters [cp >= 0x10000 && cp <= 0x10ffff]
             // Start with "%F-%--%--%--" and fill in the blanks
             dest[0] = '%';
             dest[1] = 'F';
             dest[3] = '%';
             dest[6] = '%';
             dest[9] = '%';
             dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
             cp >>>= 4;
             dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
             cp >>>= 2;
             dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
             cp >>>= 4;
             dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
             cp >>>= 2;
             dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
             cp >>>= 4;
             dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
             cp >>>= 2;
             dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
             return dest;
         } else {
             // If this ever happens it is due to bug in UnicodeEscaper, not bad
             // input.
             throw new IllegalArgumentException("Invalid unicode character value " + cp);
         }
     }
 }
	/* Copyright (c) 2008 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.yaml.snakeyaml.external.com.google.gdata.util.common.base;

	/**
	* A {@code UnicodeEscaper} that escapes some set of Java characters using the
	* URI percent encoding scheme. The set of safe characters (those which remain
	* unescaped) can be specified on construction.
	*
	* <p>
	* For details on escaping URIs for use in web pages, see section 2.4 of <a
	* href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
	*
	* <p>
	* In most cases this class should not need to be used directly. If you have no
	* special requirements for escaping your URIs, you should use either
	* {@link CharEscapers#uriEscaper()} or {@link CharEscapers#uriEscaper(boolean)}.
	*
	* <p>
	* When encoding a String, the following rules apply:
	* <ul>
	* <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
	* through "9" remain the same.
	* <li>Any additionally specified safe characters remain the same.
	* <li>If {@code plusForSpace} was specified, the space character " " is
	* converted into a plus sign "+".
	* <li>All other characters are converted into one or more bytes using UTF-8
	* encoding and each byte is then represented by the 3-character string "%XY",
	* where "XY" is the two-digit, uppercase, hexadecimal representation of the
	* byte value.
	* </ul>
	*
	* <p>
	* RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
	* "~", "*", "'", "(" and ")". It goes on to state:
	*
	* <p>
	* <i>Unreserved characters can be escaped without changing the semantics of the
	* URI, but this should not be done unless the URI is being used in a context
	* that does not allow the unescaped character to appear.</i>
	*
	* <p>
	* For performance reasons the only currently supported character encoding of
	* this class is UTF-8.
	*
	* <p>
	* <b>Note</b>: This escaper produces uppercase hexadecimal sequences. From <a
	* href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
	* <i>"URI producers and normalizers should use uppercase hexadecimal digits for
	* all percent-encodings."</i>
	*
	*
	*/
	public class PercentEscaper extends UnicodeEscaper {
	/**
	* A string of safe characters that mimics the behavior of
	* {@link java.net.URLEncoder}.
	*
	*/
	public static final String SAFECHARS_URLENCODER = "-_.*";

	/**
	* A string of characters that do not need to be encoded when used in URI
	* path segments, as specified in RFC 3986. Note that some of these
	* characters do need to be escaped when used in other parts of the URI.
	*/
	public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";

	/**
	* A string of characters that do not need to be encoded when used in URI
	* query strings, as specified in RFC 3986. Note that some of these
	* characters do need to be escaped when used in other parts of the URI.
	*/
	public static final String SAFEQUERYSTRINGCHARS_URLENCODER = "-_.!~*'()@:$,;/?:";

	// In some uri escapers spaces are escaped to '+'
	private static final char[] URI_ESCAPED_SPACE = { '+' };

	private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray();

	/**
	* If true we should convert space to the {@code +} character.
	*/
	private final boolean plusForSpace;

	/**
	* An array of flags where for any {@code char c} if {@code safeOctets[c]}
	* is true then {@code c} should remain unmodified in the output. If
	* {@code c > safeOctets.length} then it should be escaped.
	*/
	private final boolean[] safeOctets;

	/**
	* Constructs a URI escaper with the specified safe characters and optional
	* handling of the space character.
	*
	* @param safeChars
	* a non null string specifying additional safe characters for
	* this escaper (the ranges 0..9, a..z and A..Z are always safe
	* and should not be specified here)
	* @param plusForSpace
	* true if ASCII space should be escaped to {@code +} rather than
	* {@code %20}
	* @throws IllegalArgumentException
	* if any of the parameters were invalid
	*/
	public PercentEscaper(String safeChars, boolean plusForSpace) {
	// Avoid any misunderstandings about the behavior of this escaper
	if (safeChars.matches(".[0-9A-Za-z].")) {
	throw new IllegalArgumentException(
	"Alphanumeric characters are always 'safe' and should not be "
	+ "explicitly specified");
	}
	// Avoid ambiguous parameters. Safe characters are never modified so if
	// space is a safe character then setting plusForSpace is meaningless.
	if (plusForSpace && safeChars.contains(" ")) {
	throw new IllegalArgumentException(
	"plusForSpace cannot be specified when space is a 'safe' character");
	}
	if (safeChars.contains("%")) {
	throw new IllegalArgumentException("The '%' character cannot be specified as 'safe'");
	}
	this.plusForSpace = plusForSpace;
	this.safeOctets = createSafeOctets(safeChars);
	}

	/**
	* Creates a boolean[] with entries corresponding to the character values
	* for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
	* is as small as is required to hold the given character information.
	*/
	private static boolean[] createSafeOctets(String safeChars) {
	int maxChar = 'z';
	char[] safeCharArray = safeChars.toCharArray();
	for (char c : safeCharArray) {
	maxChar = Math.max(c, maxChar);
	}
	boolean[] octets = new boolean[maxChar + 1];
	for (int c = '0'; c <= '9'; c++) {
	octets[c] = true;
	}
	for (int c = 'A'; c <= 'Z'; c++) {
	octets[c] = true;
	}
	for (int c = 'a'; c <= 'z'; c++) {
	octets[c] = true;
	}
	for (char c : safeCharArray) {
	octets[c] = true;
	}
	return octets;
	}

	/*
	* Overridden for performance. For unescaped strings this improved the
	* performance of the uri escaper from ~760ns to ~400ns as measured by
	* {@link CharEscapersBenchmark}.
	*/
	@Override
	protected int nextEscapeIndex(CharSequence csq, int index, int end) {
	for (; index < end; index++) {
	char c = csq.charAt(index);
	if (c >= safeOctets.length \|\| !safeOctets[c]) {
	break;
	}
	}
	return index;
	}

	/*
	* Overridden for performance. For unescaped strings this improved the
	* performance of the uri escaper from ~400ns to ~170ns as measured by
	* {@link CharEscapersBenchmark}.
	*/
	@Override
	public String escape(String s) {
	int slen = s.length();
	for (int index = 0; index < slen; index++) {
	char c = s.charAt(index);
	if (c >= safeOctets.length \|\| !safeOctets[c]) {
	return escapeSlow(s, index);
	}
	}
	return s;
	}

	/**
	* Escapes the given Unicode code point in UTF-8.
	*/
	@Override
	protected char[] escape(int cp) {
	// We should never get negative values here but if we do it will throw
	// an
	// IndexOutOfBoundsException, so at least it will get spotted.
	if (cp < safeOctets.length && safeOctets[cp]) {
	return null;
	} else if (cp == ' ' && plusForSpace) {
	return URI_ESCAPED_SPACE;
	} else if (cp <= 0x7F) {
	// Single byte UTF-8 characters
	// Start with "%--" and fill in the blanks
	char[] dest = new char[3];
	dest[0] = '%';
	dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
	dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
	return dest;
	} else if (cp <= 0x7ff) {
	// Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
	// Start with "%--%--" and fill in the blanks
	char[] dest = new char[6];
	dest[0] = '%';
	dest[3] = '%';
	dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
	cp >>>= 4;
	dest[4] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
	cp >>>= 2;
	dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
	cp >>>= 4;
	dest[1] = UPPER_HEX_DIGITS[0xC \| cp];
	return dest;
	} else if (cp <= 0xffff) {
	// Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
	// Start with "%E-%--%--" and fill in the blanks
	char[] dest = new char[9];
	dest[0] = '%';
	dest[1] = 'E';
	dest[3] = '%';
	dest[6] = '%';
	dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
	cp >>>= 4;
	dest[7] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
	cp >>>= 2;
	dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
	cp >>>= 4;
	dest[4] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
	cp >>>= 2;
	dest[2] = UPPER_HEX_DIGITS[cp];
	return dest;
	} else if (cp <= 0x10ffff) {
	char[] dest = new char[12];
	// Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
	// Start with "%F-%--%--%--" and fill in the blanks
	dest[0] = '%';
	dest[1] = 'F';
	dest[3] = '%';
	dest[6] = '%';
	dest[9] = '%';
	dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
	cp >>>= 4;
	dest[10] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
	cp >>>= 2;
	dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
	cp >>>= 4;
	dest[7] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
	cp >>>= 2;
	dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
	cp >>>= 4;
	dest[4] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
	cp >>>= 2;
	dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
	return dest;
	} else {
	// If this ever happens it is due to bug in UnicodeEscaper, not bad
	// input.
	throw new IllegalArgumentException("Invalid unicode character value " + cp);
	}
	}
	}