src/org/apache/http/message/BasicTokenIterator.java - platform/external/apache-http - Git at Google

 /*
  * $HeadURL: http://svn.apache.org/repos/asf/httpcomponents/httpcore/trunk/module-main/src/main/java/org/apache/http/message/BasicTokenIterator.java $
  * $Revision: 602520 $
  * $Date: 2007-12-08 09:42:26 -0800 (Sat, 08 Dec 2007) $
  *
  * ====================================================================
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  * ====================================================================
  *
  * This software consists of voluntary contributions made by many
  * individuals on behalf of the Apache Software Foundation.  For more
  * information on the Apache Software Foundation, please see
  * <http://www.apache.org/>.
  *
  */

 package org.apache.http.message;

 import java.util.NoSuchElementException;

 import org.apache.http.HeaderIterator;
 import org.apache.http.ParseException;
 import org.apache.http.TokenIterator;

 /**
  * Basic implementation of a {@link TokenIterator}.
  * This implementation parses <tt>#token</tt> sequences as
  * defined by RFC 2616, section 2.
  * It extends that definition somewhat beyond US-ASCII.
  * <br/>
  * The iterator always looks one token ahead: the token that the next
  * call to {@link #nextToken} will return is already parsed and stored
  * in {@link #currentToken}. Consequently, a malformed header value is
  * reported by the call that returns the token <i>before</i> the
  * malformed position, or by the constructor.
  *
  * @version $Revision: 602520 $
  */
 public class BasicTokenIterator implements TokenIterator {

     /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
     // the order of the characters here is adjusted to put the
     // most likely candidates at the beginning of the collection
     public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";


     /** The iterator from which to obtain the next header. */
     protected final HeaderIterator headerIt;

     /**
      * The value of the current header.
      * This is the header value that includes {@link #currentToken}.
      * Undefined if the iteration is over.
      */
     protected String currentHeader;

     /**
      * The token to be returned by the next call to {@link #nextToken}.
      * <code>null</code> if the iteration is over.
      */
     protected String currentToken;

     /**
      * The position after {@link #currentToken} in {@link #currentHeader}.
      * Undefined if the iteration is over.
      */
     protected int searchPos;


     /**
      * Creates a new instance of {@link BasicTokenIterator}.
      * The first token, if any, is parsed immediately.
      * Note that this calls the overridable {@link #findNext findNext}
      * before any subclass constructor body has run.
      *
      * @param headerIterator    the iterator for the headers to tokenize
      *
      * @throws IllegalArgumentException  if the argument is
      *                                   <code>null</code>
      * @throws ParseException   if the first header value examined
      *                          is invalid
      */
     public BasicTokenIterator(final HeaderIterator headerIterator) {
         if (headerIterator == null) {
             throw new IllegalArgumentException
                 ("Header iterator must not be null.");
         }

         this.headerIt = headerIterator;
         this.searchPos = findNext(-1);
     }


     // non-javadoc, see interface TokenIterator
     public boolean hasNext() {
         // the look-ahead token doubles as the "more elements" flag
         return (this.currentToken != null);
     }


     /**
      * Obtains the next token from this iteration.
      *
      * @return  the next token in this iteration
      *
      * @throws NoSuchElementException   if the iteration is already over
      * @throws ParseException   if an invalid header value is encountered
      */
     public String nextToken()
         throws NoSuchElementException, ParseException {

         if (this.currentToken == null) {
             throw new NoSuchElementException("Iteration already finished.");
         }

         final String result = this.currentToken;
         // updates currentToken, may trigger ParseException:
         this.searchPos = findNext(this.searchPos);

         return result;
     }


     /**
      * Returns the next token.
      * Same as {@link #nextToken}, but with generic return type.
      *
      * @return  the next token in this iteration
      *
      * @throws NoSuchElementException   if there are no more tokens
      * @throws ParseException   if an invalid header value is encountered
      */
     public final Object next()
         throws NoSuchElementException, ParseException {
         return nextToken();
     }


     /**
      * Removing tokens is not supported.
      *
      * @throws UnsupportedOperationException    always
      */
     public final void remove()
         throws UnsupportedOperationException {

         throw new UnsupportedOperationException
             ("Removing tokens is not supported.");
     }


     /**
      * Determines the next token.
      * If found, the token is stored in {@link #currentToken}.
      * The return value indicates the position after the token
      * in {@link #currentHeader}. If necessary, the next header
      * will be obtained from {@link #headerIt}.
      * If not found, {@link #currentToken} is set to <code>null</code>.
      *
      * @param from      the position in the current header at which to
      *                  start the search, -1 to search in the first header
      *
      * @return  the position after the found token in the current header, or
      *          negative if there was no next token
      *
      * @throws ParseException   if an invalid header value is encountered
      */
     protected int findNext(int from)
         throws ParseException {

         if (from < 0) {
             // called from the constructor, initialize the first header
             if (!this.headerIt.hasNext()) {
                 // honor the documented contract on every "not found" path
                 this.currentToken = null;
                 return -1;
             }
             this.currentHeader = nextHeaderValue();
             from = 0;
         } else {
             // called after a token, make sure there is a separator
             from = findTokenSeparator(from);
         }

         final int start = findTokenStart(from);
         if (start < 0) {
             this.currentToken = null;
             return -1; // nothing found
         }

         final int end = findTokenEnd(start);
         this.currentToken = createToken(this.currentHeader, start, end);
         return end;
     }


     /**
      * Fetches the value of the next header from {@link #headerIt}.
      * A header without a value is reported as the empty string, so
      * that it is skipped like any other header without tokens instead
      * of being mistaken for the end of the iteration.
      * Must only be called if {@link #headerIt} has a next header.
      *
      * @return  the value of the next header, never <code>null</code>
      */
     private String nextHeaderValue() {
         final String value = this.headerIt.nextHeader().getValue();
         return (value != null) ? value : "";
     }


     /**
      * Creates a new token to be returned.
      * Called from {@link #findNext findNext} after the token is identified.
      * The default implementation simply calls
      * {@link java.lang.String#substring String.substring}.
      * <br/>
      * If header values are significantly longer than tokens, and some
      * tokens are permanently referenced by the application, there can
      * be problems with garbage collection. A substring will hold a
      * reference to the full characters of the original string and
      * therefore occupies more memory than might be expected.
      * To avoid this, override this method and create a new string
      * instead of a substring.
      *
      * @param value     the full header value from which to create a token
      * @param start     the index of the first token character
      * @param end       the index after the last token character
      *
      * @return  a string representing the token identified by the arguments
      */
     protected String createToken(String value, int start, int end) {
         return value.substring(start, end);
     }


     /**
      * Determines the starting position of the next token.
      * This method will iterate over headers if necessary.
      * When the headers are exhausted without finding a token,
      * {@link #currentHeader} is set to <code>null</code>.
      *
      * @param from      the position in the current header at which to
      *                  start the search
      *
      * @return  the position of the token start in the current header,
      *          negative if no token start could be found
      *
      * @throws IllegalArgumentException  if <code>from</code> is negative
      * @throws ParseException
      *         if a character is found that is neither whitespace,
      *         nor a token separator, nor a token character
      */
     protected int findTokenStart(int from) {
         if (from < 0) {
             throw new IllegalArgumentException
                 ("Search position must not be negative: " + from);
         }

         boolean found = false;
         while (!found && (this.currentHeader != null)) {

             final int to = this.currentHeader.length();
             while (!found && (from < to)) {

                 final char ch = this.currentHeader.charAt(from);
                 if (isTokenSeparator(ch) || isWhitespace(ch)) {
                     // whitespace and token separators are skipped
                     from++;
                 } else if (isTokenChar(ch)) {
                     // found the start of a token
                     found = true;
                 } else {
                     throw new ParseException
                         ("Invalid character before token (pos " + from +
                          "): " + this.currentHeader);
                 }
             }
             if (!found) {
                 // current header is used up, continue with the next one
                 if (this.headerIt.hasNext()) {
                     this.currentHeader = nextHeaderValue();
                     from = 0;
                 } else {
                     // null terminates the outer loop: iteration is over
                     this.currentHeader = null;
                 }
             }
         } // while headers

         return found ? from : -1;
     }


     /**
      * Determines the position of the next token separator.
      * Because of multi-header joining rules, the end of a
      * header value is a token separator. This method does
      * therefore not need to iterate over headers.
      *
      * @param from      the position in the current header at which to
      *                  start the search
      *
      * @return  the position of a token separator in the current header,
      *          or at the end
      *
      * @throws IllegalArgumentException  if <code>from</code> is negative
      * @throws ParseException
      *         if a new token is found before a token separator.
      *         RFC 2616, section 2.1 explicitly requires a comma between
      *         tokens for <tt>#</tt>.
      */
     protected int findTokenSeparator(int from) {
         if (from < 0) {
             throw new IllegalArgumentException
                 ("Search position must not be negative: " + from);
         }

         boolean found = false;
         final int to = this.currentHeader.length();
         while (!found && (from < to)) {
             final char ch = this.currentHeader.charAt(from);
             if (isTokenSeparator(ch)) {
                 found = true;
             } else if (isWhitespace(ch)) {
                 // trailing whitespace after the previous token is skipped
                 from++;
             } else if (isTokenChar(ch)) {
                 throw new ParseException
                     ("Tokens without separator (pos " + from +
                      "): " + this.currentHeader);
             } else {
                 throw new ParseException
                     ("Invalid character after token (pos " + from +
                      "): " + this.currentHeader);
             }
         }

         return from;
     }


     /**
      * Determines the ending position of the current token.
      * This method will not leave the current header value,
      * since the end of the header value is a token boundary.
      *
      * @param from      the position of the first character of the token
      *
      * @return  the position after the last character of the token.
      *          The behavior is undefined if <code>from</code> does not
      *          point to a token character in the current header value.
      *
      * @throws IllegalArgumentException  if <code>from</code> is negative
      */
     protected int findTokenEnd(int from) {
         if (from < 0) {
             throw new IllegalArgumentException
                 ("Token start position must not be negative: " + from);
         }

         final int to = this.currentHeader.length();
         // the character at 'from' is a token character by precondition
         int end = from+1;
         while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
             end++;
         }

         return end;
     }


     /**
      * Checks whether a character is a token separator.
      * RFC 2616, section 2.1 defines comma as the separator for
      * <tt>#token</tt> sequences. The end of a header value will
      * also separate tokens, but that is not a character check.
      *
      * @param ch        the character to check
      *
      * @return  <code>true</code> if the character is a token separator,
      *          <code>false</code> otherwise
      */
     protected boolean isTokenSeparator(char ch) {
         return (ch == ',');
     }


     /**
      * Checks whether a character is a whitespace character.
      * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
      * The optional preceding line break is irrelevant, since header
      * continuation is handled transparently when parsing messages.
      *
      * @param ch        the character to check
      *
      * @return  <code>true</code> if the character is whitespace,
      *          <code>false</code> otherwise
      */
     protected boolean isWhitespace(char ch) {

         // we do not use Character.isWhitespace(ch) here, since that allows
         // many control characters which are not whitespace as per RFC 2616
         return ((ch == '\t') || Character.isSpaceChar(ch));
     }


     /**
      * Checks whether a character is a valid token character.
      * Whitespace, control characters, and HTTP separators are not
      * valid token characters. The HTTP specification (RFC 2616, section 2.2)
      * defines tokens only for the US-ASCII character set, this
      * method extends the definition to other character sets.
      *
      * @param ch        the character to check
      *
      * @return  <code>true</code> if the character is a valid token start,
      *          <code>false</code> otherwise
      */
     protected boolean isTokenChar(char ch) {

         // common sense extension of ALPHA + DIGIT
         if (Character.isLetterOrDigit(ch)) {
             return true;
         }

         // common sense extension of CTL
         if (Character.isISOControl(ch)) {
             return false;
         }

         // no common sense extension for this
         if (isHttpSeparator(ch)) {
             return false;
         }

         // RFC 2616, section 2.2 defines a token character as
         // "any CHAR except CTLs or separators". The controls
         // and separators are included in the checks above.
         // This will yield unexpected results for Unicode format characters.
         // If that is a problem, overwrite isHttpSeparator(char) to filter
         // out the false positives.
         return true;
     }


     /**
      * Checks whether a character is an HTTP separator.
      * The implementation in this class checks only for the HTTP separators
      * defined in RFC 2616, section 2.2. If you need to detect other
      * separators beyond the US-ASCII character set, override this method.
      *
      * @param ch        the character to check
      *
      * @return  <code>true</code> if the character is an HTTP separator
      */
     protected boolean isHttpSeparator(char ch) {
         return (HTTP_SEPARATORS.indexOf(ch) >= 0);
     }


 } // class BasicTokenIterator
	/*
	* $HeadURL: http://svn.apache.org/repos/asf/httpcomponents/httpcore/trunk/module-main/src/main/java/org/apache/http/message/BasicTokenIterator.java $
	* $Revision: 602520 $
	* $Date: 2007-12-08 09:42:26 -0800 (Sat, 08 Dec 2007) $
	*
	* ====================================================================
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	* ====================================================================
	*
	* This software consists of voluntary contributions made by many
	* individuals on behalf of the Apache Software Foundation. For more
	* information on the Apache Software Foundation, please see
	* <http://www.apache.org/>.
	*
	*/

	package org.apache.http.message;

	import java.util.NoSuchElementException;

	import org.apache.http.HeaderIterator;
	import org.apache.http.ParseException;
	import org.apache.http.TokenIterator;

	/**
	* Basic implementation of a {@link TokenIterator}.
	* This implementation parses <tt>#token<tt> sequences as
	* defined by RFC 2616, section 2.
	* It extends that definition somewhat beyond US-ASCII.
	*
	* @version $Revision: 602520 $
	*/
	public class BasicTokenIterator implements TokenIterator {

	/** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
	// the order of the characters here is adjusted to put the
	// most likely candidates at the beginning of the collection
	public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";


	/** The iterator from which to obtain the next header. */
	protected final HeaderIterator headerIt;

	/**
	* The value of the current header.
	* This is the header value that includes {@link #currentToken}.
	* Undefined if the iteration is over.
	*/
	protected String currentHeader;

	/**
	* The token to be returned by the next call to {@link #currentToken}.
	* <code>null</code> if the iteration is over.
	*/
	protected String currentToken;

	/**
	* The position after {@link #currentToken} in {@link #currentHeader}.
	* Undefined if the iteration is over.
	*/
	protected int searchPos;


	/**
	* Creates a new instance of {@link BasicTokenIterator}.
	*
	* @param headerIterator the iterator for the headers to tokenize
	*/
	public BasicTokenIterator(final HeaderIterator headerIterator) {
	if (headerIterator == null) {
	throw new IllegalArgumentException
	("Header iterator must not be null.");
	}

	this.headerIt = headerIterator;
	this.searchPos = findNext(-1);
	}


	// non-javadoc, see interface TokenIterator
	public boolean hasNext() {
	return (this.currentToken != null);
	}


	/**
	* Obtains the next token from this iteration.
	*
	* @return the next token in this iteration
	*
	* @throws NoSuchElementException if the iteration is already over
	* @throws ParseException if an invalid header value is encountered
	*/
	public String nextToken()
	throws NoSuchElementException, ParseException {

	if (this.currentToken == null) {
	throw new NoSuchElementException("Iteration already finished.");
	}

	final String result = this.currentToken;
	// updates currentToken, may trigger ParseException:
	this.searchPos = findNext(this.searchPos);

	return result;
	}


	/**
	* Returns the next token.
	* Same as {@link #nextToken}, but with generic return type.
	*
	* @return the next token in this iteration
	*
	* @throws NoSuchElementException if there are no more tokens
	* @throws ParseException if an invalid header value is encountered
	*/
	public final Object next()
	throws NoSuchElementException, ParseException {
	return nextToken();
	}


	/**
	* Removing tokens is not supported.
	*
	* @throws UnsupportedOperationException always
	*/
	public final void remove()
	throws UnsupportedOperationException {

	throw new UnsupportedOperationException
	("Removing tokens is not supported.");
	}


	/**
	* Determines the next token.
	* If found, the token is stored in {@link #currentToken}.
	* The return value indicates the position after the token
	* in {@link #currentHeader}. If necessary, the next header
	* will be obtained from {@link #headerIt}.
	* If not found, {@link #currentToken} is set to <code>null</code>.
	*
	* @param from the position in the current header at which to
	* start the search, -1 to search in the first header
	*
	* @return the position after the found token in the current header, or
	* negative if there was no next token
	*
	* @throws ParseException if an invalid header value is encountered
	*/
	protected int findNext(int from)
	throws ParseException {

	if (from < 0) {
	// called from the constructor, initialize the first header
	if (!this.headerIt.hasNext()) {
	return -1;
	}
	this.currentHeader = this.headerIt.nextHeader().getValue();
	from = 0;
	} else {
	// called after a token, make sure there is a separator
	from = findTokenSeparator(from);
	}

	int start = findTokenStart(from);
	if (start < 0) {
	this.currentToken = null;
	return -1; // nothing found
	}

	int end = findTokenEnd(start);
	this.currentToken = createToken(this.currentHeader, start, end);
	return end;
	}


	/**
	* Creates a new token to be returned.
	* Called from {@link #findNext findNext} after the token is identified.
	* The default implementation simply calls
	* {@link java.lang.String#substring String.substring}.
	* <br/>
	* If header values are significantly longer than tokens, and some
	* tokens are permanently referenced by the application, there can
	* be problems with garbage collection. A substring will hold a
	* reference to the full characters of the original string and
	* therefore occupies more memory than might be expected.
	* To avoid this, override this method and create a new string
	* instead of a substring.
	*
	* @param value the full header value from which to create a token
	* @param start the index of the first token character
	* @param end the index after the last token character
	*
	* @return a string representing the token identified by the arguments
	*/
	protected String createToken(String value, int start, int end) {
	return value.substring(start, end);
	}


	/**
	* Determines the starting position of the next token.
	* This method will iterate over headers if necessary.
	*
	* @param from the position in the current header at which to
	* start the search
	*
	* @return the position of the token start in the current header,
	* negative if no token start could be found
	*/
	protected int findTokenStart(int from) {
	if (from < 0) {
	throw new IllegalArgumentException
	("Search position must not be negative: " + from);
	}

	boolean found = false;
	while (!found && (this.currentHeader != null)) {

	final int to = this.currentHeader.length();
	while (!found && (from < to)) {

	final char ch = this.currentHeader.charAt(from);
	if (isTokenSeparator(ch) \|\| isWhitespace(ch)) {
	// whitspace and token separators are skipped
	from++;
	} else if (isTokenChar(this.currentHeader.charAt(from))) {
	// found the start of a token
	found = true;
	} else {
	throw new ParseException
	("Invalid character before token (pos " + from +
	"): " + this.currentHeader);
	}
	}
	if (!found) {
	if (this.headerIt.hasNext()) {
	this.currentHeader = this.headerIt.nextHeader().getValue();
	from = 0;
	} else {
	this.currentHeader = null;
	}
	}
	} // while headers

	return found ? from : -1;
	}


	/**
	* Determines the position of the next token separator.
	* Because of multi-header joining rules, the end of a
	* header value is a token separator. This method does
	* therefore not need to iterate over headers.
	*
	* @param from the position in the current header at which to
	* start the search
	*
	* @return the position of a token separator in the current header,
	* or at the end
	*
	* @throws ParseException
	* if a new token is found before a token separator.
	* RFC 2616, section 2.1 explicitly requires a comma between
	* tokens for <tt>#</tt>.
	*/
	protected int findTokenSeparator(int from) {
	if (from < 0) {
	throw new IllegalArgumentException
	("Search position must not be negative: " + from);
	}

	boolean found = false;
	final int to = this.currentHeader.length();
	while (!found && (from < to)) {
	final char ch = this.currentHeader.charAt(from);
	if (isTokenSeparator(ch)) {
	found = true;
	} else if (isWhitespace(ch)) {
	from++;
	} else if (isTokenChar(ch)) {
	throw new ParseException
	("Tokens without separator (pos " + from +
	"): " + this.currentHeader);
	} else {
	throw new ParseException
	("Invalid character after token (pos " + from +
	"): " + this.currentHeader);
	}
	}

	return from;
	}


	/**
	* Determines the ending position of the current token.
	* This method will not leave the current header value,
	* since the end of the header value is a token boundary.
	*
	* @param from the position of the first character of the token
	*
	* @return the position after the last character of the token.
	* The behavior is undefined if <code>from</code> does not
	* point to a token character in the current header value.
	*/
	protected int findTokenEnd(int from) {
	if (from < 0) {
	throw new IllegalArgumentException
	("Token start position must not be negative: " + from);
	}

	final int to = this.currentHeader.length();
	int end = from+1;
	while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
	end++;
	}

	return end;
	}


	/**
	* Checks whether a character is a token separator.
	* RFC 2616, section 2.1 defines comma as the separator for
	* <tt>#token</tt> sequences. The end of a header value will
	* also separate tokens, but that is not a character check.
	*
	* @param ch the character to check
	*
	* @return <code>true</code> if the character is a token separator,
	* <code>false</code> otherwise
	*/
	protected boolean isTokenSeparator(char ch) {
	return (ch == ',');
	}


	/**
	* Checks whether a character is a whitespace character.
	* RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
	* The optional preceeding line break is irrelevant, since header
	* continuation is handled transparently when parsing messages.
	*
	* @param ch the character to check
	*
	* @return <code>true</code> if the character is whitespace,
	* <code>false</code> otherwise
	*/
	protected boolean isWhitespace(char ch) {

	// we do not use Character.isWhitspace(ch) here, since that allows
	// many control characters which are not whitespace as per RFC 2616
	return ((ch == '\t') \|\| Character.isSpaceChar(ch));
	}


	/**
	* Checks whether a character is a valid token character.
	* Whitespace, control characters, and HTTP separators are not
	* valid token characters. The HTTP specification (RFC 2616, section 2.2)
	* defines tokens only for the US-ASCII character set, this
	* method extends the definition to other character sets.
	*
	* @param ch the character to check
	*
	* @return <code>true</code> if the character is a valid token start,
	* <code>false</code> otherwise
	*/
	protected boolean isTokenChar(char ch) {

	// common sense extension of ALPHA + DIGIT
	if (Character.isLetterOrDigit(ch))
	return true;

	// common sense extension of CTL
	if (Character.isISOControl(ch))
	return false;

	// no common sense extension for this
	if (isHttpSeparator(ch))
	return false;

	// RFC 2616, section 2.2 defines a token character as
	// "any CHAR except CTLs or separators". The controls
	// and separators are included in the checks above.
	// This will yield unexpected results for Unicode format characters.
	// If that is a problem, overwrite isHttpSeparator(char) to filter
	// out the false positives.
	return true;
	}


	/**
	* Checks whether a character is an HTTP separator.
	* The implementation in this class checks only for the HTTP separators
	* defined in RFC 2616, section 2.2. If you need to detect other
	* separators beyond the US-ASCII character set, override this method.
	*
	* @param ch the character to check
	*
	* @return <code>true</code> if the character is an HTTP separator
	*/
	protected boolean isHttpSeparator(char ch) {
	return (HTTP_SEPARATORS.indexOf(ch) >= 0);
	}


	} // class BasicTokenIterator