| /* |
| * $HeadURL: http://svn.apache.org/repos/asf/httpcomponents/httpcore/trunk/module-main/src/main/java/org/apache/http/message/BasicTokenIterator.java $ |
| * $Revision: 602520 $ |
| * $Date: 2007-12-08 09:42:26 -0800 (Sat, 08 Dec 2007) $ |
| * |
| * ==================================================================== |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * ==================================================================== |
| * |
| * This software consists of voluntary contributions made by many |
| * individuals on behalf of the Apache Software Foundation. For more |
| * information on the Apache Software Foundation, please see |
| * <http://www.apache.org/>. |
| * |
| */ |
| |
| package org.apache.http.message; |
| |
| import java.util.NoSuchElementException; |
| |
| import org.apache.http.HeaderIterator; |
| import org.apache.http.ParseException; |
| import org.apache.http.TokenIterator; |
| |
| /** |
| * Basic implementation of a {@link TokenIterator}. |
| * This implementation parses <tt>#token<tt> sequences as |
| * defined by RFC 2616, section 2. |
| * It extends that definition somewhat beyond US-ASCII. |
| * |
| * @version $Revision: 602520 $ |
| */ |
| public class BasicTokenIterator implements TokenIterator { |
| |
| /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */ |
| // the order of the characters here is adjusted to put the |
| // most likely candidates at the beginning of the collection |
| public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t"; |
| |
| |
| /** The iterator from which to obtain the next header. */ |
| protected final HeaderIterator headerIt; |
| |
| /** |
| * The value of the current header. |
| * This is the header value that includes {@link #currentToken}. |
| * Undefined if the iteration is over. |
| */ |
| protected String currentHeader; |
| |
| /** |
| * The token to be returned by the next call to {@link #currentToken}. |
| * <code>null</code> if the iteration is over. |
| */ |
| protected String currentToken; |
| |
| /** |
| * The position after {@link #currentToken} in {@link #currentHeader}. |
| * Undefined if the iteration is over. |
| */ |
| protected int searchPos; |
| |
| |
| /** |
| * Creates a new instance of {@link BasicTokenIterator}. |
| * |
| * @param headerIterator the iterator for the headers to tokenize |
| */ |
| public BasicTokenIterator(final HeaderIterator headerIterator) { |
| if (headerIterator == null) { |
| throw new IllegalArgumentException |
| ("Header iterator must not be null."); |
| } |
| |
| this.headerIt = headerIterator; |
| this.searchPos = findNext(-1); |
| } |
| |
| |
| // non-javadoc, see interface TokenIterator |
| public boolean hasNext() { |
| return (this.currentToken != null); |
| } |
| |
| |
| /** |
| * Obtains the next token from this iteration. |
| * |
| * @return the next token in this iteration |
| * |
| * @throws NoSuchElementException if the iteration is already over |
| * @throws ParseException if an invalid header value is encountered |
| */ |
| public String nextToken() |
| throws NoSuchElementException, ParseException { |
| |
| if (this.currentToken == null) { |
| throw new NoSuchElementException("Iteration already finished."); |
| } |
| |
| final String result = this.currentToken; |
| // updates currentToken, may trigger ParseException: |
| this.searchPos = findNext(this.searchPos); |
| |
| return result; |
| } |
| |
| |
| /** |
| * Returns the next token. |
| * Same as {@link #nextToken}, but with generic return type. |
| * |
| * @return the next token in this iteration |
| * |
| * @throws NoSuchElementException if there are no more tokens |
| * @throws ParseException if an invalid header value is encountered |
| */ |
| public final Object next() |
| throws NoSuchElementException, ParseException { |
| return nextToken(); |
| } |
| |
| |
| /** |
| * Removing tokens is not supported. |
| * |
| * @throws UnsupportedOperationException always |
| */ |
| public final void remove() |
| throws UnsupportedOperationException { |
| |
| throw new UnsupportedOperationException |
| ("Removing tokens is not supported."); |
| } |
| |
| |
| /** |
| * Determines the next token. |
| * If found, the token is stored in {@link #currentToken}. |
| * The return value indicates the position after the token |
| * in {@link #currentHeader}. If necessary, the next header |
| * will be obtained from {@link #headerIt}. |
| * If not found, {@link #currentToken} is set to <code>null</code>. |
| * |
| * @param from the position in the current header at which to |
| * start the search, -1 to search in the first header |
| * |
| * @return the position after the found token in the current header, or |
| * negative if there was no next token |
| * |
| * @throws ParseException if an invalid header value is encountered |
| */ |
| protected int findNext(int from) |
| throws ParseException { |
| |
| if (from < 0) { |
| // called from the constructor, initialize the first header |
| if (!this.headerIt.hasNext()) { |
| return -1; |
| } |
| this.currentHeader = this.headerIt.nextHeader().getValue(); |
| from = 0; |
| } else { |
| // called after a token, make sure there is a separator |
| from = findTokenSeparator(from); |
| } |
| |
| int start = findTokenStart(from); |
| if (start < 0) { |
| this.currentToken = null; |
| return -1; // nothing found |
| } |
| |
| int end = findTokenEnd(start); |
| this.currentToken = createToken(this.currentHeader, start, end); |
| return end; |
| } |
| |
| |
| /** |
| * Creates a new token to be returned. |
| * Called from {@link #findNext findNext} after the token is identified. |
| * The default implementation simply calls |
| * {@link java.lang.String#substring String.substring}. |
| * <br/> |
| * If header values are significantly longer than tokens, and some |
| * tokens are permanently referenced by the application, there can |
| * be problems with garbage collection. A substring will hold a |
| * reference to the full characters of the original string and |
| * therefore occupies more memory than might be expected. |
| * To avoid this, override this method and create a new string |
| * instead of a substring. |
| * |
| * @param value the full header value from which to create a token |
| * @param start the index of the first token character |
| * @param end the index after the last token character |
| * |
| * @return a string representing the token identified by the arguments |
| */ |
| protected String createToken(String value, int start, int end) { |
| return value.substring(start, end); |
| } |
| |
| |
| /** |
| * Determines the starting position of the next token. |
| * This method will iterate over headers if necessary. |
| * |
| * @param from the position in the current header at which to |
| * start the search |
| * |
| * @return the position of the token start in the current header, |
| * negative if no token start could be found |
| */ |
| protected int findTokenStart(int from) { |
| if (from < 0) { |
| throw new IllegalArgumentException |
| ("Search position must not be negative: " + from); |
| } |
| |
| boolean found = false; |
| while (!found && (this.currentHeader != null)) { |
| |
| final int to = this.currentHeader.length(); |
| while (!found && (from < to)) { |
| |
| final char ch = this.currentHeader.charAt(from); |
| if (isTokenSeparator(ch) || isWhitespace(ch)) { |
| // whitspace and token separators are skipped |
| from++; |
| } else if (isTokenChar(this.currentHeader.charAt(from))) { |
| // found the start of a token |
| found = true; |
| } else { |
| throw new ParseException |
| ("Invalid character before token (pos " + from + |
| "): " + this.currentHeader); |
| } |
| } |
| if (!found) { |
| if (this.headerIt.hasNext()) { |
| this.currentHeader = this.headerIt.nextHeader().getValue(); |
| from = 0; |
| } else { |
| this.currentHeader = null; |
| } |
| } |
| } // while headers |
| |
| return found ? from : -1; |
| } |
| |
| |
| /** |
| * Determines the position of the next token separator. |
| * Because of multi-header joining rules, the end of a |
| * header value is a token separator. This method does |
| * therefore not need to iterate over headers. |
| * |
| * @param from the position in the current header at which to |
| * start the search |
| * |
| * @return the position of a token separator in the current header, |
| * or at the end |
| * |
| * @throws ParseException |
| * if a new token is found before a token separator. |
| * RFC 2616, section 2.1 explicitly requires a comma between |
| * tokens for <tt>#</tt>. |
| */ |
| protected int findTokenSeparator(int from) { |
| if (from < 0) { |
| throw new IllegalArgumentException |
| ("Search position must not be negative: " + from); |
| } |
| |
| boolean found = false; |
| final int to = this.currentHeader.length(); |
| while (!found && (from < to)) { |
| final char ch = this.currentHeader.charAt(from); |
| if (isTokenSeparator(ch)) { |
| found = true; |
| } else if (isWhitespace(ch)) { |
| from++; |
| } else if (isTokenChar(ch)) { |
| throw new ParseException |
| ("Tokens without separator (pos " + from + |
| "): " + this.currentHeader); |
| } else { |
| throw new ParseException |
| ("Invalid character after token (pos " + from + |
| "): " + this.currentHeader); |
| } |
| } |
| |
| return from; |
| } |
| |
| |
| /** |
| * Determines the ending position of the current token. |
| * This method will not leave the current header value, |
| * since the end of the header value is a token boundary. |
| * |
| * @param from the position of the first character of the token |
| * |
| * @return the position after the last character of the token. |
| * The behavior is undefined if <code>from</code> does not |
| * point to a token character in the current header value. |
| */ |
| protected int findTokenEnd(int from) { |
| if (from < 0) { |
| throw new IllegalArgumentException |
| ("Token start position must not be negative: " + from); |
| } |
| |
| final int to = this.currentHeader.length(); |
| int end = from+1; |
| while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) { |
| end++; |
| } |
| |
| return end; |
| } |
| |
| |
| /** |
| * Checks whether a character is a token separator. |
| * RFC 2616, section 2.1 defines comma as the separator for |
| * <tt>#token</tt> sequences. The end of a header value will |
| * also separate tokens, but that is not a character check. |
| * |
| * @param ch the character to check |
| * |
| * @return <code>true</code> if the character is a token separator, |
| * <code>false</code> otherwise |
| */ |
| protected boolean isTokenSeparator(char ch) { |
| return (ch == ','); |
| } |
| |
| |
| /** |
| * Checks whether a character is a whitespace character. |
| * RFC 2616, section 2.2 defines space and horizontal tab as whitespace. |
| * The optional preceeding line break is irrelevant, since header |
| * continuation is handled transparently when parsing messages. |
| * |
| * @param ch the character to check |
| * |
| * @return <code>true</code> if the character is whitespace, |
| * <code>false</code> otherwise |
| */ |
| protected boolean isWhitespace(char ch) { |
| |
| // we do not use Character.isWhitspace(ch) here, since that allows |
| // many control characters which are not whitespace as per RFC 2616 |
| return ((ch == '\t') || Character.isSpaceChar(ch)); |
| } |
| |
| |
| /** |
| * Checks whether a character is a valid token character. |
| * Whitespace, control characters, and HTTP separators are not |
| * valid token characters. The HTTP specification (RFC 2616, section 2.2) |
| * defines tokens only for the US-ASCII character set, this |
| * method extends the definition to other character sets. |
| * |
| * @param ch the character to check |
| * |
| * @return <code>true</code> if the character is a valid token start, |
| * <code>false</code> otherwise |
| */ |
| protected boolean isTokenChar(char ch) { |
| |
| // common sense extension of ALPHA + DIGIT |
| if (Character.isLetterOrDigit(ch)) |
| return true; |
| |
| // common sense extension of CTL |
| if (Character.isISOControl(ch)) |
| return false; |
| |
| // no common sense extension for this |
| if (isHttpSeparator(ch)) |
| return false; |
| |
| // RFC 2616, section 2.2 defines a token character as |
| // "any CHAR except CTLs or separators". The controls |
| // and separators are included in the checks above. |
| // This will yield unexpected results for Unicode format characters. |
| // If that is a problem, overwrite isHttpSeparator(char) to filter |
| // out the false positives. |
| return true; |
| } |
| |
| |
| /** |
| * Checks whether a character is an HTTP separator. |
| * The implementation in this class checks only for the HTTP separators |
| * defined in RFC 2616, section 2.2. If you need to detect other |
| * separators beyond the US-ASCII character set, override this method. |
| * |
| * @param ch the character to check |
| * |
| * @return <code>true</code> if the character is an HTTP separator |
| */ |
| protected boolean isHttpSeparator(char ch) { |
| return (HTTP_SEPARATORS.indexOf(ch) >= 0); |
| } |
| |
| |
| } // class BasicTokenIterator |
| |