| /* |
| * reserved comment block |
| * DO NOT REMOVE OR ALTER! |
| */ |
| /* |
| * Copyright 2003-2005 The Apache Software Foundation. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package com.sun.org.apache.xerces.internal.xinclude; |
| |
| import java.io.BufferedInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.Reader; |
| import java.net.HttpURLConnection; |
| import java.net.URL; |
| import java.net.URLConnection; |
| import java.util.Iterator; |
| import java.util.Locale; |
| import java.util.Map; |
| |
| import com.sun.org.apache.xerces.internal.impl.XMLEntityManager; |
| import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter; |
| import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader; |
| import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader; |
| import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter; |
| import com.sun.org.apache.xerces.internal.util.EncodingMap; |
| import com.sun.org.apache.xerces.internal.util.HTTPInputSource; |
| import com.sun.org.apache.xerces.internal.util.MessageFormatter; |
| import com.sun.org.apache.xerces.internal.util.XMLChar; |
| import com.sun.org.apache.xerces.internal.xni.XMLString; |
| import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource; |
| |
| /** |
| * This class is used for reading resources requested in <include> elements, |
| * when the parse attribute of the <include> element is "text". Using this |
| * class will open the location, detect the encoding, and discard the byte order |
| * mark, if applicable. |
| * |
| * REVISIT: |
| * Much of the code in this class is taken from XMLEntityManager. It would be nice |
| * if this code could be shared in some way. However, since XMLEntityManager is used |
| * for reading files as XML, and this needs to read files as text, there would need |
| * to be some refactoring done. |
| * |
| * @author Michael Glavassevich, IBM |
| * @author Peter McCracken, IBM |
| * @author Ankit Pasricha, IBM |
| * @author Arun Yadav, Sun Microsystems Inc. |
| * |
| * |
| * @see XIncludeHandler |
| */ |
| public class XIncludeTextReader { |
| |
| private Reader fReader; |
| private XIncludeHandler fHandler; |
| private XMLInputSource fSource; |
| private XMLErrorReporter fErrorReporter; |
| private XMLString fTempString = new XMLString(); |
| |
| /** |
| * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler. |
| * |
| * @param source The XMLInputSource to use. |
| * @param handler The XIncludeHandler to use. |
| * @param bufferSize The size of this text reader's buffer. |
| */ |
| public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize) |
| throws IOException { |
| fHandler = handler; |
| fSource = source; |
| fTempString = new XMLString(new char[bufferSize + 1], 0, 0); |
| } |
| |
| /** |
| * Sets the XMLErrorReporter used for reporting errors while |
| * reading the text include. |
| * |
| * @param errorReporter the XMLErrorReporter to be used for |
| * reporting errors. |
| */ |
| public void setErrorReporter(XMLErrorReporter errorReporter) { |
| fErrorReporter = errorReporter; |
| } |
| |
| /** |
| * Return the Reader for given XMLInputSource. |
| * |
| * @param source The XMLInputSource to use. |
| */ |
| protected Reader getReader(XMLInputSource source) throws IOException { |
| if (source.getCharacterStream() != null) { |
| return source.getCharacterStream(); |
| } |
| else { |
| InputStream stream = null; |
| |
| String encoding = source.getEncoding(); |
| if (encoding == null) { |
| encoding = "UTF-8"; |
| } |
| if (source.getByteStream() != null) { |
| stream = source.getByteStream(); |
| // Wrap the InputStream so that it is possible to rewind it. |
| if (!(stream instanceof BufferedInputStream)) { |
| stream = new BufferedInputStream(stream, fTempString.ch.length); |
| } |
| } |
| else { |
| String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false); |
| |
| URL url = new URL(expandedSystemId); |
| URLConnection urlCon = url.openConnection(); |
| |
| // If this is an HTTP connection attach any request properties to the request. |
| if (urlCon instanceof HttpURLConnection && source instanceof HTTPInputSource) { |
| final HttpURLConnection urlConnection = (HttpURLConnection) urlCon; |
| final HTTPInputSource httpInputSource = (HTTPInputSource) source; |
| |
| // set request properties |
| Iterator propIter = httpInputSource.getHTTPRequestProperties(); |
| while (propIter.hasNext()) { |
| Map.Entry entry = (Map.Entry) propIter.next(); |
| urlConnection.setRequestProperty((String) entry.getKey(), (String) entry.getValue()); |
| } |
| |
| // set preference for redirection |
| boolean followRedirects = httpInputSource.getFollowHTTPRedirects(); |
| if (!followRedirects) { |
| XMLEntityManager.setInstanceFollowRedirects(urlConnection, followRedirects); |
| } |
| } |
| |
| // Wrap the InputStream so that it is possible to rewind it. |
| stream = new BufferedInputStream(urlCon.getInputStream()); |
| |
| // content type will be string like "text/xml; charset=UTF-8" or "text/xml" |
| String rawContentType = urlCon.getContentType(); |
| |
| // text/xml and application/xml offer only one optional parameter |
| int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1; |
| |
| String contentType = null; |
| String charset = null; |
| if (index != -1) { |
| // this should be something like "text/xml" |
| contentType = rawContentType.substring(0, index).trim(); |
| |
| // this should be something like "charset=UTF-8", but we want to |
| // strip it down to just "UTF-8" |
| charset = rawContentType.substring(index + 1).trim(); |
| if (charset.startsWith("charset=")) { |
| // 8 is the length of "charset=" |
| charset = charset.substring(8).trim(); |
| // strip quotes, if present |
| if ((charset.charAt(0) == '"' |
| && charset.charAt(charset.length() - 1) == '"') |
| || (charset.charAt(0) == '\'' |
| && charset.charAt(charset.length() - 1) |
| == '\'')) { |
| charset = |
| charset.substring(1, charset.length() - 1); |
| } |
| } |
| else { |
| charset = null; |
| } |
| } |
| else { |
| contentType = rawContentType.trim(); |
| } |
| |
| String detectedEncoding = null; |
| /** The encoding of such a resource is determined by: |
| 1 external encoding information, if available, otherwise |
| -- the most common type of external information is the "charset" parameter of a MIME package |
| 2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise |
| 3 the value of the encoding attribute if one exists, otherwise |
| 4 UTF-8. |
| **/ |
| if (contentType.equals("text/xml")) { |
| if (charset != null) { |
| detectedEncoding = charset; |
| } |
| else { |
| // see RFC2376 or 3023, section 3.1 |
| detectedEncoding = "US-ASCII"; |
| } |
| } |
| else if (contentType.equals("application/xml")) { |
| if (charset != null) { |
| detectedEncoding = charset; |
| } |
| else { |
| // see RFC2376 or 3023, section 3.2 |
| detectedEncoding = getEncodingName(stream); |
| } |
| } |
| else if (contentType.endsWith("+xml")) { |
| detectedEncoding = getEncodingName(stream); |
| } |
| |
| if (detectedEncoding != null) { |
| encoding = detectedEncoding; |
| } |
| // else 3 or 4. |
| } |
| |
| encoding = encoding.toUpperCase(Locale.ENGLISH); |
| |
| // eat the Byte Order Mark |
| encoding = consumeBOM(stream, encoding); |
| |
| // If the document is UTF-8 or US-ASCII use |
| // the Xerces readers for these encodings. For |
| // US-ASCII consult the encoding map since |
| // this encoding has many aliases. |
| if (encoding.equals("UTF-8")) { |
| return new UTF8Reader(stream, |
| fTempString.ch.length, |
| fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), |
| fErrorReporter.getLocale() ); |
| } |
| |
| // Try to use a Java reader. |
| String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding); |
| |
| // If the specified encoding wasn't a recognized IANA encoding throw an IOException. |
| // The XIncludeHandler will report this as a ResourceError and then will |
| // attempt to include a fallback if there is one. |
| if (javaEncoding == null) { |
| MessageFormatter aFormatter = |
| fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN); |
| Locale aLocale = fErrorReporter.getLocale(); |
| throw new IOException( aFormatter.formatMessage( aLocale, |
| "EncodingDeclInvalid", |
| new Object[] {encoding} ) ); |
| } |
| else if (javaEncoding.equals("ASCII")) { |
| return new ASCIIReader(stream, |
| fTempString.ch.length, |
| fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), |
| fErrorReporter.getLocale() ); |
| } |
| |
| return new InputStreamReader(stream, javaEncoding); |
| } |
| } |
| |
| /** |
| * XMLEntityManager cares about endian-ness, since it creates its own optimized |
| * readers. Since we're just using generic Java readers for now, we're not caring |
| * about endian-ness. If this changes, even more code needs to be copied from |
| * XMLEntity manager. -- PJM |
| */ |
| protected String getEncodingName(InputStream stream) throws IOException { |
| final byte[] b4 = new byte[4]; |
| String encoding = null; |
| |
| // this has the potential to throw an exception |
| // it will be fixed when we ensure the stream is rewindable (see note above) |
| stream.mark(4); |
| int count = stream.read(b4, 0, 4); |
| stream.reset(); |
| if (count == 4) { |
| encoding = getEncodingName(b4); |
| } |
| |
| return encoding; |
| } |
| |
| /** |
| * Removes the byte order mark from the stream, if |
| * it exists and returns the encoding name. |
| * |
| * @param stream |
| * @param encoding |
| * @throws IOException |
| */ |
| protected String consumeBOM(InputStream stream, String encoding) |
| throws IOException { |
| |
| byte[] b = new byte[3]; |
| int count = 0; |
| stream.mark(3); |
| if (encoding.equals("UTF-8")) { |
| count = stream.read(b, 0, 3); |
| if (count == 3) { |
| final int b0 = b[0] & 0xFF; |
| final int b1 = b[1] & 0xFF; |
| final int b2 = b[2] & 0xFF; |
| if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) { |
| // First three bytes are not BOM, so reset. |
| stream.reset(); |
| } |
| } |
| else { |
| stream.reset(); |
| } |
| } |
| else if (encoding.startsWith("UTF-16")) { |
| count = stream.read(b, 0, 2); |
| if (count == 2) { |
| final int b0 = b[0] & 0xFF; |
| final int b1 = b[1] & 0xFF; |
| if (b0 == 0xFE && b1 == 0xFF) { |
| return "UTF-16BE"; |
| } |
| else if (b0 == 0xFF && b1 == 0xFE) { |
| return "UTF-16LE"; |
| } |
| } |
| // First two bytes are not BOM, so reset. |
| stream.reset(); |
| } |
| // We could do UTF-32, but since the getEncodingName() doesn't support that |
| // we won't support it here. |
| // To implement UTF-32, look for: 00 00 FE FF for big-endian |
| // or FF FE 00 00 for little-endian |
| return encoding; |
| } |
| |
| /** |
| * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager. |
| * Is there any way we can share the code, without having it implemented twice? |
| * I think we should make it public and static in XMLEntityManager. --PJM |
| * |
| * Returns the IANA encoding name that is auto-detected from |
| * the bytes specified, with the endian-ness of that encoding where appropriate. |
| * |
| * @param b4 The first four bytes of the input. |
| * @return the encoding name, or null if no encoding could be detected |
| */ |
| protected String getEncodingName(byte[] b4) { |
| |
| // UTF-16, with BOM |
| int b0 = b4[0] & 0xFF; |
| int b1 = b4[1] & 0xFF; |
| if (b0 == 0xFE && b1 == 0xFF) { |
| // UTF-16, big-endian |
| return "UTF-16BE"; |
| } |
| if (b0 == 0xFF && b1 == 0xFE) { |
| // UTF-16, little-endian |
| return "UTF-16LE"; |
| } |
| |
| // UTF-8 with a BOM |
| int b2 = b4[2] & 0xFF; |
| if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { |
| return "UTF-8"; |
| } |
| |
| // other encodings |
| int b3 = b4[3] & 0xFF; |
| if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { |
| // UCS-4, big endian (1234) |
| return "ISO-10646-UCS-4"; |
| } |
| if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { |
| // UCS-4, little endian (4321) |
| return "ISO-10646-UCS-4"; |
| } |
| if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { |
| // UCS-4, unusual octet order (2143) |
| return "ISO-10646-UCS-4"; |
| } |
| if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { |
| // UCS-4, unusual octect order (3412) |
| return "ISO-10646-UCS-4"; |
| } |
| if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { |
| // UTF-16, big-endian, no BOM |
| // (or could turn out to be UCS-2... |
| return "UTF-16BE"; |
| } |
| if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { |
| // UTF-16, little-endian, no BOM |
| // (or could turn out to be UCS-2... |
| return "UTF-16LE"; |
| } |
| if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { |
| // EBCDIC |
| // a la xerces1, return CP037 instead of EBCDIC here |
| return "CP037"; |
| } |
| |
| // this signals us to use the value from the encoding attribute |
| return null; |
| |
| } // getEncodingName(byte[]):Object[] |
| |
| /** |
| * Read the input stream as text, and pass the text on to the XIncludeHandler |
| * using calls to characters(). This will read all of the text it can from the |
| * resource. |
| * |
| * @throws IOException |
| */ |
| public void parse() throws IOException { |
| |
| fReader = getReader(fSource); |
| fSource = null; |
| int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1); |
| while (readSize != -1) { |
| for (int i = 0; i < readSize; ++i) { |
| char ch = fTempString.ch[i]; |
| if (!isValid(ch)) { |
| if (XMLChar.isHighSurrogate(ch)) { |
| int ch2; |
| // retrieve next character |
| if (++i < readSize) { |
| ch2 = fTempString.ch[i]; |
| } |
| // handle rare boundary case |
| else { |
| ch2 = fReader.read(); |
| if (ch2 != -1) { |
| fTempString.ch[readSize++] = (char) ch2; |
| } |
| } |
| if (XMLChar.isLowSurrogate(ch2)) { |
| // convert surrogates to a supplemental character |
| int sup = XMLChar.supplemental(ch, (char)ch2); |
| if (!isValid(sup)) { |
| fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, |
| "InvalidCharInContent", |
| new Object[] { Integer.toString(sup, 16) }, |
| XMLErrorReporter.SEVERITY_FATAL_ERROR); |
| } |
| } |
| else { |
| fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, |
| "InvalidCharInContent", |
| new Object[] { Integer.toString(ch2, 16) }, |
| XMLErrorReporter.SEVERITY_FATAL_ERROR); |
| } |
| } |
| else { |
| fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, |
| "InvalidCharInContent", |
| new Object[] { Integer.toString(ch, 16) }, |
| XMLErrorReporter.SEVERITY_FATAL_ERROR); |
| } |
| } |
| } |
| if (fHandler != null && readSize > 0) { |
| fTempString.offset = 0; |
| fTempString.length = readSize; |
| fHandler.characters( |
| fTempString, |
| fHandler.modifyAugmentations(null, true)); |
| } |
| readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1); |
| } |
| |
| } |
| |
| /** |
| * Sets the input source on this text reader. |
| * |
| * @param source The XMLInputSource to use. |
| */ |
| public void setInputSource(XMLInputSource source) { |
| fSource = source; |
| } |
| |
| /** |
| * Closes the stream. Call this after parse(), or when there is no longer any need |
| * for this object. |
| * |
| * @throws IOException |
| */ |
| public void close() throws IOException { |
| if (fReader != null) { |
| fReader.close(); |
| fReader = null; |
| } |
| } |
| |
| /** |
| * Returns true if the specified character is a valid XML character |
| * as per the rules of XML 1.0. |
| * |
| * @param ch The character to check. |
| */ |
| protected boolean isValid(int ch) { |
| return XMLChar.isValid(ch); |
| } |
| |
| /** |
| * Sets the buffer size property for the reader which decides the chunk sizes that are parsed |
| * by the reader at a time and passed to the handler |
| * |
| * @param bufferSize The size of the buffer desired |
| */ |
| protected void setBufferSize(int bufferSize) { |
| if (fTempString.ch.length != ++bufferSize) { |
| fTempString.ch = new char[bufferSize]; |
| } |
| } |
| |
| } |