blob: 5a3c48c02f236e3a862e6d7b2a036ef7de84c950 [file] [log] [blame]
/*
* Copyright (C) 2007 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.harmony.xml.parsers;
import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.util.StringTokenizer;
import javax.xml.parsers.DocumentBuilder;
import org.kxml2.io.KXmlParser;
import org.w3c.dom.Attr;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.Text;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.LocatorImpl;
import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;
import org.apache.harmony.xml.dom.DOMImplementationImpl;
/**
* Provides a straightforward DocumentBuilder implementation based on
* XMLPull/KXML. The class is used internally only, thus only notable members
* that are not already in the abstract superclass are documented. Hope that's
* ok.
*/
class DocumentBuilderImpl extends DocumentBuilder {
private static DOMImplementation dom = DOMImplementationImpl.getInstance();
private boolean coalescing;
private EntityResolver entityResolver;
private ErrorHandler errorHandler;
private boolean ignoreComments;
private boolean ignoreElementContentWhitespace;
private boolean namespaceAware;
DocumentBuilderImpl() {
// Do nothing.
}
@Override
public DOMImplementation getDOMImplementation() {
return dom;
}
/**
* Reflects whether this DocumentBuilder is configured to ignore comments.
*
* @return True if and only if comments are ignored.
*/
public boolean isIgnoringComments() {
return ignoreComments;
}
/**
* Reflects whether this DocumentBuilder is configured to ignore element
* content whitespace.
*
* @return True if and only if whitespace element content is ignored.
*/
public boolean isIgnoringElementContentWhitespace() {
return ignoreElementContentWhitespace;
}
@Override
public boolean isNamespaceAware() {
return namespaceAware;
}
@Override
public boolean isValidating() {
return false;
}
@Override
public Document newDocument() {
return dom.createDocument(null, null, null);
}
@Override
public Document parse(InputSource source) throws SAXException, IOException {
if (source == null) {
throw new IllegalArgumentException();
}
Document document = newDocument();
try {
XmlPullParser parser = new KXmlParser();
parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES,
namespaceAware);
if (source.getByteStream() != null) {
parser.setInput(source.getByteStream(), source.getEncoding());
} else if (source.getCharacterStream() != null) {
parser.setInput(source.getCharacterStream());
} else if (source.getSystemId() != null) {
URL url = new URL(source.getSystemId());
URLConnection urlConnection = url.openConnection();
urlConnection.connect();
String encoding = source.getEncoding();
// TODO: if null, extract the encoding from the Content-Type header?
parser.setInput(urlConnection.getInputStream(), encoding);
} else {
throw new SAXParseException(
"InputSource needs a stream, reader or URI", null);
}
if(parser.nextToken() == XmlPullParser.END_DOCUMENT) {
throw new SAXParseException(
"Unexpected end of document", null);
}
parse(parser, document, document, XmlPullParser.END_DOCUMENT);
parser.require(XmlPullParser.END_DOCUMENT, null, null);
} catch (XmlPullParserException ex) {
if(ex.getDetail() instanceof IOException) {
throw (IOException)ex.getDetail();
}
if(ex.getDetail() instanceof RuntimeException) {
throw (RuntimeException)ex.getDetail();
}
LocatorImpl locator = new LocatorImpl();
locator.setPublicId(source.getPublicId());
locator.setSystemId(source.getSystemId());
locator.setLineNumber(ex.getLineNumber());
locator.setColumnNumber(ex.getColumnNumber());
SAXParseException newEx = new SAXParseException(ex.getMessage(),
locator);
if (errorHandler != null) {
errorHandler.error(newEx);
}
throw newEx;
}
return document;
}
/**
* Implements the whole parsing of the XML document. The XML pull parser is
* actually more of a tokenizer, and we are doing a classical recursive
* descent parsing (the method invokes itself for XML elements). Our
* approach to parsing does accept some illegal documents (more than one
* root element, for example). The assumption is that the DOM implementation
* throws the proper exceptions in these cases.
*
* @param parser The XML pull parser we're reading from.
* @param document The document we're building.
* @param node The node we're currently on (initially the document itself).
* @param endToken The token that will end this recursive call. Either
* XmlPullParser.END_DOCUMENT or XmlPullParser.END_TAG.
*
* @throws XmlPullParserException If a parsing error occurs.
* @throws IOException If a general IO error occurs.
*/
private void parse(XmlPullParser parser, Document document, Node node,
int endToken) throws XmlPullParserException, IOException {
int token = parser.getEventType();
/*
* The main parsing loop. The precondition is that we are already on the
* token to be processed. This holds for each iteration of the loop, so
* the inner statements have to ensure that (in particular the recursive
* call).
*/
while (token != endToken && token != XmlPullParser.END_DOCUMENT) {
if (token == XmlPullParser.PROCESSING_INSTRUCTION) {
/*
* Found a processing instructions. We need to split the token
* text at the first whitespace character.
*/
String text = parser.getText();
int dot = text.indexOf(' ');
String target = (dot != -1 ? text.substring(0, dot) : text);
String data = (dot != -1 ? text.substring(dot + 1) : "");
node.appendChild(document.createProcessingInstruction(target,
data));
} else if (token == XmlPullParser.DOCDECL) {
/*
* Found a document type declaration. Unfortunately KXML doesn't
* have the necessary details. Do we parse it ourselves, or do
* we silently ignore it, since it isn't mandatory in DOM 2
* anyway?
*/
StringTokenizer tokenizer = new StringTokenizer(parser.getText());
if (tokenizer.hasMoreTokens()) {
String name = tokenizer.nextToken();
String pubid = null;
String sysid = null;
if (tokenizer.hasMoreTokens()) {
String text = tokenizer.nextToken();
if ("SYSTEM".equals(text)) {
if (tokenizer.hasMoreTokens()) {
sysid = tokenizer.nextToken();
}
} else if ("PUBLIC".equals(text)) {
if (tokenizer.hasMoreTokens()) {
pubid = tokenizer.nextToken();
}
if (tokenizer.hasMoreTokens()) {
sysid = tokenizer.nextToken();
}
}
}
if (pubid != null && pubid.length() >= 2 && pubid.startsWith("\"") && pubid.endsWith("\"")) {
pubid = pubid.substring(1, pubid.length() - 1);
}
if (sysid != null && sysid.length() >= 2 && sysid.startsWith("\"") && sysid.endsWith("\"")) {
sysid = sysid.substring(1, sysid.length() - 1);
}
document.appendChild(dom.createDocumentType(name, pubid, sysid));
}
} else if (token == XmlPullParser.COMMENT) {
/*
* Found a comment. We simply take the token text, but we only
* create a node if the client wants to see comments at all.
*/
if (!ignoreComments) {
node.appendChild(document.createComment(parser.getText()));
}
} else if (token == XmlPullParser.IGNORABLE_WHITESPACE) {
/*
* Found some ignorable whitespace. We simply take the token
* text, but we only create a node if the client wants to see
* whitespace at all.
*/
if (!ignoreElementContentWhitespace) {
appendText(document, node, true, parser.getText());
}
} else if (token == XmlPullParser.TEXT || token == XmlPullParser.CDSECT) {
/*
* Found a piece of text (possibly encoded as a CDATA section).
* That's the easiest case. We simply take it and create a new text node,
* or merge with an adjacent text node.
*/
appendText(document, node, token == XmlPullParser.TEXT, parser.getText());
} else if (token == XmlPullParser.ENTITY_REF) {
/*
* Found an entity reference. If an entity resolver is
* installed, we replace it by text (if possible). Otherwise we
* add an entity reference node.
*/
String entity = parser.getName();
if (entityResolver != null) {
// TODO Implement this...
}
String replacement = resolveStandardEntity(entity);
if (replacement != null) {
appendText(document, node, true, replacement);
} else {
node.appendChild(document.createEntityReference(entity));
}
} else if (token == XmlPullParser.START_TAG) {
/*
* Found an element start tag. We create an element node with
* the proper info and attributes. We then invoke parse()
* recursively to handle the next level of nesting. When we
* return from this call, we check that we are on the proper
* element end tag. The whole handling differs somewhat
* depending on whether the parser is namespace-aware or not.
*/
if (namespaceAware) {
// Collect info for element node
String namespace = parser.getNamespace();
String name = parser.getName();
String prefix = parser.getPrefix();
if ("".equals(namespace)) {
namespace = null;
}
// Create element node and wire it correctly
Element element = document.createElementNS(namespace, name);
element.setPrefix(prefix);
node.appendChild(element);
for (int i = 0; i < parser.getAttributeCount(); i++) {
// Collect info for a single attribute node
String attrNamespace = parser.getAttributeNamespace(i);
String attrPrefix = parser.getAttributePrefix(i);
String attrName = parser.getAttributeName(i);
String attrValue = parser.getAttributeValue(i);
if ("".equals(attrNamespace)) {
attrNamespace = null;
}
// Create attribute node and wire it correctly
Attr attr = document.createAttributeNS(attrNamespace, attrName);
attr.setPrefix(attrPrefix);
attr.setValue(attrValue);
element.setAttributeNodeNS(attr);
}
// Recursive descent
token = parser.nextToken();
parse(parser, document, element, XmlPullParser.END_TAG);
// Expect the element's end tag here
parser.require(XmlPullParser.END_TAG, namespace, name);
} else {
// Collect info for element node
String name = parser.getName();
// Create element node and wire it correctly
Element element = document.createElement(name);
node.appendChild(element);
for (int i = 0; i < parser.getAttributeCount(); i++) {
// Collect info for a single attribute node
String attrName = parser.getAttributeName(i);
String attrValue = parser.getAttributeValue(i);
// Create attribute node and wire it correctly
Attr attr = document.createAttribute(attrName);
attr.setValue(attrValue);
element.setAttributeNode(attr);
}
// Recursive descent
token = parser.nextToken();
parse(parser, document, element, XmlPullParser.END_TAG);
// Expect the element's end tag here
parser.require(XmlPullParser.END_TAG, "", name);
}
}
token = parser.nextToken();
}
}
/**
* @param isText true for a normal TextNode, false for a CDATA section.
* (If we're not coalescing, it matters which kind of node we put into the DOM.)
*/
private void appendText(Document document, Node node, boolean isText, String text) {
// Ignore empty runs.
if (text.length() == 0) {
return;
}
// Merge with any previous text node if possible.
if (coalescing) {
Node lastChild = node.getLastChild();
if (lastChild != null && lastChild.getNodeType() == Node.TEXT_NODE) {
Text textNode = (Text) lastChild;
textNode.setData(textNode.getNodeValue() + text);
return;
}
}
// Okay, we really do need a new text node
if (isText) {
node.appendChild(document.createTextNode(text));
} else {
node.appendChild(document.createCDATASection(text));
}
}
@Override
public void setEntityResolver(EntityResolver resolver) {
entityResolver = resolver;
}
@Override
public void setErrorHandler(ErrorHandler handler) {
errorHandler = handler;
}
/**
* Controls whether this DocumentBuilder ignores comments.
*
* @param value Turns comment ignorance on or off.
*/
public void setIgnoreComments(boolean value) {
ignoreComments = value;
}
public void setCoalescing(boolean value) {
coalescing = value;
}
/**
* Controls whether this DocumentBuilder ignores element content whitespace.
*
* @param value Turns element whitespace content ignorance on or off.
*/
public void setIgnoreElementContentWhitespace(boolean value) {
ignoreElementContentWhitespace = value;
}
/**
* Controls whether this DocumentBuilder is namespace-aware.
*
* @param value Turns namespace awareness on or off.
*/
public void setNamespaceAware(boolean value) {
namespaceAware = value;
}
/**
* Resolves predefined XML character or entity references.
*
* @param entity The reference to resolve, not including
* the ampersand or the semicolon.
*
* @return The proper replacement, or null, if the entity is unknown.
*/
private String resolveStandardEntity(String entity) {
// Character references, section 4.1 of the XML specification.
if (entity.startsWith("#x")) {
return resolveCharacterReference(entity.substring(2), 16);
} else if (entity.startsWith("#")) {
return resolveCharacterReference(entity.substring(1), 10);
}
// Predefined entities, section 4.6 of the XML specification.
if ("lt".equals(entity)) {
return "<";
} else if ("gt".equals(entity)) {
return ">";
} else if ("amp".equals(entity)) {
return "&";
} else if ("apos".equals(entity)) {
return "'";
} else if ("quot".equals(entity)) {
return "\"";
} else {
return null;
}
}
private String resolveCharacterReference(String value, int base) {
try {
int ch = Integer.parseInt(value, base);
if (ch < 0 || ch > Character.MAX_VALUE) {
return null;
}
return String.valueOf((char) ch);
} catch (NumberFormatException ex) {
return null;
}
}
}