blob: 0997f2370dbc34a514a2e69a644e880e2be7e895 [file] [log] [blame]
// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
//
// TagSoup is licensed under the Apache License,
// Version 2.0. You may obtain a copy of this license at
// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
// additional legal rights not granted by this license.
//
// TagSoup is distributed in the hope that it will be useful, but
// unless required by applicable law or agreed to in writing, TagSoup
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, either express or implied; not even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
//
// The TagSoup parser
package org.ccil.cowan.tagsoup;
import java.util.HashMap;
import java.util.ArrayList;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.ext.LexicalHandler;
/**
The SAX parser class.
**/
public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler {
// XMLReader implementation
private ContentHandler theContentHandler = this;
private LexicalHandler theLexicalHandler = this;
private DTDHandler theDTDHandler = this;
private ErrorHandler theErrorHandler = this;
private EntityResolver theEntityResolver = this;
private Schema theSchema;
private Scanner theScanner;
private AutoDetector theAutoDetector;
// Default values for feature flags
private static boolean DEFAULT_NAMESPACES = true;
private static boolean DEFAULT_IGNORE_BOGONS = false;
private static boolean DEFAULT_BOGONS_EMPTY = false;
private static boolean DEFAULT_ROOT_BOGONS = true;
private static boolean DEFAULT_DEFAULT_ATTRIBUTES = true;
private static boolean DEFAULT_TRANSLATE_COLONS = false;
private static boolean DEFAULT_RESTART_ELEMENTS = true;
private static boolean DEFAULT_IGNORABLE_WHITESPACE = false;
private static boolean DEFAULT_CDATA_ELEMENTS = true;
// Feature flags.
private boolean namespaces = DEFAULT_NAMESPACES;
private boolean ignoreBogons = DEFAULT_IGNORE_BOGONS;
private boolean bogonsEmpty = DEFAULT_BOGONS_EMPTY;
private boolean rootBogons = DEFAULT_ROOT_BOGONS;
private boolean defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES;
private boolean translateColons = DEFAULT_TRANSLATE_COLONS;
private boolean restartElements = DEFAULT_RESTART_ELEMENTS;
private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE;
private boolean CDATAElements = DEFAULT_CDATA_ELEMENTS;
/**
A value of "true" indicates namespace URIs and unprefixed local
names for element and attribute names will be available.
**/
public final static String namespacesFeature =
"http://xml.org/sax/features/namespaces";
/**
A value of "true" indicates that XML qualified names (with prefixes)
and attributes (including xmlns* attributes) will be available.
We don't support this value.
**/
public final static String namespacePrefixesFeature =
"http://xml.org/sax/features/namespace-prefixes";
/**
Reports whether this parser processes external general entities
(it doesn't).
**/
public final static String externalGeneralEntitiesFeature =
"http://xml.org/sax/features/external-general-entities";
/**
Reports whether this parser processes external parameter entities
(it doesn't).
**/
public final static String externalParameterEntitiesFeature =
"http://xml.org/sax/features/external-parameter-entities";
/**
May be examined only during a parse, after the startDocument()
callback has been completed; read-only. The value is true if
the document specified standalone="yes" in its XML declaration,
and otherwise is false. (It's always false.)
**/
public final static String isStandaloneFeature =
"http://xml.org/sax/features/is-standalone";
/**
A value of "true" indicates that the LexicalHandler will report
the beginning and end of parameter entities (it won't).
**/
public final static String lexicalHandlerParameterEntitiesFeature =
"http://xml.org/sax/features/lexical-handler/parameter-entities";
/**
A value of "true" indicates that system IDs in declarations will
be absolutized (relative to their base URIs) before reporting.
(This returns true but doesn't actually do anything.)
**/
public final static String resolveDTDURIsFeature =
"http://xml.org/sax/features/resolve-dtd-uris";
/**
Has a value of "true" if all XML names (for elements,
prefixes, attributes, entities, notations, and local
names), as well as Namespace URIs, will have been interned
using java.lang.String.intern. This supports fast testing of
equality/inequality against string constants, rather than forcing
slower calls to String.equals(). (We always intern.)
**/
public final static String stringInterningFeature =
"http://xml.org/sax/features/string-interning";
/**
Returns "true" if the Attributes objects passed by this
parser in ContentHandler.startElement() implement the
org.xml.sax.ext.Attributes2 interface. (They don't.)
**/
public final static String useAttributes2Feature =
"http://xml.org/sax/features/use-attributes2";
/**
Returns "true" if the Locator objects passed by this parser
in ContentHandler.setDocumentLocator() implement the
org.xml.sax.ext.Locator2 interface. (They don't.)
**/
public final static String useLocator2Feature =
"http://xml.org/sax/features/use-locator2";
/**
Returns "true" if, when setEntityResolver is given an object
implementing the org.xml.sax.ext.EntityResolver2 interface,
those new methods will be used. (They won't be.)
**/
public final static String useEntityResolver2Feature =
"http://xml.org/sax/features/use-entity-resolver2";
/**
Controls whether the parser is reporting all validity errors
(We don't report any validity errors.)
**/
public final static String validationFeature =
"http://xml.org/sax/features/validation";
/**
Controls whether the parser reports Unicode normalization
errors as described in section 2.13 and Appendix B of the XML
1.1 Recommendation. (We don't normalize.)
**/
public final static String unicodeNormalizationCheckingFeature =
"http://xml.org/sax/features/unicode-normalization-checking";
/**
Controls whether, when the namespace-prefixes feature is set,
the parser treats namespace declaration attributes as being in
the http://www.w3.org/2000/xmlns/ namespace. (It doesn't.)
**/
public final static String xmlnsURIsFeature =
"http://xml.org/sax/features/xmlns-uris";
/**
Returns "true" if the parser supports both XML 1.1 and XML 1.0.
(Always false.)
**/
public final static String XML11Feature =
"http://xml.org/sax/features/xml-1.1";
/**
A value of "true" indicates that the parser will ignore
unknown elements.
**/
public final static String ignoreBogonsFeature =
"http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons";
/**
A value of "true" indicates that the parser will give unknown
elements a content model of EMPTY; a value of "false", a
content model of ANY.
**/
public final static String bogonsEmptyFeature =
"http://www.ccil.org/~cowan/tagsoup/features/bogons-empty";
/**
A value of "true" indicates that the parser will allow unknown
elements to be the root element.
**/
public final static String rootBogonsFeature =
"http://www.ccil.org/~cowan/tagsoup/features/root-bogons";
/**
A value of "true" indicates that the parser will return default
attribute values for missing attributes that have default values.
**/
public final static String defaultAttributesFeature =
"http://www.ccil.org/~cowan/tagsoup/features/default-attributes";
/**
A value of "true" indicates that the parser will
translate colons into underscores in names.
**/
public final static String translateColonsFeature =
"http://www.ccil.org/~cowan/tagsoup/features/translate-colons";
/**
A value of "true" indicates that the parser will
attempt to restart the restartable elements.
**/
public final static String restartElementsFeature =
"http://www.ccil.org/~cowan/tagsoup/features/restart-elements";
/**
A value of "true" indicates that the parser will
transmit whitespace in element-only content via the SAX
ignorableWhitespace callback. Normally this is not done,
because HTML is an SGML application and SGML suppresses
such whitespace.
**/
public final static String ignorableWhitespaceFeature =
"http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace";
/**
A value of "true" indicates that the parser will treat CDATA
elements specially. Normally true, since the input is by
default HTML.
**/
public final static String CDATAElementsFeature =
"http://www.ccil.org/~cowan/tagsoup/features/cdata-elements";
/**
Used to see some syntax events that are essential in some
applications: comments, CDATA delimiters, selected general
entity inclusions, and the start and end of the DTD (and
declaration of document element name). The Object must implement
org.xml.sax.ext.LexicalHandler.
**/
public final static String lexicalHandlerProperty =
"http://xml.org/sax/properties/lexical-handler";
/**
Specifies the Scanner object this Parser uses.
**/
public final static String scannerProperty =
"http://www.ccil.org/~cowan/tagsoup/properties/scanner";
/**
Specifies the Schema object this Parser uses.
**/
public final static String schemaProperty =
"http://www.ccil.org/~cowan/tagsoup/properties/schema";
/**
Specifies the AutoDetector (for encoding detection) this Parser uses.
**/
public final static String autoDetectorProperty =
"http://www.ccil.org/~cowan/tagsoup/properties/auto-detector";
// Due to sucky Java order of initialization issues, these
// entries are maintained separately from the initial values of
// the corresponding instance variables, but care must be taken
// to keep them in sync.
private HashMap theFeatures = new HashMap();
{
theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES));
theFeatures.put(namespacePrefixesFeature, Boolean.FALSE);
theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE);
theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE);
theFeatures.put(isStandaloneFeature, Boolean.FALSE);
theFeatures.put(lexicalHandlerParameterEntitiesFeature,
Boolean.FALSE);
theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE);
theFeatures.put(stringInterningFeature, Boolean.TRUE);
theFeatures.put(useAttributes2Feature, Boolean.FALSE);
theFeatures.put(useLocator2Feature, Boolean.FALSE);
theFeatures.put(useEntityResolver2Feature, Boolean.FALSE);
theFeatures.put(validationFeature, Boolean.FALSE);
theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
theFeatures.put(XML11Feature, Boolean.FALSE);
theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS));
theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY));
theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS));
theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES));
theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS));
theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS));
theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE));
theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS));
}
// Private clone of Boolean.valueOf that is guaranteed to return
// Boolean.TRUE or Boolean.FALSE
private static Boolean truthValue(boolean b) {
return b ? Boolean.TRUE : Boolean.FALSE;
}
public boolean getFeature (String name)
throws SAXNotRecognizedException, SAXNotSupportedException {
Boolean b = (Boolean)theFeatures.get(name);
if (b == null) {
throw new SAXNotRecognizedException("Unknown feature " + name);
}
return b.booleanValue();
}
public void setFeature (String name, boolean value)
throws SAXNotRecognizedException, SAXNotSupportedException {
Boolean b = (Boolean)theFeatures.get(name);
if (b == null) {
throw new SAXNotRecognizedException("Unknown feature " + name);
}
if (value) theFeatures.put(name, Boolean.TRUE);
else theFeatures.put(name, Boolean.FALSE);
if (name.equals(namespacesFeature)) namespaces = value;
else if (name.equals(ignoreBogonsFeature)) ignoreBogons = value;
else if (name.equals(bogonsEmptyFeature)) bogonsEmpty = value;
else if (name.equals(rootBogonsFeature)) rootBogons = value;
else if (name.equals(defaultAttributesFeature)) defaultAttributes = value;
else if (name.equals(translateColonsFeature)) translateColons = value;
else if (name.equals(restartElementsFeature)) restartElements = value;
else if (name.equals(ignorableWhitespaceFeature)) ignorableWhitespace = value;
else if (name.equals(CDATAElementsFeature)) CDATAElements = value;
}
public Object getProperty (String name)
throws SAXNotRecognizedException, SAXNotSupportedException {
if (name.equals(lexicalHandlerProperty)) {
return theLexicalHandler == this ? null : theLexicalHandler;
}
else if (name.equals(scannerProperty)) {
return theScanner;
}
else if (name.equals(schemaProperty)) {
return theSchema;
}
else if (name.equals(autoDetectorProperty)) {
return theAutoDetector;
}
else {
throw new SAXNotRecognizedException("Unknown property " + name);
}
}
public void setProperty (String name, Object value)
throws SAXNotRecognizedException, SAXNotSupportedException {
if (name.equals(lexicalHandlerProperty)) {
if (value == null) {
theLexicalHandler = this;
}
else if (value instanceof LexicalHandler) {
theLexicalHandler = (LexicalHandler)value;
}
else {
throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler");
}
}
else if (name.equals(scannerProperty)) {
if (value instanceof Scanner) {
theScanner = (Scanner)value;
}
else {
throw new SAXNotSupportedException("Your scanner is not a Scanner");
}
}
else if (name.equals(schemaProperty)) {
if (value instanceof Schema) {
theSchema = (Schema)value;
}
else {
throw new SAXNotSupportedException("Your schema is not a Schema");
}
}
else if (name.equals(autoDetectorProperty)) {
if (value instanceof AutoDetector) {
theAutoDetector = (AutoDetector)value;
}
else {
throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector");
}
}
else {
throw new SAXNotRecognizedException("Unknown property " + name);
}
}
public void setEntityResolver (EntityResolver resolver) {
theEntityResolver = (resolver == null) ? this : resolver;
}
public EntityResolver getEntityResolver () {
return (theEntityResolver == this) ? null : theEntityResolver;
}
public void setDTDHandler (DTDHandler handler) {
theDTDHandler = (handler == null) ? this : handler;
}
public DTDHandler getDTDHandler () {
return (theDTDHandler == this) ? null : theDTDHandler;
}
public void setContentHandler (ContentHandler handler) {
theContentHandler = (handler == null) ? this : handler;
}
public ContentHandler getContentHandler () {
return (theContentHandler == this) ? null : theContentHandler;
}
public void setErrorHandler (ErrorHandler handler) {
theErrorHandler = (handler == null) ? this : handler;
}
public ErrorHandler getErrorHandler () {
return (theErrorHandler == this) ? null : theErrorHandler;
}
public void parse (InputSource input) throws IOException, SAXException {
setup();
Reader r = getReader(input);
theContentHandler.startDocument();
theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId());
if (theScanner instanceof Locator) {
theContentHandler.setDocumentLocator((Locator)theScanner);
}
if (!(theSchema.getURI().equals("")))
theContentHandler.startPrefixMapping(theSchema.getPrefix(),
theSchema.getURI());
theScanner.scan(r, this);
}
public void parse (String systemid) throws IOException, SAXException {
parse(new InputSource(systemid));
}
// Sets up instance variables that haven't been set by setFeature
private void setup() {
if (theSchema == null) theSchema = new HTMLSchema();
if (theScanner == null) theScanner = new HTMLScanner();
if (theAutoDetector == null) {
theAutoDetector = new AutoDetector() {
public Reader autoDetectingReader(InputStream i) {
return new InputStreamReader(i);
}
};
}
theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes);
thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes);
theNewElement = null;
theAttributeName = null;
thePITarget = null;
theSaved = null;
theEntity = 0;
virginStack = true;
theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null;
}
// Return a Reader based on the contents of an InputSource
// Buffer both the InputStream and the Reader
private Reader getReader(InputSource s) throws SAXException, IOException {
Reader r = s.getCharacterStream();
InputStream i = s.getByteStream();
String encoding = s.getEncoding();
String publicid = s.getPublicId();
String systemid = s.getSystemId();
if (r == null) {
if (i == null) i = getInputStream(publicid, systemid);
// i = new BufferedInputStream(i);
if (encoding == null) {
r = theAutoDetector.autoDetectingReader(i);
}
else {
try {
r = new InputStreamReader(i, encoding);
}
catch (UnsupportedEncodingException e) {
r = new InputStreamReader(i);
}
}
}
// r = new BufferedReader(r);
return r;
}
// Get an InputStream based on a publicid and a systemid
private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException {
URL basis = new URL("file", "", System.getProperty("user.dir") + "/.");
URL url = new URL(basis, systemid);
URLConnection c = url.openConnection();
return c.getInputStream();
}
// We don't process publicids (who uses them anyhow?)
// ScanHandler implementation
private Element theNewElement = null;
private String theAttributeName = null;
private boolean theDoctypeIsPresent = false;
private String theDoctypePublicId = null;
private String theDoctypeSystemId = null;
private String theDoctypeName = null;
private String thePITarget = null;
private Element theStack = null;
private Element theSaved = null;
private Element thePCDATA = null;
private int theEntity = 0; // needs to support chars past U+FFFF
public void adup(char[] buff, int offset, int length) throws SAXException {
if (theNewElement == null || theAttributeName == null) return;
theNewElement.setAttribute(theAttributeName, null, theAttributeName);
theAttributeName = null;
}
public void aname(char[] buff, int offset, int length) throws SAXException {
if (theNewElement == null) return;
// Currently we don't rely on Schema to canonicalize
// attribute names.
theAttributeName = makeName(buff, offset, length).toLowerCase();
// System.err.println("%% Attribute name " + theAttributeName);
}
public void aval(char[] buff, int offset, int length) throws SAXException {
if (theNewElement == null || theAttributeName == null) return;
String value = new String(buff, offset, length);
// System.err.println("%% Attribute value [" + value + "]");
value = expandEntities(value);
theNewElement.setAttribute(theAttributeName, null, value);
theAttributeName = null;
// System.err.println("%% Aval done");
}
// Expand entity references in attribute values selectively.
// Currently we expand a reference iff it is properly terminated
// with a semicolon.
private String expandEntities(String src) {
int refStart = -1;
int len = src.length();
char[] dst = new char[len];
int dstlen = 0;
for (int i = 0; i < len; i++) {
char ch = src.charAt(i);
dst[dstlen++] = ch;
// System.err.print("i = " + i + ", d = " + dstlen + ", ch = [" + ch + "] ");
if (ch == '&' && refStart == -1) {
// start of a ref excluding &
refStart = dstlen;
// System.err.println("start of ref");
}
else if (refStart == -1) {
// not in a ref
// System.err.println("not in ref");
}
else if (Character.isLetter(ch) ||
Character.isDigit(ch) ||
ch == '#') {
// valid entity char
// System.err.println("valid");
}
else if (ch == ';') {
// properly terminated ref
// System.err.print("got [" + new String(dst, refStart, dstlen-refStart-1) + "]");
int ent = lookupEntity(dst, refStart, dstlen - refStart - 1);
// System.err.println(" = " + ent);
if (ent > 0xFFFF) {
ent -= 0x10000;
dst[refStart - 1] = (char)((ent>>10) + 0xD800);
dst[refStart] = (char)((ent&0x3FF) + 0xDC00);
dstlen = refStart + 1;
}
else if (ent != 0) {
dst[refStart - 1] = (char)ent;
dstlen = refStart;
}
refStart = -1;
}
else {
// improperly terminated ref
// System.err.println("end of ref");
refStart = -1;
}
}
return new String(dst, 0, dstlen);
}
public void entity(char[] buff, int offset, int length) throws SAXException {
theEntity = lookupEntity(buff, offset, length);
}
// Process numeric character references,
// deferring to the schema for named ones.
private int lookupEntity(char[] buff, int offset, int length) {
int result = 0;
if (length < 1) return result;
// System.err.println("%% Entity at " + offset + " " + length);
// System.err.println("%% Got entity [" + new String(buff, offset, length) + "]");
if (buff[offset] == '#') {
if (length > 1 && (buff[offset+1] == 'x'
|| buff[offset+1] == 'X')) {
try {
return Integer.parseInt(new String(buff, offset + 2, length - 2), 16);
}
catch (NumberFormatException e) { return 0; }
}
try {
return Integer.parseInt(new String(buff, offset + 1, length - 1), 10);
}
catch (NumberFormatException e) { return 0; }
}
return theSchema.getEntity(new String(buff, offset, length));
}
public void eof(char[] buff, int offset, int length) throws SAXException {
if (virginStack) rectify(thePCDATA);
while (theStack.next() != null) {
pop();
}
if (!(theSchema.getURI().equals("")))
theContentHandler.endPrefixMapping(theSchema.getPrefix());
theContentHandler.endDocument();
}
public void etag(char[] buff, int offset, int length) throws SAXException {
if (etag_cdata(buff, offset, length)) return;
etag_basic(buff, offset, length);
}
private static char[] etagchars = {'<', '/', '>'};
public boolean etag_cdata(char[] buff, int offset, int length) throws SAXException {
String currentName = theStack.name();
// If this is a CDATA element and the tag doesn't match,
// or isn't properly formed (junk after the name),
// restart CDATA mode and process the tag as characters.
if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
boolean realTag = (length == currentName.length());
if (realTag) {
for (int i = 0; i < length; i++) {
if (Character.toLowerCase(buff[offset + i]) != Character.toLowerCase(currentName.charAt(i))) {
realTag = false;
break;
}
}
}
if (!realTag) {
theContentHandler.characters(etagchars, 0, 2);
theContentHandler.characters(buff, offset, length);
theContentHandler.characters(etagchars, 2, 1);
theScanner.startCDATA();
return true;
}
}
return false;
}
public void etag_basic(char[] buff, int offset, int length) throws SAXException {
theNewElement = null;
String name;
if (length != 0) {
// Canonicalize case of name
name = makeName(buff, offset, length);
// System.err.println("got etag [" + name + "]");
ElementType type = theSchema.getElementType(name);
if (type == null) return; // mysterious end-tag
name = type.name();
}
else {
name = theStack.name();
}
// System.err.println("%% Got end of " + name);
Element sp;
boolean inNoforce = false;
for (sp = theStack; sp != null; sp = sp.next()) {
if (sp.name().equals(name)) break;
if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true;
}
if (sp == null) return; // Ignore unknown etags
if (sp.next() == null || sp.next().next() == null) return;
if (inNoforce) { // inside an F_NOFORCE element?
sp.preclose(); // preclose the matching element
}
else { // restartably pop everything above us
while (theStack != sp) {
restartablyPop();
}
pop();
}
// pop any preclosed elements now at the top
while (theStack.isPreclosed()) {
pop();
}
restart(null);
}
// Push restartables on the stack if possible
// e is the next element to be started, if we know what it is
private void restart(Element e) throws SAXException {
while (theSaved != null && theStack.canContain(theSaved) &&
(e == null || theSaved.canContain(e))) {
Element next = theSaved.next();
push(theSaved);
theSaved = next;
}
}
// Pop the stack irrevocably
private void pop() throws SAXException {
if (theStack == null) return; // empty stack
String name = theStack.name();
String localName = theStack.localName();
String namespace = theStack.namespace();
String prefix = prefixOf(name);
// System.err.println("%% Popping " + name);
if (!namespaces) namespace = localName = "";
theContentHandler.endElement(namespace, localName, name);
if (foreign(prefix, namespace)) {
theContentHandler.endPrefixMapping(prefix);
// System.err.println("%% Unmapping [" + prefix + "] for elements to " + namespace);
}
Attributes atts = theStack.atts();
for (int i = atts.getLength() - 1; i >= 0; i--) {
String attNamespace = atts.getURI(i);
String attPrefix = prefixOf(atts.getQName(i));
if (foreign(attPrefix, attNamespace)) {
theContentHandler.endPrefixMapping(attPrefix);
// System.err.println("%% Unmapping [" + attPrefix + "] for attributes to " + attNamespace);
}
}
theStack = theStack.next();
}
// Pop the stack restartably
private void restartablyPop() throws SAXException {
Element popped = theStack;
pop();
if (restartElements && (popped.flags() & Schema.F_RESTART) != 0) {
popped.anonymize();
popped.setNext(theSaved);
theSaved = popped;
}
}
// Push element onto stack
private boolean virginStack = true;
private void push(Element e) throws SAXException {
String name = e.name();
String localName = e.localName();
String namespace = e.namespace();
String prefix = prefixOf(name);
// System.err.println("%% Pushing " + name);
e.clean();
if (!namespaces) namespace = localName = "";
if (virginStack && localName.equalsIgnoreCase(theDoctypeName)) {
try {
theEntityResolver.resolveEntity(theDoctypePublicId, theDoctypeSystemId);
} catch (IOException ew) { } // Can't be thrown for root I believe.
}
if (foreign(prefix, namespace)) {
theContentHandler.startPrefixMapping(prefix, namespace);
// System.err.println("%% Mapping [" + prefix + "] for elements to " + namespace);
}
Attributes atts = e.atts();
int len = atts.getLength();
for (int i = 0; i < len; i++) {
String attNamespace = atts.getURI(i);
String attPrefix = prefixOf(atts.getQName(i));
if (foreign(attPrefix, attNamespace)) {
theContentHandler.startPrefixMapping(attPrefix, attNamespace);
// System.err.println("%% Mapping [" + attPrefix + "] for attributes to " + attNamespace);
}
}
theContentHandler.startElement(namespace, localName, name, e.atts());
e.setNext(theStack);
theStack = e;
virginStack = false;
if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
theScanner.startCDATA();
}
}
// Get the prefix from a QName
private String prefixOf(String name) {
int i = name.indexOf(':');
String prefix = "";
if (i != -1) prefix = name.substring(0, i);
// System.err.println("%% " + prefix + " is prefix of " + name);
return prefix;
}
// Return true if we have a foreign name
private boolean foreign(String prefix, String namespace) {
// System.err.print("%% Testing " + prefix + " and " + namespace + " for foreignness -- ");
boolean foreign = !(prefix.equals("") || namespace.equals("") ||
namespace.equals(theSchema.getURI()));
// System.err.println(foreign);
return foreign;
}
/**
* Parsing the complete XML Document Type Definition is way too complex,
* but for many simple cases we can extract something useful from it.
*
* doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
* DeclSep ::= PEReference | S
* intSubset ::= (markupdecl | DeclSep)*
* markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
* ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
*/
public void decl(char[] buff, int offset, int length) throws SAXException {
String s = new String(buff, offset, length);
String name = null;
String systemid = null;
String publicid = null;
String[] v = split(s);
if (v.length > 0 && "DOCTYPE".equals(v[0])) {
if (theDoctypeIsPresent) return; // one doctype only!
theDoctypeIsPresent = true;
if (v.length > 1) {
name = v[1];
if (v.length>3 && "SYSTEM".equals(v[2])) {
systemid = v[3];
}
else if (v.length > 3 && "PUBLIC".equals(v[2])) {
publicid = v[3];
if (v.length > 4) {
systemid = v[4];
}
else {
systemid = "";
}
}
}
}
publicid = trimquotes(publicid);
systemid = trimquotes(systemid);
if (name != null) {
publicid = cleanPublicid(publicid);
theLexicalHandler.startDTD(name, publicid, systemid);
theLexicalHandler.endDTD();
theDoctypeName = name;
theDoctypePublicId = publicid;
if (theScanner instanceof Locator) { // Must resolve systemid
theDoctypeSystemId = ((Locator)theScanner).getSystemId();
try {
theDoctypeSystemId = new URL(new URL(theDoctypeSystemId), systemid).toString();
} catch (Exception e) {}
}
}
}
// If the String is quoted, trim the quotes.
private static String trimquotes(String in) {
if (in == null) return in;
int length = in.length();
if (length == 0) return in;
char s = in.charAt(0);
char e = in.charAt(length - 1);
if (s == e && (s == '\'' || s == '"')) {
in = in.substring(1, in.length() - 1);
}
return in;
}
// Split the supplied String into words or phrases seperated by spaces.
// Recognises quotes around a phrase and doesn't split it.
private static String[] split(String val) throws IllegalArgumentException {
val = val.trim();
if (val.length() == 0) {
return new String[0];
}
else {
ArrayList l = new ArrayList();
int s = 0;
int e = 0;
boolean sq = false; // single quote
boolean dq = false; // double quote
char lastc = 0;
int len = val.length();
for (e=0; e < len; e++) {
char c = val.charAt(e);
if (!dq && c == '\'' && lastc != '\\') {
sq = !sq;
if (s < 0) s = e;
}
else if (!sq && c == '\"' && lastc != '\\') {
dq = !dq;
if (s < 0) s = e;
}
else if (!sq && !dq) {
if (Character.isWhitespace(c)) {
if (s >= 0) l.add(val.substring(s, e));
s = -1;
}
else if (s < 0 && c != ' ') {
s = e;
}
}
lastc = c;
}
l.add(val.substring(s, e));
return (String[])l.toArray(new String[0]);
}
}
// Replace junk in publicids with spaces
private static String legal =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%";
private String cleanPublicid(String src) {
if (src == null) return null;
int len = src.length();
StringBuffer dst = new StringBuffer(len);
boolean suppressSpace = true;
for (int i = 0; i < len; i++) {
char ch = src.charAt(i);
if (legal.indexOf(ch) != -1) { // legal but not whitespace
dst.append(ch);
suppressSpace = false;
}
else if (suppressSpace) { // normalizable whitespace or junk
;
}
else {
dst.append(' ');
suppressSpace = true;
}
}
// System.err.println("%% Publicid [" + dst.toString().trim() + "]");
return dst.toString().trim(); // trim any final junk whitespace
}
public void gi(char[] buff, int offset, int length) throws SAXException {
if (theNewElement != null) return;
String name = makeName(buff, offset, length);
if (name == null) return;
ElementType type = theSchema.getElementType(name);
if (type == null) {
// Suppress unknown elements if ignore-bogons is on
if (ignoreBogons) return;
int bogonModel = bogonsEmpty ? Schema.M_EMPTY : Schema.M_ANY;
int bogonMemberOf = rootBogons ? Schema.M_ANY : (Schema.M_ANY & ~ Schema.M_ROOT);
theSchema.elementType(name, bogonModel, bogonMemberOf, 0);
if (!rootBogons) theSchema.parent(name, theSchema.rootElementType().name());
type = theSchema.getElementType(name);
}
theNewElement = new Element(type, defaultAttributes);
// System.err.println("%% Got GI " + theNewElement.name());
}
public void cdsect(char[] buff, int offset, int length) throws SAXException {
theLexicalHandler.startCDATA();
pcdata(buff, offset, length);
theLexicalHandler.endCDATA();
}
public void pcdata(char[] buff, int offset, int length) throws SAXException {
if (length == 0) return;
boolean allWhite = true;
for (int i = 0; i < length; i++) {
if (!Character.isWhitespace(buff[offset+i])) {
allWhite = false;
}
}
if (allWhite && !theStack.canContain(thePCDATA)) {
if (ignorableWhitespace) {
theContentHandler.ignorableWhitespace(buff, offset, length);
}
}
else {
rectify(thePCDATA);
theContentHandler.characters(buff, offset, length);
}
}
public void pitarget(char[] buff, int offset, int length) throws SAXException {
if (theNewElement != null) return;
thePITarget = makeName(buff, offset, length).replace(':', '_');
}
public void pi(char[] buff, int offset, int length) throws SAXException {
if (theNewElement != null || thePITarget == null) return;
if ("xml".equalsIgnoreCase(thePITarget)) return;
// if (length > 0 && buff[length - 1] == '?') System.err.println("%% Removing ? from PI");
if (length > 0 && buff[length - 1] == '?') length--; // remove trailing ?
theContentHandler.processingInstruction(thePITarget,
new String(buff, offset, length));
thePITarget = null;
}
public void stagc(char[] buff, int offset, int length) throws SAXException {
// System.err.println("%% Start-tag");
if (theNewElement == null) return;
rectify(theNewElement);
if (theStack.model() == Schema.M_EMPTY) {
// Force an immediate end tag
etag_basic(buff, offset, length);
}
}
public void stage(char[] buff, int offset, int length) throws SAXException {
// System.err.println("%% Empty-tag");
if (theNewElement == null) return;
rectify(theNewElement);
// Force an immediate end tag
etag_basic(buff, offset, length);
}
// Comment buffer is twice the size of the output buffer
private char[] theCommentBuffer = new char[2000];
public void cmnt(char[] buff, int offset, int length) throws SAXException {
theLexicalHandler.comment(buff, offset, length);
}
// Rectify the stack, pushing and popping as needed
// so that the argument can be safely pushed
private void rectify(Element e) throws SAXException {
Element sp;
while (true) {
for (sp = theStack; sp != null; sp = sp.next()) {
if (sp.canContain(e)) break;
}
if (sp != null) break;
ElementType parentType = e.parent();
if (parentType == null) break;
Element parent = new Element(parentType, defaultAttributes);
// System.err.println("%% Ascending from " + e.name() + " to " + parent.name());
parent.setNext(e);
e = parent;
}
if (sp == null) return; // don't know what to do
while (theStack != sp) {
if (theStack == null || theStack.next() == null ||
theStack.next().next() == null) break;
restartablyPop();
}
while (e != null) {
Element nexte = e.next();
if (!e.name().equals("<pcdata>")) push(e);
e = nexte;
restart(e);
}
theNewElement = null;
}
public int getEntity() {
return theEntity;
}
// Return the argument as a valid XML name
// This no longer lowercases the result: we depend on Schema to
// canonicalize case.
private String makeName(char[] buff, int offset, int length) {
StringBuffer dst = new StringBuffer(length + 2);
boolean seenColon = false;
boolean start = true;
// String src = new String(buff, offset, length); // DEBUG
for (; length-- > 0; offset++) {
char ch = buff[offset];
if (Character.isLetter(ch) || ch == '_') {
start = false;
dst.append(ch);
}
else if (Character.isDigit(ch) || ch == '-' || ch == '.') {
if (start) dst.append('_');
start = false;
dst.append(ch);
}
else if (ch == ':' && !seenColon) {
seenColon = true;
if (start) dst.append('_');
start = true;
dst.append(translateColons ? '_' : ch);
}
}
int dstLength = dst.length();
if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_');
// System.err.println("Made name \"" + dst + "\" from \"" + src + "\"");
return dst.toString().intern();
}
// Default LexicalHandler implementation
public void comment(char[] ch, int start, int length) throws SAXException { }
public void endCDATA() throws SAXException { }
public void endDTD() throws SAXException { }
public void endEntity(String name) throws SAXException { }
public void startCDATA() throws SAXException { }
public void startDTD(String name, String publicid, String systemid) throws SAXException { }
public void startEntity(String name) throws SAXException { }
}