| // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. |
| // |
| // TagSoup is licensed under the Apache License, |
| // Version 2.0. You may obtain a copy of this license at |
| // http://www.apache.org/licenses/LICENSE-2.0 . You may also have |
| // additional legal rights not granted by this license. |
| // |
| // TagSoup is distributed in the hope that it will be useful, but |
| // unless required by applicable law or agreed to in writing, TagSoup |
| // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS |
| // OF ANY KIND, either express or implied; not even the implied warranty |
| // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| // |
| // |
| // The TagSoup parser |
| |
| package org.ccil.cowan.tagsoup; |
| import java.util.HashMap; |
| import java.util.ArrayList; |
| import java.io.*; |
| import java.net.URL; |
| import java.net.URLConnection; |
| import org.xml.sax.*; |
| import org.xml.sax.helpers.DefaultHandler; |
| import org.xml.sax.ext.LexicalHandler; |
| |
| |
| /** |
| The SAX parser class. |
| **/ |
| public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler { |
| |
| // XMLReader implementation |
| |
| private ContentHandler theContentHandler = this; |
| private LexicalHandler theLexicalHandler = this; |
| private DTDHandler theDTDHandler = this; |
| private ErrorHandler theErrorHandler = this; |
| private EntityResolver theEntityResolver = this; |
| private Schema theSchema; |
| private Scanner theScanner; |
| private AutoDetector theAutoDetector; |
| |
| // Default values for feature flags |
| |
| private static boolean DEFAULT_NAMESPACES = true; |
| private static boolean DEFAULT_IGNORE_BOGONS = false; |
| private static boolean DEFAULT_BOGONS_EMPTY = false; |
| private static boolean DEFAULT_ROOT_BOGONS = true; |
| private static boolean DEFAULT_DEFAULT_ATTRIBUTES = true; |
| private static boolean DEFAULT_TRANSLATE_COLONS = false; |
| private static boolean DEFAULT_RESTART_ELEMENTS = true; |
| private static boolean DEFAULT_IGNORABLE_WHITESPACE = false; |
| private static boolean DEFAULT_CDATA_ELEMENTS = true; |
| |
| // Feature flags. |
| |
| private boolean namespaces = DEFAULT_NAMESPACES; |
| private boolean ignoreBogons = DEFAULT_IGNORE_BOGONS; |
| private boolean bogonsEmpty = DEFAULT_BOGONS_EMPTY; |
| private boolean rootBogons = DEFAULT_ROOT_BOGONS; |
| private boolean defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES; |
| private boolean translateColons = DEFAULT_TRANSLATE_COLONS; |
| private boolean restartElements = DEFAULT_RESTART_ELEMENTS; |
| private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE; |
| private boolean CDATAElements = DEFAULT_CDATA_ELEMENTS; |
| |
| /** |
| A value of "true" indicates namespace URIs and unprefixed local |
| names for element and attribute names will be available. |
| **/ |
| public final static String namespacesFeature = |
| "http://xml.org/sax/features/namespaces"; |
| |
| /** |
| A value of "true" indicates that XML qualified names (with prefixes) |
| and attributes (including xmlns* attributes) will be available. |
| We don't support this value. |
| **/ |
| public final static String namespacePrefixesFeature = |
| "http://xml.org/sax/features/namespace-prefixes"; |
| |
| /** |
| Reports whether this parser processes external general entities |
| (it doesn't). |
| **/ |
| public final static String externalGeneralEntitiesFeature = |
| "http://xml.org/sax/features/external-general-entities"; |
| |
| /** |
| Reports whether this parser processes external parameter entities |
| (it doesn't). |
| **/ |
| public final static String externalParameterEntitiesFeature = |
| "http://xml.org/sax/features/external-parameter-entities"; |
| |
| /** |
| May be examined only during a parse, after the startDocument() |
| callback has been completed; read-only. The value is true if |
| the document specified standalone="yes" in its XML declaration, |
| and otherwise is false. (It's always false.) |
| **/ |
| public final static String isStandaloneFeature = |
| "http://xml.org/sax/features/is-standalone"; |
| |
| /** |
| A value of "true" indicates that the LexicalHandler will report |
| the beginning and end of parameter entities (it won't). |
| **/ |
| public final static String lexicalHandlerParameterEntitiesFeature = |
| "http://xml.org/sax/features/lexical-handler/parameter-entities"; |
| |
| /** |
| A value of "true" indicates that system IDs in declarations will |
| be absolutized (relative to their base URIs) before reporting. |
| (This returns true but doesn't actually do anything.) |
| **/ |
| public final static String resolveDTDURIsFeature = |
| "http://xml.org/sax/features/resolve-dtd-uris"; |
| |
| /** |
| Has a value of "true" if all XML names (for elements, |
| prefixes, attributes, entities, notations, and local |
| names), as well as Namespace URIs, will have been interned |
| using java.lang.String.intern. This supports fast testing of |
| equality/inequality against string constants, rather than forcing |
| slower calls to String.equals(). (We always intern.) |
| **/ |
| public final static String stringInterningFeature = |
| "http://xml.org/sax/features/string-interning"; |
| |
| /** |
| Returns "true" if the Attributes objects passed by this |
| parser in ContentHandler.startElement() implement the |
| org.xml.sax.ext.Attributes2 interface. (They don't.) |
| **/ |
| |
| public final static String useAttributes2Feature = |
| "http://xml.org/sax/features/use-attributes2"; |
| |
| /** |
| Returns "true" if the Locator objects passed by this parser |
| in ContentHandler.setDocumentLocator() implement the |
| org.xml.sax.ext.Locator2 interface. (They don't.) |
| **/ |
| public final static String useLocator2Feature = |
| "http://xml.org/sax/features/use-locator2"; |
| |
| /** |
| Returns "true" if, when setEntityResolver is given an object |
| implementing the org.xml.sax.ext.EntityResolver2 interface, |
| those new methods will be used. (They won't be.) |
| **/ |
| public final static String useEntityResolver2Feature = |
| "http://xml.org/sax/features/use-entity-resolver2"; |
| |
| /** |
| Controls whether the parser is reporting all validity errors |
| (We don't report any validity errors.) |
| **/ |
| public final static String validationFeature = |
| "http://xml.org/sax/features/validation"; |
| |
| /** |
| Controls whether the parser reports Unicode normalization |
| errors as described in section 2.13 and Appendix B of the XML |
| 1.1 Recommendation. (We don't normalize.) |
| **/ |
| public final static String unicodeNormalizationCheckingFeature = |
| "http://xml.org/sax/features/unicode-normalization-checking"; |
| |
| /** |
| Controls whether, when the namespace-prefixes feature is set, |
| the parser treats namespace declaration attributes as being in |
| the http://www.w3.org/2000/xmlns/ namespace. (It doesn't.) |
| **/ |
| public final static String xmlnsURIsFeature = |
| "http://xml.org/sax/features/xmlns-uris"; |
| |
| /** |
| Returns "true" if the parser supports both XML 1.1 and XML 1.0. |
| (Always false.) |
| **/ |
| public final static String XML11Feature = |
| "http://xml.org/sax/features/xml-1.1"; |
| |
| /** |
| A value of "true" indicates that the parser will ignore |
| unknown elements. |
| **/ |
| public final static String ignoreBogonsFeature = |
| "http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons"; |
| |
| /** |
| A value of "true" indicates that the parser will give unknown |
| elements a content model of EMPTY; a value of "false", a |
| content model of ANY. |
| **/ |
| public final static String bogonsEmptyFeature = |
| "http://www.ccil.org/~cowan/tagsoup/features/bogons-empty"; |
| |
| /** |
| A value of "true" indicates that the parser will allow unknown |
| elements to be the root element. |
| **/ |
| public final static String rootBogonsFeature = |
| "http://www.ccil.org/~cowan/tagsoup/features/root-bogons"; |
| |
| /** |
| A value of "true" indicates that the parser will return default |
| attribute values for missing attributes that have default values. |
| **/ |
| public final static String defaultAttributesFeature = |
| "http://www.ccil.org/~cowan/tagsoup/features/default-attributes"; |
| |
| /** |
| A value of "true" indicates that the parser will |
| translate colons into underscores in names. |
| **/ |
| public final static String translateColonsFeature = |
| "http://www.ccil.org/~cowan/tagsoup/features/translate-colons"; |
| |
| /** |
| A value of "true" indicates that the parser will |
| attempt to restart the restartable elements. |
| **/ |
| public final static String restartElementsFeature = |
| "http://www.ccil.org/~cowan/tagsoup/features/restart-elements"; |
| |
| /** |
| A value of "true" indicates that the parser will |
| transmit whitespace in element-only content via the SAX |
| ignorableWhitespace callback. Normally this is not done, |
| because HTML is an SGML application and SGML suppresses |
| such whitespace. |
| **/ |
| public final static String ignorableWhitespaceFeature = |
| "http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace"; |
| |
| /** |
| A value of "true" indicates that the parser will treat CDATA |
| elements specially. Normally true, since the input is by |
| default HTML. |
| **/ |
| public final static String CDATAElementsFeature = |
| "http://www.ccil.org/~cowan/tagsoup/features/cdata-elements"; |
| |
| /** |
| Used to see some syntax events that are essential in some |
| applications: comments, CDATA delimiters, selected general |
| entity inclusions, and the start and end of the DTD (and |
| declaration of document element name). The Object must implement |
| org.xml.sax.ext.LexicalHandler. |
| **/ |
| public final static String lexicalHandlerProperty = |
| "http://xml.org/sax/properties/lexical-handler"; |
| |
| /** |
| Specifies the Scanner object this Parser uses. |
| **/ |
| public final static String scannerProperty = |
| "http://www.ccil.org/~cowan/tagsoup/properties/scanner"; |
| |
| /** |
| Specifies the Schema object this Parser uses. |
| **/ |
| public final static String schemaProperty = |
| "http://www.ccil.org/~cowan/tagsoup/properties/schema"; |
| |
| /** |
| Specifies the AutoDetector (for encoding detection) this Parser uses. |
| **/ |
| public final static String autoDetectorProperty = |
| "http://www.ccil.org/~cowan/tagsoup/properties/auto-detector"; |
| |
| // Due to sucky Java order of initialization issues, these |
| // entries are maintained separately from the initial values of |
| // the corresponding instance variables, but care must be taken |
| // to keep them in sync. |
| |
| private HashMap theFeatures = new HashMap(); |
| { |
| theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES)); |
| theFeatures.put(namespacePrefixesFeature, Boolean.FALSE); |
| theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE); |
| theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE); |
| theFeatures.put(isStandaloneFeature, Boolean.FALSE); |
| theFeatures.put(lexicalHandlerParameterEntitiesFeature, |
| Boolean.FALSE); |
| theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE); |
| theFeatures.put(stringInterningFeature, Boolean.TRUE); |
| theFeatures.put(useAttributes2Feature, Boolean.FALSE); |
| theFeatures.put(useLocator2Feature, Boolean.FALSE); |
| theFeatures.put(useEntityResolver2Feature, Boolean.FALSE); |
| theFeatures.put(validationFeature, Boolean.FALSE); |
| theFeatures.put(xmlnsURIsFeature, Boolean.FALSE); |
| theFeatures.put(xmlnsURIsFeature, Boolean.FALSE); |
| theFeatures.put(XML11Feature, Boolean.FALSE); |
| theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS)); |
| theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY)); |
| theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS)); |
| theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES)); |
| theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS)); |
| theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS)); |
| theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE)); |
| theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS)); |
| } |
| |
| // Private clone of Boolean.valueOf that is guaranteed to return |
| // Boolean.TRUE or Boolean.FALSE |
| private static Boolean truthValue(boolean b) { |
| return b ? Boolean.TRUE : Boolean.FALSE; |
| } |
| |
| |
| public boolean getFeature (String name) |
| throws SAXNotRecognizedException, SAXNotSupportedException { |
| Boolean b = (Boolean)theFeatures.get(name); |
| if (b == null) { |
| throw new SAXNotRecognizedException("Unknown feature " + name); |
| } |
| return b.booleanValue(); |
| } |
| |
| public void setFeature (String name, boolean value) |
| throws SAXNotRecognizedException, SAXNotSupportedException { |
| Boolean b = (Boolean)theFeatures.get(name); |
| if (b == null) { |
| throw new SAXNotRecognizedException("Unknown feature " + name); |
| } |
| if (value) theFeatures.put(name, Boolean.TRUE); |
| else theFeatures.put(name, Boolean.FALSE); |
| |
| if (name.equals(namespacesFeature)) namespaces = value; |
| else if (name.equals(ignoreBogonsFeature)) ignoreBogons = value; |
| else if (name.equals(bogonsEmptyFeature)) bogonsEmpty = value; |
| else if (name.equals(rootBogonsFeature)) rootBogons = value; |
| else if (name.equals(defaultAttributesFeature)) defaultAttributes = value; |
| else if (name.equals(translateColonsFeature)) translateColons = value; |
| else if (name.equals(restartElementsFeature)) restartElements = value; |
| else if (name.equals(ignorableWhitespaceFeature)) ignorableWhitespace = value; |
| else if (name.equals(CDATAElementsFeature)) CDATAElements = value; |
| } |
| |
| public Object getProperty (String name) |
| throws SAXNotRecognizedException, SAXNotSupportedException { |
| if (name.equals(lexicalHandlerProperty)) { |
| return theLexicalHandler == this ? null : theLexicalHandler; |
| } |
| else if (name.equals(scannerProperty)) { |
| return theScanner; |
| } |
| else if (name.equals(schemaProperty)) { |
| return theSchema; |
| } |
| else if (name.equals(autoDetectorProperty)) { |
| return theAutoDetector; |
| } |
| else { |
| throw new SAXNotRecognizedException("Unknown property " + name); |
| } |
| } |
| |
| public void setProperty (String name, Object value) |
| throws SAXNotRecognizedException, SAXNotSupportedException { |
| if (name.equals(lexicalHandlerProperty)) { |
| if (value == null) { |
| theLexicalHandler = this; |
| } |
| else if (value instanceof LexicalHandler) { |
| theLexicalHandler = (LexicalHandler)value; |
| } |
| else { |
| throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler"); |
| } |
| } |
| else if (name.equals(scannerProperty)) { |
| if (value instanceof Scanner) { |
| theScanner = (Scanner)value; |
| } |
| else { |
| throw new SAXNotSupportedException("Your scanner is not a Scanner"); |
| } |
| } |
| else if (name.equals(schemaProperty)) { |
| if (value instanceof Schema) { |
| theSchema = (Schema)value; |
| } |
| else { |
| throw new SAXNotSupportedException("Your schema is not a Schema"); |
| } |
| } |
| else if (name.equals(autoDetectorProperty)) { |
| if (value instanceof AutoDetector) { |
| theAutoDetector = (AutoDetector)value; |
| } |
| else { |
| throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector"); |
| } |
| } |
| else { |
| throw new SAXNotRecognizedException("Unknown property " + name); |
| } |
| } |
| |
| public void setEntityResolver (EntityResolver resolver) { |
| theEntityResolver = (resolver == null) ? this : resolver; |
| } |
| |
| public EntityResolver getEntityResolver () { |
| return (theEntityResolver == this) ? null : theEntityResolver; |
| } |
| |
| public void setDTDHandler (DTDHandler handler) { |
| theDTDHandler = (handler == null) ? this : handler; |
| } |
| |
| public DTDHandler getDTDHandler () { |
| return (theDTDHandler == this) ? null : theDTDHandler; |
| } |
| |
| public void setContentHandler (ContentHandler handler) { |
| theContentHandler = (handler == null) ? this : handler; |
| } |
| |
| public ContentHandler getContentHandler () { |
| return (theContentHandler == this) ? null : theContentHandler; |
| } |
| |
| public void setErrorHandler (ErrorHandler handler) { |
| theErrorHandler = (handler == null) ? this : handler; |
| } |
| |
| public ErrorHandler getErrorHandler () { |
| return (theErrorHandler == this) ? null : theErrorHandler; |
| } |
| |
| public void parse (InputSource input) throws IOException, SAXException { |
| setup(); |
| Reader r = getReader(input); |
| theContentHandler.startDocument(); |
| theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId()); |
| if (theScanner instanceof Locator) { |
| theContentHandler.setDocumentLocator((Locator)theScanner); |
| } |
| if (!(theSchema.getURI().equals(""))) |
| theContentHandler.startPrefixMapping(theSchema.getPrefix(), |
| theSchema.getURI()); |
| theScanner.scan(r, this); |
| } |
| |
| public void parse (String systemid) throws IOException, SAXException { |
| parse(new InputSource(systemid)); |
| } |
| |
| // Sets up instance variables that haven't been set by setFeature |
| private void setup() { |
| if (theSchema == null) theSchema = new HTMLSchema(); |
| if (theScanner == null) theScanner = new HTMLScanner(); |
| if (theAutoDetector == null) { |
| theAutoDetector = new AutoDetector() { |
| public Reader autoDetectingReader(InputStream i) { |
| return new InputStreamReader(i); |
| } |
| }; |
| } |
| theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes); |
| thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes); |
| theNewElement = null; |
| theAttributeName = null; |
| thePITarget = null; |
| theSaved = null; |
| theEntity = 0; |
| virginStack = true; |
| theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null; |
| } |
| |
| // Return a Reader based on the contents of an InputSource |
| // Buffer both the InputStream and the Reader |
| private Reader getReader(InputSource s) throws SAXException, IOException { |
| Reader r = s.getCharacterStream(); |
| InputStream i = s.getByteStream(); |
| String encoding = s.getEncoding(); |
| String publicid = s.getPublicId(); |
| String systemid = s.getSystemId(); |
| if (r == null) { |
| if (i == null) i = getInputStream(publicid, systemid); |
| // i = new BufferedInputStream(i); |
| if (encoding == null) { |
| r = theAutoDetector.autoDetectingReader(i); |
| } |
| else { |
| try { |
| r = new InputStreamReader(i, encoding); |
| } |
| catch (UnsupportedEncodingException e) { |
| r = new InputStreamReader(i); |
| } |
| } |
| } |
| // r = new BufferedReader(r); |
| return r; |
| } |
| |
| // Get an InputStream based on a publicid and a systemid |
| private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException { |
| URL basis = new URL("file", "", System.getProperty("user.dir") + "/."); |
| URL url = new URL(basis, systemid); |
| URLConnection c = url.openConnection(); |
| return c.getInputStream(); |
| } |
| // We don't process publicids (who uses them anyhow?) |
| |
| // ScanHandler implementation |
| |
| private Element theNewElement = null; |
| private String theAttributeName = null; |
| private boolean theDoctypeIsPresent = false; |
| private String theDoctypePublicId = null; |
| private String theDoctypeSystemId = null; |
| private String theDoctypeName = null; |
| private String thePITarget = null; |
| private Element theStack = null; |
| private Element theSaved = null; |
| private Element thePCDATA = null; |
| private int theEntity = 0; // needs to support chars past U+FFFF |
| |
| public void adup(char[] buff, int offset, int length) throws SAXException { |
| if (theNewElement == null || theAttributeName == null) return; |
| theNewElement.setAttribute(theAttributeName, null, theAttributeName); |
| theAttributeName = null; |
| } |
| |
| public void aname(char[] buff, int offset, int length) throws SAXException { |
| if (theNewElement == null) return; |
| // Currently we don't rely on Schema to canonicalize |
| // attribute names. |
| theAttributeName = makeName(buff, offset, length).toLowerCase(); |
| // System.err.println("%% Attribute name " + theAttributeName); |
| } |
| |
| public void aval(char[] buff, int offset, int length) throws SAXException { |
| if (theNewElement == null || theAttributeName == null) return; |
| String value = new String(buff, offset, length); |
| // System.err.println("%% Attribute value [" + value + "]"); |
| value = expandEntities(value); |
| theNewElement.setAttribute(theAttributeName, null, value); |
| theAttributeName = null; |
| // System.err.println("%% Aval done"); |
| } |
| |
| // Expand entity references in attribute values selectively. |
| // Currently we expand a reference iff it is properly terminated |
| // with a semicolon. |
| private String expandEntities(String src) { |
| int refStart = -1; |
| int len = src.length(); |
| char[] dst = new char[len]; |
| int dstlen = 0; |
| for (int i = 0; i < len; i++) { |
| char ch = src.charAt(i); |
| dst[dstlen++] = ch; |
| // System.err.print("i = " + i + ", d = " + dstlen + ", ch = [" + ch + "] "); |
| if (ch == '&' && refStart == -1) { |
| // start of a ref excluding & |
| refStart = dstlen; |
| // System.err.println("start of ref"); |
| } |
| else if (refStart == -1) { |
| // not in a ref |
| // System.err.println("not in ref"); |
| } |
| else if (Character.isLetter(ch) || |
| Character.isDigit(ch) || |
| ch == '#') { |
| // valid entity char |
| // System.err.println("valid"); |
| } |
| else if (ch == ';') { |
| // properly terminated ref |
| // System.err.print("got [" + new String(dst, refStart, dstlen-refStart-1) + "]"); |
| int ent = lookupEntity(dst, refStart, dstlen - refStart - 1); |
| // System.err.println(" = " + ent); |
| if (ent > 0xFFFF) { |
| ent -= 0x10000; |
| dst[refStart - 1] = (char)((ent>>10) + 0xD800); |
| dst[refStart] = (char)((ent&0x3FF) + 0xDC00); |
| dstlen = refStart + 1; |
| } |
| else if (ent != 0) { |
| dst[refStart - 1] = (char)ent; |
| dstlen = refStart; |
| } |
| refStart = -1; |
| } |
| else { |
| // improperly terminated ref |
| // System.err.println("end of ref"); |
| refStart = -1; |
| } |
| } |
| return new String(dst, 0, dstlen); |
| } |
| |
| public void entity(char[] buff, int offset, int length) throws SAXException { |
| theEntity = lookupEntity(buff, offset, length); |
| } |
| |
| // Process numeric character references, |
| // deferring to the schema for named ones. |
| private int lookupEntity(char[] buff, int offset, int length) { |
| int result = 0; |
| if (length < 1) return result; |
| // System.err.println("%% Entity at " + offset + " " + length); |
| // System.err.println("%% Got entity [" + new String(buff, offset, length) + "]"); |
| if (buff[offset] == '#') { |
| if (length > 1 && (buff[offset+1] == 'x' |
| || buff[offset+1] == 'X')) { |
| try { |
| return Integer.parseInt(new String(buff, offset + 2, length - 2), 16); |
| } |
| catch (NumberFormatException e) { return 0; } |
| } |
| try { |
| return Integer.parseInt(new String(buff, offset + 1, length - 1), 10); |
| } |
| catch (NumberFormatException e) { return 0; } |
| } |
| return theSchema.getEntity(new String(buff, offset, length)); |
| } |
| |
| public void eof(char[] buff, int offset, int length) throws SAXException { |
| if (virginStack) rectify(thePCDATA); |
| while (theStack.next() != null) { |
| pop(); |
| } |
| if (!(theSchema.getURI().equals(""))) |
| theContentHandler.endPrefixMapping(theSchema.getPrefix()); |
| theContentHandler.endDocument(); |
| } |
| |
| public void etag(char[] buff, int offset, int length) throws SAXException { |
| if (etag_cdata(buff, offset, length)) return; |
| etag_basic(buff, offset, length); |
| } |
| |
| private static char[] etagchars = {'<', '/', '>'}; |
| public boolean etag_cdata(char[] buff, int offset, int length) throws SAXException { |
| String currentName = theStack.name(); |
| // If this is a CDATA element and the tag doesn't match, |
| // or isn't properly formed (junk after the name), |
| // restart CDATA mode and process the tag as characters. |
| if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) { |
| boolean realTag = (length == currentName.length()); |
| if (realTag) { |
| for (int i = 0; i < length; i++) { |
| if (Character.toLowerCase(buff[offset + i]) != Character.toLowerCase(currentName.charAt(i))) { |
| realTag = false; |
| break; |
| } |
| } |
| } |
| if (!realTag) { |
| theContentHandler.characters(etagchars, 0, 2); |
| theContentHandler.characters(buff, offset, length); |
| theContentHandler.characters(etagchars, 2, 1); |
| theScanner.startCDATA(); |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| public void etag_basic(char[] buff, int offset, int length) throws SAXException { |
| theNewElement = null; |
| String name; |
| if (length != 0) { |
| // Canonicalize case of name |
| name = makeName(buff, offset, length); |
| // System.err.println("got etag [" + name + "]"); |
| ElementType type = theSchema.getElementType(name); |
| if (type == null) return; // mysterious end-tag |
| name = type.name(); |
| } |
| else { |
| name = theStack.name(); |
| } |
| // System.err.println("%% Got end of " + name); |
| |
| Element sp; |
| boolean inNoforce = false; |
| for (sp = theStack; sp != null; sp = sp.next()) { |
| if (sp.name().equals(name)) break; |
| if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true; |
| } |
| |
| if (sp == null) return; // Ignore unknown etags |
| if (sp.next() == null || sp.next().next() == null) return; |
| if (inNoforce) { // inside an F_NOFORCE element? |
| sp.preclose(); // preclose the matching element |
| } |
| else { // restartably pop everything above us |
| while (theStack != sp) { |
| restartablyPop(); |
| } |
| pop(); |
| } |
| // pop any preclosed elements now at the top |
| while (theStack.isPreclosed()) { |
| pop(); |
| } |
| restart(null); |
| } |
| |
| // Push restartables on the stack if possible |
| // e is the next element to be started, if we know what it is |
| private void restart(Element e) throws SAXException { |
| while (theSaved != null && theStack.canContain(theSaved) && |
| (e == null || theSaved.canContain(e))) { |
| Element next = theSaved.next(); |
| push(theSaved); |
| theSaved = next; |
| } |
| } |
| |
| // Pop the stack irrevocably |
| private void pop() throws SAXException { |
| if (theStack == null) return; // empty stack |
| String name = theStack.name(); |
| String localName = theStack.localName(); |
| String namespace = theStack.namespace(); |
| String prefix = prefixOf(name); |
| |
| // System.err.println("%% Popping " + name); |
| if (!namespaces) namespace = localName = ""; |
| theContentHandler.endElement(namespace, localName, name); |
| if (foreign(prefix, namespace)) { |
| theContentHandler.endPrefixMapping(prefix); |
| // System.err.println("%% Unmapping [" + prefix + "] for elements to " + namespace); |
| } |
| Attributes atts = theStack.atts(); |
| for (int i = atts.getLength() - 1; i >= 0; i--) { |
| String attNamespace = atts.getURI(i); |
| String attPrefix = prefixOf(atts.getQName(i)); |
| if (foreign(attPrefix, attNamespace)) { |
| theContentHandler.endPrefixMapping(attPrefix); |
| // System.err.println("%% Unmapping [" + attPrefix + "] for attributes to " + attNamespace); |
| } |
| } |
| theStack = theStack.next(); |
| } |
| |
| // Pop the stack restartably |
| private void restartablyPop() throws SAXException { |
| Element popped = theStack; |
| pop(); |
| if (restartElements && (popped.flags() & Schema.F_RESTART) != 0) { |
| popped.anonymize(); |
| popped.setNext(theSaved); |
| theSaved = popped; |
| } |
| } |
| |
| // Push element onto stack |
| private boolean virginStack = true; |
| private void push(Element e) throws SAXException { |
| String name = e.name(); |
| String localName = e.localName(); |
| String namespace = e.namespace(); |
| String prefix = prefixOf(name); |
| |
| // System.err.println("%% Pushing " + name); |
| e.clean(); |
| if (!namespaces) namespace = localName = ""; |
| if (virginStack && localName.equalsIgnoreCase(theDoctypeName)) { |
| try { |
| theEntityResolver.resolveEntity(theDoctypePublicId, theDoctypeSystemId); |
| } catch (IOException ew) { } // Can't be thrown for root I believe. |
| } |
| if (foreign(prefix, namespace)) { |
| theContentHandler.startPrefixMapping(prefix, namespace); |
| // System.err.println("%% Mapping [" + prefix + "] for elements to " + namespace); |
| } |
| Attributes atts = e.atts(); |
| int len = atts.getLength(); |
| for (int i = 0; i < len; i++) { |
| String attNamespace = atts.getURI(i); |
| String attPrefix = prefixOf(atts.getQName(i)); |
| if (foreign(attPrefix, attNamespace)) { |
| theContentHandler.startPrefixMapping(attPrefix, attNamespace); |
| // System.err.println("%% Mapping [" + attPrefix + "] for attributes to " + attNamespace); |
| } |
| } |
| theContentHandler.startElement(namespace, localName, name, e.atts()); |
| e.setNext(theStack); |
| theStack = e; |
| virginStack = false; |
| if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) { |
| theScanner.startCDATA(); |
| } |
| } |
| |
| // Get the prefix from a QName |
| private String prefixOf(String name) { |
| int i = name.indexOf(':'); |
| String prefix = ""; |
| if (i != -1) prefix = name.substring(0, i); |
| // System.err.println("%% " + prefix + " is prefix of " + name); |
| return prefix; |
| } |
| |
| // Return true if we have a foreign name |
| private boolean foreign(String prefix, String namespace) { |
| // System.err.print("%% Testing " + prefix + " and " + namespace + " for foreignness -- "); |
| boolean foreign = !(prefix.equals("") || namespace.equals("") || |
| namespace.equals(theSchema.getURI())); |
| // System.err.println(foreign); |
| return foreign; |
| } |
| |
| /** |
| * Parsing the complete XML Document Type Definition is way too complex, |
| * but for many simple cases we can extract something useful from it. |
| * |
| * doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' |
| * DeclSep ::= PEReference | S |
| * intSubset ::= (markupdecl | DeclSep)* |
| * markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment |
| * ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral |
| */ |
| public void decl(char[] buff, int offset, int length) throws SAXException { |
| String s = new String(buff, offset, length); |
| String name = null; |
| String systemid = null; |
| String publicid = null; |
| String[] v = split(s); |
| if (v.length > 0 && "DOCTYPE".equalsIgnoreCase(v[0])) { |
| if (theDoctypeIsPresent) return; // one doctype only! |
| theDoctypeIsPresent = true; |
| if (v.length > 1) { |
| name = v[1]; |
| if (v.length>3 && "SYSTEM".equals(v[2])) { |
| systemid = v[3]; |
| } |
| else if (v.length > 3 && "PUBLIC".equals(v[2])) { |
| publicid = v[3]; |
| if (v.length > 4) { |
| systemid = v[4]; |
| } |
| else { |
| systemid = ""; |
| } |
| } |
| } |
| } |
| publicid = trimquotes(publicid); |
| systemid = trimquotes(systemid); |
| if (name != null) { |
| publicid = cleanPublicid(publicid); |
| theLexicalHandler.startDTD(name, publicid, systemid); |
| theLexicalHandler.endDTD(); |
| theDoctypeName = name; |
| theDoctypePublicId = publicid; |
| if (theScanner instanceof Locator) { // Must resolve systemid |
| theDoctypeSystemId = ((Locator)theScanner).getSystemId(); |
| try { |
| theDoctypeSystemId = new URL(new URL(theDoctypeSystemId), systemid).toString(); |
| } catch (Exception e) {} |
| } |
| } |
| } |
| |
| // If the String is quoted, trim the quotes. |
| private static String trimquotes(String in) { |
| if (in == null) return in; |
| int length = in.length(); |
| if (length == 0) return in; |
| char s = in.charAt(0); |
| char e = in.charAt(length - 1); |
| if (s == e && (s == '\'' || s == '"')) { |
| in = in.substring(1, in.length() - 1); |
| } |
| return in; |
| } |
| |
| // Split the supplied String into words or phrases seperated by spaces. |
| // Recognises quotes around a phrase and doesn't split it. |
| private static String[] split(String val) throws IllegalArgumentException { |
| val = val.trim(); |
| if (val.length() == 0) { |
| return new String[0]; |
| } |
| else { |
| ArrayList l = new ArrayList(); |
| int s = 0; |
| int e = 0; |
| boolean sq = false; // single quote |
| boolean dq = false; // double quote |
| char lastc = 0; |
| int len = val.length(); |
| for (e=0; e < len; e++) { |
| char c = val.charAt(e); |
| if (!dq && c == '\'' && lastc != '\\') { |
| sq = !sq; |
| if (s < 0) s = e; |
| } |
| else if (!sq && c == '\"' && lastc != '\\') { |
| dq = !dq; |
| if (s < 0) s = e; |
| } |
| else if (!sq && !dq) { |
| if (Character.isWhitespace(c)) { |
| if (s >= 0) l.add(val.substring(s, e)); |
| s = -1; |
| } |
| else if (s < 0 && c != ' ') { |
| s = e; |
| } |
| } |
| lastc = c; |
| } |
| l.add(val.substring(s, e)); |
| return (String[])l.toArray(new String[0]); |
| } |
| } |
| |
| // Replace junk in publicids with spaces |
| private static String legal = |
| "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%"; |
| |
| private String cleanPublicid(String src) { |
| if (src == null) return null; |
| int len = src.length(); |
| StringBuffer dst = new StringBuffer(len); |
| boolean suppressSpace = true; |
| for (int i = 0; i < len; i++) { |
| char ch = src.charAt(i); |
| if (legal.indexOf(ch) != -1) { // legal but not whitespace |
| dst.append(ch); |
| suppressSpace = false; |
| } |
| else if (suppressSpace) { // normalizable whitespace or junk |
| ; |
| } |
| else { |
| dst.append(' '); |
| suppressSpace = true; |
| } |
| } |
| // System.err.println("%% Publicid [" + dst.toString().trim() + "]"); |
| return dst.toString().trim(); // trim any final junk whitespace |
| } |
| |
| |
| public void gi(char[] buff, int offset, int length) throws SAXException { |
| if (theNewElement != null) return; |
| String name = makeName(buff, offset, length); |
| if (name == null) return; |
| ElementType type = theSchema.getElementType(name); |
| if (type == null) { |
| // Suppress unknown elements if ignore-bogons is on |
| if (ignoreBogons) return; |
| int bogonModel = bogonsEmpty ? Schema.M_EMPTY : Schema.M_ANY; |
| int bogonMemberOf = rootBogons ? Schema.M_ANY : (Schema.M_ANY & ~ Schema.M_ROOT); |
| theSchema.elementType(name, bogonModel, bogonMemberOf, 0); |
| if (!rootBogons) theSchema.parent(name, theSchema.rootElementType().name()); |
| type = theSchema.getElementType(name); |
| } |
| |
| theNewElement = new Element(type, defaultAttributes); |
| // System.err.println("%% Got GI " + theNewElement.name()); |
| } |
| |
| public void cdsect(char[] buff, int offset, int length) throws SAXException { |
| theLexicalHandler.startCDATA(); |
| pcdata(buff, offset, length); |
| theLexicalHandler.endCDATA(); |
| } |
| public void pcdata(char[] buff, int offset, int length) throws SAXException { |
| if (length == 0) return; |
| boolean allWhite = true; |
| for (int i = 0; i < length; i++) { |
| if (!Character.isWhitespace(buff[offset+i])) { |
| allWhite = false; |
| } |
| } |
| if (allWhite && !theStack.canContain(thePCDATA)) { |
| if (ignorableWhitespace) { |
| theContentHandler.ignorableWhitespace(buff, offset, length); |
| } |
| } |
| else { |
| rectify(thePCDATA); |
| theContentHandler.characters(buff, offset, length); |
| } |
| } |
| |
| public void pitarget(char[] buff, int offset, int length) throws SAXException { |
| if (theNewElement != null) return; |
| thePITarget = makeName(buff, offset, length).replace(':', '_'); |
| } |
| |
| public void pi(char[] buff, int offset, int length) throws SAXException { |
| if (theNewElement != null || thePITarget == null) return; |
| if ("xml".equalsIgnoreCase(thePITarget)) return; |
| // if (length > 0 && buff[length - 1] == '?') System.err.println("%% Removing ? from PI"); |
| if (length > 0 && buff[length - 1] == '?') length--; // remove trailing ? |
| theContentHandler.processingInstruction(thePITarget, |
| new String(buff, offset, length)); |
| thePITarget = null; |
| } |
| |
| public void stagc(char[] buff, int offset, int length) throws SAXException { |
| // System.err.println("%% Start-tag"); |
| if (theNewElement == null) return; |
| rectify(theNewElement); |
| if (theStack.model() == Schema.M_EMPTY) { |
| // Force an immediate end tag |
| etag_basic(buff, offset, length); |
| } |
| } |
| |
| public void stage(char[] buff, int offset, int length) throws SAXException { |
| // System.err.println("%% Empty-tag"); |
| if (theNewElement == null) return; |
| rectify(theNewElement); |
| // Force an immediate end tag |
| etag_basic(buff, offset, length); |
| } |
| |
| // Comment buffer is twice the size of the output buffer |
| private char[] theCommentBuffer = new char[2000]; |
| public void cmnt(char[] buff, int offset, int length) throws SAXException { |
| theLexicalHandler.comment(buff, offset, length); |
| } |
| |
| // Rectify the stack, pushing and popping as needed |
| // so that the argument can be safely pushed |
| private void rectify(Element e) throws SAXException { |
| Element sp; |
| while (true) { |
| for (sp = theStack; sp != null; sp = sp.next()) { |
| if (sp.canContain(e)) break; |
| } |
| if (sp != null) break; |
| ElementType parentType = e.parent(); |
| if (parentType == null) break; |
| Element parent = new Element(parentType, defaultAttributes); |
| // System.err.println("%% Ascending from " + e.name() + " to " + parent.name()); |
| parent.setNext(e); |
| e = parent; |
| } |
| if (sp == null) return; // don't know what to do |
| while (theStack != sp) { |
| if (theStack == null || theStack.next() == null || |
| theStack.next().next() == null) break; |
| restartablyPop(); |
| } |
| while (e != null) { |
| Element nexte = e.next(); |
| if (!e.name().equals("<pcdata>")) push(e); |
| e = nexte; |
| restart(e); |
| } |
| theNewElement = null; |
| } |
| |
| public int getEntity() { |
| return theEntity; |
| } |
| |
| // Return the argument as a valid XML name |
| // This no longer lowercases the result: we depend on Schema to |
| // canonicalize case. |
| private String makeName(char[] buff, int offset, int length) { |
| StringBuffer dst = new StringBuffer(length + 2); |
| boolean seenColon = false; |
| boolean start = true; |
| // String src = new String(buff, offset, length); // DEBUG |
| for (; length-- > 0; offset++) { |
| char ch = buff[offset]; |
| if (Character.isLetter(ch) || ch == '_') { |
| start = false; |
| dst.append(ch); |
| } |
| else if (Character.isDigit(ch) || ch == '-' || ch == '.') { |
| if (start) dst.append('_'); |
| start = false; |
| dst.append(ch); |
| } |
| else if (ch == ':' && !seenColon) { |
| seenColon = true; |
| if (start) dst.append('_'); |
| start = true; |
| dst.append(translateColons ? '_' : ch); |
| } |
| } |
| int dstLength = dst.length(); |
| if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_'); |
| // System.err.println("Made name \"" + dst + "\" from \"" + src + "\""); |
| return dst.toString().intern(); |
| } |
| |
| // Default LexicalHandler implementation |
| |
| public void comment(char[] ch, int start, int length) throws SAXException { } |
| public void endCDATA() throws SAXException { } |
| public void endDTD() throws SAXException { } |
| public void endEntity(String name) throws SAXException { } |
| public void startCDATA() throws SAXException { } |
| public void startDTD(String name, String publicid, String systemid) throws SAXException { } |
| public void startEntity(String name) throws SAXException { } |
| |
| } |