src/share/classes/javax/swing/text/html/parser/DocumentParser.java - toolchain/jdk/jdk9_jdk - Git at Google

 /*
  * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.  Oracle designates this
  * particular file as subject to the "Classpath" exception as provided
  * by Oracle in the LICENSE file that accompanied this code.
  *
  * This code is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  * version 2 for more details (a copy is included in the LICENSE file that
  * accompanied this code).
  *
  * You should have received a copy of the GNU General Public License version
  * 2 along with this work; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  *
  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  * or visit www.oracle.com if you need additional information or have any
  * questions.
  */

 package javax.swing.text.html.parser;

 import javax.swing.text.SimpleAttributeSet;
 import javax.swing.text.html.HTMLEditorKit;
 import javax.swing.text.html.HTML;
 import javax.swing.text.ChangedCharSetException;

 import java.util.*;
 import java.io.*;
 import java.net.*;

 /**
  * A Parser for HTML Documents (actually, you can specify a DTD, but
  * you should really only use this class with the html dtd in swing).
  * Reads an InputStream of HTML and
  * invokes the appropriate methods in the ParserCallback class. This
  * is the default parser used by HTMLEditorKit to parse HTML url's.
  * <p>This will message the callback for all valid tags, as well as
  * tags that are implied but not explicitly specified. For example, the
  * html string (&lt;p&gt;blah) only has a p tag defined. The callback
  * will see the following methods:
  * <ol><li><i>handleStartTag(html, ...)</i></li>
  *     <li><i>handleStartTag(head, ...)</i></li>
  *     <li><i>handleEndTag(head)</i></li>
  *     <li><i>handleStartTag(body, ...)</i></li>
  *     <li>handleStartTag(p, ...)</li>
  *     <li>handleText(...)</li>
  *     <li><i>handleEndTag(p)</i></li>
  *     <li><i>handleEndTag(body)</i></li>
  *     <li><i>handleEndTag(html)</i></li>
  * </ol>
  * The items in <i>italic</i> are implied, that is, although they were not
  * explicitly specified, to be correct html they should have been present
  * (head isn't necessary, but it is still generated). For tags that
  * are implied, the AttributeSet argument will have a value of
  * <code>Boolean.TRUE</code> for the key
  * <code>HTMLEditorKit.ParserCallback.IMPLIED</code>.
  * <p>HTML.Attribute defines a type safe enumeration of html attributes.
  * If an attribute key of a tag is defined in HTML.Attribute, the
  * HTML.Attribute will be used as the key, otherwise a String will be used.
  * For example &lt;p foo=bar class=neat&gt; has two attributes. foo is
  * not defined in HTML.Attribute, whereas class is, therefore the
  * AttributeSet will have two values in it, HTML.Attribute.CLASS with
  * a String value of 'neat' and the String key 'foo' with a String value of
  * 'bar'.
  * <p>The position argument will indicate the start of the tag, comment
  * or text. Similar to arrays, the first character in the stream has a
  * position of 0. For tags that are
  * implied the position will indicate
  * the location of the next encountered tag. In the first example,
  * the implied start body and html tags will have the same position as the
  * p tag, and the implied end p, html and body tags will all have the same
  * position.
  * <p>As html skips whitespace the position for text will be the position
  * of the first valid character, eg in the string '\n\n\nblah'
  * the text 'blah' will have a position of 3, the newlines are skipped.
  * <p>
  * For attributes that do not have a value, eg in the html
  * string <code>&lt;foo blah&gt;</code> the attribute <code>blah</code>
  * does not have a value, there are two possible values that will be
  * placed in the AttributeSet's value:
  * <ul>
  * <li>If the DTD does not contain a definition for the element, or the
  *     definition does not have an explicit value then the value in the
  *     AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>.
  * <li>If the DTD contains an explicit value, as in:
  *     <code>&lt;!ATTLIST OPTION selected (selected) #IMPLIED&gt;</code>
  *     this value from the dtd (in this case selected) will be used.
  * </ul>
  * <p>
  * Once the stream has been parsed, the callback is notified of the most
  * likely end of line string. The end of line string will be one of
  * \n, \r or \r\n, whichever is encountered the most in parsing the
  * stream.
  *
  * @author      Sunita Mani
  */
 public class DocumentParser extends javax.swing.text.html.parser.Parser {

     /** BODY start tags handled minus BODY end tags handled in the current parse. */
     private int inbody;
     /** TITLE start tags handled minus TITLE end tags handled in the current parse. */
     private int intitle;
     /** HEAD start tags handled minus HEAD end tags handled; maintained but never read here. */
     private int inhead;
     /** STYLE start tags handled minus STYLE end tags handled in the current parse. */
     private int instyle;
     /** SCRIPT start tags handled minus SCRIPT end tags handled in the current parse. */
     private int inscript;
     /** Becomes true once a TITLE end tag has been handled; title text after that is dropped. */
     private boolean seentitle;
     /** Receiver of all parse events; assigned by the three-argument parse method. */
     private HTMLEditorKit.ParserCallback callback = null;
     /** When true, META elements are not inspected for a character set change. */
     private boolean ignoreCharSet = false;
     /** Compile-time switch for the trace output written by {@link #debug(String)}. */
     private static final boolean debugFlag = false;

     /**
      * Creates a parser that works against the given DTD.
      *
      * @param dtd the DTD handed to the superclass constructor
      */
     public DocumentParser(DTD dtd) {
         super(dtd);
     }

     /**
      * Parses the given stream, reporting everything to <code>callback</code>,
      * and finally reports the end of line string to the callback.
      * <p>
      * The per-document bookkeeping of this class is cleared first, so an
      * instance that has already been used (or whose previous run was
      * aborted by an exception) starts from a clean state.
      *
      * @param in            the stream to parse
      * @param callback      the receiver of the parse events
      * @param ignoreCharSet if true, META elements are not checked for a
      *                      character set change
      * @throws IOException  if thrown while parsing <code>in</code>; this
      *                      includes ChangedCharSetException raised by
      *                      {@link #handleEmptyTag(TagElement)}
      */
     public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException {
         this.ignoreCharSet = ignoreCharSet;
         this.callback = callback;
         resetDocumentState();
         parse(in);
         // end of line
         callback.handleEndOfLineString(getEndOfLineString());
     }

     /**
      * Clears the counters and flags that describe the document being parsed.
      */
     private void resetDocumentState() {
         inbody = 0;
         intitle = 0;
         inhead = 0;
         instyle = 0;
         inscript = 0;
         seentitle = false;
     }

     /**
      * Builds the attribute set passed to the callback for implied tags:
      * a single entry mapping IMPLIED to <code>Boolean.TRUE</code>.
      */
     private static SimpleAttributeSet impliedAttributes() {
         SimpleAttributeSet attrs = new SimpleAttributeSet();
         attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
                            Boolean.TRUE);
         return attrs;
     }

     /**
      * Handle Start Tag. Updates the element counters and forwards the tag
      * to the callback; real (non-fictional) tags carry the parsed
      * attributes, which are flushed afterwards.
      */
     @Override
     protected void handleStartTag(TagElement tag) {

         Element elem = tag.getElement();
         if (elem == dtd.body) {
             inbody++;
         } else if (elem == dtd.html) {
             // html needs no bookkeeping
         } else if (elem == dtd.head) {
             inhead++;
         } else if (elem == dtd.title) {
             intitle++;
         } else if (elem == dtd.style) {
             instyle++;
         } else if (elem == dtd.script) {
             inscript++;
         }

         boolean implied = tag.fictional();
         if (debugFlag) {
             if (implied) {
                 debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
             } else {
                 debug("Start Tag: " + tag.getHTMLTag() + " attributes: " +
                       getAttributes() + " pos: " + getCurrentPos());
             }
         }
         if (implied) {
             callback.handleStartTag(tag.getHTMLTag(), impliedAttributes(),
                                     getBlockStartPosition());
         } else {
             callback.handleStartTag(tag.getHTMLTag(), getAttributes(),
                                     getBlockStartPosition());
             flushAttributes();
         }
     }


     /**
      * Handle Comment. Forwards the comment text to the callback.
      */
     @Override
     protected void handleComment(char[] text) {
         if (debugFlag) {
             debug("comment: ->" + new String(text) + "<-"
                   + " pos: " + getCurrentPos());
         }
         callback.handleComment(text, getBlockStartPosition());
     }

     /**
      * Handle Empty Tag. A META element is first checked for a character
      * set change (unless that check is disabled). The tag is forwarded to
      * the callback only inside the body, or when it is one of meta, base,
      * isindex, style or link.
      *
      * @throws ChangedCharSetException if a META element announces a
      *         content type or character set, see
      *         {@link #checkCharSetChange()}
      */
     @Override
     protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {

         Element elem = tag.getElement();
         if (elem == dtd.meta && !ignoreCharSet) {
             checkCharSetChange();
         }

         boolean forward = inbody != 0
                 || elem == dtd.meta
                 || elem == dtd.base
                 || elem == dtd.isindex
                 || elem == dtd.style
                 || elem == dtd.link;
         if (!forward) {
             return;
         }

         boolean implied = tag.fictional();
         if (debugFlag) {
             if (implied) {
                 debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
             } else {
                 debug("Empty Tag: " + tag.getHTMLTag() + " attributes: "
                       + getAttributes() + " pos: " + getCurrentPos());
             }
         }
         if (implied) {
             callback.handleSimpleTag(tag.getHTMLTag(), impliedAttributes(),
                                      getBlockStartPosition());
         } else {
             callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(),
                                      getBlockStartPosition());
             flushAttributes();
         }
     }

     /**
      * Inspects the attributes of the current META element. Nothing happens
      * when there are no attributes or no <code>content</code> attribute.
      *
      * @throws ChangedCharSetException with the content value and
      *         <code>false</code> when http-equiv is "content-type" and the
      *         content is neither "text/html" nor "text/plain"; with the
      *         content value and <code>true</code> when http-equiv is
      *         "charset" (all comparisons ignore case)
      */
     private void checkCharSetChange() throws ChangedCharSetException {
         SimpleAttributeSet atts = getAttributes();
         if (atts == null) {
             return;
         }
         String content = (String)atts.getAttribute(HTML.Attribute.CONTENT);
         if (content == null) {
             return;
         }
         String httpEquiv = (String)atts.getAttribute(HTML.Attribute.HTTPEQUIV);
         if ("content-type".equalsIgnoreCase(httpEquiv)) {
             if (!content.equalsIgnoreCase("text/html") &&
                     !content.equalsIgnoreCase("text/plain")) {
                 throw new ChangedCharSetException(content, false);
             }
         } else if ("charset".equalsIgnoreCase(httpEquiv)) {
             throw new ChangedCharSetException(content, true);
         }
     }

     /**
      * Handle End Tag. Updates the element counters (a TITLE end tag also
      * sets the seen-title flag) and forwards the tag to the callback.
      */
     @Override
     protected void handleEndTag(TagElement tag) {
         Element elem = tag.getElement();
         if (elem == dtd.body) {
             inbody--;
         } else if (elem == dtd.title) {
             intitle--;
             seentitle = true;
         } else if (elem == dtd.head) {
             inhead--;
         } else if (elem == dtd.style) {
             instyle--;
         } else if (elem == dtd.script) {
             inscript--;
         }
         if (debugFlag) {
             debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
         }
         callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition());

     }

     /**
      * Handle Text. Null data is ignored. Text inside a SCRIPT element is
      * reported to the callback as a comment. Other text is reported only
      * inside the body, inside a STYLE element, or inside a TITLE element
      * as long as no TITLE end tag has been handled yet.
      */
     @Override
     protected void handleText(char[] data) {
         if (data == null) {
             return;
         }
         if (inscript != 0) {
             callback.handleComment(data, getBlockStartPosition());
             return;
         }
         boolean inOpenTitle = (intitle != 0) && !seentitle;
         if (inbody != 0 || instyle != 0 || inOpenTitle) {
             if (debugFlag) {
                 debug("text:  ->" + new String(data) + "<-" + " pos: " + getCurrentPos());
             }
             callback.handleText(data, getBlockStartPosition());
         }
     }

     /**
      * Error handling. Forwards the message and the current position to the
      * callback; the line number argument is not used.
      */
     @Override
     protected void handleError(int ln, String errorMsg) {
         if (debugFlag) {
             debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos());
         }
         /* PENDING: need to improve the error string. */
         callback.handleError(errorMsg, getCurrentPos());
     }


     /*
      * debug messages
      */
     private void debug(String msg) {
         System.out.println(msg);
     }
 }
	/*
	* Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
	*
	* This code is free software; you can redistribute it and/or modify it
	* under the terms of the GNU General Public License version 2 only, as
	* published by the Free Software Foundation. Oracle designates this
	* particular file as subject to the "Classpath" exception as provided
	* by Oracle in the LICENSE file that accompanied this code.
	*
	* This code is distributed in the hope that it will be useful, but WITHOUT
	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	* version 2 for more details (a copy is included in the LICENSE file that
	* accompanied this code).
	*
	* You should have received a copy of the GNU General Public License version
	* 2 along with this work; if not, write to the Free Software Foundation,
	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
	*
	* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
	* or visit www.oracle.com if you need additional information or have any
	* questions.
	*/

	package javax.swing.text.html.parser;

	import javax.swing.text.SimpleAttributeSet;
	import javax.swing.text.html.HTMLEditorKit;
	import javax.swing.text.html.HTML;
	import javax.swing.text.ChangedCharSetException;

	import java.util.*;
	import java.io.*;
	import java.net.*;

	/**
	* A Parser for HTML Documents (actually, you can specify a DTD, but
	* you should really only use this class with the html dtd in swing).
	* Reads an InputStream of HTML and
	* invokes the appropriate methods in the ParserCallback class. This
	* is the default parser used by HTMLEditorKit to parse HTML url's.
	* <p>This will message the callback for all valid tags, as well as
	* tags that are implied but not explicitly specified. For example, the
	* html string (&lt;p&gt;blah) only has a p tag defined. The callback
	* will see the following methods:
	* <ol><li><i>handleStartTag(html, ...)</i></li>
	* <li><i>handleStartTag(head, ...)</i></li>
	* <li><i>handleEndTag(head)</i></li>
	* <li><i>handleStartTag(body, ...)</i></li>
	* <li>handleStartTag(p, ...)</li>
	* <li>handleText(...)</li>
	* <li><i>handleEndTag(p)</i></li>
	* <li><i>handleEndTag(body)</i></li>
	* <li><i>handleEndTag(html)</i></li>
	* </ol>
	* The items in <i>italic</i> are implied, that is, although they were not
	* explicitly specified, to be correct html they should have been present
	* (head isn't necessary, but it is still generated). For tags that
	* are implied, the AttributeSet argument will have a value of
	* <code>Boolean.TRUE</code> for the key
	* <code>HTMLEditorKit.ParserCallback.IMPLIED</code>.
	* <p>HTML.Attribute defines a type safe enumeration of html attributes.
	* If an attribute key of a tag is defined in HTML.Attribute, the
	* HTML.Attribute will be used as the key, otherwise a String will be used.
	* For example &lt;p foo=bar class=neat&gt; has two attributes. foo is
	* not defined in HTML.Attribute, whereas class is, therefore the
	* AttributeSet will have two values in it, HTML.Attribute.CLASS with
	* a String value of 'neat' and the String key 'foo' with a String value of
	* 'bar'.
	* <p>The position argument will indicate the start of the tag, comment
	* or text. Similar to arrays, the first character in the stream has a
	* position of 0. For tags that are
	* implied the position will indicate
	* the location of the next encountered tag. In the first example,
	* the implied start body and html tags will have the same position as the
	* p tag, and the implied end p, html and body tags will all have the same
	* position.
	* <p>As html skips whitespace the position for text will be the position
	* of the first valid character, eg in the string '\n\n\nblah'
	* the text 'blah' will have a position of 3, the newlines are skipped.
	* <p>
	* For attributes that do not have a value, eg in the html
	* string <code>&lt;foo blah&gt;</code> the attribute <code>blah</code>
	* does not have a value, there are two possible values that will be
	* placed in the AttributeSet's value:
	* <ul>
	* <li>If the DTD does not contain a definition for the element, or the
	* definition does not have an explicit value then the value in the
	* AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>.
	* <li>If the DTD contains an explicit value, as in:
	* <code>&lt;!ATTLIST OPTION selected (selected) #IMPLIED&gt;</code>
	* this value from the dtd (in this case selected) will be used.
	* </ul>
	* <p>
	* Once the stream has been parsed, the callback is notified of the most
	* likely end of line string. The end of line string will be one of
	* \n, \r or \r\n, whichever is encountered the most in parsing the
	* stream.
	*
	* @author Sunita Mani
	*/
	public class DocumentParser extends javax.swing.text.html.parser.Parser {

	private int inbody;
	private int intitle;
	private int inhead;
	private int instyle;
	private int inscript;
	private boolean seentitle;
	private HTMLEditorKit.ParserCallback callback = null;
	private boolean ignoreCharSet = false;
	private static final boolean debugFlag = false;

	public DocumentParser(DTD dtd) {
	super(dtd);
	}

	public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException {
	this.ignoreCharSet = ignoreCharSet;
	this.callback = callback;
	parse(in);
	// end of line
	callback.handleEndOfLineString(getEndOfLineString());
	}

	/**
	* Handle Start Tag.
	*/
	protected void handleStartTag(TagElement tag) {

	Element elem = tag.getElement();
	if (elem == dtd.body) {
	inbody++;
	} else if (elem == dtd.html) {
	} else if (elem == dtd.head) {
	inhead++;
	} else if (elem == dtd.title) {
	intitle++;
	} else if (elem == dtd.style) {
	instyle++;
	} else if (elem == dtd.script) {
	inscript++;
	}
	if (debugFlag) {
	if (tag.fictional()) {
	debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
	} else {
	debug("Start Tag: " + tag.getHTMLTag() + " attributes: " +
	getAttributes() + " pos: " + getCurrentPos());
	}
	}
	if (tag.fictional()) {
	SimpleAttributeSet attrs = new SimpleAttributeSet();
	attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
	Boolean.TRUE);
	callback.handleStartTag(tag.getHTMLTag(), attrs,
	getBlockStartPosition());
	} else {
	callback.handleStartTag(tag.getHTMLTag(), getAttributes(),
	getBlockStartPosition());
	flushAttributes();
	}
	}


	protected void handleComment(char text[]) {
	if (debugFlag) {
	debug("comment: ->" + new String(text) + "<-"
	+ " pos: " + getCurrentPos());
	}
	callback.handleComment(text, getBlockStartPosition());
	}

	/**
	* Handle Empty Tag.
	*/
	protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {

	Element elem = tag.getElement();
	if (elem == dtd.meta && !ignoreCharSet) {
	SimpleAttributeSet atts = getAttributes();
	if (atts != null) {
	String content = (String)atts.getAttribute(HTML.Attribute.CONTENT);
	if (content != null) {
	if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
	if (!content.equalsIgnoreCase("text/html") &&
	!content.equalsIgnoreCase("text/plain")) {
	throw new ChangedCharSetException(content, false);
	}
	} else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
	throw new ChangedCharSetException(content, true);
	}
	}
	}
	}
	if (inbody != 0 \|\| elem == dtd.meta \|\| elem == dtd.base \|\| elem == dtd.isindex \|\| elem == dtd.style \|\| elem == dtd.link) {
	if (debugFlag) {
	if (tag.fictional()) {
	debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
	} else {
	debug("Empty Tag: " + tag.getHTMLTag() + " attributes: "
	+ getAttributes() + " pos: " + getCurrentPos());
	}
	}
	if (tag.fictional()) {
	SimpleAttributeSet attrs = new SimpleAttributeSet();
	attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
	Boolean.TRUE);
	callback.handleSimpleTag(tag.getHTMLTag(), attrs,
	getBlockStartPosition());
	} else {
	callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(),
	getBlockStartPosition());
	flushAttributes();
	}
	}
	}

	/**
	* Handle End Tag.
	*/
	protected void handleEndTag(TagElement tag) {
	Element elem = tag.getElement();
	if (elem == dtd.body) {
	inbody--;
	} else if (elem == dtd.title) {
	intitle--;
	seentitle = true;
	} else if (elem == dtd.head) {
	inhead--;
	} else if (elem == dtd.style) {
	instyle--;
	} else if (elem == dtd.script) {
	inscript--;
	}
	if (debugFlag) {
	debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
	}
	callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition());

	}

	/**
	* Handle Text.
	*/
	protected void handleText(char data[]) {
	if (data != null) {
	if (inscript != 0) {
	callback.handleComment(data, getBlockStartPosition());
	return;
	}
	if (inbody != 0 \|\| ((instyle != 0) \|\|
	((intitle != 0) && !seentitle))) {
	if (debugFlag) {
	debug("text: ->" + new String(data) + "<-" + " pos: " + getCurrentPos());
	}
	callback.handleText(data, getBlockStartPosition());
	}
	}
	}

	/*
	* Error handling.
	*/
	protected void handleError(int ln, String errorMsg) {
	if (debugFlag) {
	debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos());
	}
	/* PENDING: need to improve the error string. */
	callback.handleError(errorMsg, getCurrentPos());
	}


	/*
	* debug messages
	*/
	private void debug(String msg) {
	System.out.println(msg);
	}
	}