| #!/usr/bin/env python |
| # |
| # Copyright 2007 The Closure Linter Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS-IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """Utilities for dealing with HTML.""" |
| |
| __author__ = ('robbyw@google.com (Robert Walker)') |
| |
| import cStringIO |
| import formatter |
| import htmllib |
| import HTMLParser |
| import re |
| |
| |
| class ScriptExtractor(htmllib.HTMLParser): |
| """Subclass of HTMLParser that extracts script contents from an HTML file. |
| |
| Also inserts appropriate blank lines so that line numbers in the extracted |
| code match the line numbers in the original HTML. |
| """ |
| |
| def __init__(self): |
| """Initialize a ScriptExtractor.""" |
| htmllib.HTMLParser.__init__(self, formatter.NullFormatter()) |
| self._in_script = False |
| self._text = '' |
| |
| def start_script(self, attrs): |
| """Internal handler for the start of a script tag. |
| |
| Args: |
| attrs: The attributes of the script tag, as a list of tuples. |
| """ |
| for attribute in attrs: |
| if attribute[0].lower() == 'src': |
| # Skip script tags with a src specified. |
| return |
| self._in_script = True |
| |
| def end_script(self): |
| """Internal handler for the end of a script tag.""" |
| self._in_script = False |
| |
| def handle_data(self, data): |
| """Internal handler for character data. |
| |
| Args: |
| data: The character data from the HTML file. |
| """ |
| if self._in_script: |
| # If the last line contains whitespace only, i.e. is just there to |
| # properly align a </script> tag, strip the whitespace. |
| if data.rstrip(' \t') != data.rstrip(' \t\n\r\f'): |
| data = data.rstrip(' \t') |
| self._text += data |
| else: |
| self._AppendNewlines(data) |
| |
| def handle_comment(self, data): |
| """Internal handler for HTML comments. |
| |
| Args: |
| data: The text of the comment. |
| """ |
| self._AppendNewlines(data) |
| |
| def _AppendNewlines(self, data): |
| """Count the number of newlines in the given string and append them. |
| |
| This ensures line numbers are correct for reported errors. |
| |
| Args: |
| data: The data to count newlines in. |
| """ |
| # We append 'x' to both sides of the string to ensure that splitlines |
| # gives us an accurate count. |
| for i in xrange(len(('x' + data + 'x').splitlines()) - 1): |
| self._text += '\n' |
| |
| def GetScriptLines(self): |
| """Return the extracted script lines. |
| |
| Returns: |
| The extracted script lines as a list of strings. |
| """ |
| return self._text.splitlines() |
| |
| |
| def GetScriptLines(f): |
| """Extract script tag contents from the given HTML file. |
| |
| Args: |
| f: The HTML file. |
| |
| Returns: |
| Lines in the HTML file that are from script tags. |
| """ |
| extractor = ScriptExtractor() |
| |
| # The HTML parser chokes on text like Array.<!string>, so we patch |
| # that bug by replacing the < with < - escaping all text inside script |
| # tags would be better but it's a bit of a catch 22. |
| contents = f.read() |
| contents = re.sub(r'<([^\s\w/])', |
| lambda x: '<%s' % x.group(1), |
| contents) |
| |
| extractor.feed(contents) |
| extractor.close() |
| return extractor.GetScriptLines() |
| |
| |
| def StripTags(str): |
| """Returns the string with HTML tags stripped. |
| |
| Args: |
| str: An html string. |
| |
| Returns: |
| The html string with all tags stripped. If there was a parse error, returns |
| the text successfully parsed so far. |
| """ |
| # Brute force approach to stripping as much HTML as possible. If there is a |
| # parsing error, don't strip text before parse error position, and continue |
| # trying from there. |
| final_text = '' |
| finished = False |
| while not finished: |
| try: |
| strip = _HtmlStripper() |
| strip.feed(str) |
| strip.close() |
| str = strip.get_output() |
| final_text += str |
| finished = True |
| except HTMLParser.HTMLParseError, e: |
| final_text += str[:e.offset] |
| str = str[e.offset + 1:] |
| |
| return final_text |
| |
| |
| class _HtmlStripper(HTMLParser.HTMLParser): |
| """Simple class to strip tags from HTML. |
| |
| Does so by doing nothing when encountering tags, and appending character data |
| to a buffer when that is encountered. |
| """ |
| def __init__(self): |
| self.reset() |
| self.__output = cStringIO.StringIO() |
| |
| def handle_data(self, d): |
| self.__output.write(d) |
| |
| def get_output(self): |
| return self.__output.getvalue() |