diff options
Diffstat (limited to 'tools/closure_linter-2.3.4/closure_linter/common/htmlutil.py')
-rwxr-xr-x | tools/closure_linter-2.3.4/closure_linter/common/htmlutil.py | 170 |
1 files changed, 170 insertions, 0 deletions
diff --git a/tools/closure_linter-2.3.4/closure_linter/common/htmlutil.py b/tools/closure_linter-2.3.4/closure_linter/common/htmlutil.py new file mode 100755 index 0000000..26d44c5 --- /dev/null +++ b/tools/closure_linter-2.3.4/closure_linter/common/htmlutil.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python +# +# Copyright 2007 The Closure Linter Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for dealing with HTML.""" + +__author__ = ('robbyw@google.com (Robert Walker)') + +import cStringIO +import formatter +import htmllib +import HTMLParser +import re + + +class ScriptExtractor(htmllib.HTMLParser): + """Subclass of HTMLParser that extracts script contents from an HTML file. + + Also inserts appropriate blank lines so that line numbers in the extracted + code match the line numbers in the original HTML. + """ + + def __init__(self): + """Initialize a ScriptExtractor.""" + htmllib.HTMLParser.__init__(self, formatter.NullFormatter()) + self._in_script = False + self._text = '' + + def start_script(self, attrs): + """Internal handler for the start of a script tag. + + Args: + attrs: The attributes of the script tag, as a list of tuples. + """ + for attribute in attrs: + if attribute[0].lower() == 'src': + # Skip script tags with a src specified. + return + self._in_script = True + + def end_script(self): + """Internal handler for the end of a script tag.""" + self._in_script = False + + def handle_data(self, data): + """Internal handler for character data. + + Args: + data: The character data from the HTML file. + """ + if self._in_script: + # If the last line contains whitespace only, i.e. is just there to + # properly align a </script> tag, strip the whitespace. + if data.rstrip(' \t') != data.rstrip(' \t\n\r\f'): + data = data.rstrip(' \t') + self._text += data + else: + self._AppendNewlines(data) + + def handle_comment(self, data): + """Internal handler for HTML comments. + + Args: + data: The text of the comment. + """ + self._AppendNewlines(data) + + def _AppendNewlines(self, data): + """Count the number of newlines in the given string and append them. + + This ensures line numbers are correct for reported errors. + + Args: + data: The data to count newlines in. + """ + # We append 'x' to both sides of the string to ensure that splitlines + # gives us an accurate count. + for i in xrange(len(('x' + data + 'x').splitlines()) - 1): + self._text += '\n' + + def GetScriptLines(self): + """Return the extracted script lines. + + Returns: + The extracted script lines as a list of strings. + """ + return self._text.splitlines() + + +def GetScriptLines(f): + """Extract script tag contents from the given HTML file. + + Args: + f: The HTML file. + + Returns: + Lines in the HTML file that are from script tags. + """ + extractor = ScriptExtractor() + + # The HTML parser chokes on text like Array.<!string>, so we patch + # that bug by replacing the < with < - escaping all text inside script + # tags would be better but it's a bit of a catch 22. + contents = f.read() + contents = re.sub(r'<([^\s\w/])', + lambda x: '<%s' % x.group(1), + contents) + + extractor.feed(contents) + extractor.close() + return extractor.GetScriptLines() + + +def StripTags(str): + """Returns the string with HTML tags stripped. + + Args: + str: An html string. + + Returns: + The html string with all tags stripped. If there was a parse error, returns + the text successfully parsed so far. + """ + # Brute force approach to stripping as much HTML as possible. If there is a + # parsing error, don't strip text before parse error position, and continue + # trying from there. + final_text = '' + finished = False + while not finished: + try: + strip = _HtmlStripper() + strip.feed(str) + strip.close() + str = strip.get_output() + final_text += str + finished = True + except HTMLParser.HTMLParseError, e: + final_text += str[:e.offset] + str = str[e.offset + 1:] + + return final_text + + +class _HtmlStripper(HTMLParser.HTMLParser): + """Simple class to strip tags from HTML. + + Does so by doing nothing when encountering tags, and appending character data + to a buffer when that is encountered. + """ + def __init__(self): + self.reset() + self.__output = cStringIO.StringIO() + + def handle_data(self, d): + self.__output.write(d) + + def get_output(self): + return self.__output.getvalue() |