1 files changed, 170 insertions, 0 deletions
diff --git a/tools/closure_linter-2.3.4/closure_linter/common/htmlutil.py b/tools/closure_linter-2.3.4/closure_linter/common/htmlutil.py
new file mode 100755
index 0000000..26d44c5
--- /dev/null
+++ b/tools/closure_linter-2.3.4/closure_linter/common/htmlutil.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python
+#
+# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for dealing with HTML."""
+
+__author__ = ('robbyw@google.com (Robert Walker)')
+
+import cStringIO
+import formatter
+import htmllib
+import HTMLParser
+import re
+
+
+class ScriptExtractor(htmllib.HTMLParser):
+  """Subclass of HTMLParser that extracts script contents from an HTML file.
+
+  Also inserts appropriate blank lines so that line numbers in the extracted
+  code match the line numbers in the original HTML.
+  """
+
+  def __init__(self):
+    """Initialize a ScriptExtractor."""
+    htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
+    self._in_script = False
+    self._text = ''
+
+  def start_script(self, attrs):
+    """Internal handler for the start of a script tag.
+
+    Args:
+      attrs: The attributes of the script tag, as a list of tuples.
+    """
+    for attribute in attrs:
+      if attribute[0].lower() == 'src':
+        # Skip script tags with a src specified.
+        return
+    self._in_script = True
+
+  def end_script(self):
+    """Internal handler for the end of a script tag."""
+    self._in_script = False
+
+  def handle_data(self, data):
+    """Internal handler for character data.
+
+    Args:
+      data: The character data from the HTML file.
+    """
+    if self._in_script:
+      # If the last line contains whitespace only, i.e. is just there to
+      # properly align a </script> tag, strip the whitespace.
+      if data.rstrip(' \t') != data.rstrip(' \t\n\r\f'):
+        data = data.rstrip(' \t')
+      self._text += data
+    else:
+      self._AppendNewlines(data)
+
+  def handle_comment(self, data):
+    """Internal handler for HTML comments.
+
+    Args:
+      data: The text of the comment.
+    """
+    self._AppendNewlines(data)
+
+  def _AppendNewlines(self, data):
+    """Count the number of newlines in the given string and append them.
+
+    This ensures line numbers are correct for reported errors.
+
+    Args:
+      data: The data to count newlines in.
+    """
+    # We append 'x' to both sides of the string to ensure that splitlines
+    # gives us an accurate count.
+    for i in xrange(len(('x' + data + 'x').splitlines()) - 1):
+      self._text += '\n'
+
+  def GetScriptLines(self):
+    """Return the extracted script lines.
+
+    Returns:
+      The extracted script lines as a list of strings.
+    """
+    return self._text.splitlines()
+
+
+def GetScriptLines(f):
+  """Extract script tag contents from the given HTML file.
+
+  Args:
+    f: The HTML file.
+
+  Returns:
+    Lines in the HTML file that are from script tags.
+  """
+  extractor = ScriptExtractor()
+
+  # The HTML parser chokes on text like Array.<!string>, so we patch
+  # that bug by replacing the < with &lt; - escaping all text inside script
+  # tags would be better but it's a bit of a catch 22.
+  contents = f.read()
+  contents = re.sub(r'<([^\s\w/])',
+         lambda x: '&lt;%s' % x.group(1),
+         contents)
+
+  extractor.feed(contents)
+  extractor.close()
+  return extractor.GetScriptLines()
+
+
+def StripTags(str):
+  """Returns the string with HTML tags stripped.
+
+  Args:
+    str: An html string.
+
+  Returns:
+    The html string with all tags stripped. If there was a parse error, returns
+    the text successfully parsed so far.
+  """
+  # Brute force approach to stripping as much HTML as possible. If there is a
+  # parsing error, don't strip text before parse error position, and continue
+  # trying from there.
+  final_text = ''
+  finished = False
+  while not finished:
+    try:
+      strip = _HtmlStripper()
+      strip.feed(str)
+      strip.close()
+      str = strip.get_output()
+      final_text += str
+      finished = True
+    except HTMLParser.HTMLParseError, e:
+      final_text += str[:e.offset]
+      str = str[e.offset + 1:]
+
+  return final_text
+
+
+class _HtmlStripper(HTMLParser.HTMLParser):
+  """Simple class to strip tags from HTML.
+
+  Does so by doing nothing when encountering tags, and appending character data
+  to a buffer when that is encountered.
+  """
+  def __init__(self):
+    self.reset()
+    self.__output = cStringIO.StringIO()
+
+  def handle_data(self, d):
+    self.__output.write(d)
+
+  def get_output(self):
+    return self.__output.getvalue()