diff options
Diffstat (limited to 'contexts/data/lib/closure-library/third_party/closure/goog/caja/string/html/htmlparser.js')
-rw-r--r-- | contexts/data/lib/closure-library/third_party/closure/goog/caja/string/html/htmlparser.js | 611 |
1 files changed, 0 insertions, 611 deletions
diff --git a/contexts/data/lib/closure-library/third_party/closure/goog/caja/string/html/htmlparser.js b/contexts/data/lib/closure-library/third_party/closure/goog/caja/string/html/htmlparser.js deleted file mode 100644 index d241d4b..0000000 --- a/contexts/data/lib/closure-library/third_party/closure/goog/caja/string/html/htmlparser.js +++ /dev/null @@ -1,611 +0,0 @@ -// Copyright 2006-2008, The Google Caja project. -// Modifications Copyright 2009 The Closure Library Authors. All Rights Reserved. -// All Rights Reserved - -/** - * @license Portions of this code are from the google-caja project, received by - * Google under the Apache license (http://code.google.com/p/google-caja/). - * All other code is Copyright 2009 Google, Inc. All Rights Reserved. - -// Copyright (C) 2006 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - */ - -/** - * @fileoverview A Html SAX parser. - * - * Examples of usage of the {@code goog.string.html.HtmlParser}: - * <pre> - * var handler = new MyCustomHtmlVisitorHandlerThatExtendsHtmlSaxHandler(); - * var parser = new goog.string.html.HtmlParser(); - * parser.parse(handler, '<html><a href="google.com">link found!</a></html>'); - * </pre> - * - * TODO(user, msamuel): validate sanitizer regex against the HTML5 grammar at - * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html - * http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html - * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html - * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html - * - * @supported IE6, IE7, IE8, FF1.5, FF2, FF3, Chrome 3.0, Safari and Opera 10. - */ - -goog.provide('goog.string.html.HtmlParser'); -goog.provide('goog.string.html.HtmlParser.EFlags'); -goog.provide('goog.string.html.HtmlParser.Elements'); -goog.provide('goog.string.html.HtmlParser.Entities'); -goog.provide('goog.string.html.HtmlSaxHandler'); - - -/** - * An Html parser: {@code parse} takes a string and calls methods on - * {@code goog.string.html.HtmlSaxHandler} while it is visiting it. - * - * @constructor - */ -goog.string.html.HtmlParser = function() { -}; - - -/** - * HTML entities that are encoded/decoded. - * TODO(user): use {@code goog.string.htmlEncode} instead. - * @enum {string} - */ -goog.string.html.HtmlParser.Entities = { - lt: '<', - gt: '>', - amp: '&', - nbsp: '\240', - quot: '"', - apos: '\'' -}; - - -/** - * The html eflags, used internally on the parser. - * @enum {number} - */ -goog.string.html.HtmlParser.EFlags = { - OPTIONAL_ENDTAG: 1, - EMPTY: 2, - CDATA: 4, - RCDATA: 8, - UNSAFE: 16, - FOLDABLE: 32 -}; - - -/** - * A map of element to a bitmap of flags it has, used internally on the parser. - * @type {Object} - */ -goog.string.html.HtmlParser.Elements = { - 'a': 0, - 'abbr': 0, - 'acronym': 0, - 'address': 0, - 'applet': goog.string.html.HtmlParser.EFlags.UNSAFE, - 'area': goog.string.html.HtmlParser.EFlags.EMPTY, - 'b': 0, - 'base': goog.string.html.HtmlParser.EFlags.EMPTY | - goog.string.html.HtmlParser.EFlags.UNSAFE, - 'basefont': goog.string.html.HtmlParser.EFlags.EMPTY | - goog.string.html.HtmlParser.EFlags.UNSAFE, - 'bdo': 0, - 'big': 0, - 'blockquote': 0, - 'body': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG | - goog.string.html.HtmlParser.EFlags.UNSAFE | - goog.string.html.HtmlParser.EFlags.FOLDABLE, - 'br': goog.string.html.HtmlParser.EFlags.EMPTY, - 'button': 0, - 'caption': 0, - 'center': 0, - 'cite': 0, - 'code': 0, - 'col': goog.string.html.HtmlParser.EFlags.EMPTY, - 'colgroup': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'dd': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'del': 0, - 'dfn': 0, - 'dir': 0, - 'div': 0, - 'dl': 0, - 'dt': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'em': 0, - 'fieldset': 0, - 'font': 0, - 'form': 0, - 'frame': goog.string.html.HtmlParser.EFlags.EMPTY | - goog.string.html.HtmlParser.EFlags.UNSAFE, - 'frameset': goog.string.html.HtmlParser.EFlags.UNSAFE, - 'h1': 0, - 'h2': 0, - 'h3': 0, - 'h4': 0, - 'h5': 0, - 'h6': 0, - 'head': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG | - goog.string.html.HtmlParser.EFlags.UNSAFE | - goog.string.html.HtmlParser.EFlags.FOLDABLE, - 'hr': goog.string.html.HtmlParser.EFlags.EMPTY, - 'html': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG | - goog.string.html.HtmlParser.EFlags.UNSAFE | - goog.string.html.HtmlParser.EFlags.FOLDABLE, - 'i': 0, - 'iframe': goog.string.html.HtmlParser.EFlags.UNSAFE | - goog.string.html.HtmlParser.EFlags.CDATA, - 'img': goog.string.html.HtmlParser.EFlags.EMPTY, - 'input': goog.string.html.HtmlParser.EFlags.EMPTY, - 'ins': 0, - 'isindex': goog.string.html.HtmlParser.EFlags.EMPTY | - goog.string.html.HtmlParser.EFlags.UNSAFE, - 'kbd': 0, - 'label': 0, - 'legend': 0, - 'li': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'link': goog.string.html.HtmlParser.EFlags.EMPTY | - goog.string.html.HtmlParser.EFlags.UNSAFE, - 'map': 0, - 'menu': 0, - 'meta': goog.string.html.HtmlParser.EFlags.EMPTY | - goog.string.html.HtmlParser.EFlags.UNSAFE, - 'noframes': goog.string.html.HtmlParser.EFlags.UNSAFE | - goog.string.html.HtmlParser.EFlags.CDATA, - 'noscript': goog.string.html.HtmlParser.EFlags.UNSAFE | - goog.string.html.HtmlParser.EFlags.CDATA, - 'object': goog.string.html.HtmlParser.EFlags.UNSAFE, - 'ol': 0, - 'optgroup': 0, - 'option': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'p': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'param': goog.string.html.HtmlParser.EFlags.EMPTY | - goog.string.html.HtmlParser.EFlags.UNSAFE, - 'pre': 0, - 'q': 0, - 's': 0, - 'samp': 0, - 'script': goog.string.html.HtmlParser.EFlags.UNSAFE | - goog.string.html.HtmlParser.EFlags.CDATA, - 'select': 0, - 'small': 0, - 'span': 0, - 'strike': 0, - 'strong': 0, - 'style': goog.string.html.HtmlParser.EFlags.UNSAFE | - goog.string.html.HtmlParser.EFlags.CDATA, - 'sub': 0, - 'sup': 0, - 'table': 0, - 'tbody': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'td': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'textarea': goog.string.html.HtmlParser.EFlags.RCDATA, - 'tfoot': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'th': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'thead': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'title': goog.string.html.HtmlParser.EFlags.RCDATA | - goog.string.html.HtmlParser.EFlags.UNSAFE, - 'tr': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG, - 'tt': 0, - 'u': 0, - 'ul': 0, - 'var': 0 -}; - - -/** - * Regular expression that matches &s. - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.AMP_RE_ = /&/g; - - -/** - * Regular expression that matches loose &s. - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.LOOSE_AMP_RE_ = - /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi; - - -/** - * Regular expression that matches <. - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.LT_RE_ = /</g; - - -/** - * Regular expression that matches >. - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.GT_RE_ = />/g; - - -/** - * Regular expression that matches ". - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.QUOTE_RE_ = /\"/g; - - -/** - * Regular expression that matches =. - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.EQUALS_RE_ = /=/g; - - -/** - * Regular expression that matches null characters. - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.NULL_RE_ = /\0/g; - - -/** - * Regular expression that matches entities. - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.ENTITY_RE_ = /&(#\d+|#x[0-9A-Fa-f]+|\w+);/g; - - -/** - * Regular expression that matches decimal numbers. - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.DECIMAL_ESCAPE_RE_ = /^#(\d+)$/; - - -/** - * Regular expression that matches hexadecimal numbers. - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.HEX_ESCAPE_RE_ = /^#x([0-9A-Fa-f]+)$/; - - -/** - * Regular expression that matches the next token to be processed. - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.INSIDE_TAG_TOKEN_ = new RegExp( - // Don't capture space. - '^\\s*(?:' + - // Capture an attribute name in group 1, and value in group 3. - // We capture the fact that there was an attribute in group 2, since - // interpreters are inconsistent in whether a group that matches nothing - // is null, undefined, or the empty string. - ('(?:' + - '([a-z][a-z-]*)' + // attribute name - ('(' + // optionally followed - '\\s*=\\s*' + - ('(' + - // A double quoted string. - '\"[^\"]*\"' + - // A single quoted string. - '|\'[^\']*\'' + - // The positive lookahead is used to make sure that in - // <foo bar= baz=boo>, the value for bar is blank, not "baz=boo". - '|(?=[a-z][a-z-]*\\s*=)' + - // An unquoted value that is not an attribute name. - // We know it is not an attribute name because the previous - // zero-width match would've eliminated that possibility. - '|[^>\"\'\\s]*' + - ')' - ) + - ')' - ) + '?' + - ')' - ) + - // End of tag captured in group 3. - '|(/?>)' + - // Don't capture cruft - '|[^a-z\\s>]+)', - 'i'); - - -/** - * Regular expression that matches the next token to be processed when we are - * outside a tag. - * @type {RegExp} - * @private - */ -goog.string.html.HtmlParser.OUTSIDE_TAG_TOKEN_ = new RegExp( - '^(?:' + - // Entity captured in group 1. - '&(\\#[0-9]+|\\#[x][0-9a-f]+|\\w+);' + - // Comment, doctypes, and processing instructions not captured. - '|<[!]--[\\s\\S]*?-->|<!\\w[^>]*>|<\\?[^>*]*>' + - // '/' captured in group 2 for close tags, and name captured in group 3. - '|<(/)?([a-z][a-z0-9]*)' + - // Text captured in group 4. - '|([^<&>]+)' + - // Cruft captured in group 5. - '|([<&>]))', - 'i'); - - -/** - * Given a SAX-like {@code goog.string.html.HtmlSaxHandler} parses a - * {@code htmlText} and lets the {@code handler} know the structure while - * visiting the nodes. - * - * @param {goog.string.html.HtmlSaxHandler} handler The HtmlSaxHandler that will - * receive the events. - * @param {string} htmlText The html text. - */ -goog.string.html.HtmlParser.prototype.parse = function(handler, htmlText) { - var htmlLower = null; - var inTag = false; // True iff we're currently processing a tag. - var attribs = []; // Accumulates attribute names and values. - var tagName; // The name of the tag currently being processed. - var eflags; // The element flags for the current tag. - var openTag; // True if the current tag is an open tag. - - // Lets the handler know that we are starting to parse the document. - handler.startDoc(); - - // Consumes tokens from the htmlText and stops once all tokens are processed. - while (htmlText) { - var regex = inTag ? - goog.string.html.HtmlParser.INSIDE_TAG_TOKEN_ : - goog.string.html.HtmlParser.OUTSIDE_TAG_TOKEN_; - // Gets the next token - var m = htmlText.match(regex); - // And removes it from the string - htmlText = htmlText.substring(m[0].length); - - // TODO(goto): cleanup this code breaking it into separate methods. - if (inTag) { - if (m[1]) { // Attribute. - // SetAttribute with uppercase names doesn't work on IE6. - var attribName = goog.string.html.toLowerCase(m[1]); - var decodedValue; - if (m[2]) { - var encodedValue = m[3]; - switch (encodedValue.charCodeAt(0)) { // Strip quotes. - case 34: case 39: - encodedValue = encodedValue.substring( - 1, encodedValue.length - 1); - break; - } - decodedValue = this.unescapeEntities_(this.stripNULs_(encodedValue)); - } else { - // Use name as value for valueless attribs, so - // <input type=checkbox checked> - // gets attributes ['type', 'checkbox', 'checked', 'checked'] - decodedValue = attribName; - } - attribs.push(attribName, decodedValue); - } else if (m[4]) { - if (eflags !== void 0) { // False if not in whitelist. - if (openTag) { - if (handler.startTag) { - handler.startTag(/** @type {string} */ (tagName), attribs); - } - } else { - if (handler.endTag) { - handler.endTag(/** @type {string} */ (tagName)); - } - } - } - - if (openTag && (eflags & - (goog.string.html.HtmlParser.EFlags.CDATA | - goog.string.html.HtmlParser.EFlags.RCDATA))) { - if (htmlLower === null) { - htmlLower = goog.string.html.toLowerCase (htmlText); - } else { - htmlLower = htmlLower.substring( - htmlLower.length - htmlText.length); - } - var dataEnd = htmlLower.indexOf('</' + tagName); - if (dataEnd < 0) { - dataEnd = htmlText.length; - } - if (eflags & goog.string.html.HtmlParser.EFlags.CDATA) { - if (handler.cdata) { - handler.cdata(htmlText.substring(0, dataEnd)); - } - } else if (handler.rcdata) { - handler.rcdata( - this.normalizeRCData_(htmlText.substring(0, dataEnd))); - } - htmlText = htmlText.substring(dataEnd); - } - - tagName = eflags = openTag = void 0; - attribs.length = 0; - inTag = false; - } - } else { - if (m[1]) { // Entity. - handler.pcdata(m[0]); - } else if (m[3]) { // Tag. - openTag = !m[2]; - inTag = true; - tagName = goog.string.html.toLowerCase (m[3]); - eflags = goog.string.html.HtmlParser.Elements.hasOwnProperty(tagName) ? - goog.string.html.HtmlParser.Elements[tagName] : void 0; - } else if (m[4]) { // Text. - handler.pcdata(m[4]); - } else if (m[5]) { // Cruft. - switch (m[5]) { - case '<': handler.pcdata('<'); break; - case '>': handler.pcdata('>'); break; - default: handler.pcdata('&'); break; - } - } - } - } - - // Lets the handler know that we are done parsing the document. - handler.endDoc(); -}; - - -/** - * Decodes an HTML entity. - * - * @param {string} name The content between the '&' and the ';'. - * @return {string} A single unicode code-point as a string. - * @private - */ -goog.string.html.HtmlParser.prototype.lookupEntity_ = function(name) { - // TODO(goto): use {goog.string.htmlDecode} instead ? - // TODO(goto): π is different from Π - name = goog.string.html.toLowerCase(name); - if (goog.string.html.HtmlParser.Entities.hasOwnProperty(name)) { - return goog.string.html.HtmlParser.Entities[name]; - } - var m = name.match(goog.string.html.HtmlParser.DECIMAL_ESCAPE_RE_); - if (m) { - return String.fromCharCode(parseInt(m[1], 10)); - } else if ( - !!(m = name.match(goog.string.html.HtmlParser.HEX_ESCAPE_RE_))) { - return String.fromCharCode(parseInt(m[1], 16)); - } - return ''; -}; - - -/** - * Removes null characters on the string. - * @param {string} s The string to have the null characters removed. - * @return {string} A string without null characters. - * @private - */ -goog.string.html.HtmlParser.prototype.stripNULs_ = function(s) { - return s.replace(goog.string.html.HtmlParser.NULL_RE_, ''); -}; - - -/** - * The plain text of a chunk of HTML CDATA which possibly containing. - * - * TODO(goto): use {@code goog.string.unescapeEntities} instead ? - * @param {string} s A chunk of HTML CDATA. It must not start or end inside - * an HTML entity. - * @return {string} The unescaped entities. - * @private - */ -goog.string.html.HtmlParser.prototype.unescapeEntities_ = function(s) { - return s.replace( - goog.string.html.HtmlParser.ENTITY_RE_, - goog.bind(this.lookupEntity_, this)); -}; - - -/** - * Escape entities in RCDATA that can be escaped without changing the meaning. - * @param {string} rcdata The RCDATA string we want to normalize. - * @return {string} A normalized version of RCDATA. - * @private - */ -goog.string.html.HtmlParser.prototype.normalizeRCData_ = function(rcdata) { - return rcdata. - replace(goog.string.html.HtmlParser.LOOSE_AMP_RE_, '&$1'). - replace(goog.string.html.HtmlParser.LT_RE_, '<'). - replace(goog.string.html.HtmlParser.GT_RE_, '>'); -}; - - -/** - * TODO(goto): why isn't this in the string package ? does this solves any - * real problem ? move it to the goog.string package if it does. - * - * @param {string} str The string to lower case. - * @return {string} The str in lower case format. - */ -goog.string.html.toLowerCase = function(str) { - // The below may not be true on browsers in the Turkish locale. - if ('script' === 'SCRIPT'.toLowerCase()) { - return str.toLowerCase(); - } else { - return str.replace(/[A-Z]/g, function(ch) { - return String.fromCharCode(ch.charCodeAt(0) | 32); - }); - } -}; - - -/** - * An interface to the {@code goog.string.html.HtmlParser} visitor, that gets - * called while the HTML is being parsed. - * - * @constructor - */ -goog.string.html.HtmlSaxHandler = function() { -}; - - -/** - * Handler called when the parser found a new tag. - * @param {string} name The name of the tag that is starting. - * @param {Array.<string>} attributes The attributes of the tag. - */ -goog.string.html.HtmlSaxHandler.prototype.startTag = goog.abstractMethod; - - -/** - * Handler called when the parser found a closing tag. - * @param {string} name The name of the tag that is ending. - */ -goog.string.html.HtmlSaxHandler.prototype.endTag = goog.abstractMethod; - - -/** - * Handler called when PCDATA is found. - * @param {string} text The PCDATA text found. - */ -goog.string.html.HtmlSaxHandler.prototype.pcdata = goog.abstractMethod; - - -/** - * Handler called when RCDATA is found. - * @param {string} text The RCDATA text found. - */ -goog.string.html.HtmlSaxHandler.prototype.rcdata = goog.abstractMethod; - - -/** - * Handler called when CDATA is found. - * @param {string} text The CDATA text found. - */ -goog.string.html.HtmlSaxHandler.prototype.cdata = goog.abstractMethod; - - -/** - * Handler called when the parser is starting to parse the document. - */ -goog.string.html.HtmlSaxHandler.prototype.startDoc = goog.abstractMethod; - - -/** - * Handler called when the parsing is done. - */ -goog.string.html.HtmlSaxHandler.prototype.endDoc = goog.abstractMethod; |