diff options
Diffstat (limited to 'tools/closure_linter-2.3.4/closure_linter/javascripttokenizer.py')
-rwxr-xr-x | tools/closure_linter-2.3.4/closure_linter/javascripttokenizer.py | 367 |
1 files changed, 0 insertions, 367 deletions
# diff --git a/tools/closure_linter-2.3.4/closure_linter/javascripttokenizer.py b/tools/closure_linter-2.3.4/closure_linter/javascripttokenizer.py
# deleted file mode 100755
# index 991ff80..0000000
# --- a/tools/closure_linter-2.3.4/closure_linter/javascripttokenizer.py
# +++ /dev/null
# @@ -1,367 +0,0 @@
#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regular expression based JavaScript parsing classes."""

__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')

import copy
import re

from closure_linter import javascripttokens
from closure_linter.common import matcher
from closure_linter.common import tokenizer

# Shorthand
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher


class JavaScriptModes(object):
  """Enumeration of the different matcher modes used for JavaScript.

  Each value is a key of JavaScriptTokenizer.JAVASCRIPT_MATCHERS and selects
  the list of matchers that is active while the tokenizer is in that mode.
  """
  TEXT_MODE = 'text'
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
  LINE_COMMENT_MODE = 'line_comment'
  PARAMETER_MODE = 'parameter'
  FUNCTION_MODE = 'function'


class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Convert JavaScript code into an array of tokens.

  The class body builds the regular expressions and the per-mode matcher
  table (JAVASCRIPT_MATCHERS) that are handed to the base tokenizer in
  __init__.
  """

  # Useful patterns for JavaScript parsing.
  IDENTIFIER_CHAR = r'A-Za-z0-9_$.'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  MANTISSA = r"""
             (\d+(?!\.)) |                # Matches '10'
             (\d+\.(?!\d)) |              # Matches '10.'
             (\d*\.\d+)                   # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end.  The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another special
  # token like (inline) flags or end comments. Complicated regex to match
  # most normal characters, and '*', '{', '}', and '@' when we are sure that
  # it is safe. Expression [^*{\s]@ must come first, or the other options will
  # match everything before @, and we won't match @'s that aren't part of flags
  # like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')

  # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  REGEX_CHARACTER_CLASS = r"""
                          \[               # Opening bracket
                          ([^\]\\]|\\.)*   # Anything but a ] or \,
                                           # or a backslash followed by anything
                          \]               # Closing bracket
                          """
  # We ensure the regex is followed by one of the tokens listed below to avoid
  # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     /                      # opening slash
                     (?!\*)                 # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
                                            # or anything but a / or [ or \,
                                            # or a character class
                     /                      # closing slash
                     [gimsx]*               # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  ANYTHING = re.compile(r'.*')
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant" list.
  KEYWORD_LIST = ['break', 'case', 'catch', 'continue', 'default', 'do', 'else',
                  'finally', 'for', 'if', 'return', 'switch', 'throw', 'try',
                  'var', 'while', 'with']
  # Match a keyword string followed by a non-identifier character in order to
  # not match something like doSomething as do + Something.
  KEYWORD = re.compile('(%s)((?=[^%s])|$)' % (
      '|'.join(KEYWORD_LIST), IDENTIFIER_CHAR))

  # List of regular expressions to match as operators.  Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  #
  # Every entry containing a backslash is a raw string so that the backslash
  # reaches the regex engine without relying on Python passing unknown string
  # escapes through unchanged.
  #
  # NOTE(review): a bare '^' and '~' are not listed here, so they are not
  # tokenized as operators -- confirm whether that is intentional.
  OPERATOR_LIST = [',', r'\+\+', '===', '!==', '>>>=', '>>>', '==', '>=', '<=',
                   '!=', '<<=', '>>=', '<<', '>>', '>', '<', r'\+=', r'\+',
                   '--', r'\^=', '-=', '-', '/=', '/', r'\*=', r'\*', '%=', '%',
                   '&&', r'\|\|', '&=', '&', r'\|=', r'\|', '=', '!', ':',
                   r'\?', r'\bdelete\b', r'\bin\b', r'\binstanceof\b',
                   r'\bnew\b', r'\btypeof\b', r'\bvoid\b']
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier.
  NESTED_IDENTIFIER = r'[a-zA-Z_$][%s.]*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s)      # a valid identifier
                             (?=\s*                  # optional whitespace
                             \=                      # look ahead to equal sign
                             (?!=))                  # not follwed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is a @ sign followed by non-space characters that appears at the
  # beginning of the line, after whitespace, or after a '{'.  The look-behind
  # check is necessary to not match someone@google.com as a flag.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  # To properly parse parameter names, we need to tokenize whitespace into a
  # token.
  DOC_FLAG_LEX_SPACES = re.compile(r'(^|(?<=\s))@(?P<name>%s)\b' %
                                   '|'.join(['param']))

  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  # Star followed by non-slash, i.e. a star that does not end a comment.
  # This is used for TYPE_GROUP below.
  # NOTE(review): no TYPE_GROUP is defined in this file -- confirm whether
  # SAFE_STAR is still needed.
  SAFE_STAR = r'(\*(?!/))'

  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),
      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]


  # The token matcher groups work as follows: it is a list of Matcher objects.
  # The matchers will be tried in this order, and the first to match will be
  # returned.  Hence the order is important because the matchers that come first
  # overrule the matchers that come later.
  JAVASCRIPT_MATCHERS = {
      # Matchers for basic text mode.
      JavaScriptModes.TEXT_MODE: [
          # Check a big group - strings, starting comments, and regexes - all
          # of which could be intertwined.  'string with /regex/',
          # /regex with 'string'/, /* comment with /regex/ and string */ (and
          # so on)
          Matcher(START_DOC_COMMENT, Type.START_DOC_COMMENT,
                  JavaScriptModes.DOC_COMMENT_MODE),
          Matcher(START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
                  JavaScriptModes.BLOCK_COMMENT_MODE),
          Matcher(END_OF_LINE_SINGLE_LINE_COMMENT,
                  Type.START_SINGLE_LINE_COMMENT),
          Matcher(START_SINGLE_LINE_COMMENT, Type.START_SINGLE_LINE_COMMENT,
                  JavaScriptModes.LINE_COMMENT_MODE),
          Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
                  JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
          Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
                  JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
          Matcher(REGEX, Type.REGEX),

          # Next we check for start blocks appearing outside any of the items
          # above.
          Matcher(START_BLOCK, Type.START_BLOCK),
          Matcher(END_BLOCK, Type.END_BLOCK),

          # Then we search for function declarations.
          Matcher(FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
                  JavaScriptModes.FUNCTION_MODE),

          # Next, we convert non-function related parens to tokens.
          Matcher(OPENING_PAREN, Type.START_PAREN),
          Matcher(CLOSING_PAREN, Type.END_PAREN),

          # Next, we convert brackets to tokens.
          Matcher(OPENING_BRACKET, Type.START_BRACKET),
          Matcher(CLOSING_BRACKET, Type.END_BRACKET),

          # Find numbers.  This has to happen before operators because
          # scientific notation numbers can have + and - in them.
          Matcher(NUMBER, Type.NUMBER),

          # Find operators and simple assignments
          Matcher(SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
          Matcher(OPERATOR, Type.OPERATOR),

          # Find key words and whitespace
          Matcher(KEYWORD, Type.KEYWORD),
          Matcher(WHITESPACE, Type.WHITESPACE),

          # Find identifiers
          Matcher(IDENTIFIER, Type.IDENTIFIER),

          # Finally, we convert semicolons to tokens.
          Matcher(SEMICOLON, Type.SEMICOLON)],


      # Matchers for single quote strings.
      JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
          Matcher(SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
          Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
                  JavaScriptModes.TEXT_MODE)],


      # Matchers for double quote strings.
      JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
          Matcher(DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
          Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
                  JavaScriptModes.TEXT_MODE)],


      # Matchers for block comments.
      JavaScriptModes.BLOCK_COMMENT_MODE: [
          # First we check for exiting a block comment.
          Matcher(END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
                  JavaScriptModes.TEXT_MODE),

          # Match non-comment-ending text..
          Matcher(BLOCK_COMMENT_TEXT, Type.COMMENT)],


      # Matchers for doc comments.
      JavaScriptModes.DOC_COMMENT_MODE: COMMON_DOC_MATCHERS + [
          Matcher(DOC_COMMENT_TEXT, Type.COMMENT)],

      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: COMMON_DOC_MATCHERS + [
          Matcher(WHITESPACE, Type.COMMENT),
          Matcher(DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],

      # Matchers for single line comments.
      JavaScriptModes.LINE_COMMENT_MODE: [
          # We greedy match until the end of the line in line comment mode.
          Matcher(ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],


      # Matchers for code after the function keyword.
      JavaScriptModes.FUNCTION_MODE: [
          # Must match open paren before anything else and move into parameter
          # mode, otherwise everything inside the parameter list is parsed
          # incorrectly.
          Matcher(OPENING_PAREN, Type.START_PARAMETERS,
                  JavaScriptModes.PARAMETER_MODE),
          Matcher(WHITESPACE, Type.WHITESPACE),
          Matcher(IDENTIFIER, Type.FUNCTION_NAME)],


      # Matchers for function parameters
      JavaScriptModes.PARAMETER_MODE: [
          # When in function parameter mode, a closing paren is treated
          # specially.  Everything else is treated as lines of parameters.
          Matcher(CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
                  JavaScriptModes.TEXT_MODE),
          Matcher(PARAMETERS, Type.PARAMETERS,
                  JavaScriptModes.PARAMETER_MODE)]}


  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }

  def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
          or simply treat them as normal comments.  Defaults to parsing JsDoc.
    """
    matchers = self.JAVASCRIPT_MATCHERS
    if not parse_js_doc:
      # Only the top-level mode -> matcher-list mapping is rebound below, so a
      # shallow copy is enough to leave the class-level table unmodified.  A
      # deep copy would additionally have to copy the compiled regular
      # expressions (presumably held by each Matcher), which Python versions
      # before 3.7 reject with a TypeError.
      matchers = copy.copy(matchers)
      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
          JavaScriptModes.BLOCK_COMMENT_MODE]

    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new JavaScriptToken object.

    Args:
      string: The string of input the token contains.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
        function declaration may have a value called 'name' which captures the
        name of the function.

    Returns:
      The javascripttokens.JavaScriptToken built from the given arguments.
    """
    return javascripttokens.JavaScriptToken(string, token_type, line,
                                            line_number, values)