#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regular expression based JavaScript parsing classes."""

__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')

import copy
import re

from closure_linter import javascripttokens
from closure_linter.common import matcher
from closure_linter.common import tokenizer

# Shorthand
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher


class JavaScriptModes(object):
  """Enumeration of the different matcher modes used for JavaScript."""
  TEXT_MODE = 'text'
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
  LINE_COMMENT_MODE = 'line_comment'
  PARAMETER_MODE = 'parameter'
  FUNCTION_MODE = 'function'


class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Converts JavaScript code into an array of tokens.
  """

  # Useful patterns for JavaScript parsing.
  IDENTIFIER_CHAR = r'A-Za-z0-9_$.'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  MANTISSA = r"""
             (\d+(?!\.)) |     # Matches '10'
             (\d+\.(?!\d)) |   # Matches '10.'
             (\d*\.\d+)        # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end.  The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another
  # special token like (inline) flags or end comments.  Complicated regex to
  # match most normal characters, and '*', '{', '}', and '@' when we are sure
  # that it is safe.  Expression [^*{\s]@ must come first, or the other options
  # will match everything before @, and we won't match @'s that aren't part of
  # flags like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')
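  # For example, DOC_COMMENT_TEXT consumes 'Computes 3 * 4.' in one run (the
  # '*' is not followed by '/'), and it absorbs the '@' in 'foo@bar.com' via
  # the '[^*{}\s]@' branch, but it stops before an '@' that follows whitespace
  # (such as ' @param') so the flag matchers defined below can claim it.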
  # Match the prefix ' * ' that starts every line of jsdoc.  Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  REGEX_CHARACTER_CLASS = r"""
                          \[              # Opening bracket
                          ([^\]\\]|\\.)*  # Anything but a ] or \,
                                          # or a backslash followed by anything
                          \]              # Closing bracket
                          """
  # We ensure the regex is followed by one of the above tokens to avoid
  # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     /                      # opening slash
                     (?!\*)                 # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
                                            # or anything but a / or [ or \,
                                            # or a character class
                     /                      # closing slash
                     [gimsx]*               # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  ANYTHING = re.compile(r'.*')
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant"
  #     list.
  KEYWORD_LIST = ['break', 'case', 'catch', 'continue', 'default', 'do',
                  'else', 'finally', 'for', 'if', 'return', 'switch', 'throw',
                  'try', 'var', 'while', 'with']
  # Match a keyword string followed by a non-identifier character in order to
  # not match something like doSomething as do + Something.
  KEYWORD = re.compile('(%s)((?=[^%s])|$)' % (
      '|'.join(KEYWORD_LIST), IDENTIFIER_CHAR))

  # List of regular expressions to match as operators.  Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  OPERATOR_LIST = [',', r'\+\+', '===', '!==', '>>>=', '>>>', '==', '>=', '<=',
                   '!=', '<<=', '>>=', '<<', '>>', '>', '<', r'\+=', r'\+',
                   '--', r'\^=', '-=', '-', '/=', '/', r'\*=', r'\*', '%=',
                   '%', '&&', r'\|\|', '&=', '&', r'\|=', r'\|', '=', '!', ':',
                   r'\?', r'\bdelete\b', r'\bin\b', r'\binstanceof\b',
                   r'\bnew\b', r'\btypeof\b', r'\bvoid\b']
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier.
  NESTED_IDENTIFIER = r'[a-zA-Z_$][%s.]*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s)  # a valid identifier
                             (?=\s*              # optional whitespace
                             \=                  # look ahead to equal sign
                             (?!=))              # not followed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is a @ sign followed by non-space characters that appears at the
  # beginning of the line, after whitespace, or after a '{'.  The look-behind
  # check is necessary to not match someone@google.com as a flag.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
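  # For example, in ' * @param {string} name' DOC_FLAG matches '@param' (the
  # '@' follows whitespace) and captures 'param' in the 'name' group, while the
  # '@' in 'someone@google.com' is skipped because the look-behind requires
  # whitespace or the start of the line before it.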
  # To properly parse parameter names, we need to tokenize whitespace into a
  # token.
  DOC_FLAG_LEX_SPACES = re.compile(r'(^|(?<=\s))@(?P<name>%s)\b' %
                                   '|'.join(['param']))

  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  # Star followed by non-slash, i.e. a star that does not end a comment.
  # This is used for TYPE_GROUP below.
  SAFE_STAR = r'(\*(?!/))'

  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),
      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]

  # The token matcher groups work as follows: each group is a list of Matcher
  # objects.  The matchers will be tried in this order, and the first to match
  # will be returned.  Hence the order is important because the matchers that
  # come first overrule the matchers that come later.
  JAVASCRIPT_MATCHERS = {
      # Matchers for basic text mode.
      JavaScriptModes.TEXT_MODE: [
          # Check a big group - strings, starting comments, and regexes - all
          # of which could be intertwined.  'string with /regex/',
          # /regex with 'string'/, /* comment with /regex/ and string */
          # (and so on)
          Matcher(START_DOC_COMMENT, Type.START_DOC_COMMENT,
                  JavaScriptModes.DOC_COMMENT_MODE),
          Matcher(START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
                  JavaScriptModes.BLOCK_COMMENT_MODE),
          Matcher(END_OF_LINE_SINGLE_LINE_COMMENT,
                  Type.START_SINGLE_LINE_COMMENT),
          Matcher(START_SINGLE_LINE_COMMENT, Type.START_SINGLE_LINE_COMMENT,
                  JavaScriptModes.LINE_COMMENT_MODE),
          Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
                  JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
          Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
                  JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
          Matcher(REGEX, Type.REGEX),

          # Next we check for start blocks appearing outside any of the items
          # above.
          Matcher(START_BLOCK, Type.START_BLOCK),
          Matcher(END_BLOCK, Type.END_BLOCK),

          # Then we search for function declarations.
          Matcher(FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
                  JavaScriptModes.FUNCTION_MODE),

          # Next, we convert non-function related parens to tokens.
          Matcher(OPENING_PAREN, Type.START_PAREN),
          Matcher(CLOSING_PAREN, Type.END_PAREN),

          # Next, we convert brackets to tokens.
          Matcher(OPENING_BRACKET, Type.START_BRACKET),
          Matcher(CLOSING_BRACKET, Type.END_BRACKET),

          # Find numbers.  This has to happen before operators because
          # scientific notation numbers can have + and - in them.
          Matcher(NUMBER, Type.NUMBER),

          # Find operators and simple assignments.
          Matcher(SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
          Matcher(OPERATOR, Type.OPERATOR),

          # Find keywords and whitespace.
          Matcher(KEYWORD, Type.KEYWORD),
          Matcher(WHITESPACE, Type.WHITESPACE),

          # Find identifiers.
          Matcher(IDENTIFIER, Type.IDENTIFIER),

          # Finally, we convert semicolons to tokens.
          Matcher(SEMICOLON, Type.SEMICOLON)],

      # Matchers for single quote strings.
      JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
          Matcher(SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
          Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
                  JavaScriptModes.TEXT_MODE)],

      # Matchers for double quote strings.
      JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
          Matcher(DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
          Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
                  JavaScriptModes.TEXT_MODE)],

      # Matchers for block comments.
      JavaScriptModes.BLOCK_COMMENT_MODE: [
          # First we check for exiting a block comment.
          Matcher(END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
                  JavaScriptModes.TEXT_MODE),

          # Match non-comment-ending text.
          Matcher(BLOCK_COMMENT_TEXT, Type.COMMENT)],

      # Matchers for doc comments.
      JavaScriptModes.DOC_COMMENT_MODE: COMMON_DOC_MATCHERS + [
          Matcher(DOC_COMMENT_TEXT, Type.COMMENT)],

      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: COMMON_DOC_MATCHERS + [
          Matcher(WHITESPACE, Type.COMMENT),
          Matcher(DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],

      # Matchers for single line comments.
      JavaScriptModes.LINE_COMMENT_MODE: [
          # We greedily match until the end of the line in line comment mode.
          Matcher(ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],

      # Matchers for code after the function keyword.
      JavaScriptModes.FUNCTION_MODE: [
          # Must match open paren before anything else and move into parameter
          # mode, otherwise everything inside the parameter list is parsed
          # incorrectly.
          Matcher(OPENING_PAREN, Type.START_PARAMETERS,
                  JavaScriptModes.PARAMETER_MODE),
          Matcher(WHITESPACE, Type.WHITESPACE),
          Matcher(IDENTIFIER, Type.FUNCTION_NAME)],

      # Matchers for function parameters.
      JavaScriptModes.PARAMETER_MODE: [
          # When in function parameter mode, a closing paren is treated
          # specially.  Everything else is treated as lines of parameters.
          Matcher(CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
                  JavaScriptModes.TEXT_MODE),
          Matcher(PARAMETERS, Type.PARAMETERS,
                  JavaScriptModes.PARAMETER_MODE)]}

  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }

  def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of JavaScript doc comments,
          or simply treat them as normal comments.  Defaults to parsing JsDoc.
    """
    matchers = self.JAVASCRIPT_MATCHERS
    if not parse_js_doc:
      # Make a copy so the original doesn't get modified.
      matchers = copy.deepcopy(matchers)
      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
          JavaScriptModes.BLOCK_COMMENT_MODE]

    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new JavaScriptToken object.

    Args:
      string: The string of input the token contains.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
          function declaration may have a value called 'name' which captures
          the name of the function.

    Returns:
      The newly created JavaScriptToken object.
    """
    return javascripttokens.JavaScriptToken(string, token_type, line,
                                            line_number, values)
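

if __name__ == '__main__':
  # Minimal usage sketch, illustrative only and not part of the linter's
  # normal entry points.  It assumes the inherited Tokenizer.TokenizeFile
  # accepts any iterable of source lines and returns the first token of a
  # linked token list whose nodes expose 'type', 'string', 'line_number' and
  # 'next'.
  token = JavaScriptTokenizer().TokenizeFile(['var x = 10;'])
  while token:
    # Expected output is one line per token, e.g. keyword, whitespace,
    # simple lvalue, operator, number and semicolon tokens for the snippet
    # above.
    print('%d %s: %r' % (token.line_number, token.type, token.string))
    token = token.next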