Diffstat (limited to 'tools/closure_linter-2.3.4/closure_linter/common/tokenizer.py')
-rwxr-xr-x | tools/closure_linter-2.3.4/closure_linter/common/tokenizer.py | 184
1 file changed, 184 insertions, 0 deletions
diff --git a/tools/closure_linter-2.3.4/closure_linter/common/tokenizer.py b/tools/closure_linter-2.3.4/closure_linter/common/tokenizer.py
new file mode 100755
index 0000000..0234720
--- /dev/null
+++ b/tools/closure_linter-2.3.4/closure_linter/common/tokenizer.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+#
+# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Regular expression based lexer."""
+
+__author__ = ('robbyw@google.com (Robert Walker)',
+              'ajp@google.com (Andy Perelson)')
+
+from closure_linter.common import tokens
+
+# Shorthand
+Type = tokens.TokenType
+
+
+class Tokenizer(object):
+  """General purpose tokenizer.
+
+  Attributes:
+    mode: The latest mode of the tokenizer.  This allows patterns to
+        distinguish if they are mid-comment, mid-parameter list, etc.
+    matchers: Dictionary of modes to sequences of matchers that define the
+        patterns to check at any given time.
+    default_types: Dictionary of modes to types, defining what type to give
+        non-matched text when in the given mode.  Defaults to Type.NORMAL.
+  """
+
+  def __init__(self, starting_mode, matchers, default_types):
+    """Initialize the tokenizer.
+
+    Args:
+      starting_mode: Mode to start in.
+      matchers: Dictionary of modes to sequences of matchers that define the
+          patterns to check at any given time.
+      default_types: Dictionary of modes to types, defining what type to give
+          non-matched text when in the given mode.  Defaults to Type.NORMAL.
+    """
+    self.__starting_mode = starting_mode
+    self.matchers = matchers
+    self.default_types = default_types
+
+  def TokenizeFile(self, file):
+    """Tokenizes the given file.
+
+    Args:
+      file: An iterable that yields one line of the file at a time.
+
+    Returns:
+      The first token in the file.
+    """
+    # The current mode.
+    self.mode = self.__starting_mode
+    # The first token in the stream.
+    self.__first_token = None
+    # The last token added to the token stream.
+    self.__last_token = None
+    # The current line number.
+    self.__line_number = 0
+
+    for line in file:
+      self.__line_number += 1
+      self.__TokenizeLine(line)
+
+    return self.__first_token
+
+  def _CreateToken(self, string, token_type, line, line_number, values=None):
+    """Creates a new Token object (or subclass).
+
+    Args:
+      string: The string of input the token represents.
+      token_type: The type of token.
+      line: The text of the line this token is in.
+      line_number: The line number of the token.
+      values: A dict of named values within the token.  For instance, a
+        function declaration may have a value called 'name' which captures the
+        name of the function.
+
+    Returns:
+      The newly created Token object.
+    """
+    return tokens.Token(string, token_type, line, line_number, values)
+
+  def __TokenizeLine(self, line):
+    """Tokenizes the given line.
+
+    Args:
+      line: The contents of the line.
+    """
+    string = line.rstrip('\n\r\f')
+    line_number = self.__line_number
+    self.__start_index = 0
+
+    if not string:
+      self.__AddToken(self._CreateToken('', Type.BLANK_LINE, line,
+                                        line_number))
+      return
+
+    normal_token = ''
+    index = 0
+    while index < len(string):
+      for matcher in self.matchers[self.mode]:
+        if matcher.line_start and index > 0:
+          continue
+
+        match = matcher.regex.match(string, index)
+
+        if match:
+          if normal_token:
+            self.__AddToken(
+                self.__CreateNormalToken(self.mode, normal_token, line,
+                                         line_number))
+            normal_token = ''
+
+          # Add the match.
+          self.__AddToken(self._CreateToken(match.group(), matcher.type, line,
+                                            line_number, match.groupdict()))
+
+          # Change the mode to the correct one for after this match.
+          self.mode = matcher.result_mode or self.mode
+
+          # Advance the index past this match.
+          index = match.end()
+
+          break
+
+      else:
+        # If the for loop finishes naturally (i.e. no matches) we just add the
+        # first character to the string of consecutive non-match characters.
+        # These will constitute a NORMAL token.
+        if string:
+          normal_token += string[index:index + 1]
+          index += 1
+
+    if normal_token:
+      self.__AddToken(
+          self.__CreateNormalToken(self.mode, normal_token, line, line_number))
+
+  def __CreateNormalToken(self, mode, string, line, line_number):
+    """Creates a normal token.
+
+    Args:
+      mode: The current mode.
+      string: The string to tokenize.
+      line: The line of text.
+      line_number: The line number within the file.
+
+    Returns:
+      A Token object, of the default type for the current mode.
+    """
+    type = Type.NORMAL
+    if mode in self.default_types:
+      type = self.default_types[mode]
+    return self._CreateToken(string, type, line, line_number)
+
+  def __AddToken(self, token):
+    """Add the given token to the token stream.
+
+    Args:
+      token: The token to add.
+    """
+    # Store the first token, or point the previous token to this one.
+    if not self.__first_token:
+      self.__first_token = token
+    else:
+      self.__last_token.next = token
+
+    # Establish the doubly linked list.
+    token.previous = self.__last_token
+    self.__last_token = token
+
+    # Compute the character indices.
+    token.start_index = self.__start_index
+    self.__start_index += token.length