#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regular expression based lexer."""

__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')

from closure_linter.common import tokens

# Shorthand
Type = tokens.TokenType


class Tokenizer(object):
  """General purpose tokenizer.

  Attributes:
    mode: The current mode of the tokenizer.  This lets patterns determine
        whether they are mid-comment, mid-parameter list, etc.
    matchers: Dictionary of modes to sequences of matchers that define the
        patterns to check at any given time.
    default_types: Dictionary of modes to types, defining what type to give
        non-matched text when in the given mode.  Defaults to Type.NORMAL.
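
  Example (an illustrative sketch; number_matcher stands for any object
  exposing regex, type, result_mode, and line_start, which is the interface
  the methods below rely on):

    matchers = {'text': [number_matcher]}
    tokenizer = Tokenizer('text', matchers, {'text': Type.NORMAL})
    first_token = tokenizer.TokenizeFile(open('some_file.js'))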
  """

  def __init__(self, starting_mode, matchers, default_types):
    """Initialize the tokenizer.

    Args:
      starting_mode: Mode to start in.
      matchers: Dictionary of modes to sequences of matchers that define the
          patterns to check at any given time.
      default_types: Dictionary of modes to types, defining what type to give
          non-matched text when in the given mode.  Defaults to Type.NORMAL.
    """
    self.__starting_mode = starting_mode
    self.matchers = matchers
    self.default_types = default_types

  def TokenizeFile(self, file):
    """Tokenizes the given file.

    Args:
      file: An iterable that yields one line of the file at a time.

    Returns:
      The first token in the file, or None if the file yielded no lines.
      Subsequent tokens can be reached through each token's next attribute.
    """
    # The current mode.
    self.mode = self.__starting_mode
    # The first token in the stream.
    self.__first_token = None
    # The last token added to the token stream.
    self.__last_token = None
    # The current line number.
    self.__line_number = 0

    for line in file:
      self.__line_number += 1
      self.__TokenizeLine(line)

    return self.__first_token

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new Token object (or subclass).

    Args:
      string: The string of input the token represents.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
        function declaration may have a value called 'name', which captures
        the name of the function.

    Returns:
      The newly created Token object.
    """
    return tokens.Token(string, token_type, line, line_number, values)

  def __TokenizeLine(self, line):
    """Tokenizes the given line.

    Args:
      line: The contents of the line.
    """
    string = line.rstrip('\n\r\f')
    line_number = self.__line_number
    self.__start_index = 0

    if not string:
      self.__AddToken(self._CreateToken('', Type.BLANK_LINE, line, line_number))
      return

    normal_token = ''
    index = 0
    while index < len(string):
      for matcher in self.matchers[self.mode]:
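        # Matchers flagged line_start only apply at the start of a line.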
        if matcher.line_start and index > 0:
          continue

        match = matcher.regex.match(string, index)

        if match:
          if normal_token:
            self.__AddToken(
                self.__CreateNormalToken(self.mode, normal_token, line,
                                         line_number))
            normal_token = ''

          # Add the match.
          self.__AddToken(self._CreateToken(match.group(), matcher.type, line,
                                            line_number, match.groupdict()))

          # Switch to the mode that applies after this match, if the
          # matcher specifies one.
          self.mode = matcher.result_mode or self.mode

          # Resume matching just past the end of this match.
          index = match.end()

          break

      else:
        # The for loop finished without a break, meaning no matcher matched
        # at this index.  Add the current character to the run of
        # consecutive non-matching characters; the run will become a single
        # token of the mode's default type.  (string is never empty here,
        # since blank lines are handled above.)
        normal_token += string[index]
        index += 1

    if normal_token:
      self.__AddToken(
          self.__CreateNormalToken(self.mode, normal_token, line, line_number))

  def __CreateNormalToken(self, mode, string, line, line_number):
    """Creates a normal token.

    Args:
      mode: The current mode.
      string: The string to tokenize.
      line: The line of text.
      line_number: The line number within the file.

    Returns:
      A Token object, of the default type for the current mode.
    """
    token_type = self.default_types.get(mode, Type.NORMAL)
    return self._CreateToken(string, token_type, line, line_number)

  def __AddToken(self, token):
    """Add the given token to the token stream.

    Args:
      token: The token to add.
    """
    # Store the first token, or point the previous token to this one.
    if not self.__first_token:
      self.__first_token = token
    else:
      self.__last_token.next = token

    # Establish the doubly linked list.
    token.previous = self.__last_token
    self.__last_token = token

    # Compute the token's starting character index within its line.
    token.start_index = self.__start_index
    self.__start_index += token.length
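

if __name__ == '__main__':
  # Illustrative demo, not part of the original module: it builds a minimal
  # stand-in matcher (the real linter supplies these via
  # closure_linter.common.matcher) and tokenizes two in-memory lines.  Any
  # object exposing regex, type, result_mode, and line_start satisfies the
  # interface the Tokenizer uses.
  import re

  class _DemoMatcher(object):
    """Hypothetical matcher stand-in, for demonstration only."""

    def __init__(self, regex, token_type, result_mode=None, line_start=False):
      self.regex = regex
      self.type = token_type
      self.result_mode = result_mode
      self.line_start = line_start

  demo_matchers = {'text': [_DemoMatcher(re.compile(r'\d+'), 'number')]}
  demo_tokenizer = Tokenizer('text', demo_matchers, {'text': Type.NORMAL})

  # Tokenize two lines; the second is blank and becomes a BLANK_LINE token.
  token = demo_tokenizer.TokenizeFile(['abc 123\n', '\n'])
  while token:
    print('%s: %r' % (token.type, token.string))
    token = token.next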