Stop allocating new tokens in the lexer

There's only one Token and it gets reused. This reduces the memory usage of the lexer. Parsing time seems to be 5%-10% faster with this change on a large file. This makes little difference on the overall performance of Bazel though. RELNOTES: None. PiperOrigin-RevId: 199310860
author: laurentlb <laurentlb@google.com> 2018-06-05 09:42:59 -0700
committer: Copybara-Service <copybara-piper@google.com> 2018-06-05 09:45:02 -0700
commit: fc23edc368244849c59636d1e2ae6c375bebf262 (patch)
tree: 2d742b93785efbd2f1fbe21de03972edd4838d7b /src/main/java/com/google/devtools/build/lib/syntax
parent: c04ce79e4f719df54e3564f58a2ea26529ac1494 (diff)
2 files changed, 94 insertions, 68 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java b/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java
index 91384f06d4..ed7bd4c17e 100644
--- a/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java
+++ b/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java
@@ -81,8 +81,12 @@ public final class Lexer {
   // bottom.
   private final Stack<Integer> indentStack = new Stack<>();
 
-  /** Token to return */
-  private Token token;
+  /**
+   * Token to return. This token is mutated in-place. Its kind is set to
+   * null to indicate the intermediate state, where the new token has not
+   * been scanned yet.
+   */
+  private final Token token;
 
   private final List<Comment> comments;
 
@@ -112,6 +116,7 @@ public final class Lexer {
     this.checkIndentation = true;
     this.comments = new ArrayList<>();
     this.dents = 0;
+    this.token = new Token(null, -1, -1);
 
     indentStack.push(0);
   }
@@ -146,9 +151,10 @@ public final class Lexer {
    * after EOF has been returned.
    */
   public Token nextToken() {
-    boolean afterNewline = token != null && token.kind == TokenKind.NEWLINE;
-    token = null;
+    boolean afterNewline = token.kind == TokenKind.NEWLINE;
+    token.kind = null;
     tokenize();
+    Preconditions.checkState(token.kind != null);
 
     // Like Python, always end with a NEWLINE token, even if no '\n' in input:
     if (token.kind == TokenKind.EOF && !afterNewline) {
@@ -226,9 +232,20 @@ public final class Lexer {
   }
 
   /** invariant: symbol positions are half-open intervals. */
-  private void setToken(Token s) {
-    Preconditions.checkState(token == null);
-    token = s;
+  private void setToken(TokenKind kind, int left, int right) {
+    Preconditions.checkState(token.kind == null);
+    token.kind = kind;
+    token.left = left;
+    token.right = right;
+    token.value = null;
+  }
+
+  private void setToken(TokenKind kind, int left, int right, Object value) {
+    Preconditions.checkState(token.kind == null);
+    token.kind = kind;
+    token.left = left;
+    token.right = right;
+    token.value = value;
   }
 
   /**
@@ -241,7 +258,7 @@ public final class Lexer {
       newlineInsideExpression(); // in an expression: ignore space
     } else {
       checkIndentation = true;
-      setToken(new Token(TokenKind.NEWLINE, pos - 1, pos));
+      setToken(TokenKind.NEWLINE, pos - 1, pos);
     }
   }
 
@@ -330,7 +347,7 @@ public final class Lexer {
    *
    * @return the string-literal token.
    */
-  private Token escapedStringLiteral(char quot, boolean isRaw) {
+  private void escapedStringLiteral(char quot, boolean isRaw) {
     int literalStartPos = isRaw ? pos - 2 : pos - 1;
     boolean inTriplequote = skipTripleQuote(quot);
     // more expensive second choice that expands escaped into a buffer
@@ -345,12 +362,14 @@ public final class Lexer {
             break;
           } else {
             error("unterminated string literal at eol", literalStartPos, pos);
-            return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString());
+            setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
+            return;
           }
         case '\\':
           if (pos == buffer.length) {
             error("unterminated string literal at eof", literalStartPos, pos);
-            return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString());
+            setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
+            return;
           }
           if (isRaw) {
             // Insert \ and the following character.
@@ -454,7 +473,8 @@ public final class Lexer {
             literal.append(c);
           } else {
             // Matching close-delimiter, all done.
-            return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString());
+            setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
+            return;
           }
           break;
         default:
@@ -463,7 +483,7 @@ public final class Lexer {
       }
     }
     error("unterminated string literal at eof", literalStartPos, pos);
-    return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString());
+    setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
   }
 
   /**
@@ -477,14 +497,15 @@ public final class Lexer {
    * @param isRaw if true, do not escape the string.
    * @return the string-literal token.
    */
-  private Token stringLiteral(char quot, boolean isRaw) {
+  private void stringLiteral(char quot, boolean isRaw) {
     int literalStartPos = isRaw ? pos - 2 : pos - 1;
     int contentStartPos = pos;
 
     // Don't even attempt to parse triple-quotes here.
     if (skipTripleQuote(quot)) {
       pos -= 2;
-      return escapedStringLiteral(quot, isRaw);
+      escapedStringLiteral(quot, isRaw);
+      return;
     }
 
     // first quick optimistic scan for a simple non-escaped string
@@ -493,17 +514,16 @@ public final class Lexer {
       switch (c) {
         case '\n':
           error("unterminated string literal at eol", literalStartPos, pos);
-          Token t =
-              new Token(
-                  TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1));
-          return t;
+          setToken(TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1));
+          return;
         case '\\':
           if (isRaw) {
             if (lookaheadIs(0, '\r') && lookaheadIs(1, '\n')) {
               // There was a CRLF after the newline. No shortcut possible, since it needs to be
               // transformed into a single LF.
               pos = contentStartPos;
-              return escapedStringLiteral(quot, true);
+              escapedStringLiteral(quot, true);
+              return;
             } else {
               pos++;
               break;
@@ -511,13 +531,15 @@ public final class Lexer {
           }
           // oops, hit an escape, need to start over & build a new string buffer
           pos = contentStartPos;
-          return escapedStringLiteral(quot, false);
+          escapedStringLiteral(quot, false);
+          return;
         case '\'':
         case '"':
           if (c == quot) {
             // close-quote, all done.
-            return new Token(
+            setToken(
                 TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1));
+            return;
           }
           break;
         default: // fall out
@@ -531,7 +553,7 @@ public final class Lexer {
     }
 
     error("unterminated string literal at eof", literalStartPos, pos);
-    return new Token(TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos));
+    setToken(TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos));
   }
 
   private static final Map<String, TokenKind> keywordMap = new HashMap<>();
@@ -578,13 +600,15 @@ public final class Lexer {
    *
    * @return the identifier or keyword token.
    */
-  private Token identifierOrKeyword() {
+  private void identifierOrKeyword() {
     int oldPos = pos - 1;
     String id = scanIdentifier();
     TokenKind kind = keywordMap.get(id);
-    return (kind == null)
-        ? new Token(TokenKind.IDENTIFIER, oldPos, pos, id)
-        : new Token(kind, oldPos, pos, null);
+    if (kind == null) {
+      setToken(TokenKind.IDENTIFIER, oldPos, pos, id);
+    } else {
+      setToken(kind, oldPos, pos, null);
+    }
   }
 
   private String scanIdentifier() {
@@ -649,10 +673,8 @@ public final class Lexer {
    *
    * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the literal.
    * ON EXIT: 'pos' is 1 + the index of the last char in the literal.
-   *
-   * @return the integer token.
    */
-  private Token integer() {
+  private void integer() {
     int oldPos = pos - 1;
     String literal = scanInteger();
 
@@ -679,7 +701,7 @@ public final class Lexer {
       error("invalid base-" + radix + " integer constant: " + literal);
     }
 
-    return new Token(TokenKind.INT, oldPos, pos, value);
+    setToken(TokenKind.INT, oldPos, pos, value);
   }
 
   /**
@@ -701,7 +723,7 @@ public final class Lexer {
     if (tok == null) {
       return false;
     } else {
-      setToken(new Token(tok, pos, pos + 2));
+      setToken(tok, pos, pos + 2);
       return true;
     }
   }
@@ -712,8 +734,8 @@ public final class Lexer {
   }
 
   /**
-   * Performs tokenization of the character buffer of file contents provided to the constructor.
-   * Advances pos and sets the token variable.
+   * Performs tokenization of the character buffer of file contents provided to the constructor. At
+   * least one token will be added to the tokens queue.
    */
   private void tokenize() {
     if (checkIndentation) {
@@ -725,10 +747,10 @@ public final class Lexer {
     if (dents != 0) {
       if (dents < 0) {
         dents++;
-        setToken(new Token(TokenKind.OUTDENT, pos - 1, pos));
+        setToken(TokenKind.OUTDENT, pos - 1, pos);
       } else {
         dents--;
-        setToken(new Token(TokenKind.INDENT, pos - 1, pos));
+        setToken(TokenKind.INDENT, pos - 1, pos);
       }
       return;
     }
@@ -742,94 +764,94 @@ public final class Lexer {
       pos++;
       switch (c) {
       case '{': {
-        setToken(new Token(TokenKind.LBRACE, pos - 1, pos));
+        setToken(TokenKind.LBRACE, pos - 1, pos);
         openParenStackDepth++;
         break;
       }
       case '}': {
-        setToken(new Token(TokenKind.RBRACE, pos - 1, pos));
+        setToken(TokenKind.RBRACE, pos - 1, pos);
         popParen();
         break;
       }
       case '(': {
-        setToken(new Token(TokenKind.LPAREN, pos - 1, pos));
+        setToken(TokenKind.LPAREN, pos - 1, pos);
         openParenStackDepth++;
         break;
       }
       case ')': {
-        setToken(new Token(TokenKind.RPAREN, pos - 1, pos));
+        setToken(TokenKind.RPAREN, pos - 1, pos);
         popParen();
         break;
       }
       case '[': {
-        setToken(new Token(TokenKind.LBRACKET, pos - 1, pos));
+        setToken(TokenKind.LBRACKET, pos - 1, pos);
         openParenStackDepth++;
         break;
       }
       case ']': {
-        setToken(new Token(TokenKind.RBRACKET, pos - 1, pos));
+        setToken(TokenKind.RBRACKET, pos - 1, pos);
         popParen();
         break;
       }
       case '>': {
-        setToken(new Token(TokenKind.GREATER, pos - 1, pos));
+        setToken(TokenKind.GREATER, pos - 1, pos);
         break;
       }
       case '<': {
-        setToken(new Token(TokenKind.LESS, pos - 1, pos));
+        setToken(TokenKind.LESS, pos - 1, pos);
         break;
       }
       case ':': {
-        setToken(new Token(TokenKind.COLON, pos - 1, pos));
+        setToken(TokenKind.COLON, pos - 1, pos);
         break;
       }
       case ',': {
-        setToken(new Token(TokenKind.COMMA, pos - 1, pos));
+        setToken(TokenKind.COMMA, pos - 1, pos);
         break;
       }
       case '+': {
-        setToken(new Token(TokenKind.PLUS, pos - 1, pos));
+        setToken(TokenKind.PLUS, pos - 1, pos);
         break;
       }
       case '-': {
-        setToken(new Token(TokenKind.MINUS, pos - 1, pos));
+        setToken(TokenKind.MINUS, pos - 1, pos);
         break;
       }
       case '|': {
-        setToken(new Token(TokenKind.PIPE, pos - 1, pos));
+        setToken(TokenKind.PIPE, pos - 1, pos);
         break;
       }
       case '=': {
-        setToken(new Token(TokenKind.EQUALS, pos - 1, pos));
+        setToken(TokenKind.EQUALS, pos - 1, pos);
         break;
       }
       case '%': {
-        setToken(new Token(TokenKind.PERCENT, pos - 1, pos));
+        setToken(TokenKind.PERCENT, pos - 1, pos);
         break;
       }
       case '/': {
         if (lookaheadIs(0, '/') && lookaheadIs(1, '=')) {
-          setToken(new Token(TokenKind.SLASH_SLASH_EQUALS, pos - 1, pos + 2));
+          setToken(TokenKind.SLASH_SLASH_EQUALS, pos - 1, pos + 2);
           pos += 2;
         } else if (lookaheadIs(0, '/')) {
-          setToken(new Token(TokenKind.SLASH_SLASH, pos - 1, pos + 1));
+          setToken(TokenKind.SLASH_SLASH, pos - 1, pos + 1);
           pos += 1;
         } else {
           // /= is handled by tokenizeTwoChars.
-          setToken(new Token(TokenKind.SLASH, pos - 1, pos));
+          setToken(TokenKind.SLASH, pos - 1, pos);
         }
         break;
       }
       case ';': {
-        setToken(new Token(TokenKind.SEMI, pos - 1, pos));
+        setToken(TokenKind.SEMI, pos - 1, pos);
         break;
       }
       case '.': {
-        setToken(new Token(TokenKind.DOT, pos - 1, pos));
+        setToken(TokenKind.DOT, pos - 1, pos);
         break;
       }
       case '*': {
-        setToken(new Token(TokenKind.STAR, pos - 1, pos));
+        setToken(TokenKind.STAR, pos - 1, pos);
         break;
       }
       case ' ':
@@ -845,7 +867,7 @@ public final class Lexer {
         } else if (lookaheadIs(0, '\r') && lookaheadIs(1, '\n')) {
           pos += 2;  // skip the CRLF at the end of line
         } else {
-          setToken(new Token(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c)));
+          setToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c));
         }
         break;
       }
@@ -868,7 +890,7 @@ public final class Lexer {
       }
       case '\'':
       case '\"': {
-        setToken(stringLiteral(c, false));
+        stringLiteral(c, false);
         break;
       }
       default: {
@@ -877,27 +899,27 @@ public final class Lexer {
             && (buffer[pos] == '\'' || buffer[pos] == '\"')) {
           c = buffer[pos];
           pos++;
-          setToken(stringLiteral(c, true));
+          stringLiteral(c, true);
           break;
         }
 
         if (c >= '0' && c <= '9') {
-          setToken(integer());
+          integer();
         } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
-          setToken(identifierOrKeyword());
+          identifierOrKeyword();
         } else {
           error("invalid character: '" + c + "'");
         }
         break;
       } // default
       } // switch
-      if (token != null) { // stop here if we scanned a token
+      if (token.kind != null) { // stop here if we scanned a token
         return;
       }
     } // while
 
     if (indentStack.size() > 1) { // top of stack is always zero
-      setToken(new Token(TokenKind.NEWLINE, pos - 1, pos));
+      setToken(TokenKind.NEWLINE, pos - 1, pos);
       while (indentStack.size() > 1) {
         indentStack.pop();
         dents--;
@@ -905,7 +927,7 @@ public final class Lexer {
       return;
     }
 
-    setToken(new Token(TokenKind.EOF, pos, pos));
+    setToken(TokenKind.EOF, pos, pos);
   }
 
   /**
diff --git a/src/main/java/com/google/devtools/build/lib/syntax/Token.java b/src/main/java/com/google/devtools/build/lib/syntax/Token.java
index 3c99df5f8c..733b5387d0 100644
--- a/src/main/java/com/google/devtools/build/lib/syntax/Token.java
+++ b/src/main/java/com/google/devtools/build/lib/syntax/Token.java
@@ -22,14 +22,14 @@ import javax.annotation.Nullable;
 class Token {
 
   TokenKind kind;
-  final int left;
-  final int right;
+  int left;
+  int right;
   /**
    * value is an Integer if the kind is INT.
    * It is a String if the kind is STRING, IDENTIFIER, or COMMENT.
    * It is null otherwise.
    */
-  @Nullable final Object value;
+  @Nullable Object value;
 
   Token(TokenKind kind, int left, int right) {
     this(kind, left, right, null);
@@ -42,6 +42,10 @@ class Token {
     this.value = value;
   }
 
+  Token copy() {
+    return new Token(kind, left, right, value);
+  }
+
   /**
    * Constructs an easy-to-read string representation of token, suitable for use
    * in user error messages.
author	laurentlb <laurentlb@google.com>	2018-06-05 09:42:59 -0700
committer	Copybara-Service <copybara-piper@google.com>	2018-06-05 09:45:02 -0700
commit	fc23edc368244849c59636d1e2ae6c375bebf262 (patch)
tree	2d742b93785efbd2f1fbe21de03972edd4838d7b /src/main/java/com/google/devtools/build/lib/syntax
parent	c04ce79e4f719df54e3564f58a2ea26529ac1494 (diff)