diff options
author | laurentlb <laurentlb@google.com> | 2018-06-05 09:42:59 -0700 |
---|---|---|
committer | Copybara-Service <copybara-piper@google.com> | 2018-06-05 09:45:02 -0700 |
commit | fc23edc368244849c59636d1e2ae6c375bebf262 (patch) | |
tree | 2d742b93785efbd2f1fbe21de03972edd4838d7b /src/main/java/com/google/devtools/build/lib/syntax | |
parent | c04ce79e4f719df54e3564f58a2ea26529ac1494 (diff) |
Stop allocating new tokens in the lexer
There's only one Token and it gets reused.
This reduces the memory usage of the lexer. Parsing time seems to be 5%-10%
faster with this change on a large file. This makes little difference on the
overall performance of Bazel though.
RELNOTES: None.
PiperOrigin-RevId: 199310860
Diffstat (limited to 'src/main/java/com/google/devtools/build/lib/syntax')
-rw-r--r-- | src/main/java/com/google/devtools/build/lib/syntax/Lexer.java | 152 | ||||
-rw-r--r-- | src/main/java/com/google/devtools/build/lib/syntax/Token.java | 10 |
2 files changed, 94 insertions, 68 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java b/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java index 91384f06d4..ed7bd4c17e 100644 --- a/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java +++ b/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java @@ -81,8 +81,12 @@ public final class Lexer { // bottom. private final Stack<Integer> indentStack = new Stack<>(); - /** Token to return */ - private Token token; + /** + * Token to return. This token is mutated in-place. Its kind is set to + * null to indicate the intermediate state, where the new token has not + * been scanned yet. + */ + private final Token token; private final List<Comment> comments; @@ -112,6 +116,7 @@ public final class Lexer { this.checkIndentation = true; this.comments = new ArrayList<>(); this.dents = 0; + this.token = new Token(null, -1, -1); indentStack.push(0); } @@ -146,9 +151,10 @@ public final class Lexer { * after EOF has been returned. */ public Token nextToken() { - boolean afterNewline = token != null && token.kind == TokenKind.NEWLINE; - token = null; + boolean afterNewline = token.kind == TokenKind.NEWLINE; + token.kind = null; tokenize(); + Preconditions.checkState(token.kind != null); // Like Python, always end with a NEWLINE token, even if no '\n' in input: if (token.kind == TokenKind.EOF && !afterNewline) { @@ -226,9 +232,20 @@ public final class Lexer { } /** invariant: symbol positions are half-open intervals. */ - private void setToken(Token s) { - Preconditions.checkState(token == null); - token = s; + private void setToken(TokenKind kind, int left, int right) { + Preconditions.checkState(token.kind == null); + token.kind = kind; + token.left = left; + token.right = right; + token.value = null; + } + + private void setToken(TokenKind kind, int left, int right, Object value) { + Preconditions.checkState(token.kind == null); + token.kind = kind; + token.left = left; + token.right = right; + token.value = value; } /** @@ -241,7 +258,7 @@ public final class Lexer { newlineInsideExpression(); // in an expression: ignore space } else { checkIndentation = true; - setToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); + setToken(TokenKind.NEWLINE, pos - 1, pos); } } @@ -330,7 +347,7 @@ public final class Lexer { * * @return the string-literal token. */ - private Token escapedStringLiteral(char quot, boolean isRaw) { + private void escapedStringLiteral(char quot, boolean isRaw) { int literalStartPos = isRaw ? pos - 2 : pos - 1; boolean inTriplequote = skipTripleQuote(quot); // more expensive second choice that expands escaped into a buffer @@ -345,12 +362,14 @@ public final class Lexer { break; } else { error("unterminated string literal at eol", literalStartPos, pos); - return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString()); + setToken(TokenKind.STRING, literalStartPos, pos, literal.toString()); + return; } case '\\': if (pos == buffer.length) { error("unterminated string literal at eof", literalStartPos, pos); - return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString()); + setToken(TokenKind.STRING, literalStartPos, pos, literal.toString()); + return; } if (isRaw) { // Insert \ and the following character. @@ -454,7 +473,8 @@ public final class Lexer { literal.append(c); } else { // Matching close-delimiter, all done. - return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString()); + setToken(TokenKind.STRING, literalStartPos, pos, literal.toString()); + return; } break; default: @@ -463,7 +483,7 @@ public final class Lexer { } } error("unterminated string literal at eof", literalStartPos, pos); - return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString()); + setToken(TokenKind.STRING, literalStartPos, pos, literal.toString()); } /** @@ -477,14 +497,15 @@ public final class Lexer { * @param isRaw if true, do not escape the string. * @return the string-literal token. */ - private Token stringLiteral(char quot, boolean isRaw) { + private void stringLiteral(char quot, boolean isRaw) { int literalStartPos = isRaw ? pos - 2 : pos - 1; int contentStartPos = pos; // Don't even attempt to parse triple-quotes here. if (skipTripleQuote(quot)) { pos -= 2; - return escapedStringLiteral(quot, isRaw); + escapedStringLiteral(quot, isRaw); + return; } // first quick optimistic scan for a simple non-escaped string @@ -493,17 +514,16 @@ public final class Lexer { switch (c) { case '\n': error("unterminated string literal at eol", literalStartPos, pos); - Token t = - new Token( - TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1)); - return t; + setToken(TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1)); + return; case '\\': if (isRaw) { if (lookaheadIs(0, '\r') && lookaheadIs(1, '\n')) { // There was a CRLF after the newline. No shortcut possible, since it needs to be // transformed into a single LF. pos = contentStartPos; - return escapedStringLiteral(quot, true); + escapedStringLiteral(quot, true); + return; } else { pos++; break; @@ -511,13 +531,15 @@ public final class Lexer { } // oops, hit an escape, need to start over & build a new string buffer pos = contentStartPos; - return escapedStringLiteral(quot, false); + escapedStringLiteral(quot, false); + return; case '\'': case '"': if (c == quot) { // close-quote, all done. - return new Token( + setToken( TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1)); + return; } break; default: // fall out @@ -531,7 +553,7 @@ public final class Lexer { } error("unterminated string literal at eof", literalStartPos, pos); - return new Token(TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos)); + setToken(TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos)); } private static final Map<String, TokenKind> keywordMap = new HashMap<>(); @@ -578,13 +600,15 @@ public final class Lexer { * * @return the identifier or keyword token. */ - private Token identifierOrKeyword() { + private void identifierOrKeyword() { int oldPos = pos - 1; String id = scanIdentifier(); TokenKind kind = keywordMap.get(id); - return (kind == null) - ? new Token(TokenKind.IDENTIFIER, oldPos, pos, id) - : new Token(kind, oldPos, pos, null); + if (kind == null) { + setToken(TokenKind.IDENTIFIER, oldPos, pos, id); + } else { + setToken(kind, oldPos, pos, null); + } } private String scanIdentifier() { @@ -649,10 +673,8 @@ public final class Lexer { * * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the literal. * ON EXIT: 'pos' is 1 + the index of the last char in the literal. - * - * @return the integer token. */ - private Token integer() { + private void integer() { int oldPos = pos - 1; String literal = scanInteger(); @@ -679,7 +701,7 @@ public final class Lexer { error("invalid base-" + radix + " integer constant: " + literal); } - return new Token(TokenKind.INT, oldPos, pos, value); + setToken(TokenKind.INT, oldPos, pos, value); } /** @@ -701,7 +723,7 @@ public final class Lexer { if (tok == null) { return false; } else { - setToken(new Token(tok, pos, pos + 2)); + setToken(tok, pos, pos + 2); return true; } } @@ -712,8 +734,8 @@ public final class Lexer { } /** - * Performs tokenization of the character buffer of file contents provided to the constructor. - * Advances pos and sets the token variable. + * Performs tokenization of the character buffer of file contents provided to the constructor. At + * least one token will be added to the tokens queue. */ private void tokenize() { if (checkIndentation) { @@ -725,10 +747,10 @@ public final class Lexer { if (dents != 0) { if (dents < 0) { dents++; - setToken(new Token(TokenKind.OUTDENT, pos - 1, pos)); + setToken(TokenKind.OUTDENT, pos - 1, pos); } else { dents--; - setToken(new Token(TokenKind.INDENT, pos - 1, pos)); + setToken(TokenKind.INDENT, pos - 1, pos); } return; } @@ -742,94 +764,94 @@ public final class Lexer { pos++; switch (c) { case '{': { - setToken(new Token(TokenKind.LBRACE, pos - 1, pos)); + setToken(TokenKind.LBRACE, pos - 1, pos); openParenStackDepth++; break; } case '}': { - setToken(new Token(TokenKind.RBRACE, pos - 1, pos)); + setToken(TokenKind.RBRACE, pos - 1, pos); popParen(); break; } case '(': { - setToken(new Token(TokenKind.LPAREN, pos - 1, pos)); + setToken(TokenKind.LPAREN, pos - 1, pos); openParenStackDepth++; break; } case ')': { - setToken(new Token(TokenKind.RPAREN, pos - 1, pos)); + setToken(TokenKind.RPAREN, pos - 1, pos); popParen(); break; } case '[': { - setToken(new Token(TokenKind.LBRACKET, pos - 1, pos)); + setToken(TokenKind.LBRACKET, pos - 1, pos); openParenStackDepth++; break; } case ']': { - setToken(new Token(TokenKind.RBRACKET, pos - 1, pos)); + setToken(TokenKind.RBRACKET, pos - 1, pos); popParen(); break; } case '>': { - setToken(new Token(TokenKind.GREATER, pos - 1, pos)); + setToken(TokenKind.GREATER, pos - 1, pos); break; } case '<': { - setToken(new Token(TokenKind.LESS, pos - 1, pos)); + setToken(TokenKind.LESS, pos - 1, pos); break; } case ':': { - setToken(new Token(TokenKind.COLON, pos - 1, pos)); + setToken(TokenKind.COLON, pos - 1, pos); break; } case ',': { - setToken(new Token(TokenKind.COMMA, pos - 1, pos)); + setToken(TokenKind.COMMA, pos - 1, pos); break; } case '+': { - setToken(new Token(TokenKind.PLUS, pos - 1, pos)); + setToken(TokenKind.PLUS, pos - 1, pos); break; } case '-': { - setToken(new Token(TokenKind.MINUS, pos - 1, pos)); + setToken(TokenKind.MINUS, pos - 1, pos); break; } case '|': { - setToken(new Token(TokenKind.PIPE, pos - 1, pos)); + setToken(TokenKind.PIPE, pos - 1, pos); break; } case '=': { - setToken(new Token(TokenKind.EQUALS, pos - 1, pos)); + setToken(TokenKind.EQUALS, pos - 1, pos); break; } case '%': { - setToken(new Token(TokenKind.PERCENT, pos - 1, pos)); + setToken(TokenKind.PERCENT, pos - 1, pos); break; } case '/': { if (lookaheadIs(0, '/') && lookaheadIs(1, '=')) { - setToken(new Token(TokenKind.SLASH_SLASH_EQUALS, pos - 1, pos + 2)); + setToken(TokenKind.SLASH_SLASH_EQUALS, pos - 1, pos + 2); pos += 2; } else if (lookaheadIs(0, '/')) { - setToken(new Token(TokenKind.SLASH_SLASH, pos - 1, pos + 1)); + setToken(TokenKind.SLASH_SLASH, pos - 1, pos + 1); pos += 1; } else { // /= is handled by tokenizeTwoChars. - setToken(new Token(TokenKind.SLASH, pos - 1, pos)); + setToken(TokenKind.SLASH, pos - 1, pos); } break; } case ';': { - setToken(new Token(TokenKind.SEMI, pos - 1, pos)); + setToken(TokenKind.SEMI, pos - 1, pos); break; } case '.': { - setToken(new Token(TokenKind.DOT, pos - 1, pos)); + setToken(TokenKind.DOT, pos - 1, pos); break; } case '*': { - setToken(new Token(TokenKind.STAR, pos - 1, pos)); + setToken(TokenKind.STAR, pos - 1, pos); break; } case ' ': @@ -845,7 +867,7 @@ public final class Lexer { } else if (lookaheadIs(0, '\r') && lookaheadIs(1, '\n')) { pos += 2; // skip the CRLF at the end of line } else { - setToken(new Token(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c))); + setToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c)); } break; } @@ -868,7 +890,7 @@ public final class Lexer { } case '\'': case '\"': { - setToken(stringLiteral(c, false)); + stringLiteral(c, false); break; } default: { @@ -877,27 +899,27 @@ public final class Lexer { && (buffer[pos] == '\'' || buffer[pos] == '\"')) { c = buffer[pos]; pos++; - setToken(stringLiteral(c, true)); + stringLiteral(c, true); break; } if (c >= '0' && c <= '9') { - setToken(integer()); + integer(); } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') { - setToken(identifierOrKeyword()); + identifierOrKeyword(); } else { error("invalid character: '" + c + "'"); } break; } // default } // switch - if (token != null) { // stop here if we scanned a token + if (token.kind != null) { // stop here if we scanned a token return; } } // while if (indentStack.size() > 1) { // top of stack is always zero - setToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); + setToken(TokenKind.NEWLINE, pos - 1, pos); while (indentStack.size() > 1) { indentStack.pop(); dents--; @@ -905,7 +927,7 @@ public final class Lexer { return; } - setToken(new Token(TokenKind.EOF, pos, pos)); + setToken(TokenKind.EOF, pos, pos); } /** diff --git a/src/main/java/com/google/devtools/build/lib/syntax/Token.java b/src/main/java/com/google/devtools/build/lib/syntax/Token.java index 3c99df5f8c..733b5387d0 100644 --- a/src/main/java/com/google/devtools/build/lib/syntax/Token.java +++ b/src/main/java/com/google/devtools/build/lib/syntax/Token.java @@ -22,14 +22,14 @@ import javax.annotation.Nullable; class Token { TokenKind kind; - final int left; - final int right; + int left; + int right; /** * value is an Integer if the kind is INT. * It is a String if the kind is STRING, IDENTIFIER, or COMMENT. * It is null otherwise. */ - @Nullable final Object value; + @Nullable Object value; Token(TokenKind kind, int left, int right) { this(kind, left, right, null); @@ -42,6 +42,10 @@ class Token { this.value = value; } + Token copy() { + return new Token(kind, left, right, value); + } + /** * Constructs an easy-to-read string representation of token, suitable for use * in user error messages. |