diff options
author | laurentlb <laurentlb@google.com> | 2018-06-04 06:03:36 -0700 |
---|---|---|
committer | Copybara-Service <copybara-piper@google.com> | 2018-06-04 06:05:23 -0700 |
commit | 51fdaa3e73c4623dc0a06592ab3cf2f3452d5946 (patch) | |
tree | 34a87686e39cb7baed07295abaf98c35068eb2ca /src | |
parent | 278a5e345b79a5c0301b3bcd31440eb4e22f9ac1 (diff) |
Get rid of the tokens queue in the lexer
Next step will be to skip token allocation.
RELNOTES: None.
PiperOrigin-RevId: 199121625
Diffstat (limited to 'src')
-rw-r--r-- | src/main/java/com/google/devtools/build/lib/syntax/Lexer.java | 131 | ||||
-rw-r--r-- | src/test/java/com/google/devtools/build/lib/syntax/LexerTest.java | 2 |
2 files changed, 69 insertions, 64 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java b/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java index a50a0a0049..91384f06d4 100644 --- a/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java +++ b/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java @@ -23,7 +23,6 @@ import com.google.devtools.build.lib.events.Location; import com.google.devtools.build.lib.skyframe.serialization.autocodec.AutoCodec; import com.google.devtools.build.lib.util.Pair; import com.google.devtools.build.lib.vfs.PathFragment; -import java.util.ArrayDeque; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -82,12 +81,8 @@ public final class Lexer { // bottom. private final Stack<Integer> indentStack = new Stack<>(); - /** Tokens scanned by the lexer, but not yet consumed by the parser. */ - // TODO(laurentlb): we could do without this queue. - private final ArrayDeque<Token> tokens; - - /** Last Token that was scanned. */ - private Token lastToken; + /** Token to return */ + private Token token; private final List<Comment> comments; @@ -102,6 +97,8 @@ public final class Lexer { */ private boolean checkIndentation; + private int dents; // number of saved INDENT (>0) or OUTDENT (<0) tokens to return + /** * Constructs a lexer which tokenizes the contents of the specified InputBuffer. Any errors during * lexing are reported on "handler". @@ -109,12 +106,12 @@ public final class Lexer { public Lexer( ParserInputSource input, EventHandler eventHandler, LineNumberTable lineNumberTable) { this.buffer = input.getContent(); - this.tokens = new ArrayDeque<>(10); this.pos = 0; this.eventHandler = eventHandler; this.locationInfo = new LocationInfo(input.getPath(), lineNumberTable); this.checkIndentation = true; this.comments = new ArrayList<>(); + this.dents = 0; indentStack.push(0); } @@ -149,11 +146,15 @@ public final class Lexer { * after EOF has been returned. */ public Token nextToken() { - if (tokens.isEmpty()) { - Preconditions.checkState(lastToken == null || lastToken.kind != TokenKind.EOF); - tokenize(); + boolean afterNewline = token != null && token.kind == TokenKind.NEWLINE; + token = null; + tokenize(); + + // Like Python, always end with a NEWLINE token, even if no '\n' in input: + if (token.kind == TokenKind.EOF && !afterNewline) { + token.kind = TokenKind.NEWLINE; } - return tokens.poll(); + return token; } private void popParen() { @@ -225,13 +226,13 @@ public final class Lexer { } /** invariant: symbol positions are half-open intervals. */ - private void addToken(Token s) { - lastToken = s; - tokens.add(s); + private void setToken(Token s) { + Preconditions.checkState(token == null); + token = s; } /** - * Parses an end-of-line sequence. + * Parses an end-of-line sequence, handling statement indentation correctly. * * <p>UNIX newlines are assumed (LF). Carriage returns are always ignored. */ @@ -240,7 +241,7 @@ public final class Lexer { newlineInsideExpression(); // in an expression: ignore space } else { checkIndentation = true; - addToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); + setToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); } } @@ -256,7 +257,8 @@ public final class Lexer { } } - private void newlineOutsideExpression() { + /** Computes indentation (updates dent) and advances pos. */ + private void computeIndentation() { // we're in a stmt: suck up space at beginning of next line int indentLen = 0; while (pos < buffer.length) { @@ -292,12 +294,12 @@ public final class Lexer { int peekedIndent = indentStack.peek(); if (peekedIndent < indentLen) { // push a level indentStack.push(indentLen); - addToken(new Token(TokenKind.INDENT, pos - 1, pos)); + dents++; } else if (peekedIndent > indentLen) { // pop one or more levels while (peekedIndent > indentLen) { indentStack.pop(); - addToken(new Token(TokenKind.OUTDENT, pos - 1, pos)); + dents--; peekedIndent = indentStack.peek(); } @@ -343,7 +345,6 @@ public final class Lexer { break; } else { error("unterminated string literal at eol", literalStartPos, pos); - newline(); return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString()); } case '\\': @@ -495,7 +496,6 @@ public final class Lexer { Token t = new Token( TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1)); - newline(); return t; case '\\': if (isRaw) { @@ -701,7 +701,7 @@ public final class Lexer { if (tok == null) { return false; } else { - addToken(new Token(tok, pos, pos + 2)); + setToken(new Token(tok, pos, pos + 2)); return true; } } @@ -712,16 +712,25 @@ public final class Lexer { } /** - * Performs tokenization of the character buffer of file contents provided to the constructor. At - * least one token will be added to the tokens queue. + * Performs tokenization of the character buffer of file contents provided to the constructor. + * Advances pos and sets the token variable. */ private void tokenize() { if (checkIndentation) { checkIndentation = false; - newlineOutsideExpression(); // generate INDENT/OUTDENT tokens - if (!tokens.isEmpty()) { - return; + computeIndentation(); + } + + // Return saved indentation tokens. + if (dents != 0) { + if (dents < 0) { + dents++; + setToken(new Token(TokenKind.OUTDENT, pos - 1, pos)); + } else { + dents--; + setToken(new Token(TokenKind.INDENT, pos - 1, pos)); } + return; } while (pos < buffer.length) { @@ -733,94 +742,94 @@ public final class Lexer { pos++; switch (c) { case '{': { - addToken(new Token(TokenKind.LBRACE, pos - 1, pos)); + setToken(new Token(TokenKind.LBRACE, pos - 1, pos)); openParenStackDepth++; break; } case '}': { - addToken(new Token(TokenKind.RBRACE, pos - 1, pos)); + setToken(new Token(TokenKind.RBRACE, pos - 1, pos)); popParen(); break; } case '(': { - addToken(new Token(TokenKind.LPAREN, pos - 1, pos)); + setToken(new Token(TokenKind.LPAREN, pos - 1, pos)); openParenStackDepth++; break; } case ')': { - addToken(new Token(TokenKind.RPAREN, pos - 1, pos)); + setToken(new Token(TokenKind.RPAREN, pos - 1, pos)); popParen(); break; } case '[': { - addToken(new Token(TokenKind.LBRACKET, pos - 1, pos)); + setToken(new Token(TokenKind.LBRACKET, pos - 1, pos)); openParenStackDepth++; break; } case ']': { - addToken(new Token(TokenKind.RBRACKET, pos - 1, pos)); + setToken(new Token(TokenKind.RBRACKET, pos - 1, pos)); popParen(); break; } case '>': { - addToken(new Token(TokenKind.GREATER, pos - 1, pos)); + setToken(new Token(TokenKind.GREATER, pos - 1, pos)); break; } case '<': { - addToken(new Token(TokenKind.LESS, pos - 1, pos)); + setToken(new Token(TokenKind.LESS, pos - 1, pos)); break; } case ':': { - addToken(new Token(TokenKind.COLON, pos - 1, pos)); + setToken(new Token(TokenKind.COLON, pos - 1, pos)); break; } case ',': { - addToken(new Token(TokenKind.COMMA, pos - 1, pos)); + setToken(new Token(TokenKind.COMMA, pos - 1, pos)); break; } case '+': { - addToken(new Token(TokenKind.PLUS, pos - 1, pos)); + setToken(new Token(TokenKind.PLUS, pos - 1, pos)); break; } case '-': { - addToken(new Token(TokenKind.MINUS, pos - 1, pos)); + setToken(new Token(TokenKind.MINUS, pos - 1, pos)); break; } case '|': { - addToken(new Token(TokenKind.PIPE, pos - 1, pos)); + setToken(new Token(TokenKind.PIPE, pos - 1, pos)); break; } case '=': { - addToken(new Token(TokenKind.EQUALS, pos - 1, pos)); + setToken(new Token(TokenKind.EQUALS, pos - 1, pos)); break; } case '%': { - addToken(new Token(TokenKind.PERCENT, pos - 1, pos)); + setToken(new Token(TokenKind.PERCENT, pos - 1, pos)); break; } case '/': { if (lookaheadIs(0, '/') && lookaheadIs(1, '=')) { - addToken(new Token(TokenKind.SLASH_SLASH_EQUALS, pos - 1, pos + 2)); + setToken(new Token(TokenKind.SLASH_SLASH_EQUALS, pos - 1, pos + 2)); pos += 2; } else if (lookaheadIs(0, '/')) { - addToken(new Token(TokenKind.SLASH_SLASH, pos - 1, pos + 1)); + setToken(new Token(TokenKind.SLASH_SLASH, pos - 1, pos + 1)); pos += 1; } else { // /= is handled by tokenizeTwoChars. - addToken(new Token(TokenKind.SLASH, pos - 1, pos)); + setToken(new Token(TokenKind.SLASH, pos - 1, pos)); } break; } case ';': { - addToken(new Token(TokenKind.SEMI, pos - 1, pos)); + setToken(new Token(TokenKind.SEMI, pos - 1, pos)); break; } case '.': { - addToken(new Token(TokenKind.DOT, pos - 1, pos)); + setToken(new Token(TokenKind.DOT, pos - 1, pos)); break; } case '*': { - addToken(new Token(TokenKind.STAR, pos - 1, pos)); + setToken(new Token(TokenKind.STAR, pos - 1, pos)); break; } case ' ': @@ -836,7 +845,7 @@ public final class Lexer { } else if (lookaheadIs(0, '\r') && lookaheadIs(1, '\n')) { pos += 2; // skip the CRLF at the end of line } else { - addToken(new Token(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c))); + setToken(new Token(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c))); } break; } @@ -859,7 +868,7 @@ public final class Lexer { } case '\'': case '\"': { - addToken(stringLiteral(c, false)); + setToken(stringLiteral(c, false)); break; } default: { @@ -868,39 +877,35 @@ public final class Lexer { && (buffer[pos] == '\'' || buffer[pos] == '\"')) { c = buffer[pos]; pos++; - addToken(stringLiteral(c, true)); + setToken(stringLiteral(c, true)); break; } if (c >= '0' && c <= '9') { - addToken(integer()); + setToken(integer()); } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') { - addToken(identifierOrKeyword()); + setToken(identifierOrKeyword()); } else { error("invalid character: '" + c + "'"); } break; } // default } // switch - if (!tokens.isEmpty()) { // stop here if we scanned at least one token + if (token != null) { // stop here if we scanned a token return; } } // while if (indentStack.size() > 1) { // top of stack is always zero - addToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); + setToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); while (indentStack.size() > 1) { indentStack.pop(); - addToken(new Token(TokenKind.OUTDENT, pos - 1, pos)); + dents--; } + return; } - // Like Python, always end with a NEWLINE token, even if no '\n' in input: - if (lastToken == null || lastToken.kind != TokenKind.NEWLINE) { - addToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); - } - - addToken(new Token(TokenKind.EOF, pos, pos)); + setToken(new Token(TokenKind.EOF, pos, pos)); } /** diff --git a/src/test/java/com/google/devtools/build/lib/syntax/LexerTest.java b/src/test/java/com/google/devtools/build/lib/syntax/LexerTest.java index 14c7152f7f..ccb968b9e4 100644 --- a/src/test/java/com/google/devtools/build/lib/syntax/LexerTest.java +++ b/src/test/java/com/google/devtools/build/lib/syntax/LexerTest.java @@ -455,7 +455,7 @@ public class LexerTest { // foo ( bar , { 1 : "[0,3) [3,4) [4,7) [7,8) [9,10) [10,11) [11,12)" // 'quux' } , """b""" , r"" ) NEWLINE EOF - + " [13,19) [19,20) [20,21) [22,29) [29,30) [31,34) [34,35) [34,35) [35,35)"); + + " [13,19) [19,20) [20,21) [22,29) [29,30) [31,34) [34,35) [35,35) [35,35)"); } @Test |