// Copyright 2014 Google Inc. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.google.devtools.build.lib.syntax; import com.google.common.collect.ImmutableMap; import com.google.devtools.build.lib.concurrent.ThreadSafety.Immutable; import com.google.devtools.build.lib.events.Event; import com.google.devtools.build.lib.events.EventHandler; import com.google.devtools.build.lib.events.Location; import com.google.devtools.build.lib.util.Pair; import com.google.devtools.build.lib.vfs.Path; import com.google.devtools.build.lib.vfs.PathFragment; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Stack; /** * A tokenizer for the BUILD language. *
* Since BUILD files are small, we just tokenize the entire file a-priori
* instead of interleaving scanning with parsing.
*/
public final class Lexer {
private static final Map UNIX newlines are assumed (LF). Carriage returns are always ignored.
*
* ON ENTRY: 'pos' is the index of the char after '\n'.
* ON EXIT: 'pos' is the index of the next non-space char after '\n'.
*/
private void newline() {
  // The bracket depth decides how a line break is treated: while at least one
  // bracket is open the break is insignificant, otherwise it is handled at
  // statement level.
  final boolean insideBrackets = openParenStackDepth > 0;
  if (!insideBrackets) {
    newlineOutsideExpression(); // generate NEWLINE/INDENT/OUTDENT tokens
    return;
  }
  newlineInsideExpression(); // in an expression: ignore space
}
/**
 * Handles a line break that occurs while a bracket is open: no token is
 * produced, leading blanks of the next line are simply skipped.
 *
 * ON EXIT: 'pos' is the index of the first char that is not a space, tab or
 * carriage return, or buffer.length if there is none.
 */
private void newlineInsideExpression() {
  while (pos < buffer.length) {
    final char current = buffer[pos];
    final boolean blank = current == ' ' || current == '\t' || current == '\r';
    if (!blank) {
      return;
    }
    pos++;
  }
}
/**
 * Handles a line break at statement level: emits a NEWLINE token, measures the
 * indentation of the next line that is neither blank nor comment-only, and
 * emits INDENT/OUTDENT tokens when that indentation differs from the top of
 * 'indentStack'.
 *
 * Blank lines are discarded. A line holding only a comment yields a COMMENT
 * token and does not contribute to the indentation. A tab advances the
 * indentation to the next multiple of 8.
 *
 * ON ENTRY: 'pos' is the index of the char after '\n'.
 * ON EXIT: 'pos' is the index of the first char of the line whose indentation
 * was measured, or buffer.length if the input ended first.
 */
private void newlineOutsideExpression() {
  if (pos > 1) { // skip over newline at start of file
    addToken(new Token(TokenKind.NEWLINE, pos - 1, pos));
  }

  // We're in a stmt: suck up space at beginning of next line.
  int indentLen = 0;
  while (pos < buffer.length) {
    char c = buffer[pos];
    if (c == ' ') {
      indentLen++;
      pos++;
    } else if (c == '\t') {
      indentLen += 8 - indentLen % 8;
      pos++;
    } else if (c == '\n') { // entirely blank line: discard
      indentLen = 0;
      pos++;
    } else if (c == '#') { // line containing only indented comment
      int commentStart = pos;
      while (pos < buffer.length && buffer[pos] != '\n') {
        pos++;
      }
      // The comment ends at the '\n' (exclusive) or at the end of the buffer.
      // Recording the end before stepping over the newline keeps the final
      // character of a comment that is terminated by EOF instead of '\n'.
      int commentEnd = pos;
      if (pos < buffer.length) {
        pos++; // consume the terminating '\n'
      }
      addToken(new Token(TokenKind.COMMENT, commentStart, commentEnd,
          bufferSlice(commentStart, commentEnd)));
      indentLen = 0;
    } else { // printing character
      break;
    }
  }

  if (pos == buffer.length) { // trailing space on last line
    indentLen = 0;
  }

  int peekedIndent = indentStack.peek();
  if (peekedIndent < indentLen) { // push a level
    indentStack.push(indentLen);
    addToken(new Token(TokenKind.INDENT, pos - 1, pos));
  } else if (peekedIndent > indentLen) { // pop one or more levels
    while (peekedIndent > indentLen) {
      indentStack.pop();
      addToken(new Token(TokenKind.OUTDENT, pos - 1, pos));
      peekedIndent = indentStack.peek();
    }
    // After popping, the new level must match an enclosing level exactly.
    if (peekedIndent < indentLen) {
      error("indentation error");
    }
  }
}
/**
 * Checks whether the two chars at 'pos' and 'pos + 1' are both 'quot', i.e.
 * whether the current position is in the middle of a triple quote delimiter
 * (3 x quot). If so, 'pos' is advanced past those two chars; otherwise 'pos'
 * is left untouched.
 *
 * @param quot the quote character to look for
 * @return true if two 'quot' chars were found and skipped
 */
private boolean skipTripleQuote(char quot) {
  final boolean twoMoreQuotes =
      pos + 1 < buffer.length && buffer[pos] == quot && buffer[pos + 1] == quot;
  if (twoMoreQuotes) {
    pos += 2;
  }
  return twoMoreQuotes;
}
/**
* Scans a string literal delimited by 'quot', containing escape sequences.
* Escapes are expanded into a freshly built string, which becomes the value of
* the returned token.
*
* If two more 'quot' chars follow on entry, the literal is treated as
* triple-quoted: they are consumed, raw newlines are kept in the value, and
* only three consecutive 'quot' chars close the literal.
*
* Problems (unterminated literal, unimplemented escape) are reported through
* error(...); a STRING token is still returned in every case.
*
* ON ENTRY: 'pos' is 1 + the index of the first delimiter
* ON EXIT: 'pos' is 1 + the index of the last delimiter.
*
* @param quot the quote character that opened the literal
* @return the string-literal token.
*/
private Token escapedStringLiteral(char quot) {
// Consumes the 2nd and 3rd opening quotes when present.
boolean inTriplequote = skipTripleQuote(quot);
// Index of the last opening quote char; used as the token's start offset.
int oldPos = pos - 1;
// more expensive second choice that expands escaped into a buffer
StringBuilder literal = new StringBuilder();
while (pos < buffer.length) {
char c = buffer[pos];
pos++;
switch (c) {
case '\n':
if (inTriplequote) {
// Triple-quoted literals may span lines; keep the newline in the value.
literal.append(c);
break;
} else {
error("unterminated string literal at eol", oldPos, pos);
// newline() runs before the token is built, so the token's end offset
// is the position newline() leaves behind.
newline();
return new Token(TokenKind.STRING, oldPos, pos, literal.toString());
}
case '\\':
// A backslash as the very last char of the buffer cannot start an escape.
if (pos == buffer.length) {
error("unterminated string literal at eof", oldPos, pos);
return new Token(TokenKind.STRING, oldPos, pos, literal.toString());
}
// Read the char that follows the backslash.
c = buffer[pos];
pos++;
switch (c) {
case '\n':
// ignore end of line character
break;
case 'n':
literal.append('\n');
break;
case 'r':
literal.append('\r');
break;
case 't':
literal.append('\t');
break;
case '\\':
literal.append('\\');
break;
case '\'':
literal.append('\'');
break;
case '"':
literal.append('"');
break;
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7': { // octal escape
// One to three octal digits; each further digit is consumed only if it
// is in the range '0'..'7'.
int octal = c - '0';
if (pos < buffer.length) {
c = buffer[pos];
if (c >= '0' && c <= '7') {
pos++;
octal = (octal << 3) | (c - '0');
if (pos < buffer.length) {
c = buffer[pos];
if (c >= '0' && c <= '7') {
pos++;
octal = (octal << 3) | (c - '0');
}
}
}
}
// Only the low 8 bits of the octal value are kept.
literal.append((char) (octal & 0xff));
break;
}
case 'a': case 'b': case 'f': case 'N': case 'u': case 'U': case 'v': case 'x':
// exists in Python but not implemented in Blaze => error
// Nothing is appended to the literal for these; scanning continues.
error("escape sequence not implemented: \\" + c, oldPos, pos);
break;
default:
// unknown char escape => "\literal"
literal.append('\\');
literal.append(c);
break;
}
break;
case '\'':
case '"':
// In triple-quote mode skipTripleQuote() both tests for and consumes the
// two remaining closing quotes; if they are absent this quote is content.
if (c != quot
|| (inTriplequote && !skipTripleQuote(quot))) {
// Non-matching quote, treat it like a regular char.
literal.append(c);
} else {
// Matching close-delimiter, all done.
return new Token(TokenKind.STRING, oldPos, pos, literal.toString());
}
break;
default:
literal.append(c);
break;
}
}
// Ran off the end of the buffer without seeing the closing delimiter.
error("unterminated string literal at eof", oldPos, pos);
return new Token(TokenKind.STRING, oldPos, pos, literal.toString());
}
/**
 * Scans an identifier or keyword.
 *
 * The text is read by scanIdentifier() and classified by
 * getTokenKindForIdentfier(). Only tokens of kind IDENTIFIER carry the scanned
 * text as their value; every other kind gets a null value.
 *
 * ON ENTRY: 'pos' is 1 + the index of the first char in the identifier.
 * ON EXIT: 'pos' is wherever scanIdentifier() left it (expected: 1 + the index
 * of the last char in the identifier).
 *
 * @return the identifier or keyword token.
 */
private Token identifierOrKeyword() {
  final int start = pos - 1;
  final String text = scanIdentifier();
  final TokenKind kind = getTokenKindForIdentfier(text);

  final String value;
  if (kind == TokenKind.IDENTIFIER) {
    value = text;
  } else {
    value = null;
  }
  return new Token(kind, start, pos, value);
}
/**
 * Collects the text of an integer literal: the char at 'pos - 1' followed by
 * the longest run of decimal digits, hex letters (a-f, A-F) and 'x'/'X'.
 * No validation happens here; the caller decides whether the text is a
 * well-formed number.
 *
 * ON ENTRY: 'pos' is 1 + the index of the first char in the literal.
 * ON EXIT: 'pos' is 1 + the index of the last char in the literal.
 *
 * @return the literal's text
 */
private String scanInteger() {
  final int start = pos - 1;
  while (pos < buffer.length) {
    final char ch = buffer[pos];
    final boolean decimalDigit = ch >= '0' && ch <= '9';
    final boolean hexLetter = (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
    final boolean radixMarker = ch == 'x' || ch == 'X';
    if (!(decimalDigit || hexLetter || radixMarker)) {
      break;
    }
    pos++;
  }
  // TODO(bazel-team): (2009) to do roundtripping when we evaluate the integer
  // constants, we must save the actual text of the tokens, not just their
  // integer value.
  return bufferSlice(start, pos);
}
/**
 * Scans an integer literal and converts it to its numeric value.
 *
 * A "0x"/"0X" prefix selects base 16, any other leading "0" followed by more
 * chars selects base 8, everything else is base 10. If the digits cannot be
 * parsed in the selected base, an error is reported and the token's value is 0.
 *
 * ON ENTRY: 'pos' is 1 + the index of the first char in the literal.
 * ON EXIT: 'pos' is 1 + the index of the last char in the literal.
 *
 * @return the integer token.
 */
private Token integer() {
  final int start = pos - 1;
  final String literal = scanInteger();

  // Work out the base and how many prefix chars to drop before parsing.
  final int radix;
  final int prefixLength;
  if (literal.startsWith("0x") || literal.startsWith("0X")) {
    radix = 16;
    prefixLength = 2;
  } else if (literal.length() > 1 && literal.startsWith("0")) {
    radix = 8;
    prefixLength = 1;
  } else {
    radix = 10;
    prefixLength = 0;
  }
  final String digits = literal.substring(prefixLength);

  int value = 0;
  try {
    value = Integer.parseInt(digits, radix);
  } catch (NumberFormatException e) {
    error("invalid base-" + radix + " integer constant: " + literal);
  }
  return new Token(TokenKind.INT, start, pos, value);
}
/**
 * Tokenizes a two-char operator starting at 'pos': either a char followed by
 * '=' for which EQUAL_TOKENS has an entry, or two consecutive '*' chars.
 *
 * On success the token is added but 'pos' is NOT advanced; the caller is
 * responsible for skipping the two chars.
 *
 * @return true if it tokenized an operator
 */
private boolean tokenizeTwoChars() {
  // Both buffer[pos] and buffer[pos + 1] must exist. Requiring a third char
  // (pos + 2 < buffer.length) would wrongly reject an operator formed by the
  // final two chars of the buffer.
  if (pos + 1 >= buffer.length) {
    return false;
  }
  char c1 = buffer[pos];
  char c2 = buffer[pos + 1];
  TokenKind tok = null;
  if (c2 == '=') {
    tok = EQUAL_TOKENS.get(c1); // null when c1 does not combine with '='
  } else if (c2 == '*' && c1 == '*') {
    tok = TokenKind.STAR_STAR;
  }
  if (tok == null) {
    return false;
  }
  addToken(new Token(tok, pos, pos + 2));
  return true;
}
/**
* Performs tokenization of the character buffer of file contents provided to
* the constructor.
*
* Each iteration first tries a two-char operator via tokenizeTwoChars(); if
* that fails, one char is consumed and dispatched on. Once the buffer is
* exhausted, open indentation levels are closed with OUTDENT tokens, a final
* NEWLINE is guaranteed, and an EOF token is appended.
*/
private void tokenize() {
while (pos < buffer.length) {
// tokenizeTwoChars() adds the token but leaves 'pos' alone, so skip both
// chars here.
if (tokenizeTwoChars()) {
pos += 2;
continue;
}
// From here on 'pos' is 1 + the index of 'c', hence the 'pos - 1' offsets.
char c = buffer[pos];
pos++;
switch (c) {
// Opening brackets raise openParenStackDepth and closing brackets call
// popParen(); newline() checks the depth to decide how to treat a '\n'.
case '{': {
addToken(new Token(TokenKind.LBRACE, pos - 1, pos));
openParenStackDepth++;
break;
}
case '}': {
addToken(new Token(TokenKind.RBRACE, pos - 1, pos));
popParen();
break;
}
case '(': {
addToken(new Token(TokenKind.LPAREN, pos - 1, pos));
openParenStackDepth++;
break;
}
case ')': {
addToken(new Token(TokenKind.RPAREN, pos - 1, pos));
popParen();
break;
}
case '[': {
addToken(new Token(TokenKind.LBRACKET, pos - 1, pos));
openParenStackDepth++;
break;
}
case ']': {
addToken(new Token(TokenKind.RBRACKET, pos - 1, pos));
popParen();
break;
}
// Single-char operators and punctuation: one token each, no other state.
case '>': {
addToken(new Token(TokenKind.GREATER, pos - 1, pos));
break;
}
case '<': {
addToken(new Token(TokenKind.LESS, pos - 1, pos));
break;
}
case ':': {
addToken(new Token(TokenKind.COLON, pos - 1, pos));
break;
}
case ',': {
addToken(new Token(TokenKind.COMMA, pos - 1, pos));
break;
}
case '+': {
addToken(new Token(TokenKind.PLUS, pos - 1, pos));
break;
}
case '-': {
addToken(new Token(TokenKind.MINUS, pos - 1, pos));
break;
}
case '=': {
addToken(new Token(TokenKind.EQUALS, pos - 1, pos));
break;
}
case '%': {
addToken(new Token(TokenKind.PERCENT, pos - 1, pos));
break;
}
case ';': {
addToken(new Token(TokenKind.SEMI, pos - 1, pos));
break;
}
case '.': {
addToken(new Token(TokenKind.DOT, pos - 1, pos));
break;
}
case '*': {
addToken(new Token(TokenKind.STAR, pos - 1, pos));
break;
}
case ' ':
case '\t':
case '\r': {
/* ignore */
break;
}
case '\\': {
// Backslash character is valid only at the end of a line (or in a string)
// NOTE(review): the bound 'pos + 1 < buffer.length' requires at least one
// char after the '\n', so a backslash-newline pair that ends the buffer
// produces an ILLEGAL token. Confirm whether that is intended.
if (pos + 1 < buffer.length && buffer[pos] == '\n') {
pos++; // skip the end of line character
} else {
addToken(new Token(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c)));
}
break;
}
case '\n': {
newline();
break;
}
case '#': {
// A comment runs up to, but not including, the next '\n' (or to the end
// of the buffer). The '\n' itself is left for the next loop iteration.
int oldPos = pos - 1;
while (pos < buffer.length) {
c = buffer[pos];
if (c == '\n') {
break;
} else {
pos++;
}
}
addToken(new Token(TokenKind.COMMENT, oldPos, pos, bufferSlice(oldPos, pos)));
break;
}
case '\'':
case '\"': {
addToken(stringLiteral(c, false));
break;
}
default: {
// detect raw strings, e.g. r"str"
if (c == 'r' && pos < buffer.length
&& (buffer[pos] == '\'' || buffer[pos] == '\"')) {
// Step over the opening quote so stringLiteral() starts just after it.
c = buffer[pos];
pos++;
addToken(stringLiteral(c, true));
break;
}
if (Character.isDigit(c)) {
addToken(integer());
// '$' is accepted by isJavaIdentifierStart but must not start an identifier here.
} else if (Character.isJavaIdentifierStart(c) && c != '$') {
addToken(identifierOrKeyword());
} else {
// Some characters in Python are not recognized in Blaze syntax (e.g. '!')
// With 'parsePython' set they become ILLEGAL tokens instead of errors.
if (parsePython) {
addToken(new Token(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c)));
} else {
error("invalid character: '" + c + "'");
}
}
break;
} // default
} // switch
} // while
// Close any indentation levels still open at end of input: one NEWLINE, then
// one OUTDENT per level.
if (indentStack.size() > 1) { // top of stack is always zero
addToken(new Token(TokenKind.NEWLINE, pos - 1, pos));
while (indentStack.size() > 1) {
indentStack.pop();
addToken(new Token(TokenKind.OUTDENT, pos - 1, pos));
}
}
// Like Python, always end with a NEWLINE token, even if no '\n' in input:
// NOTE(review): if 'pos' is 0 here (e.g. empty input, assuming 'pos' starts
// at 0) the NEWLINE token gets a start offset of -1. Confirm that is harmless.
if (tokens.isEmpty() || tokens.get(tokens.size() - 1).kind != TokenKind.NEWLINE) {
addToken(new Token(TokenKind.NEWLINE, pos - 1, pos));
}
addToken(new Token(TokenKind.EOF, pos, pos));
}
/**
 * Looks up a single character of the input buffer. The index is used as-is,
 * without any range check of its own.
 *
 * @param at the position to get the character at.
 * @return the character at the given position.
 */
public char charAt(int at) {
  final char found = buffer[at];
  return found;
}
/**
* Returns the string at the current line, minus the new line.
*
* @param line the line from which to retrieve the String, 1-based
* @return the text of the line
*/
public String stringAtLine(int line) {
Pair
*
*
* @param isRaw if true, do not escape the string.
* @return the string-literal token.
*/
private Token stringLiteral(char quot, boolean isRaw) {
  // Scans a string literal delimited by 'quot'. This is the fast path: it
  // handles single-quoted literals without escape processing by slicing the
  // buffer directly, and hands everything else to escapedStringLiteral().
  //
  // ON ENTRY: 'pos' is 1 + the index of the opening delimiter.
  // ON EXIT: 'pos' is 1 + the index of the closing delimiter (or the position
  // reached when the literal turned out to be unterminated).
  int oldPos = pos - 1;

  // Don't even attempt to parse triple-quotes here. skipTripleQuote() is only
  // used as a probe; its advance is undone so escapedStringLiteral() sees the
  // same starting position.
  // NOTE(review): 'isRaw' is not forwarded, so a raw triple-quoted literal is
  // scanned with escape processing. Confirm whether that is intended.
  if (skipTripleQuote(quot)) {
    pos -= 2;
    return escapedStringLiteral(quot);
  }

  // first quick optimistic scan for a simple non-escaped string
  while (pos < buffer.length) {
    char c = buffer[pos++];
    switch (c) {
      case '\n': {
        error("unterminated string literal at eol", oldPos, pos);
        // Build the token before newline() moves 'pos' any further.
        Token t = new Token(TokenKind.STRING, oldPos, pos,
            bufferSlice(oldPos + 1, pos - 1));
        newline();
        return t;
      }
      case '\\':
        if (!isRaw) {
          // oops, hit an escape, need to start over & build a new string buffer
          pos = oldPos + 1;
          return escapedStringLiteral(quot);
        }
        // Raw string: skip the next character so an escaped quote does not
        // close the literal. Guard against a backslash that is the final char
        // of the buffer, which would otherwise push 'pos' past buffer.length
        // and hand an out-of-range end offset to bufferSlice() below.
        if (pos < buffer.length) {
          pos++;
        }
        break;
      case '\'':
      case '"':
        if (c == quot) {
          // close-quote, all done.
          return new Token(TokenKind.STRING, oldPos, pos,
              bufferSlice(oldPos + 1, pos - 1));
        }
        break; // the other kind of quote is ordinary content
      default:
        break;
    }
  }
  error("unterminated string literal at eof", oldPos, pos);
  return new Token(TokenKind.STRING, oldPos, pos,
      bufferSlice(oldPos + 1, pos));
}
private static final Map