diff options
Diffstat (limited to 'src/main/java/com/google/devtools/build/lib/query2/engine/Lexer.java')
-rw-r--r-- | src/main/java/com/google/devtools/build/lib/query2/engine/Lexer.java | 281 |
1 files changed, 281 insertions, 0 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/query2/engine/Lexer.java b/src/main/java/com/google/devtools/build/lib/query2/engine/Lexer.java new file mode 100644 index 0000000000..45b6f6183e --- /dev/null +++ b/src/main/java/com/google/devtools/build/lib/query2/engine/Lexer.java @@ -0,0 +1,281 @@ +// Copyright 2014 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package com.google.devtools.build.lib.query2.engine; + +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * A tokenizer for the Blaze query language, revision 2. + * + * Note, we can avoid a lot of quoting by noting that the characters [() ,] do + * not appear in any label, filename, function name, or regular expression we care about. + * + * No string escapes are allowed ("\"). Given the domain, that's not currently + * a problem. + */ +final class Lexer { + + /** + * Discriminator for different kinds of tokens. + */ + public enum TokenKind { + WORD("word"), + EOF("EOF"), + + COMMA(","), + EQUALS("="), + LPAREN("("), + MINUS("-"), + PLUS("+"), + RPAREN(")"), + CARET("^"), + + __ALL_IDENTIFIERS_FOLLOW(""), // See below + + IN("in"), + LET("let"), + SET("set"), + + INTERSECT("intersect"), + EXCEPT("except"), + UNION("union"); + + private final String prettyName; + + private TokenKind(String prettyName) { + this.prettyName = prettyName; + } + + public String getPrettyName() { + return prettyName; + } + } + + public static final Set<TokenKind> BINARY_OPERATORS = EnumSet.of( + TokenKind.INTERSECT, + TokenKind.CARET, + TokenKind.UNION, + TokenKind.PLUS, + TokenKind.EXCEPT, + TokenKind.MINUS); + + private static final Map<String, TokenKind> keywordMap = new HashMap<>(); + static { + for (TokenKind kind : EnumSet.allOf(TokenKind.class)) { + if (kind.ordinal() > TokenKind.__ALL_IDENTIFIERS_FOLLOW.ordinal()) { + keywordMap.put(kind.getPrettyName(), kind); + } + } + } + + /** + * Returns true iff 'word' is a reserved word of the language. + */ + static boolean isReservedWord(String word) { + return keywordMap.containsKey(word); + } + + /** + * Tokens returned by the Lexer. + */ + static class Token { + + public final TokenKind kind; + public final String word; + + Token(TokenKind kind) { + this.kind = kind; + this.word = null; + } + + Token(String word) { + this.kind = TokenKind.WORD; + this.word = word; + } + + @Override + public String toString() { + return kind == TokenKind.WORD ? word : kind.getPrettyName(); + } + } + + /** + * Entry point to the lexer. Returns the list of tokens for the specified + * input, or throws QueryException. + */ + public static List<Token> scan(char[] buffer) throws QueryException { + Lexer lexer = new Lexer(buffer); + lexer.tokenize(); + return lexer.tokens; + } + + // Input buffer and position + private char[] buffer; + private int pos; + + private final List<Token> tokens = new ArrayList<>(); + + private Lexer(char[] buffer) { + this.buffer = buffer; + this.pos = 0; + } + + private void addToken(Token s) { + tokens.add(s); + } + + /** + * Scans a quoted word delimited by 'quot'. + * + * ON ENTRY: 'pos' is 1 + the index of the first delimiter + * ON EXIT: 'pos' is 1 + the index of the last delimiter. + * + * @return the word token. + */ + private Token quotedWord(char quot) throws QueryException { + int oldPos = pos - 1; + while (pos < buffer.length) { + char c = buffer[pos++]; + switch (c) { + case '\'': + case '"': + if (c == quot) { + // close-quote, all done. + return new Token(bufferSlice(oldPos + 1, pos - 1)); + } + } + } + throw new QueryException("unclosed quotation"); + } + + private TokenKind getTokenKindForWord(String word) { + TokenKind kind = keywordMap.get(word); + return kind == null ? TokenKind.WORD : kind; + } + + // Unquoted words may contain [-*$], but not start with them. For user convenience, unquoted + // words must include UNIX filenames, labels and target label patterns, and simple regexps + // (e.g. cc_.*). Keep consistent with TargetLiteral.toString()! + private String scanWord() { + int oldPos = pos - 1; + while (pos < buffer.length) { + switch (buffer[pos]) { + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + case '*': case '/': case '@': case '.': case '-': case '_': + case ':': case '$': + pos++; + break; + default: + return bufferSlice(oldPos, pos); + } + } + return bufferSlice(oldPos, pos); + } + + /** + * Scans a word or keyword. + * + * ON ENTRY: 'pos' is 1 + the index of the first char in the word. + * ON EXIT: 'pos' is 1 + the index of the last char in the word. + * + * @return the word or keyword token. + */ + private Token wordOrKeyword() { + String word = scanWord(); + TokenKind kind = getTokenKindForWord(word); + return kind == TokenKind.WORD ? new Token(word) : new Token(kind); + } + + /** + * Performs tokenization of the character buffer of file contents provided to + * the constructor. + */ + private void tokenize() throws QueryException { + while (pos < buffer.length) { + char c = buffer[pos]; + pos++; + switch (c) { + case '(': { + addToken(new Token(TokenKind.LPAREN)); + break; + } + case ')': { + addToken(new Token(TokenKind.RPAREN)); + break; + } + case ',': { + addToken(new Token(TokenKind.COMMA)); + break; + } + case '+': { + addToken(new Token(TokenKind.PLUS)); + break; + } + case '-': { + addToken(new Token(TokenKind.MINUS)); + break; + } + case '=': { + addToken(new Token(TokenKind.EQUALS)); + break; + } + case '^': { + addToken(new Token(TokenKind.CARET)); + break; + } + case '\n': + case ' ': + case '\t': + case '\r': { + /* ignore */ + break; + } + case '\'': + case '\"': { + addToken(quotedWord(c)); + break; + } + default: { + addToken(wordOrKeyword()); + break; + } // default + } // switch + } // while + + addToken(new Token(TokenKind.EOF)); + + this.buffer = null; // release buffer now that we have our tokens + } + + private String bufferSlice(int start, int end) { + return new String(this.buffer, start, end - start); + } + +} |