Initial checkin of SkSL lexical analyzer generator (not actually in use yet).

Bug: skia: Change-Id: I283feb9ac1cef134588204107ea019ce657e37b3 Reviewed-on: https://skia-review.googlesource.com/42084 Commit-Queue: Ethan Nicholas <ethannicholas@google.com> Reviewed-by: Brian Salomon <bsalomon@google.com>
author: Ethan Nicholas <ethannicholas@google.com> 2017-09-06 16:59:14 -0400
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-09-06 21:19:18 +0000
commit: 3de0e496b5215bcf4b26db2d69dc84376dfd68d2 (patch)
tree: 5b340a6636dc0544109d623a968bd668abb23cb8 /src/sksl
parent: bb3dc768fd57956feb3947991011052b034dcb7a (diff)
13 files changed, 1241 insertions, 0 deletions
diff --git a/src/sksl/lex/DFA.h b/src/sksl/lex/DFA.h
new file mode 100644
index 0000000000..51e85ca2f8
--- /dev/null
+++ b/src/sksl/lex/DFA.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SKSL_DFA
+#define SKSL_DFA
+
+#include <string>
+#include <vector>
+
+/**
+ * Tables representing a deterministic finite automaton for matching regular expressions.
+ */
+struct DFA {
+    DFA(std::vector<std::vector<int>> transitions, std::vector<int> accepts)
+    : fTransitions(transitions)
+    , fAccepts(accepts) {}
+
+    static constexpr char END_CHAR = 126;
+
+    // one row per character, one column per state
+    std::vector<std::vector<int>> fTransitions;
+
+    // contains, for each state, the token id we should report when matching ends in that state (-1
+    // for no match)
+    std::vector<int> fAccepts;
+};
+
+#endif
diff --git a/src/sksl/lex/DFAState.h b/src/sksl/lex/DFAState.h
new file mode 100644
index 0000000000..b2c89fd4cb
--- /dev/null
+++ b/src/sksl/lex/DFAState.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SKSL_DFASTATE
+#define SKSL_DFASTATE
+
+struct DFAState {
+    struct Label {
+        std::vector<int> fStates;
+
+        Label(std::vector<int> states)
+        : fStates(std::move(states)) {}
+
+        bool operator==(const Label& other) const {
+            return fStates == other.fStates;
+        }
+
+        bool operator!=(const Label& other) const {
+            return !(*this == other);
+        }
+
+        std::string description() const {
+            std::string result = "<";
+            const char* separator = "";
+            for (int s : fStates) {
+                result += separator;
+                result += std::to_string(s);
+                separator = ", ";
+            }
+            result += ">";
+            return result;
+        }
+    };
+
+    DFAState()
+    : fId(-1)
+    , fLabel({}) {}
+
+    DFAState(int id, Label label)
+    : fId(id)
+    , fLabel(std::move(label)) {}
+
+    DFAState(const DFAState& other) = delete;
+
+    int fId;
+
+    Label fLabel;
+
+    bool fIsScanned = false;
+};
+
+namespace std {
+    template<> struct hash<DFAState::Label> {
+        size_t operator()(const DFAState::Label& s) const {
+            int result = 0;
+            for (int i : s.fStates) {
+                result = result * 101 + i;
+            }
+            return result;
+        }
+    };
+} // namespace
+
+#endif
diff --git a/src/sksl/lex/LexUtil.h b/src/sksl/lex/LexUtil.h
new file mode 100644
index 0000000000..d2c462b0ed
--- /dev/null
+++ b/src/sksl/lex/LexUtil.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SKSL_LEXUTIL
+#define SKSL_LEXUTIL
+
+#define ABORT(...) (fprintf(stderr, __VA_ARGS__), abort())
+#define ASSERT(x) (void)((x) || (ABORT("failed assert(%s): %s:%d\n", #x, __FILE__, __LINE__), 0))
+
+#endif
diff --git a/src/sksl/lex/Main.cpp b/src/sksl/lex/Main.cpp
new file mode 100644
index 0000000000..a5c7320356
--- /dev/null
+++ b/src/sksl/lex/Main.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "NFAtoDFA.h"
+#include "RegexParser.h"
+
+#include <fstream>
+#include <sstream>
+#include <string>
+
+/**
+ * Processes a .lex file and produces .h and .cpp files which implement a lexical analyzer. The .lex
+ * file is a text file with one token definition per line. Each line is of the form:
+ * <TOKEN_NAME> = <pattern>
+ * where <pattern> is either a regular expression (e.g [0-9]) or a double-quoted literal string.
+ */
+
+static constexpr const char* HEADER =
+    "/*\n"
+    " * Copyright 2017 Google Inc.\n"
+    " *\n"
+    " * Use of this source code is governed by a BSD-style license that can be\n"
+    " * found in the LICENSE file.\n"
+    " */\n"
+    "/*****************************************************************************************\n"
+    " ******************** This file was generated by sksllex. Do not edit. *******************\n"
+    " *****************************************************************************************/\n";
+
+void writeH(const DFA& dfa, const char* lexer, const char* token,
+            const std::vector<std::string>& tokens, const char* hPath) {
+    std::ofstream out(hPath);
+    ASSERT(out.good());
+    out << HEADER;
+    out << "#ifndef SKSL_" << lexer << "\n";
+    out << "#define SKSL_" << lexer << "\n";
+    out << "#include <cstddef>\n";
+    out << "#include <cstdint>\n";
+    out << "namespace SkSL {\n";
+    out << "\n";
+    out << "struct " << token << " {\n";
+    out << "    enum Kind {\n";
+    for (const std::string& t : tokens) {
+        out << "        #undef " << t << "\n";
+        out << "        " << t << ",\n";
+    }
+    out << "    };\n";
+    out << "\n";
+    out << "    " << token << "()\n";
+    out << "    : fKind(Kind::INVALID)\n";
+    out << "    , fOffset(-1)\n";
+    out << "    , fLength(-1) {}\n";
+    out << "\n";
+    out << "    " << token << "(Kind kind, int offset, int length)\n";
+    out << "    : fKind(kind)\n";
+    out << "    , fOffset(offset)\n";
+    out << "    , fLength(length) {}\n";
+    out << "\n";
+    out << "    Kind fKind;\n";
+    out << "    int fOffset;\n";
+    out << "    int fLength;\n";
+    out << "};\n";
+    out << "\n";
+    out << "class " << lexer << " {\n";
+    out << "public:\n";
+    out << "    void start(const char* text, size_t length) {\n";
+    out << "        fText = text;\n";
+    out << "        fLength = length;\n";
+    out << "        fOffset = 0;\n";
+    out << "    }\n";
+    out << "\n";
+    out << "    " << token << " next();\n";
+    out << "\n";
+    out << "private:\n";
+    out << "    const char* fText;\n";
+    out << "    int fLength;\n";
+    out << "    int fOffset;\n";
+    out << "};\n";
+    out << "\n";
+    out << "} // namespace\n";
+    out << "#endif\n";
+}
+
+void writeCPP(const DFA& dfa, const char* lexer, const char* token, const char* include,
+              const char* cppPath) {
+    std::ofstream out(cppPath);
+    ASSERT(out.good());
+    out << HEADER;
+    out << "#include \"" << include << "\"\n";
+    out << "\n";
+    out << "namespace SkSL {\n";
+    out << "\n";
+
+    size_t states = 0;
+    for (const auto& row : dfa.fTransitions) {
+        states = std::max(states, row.size());
+    }
+    out << "static int16_t transitions[" << DFA::END_CHAR + 1 << "][" << states << "] = {\n";
+    for (char c = 0; c <= DFA::END_CHAR; ++c) {
+        out << "    {";
+        for (size_t i = 0; i < states; ++i) {
+            if ((size_t) c < dfa.fTransitions.size() && i < dfa.fTransitions[c].size()) {
+                out << " " << dfa.fTransitions[c][i] << ",";
+            } else {
+                out << " 0,";
+            }
+        }
+        out << " },\n";
+    }
+    out << "};\n";
+    out << "\n";
+
+    out << "static int16_t accepts[" << states << "] = {";
+    for (size_t i = 0; i < states; ++i) {
+        if (i < dfa.fAccepts.size()) {
+            out << " " << dfa.fAccepts[i] << ",";
+        } else {
+            out << " -1,";
+        }
+    }
+    out << " };\n";
+    out << "\n";
+
+    out << token << " " << lexer << "::next() {\n";;
+    out << "    int startOffset = fOffset;\n";
+    out << "    if (startOffset == fLength) {\n";
+    out << "        return " << token << "(" << token << "::END_OF_FILE, startOffset, 0);\n";
+    out << "    }\n";
+    out << "    int offset = startOffset;\n";
+    out << "    int state = 1;\n";
+    out << "    " << token << "::Kind lastAccept = " << token << "::Kind::INVALID;\n";
+    out << "    int lastAcceptEnd = startOffset + 1;\n";
+    out << "    while (offset < fLength) {\n";
+    out << "        state = transitions[(int) fText[offset]][state];\n";
+    out << "        ++offset;\n";
+    out << "        if (!state) {\n";
+    out << "            break;\n";
+    out << "        }\n";
+    out << "        if (accepts[state]) {\n";
+    out << "            lastAccept = (" << token << "::Kind) accepts[state];\n";
+    out << "            lastAcceptEnd = offset;\n";
+    out << "        }\n";
+    out << "    }\n";
+    out << "    fOffset = lastAcceptEnd;\n";
+    out << "    return " << token << "(lastAccept, startOffset, lastAcceptEnd - startOffset);\n";
+    out << "}\n";
+    out << "\n";
+    out << "} // namespace\n";
+}
+
+void process(const char* inPath, const char* lexer, const char* token, const char* hPath,
+             const char* cppPath) {
+    NFA nfa;
+    std::vector<std::string> tokens;
+    tokens.push_back("END_OF_FILE");
+    std::string line;
+    std::ifstream in(inPath);
+    while (std::getline(in, line)) {
+        std::istringstream split(line);
+        std::string name, delimiter, pattern;
+        if (split >> name >> delimiter >> pattern) {
+            ASSERT(split.eof());
+            ASSERT(name != "");
+            ASSERT(delimiter == "=");
+            ASSERT(pattern != "");
+            tokens.push_back(name);
+            if (pattern[0] == '"') {
+                ASSERT(pattern.size() > 2 && pattern[pattern.size() - 1] == '"');
+                RegexNode node = RegexNode(RegexNode::kChar_Kind, pattern[1]);
+                for (size_t i = 2; i < pattern.size() - 1; ++i) {
+                    node = RegexNode(RegexNode::kConcat_Kind, node,
+                                     RegexNode(RegexNode::kChar_Kind, pattern[i]));
+                }
+                nfa.addRegex(node);
+            }
+            else {
+                nfa.addRegex(RegexParser().parse(pattern));
+            }
+        }
+    }
+    NFAtoDFA converter(&nfa);
+    DFA dfa = converter.convert();
+    writeH(dfa, lexer, token, tokens, hPath);
+    writeCPP(dfa, lexer, token, (std::string("SkSL") + lexer + ".h").c_str(), cppPath);
+}
+
+int main(int argc, const char** argv) {
+    if (argc != 6) {
+        printf("usage: sksllex <input.lex> <lexername> <tokenname> <output.h> <output.cpp>\n");
+        exit(1);
+    }
+    process(argv[1], argv[2], argv[3], argv[4], argv[5]);
+    return 0;
+}
diff --git a/src/sksl/lex/NFA.cpp b/src/sksl/lex/NFA.cpp
new file mode 100644
index 0000000000..d1eb7497b3
--- /dev/null
+++ b/src/sksl/lex/NFA.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "NFA.h"
+
+int NFA::match(std::string s) const {
+    std::vector<int> states = fStartStates;
+    for (size_t i = 0; i < s.size(); ++i) {
+        std::vector<int> next;
+        for (int id : states) {
+            if (fStates[id].accept(s[i])) {
+                for (int nextId : fStates[id].fNext) {
+                    if (fStates[nextId].fKind != NFAState::kRemapped_Kind) {
+                        next.push_back(nextId);
+                    } else {
+                        next.insert(next.end(), fStates[nextId].fData.begin(),
+                                    fStates[nextId].fData.end());
+                    }
+                }
+            }
+        }
+        if (!next.size()) {
+            return -1;
+        }
+        states = next;
+    }
+    int accept = -1;
+    for (int id : states) {
+        if (fStates[id].fKind == NFAState::kAccept_Kind) {
+            int result = fStates[id].fData[0];
+            if (accept == -1 || result < accept) {
+                accept = result;
+            }
+        }
+    }
+    return accept;
+}
diff --git a/src/sksl/lex/NFA.h b/src/sksl/lex/NFA.h
new file mode 100644
index 0000000000..e454d37f2a
--- /dev/null
+++ b/src/sksl/lex/NFA.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SKSL_NFA
+#define SKSL_NFA
+
+#include "NFAState.h"
+#include "RegexNode.h"
+
+/**
+ * A nondeterministic finite automaton for matching regular expressions. The NFA is initialized with
+ * a number of regular expressions, and then matches a string against all of them simultaneously.
+ */
+struct NFA {
+    /**
+     * Adds a new regular expression to the set of expressions matched by this automaton, returning
+     * its index.
+     */
+    int addRegex(const RegexNode& regex) {
+        std::vector<int> accept;
+        // we reserve token 0 for END_OF_FILE, so this starts at 1
+        accept.push_back(this->addState(NFAState(++fRegexCount)));
+        std::vector<int> startStates = regex.createStates(this, accept);
+        fStartStates.insert(fStartStates.end(), startStates.begin(), startStates.end());
+        return fStartStates.size() - 1;
+    }
+
+    /**
+     * Adds a new state to the NFA, returning its index.
+     */
+    int addState(NFAState s) {
+        fStates.push_back(std::move(s));
+        return fStates.size() - 1;
+    }
+
+    /**
+     * Matches a string against all of the regexes added to this NFA. Returns the index of the first
+     * (in addRegex order) matching expression, or -1 if no match. This is relatively slow and used
+     * only for debugging purposes; the NFA should be converted to a DFA before actual use.
+     */
+    int match(std::string s) const;
+
+    int fRegexCount = 0;
+
+    std::vector<NFAState> fStates;
+
+    std::vector<int> fStartStates;
+};
+
+#endif
diff --git a/src/sksl/lex/NFAState.h b/src/sksl/lex/NFAState.h
new file mode 100644
index 0000000000..5923507571
--- /dev/null
+++ b/src/sksl/lex/NFAState.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SKSL_NFASTATE
+#define SKSL_NFASTATE
+
+#include <string>
+#include <vector>
+
+#include "LexUtil.h"
+
+struct NFAState {
+    enum Kind {
+        // represents an accept state - if the NFA ends up in this state, we have successfully
+        // matched the token indicated by fData[0]
+        kAccept_Kind,
+        // matches the single character fChar
+        kChar_Kind,
+        // the regex '.'; matches any char but '\n'
+        kDot_Kind,
+        // a state which serves as a placeholder for the states indicated in fData. When we
+        // transition to this state, we instead transition to all of the fData states.
+        kRemapped_Kind,
+        // contains a list of true/false values in fData. fData[c] tells us whether we accept the
+        // character c.
+        kTable_Kind
+    };
+
+    NFAState(Kind kind, std::vector<int> next)
+    : fKind(kind)
+    , fNext(std::move(next)) {}
+
+    NFAState(char c, std::vector<int> next)
+    : fKind(kChar_Kind)
+    , fChar(c)
+    , fNext(std::move(next)) {}
+
+    NFAState(std::vector<int> states)
+    : fKind(kRemapped_Kind)
+    , fData(std::move(states)) {}
+
+    NFAState(bool inverse, std::vector<bool> accepts, std::vector<int> next)
+    : fKind(kTable_Kind)
+    , fInverse(inverse)
+    , fNext(std::move(next)) {
+        for (bool b : accepts) {
+            fData.push_back(b);
+        }
+    }
+
+    NFAState(int token)
+    : fKind(kAccept_Kind) {
+        fData.push_back(token);
+    }
+
+    bool accept(char c) const {
+        switch (fKind) {
+            case kAccept_Kind:
+                return false;
+            case kChar_Kind:
+                return c == fChar;
+            case kDot_Kind:
+                return c != '\n';
+            case kTable_Kind: {
+                bool value;
+                if ((size_t) c < fData.size()) {
+                    value = fData[c];
+                } else {
+                    value = false;
+                }
+                return value != fInverse;
+            }
+            default:
+                ABORT("unreachable");
+        }
+    }
+
+    std::string description() const {
+        switch (fKind) {
+            case kAccept_Kind:
+                return "Accept(" + std::to_string(fData[0]) + ")";
+            case kChar_Kind: {
+                std::string result = "Char('" + std::string(1, fChar) + "'";
+                for (int v : fNext) {
+                    result += ", ";
+                    result += std::to_string(v);
+                }
+                result += ")";
+                return result;
+            }
+            case kDot_Kind: {
+                std::string result = "Dot(";
+                const char* separator = "";
+                for (int v : fNext) {
+                    result += separator;
+                    result += std::to_string(v);
+                    separator = ", ";
+                }
+                result += ")";
+                return result;
+            }
+            case kRemapped_Kind: {
+                std::string result = "Remapped(";
+                const char* separator = "";
+                for (int v : fData) {
+                    result += separator;
+                    result += std::to_string(v);
+                    separator = ", ";
+                }
+                result += ")";
+                return result;
+            }
+            case kTable_Kind: {
+                std::string result = std::string("Table(") + (fInverse ? "true" : "false") + ", [";
+                const char* separator = "";
+                for (int v : fData) {
+                    result += separator;
+                    result += v ? "true" : "false";
+                    separator = ", ";
+                }
+                result += "]";
+                for (int n : fNext) {
+                    result += ", ";
+                    result += std::to_string(n);
+                }
+                result += ")";
+                return result;
+            }
+            default:
+                ABORT("unreachable");
+        }
+    }
+
+    Kind fKind;
+
+    char fChar = 0;
+
+    bool fInverse = false;
+
+    std::vector<int> fData;
+
+    // states we transition to upon a succesful match from this state
+    std::vector<int> fNext;
+};
+
+#endif
diff --git a/src/sksl/lex/NFAtoDFA.h b/src/sksl/lex/NFAtoDFA.h
new file mode 100644
index 0000000000..214c6d12d6
--- /dev/null
+++ b/src/sksl/lex/NFAtoDFA.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "DFA.h"
+#include "DFAState.h"
+#include "NFA.h"
+#include "NFAState.h"
+
+#include <algorithm>
+#include <climits>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+/**
+ * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and
+ * DFAs differ only in that an NFA allows multiple states at the same time, we can find each
+ * possible combination of simultaneous NFA states and give this combination a label. These labelled
+ * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time.
+ *
+ * As an NFA can end up in multiple accept states at the same time (for instance, the token "while"
+ * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex
+ * (in terms of the order in which they were added to the NFA).
+ */
+class NFAtoDFA {
+public:
+    NFAtoDFA(NFA* nfa)
+    : fNFA(*nfa) {}
+
+    /**
+     * Returns a DFA created from the NFA.
+     */
+    DFA convert() {
+        // create state 0, the "reject" state
+        getState(DFAState::Label({}));
+        // create a state representing being in all of the NFA's start states at once
+        std::vector<int> startStates = fNFA.fStartStates;
+        std::sort(startStates.begin(), startStates.end());
+        // this becomes state 1, our start state
+        DFAState* start = getState(DFAState::Label(startStates));
+        this->scanState(start);
+
+        int stateCount = 0;
+        for (const auto& row : fTransitions) {
+            stateCount = std::max(stateCount, (int) row.size());
+        }
+        return DFA(fTransitions, fAccepts);
+    }
+
+private:
+    /**
+     * Returns an existing state with the given label, or creates a new one and returns it.
+     */
+    DFAState* getState(DFAState::Label label) {
+        auto found = fStates.find(label);
+        if (found == fStates.end()) {
+            int id = fStates.size();
+            fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label));
+            return fStates[label].get();
+        }
+        return found->second.get();
+    }
+
+    void add(int nfaState, std::vector<int>* states) {
+        NFAState state = fNFA.fStates[nfaState];
+        if (state.fKind == NFAState::kRemapped_Kind) {
+            for (int next : state.fData) {
+                this->add(next, states);
+            }
+        } else {
+            for (int state : *states) {
+                if (nfaState == state) {
+                    return;
+                }
+            }
+            states->push_back(nfaState);
+        }
+    }
+
+    void addTransition(char c, int start, int next) {
+        while (fTransitions.size() <= (size_t) c) {
+            fTransitions.push_back(std::vector<int>());
+        }
+        std::vector<int>& row = fTransitions[c];
+        while (row.size() <= (size_t) start) {
+            row.push_back(-1);
+        }
+        row[start] = next;
+    }
+
+    void scanState(DFAState* state) {
+        state->fIsScanned = true;
+        for (char c = 9; c <= DFA::END_CHAR; ++c) {
+            std::vector<int> next;
+            int bestAccept = INT_MAX;
+            for (int idx : state->fLabel.fStates) {
+                const NFAState& nfaState = fNFA.fStates[idx];
+                if (nfaState.accept(c)) {
+                    for (int nextState : nfaState.fNext) {
+                        if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) {
+                            bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]);
+                        }
+                        this->add(nextState, &next);
+                    }
+                }
+            }
+            std::sort(next.begin(), next.end());
+            DFAState* nextState = this->getState(DFAState::Label(next));
+            this->addTransition(c, state->fId, nextState->fId);
+            if (bestAccept != INT_MAX) {
+                while (fAccepts.size() <= (size_t) nextState->fId) {
+                    fAccepts.push_back(-1);
+                }
+                fAccepts[nextState->fId] = bestAccept;
+            }
+            if (!nextState->fIsScanned) {
+                this->scanState(nextState);
+            }
+        }
+    }
+
+    const NFA& fNFA;
+    std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates;
+    std::vector<std::vector<int>> fTransitions;
+    std::vector<int> fAccepts;
+};
diff --git a/src/sksl/lex/RegexNode.cpp b/src/sksl/lex/RegexNode.cpp
new file mode 100644
index 0000000000..8915c60c44
--- /dev/null
+++ b/src/sksl/lex/RegexNode.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "RegexNode.h"
+
+#include "NFA.h"
+
+std::vector<int> RegexNode::createStates(NFA* nfa, const std::vector<int>& accept) const {
+    std::vector<int> result;
+    switch (fKind) {
+        case kChar_Kind:
+            result.push_back(nfa->addState(NFAState(fPayload.fChar, accept)));
+            break;
+        case kCharset_Kind: {
+            std::vector<bool> chars;
+            for (const RegexNode& child : fChildren) {
+                if (child.fKind == kChar_Kind) {
+                    while (chars.size() <= (size_t) child.fPayload.fChar) {
+                        chars.push_back(false);
+                    }
+                    chars[child.fPayload.fChar] = true;
+                } else {
+                    ASSERT(child.fKind == kRange_Kind);
+                    while (chars.size() <= (size_t) child.fChildren[1].fPayload.fChar) {
+                        chars.push_back(false);
+                    }
+                    for (char c = child.fChildren[0].fPayload.fChar;
+                         c <= child.fChildren[1].fPayload.fChar;
+                         ++c) {
+                        chars[c] = true;
+                    }
+                }
+            }
+            result.push_back(nfa->addState(NFAState(fPayload.fBool, chars, accept)));
+            break;
+        }
+        case kConcat_Kind: {
+            std::vector<int> right = fChildren[1].createStates(nfa, accept);
+            result = fChildren[0].createStates(nfa, right);
+            break;
+        }
+        case kDot_Kind:
+            result.push_back(nfa->addState(NFAState(NFAState::kDot_Kind, accept)));
+            break;
+        case kOr_Kind: {
+            std::vector<int> states = fChildren[0].createStates(nfa, accept);
+            result.insert(result.end(), states.begin(), states.end());
+            states = fChildren[1].createStates(nfa, accept);
+            result.insert(result.end(), states.begin(), states.end());
+            break;
+        }
+        case kPlus_Kind: {
+            std::vector<int> next = accept;
+            std::vector<int> placeholder;
+            int id = nfa->addState(NFAState(placeholder));
+            next.push_back(id);
+            result = fChildren[0].createStates(nfa, next);
+            nfa->fStates[id] = NFAState(result);
+            break;
+        }
+        case kQuestion_Kind:
+            result = fChildren[0].createStates(nfa, accept);
+            result.insert(result.end(), accept.begin(), accept.end());
+            break;
+        case kRange_Kind:
+            ABORT("unreachable");
+        case kStar_Kind: {
+            std::vector<int> next = accept;
+            std::vector<int> placeholder;
+            int id = nfa->addState(NFAState(placeholder));
+            next.push_back(id);
+            result = fChildren[0].createStates(nfa, next);
+            result.insert(result.end(), accept.begin(), accept.end());
+            nfa->fStates[id] = NFAState(result);
+            break;
+        }
+    }
+    return result;
+}
+
+std::string RegexNode::description() const {
+    switch (fKind) {
+        case kChar_Kind:
+            return std::string(1, fPayload.fChar);
+        case kCharset_Kind: {
+            std::string result("[");
+            if (fPayload.fBool) {
+                result += "^";
+            }
+            for (const RegexNode& c : fChildren) {
+                result += c.description();
+            }
+            result += "]";
+            return result;
+        }
+        case kConcat_Kind:
+            return fChildren[0].description() + fChildren[1].description();
+        case kDot_Kind:
+            return ".";
+        case kOr_Kind:
+            return "(" + fChildren[0].description() + "|" + fChildren[1].description() + ")";
+        case kPlus_Kind:
+            return fChildren[0].description() + "+";
+        case kQuestion_Kind:
+            return fChildren[0].description() + "?";
+        case kRange_Kind:
+            return fChildren[0].description() + "-" + fChildren[1].description();
+        case kStar_Kind:
+            return fChildren[0].description() + "*";
+        default:
+            return "<" + std::to_string(fKind) + ">";
+    }
+}
diff --git a/src/sksl/lex/RegexNode.h b/src/sksl/lex/RegexNode.h
new file mode 100644
index 0000000000..e3aa65b3d1
--- /dev/null
+++ b/src/sksl/lex/RegexNode.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SKSL_REGEXNODE
+#define SKSL_REGEXNODE
+
+#include <string>
+#include <vector>
+
+struct NFA;
+
+/**
+ * Represents a node in the parse tree of a regular expression.
+ */
+struct RegexNode {
+    enum Kind {
+        kChar_Kind,
+        kCharset_Kind,
+        kConcat_Kind,
+        kDot_Kind,
+        kOr_Kind,
+        kPlus_Kind,
+        kRange_Kind,
+        kQuestion_Kind,
+        kStar_Kind
+    };
+
+    RegexNode(Kind kind)
+    : fKind(kind) {}
+
+    RegexNode(Kind kind, char payload)
+    : fKind(kind) {
+        fPayload.fChar = payload;
+    }
+
+    RegexNode(Kind kind, const char* children)
+    : fKind(kind) {
+        fPayload.fBool = false;
+        while (*children != '\0') {
+            fChildren.emplace_back(kChar_Kind, *children);
+            ++children;
+        }
+    }
+
+    RegexNode(Kind kind, RegexNode child)
+    : fKind(kind) {
+        fChildren.push_back(std::move(child));
+    }
+
+    RegexNode(Kind kind, RegexNode child1, RegexNode child2)
+    : fKind(kind) {
+        fChildren.push_back(std::move(child1));
+        fChildren.push_back(std::move(child2));
+    }
+
+    /**
+     * Creates NFA states for this node, with a successful match against this node resulting in a
+     * transition to all of the states in the accept vector.
+     */
+    std::vector<int> createStates(NFA* nfa, const std::vector<int>& accept) const;
+
+    std::string description() const;
+
+    Kind fKind;
+
+    union Payload {
+        char fChar;
+        bool fBool;
+    } fPayload;
+
+    std::vector<RegexNode> fChildren;
+};
+
+#endif
diff --git a/src/sksl/lex/RegexParser.cpp b/src/sksl/lex/RegexParser.cpp
new file mode 100644
index 0000000000..154df159db
--- /dev/null
+++ b/src/sksl/lex/RegexParser.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "RegexParser.h"
+
+#include "LexUtil.h"
+
+RegexNode RegexParser::parse(std::string source) {
+    fSource = source;
+    fIndex = 0;
+    ASSERT(fStack.size() == 0);
+    this->regex();
+    ASSERT(fStack.size() == 1);
+    ASSERT(fIndex == source.size());
+    return this->pop();
+}
+
+char RegexParser::peek() {
+    if (fIndex >= fSource.size()) {
+        return END;
+    }
+    return fSource[fIndex];
+}
+
+void RegexParser::expect(char c) {
+    if (this->peek() != c) {
+        printf("expected '%c' at index %d, but found '%c'", c, (int) fIndex, this->peek());
+        exit(1);
+    }
+    ++fIndex;
+}
+
+RegexNode RegexParser::pop() {
+    RegexNode result = fStack.top();
+    fStack.pop();
+    return result;
+}
+
+void RegexParser::term() {
+    switch (this->peek()) {
+        case '(': this->group();  break;
+        case '[': this->set();    break;
+        case '.': this->dot();    break;
+        default: this->literal();
+    }
+}
+
+void RegexParser::quantifiedTerm() {
+    this->term();
+    switch (this->peek()) {
+        case '*': fStack.push(RegexNode(RegexNode::kStar_Kind,     this->pop())); ++fIndex; break;
+        case '+': fStack.push(RegexNode(RegexNode::kPlus_Kind,     this->pop())); ++fIndex; break;
+        case '?': fStack.push(RegexNode(RegexNode::kQuestion_Kind, this->pop())); ++fIndex; break;
+        default:  break;
+    }
+}
+
+void RegexParser::sequence() {
+    this->quantifiedTerm();
+    for (;;) {
+        switch (this->peek()) {
+            case END: // fall through
+            case '|': // fall through
+            case ')': return;
+            default:
+                this->sequence();
+                RegexNode right = this->pop();
+                RegexNode left = this->pop();
+                fStack.emplace(RegexNode::kConcat_Kind, std::move(left), std::move(right));
+        }
+    }
+}
+
+RegexNode RegexParser::escapeSequence(char c) {
+    switch (c) {
+        case 'n': return RegexNode(RegexNode::kChar_Kind, '\n');
+        case 'r': return RegexNode(RegexNode::kChar_Kind, '\r');
+        case 't': return RegexNode(RegexNode::kChar_Kind, '\t');
+        case 's': return RegexNode(RegexNode::kCharset_Kind, " \t\n\r");
+        default:  return RegexNode(RegexNode::kChar_Kind, c);
+    }
+}
+
+void RegexParser::literal() {
+    char c = this->peek();
+    if (c == '\\') {
+        ++fIndex;
+        fStack.push(this->escapeSequence(peek()));
+        ++fIndex;
+    }
+    else {
+        fStack.push(RegexNode(RegexNode::kChar_Kind, c));
+        ++fIndex;
+    }
+}
+
+void RegexParser::dot() {
+    this->expect('.');
+    fStack.push(RegexNode(RegexNode::kDot_Kind));
+}
+
+void RegexParser::group() {
+    this->expect('(');
+    this->regex();
+    this->expect(')');
+}
+
+void RegexParser::setItem() {
+    this->literal();
+    if (this->peek() == '-') {
+        ++fIndex;
+        if (peek() == ']') {
+            fStack.push(RegexNode(RegexNode::kChar_Kind, '-'));
+        }
+        else {
+            literal();
+            RegexNode end = this->pop();
+            ASSERT(end.fKind == RegexNode::kChar_Kind);
+            RegexNode start = this->pop();
+            ASSERT(start.fKind == RegexNode::kChar_Kind);
+            fStack.push(RegexNode(RegexNode::kRange_Kind, std::move(start), std::move(end)));
+        }
+    }
+}
+
+void RegexParser::set() {
+    expect('[');
+    size_t depth = fStack.size();
+    RegexNode set(RegexNode::kCharset_Kind);
+    if (this->peek() == '^') {
+        ++fIndex;
+        set.fPayload.fBool = true;
+    }
+    else {
+        set.fPayload.fBool = false;
+    }
+    for (;;) {
+        switch (this->peek()) {
+            case ']':
+                ++fIndex;
+                while (fStack.size() > depth) {
+                    set.fChildren.push_back(this->pop());
+                }
+                fStack.push(std::move(set));
+                return;
+            case END:
+                printf("unterminated character set\n");
+                exit(1);
+            default:
+                this->setItem();
+        }
+    }
+}
+
+void RegexParser::regex() {
+    this->sequence();
+    switch (this->peek()) {
+        case '|': {
+            ++fIndex;
+            this->regex();
+            RegexNode right = this->pop();
+            RegexNode left = this->pop();
+            fStack.push(RegexNode(RegexNode::kOr_Kind, left, right));
+            break;
+        }
+        case END: // fall through
+        case ')':
+            return;
+        default:
+            ASSERT(false);
+    }
+}
diff --git a/src/sksl/lex/RegexParser.h b/src/sksl/lex/RegexParser.h
new file mode 100644
index 0000000000..7de546fc17
--- /dev/null
+++ b/src/sksl/lex/RegexParser.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SKSL_REGEXPARSER
+#define SKSL_REGEXPARSER
+
+#include "RegexNode.h"
+
+#include <stack>
+#include <string>
+
+/**
+ * Turns a simple regular expression into a parse tree. The regular expression syntax supports only
+ * the basic quantifiers ('*', '+', and '?'), alternation ('|'), character sets ('[a-z]'), and
+ * groups ('()').
+ */
+class RegexParser {
+public:
+    RegexNode parse(std::string source);
+
+private:
+    static constexpr char END = '\0';
+
+    char peek();
+
+    void expect(char c);
+
+    RegexNode pop();
+
+    /**
+     * Matches a char literal, parenthesized group, character set, or dot ('.').
+     */
+    void term();
+
+    /**
+     * Matches a term followed by an optional quantifier ('*', '+', or '?').
+     */
+    void quantifiedTerm();
+
+    /**
+     * Matches a sequence of quantifiedTerms.
+     */
+    void sequence();
+
+    /**
+     * Returns a node representing the given escape character (e.g. escapeSequence('n') returns a
+     * node which matches a newline character).
+     */
+    RegexNode escapeSequence(char c);
+
+    /**
+     * Matches a literal character or escape sequence.
+     */
+    void literal();
+
+    /**
+     * Matches a dot ('.').
+     */
+    void dot();
+
+    /**
+     * Matches a parenthesized group.
+     */
+    void group();
+
+    /**
+     * Matches a literal character, escape sequence, or character range from a character set.
+     */
+    void setItem();
+
+    /**
+     * Matches a character set.
+     */
+    void set();
+
+    void regex();
+
+    std::string fSource;
+
+    size_t fIndex;
+
+    std::stack<RegexNode> fStack;
+};
+
+#endif
diff --git a/src/sksl/lex/sksl.lex b/src/sksl/lex/sksl.lex
new file mode 100644
index 0000000000..ada5e248de
--- /dev/null
+++ b/src/sksl/lex/sksl.lex
@@ -0,0 +1,95 @@
+FLOAT_LITERAL  = [0-9]*\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+\.[0-9]*([eE][+-]?[0-9]+)?|[0-9]+([eE][+-]?[0-9]+)
+INT_LITERAL    = [0-9]+|0x[0-9a-fA-F]+
+TRUE_LITERAL   = "true"
+FALSE_LITERAL  = "false"
+IF             = "if"
+STATIC_IF      = "@if"
+ELSE           = "else"
+FOR            = "for"
+WHILE          = "while"
+DO             = "do"
+SWITCH         = "switch"
+STATIC_SWITCH  = "@switch"
+CASE           = "case"
+DEFAULT        = "default"
+BREAK          = "break"
+CONTINUE       = "continue"
+DISCARD        = "discard"
+RETURN         = "return"
+IN             = "in"
+OUT            = "out"
+INOUT          = "inout"
+UNIFORM        = "uniform"
+CONST          = "const"
+LOWP           = "lowp"
+MEDIUMP        = "mediump"
+HIGHP          = "highp"
+FLAT           = "flat"
+NOPERSPECTIVE  = "noperspective"
+READONLY       = "readonly"
+WRITEONLY      = "writeonly"
+COHERENT       = "coherent"
+VOLATILE       = "volatile"
+RESTRICT       = "restrict"
+BUFFER         = "buffer"
+HASSIDEEFFECTS = "sk_has_side_effects"
+STRUCT         = "struct"
+LAYOUT         = "layout"
+PRECISION      = "precision"
+IDENTIFIER     = [a-zA-Z_$]([0-9]|[a-zA-Z_$])*
+DIRECTIVE      = #[a-zA-Z_$]([0-9]|[a-zA-Z_$])*
+SECTION        = @[a-zA-Z_$]([0-9]|[a-zA-Z_$])*
+LPAREN         = "("
+RPAREN         = ")"
+LBRACE         = "{"
+RBRACE         = "}"
+LBRACKET       = "["
+RBRACKET       = "]"
+DOT            = "."
+COMMA          = ","
+PLUSPLUS       = "++"
+MINUSMINUS     = "--"
+PLUS           = "+"
+MINUS          = "-"
+STAR           = "*"
+SLASH          = "/"
+PERCENT        = "%"
+SHL            = "<<"
+SHR            = ">>"
+BITWISEOR      = "|"
+BITWISEXOR     = "^"
+BITWISEAND     = "&"
+BITWISENOT     = "~"
+LOGICALOR      = "||"
+LOGICALXOR     = "^^"
+LOGICALAND     = "&&"
+LOGICALNOT     = "!"
+QUESTION       = "?"
+COLON          = ":"
+EQ             = "="
+EQEQ           = "=="
+NEQ            = "!="
+GT             = ">"
+LT             = "<"
+GTEQ           = ">="
+LTEQ           = "<="
+PLUSEQ         = "+="
+MINUSEQ        = "-="
+STAREQ         = "*="
+SLASHEQ        = "/="
+PERCENTEQ      = "%="
+SHLEQ          = "<<="
+SHREQ          = ">>="
+BITWISEOREQ    = "|="
+BITWISEXOREQ   = "^="
+BITWISEANDEQ   = "&="
+LOGICALOREQ    = "||="
+LOGICALXOREQ   = "^^="
+LOGICALANDEQ   = "&&="
+SEMICOLON      = ";"
+ARROW          = "->"
+COLONCOLON     = "::"
+WHITESPACE     = \s+
+LINE_COMMENT   = //.*
+BLOCK_COMMENT  = /\*([^*]|\*[^/])*\*/
+INVALID        = .
+\ No newline at end of file
author	Ethan Nicholas <ethannicholas@google.com>	2017-09-06 16:59:14 -0400
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2017-09-06 21:19:18 +0000
commit	3de0e496b5215bcf4b26db2d69dc84376dfd68d2 (patch)
tree	5b340a6636dc0544109d623a968bd668abb23cb8 /src/sksl
parent	bb3dc768fd57956feb3947991011052b034dcb7a (diff)