diff options
author | Ethan Nicholas <ethannicholas@google.com> | 2017-09-07 09:39:50 -0400 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-09-07 13:57:51 +0000 |
commit | ca82a922e3d081ae83c940f4fd3ccc8cc6f0916f (patch) | |
tree | 480d95845bf2ce5e24d26ff30ab96cdeb3ee6315 /src/sksl/lex | |
parent | e5756eccc41fabc421a7fe8b00807c3dcd257b9f (diff) |
Revert "Revert "Initial checkin of SkSL lexical analyzer generator (not actually in use yet).""
This reverts commit 3ed4781ee1bd5af9b3ee2623e5356e86ce36e812.
Bug: skia:
Change-Id: If0de7ca17c4da8000d3526a73b71be6ee34ce060
Reviewed-on: https://skia-review.googlesource.com/43561
Reviewed-by: Ethan Nicholas <ethannicholas@google.com>
Commit-Queue: Ethan Nicholas <ethannicholas@google.com>
Diffstat (limited to 'src/sksl/lex')
-rw-r--r-- | src/sksl/lex/DFA.h | 32 | ||||
-rw-r--r-- | src/sksl/lex/DFAState.h | 68 | ||||
-rw-r--r-- | src/sksl/lex/LexUtil.h | 14 | ||||
-rw-r--r-- | src/sksl/lex/Main.cpp | 197 | ||||
-rw-r--r-- | src/sksl/lex/NFA.cpp | 41 | ||||
-rw-r--r-- | src/sksl/lex/NFA.h | 54 | ||||
-rw-r--r-- | src/sksl/lex/NFAState.h | 150 | ||||
-rw-r--r-- | src/sksl/lex/NFAtoDFA.h | 130 | ||||
-rw-r--r-- | src/sksl/lex/RegexNode.cpp | 117 | ||||
-rw-r--r-- | src/sksl/lex/RegexNode.h | 78 | ||||
-rw-r--r-- | src/sksl/lex/RegexParser.cpp | 176 | ||||
-rw-r--r-- | src/sksl/lex/RegexParser.h | 89 | ||||
-rw-r--r-- | src/sksl/lex/sksl.lex | 95 |
13 files changed, 1241 insertions, 0 deletions
diff --git a/src/sksl/lex/DFA.h b/src/sksl/lex/DFA.h new file mode 100644 index 0000000000..51e85ca2f8 --- /dev/null +++ b/src/sksl/lex/DFA.h @@ -0,0 +1,32 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SKSL_DFA +#define SKSL_DFA + +#include <string> +#include <vector> + +/** + * Tables representing a deterministic finite automaton for matching regular expressions. + */ +struct DFA { + DFA(std::vector<std::vector<int>> transitions, std::vector<int> accepts) + : fTransitions(transitions) + , fAccepts(accepts) {} + + static constexpr char END_CHAR = 126; + + // one row per character, one column per state + std::vector<std::vector<int>> fTransitions; + + // contains, for each state, the token id we should report when matching ends in that state (-1 + // for no match) + std::vector<int> fAccepts; +}; + +#endif diff --git a/src/sksl/lex/DFAState.h b/src/sksl/lex/DFAState.h new file mode 100644 index 0000000000..b2c89fd4cb --- /dev/null +++ b/src/sksl/lex/DFAState.h @@ -0,0 +1,68 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SKSL_DFASTATE +#define SKSL_DFASTATE + +struct DFAState { + struct Label { + std::vector<int> fStates; + + Label(std::vector<int> states) + : fStates(std::move(states)) {} + + bool operator==(const Label& other) const { + return fStates == other.fStates; + } + + bool operator!=(const Label& other) const { + return !(*this == other); + } + + std::string description() const { + std::string result = "<"; + const char* separator = ""; + for (int s : fStates) { + result += separator; + result += std::to_string(s); + separator = ", "; + } + result += ">"; + return result; + } + }; + + DFAState() + : fId(-1) + , fLabel({}) {} + + DFAState(int id, Label label) + : fId(id) + , fLabel(std::move(label)) {} + + DFAState(const DFAState& other) = delete; + + int fId; + + Label fLabel; + + bool fIsScanned = false; +}; + +namespace std { + template<> struct hash<DFAState::Label> { + size_t operator()(const DFAState::Label& s) const { + int result = 0; + for (int i : s.fStates) { + result = result * 101 + i; + } + return result; + } + }; +} // namespace + +#endif diff --git a/src/sksl/lex/LexUtil.h b/src/sksl/lex/LexUtil.h new file mode 100644 index 0000000000..d2c462b0ed --- /dev/null +++ b/src/sksl/lex/LexUtil.h @@ -0,0 +1,14 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SKSL_LEXUTIL +#define SKSL_LEXUTIL + +#define ABORT(...) (fprintf(stderr, __VA_ARGS__), abort()) +#define ASSERT(x) (void)((x) || (ABORT("failed assert(%s): %s:%d\n", #x, __FILE__, __LINE__), 0)) + +#endif diff --git a/src/sksl/lex/Main.cpp b/src/sksl/lex/Main.cpp new file mode 100644 index 0000000000..a5c7320356 --- /dev/null +++ b/src/sksl/lex/Main.cpp @@ -0,0 +1,197 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "NFAtoDFA.h" +#include "RegexParser.h" + +#include <fstream> +#include <sstream> +#include <string> + +/** + * Processes a .lex file and produces .h and .cpp files which implement a lexical analyzer. The .lex + * file is a text file with one token definition per line. Each line is of the form: + * <TOKEN_NAME> = <pattern> + * where <pattern> is either a regular expression (e.g [0-9]) or a double-quoted literal string. + */ + +static constexpr const char* HEADER = + "/*\n" + " * Copyright 2017 Google Inc.\n" + " *\n" + " * Use of this source code is governed by a BSD-style license that can be\n" + " * found in the LICENSE file.\n" + " */\n" + "/*****************************************************************************************\n" + " ******************** This file was generated by sksllex. Do not edit. *******************\n" + " *****************************************************************************************/\n"; + +void writeH(const DFA& dfa, const char* lexer, const char* token, + const std::vector<std::string>& tokens, const char* hPath) { + std::ofstream out(hPath); + ASSERT(out.good()); + out << HEADER; + out << "#ifndef SKSL_" << lexer << "\n"; + out << "#define SKSL_" << lexer << "\n"; + out << "#include <cstddef>\n"; + out << "#include <cstdint>\n"; + out << "namespace SkSL {\n"; + out << "\n"; + out << "struct " << token << " {\n"; + out << " enum Kind {\n"; + for (const std::string& t : tokens) { + out << " #undef " << t << "\n"; + out << " " << t << ",\n"; + } + out << " };\n"; + out << "\n"; + out << " " << token << "()\n"; + out << " : fKind(Kind::INVALID)\n"; + out << " , fOffset(-1)\n"; + out << " , fLength(-1) {}\n"; + out << "\n"; + out << " " << token << "(Kind kind, int offset, int length)\n"; + out << " : fKind(kind)\n"; + out << " , fOffset(offset)\n"; + out << " , fLength(length) {}\n"; + out << "\n"; + out << " Kind fKind;\n"; + out << " int fOffset;\n"; + out << " int fLength;\n"; + out << "};\n"; + out << "\n"; + out << "class " << lexer << " {\n"; + out << "public:\n"; + out << " void start(const char* text, size_t length) {\n"; + out << " fText = text;\n"; + out << " fLength = length;\n"; + out << " fOffset = 0;\n"; + out << " }\n"; + out << "\n"; + out << " " << token << " next();\n"; + out << "\n"; + out << "private:\n"; + out << " const char* fText;\n"; + out << " int fLength;\n"; + out << " int fOffset;\n"; + out << "};\n"; + out << "\n"; + out << "} // namespace\n"; + out << "#endif\n"; +} + +void writeCPP(const DFA& dfa, const char* lexer, const char* token, const char* include, + const char* cppPath) { + std::ofstream out(cppPath); + ASSERT(out.good()); + out << HEADER; + out << "#include \"" << include << "\"\n"; + out << "\n"; + out << "namespace SkSL {\n"; + out << "\n"; + + size_t states = 0; + for (const auto& row : dfa.fTransitions) { + states = std::max(states, row.size()); + } + out << "static int16_t transitions[" << DFA::END_CHAR + 1 << "][" << states << "] = {\n"; + for (char c = 0; c <= DFA::END_CHAR; ++c) { + out << " {"; + for (size_t i = 0; i < states; ++i) { + if ((size_t) c < dfa.fTransitions.size() && i < dfa.fTransitions[c].size()) { + out << " " << dfa.fTransitions[c][i] << ","; + } else { + out << " 0,"; + } + } + out << " },\n"; + } + out << "};\n"; + out << "\n"; + + out << "static int16_t accepts[" << states << "] = {"; + for (size_t i = 0; i < states; ++i) { + if (i < dfa.fAccepts.size()) { + out << " " << dfa.fAccepts[i] << ","; + } else { + out << " -1,"; + } + } + out << " };\n"; + out << "\n"; + + out << token << " " << lexer << "::next() {\n";; + out << " int startOffset = fOffset;\n"; + out << " if (startOffset == fLength) {\n"; + out << " return " << token << "(" << token << "::END_OF_FILE, startOffset, 0);\n"; + out << " }\n"; + out << " int offset = startOffset;\n"; + out << " int state = 1;\n"; + out << " " << token << "::Kind lastAccept = " << token << "::Kind::INVALID;\n"; + out << " int lastAcceptEnd = startOffset + 1;\n"; + out << " while (offset < fLength) {\n"; + out << " state = transitions[(int) fText[offset]][state];\n"; + out << " ++offset;\n"; + out << " if (!state) {\n"; + out << " break;\n"; + out << " }\n"; + out << " if (accepts[state]) {\n"; + out << " lastAccept = (" << token << "::Kind) accepts[state];\n"; + out << " lastAcceptEnd = offset;\n"; + out << " }\n"; + out << " }\n"; + out << " fOffset = lastAcceptEnd;\n"; + out << " return " << token << "(lastAccept, startOffset, lastAcceptEnd - startOffset);\n"; + out << "}\n"; + out << "\n"; + out << "} // namespace\n"; +} + +void process(const char* inPath, const char* lexer, const char* token, const char* hPath, + const char* cppPath) { + NFA nfa; + std::vector<std::string> tokens; + tokens.push_back("END_OF_FILE"); + std::string line; + std::ifstream in(inPath); + while (std::getline(in, line)) { + std::istringstream split(line); + std::string name, delimiter, pattern; + if (split >> name >> delimiter >> pattern) { + ASSERT(split.eof()); + ASSERT(name != ""); + ASSERT(delimiter == "="); + ASSERT(pattern != ""); + tokens.push_back(name); + if (pattern[0] == '"') { + ASSERT(pattern.size() > 2 && pattern[pattern.size() - 1] == '"'); + RegexNode node = RegexNode(RegexNode::kChar_Kind, pattern[1]); + for (size_t i = 2; i < pattern.size() - 1; ++i) { + node = RegexNode(RegexNode::kConcat_Kind, node, + RegexNode(RegexNode::kChar_Kind, pattern[i])); + } + nfa.addRegex(node); + } + else { + nfa.addRegex(RegexParser().parse(pattern)); + } + } + } + NFAtoDFA converter(&nfa); + DFA dfa = converter.convert(); + writeH(dfa, lexer, token, tokens, hPath); + writeCPP(dfa, lexer, token, (std::string("SkSL") + lexer + ".h").c_str(), cppPath); +} + +int main(int argc, const char** argv) { + if (argc != 6) { + printf("usage: sksllex <input.lex> <lexername> <tokenname> <output.h> <output.cpp>\n"); + exit(1); + } + process(argv[1], argv[2], argv[3], argv[4], argv[5]); + return 0; +} diff --git a/src/sksl/lex/NFA.cpp b/src/sksl/lex/NFA.cpp new file mode 100644 index 0000000000..d1eb7497b3 --- /dev/null +++ b/src/sksl/lex/NFA.cpp @@ -0,0 +1,41 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "NFA.h" + +int NFA::match(std::string s) const { + std::vector<int> states = fStartStates; + for (size_t i = 0; i < s.size(); ++i) { + std::vector<int> next; + for (int id : states) { + if (fStates[id].accept(s[i])) { + for (int nextId : fStates[id].fNext) { + if (fStates[nextId].fKind != NFAState::kRemapped_Kind) { + next.push_back(nextId); + } else { + next.insert(next.end(), fStates[nextId].fData.begin(), + fStates[nextId].fData.end()); + } + } + } + } + if (!next.size()) { + return -1; + } + states = next; + } + int accept = -1; + for (int id : states) { + if (fStates[id].fKind == NFAState::kAccept_Kind) { + int result = fStates[id].fData[0]; + if (accept == -1 || result < accept) { + accept = result; + } + } + } + return accept; +} diff --git a/src/sksl/lex/NFA.h b/src/sksl/lex/NFA.h new file mode 100644 index 0000000000..e454d37f2a --- /dev/null +++ b/src/sksl/lex/NFA.h @@ -0,0 +1,54 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SKSL_NFA +#define SKSL_NFA + +#include "NFAState.h" +#include "RegexNode.h" + +/** + * A nondeterministic finite automaton for matching regular expressions. The NFA is initialized with + * a number of regular expressions, and then matches a string against all of them simultaneously. + */ +struct NFA { + /** + * Adds a new regular expression to the set of expressions matched by this automaton, returning + * its index. + */ + int addRegex(const RegexNode& regex) { + std::vector<int> accept; + // we reserve token 0 for END_OF_FILE, so this starts at 1 + accept.push_back(this->addState(NFAState(++fRegexCount))); + std::vector<int> startStates = regex.createStates(this, accept); + fStartStates.insert(fStartStates.end(), startStates.begin(), startStates.end()); + return fStartStates.size() - 1; + } + + /** + * Adds a new state to the NFA, returning its index. + */ + int addState(NFAState s) { + fStates.push_back(std::move(s)); + return fStates.size() - 1; + } + + /** + * Matches a string against all of the regexes added to this NFA. Returns the index of the first + * (in addRegex order) matching expression, or -1 if no match. This is relatively slow and used + * only for debugging purposes; the NFA should be converted to a DFA before actual use. + */ + int match(std::string s) const; + + int fRegexCount = 0; + + std::vector<NFAState> fStates; + + std::vector<int> fStartStates; +}; + +#endif diff --git a/src/sksl/lex/NFAState.h b/src/sksl/lex/NFAState.h new file mode 100644 index 0000000000..5923507571 --- /dev/null +++ b/src/sksl/lex/NFAState.h @@ -0,0 +1,150 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SKSL_NFASTATE +#define SKSL_NFASTATE + +#include <string> +#include <vector> + +#include "LexUtil.h" + +struct NFAState { + enum Kind { + // represents an accept state - if the NFA ends up in this state, we have successfully + // matched the token indicated by fData[0] + kAccept_Kind, + // matches the single character fChar + kChar_Kind, + // the regex '.'; matches any char but '\n' + kDot_Kind, + // a state which serves as a placeholder for the states indicated in fData. When we + // transition to this state, we instead transition to all of the fData states. + kRemapped_Kind, + // contains a list of true/false values in fData. fData[c] tells us whether we accept the + // character c. + kTable_Kind + }; + + NFAState(Kind kind, std::vector<int> next) + : fKind(kind) + , fNext(std::move(next)) {} + + NFAState(char c, std::vector<int> next) + : fKind(kChar_Kind) + , fChar(c) + , fNext(std::move(next)) {} + + NFAState(std::vector<int> states) + : fKind(kRemapped_Kind) + , fData(std::move(states)) {} + + NFAState(bool inverse, std::vector<bool> accepts, std::vector<int> next) + : fKind(kTable_Kind) + , fInverse(inverse) + , fNext(std::move(next)) { + for (bool b : accepts) { + fData.push_back(b); + } + } + + NFAState(int token) + : fKind(kAccept_Kind) { + fData.push_back(token); + } + + bool accept(char c) const { + switch (fKind) { + case kAccept_Kind: + return false; + case kChar_Kind: + return c == fChar; + case kDot_Kind: + return c != '\n'; + case kTable_Kind: { + bool value; + if ((size_t) c < fData.size()) { + value = fData[c]; + } else { + value = false; + } + return value != fInverse; + } + default: + ABORT("unreachable"); + } + } + + std::string description() const { + switch (fKind) { + case kAccept_Kind: + return "Accept(" + std::to_string(fData[0]) + ")"; + case kChar_Kind: { + std::string result = "Char('" + std::string(1, fChar) + "'"; + for (int v : fNext) { + result += ", "; + result += std::to_string(v); + } + result += ")"; + return result; + } + case kDot_Kind: { + std::string result = "Dot("; + const char* separator = ""; + for (int v : fNext) { + result += separator; + result += std::to_string(v); + separator = ", "; + } + result += ")"; + return result; + } + case kRemapped_Kind: { + std::string result = "Remapped("; + const char* separator = ""; + for (int v : fData) { + result += separator; + result += std::to_string(v); + separator = ", "; + } + result += ")"; + return result; + } + case kTable_Kind: { + std::string result = std::string("Table(") + (fInverse ? "true" : "false") + ", ["; + const char* separator = ""; + for (int v : fData) { + result += separator; + result += v ? "true" : "false"; + separator = ", "; + } + result += "]"; + for (int n : fNext) { + result += ", "; + result += std::to_string(n); + } + result += ")"; + return result; + } + default: + ABORT("unreachable"); + } + } + + Kind fKind; + + char fChar = 0; + + bool fInverse = false; + + std::vector<int> fData; + + // states we transition to upon a succesful match from this state + std::vector<int> fNext; +}; + +#endif diff --git a/src/sksl/lex/NFAtoDFA.h b/src/sksl/lex/NFAtoDFA.h new file mode 100644 index 0000000000..214c6d12d6 --- /dev/null +++ b/src/sksl/lex/NFAtoDFA.h @@ -0,0 +1,130 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "DFA.h" +#include "DFAState.h" +#include "NFA.h" +#include "NFAState.h" + +#include <algorithm> +#include <climits> +#include <memory> +#include <unordered_map> +#include <vector> + +/** + * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and + * DFAs differ only in that an NFA allows multiple states at the same time, we can find each + * possible combination of simultaneous NFA states and give this combination a label. These labelled + * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time. + * + * As an NFA can end up in multiple accept states at the same time (for instance, the token "while" + * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex + * (in terms of the order in which they were added to the NFA). + */ +class NFAtoDFA { +public: + NFAtoDFA(NFA* nfa) + : fNFA(*nfa) {} + + /** + * Returns a DFA created from the NFA. + */ + DFA convert() { + // create state 0, the "reject" state + getState(DFAState::Label({})); + // create a state representing being in all of the NFA's start states at once + std::vector<int> startStates = fNFA.fStartStates; + std::sort(startStates.begin(), startStates.end()); + // this becomes state 1, our start state + DFAState* start = getState(DFAState::Label(startStates)); + this->scanState(start); + + int stateCount = 0; + for (const auto& row : fTransitions) { + stateCount = std::max(stateCount, (int) row.size()); + } + return DFA(fTransitions, fAccepts); + } + +private: + /** + * Returns an existing state with the given label, or creates a new one and returns it. + */ + DFAState* getState(DFAState::Label label) { + auto found = fStates.find(label); + if (found == fStates.end()) { + int id = fStates.size(); + fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label)); + return fStates[label].get(); + } + return found->second.get(); + } + + void add(int nfaState, std::vector<int>* states) { + NFAState state = fNFA.fStates[nfaState]; + if (state.fKind == NFAState::kRemapped_Kind) { + for (int next : state.fData) { + this->add(next, states); + } + } else { + for (int state : *states) { + if (nfaState == state) { + return; + } + } + states->push_back(nfaState); + } + } + + void addTransition(char c, int start, int next) { + while (fTransitions.size() <= (size_t) c) { + fTransitions.push_back(std::vector<int>()); + } + std::vector<int>& row = fTransitions[c]; + while (row.size() <= (size_t) start) { + row.push_back(-1); + } + row[start] = next; + } + + void scanState(DFAState* state) { + state->fIsScanned = true; + for (char c = 9; c <= DFA::END_CHAR; ++c) { + std::vector<int> next; + int bestAccept = INT_MAX; + for (int idx : state->fLabel.fStates) { + const NFAState& nfaState = fNFA.fStates[idx]; + if (nfaState.accept(c)) { + for (int nextState : nfaState.fNext) { + if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) { + bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]); + } + this->add(nextState, &next); + } + } + } + std::sort(next.begin(), next.end()); + DFAState* nextState = this->getState(DFAState::Label(next)); + this->addTransition(c, state->fId, nextState->fId); + if (bestAccept != INT_MAX) { + while (fAccepts.size() <= (size_t) nextState->fId) { + fAccepts.push_back(-1); + } + fAccepts[nextState->fId] = bestAccept; + } + if (!nextState->fIsScanned) { + this->scanState(nextState); + } + } + } + + const NFA& fNFA; + std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates; + std::vector<std::vector<int>> fTransitions; + std::vector<int> fAccepts; +}; diff --git a/src/sksl/lex/RegexNode.cpp b/src/sksl/lex/RegexNode.cpp new file mode 100644 index 0000000000..8915c60c44 --- /dev/null +++ b/src/sksl/lex/RegexNode.cpp @@ -0,0 +1,117 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "RegexNode.h" + +#include "NFA.h" + +std::vector<int> RegexNode::createStates(NFA* nfa, const std::vector<int>& accept) const { + std::vector<int> result; + switch (fKind) { + case kChar_Kind: + result.push_back(nfa->addState(NFAState(fPayload.fChar, accept))); + break; + case kCharset_Kind: { + std::vector<bool> chars; + for (const RegexNode& child : fChildren) { + if (child.fKind == kChar_Kind) { + while (chars.size() <= (size_t) child.fPayload.fChar) { + chars.push_back(false); + } + chars[child.fPayload.fChar] = true; + } else { + ASSERT(child.fKind == kRange_Kind); + while (chars.size() <= (size_t) child.fChildren[1].fPayload.fChar) { + chars.push_back(false); + } + for (char c = child.fChildren[0].fPayload.fChar; + c <= child.fChildren[1].fPayload.fChar; + ++c) { + chars[c] = true; + } + } + } + result.push_back(nfa->addState(NFAState(fPayload.fBool, chars, accept))); + break; + } + case kConcat_Kind: { + std::vector<int> right = fChildren[1].createStates(nfa, accept); + result = fChildren[0].createStates(nfa, right); + break; + } + case kDot_Kind: + result.push_back(nfa->addState(NFAState(NFAState::kDot_Kind, accept))); + break; + case kOr_Kind: { + std::vector<int> states = fChildren[0].createStates(nfa, accept); + result.insert(result.end(), states.begin(), states.end()); + states = fChildren[1].createStates(nfa, accept); + result.insert(result.end(), states.begin(), states.end()); + break; + } + case kPlus_Kind: { + std::vector<int> next = accept; + std::vector<int> placeholder; + int id = nfa->addState(NFAState(placeholder)); + next.push_back(id); + result = fChildren[0].createStates(nfa, next); + nfa->fStates[id] = NFAState(result); + break; + } + case kQuestion_Kind: + result = fChildren[0].createStates(nfa, accept); + result.insert(result.end(), accept.begin(), accept.end()); + break; + case kRange_Kind: + ABORT("unreachable"); + case kStar_Kind: { + std::vector<int> next = accept; + std::vector<int> placeholder; + int id = nfa->addState(NFAState(placeholder)); + next.push_back(id); + result = fChildren[0].createStates(nfa, next); + result.insert(result.end(), accept.begin(), accept.end()); + nfa->fStates[id] = NFAState(result); + break; + } + } + return result; +} + +std::string RegexNode::description() const { + switch (fKind) { + case kChar_Kind: + return std::string(1, fPayload.fChar); + case kCharset_Kind: { + std::string result("["); + if (fPayload.fBool) { + result += "^"; + } + for (const RegexNode& c : fChildren) { + result += c.description(); + } + result += "]"; + return result; + } + case kConcat_Kind: + return fChildren[0].description() + fChildren[1].description(); + case kDot_Kind: + return "."; + case kOr_Kind: + return "(" + fChildren[0].description() + "|" + fChildren[1].description() + ")"; + case kPlus_Kind: + return fChildren[0].description() + "+"; + case kQuestion_Kind: + return fChildren[0].description() + "?"; + case kRange_Kind: + return fChildren[0].description() + "-" + fChildren[1].description(); + case kStar_Kind: + return fChildren[0].description() + "*"; + default: + return "<" + std::to_string(fKind) + ">"; + } +} diff --git a/src/sksl/lex/RegexNode.h b/src/sksl/lex/RegexNode.h new file mode 100644 index 0000000000..e3aa65b3d1 --- /dev/null +++ b/src/sksl/lex/RegexNode.h @@ -0,0 +1,78 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SKSL_REGEXNODE +#define SKSL_REGEXNODE + +#include <string> +#include <vector> + +struct NFA; + +/** + * Represents a node in the parse tree of a regular expression. + */ +struct RegexNode { + enum Kind { + kChar_Kind, + kCharset_Kind, + kConcat_Kind, + kDot_Kind, + kOr_Kind, + kPlus_Kind, + kRange_Kind, + kQuestion_Kind, + kStar_Kind + }; + + RegexNode(Kind kind) + : fKind(kind) {} + + RegexNode(Kind kind, char payload) + : fKind(kind) { + fPayload.fChar = payload; + } + + RegexNode(Kind kind, const char* children) + : fKind(kind) { + fPayload.fBool = false; + while (*children != '\0') { + fChildren.emplace_back(kChar_Kind, *children); + ++children; + } + } + + RegexNode(Kind kind, RegexNode child) + : fKind(kind) { + fChildren.push_back(std::move(child)); + } + + RegexNode(Kind kind, RegexNode child1, RegexNode child2) + : fKind(kind) { + fChildren.push_back(std::move(child1)); + fChildren.push_back(std::move(child2)); + } + + /** + * Creates NFA states for this node, with a successful match against this node resulting in a + * transition to all of the states in the accept vector. + */ + std::vector<int> createStates(NFA* nfa, const std::vector<int>& accept) const; + + std::string description() const; + + Kind fKind; + + union Payload { + char fChar; + bool fBool; + } fPayload; + + std::vector<RegexNode> fChildren; +}; + +#endif diff --git a/src/sksl/lex/RegexParser.cpp b/src/sksl/lex/RegexParser.cpp new file mode 100644 index 0000000000..154df159db --- /dev/null +++ b/src/sksl/lex/RegexParser.cpp @@ -0,0 +1,176 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "RegexParser.h" + +#include "LexUtil.h" + +RegexNode RegexParser::parse(std::string source) { + fSource = source; + fIndex = 0; + ASSERT(fStack.size() == 0); + this->regex(); + ASSERT(fStack.size() == 1); + ASSERT(fIndex == source.size()); + return this->pop(); +} + +char RegexParser::peek() { + if (fIndex >= fSource.size()) { + return END; + } + return fSource[fIndex]; +} + +void RegexParser::expect(char c) { + if (this->peek() != c) { + printf("expected '%c' at index %d, but found '%c'", c, (int) fIndex, this->peek()); + exit(1); + } + ++fIndex; +} + +RegexNode RegexParser::pop() { + RegexNode result = fStack.top(); + fStack.pop(); + return result; +} + +void RegexParser::term() { + switch (this->peek()) { + case '(': this->group(); break; + case '[': this->set(); break; + case '.': this->dot(); break; + default: this->literal(); + } +} + +void RegexParser::quantifiedTerm() { + this->term(); + switch (this->peek()) { + case '*': fStack.push(RegexNode(RegexNode::kStar_Kind, this->pop())); ++fIndex; break; + case '+': fStack.push(RegexNode(RegexNode::kPlus_Kind, this->pop())); ++fIndex; break; + case '?': fStack.push(RegexNode(RegexNode::kQuestion_Kind, this->pop())); ++fIndex; break; + default: break; + } +} + +void RegexParser::sequence() { + this->quantifiedTerm(); + for (;;) { + switch (this->peek()) { + case END: // fall through + case '|': // fall through + case ')': return; + default: + this->sequence(); + RegexNode right = this->pop(); + RegexNode left = this->pop(); + fStack.emplace(RegexNode::kConcat_Kind, std::move(left), std::move(right)); + } + } +} + +RegexNode RegexParser::escapeSequence(char c) { + switch (c) { + case 'n': return RegexNode(RegexNode::kChar_Kind, '\n'); + case 'r': return RegexNode(RegexNode::kChar_Kind, '\r'); + case 't': return RegexNode(RegexNode::kChar_Kind, '\t'); + case 's': return RegexNode(RegexNode::kCharset_Kind, " \t\n\r"); + default: return RegexNode(RegexNode::kChar_Kind, c); + } +} + +void RegexParser::literal() { + char c = this->peek(); + if (c == '\\') { + ++fIndex; + fStack.push(this->escapeSequence(peek())); + ++fIndex; + } + else { + fStack.push(RegexNode(RegexNode::kChar_Kind, c)); + ++fIndex; + } +} + +void RegexParser::dot() { + this->expect('.'); + fStack.push(RegexNode(RegexNode::kDot_Kind)); +} + +void RegexParser::group() { + this->expect('('); + this->regex(); + this->expect(')'); +} + +void RegexParser::setItem() { + this->literal(); + if (this->peek() == '-') { + ++fIndex; + if (peek() == ']') { + fStack.push(RegexNode(RegexNode::kChar_Kind, '-')); + } + else { + literal(); + RegexNode end = this->pop(); + ASSERT(end.fKind == RegexNode::kChar_Kind); + RegexNode start = this->pop(); + ASSERT(start.fKind == RegexNode::kChar_Kind); + fStack.push(RegexNode(RegexNode::kRange_Kind, std::move(start), std::move(end))); + } + } +} + +void RegexParser::set() { + expect('['); + size_t depth = fStack.size(); + RegexNode set(RegexNode::kCharset_Kind); + if (this->peek() == '^') { + ++fIndex; + set.fPayload.fBool = true; + } + else { + set.fPayload.fBool = false; + } + for (;;) { + switch (this->peek()) { + case ']': + ++fIndex; + while (fStack.size() > depth) { + set.fChildren.push_back(this->pop()); + } + fStack.push(std::move(set)); + return; + case END: + printf("unterminated character set\n"); + exit(1); + default: + this->setItem(); + } + } +} + +void RegexParser::regex() { + this->sequence(); + switch (this->peek()) { + case '|': { + ++fIndex; + this->regex(); + RegexNode right = this->pop(); + RegexNode left = this->pop(); + fStack.push(RegexNode(RegexNode::kOr_Kind, left, right)); + break; + } + case END: // fall through + case ')': + return; + default: + ASSERT(false); + } +} diff --git a/src/sksl/lex/RegexParser.h b/src/sksl/lex/RegexParser.h new file mode 100644 index 0000000000..7de546fc17 --- /dev/null +++ b/src/sksl/lex/RegexParser.h @@ -0,0 +1,89 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SKSL_REGEXPARSER +#define SKSL_REGEXPARSER + +#include "RegexNode.h" + +#include <stack> +#include <string> + +/** + * Turns a simple regular expression into a parse tree. The regular expression syntax supports only + * the basic quantifiers ('*', '+', and '?'), alternation ('|'), character sets ('[a-z]'), and + * groups ('()'). + */ +class RegexParser { +public: + RegexNode parse(std::string source); + +private: + static constexpr char END = '\0'; + + char peek(); + + void expect(char c); + + RegexNode pop(); + + /** + * Matches a char literal, parenthesized group, character set, or dot ('.'). + */ + void term(); + + /** + * Matches a term followed by an optional quantifier ('*', '+', or '?'). + */ + void quantifiedTerm(); + + /** + * Matches a sequence of quantifiedTerms. + */ + void sequence(); + + /** + * Returns a node representing the given escape character (e.g. escapeSequence('n') returns a + * node which matches a newline character). + */ + RegexNode escapeSequence(char c); + + /** + * Matches a literal character or escape sequence. + */ + void literal(); + + /** + * Matches a dot ('.'). + */ + void dot(); + + /** + * Matches a parenthesized group. + */ + void group(); + + /** + * Matches a literal character, escape sequence, or character range from a character set. + */ + void setItem(); + + /** + * Matches a character set. + */ + void set(); + + void regex(); + + std::string fSource; + + size_t fIndex; + + std::stack<RegexNode> fStack; +}; + +#endif diff --git a/src/sksl/lex/sksl.lex b/src/sksl/lex/sksl.lex new file mode 100644 index 0000000000..ada5e248de --- /dev/null +++ b/src/sksl/lex/sksl.lex @@ -0,0 +1,95 @@ +FLOAT_LITERAL = [0-9]*\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+\.[0-9]*([eE][+-]?[0-9]+)?|[0-9]+([eE][+-]?[0-9]+) +INT_LITERAL = [0-9]+|0x[0-9a-fA-F]+ +TRUE_LITERAL = "true" +FALSE_LITERAL = "false" +IF = "if" +STATIC_IF = "@if" +ELSE = "else" +FOR = "for" +WHILE = "while" +DO = "do" +SWITCH = "switch" +STATIC_SWITCH = "@switch" +CASE = "case" +DEFAULT = "default" +BREAK = "break" +CONTINUE = "continue" +DISCARD = "discard" +RETURN = "return" +IN = "in" +OUT = "out" +INOUT = "inout" +UNIFORM = "uniform" +CONST = "const" +LOWP = "lowp" +MEDIUMP = "mediump" +HIGHP = "highp" +FLAT = "flat" +NOPERSPECTIVE = "noperspective" +READONLY = "readonly" +WRITEONLY = "writeonly" +COHERENT = "coherent" +VOLATILE = "volatile" +RESTRICT = "restrict" +BUFFER = "buffer" +HASSIDEEFFECTS = "sk_has_side_effects" +STRUCT = "struct" +LAYOUT = "layout" +PRECISION = "precision" +IDENTIFIER = [a-zA-Z_$]([0-9]|[a-zA-Z_$])* +DIRECTIVE = #[a-zA-Z_$]([0-9]|[a-zA-Z_$])* +SECTION = @[a-zA-Z_$]([0-9]|[a-zA-Z_$])* +LPAREN = "(" +RPAREN = ")" +LBRACE = "{" +RBRACE = "}" +LBRACKET = "[" +RBRACKET = "]" +DOT = "." +COMMA = "," +PLUSPLUS = "++" +MINUSMINUS = "--" +PLUS = "+" +MINUS = "-" +STAR = "*" +SLASH = "/" +PERCENT = "%" +SHL = "<<" +SHR = ">>" +BITWISEOR = "|" +BITWISEXOR = "^" +BITWISEAND = "&" +BITWISENOT = "~" +LOGICALOR = "||" +LOGICALXOR = "^^" +LOGICALAND = "&&" +LOGICALNOT = "!" +QUESTION = "?" +COLON = ":" +EQ = "=" +EQEQ = "==" +NEQ = "!=" +GT = ">" +LT = "<" +GTEQ = ">=" +LTEQ = "<=" +PLUSEQ = "+=" +MINUSEQ = "-=" +STAREQ = "*=" +SLASHEQ = "/=" +PERCENTEQ = "%=" +SHLEQ = "<<=" +SHREQ = ">>=" +BITWISEOREQ = "|=" +BITWISEXOREQ = "^=" +BITWISEANDEQ = "&=" +LOGICALOREQ = "||=" +LOGICALXOREQ = "^^=" +LOGICALANDEQ = "&&=" +SEMICOLON = ";" +ARROW = "->" +COLONCOLON = "::" +WHITESPACE = \s+ +LINE_COMMENT = //.* +BLOCK_COMMENT = /\*([^*]|\*[^/])*\*/ +INVALID = .
\ No newline at end of file |