aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/sksl/lex/NFAtoDFA.h
blob: 28ccb547a2634f015056da2a6cc337151a3a5a6c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "DFA.h"
#include "DFAState.h"
#include "NFA.h"
#include "NFAState.h"

#include <algorithm>
#include <climits>
#include <memory>
#include <unordered_map>
#include <set>
#include <vector>

/**
 * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and
 * DFAs differ only in that an NFA allows multiple states at the same time, we can find each
 * possible combination of simultaneous NFA states and give this combination a label. These labelled
 * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time.
 *
 * As an NFA can end up in multiple accept states at the same time (for instance, the token "while"
 * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex
 * (in terms of the order in which they were added to the NFA).
 */
class NFAtoDFA {
public:
    static constexpr char START_CHAR = 9;
    static constexpr char END_CHAR = 126;

    NFAtoDFA(NFA* nfa)
    : fNFA(*nfa) {}

    /**
     * Returns a DFA created from the NFA.
     */
    DFA convert() {
        // create state 0, the "reject" state
        getState(DFAState::Label({}));
        // create a state representing being in all of the NFA's start states at once
        std::vector<int> startStates = fNFA.fStartStates;
        std::sort(startStates.begin(), startStates.end());
        // this becomes state 1, our start state
        DFAState* start = getState(DFAState::Label(startStates));
        this->scanState(start);

        this->computeMappings();

        int stateCount = 0;
        for (const auto& row : fTransitions) {
            stateCount = std::max(stateCount, (int) row.size());
        }
        return DFA(fCharMappings, fTransitions, fAccepts);
    }

private:
    /**
     * Returns an existing state with the given label, or creates a new one and returns it.
     */
    DFAState* getState(DFAState::Label label) {
        auto found = fStates.find(label);
        if (found == fStates.end()) {
            int id = fStates.size();
            fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label));
            return fStates[label].get();
        }
        return found->second.get();
    }

    void add(int nfaState, std::vector<int>* states) {
        NFAState state = fNFA.fStates[nfaState];
        if (state.fKind == NFAState::kRemapped_Kind) {
            for (int next : state.fData) {
                this->add(next, states);
            }
        } else {
            for (int state : *states) {
                if (nfaState == state) {
                    return;
                }
            }
            states->push_back(nfaState);
        }
    }

    void addTransition(char c, int start, int next) {
        while (fTransitions.size() <= (size_t) c) {
            fTransitions.push_back(std::vector<int>());
        }
        std::vector<int>& row = fTransitions[c];
        while (row.size() <= (size_t) start) {
            row.push_back(INVALID);
        }
        row[start] = next;
    }

    void scanState(DFAState* state) {
        state->fIsScanned = true;
        for (char c = START_CHAR; c <= END_CHAR; ++c) {
            std::vector<int> next;
            int bestAccept = INT_MAX;
            for (int idx : state->fLabel.fStates) {
                const NFAState& nfaState = fNFA.fStates[idx];
                if (nfaState.accept(c)) {
                    for (int nextState : nfaState.fNext) {
                        if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) {
                            bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]);
                        }
                        this->add(nextState, &next);
                    }
                }
            }
            std::sort(next.begin(), next.end());
            DFAState* nextState = this->getState(DFAState::Label(next));
            this->addTransition(c, state->fId, nextState->fId);
            if (bestAccept != INT_MAX) {
                while (fAccepts.size() <= (size_t) nextState->fId) {
                    fAccepts.push_back(INVALID);
                }
                fAccepts[nextState->fId] = bestAccept;
            }
            if (!nextState->fIsScanned) {
                this->scanState(nextState);
            }
        }
    }

    // collapse rows with the same transitions to a single row. This is common, as each row
    // represents a character and often there are many characters for which all transitions are
    // identical (e.g. [0-9] are treated the same way by all lexer rules)
    void computeMappings() {
        // mappings[<input row>] = <output row>
        std::vector<std::vector<int>*> uniques;
        // this could be done more efficiently, but O(n^2) is plenty fast for our purposes
        for (size_t i = 0; i < fTransitions.size(); ++i) {
            int found = -1;
            for (size_t j = 0; j < uniques.size(); ++j) {
                if (*uniques[j] == fTransitions[i]) {
                    found = j;
                    break;
                }
            }
            if (found == -1) {
                found = (int) uniques.size();
                uniques.push_back(&fTransitions[i]);
            }
            fCharMappings.push_back(found);
        }
        std::vector<std::vector<int>> newTransitions;
        for (std::vector<int>* row : uniques) {
            newTransitions.push_back(*row);
        }
        fTransitions = newTransitions;
    }

    const NFA& fNFA;
    std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates;
    std::vector<std::vector<int>> fTransitions;
    std::vector<int> fCharMappings;
    std::vector<int> fAccepts;
};