1 | //===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // The LRTable (referred as LR parsing table in the LR literature) is the core |
10 | // component in LR parsers, it drives the LR parsers by specifying an action to |
11 | // take given the current state on the top of the stack and the current |
12 | // lookahead token. |
13 | // |
14 | // The LRTable can be described as a matrix where the rows represent |
15 | // the states of the LR graph, the columns represent the symbols of the |
16 | // grammar, and each entry of the matrix (called action) represents a |
17 | // state transition in the graph. |
18 | // |
19 | // Typically, based on the category of the grammar symbol, the LRTable is |
20 | // broken into two logically separate tables: |
21 | // - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies |
22 | // next action (shift/reduce) on state S under a lookahead terminal a |
23 | // - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies |
24 | // the state which we transist to from the state S with the nonterminal X |
25 | // |
26 | // LRTable is *performance-critial* as it is consulted frequently during a |
27 | // parse. In general, LRTable is very sparse (most of the entries are empty). |
28 | // For example, for the C++ language, the SLR table has ~1500 states and 650 |
29 | // symbols which results in a matrix having 975K entries, ~90% of entries are |
30 | // empty. |
31 | // |
32 | // This file implements a speed-and-space-efficient LRTable. |
33 | // |
34 | //===----------------------------------------------------------------------===// |
35 | |
36 | #ifndef CLANG_PSEUDO_GRAMMAR_LRTABLE_H |
37 | #define CLANG_PSEUDO_GRAMMAR_LRTABLE_H |
38 | |
39 | #include "clang-pseudo/grammar/Grammar.h" |
40 | #include "llvm/ADT/ArrayRef.h" |
41 | #include "llvm/ADT/BitVector.h" |
42 | #include "llvm/ADT/SmallSet.h" |
43 | #include "llvm/Support/Capacity.h" |
44 | #include "llvm/Support/MathExtras.h" |
45 | #include <cstdint> |
46 | #include <vector> |
47 | |
48 | namespace clang { |
49 | namespace pseudo { |
50 | |
51 | // Represents the LR parsing table, which can efficiently the question "what is |
52 | // the next step given the lookahead token and current state on top of the |
53 | // stack?". |
54 | // |
55 | // This is a dense implementation, which only takes an amount of space that is |
56 | // proportional to the number of non-empty entries in the table. |
57 | // |
58 | // Unlike the typical LR parsing table which allows at most one available action |
59 | // per entry, conflicted actions are allowed in LRTable. The LRTable is designed |
60 | // to be used in nondeterministic LR parsers (e.g. GLR). |
61 | // |
62 | // There are no "accept" actions in the LRTable, instead the stack is inspected |
63 | // after parsing completes: is the state goto(StartState, StartSymbol)? |
64 | class LRTable { |
65 | public: |
66 | // StateID is only 13 bits wide. |
67 | using StateID = uint16_t; |
68 | static constexpr unsigned StateBits = 13; |
69 | |
70 | struct Recovery { |
71 | ExtensionID Strategy; |
72 | SymbolID Result; |
73 | }; |
74 | |
75 | // Returns the state after we reduce a nonterminal. |
76 | // Expected to be called by LR parsers. |
77 | // If the nonterminal is invalid here, returns std::nullopt. |
78 | std::optional<StateID> getGoToState(StateID State, |
79 | SymbolID Nonterminal) const { |
80 | return Gotos.get(Key: gotoIndex(State, Nonterminal, NumStates: numStates())); |
81 | } |
82 | // Returns the state after we shift a terminal. |
83 | // Expected to be called by LR parsers. |
84 | // If the terminal is invalid here, returns std::nullopt. |
85 | std::optional<StateID> getShiftState(StateID State, |
86 | SymbolID Terminal) const { |
87 | return Shifts.get(Key: shiftIndex(State, Terminal, NumStates: numStates())); |
88 | } |
89 | |
90 | // Returns the possible reductions from a state. |
91 | // |
92 | // These are not keyed by a lookahead token. Instead, call canFollow() to |
93 | // check whether a reduction should apply in the current context: |
94 | // for (RuleID R : LR.getReduceRules(S)) { |
95 | // if (!LR.canFollow(G.lookupRule(R).Target, NextToken)) |
96 | // continue; |
97 | // // ...apply reduce... |
98 | // } |
99 | llvm::ArrayRef<RuleID> getReduceRules(StateID State) const { |
100 | assert(State + 1u < ReduceOffset.size()); |
101 | return llvm::ArrayRef(Reduces.data() + ReduceOffset[State], |
102 | Reduces.data() + ReduceOffset[State + 1]); |
103 | } |
104 | // Returns whether Terminal can follow Nonterminal in a valid source file. |
105 | bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const { |
106 | assert(isToken(Terminal)); |
107 | assert(isNonterminal(Nonterminal)); |
108 | // tok::unknown is a sentinel value used in recovery: can follow anything. |
109 | return Terminal == tokenSymbol(TK: tok::unknown) || |
110 | FollowSets.test(Idx: tok::NUM_TOKENS * Nonterminal + |
111 | symbolToToken(SID: Terminal)); |
112 | } |
113 | |
114 | // Looks up available recovery actions if we stopped parsing in this state. |
115 | llvm::ArrayRef<Recovery> getRecovery(StateID State) const { |
116 | return llvm::ArrayRef(Recoveries.data() + RecoveryOffset[State], |
117 | Recoveries.data() + RecoveryOffset[State + 1]); |
118 | } |
119 | |
120 | // Returns the state from which the LR parser should start to parse the input |
121 | // tokens as the given StartSymbol. |
122 | // |
123 | // In LR parsing, the start state of `translation-unit` corresponds to |
124 | // `_ := • translation-unit`. |
125 | // |
126 | // Each start state responds to **a** single grammar rule like `_ := start`. |
127 | // REQUIRE: The given StartSymbol must exist in the grammar (in a form of |
128 | // `_ := start`). |
129 | StateID getStartState(SymbolID StartSymbol) const; |
130 | |
131 | size_t bytes() const { |
132 | return sizeof(*this) + Gotos.bytes() + Shifts.bytes() + |
133 | llvm::capacity_in_bytes(x: Reduces) + |
134 | llvm::capacity_in_bytes(x: ReduceOffset) + |
135 | llvm::capacity_in_bytes(X: FollowSets); |
136 | } |
137 | |
138 | std::string dumpStatistics() const; |
139 | std::string dumpForTests(const Grammar &G) const; |
140 | |
141 | // Build a SLR(1) parsing table. |
142 | static LRTable buildSLR(const Grammar &G); |
143 | |
144 | // Helper for building a table with specified actions/states. |
145 | struct Builder { |
146 | Builder() = default; |
147 | Builder(const Grammar &G) { |
148 | NumNonterminals = G.table().Nonterminals.size(); |
149 | FollowSets = followSets(G); |
150 | } |
151 | |
152 | unsigned int NumNonterminals = 0; |
153 | // States representing `_ := . start` for various start symbols. |
154 | std::vector<std::pair<SymbolID, StateID>> StartStates; |
155 | // State transitions `X := ABC . D EFG` => `X := ABC D . EFG`. |
156 | // Key is (initial state, D), value is final state. |
157 | llvm::DenseMap<std::pair<StateID, SymbolID>, StateID> Transition; |
158 | // Reductions available in a given state. |
159 | llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduce; |
160 | // FollowSets[NT] is the set of terminals that can follow the nonterminal. |
161 | std::vector<llvm::DenseSet<SymbolID>> FollowSets; |
162 | // Recovery options available at each state. |
163 | std::vector<std::pair<StateID, Recovery>> Recoveries; |
164 | |
165 | LRTable build() &&; |
166 | }; |
167 | |
168 | private: |
169 | unsigned numStates() const { return ReduceOffset.size() - 1; } |
170 | |
171 | // A map from unsigned key => StateID, used to store actions. |
172 | // The keys should be sequential but the values are somewhat sparse. |
173 | // |
174 | // In practice, the keys encode (origin state, symbol) pairs, and the values |
175 | // are the state we should move to after seeing that symbol. |
176 | // |
177 | // We store one bit for presence/absence of the value for each key. |
178 | // At every 64th key, we store the offset into the table of values. |
179 | // e.g. key 0x500 is checkpoint 0x500/64 = 20 |
180 | // Checkpoints[20] = 34 |
181 | // get(0x500) = Values[34] (assuming it has a value) |
182 | // To look up values in between, we count the set bits: |
183 | // get(0x509) has a value if HasValue[20] & (1<<9) |
184 | // #values between 0x500 and 0x509: popcnt(HasValue[20] & (1<<9 - 1)) |
185 | // get(0x509) = Values[34 + popcnt(...)] |
186 | // |
187 | // Overall size is 1.25 bits/key + 16 bits/value. |
188 | // Lookup is constant time with a low factor (no hashing). |
189 | class TransitionTable { |
190 | using Word = uint64_t; |
191 | constexpr static unsigned WordBits = CHAR_BIT * sizeof(Word); |
192 | |
193 | std::vector<StateID> Values; |
194 | std::vector<Word> HasValue; |
195 | std::vector<uint16_t> Checkpoints; |
196 | |
197 | public: |
198 | TransitionTable() = default; |
199 | TransitionTable(const llvm::DenseMap<unsigned, StateID> &Entries, |
200 | unsigned NumKeys) { |
201 | assert( |
202 | Entries.size() < |
203 | std::numeric_limits<decltype(Checkpoints)::value_type>::max() && |
204 | "16 bits too small for value offsets!" ); |
205 | unsigned NumWords = (NumKeys + WordBits - 1) / WordBits; |
206 | HasValue.resize(new_size: NumWords, x: 0); |
207 | Checkpoints.reserve(n: NumWords); |
208 | Values.reserve(n: Entries.size()); |
209 | for (unsigned I = 0; I < NumKeys; ++I) { |
210 | if ((I % WordBits) == 0) |
211 | Checkpoints.push_back(x: Values.size()); |
212 | auto It = Entries.find(Val: I); |
213 | if (It != Entries.end()) { |
214 | HasValue[I / WordBits] |= (Word(1) << (I % WordBits)); |
215 | Values.push_back(x: It->second); |
216 | } |
217 | } |
218 | } |
219 | |
220 | std::optional<StateID> get(unsigned Key) const { |
221 | // Do we have a value for this key? |
222 | Word KeyMask = Word(1) << (Key % WordBits); |
223 | unsigned KeyWord = Key / WordBits; |
224 | if ((HasValue[KeyWord] & KeyMask) == 0) |
225 | return std::nullopt; |
226 | // Count the number of values since the checkpoint. |
227 | Word BelowKeyMask = KeyMask - 1; |
228 | unsigned CountSinceCheckpoint = |
229 | llvm::popcount(Value: HasValue[KeyWord] & BelowKeyMask); |
230 | // Find the value relative to the last checkpoint. |
231 | return Values[Checkpoints[KeyWord] + CountSinceCheckpoint]; |
232 | } |
233 | |
234 | unsigned size() const { return Values.size(); } |
235 | |
236 | size_t bytes() const { |
237 | return llvm::capacity_in_bytes(x: HasValue) + |
238 | llvm::capacity_in_bytes(x: Values) + |
239 | llvm::capacity_in_bytes(x: Checkpoints); |
240 | } |
241 | }; |
242 | // Shift and Goto tables are keyed by encoded (State, Symbol). |
243 | static unsigned shiftIndex(StateID State, SymbolID Terminal, |
244 | unsigned NumStates) { |
245 | return NumStates * symbolToToken(SID: Terminal) + State; |
246 | } |
247 | static unsigned gotoIndex(StateID State, SymbolID Nonterminal, |
248 | unsigned NumStates) { |
249 | assert(isNonterminal(Nonterminal)); |
250 | return NumStates * Nonterminal + State; |
251 | } |
252 | TransitionTable Shifts; |
253 | TransitionTable Gotos; |
254 | |
255 | // A sorted table, storing the start state for each target parsing symbol. |
256 | std::vector<std::pair<SymbolID, StateID>> StartStates; |
257 | |
258 | // Given a state ID S, the half-open range of Reduces is |
259 | // [ReduceOffset[S], ReduceOffset[S+1]) |
260 | std::vector<uint32_t> ReduceOffset; |
261 | std::vector<RuleID> Reduces; |
262 | // Conceptually this is a bool[SymbolID][Token], each entry describing whether |
263 | // the grammar allows the (nonterminal) symbol to be followed by the token. |
264 | // |
265 | // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token) |
266 | // as an index: Nonterminal * NUM_TOKENS + Token. |
267 | llvm::BitVector FollowSets; |
268 | |
269 | // Recovery stores all recovery actions from all states. |
270 | // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]). |
271 | std::vector<uint32_t> RecoveryOffset; |
272 | std::vector<Recovery> Recoveries; |
273 | }; |
274 | |
275 | } // namespace pseudo |
276 | } // namespace clang |
277 | |
278 | #endif // CLANG_PSEUDO_GRAMMAR_LRTABLE_H |
279 | |