LRTable.h source code [clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h]

1	//===--- LRTable.h - Define LR Parsing Table ---------------------- C++--===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// The LRTable (referred as LR parsing table in the LR literature) is the core
10	// component in LR parsers, it drives the LR parsers by specifying an action to
11	// take given the current state on the top of the stack and the current
12	// lookahead token.
13	//
14	// The LRTable can be described as a matrix where the rows represent
15	// the states of the LR graph, the columns represent the symbols of the
16	// grammar, and each entry of the matrix (called action) represents a
17	// state transition in the graph.
18	//
19	// Typically, based on the category of the grammar symbol, the LRTable is
20	// broken into two logically separate tables:
21	// - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies
22	// next action (shift/reduce) on state S under a lookahead terminal a
23	// - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
24	// the state which we transist to from the state S with the nonterminal X
25	//
26	// LRTable is performance-critial* as it is consulted frequently during a*
27	// parse. In general, LRTable is very sparse (most of the entries are empty).
28	// For example, for the C++ language, the SLR table has ~1500 states and 650
29	// symbols which results in a matrix having 975K entries, ~90% of entries are
30	// empty.
31	//
32	// This file implements a speed-and-space-efficient LRTable.
33	//
34	//===----------------------------------------------------------------------===//
35
36	#ifndef CLANG_PSEUDO_GRAMMAR_LRTABLE_H
37	#define CLANG_PSEUDO_GRAMMAR_LRTABLE_H
38
39	#include "clang-pseudo/grammar/Grammar.h"
40	#include "llvm/ADT/ArrayRef.h"
41	#include "llvm/ADT/BitVector.h"
42	#include "llvm/ADT/SmallSet.h"
43	#include "llvm/Support/Capacity.h"
44	#include "llvm/Support/MathExtras.h"
45	#include <cstdint>
46	#include <vector>
47
48	namespace clang {
49	namespace pseudo {
50
51	// Represents the LR parsing table, which can efficiently the question "what is
52	// the next step given the lookahead token and current state on top of the
53	// stack?".
54	//
55	// This is a dense implementation, which only takes an amount of space that is
56	// proportional to the number of non-empty entries in the table.
57	//
58	// Unlike the typical LR parsing table which allows at most one available action
59	// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
60	// to be used in nondeterministic LR parsers (e.g. GLR).
61	//
62	// There are no "accept" actions in the LRTable, instead the stack is inspected
63	// after parsing completes: is the state goto(StartState, StartSymbol)?
64	class LRTable {
65	public:
66	// StateID is only 13 bits wide.
67	using StateID = uint16_t;
68	static constexpr unsigned StateBits = `13`;
69
70	struct Recovery {
71	ExtensionID Strategy;
72	SymbolID Result;
73	};
74
75	// Returns the state after we reduce a nonterminal.
76	// Expected to be called by LR parsers.
77	// If the nonterminal is invalid here, returns std::nullopt.
78	std::optional<StateID> getGoToState(StateID State,
79	SymbolID Nonterminal) const {
80	return Gotos.get(Key: gotoIndex(State, Nonterminal, NumStates: numStates()));
81	}
82	// Returns the state after we shift a terminal.
83	// Expected to be called by LR parsers.
84	// If the terminal is invalid here, returns std::nullopt.
85	std::optional<StateID> getShiftState(StateID State,
86	SymbolID Terminal) const {
87	return Shifts.get(Key: shiftIndex(State, Terminal, NumStates: numStates()));
88	}
89
90	// Returns the possible reductions from a state.
91	//
92	// These are not keyed by a lookahead token. Instead, call canFollow() to
93	// check whether a reduction should apply in the current context:
94	// for (RuleID R : LR.getReduceRules(S)) {
95	// if (!LR.canFollow(G.lookupRule(R).Target, NextToken))
96	// continue;
97	// // ...apply reduce...
98	// }
99	llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
100	assert(State + `1u` < ReduceOffset.size());
101	return llvm::ArrayRef(Reduces.data() + ReduceOffset [State],
102	Reduces.data() + ReduceOffset [State + `1`]);
103	}
104	// Returns whether Terminal can follow Nonterminal in a valid source file.
105	bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
106	assert(isToken(Terminal));
107	assert(isNonterminal(Nonterminal));
108	// tok::unknown is a sentinel value used in recovery: can follow anything.
109	return Terminal == tokenSymbol(TK: tok::unknown) \|\|
110	FollowSets.test(Idx: tok::NUM_TOKENS * Nonterminal +
111	symbolToToken(SID: Terminal));
112	}
113
114	// Looks up available recovery actions if we stopped parsing in this state.
115	llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
116	return llvm::ArrayRef(Recoveries.data() + RecoveryOffset [State],
117	Recoveries.data() + RecoveryOffset [State + `1`]);
118	}
119
120	// Returns the state from which the LR parser should start to parse the input
121	// tokens as the given StartSymbol.
122	//
123	// In LR parsing, the start state of `translation-unit` corresponds to
124	// `_ := • translation-unit`.
125	//
126	// Each start state responds to a* single grammar rule like `_ := start`.*
127	// REQUIRE: The given StartSymbol must exist in the grammar (in a form of
128	// `_ := start`).
129	StateID getStartState(SymbolID StartSymbol) const;
130
131	size_t bytes() const {
132	return sizeof(*this) + Gotos.bytes() + Shifts.bytes() +
133	llvm::capacity_in_bytes(x: Reduces) +
134	llvm::capacity_in_bytes(x: ReduceOffset) +
135	llvm::capacity_in_bytes(X: FollowSets);
136	}
137
138	std::string dumpStatistics() const;
139	std::string dumpForTests(const Grammar &G) const;
140
141	// Build a SLR(1) parsing table.
142	static LRTable buildSLR(const Grammar &G);
143
144	// Helper for building a table with specified actions/states.
145	struct Builder {
146	Builder() = default;
147	Builder(const Grammar &G) {
148	NumNonterminals = G.table().Nonterminals.size();
149	FollowSets = followSets(G);
150	}
151
152	unsigned int NumNonterminals = `0`;
153	// States representing `_ := . start` for various start symbols.
154	std::vector<std::pair<SymbolID, StateID>> StartStates;
155	// State transitions `X := ABC . D EFG` => `X := ABC D . EFG`.
156	// Key is (initial state, D), value is final state.
157	llvm::DenseMap<std::pair<StateID, SymbolID>, StateID> Transition;
158	// Reductions available in a given state.
159	llvm::DenseMap<StateID, llvm::SmallSet<RuleID, `4`>> Reduce;
160	// FollowSets[NT] is the set of terminals that can follow the nonterminal.
161	std::vector<llvm::DenseSet<SymbolID>> FollowSets;
162	// Recovery options available at each state.
163	std::vector<std::pair<StateID, Recovery>> Recoveries;
164
165	LRTable build() &&;
166	};
167
168	private:
169	unsigned numStates() const { return ReduceOffset.size() - `1`; }
170
171	// A map from unsigned key => StateID, used to store actions.
172	// The keys should be sequential but the values are somewhat sparse.
173	//
174	// In practice, the keys encode (origin state, symbol) pairs, and the values
175	// are the state we should move to after seeing that symbol.
176	//
177	// We store one bit for presence/absence of the value for each key.
178	// At every 64th key, we store the offset into the table of values.
179	// e.g. key 0x500 is checkpoint 0x500/64 = 20
180	// Checkpoints[20] = 34
181	// get(0x500) = Values[34] (assuming it has a value)
182	// To look up values in between, we count the set bits:
183	// get(0x509) has a value if HasValue[20] & (1<<9)
184	// #values between 0x500 and 0x509: popcnt(HasValue[20] & (1<<9 - 1))
185	// get(0x509) = Values[34 + popcnt(...)]
186	//
187	// Overall size is 1.25 bits/key + 16 bits/value.
188	// Lookup is constant time with a low factor (no hashing).
189	class TransitionTable {
190	using Word = uint64_t;
191	constexpr static unsigned WordBits = CHAR_BIT * sizeof(Word);
192
193	std::vector<StateID> Values;
194	std::vector<Word> HasValue;
195	std::vector<uint16_t> Checkpoints;
196
197	public:
198	TransitionTable() = default;
199	TransitionTable(const llvm::DenseMap<unsigned, StateID> &Entries,
200	unsigned NumKeys) {
201	assert(
202	Entries.size() <
203	std::numeric_limits<decltype(Checkpoints)::value_type>::max() &&
204	"16 bits too small for value offsets!");
205	unsigned NumWords = (NumKeys + WordBits - `1`) / WordBits;
206	HasValue.resize(new_size: NumWords, x: `0`);
207	Checkpoints.reserve(n: NumWords);
208	Values.reserve(n: Entries.size());
209	for (unsigned I = `0`; I < NumKeys; ++I) {
210	if ((I % WordBits) == `0`)
211	Checkpoints.push_back(x: Values.size());
212	auto It = Entries.find(Val: I);
213	if (It != Entries.end()) {
214	HasValue [I / WordBits] \|= (Word(`1`) << (I % WordBits));
215	Values.push_back(x: It ->second);
216	}
217	}
218	}
219
220	std::optional<StateID> get(unsigned Key) const {
221	// Do we have a value for this key?
222	Word KeyMask = Word(`1`) << (Key % WordBits);
223	unsigned KeyWord = Key / WordBits;
224	if ((HasValue [KeyWord] & KeyMask) == `0`)
225	return std::nullopt;
226	// Count the number of values since the checkpoint.
227	Word BelowKeyMask = KeyMask - `1`;
228	unsigned CountSinceCheckpoint =
229	llvm::popcount(Value: HasValue [KeyWord] & BelowKeyMask);
230	// Find the value relative to the last checkpoint.
231	return Values [Checkpoints [KeyWord] + CountSinceCheckpoint];
232	}
233
234	unsigned size() const { return Values.size(); }
235
236	size_t bytes() const {
237	return llvm::capacity_in_bytes(x: HasValue) +
238	llvm::capacity_in_bytes(x: Values) +
239	llvm::capacity_in_bytes(x: Checkpoints);
240	}
241	};
242	// Shift and Goto tables are keyed by encoded (State, Symbol).
243	static unsigned shiftIndex(StateID State, SymbolID Terminal,
244	unsigned NumStates) {
245	return NumStates * symbolToToken(SID: Terminal) + State;
246	}
247	static unsigned gotoIndex(StateID State, SymbolID Nonterminal,
248	unsigned NumStates) {
249	assert(isNonterminal(Nonterminal));
250	return NumStates * Nonterminal + State;
251	}
252	TransitionTable Shifts;
253	TransitionTable Gotos;
254
255	// A sorted table, storing the start state for each target parsing symbol.
256	std::vector<std::pair<SymbolID, StateID>> StartStates;
257
258	// Given a state ID S, the half-open range of Reduces is
259	// [ReduceOffset[S], ReduceOffset[S+1])
260	std::vector<uint32_t> ReduceOffset;
261	std::vector<RuleID> Reduces;
262	// Conceptually this is a bool[SymbolID][Token], each entry describing whether
263	// the grammar allows the (nonterminal) symbol to be followed by the token.
264	//
265	// This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
266	// as an index: Nonterminal NUM_TOKENS + Token.*
267	llvm::BitVector FollowSets;
268
269	// Recovery stores all recovery actions from all states.
270	// A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
271	std::vector<uint32_t> RecoveryOffset;
272	std::vector<Recovery> Recoveries;
273	};
274
275	} // namespace pseudo
276	} // namespace clang
277
278	#endif // CLANG_PSEUDO_GRAMMAR_LRTABLE_H
279

source code of clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h