1//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The LRTable (referred as LR parsing table in the LR literature) is the core
10// component in LR parsers, it drives the LR parsers by specifying an action to
11// take given the current state on the top of the stack and the current
12// lookahead token.
13//
14// The LRTable can be described as a matrix where the rows represent
15// the states of the LR graph, the columns represent the symbols of the
16// grammar, and each entry of the matrix (called action) represents a
17// state transition in the graph.
18//
19// Typically, based on the category of the grammar symbol, the LRTable is
20// broken into two logically separate tables:
21// - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies
22// next action (shift/reduce) on state S under a lookahead terminal a
23// - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
24// the state which we transist to from the state S with the nonterminal X
25//
26// LRTable is *performance-critial* as it is consulted frequently during a
27// parse. In general, LRTable is very sparse (most of the entries are empty).
28// For example, for the C++ language, the SLR table has ~1500 states and 650
29// symbols which results in a matrix having 975K entries, ~90% of entries are
30// empty.
31//
32// This file implements a speed-and-space-efficient LRTable.
33//
34//===----------------------------------------------------------------------===//
35
36#ifndef CLANG_PSEUDO_GRAMMAR_LRTABLE_H
37#define CLANG_PSEUDO_GRAMMAR_LRTABLE_H
38
39#include "clang-pseudo/grammar/Grammar.h"
40#include "llvm/ADT/ArrayRef.h"
41#include "llvm/ADT/BitVector.h"
42#include "llvm/ADT/SmallSet.h"
43#include "llvm/Support/Capacity.h"
44#include "llvm/Support/MathExtras.h"
45#include <cstdint>
46#include <vector>
47
48namespace clang {
49namespace pseudo {
50
51// Represents the LR parsing table, which can efficiently the question "what is
52// the next step given the lookahead token and current state on top of the
53// stack?".
54//
55// This is a dense implementation, which only takes an amount of space that is
56// proportional to the number of non-empty entries in the table.
57//
58// Unlike the typical LR parsing table which allows at most one available action
59// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
60// to be used in nondeterministic LR parsers (e.g. GLR).
61//
62// There are no "accept" actions in the LRTable, instead the stack is inspected
63// after parsing completes: is the state goto(StartState, StartSymbol)?
64class LRTable {
65public:
66 // StateID is only 13 bits wide.
67 using StateID = uint16_t;
68 static constexpr unsigned StateBits = 13;
69
70 struct Recovery {
71 ExtensionID Strategy;
72 SymbolID Result;
73 };
74
75 // Returns the state after we reduce a nonterminal.
76 // Expected to be called by LR parsers.
77 // If the nonterminal is invalid here, returns std::nullopt.
78 std::optional<StateID> getGoToState(StateID State,
79 SymbolID Nonterminal) const {
80 return Gotos.get(Key: gotoIndex(State, Nonterminal, NumStates: numStates()));
81 }
82 // Returns the state after we shift a terminal.
83 // Expected to be called by LR parsers.
84 // If the terminal is invalid here, returns std::nullopt.
85 std::optional<StateID> getShiftState(StateID State,
86 SymbolID Terminal) const {
87 return Shifts.get(Key: shiftIndex(State, Terminal, NumStates: numStates()));
88 }
89
90 // Returns the possible reductions from a state.
91 //
92 // These are not keyed by a lookahead token. Instead, call canFollow() to
93 // check whether a reduction should apply in the current context:
94 // for (RuleID R : LR.getReduceRules(S)) {
95 // if (!LR.canFollow(G.lookupRule(R).Target, NextToken))
96 // continue;
97 // // ...apply reduce...
98 // }
99 llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
100 assert(State + 1u < ReduceOffset.size());
101 return llvm::ArrayRef(Reduces.data() + ReduceOffset[State],
102 Reduces.data() + ReduceOffset[State + 1]);
103 }
104 // Returns whether Terminal can follow Nonterminal in a valid source file.
105 bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
106 assert(isToken(Terminal));
107 assert(isNonterminal(Nonterminal));
108 // tok::unknown is a sentinel value used in recovery: can follow anything.
109 return Terminal == tokenSymbol(TK: tok::unknown) ||
110 FollowSets.test(Idx: tok::NUM_TOKENS * Nonterminal +
111 symbolToToken(SID: Terminal));
112 }
113
114 // Looks up available recovery actions if we stopped parsing in this state.
115 llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
116 return llvm::ArrayRef(Recoveries.data() + RecoveryOffset[State],
117 Recoveries.data() + RecoveryOffset[State + 1]);
118 }
119
120 // Returns the state from which the LR parser should start to parse the input
121 // tokens as the given StartSymbol.
122 //
123 // In LR parsing, the start state of `translation-unit` corresponds to
124 // `_ := • translation-unit`.
125 //
126 // Each start state responds to **a** single grammar rule like `_ := start`.
127 // REQUIRE: The given StartSymbol must exist in the grammar (in a form of
128 // `_ := start`).
129 StateID getStartState(SymbolID StartSymbol) const;
130
131 size_t bytes() const {
132 return sizeof(*this) + Gotos.bytes() + Shifts.bytes() +
133 llvm::capacity_in_bytes(x: Reduces) +
134 llvm::capacity_in_bytes(x: ReduceOffset) +
135 llvm::capacity_in_bytes(X: FollowSets);
136 }
137
138 std::string dumpStatistics() const;
139 std::string dumpForTests(const Grammar &G) const;
140
141 // Build a SLR(1) parsing table.
142 static LRTable buildSLR(const Grammar &G);
143
144 // Helper for building a table with specified actions/states.
145 struct Builder {
146 Builder() = default;
147 Builder(const Grammar &G) {
148 NumNonterminals = G.table().Nonterminals.size();
149 FollowSets = followSets(G);
150 }
151
152 unsigned int NumNonterminals = 0;
153 // States representing `_ := . start` for various start symbols.
154 std::vector<std::pair<SymbolID, StateID>> StartStates;
155 // State transitions `X := ABC . D EFG` => `X := ABC D . EFG`.
156 // Key is (initial state, D), value is final state.
157 llvm::DenseMap<std::pair<StateID, SymbolID>, StateID> Transition;
158 // Reductions available in a given state.
159 llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduce;
160 // FollowSets[NT] is the set of terminals that can follow the nonterminal.
161 std::vector<llvm::DenseSet<SymbolID>> FollowSets;
162 // Recovery options available at each state.
163 std::vector<std::pair<StateID, Recovery>> Recoveries;
164
165 LRTable build() &&;
166 };
167
168private:
169 unsigned numStates() const { return ReduceOffset.size() - 1; }
170
171 // A map from unsigned key => StateID, used to store actions.
172 // The keys should be sequential but the values are somewhat sparse.
173 //
174 // In practice, the keys encode (origin state, symbol) pairs, and the values
175 // are the state we should move to after seeing that symbol.
176 //
177 // We store one bit for presence/absence of the value for each key.
178 // At every 64th key, we store the offset into the table of values.
179 // e.g. key 0x500 is checkpoint 0x500/64 = 20
180 // Checkpoints[20] = 34
181 // get(0x500) = Values[34] (assuming it has a value)
182 // To look up values in between, we count the set bits:
183 // get(0x509) has a value if HasValue[20] & (1<<9)
184 // #values between 0x500 and 0x509: popcnt(HasValue[20] & (1<<9 - 1))
185 // get(0x509) = Values[34 + popcnt(...)]
186 //
187 // Overall size is 1.25 bits/key + 16 bits/value.
188 // Lookup is constant time with a low factor (no hashing).
189 class TransitionTable {
190 using Word = uint64_t;
191 constexpr static unsigned WordBits = CHAR_BIT * sizeof(Word);
192
193 std::vector<StateID> Values;
194 std::vector<Word> HasValue;
195 std::vector<uint16_t> Checkpoints;
196
197 public:
198 TransitionTable() = default;
199 TransitionTable(const llvm::DenseMap<unsigned, StateID> &Entries,
200 unsigned NumKeys) {
201 assert(
202 Entries.size() <
203 std::numeric_limits<decltype(Checkpoints)::value_type>::max() &&
204 "16 bits too small for value offsets!");
205 unsigned NumWords = (NumKeys + WordBits - 1) / WordBits;
206 HasValue.resize(new_size: NumWords, x: 0);
207 Checkpoints.reserve(n: NumWords);
208 Values.reserve(n: Entries.size());
209 for (unsigned I = 0; I < NumKeys; ++I) {
210 if ((I % WordBits) == 0)
211 Checkpoints.push_back(x: Values.size());
212 auto It = Entries.find(Val: I);
213 if (It != Entries.end()) {
214 HasValue[I / WordBits] |= (Word(1) << (I % WordBits));
215 Values.push_back(x: It->second);
216 }
217 }
218 }
219
220 std::optional<StateID> get(unsigned Key) const {
221 // Do we have a value for this key?
222 Word KeyMask = Word(1) << (Key % WordBits);
223 unsigned KeyWord = Key / WordBits;
224 if ((HasValue[KeyWord] & KeyMask) == 0)
225 return std::nullopt;
226 // Count the number of values since the checkpoint.
227 Word BelowKeyMask = KeyMask - 1;
228 unsigned CountSinceCheckpoint =
229 llvm::popcount(Value: HasValue[KeyWord] & BelowKeyMask);
230 // Find the value relative to the last checkpoint.
231 return Values[Checkpoints[KeyWord] + CountSinceCheckpoint];
232 }
233
234 unsigned size() const { return Values.size(); }
235
236 size_t bytes() const {
237 return llvm::capacity_in_bytes(x: HasValue) +
238 llvm::capacity_in_bytes(x: Values) +
239 llvm::capacity_in_bytes(x: Checkpoints);
240 }
241 };
242 // Shift and Goto tables are keyed by encoded (State, Symbol).
243 static unsigned shiftIndex(StateID State, SymbolID Terminal,
244 unsigned NumStates) {
245 return NumStates * symbolToToken(SID: Terminal) + State;
246 }
247 static unsigned gotoIndex(StateID State, SymbolID Nonterminal,
248 unsigned NumStates) {
249 assert(isNonterminal(Nonterminal));
250 return NumStates * Nonterminal + State;
251 }
252 TransitionTable Shifts;
253 TransitionTable Gotos;
254
255 // A sorted table, storing the start state for each target parsing symbol.
256 std::vector<std::pair<SymbolID, StateID>> StartStates;
257
258 // Given a state ID S, the half-open range of Reduces is
259 // [ReduceOffset[S], ReduceOffset[S+1])
260 std::vector<uint32_t> ReduceOffset;
261 std::vector<RuleID> Reduces;
262 // Conceptually this is a bool[SymbolID][Token], each entry describing whether
263 // the grammar allows the (nonterminal) symbol to be followed by the token.
264 //
265 // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
266 // as an index: Nonterminal * NUM_TOKENS + Token.
267 llvm::BitVector FollowSets;
268
269 // Recovery stores all recovery actions from all states.
270 // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
271 std::vector<uint32_t> RecoveryOffset;
272 std::vector<Recovery> Recoveries;
273};
274
275} // namespace pseudo
276} // namespace clang
277
278#endif // CLANG_PSEUDO_GRAMMAR_LRTABLE_H
279

source code of clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h