GLR.cpp source code [clang-tools-extra/pseudo/lib/GLR.cpp]

1	//===--- GLR.cpp ------------------------------------------------ C++--===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "clang-pseudo/GLR.h"
10	#include "clang-pseudo/Language.h"
11	#include "clang-pseudo/grammar/Grammar.h"
12	#include "clang-pseudo/grammar/LRTable.h"
13	#include "clang/Basic/TokenKinds.h"
14	#include "llvm/ADT/ArrayRef.h"
15	#include "llvm/ADT/STLExtras.h"
16	#include "llvm/ADT/ScopeExit.h"
17	#include "llvm/ADT/StringExtras.h"
18	#include "llvm/Support/Debug.h"
19	#include "llvm/Support/FormatVariadic.h"
20	#include <algorithm>
21	#include <memory>
22	#include <optional>
23	#include <queue>
24
25	#define DEBUG_TYPE "GLR.cpp"
26
27	namespace clang {
28	namespace pseudo {
29	namespace {
30
31	Token::Index findRecoveryEndpoint(ExtensionID Strategy, Token::Index Begin,
32	const TokenStream &Tokens,
33	const Language &Lang) {
34	assert(Strategy != `0`);
35	if (auto S = Lang.RecoveryStrategies.lookup(Val: Strategy))
36	return S (Begin, Tokens);
37	return Token::Invalid;
38	}
39
40	} // namespace
41
42	void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
43	unsigned &TokenIndex, const ParseParams &Params,
44	const Language &Lang,
45	std::vector<const GSS::Node *> &NewHeads) {
46	LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
47	// Describes a possibility to recover by forcibly interpreting a range of
48	// tokens around the cursor as a nonterminal that we expected to see.
49	struct PlaceholderRecovery {
50	// The token prior to the nonterminal which is being recovered.
51	// This starts of the region we're skipping, so higher Position is better.
52	Token::Index Position;
53	// The nonterminal which will be created in order to recover.
54	SymbolID Symbol;
55	// The heuristic used to choose the bounds of the nonterminal to recover.
56	ExtensionID Strategy;
57
58	// The GSS head where we are expecting the recovered nonterminal.
59	const GSS::Node *RecoveryNode;
60	// Payload of nodes on the way back from the OldHead to the recovery node.
61	// These represent the partial parse that is being discarded.
62	// They should become the children of the opaque recovery node.
63	// FIXME: internal structure of opaque nodes is not implemented.
64	//
65	// There may be multiple paths leading to the same recovery node, we choose
66	// one arbitrarily.
67	std::vector<const ForestNode *> DiscardedParse;
68	};
69	std::vector<PlaceholderRecovery> Options;
70
71	// Find recovery options by walking up the stack.
72	//
73	// This is similar to exception handling: we walk up the "frames" of nested
74	// rules being parsed until we find one that has a "handler" which allows us
75	// to determine the node bounds without parsing it.
76	//
77	// Unfortunately there's a significant difference: the stack contains both
78	// "upward" nodes (ancestor parses) and "leftward" ones.
79	// e.g. when parsing `{ if (1) ? }` as compound-stmt, the stack contains:
80	// stmt := IF ( expr ) . stmt - current state, we should recover here!
81	// stmt := IF ( expr . ) stmt - (left, no recovery here)
82	// stmt := IF ( . expr ) stmt - left, we should NOT recover here!
83	// stmt := IF . ( expr ) stmt - (left, no recovery here)
84	// stmt-seq := . stmt - up, we might recover here
85	// compound-stmt := { . stmt-seq } - up, we should recover here!
86	//
87	// It's not obvious how to avoid collecting "leftward" recovery options.
88	// I think the distinction is ill-defined after merging items into states.
89	// For now, we have to take this into account when defining recovery rules.
90	// (e.g. in the expr recovery above, stay inside the parentheses).
91	// FIXME: find a more satisfying way to avoid such false recovery.
92	// FIXME: Add a test for spurious recovery once tests can define strategies.
93	std::vector<const ForestNode *> Path;
94	llvm::DenseSet<const GSS::Node *> Seen;
95	auto WalkUp = [&](const GSS::Node N, Token::Index NextTok, auto* &WalkUp) {
96	if (!Seen.insert(V: N).second)
97	return;
98	if (!N->Recovered) { // Don't recover the same way twice!
99	for (auto Strategy : Lang.Table.getRecovery(State: N->State)) {
100	Options.push_back(x: PlaceholderRecovery{
101	.Position: NextTok,
102	.Symbol: Strategy.Result,
103	.Strategy: Strategy.Strategy,
104	.RecoveryNode: N,
105	.DiscardedParse: Path,
106	});
107	LLVM_DEBUG(llvm::dbgs()
108	<< "Option: recover " << Lang.G.symbolName(Strategy.Result)
109	<< " at token " << NextTok << "\n");
110	}
111	}
112	Path.push_back(x: N->Payload);
113	for (const GSS::Node *Parent : N->parents())
114	WalkUp(Parent, N->Payload->startTokenIndex(), WalkUp);
115	Path.pop_back();
116	};
117	for (auto *N : OldHeads)
118	WalkUp (N, TokenIndex, WalkUp);
119
120	// Now we select the option(s) we will use to recover.
121	//
122	// We prefer options starting further right, as these discard less code
123	// (e.g. we prefer to recover inner scopes rather than outer ones).
124	// The options also need to agree on an endpoint, so the parser has a
125	// consistent position afterwards.
126	//
127	// So conceptually we're sorting by the tuple (start, end), though we avoid
128	// computing `end` for options that can't be winners.
129
130	// Consider options starting further right first.
131	// Don't drop the others yet though, we may still use them if preferred fails.
132	llvm::stable_sort(Range&: Options, C: [&](const auto &L, const auto &R) {
133	return L.Position > R.Position;
134	});
135
136	// We may find multiple winners, but they will have the same range.
137	std::optional<Token::Range> RecoveryRange;
138	std::vector<const PlaceholderRecovery *> BestOptions;
139	for (const PlaceholderRecovery &Option : Options) {
140	// If this starts further left than options we've already found, then
141	// we'll never find anything better. Skip computing End for the rest.
142	if (RecoveryRange && Option.Position < RecoveryRange ->Begin)
143	break;
144
145	auto End = findRecoveryEndpoint(Strategy: Option.Strategy, Begin: Option.Position,
146	Tokens: Params.Code, Lang);
147	// Recovery may not take the parse backwards.
148	if (End == Token::Invalid \|\| End < TokenIndex)
149	continue;
150	if (RecoveryRange) {
151	// If this is worse than our previous options, ignore it.
152	if (RecoveryRange ->End < End)
153	continue;
154	// If this is an improvement over our previous options, then drop them.
155	if (RecoveryRange ->End > End)
156	BestOptions.clear();
157	}
158	// Create recovery nodes and heads for them in the GSS. These may be
159	// discarded if a better recovery is later found, but this path isn't hot.
160	RecoveryRange = {.Begin: Option.Position, .End: End};
161	BestOptions.push_back(x: &Option);
162	}
163
164	if (BestOptions.empty()) {
165	LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
166	<< " strategies\n");
167	return;
168	}
169
170	// We've settled on a set of recovery options, so create their nodes and
171	// advance the cursor.
172	LLVM_DEBUG({
173	llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
174	for (const auto *Option : BestOptions)
175	llvm::dbgs() << " " << Lang.G.symbolName(Option->Symbol);
176	llvm::dbgs() << "\n";
177	});
178	// FIXME: in general, we might have the same Option->Symbol multiple times,
179	// and we risk creating redundant Forest and GSS nodes.
180	// We also may inadvertently set up the next glrReduce to create a sequence
181	// node duplicating an opaque node that we're creating here.
182	// There are various options, including simply breaking ties between options.
183	// For now it's obscure enough to ignore.
184	for (const PlaceholderRecovery *Option : BestOptions) {
185	Option->RecoveryNode->Recovered = true;
186	const ForestNode &Placeholder =
187	Params.Forest.createOpaque(SID: Option->Symbol, Start: RecoveryRange ->Begin);
188	LRTable::StateID OldState = Option->RecoveryNode->State;
189	LRTable::StateID NewState =
190	isToken(ID: Option->Symbol)
191	? *Lang.Table.getShiftState(State: OldState, Terminal: Option->Symbol)
192	: *Lang.Table.getGoToState(State: OldState, Nonterminal: Option->Symbol);
193	const GSS::Node *NewHead =
194	Params.GSStack.addNode(State: NewState, Symbol: &Placeholder, Parents: {Option->RecoveryNode});
195	NewHeads.push_back(x: NewHead);
196	}
197	TokenIndex = RecoveryRange ->End;
198	}
199
200	using StateID = LRTable::StateID;
201
202	llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) {
203	std::vector<std::string> ParentStates;
204	for (const auto *Parent : N.parents())
205	ParentStates.push_back(x: llvm::formatv(Fmt: "{0}", Vals: Parent->State));
206	OS << llvm::formatv(Fmt: "state {0}, parsed symbol {1}, parents {3}", Vals: N.State,
207	Vals: N.Payload ? N.Payload->symbol() : `0`,
208	Vals: llvm::join(R&: ParentStates, Separator: ", "));
209	return OS;
210	}
211
212	// Apply all pending shift actions.
213	// In theory, LR parsing doesn't have shift/shift conflicts on a single head.
214	// But we may have multiple active heads, and each head has a shift action.
215	//
216	// We merge the stack -- if multiple heads will reach the same state after
217	// shifting a token, we shift only once by combining these heads.
218	//
219	// E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4:
220	// 0---1---2
221	// └---3
222	// After the shift action, the GSS is:
223	// 0---1---2---4
224	// └---3---┘
225	void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
226	const ForestNode &NewTok, const ParseParams &Params,
227	const Language &Lang, std::vector<const GSS::Node *> &NewHeads) {
228	assert(NewTok.kind() == ForestNode::Terminal);
229	LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n",
230	Lang.G.symbolName(NewTok.symbol()),
231	OldHeads.size()));
232
233	// We group pending shifts by their target state so we can merge them.
234	llvm::SmallVector<std::pair<StateID, const GSS::Node *>, `8`> Shifts;
235	for (const auto *H : OldHeads)
236	if (auto S = Lang.Table.getShiftState(State: H->State, Terminal: NewTok.symbol()))
237	Shifts.push_back(Elt: {*S, H});
238	llvm::stable_sort(Range&: Shifts, C: llvm::less_first{});
239
240	auto Rest = llvm::ArrayRef(Shifts);
241	llvm::SmallVector<const GSS::Node *> Parents;
242	while (!Rest.empty()) {
243	// Collect the batch of PendingShift that have compatible shift states.
244	// Their heads become TempParents, the parents of the new GSS node.
245	StateID NextState = Rest.front().first;
246
247	Parents.clear();
248	for (const auto &Base : Rest) {
249	if (Base.first != NextState)
250	break;
251	Parents.push_back(Elt: Base.second);
252	}
253	Rest = Rest.drop_front(N: Parents.size());
254
255	LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)\n",
256	NextState, Parents.size()));
257	NewHeads.push_back(x: Params.GSStack.addNode(State: NextState, Symbol: &NewTok, Parents));
258	}
259	}
260
261	namespace {
262	// A KeyedQueue yields pairs of keys and values in order of the keys.
263	template <typename Key, typename Value>
264	using KeyedQueue =
265	std::priority_queue<std::pair<Key, Value>,
266	std::vector<std::pair<Key, Value>>, llvm::less_first>;
267
268	template <typename T> void sortAndUnique(std::vector<T> &Vec) {
269	llvm::sort(Vec);
270	Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end());
271	}
272
273	// Perform reduces until no more are possible.
274	//
275	// Generally this means walking up from the heads gathering ForestNodes that
276	// will match the RHS of the rule we're reducing into a sequence ForestNode,
277	// and ending up at a base node.
278	// Then we push a new GSS node onto that base, taking care to:
279	// - pack alternative sequence ForestNodes into an ambiguous ForestNode.
280	// - use the same GSS node for multiple heads if the parse state matches.
281	//
282	// Examples of reduction:
283	// Before (simple):
284	// 0--1(expr)--2(semi)
285	// After reducing 2 by `stmt := expr semi`:
286	// 0--3(stmt) // 3 is goto(0, stmt)
287	//
288	// Before (splitting due to R/R conflict):
289	// 0--1(IDENTIFIER)
290	// After reducing 1 by `class-name := IDENTIFIER` & `enum-name := IDENTIFIER`:
291	// 0--2(class-name) // 2 is goto(0, class-name)
292	// └--3(enum-name) // 3 is goto(0, enum-name)
293	//
294	// Before (splitting due to multiple bases):
295	// 0--2(class-name)--4(STAR)
296	// └--3(enum-name)---┘
297	// After reducing 4 by `ptr-operator := STAR`:
298	// 0--2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator)
299	// └--3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator)
300	//
301	// Before (joining due to same goto state, multiple bases):
302	// 0--1(cv-qualifier)--3(class-name)
303	// └--2(cv-qualifier)--4(enum-name)
304	// After reducing 3 by `type-name := class-name` and
305	// 4 by `type-name := enum-name`:
306	// 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and
307	// └--2(cv-qualifier)--┘ // goto(2, type-name)
308	//
309	// Before (joining due to same goto state, the same base):
310	// 0--1(class-name)--3(STAR)
311	// └--2(enum-name)--4(STAR)
312	// After reducing 3 by `pointer := class-name STAR` and
313	// 2 by`enum-name := class-name STAR`:
314	// 0--5(pointer) // 5 is goto(0, pointer)
315	//
316	// (This is a functor rather than a function to allow it to reuse scratch
317	// storage across calls).
318	class GLRReduce {
319	const ParseParams &Params;
320	const Language& Lang;
321	// There are two interacting complications:
322	// 1. Performing one reduce can unlock new reduces on the newly-created head.
323	// 2a. The ambiguous ForestNodes must be complete (have all sequence nodes).
324	// This means we must have unlocked all the reduces that contribute to it.
325	// 2b. Similarly, the new GSS nodes must be complete (have all parents).
326	//
327	// We define a "family" of reduces as those that produce the same symbol and
328	// cover the same range of tokens. These are exactly the set of reductions
329	// whose sequence nodes would be covered by the same ambiguous node.
330	// We wish to process a whole family at a time (to satisfy complication 2),
331	// and can address complication 1 by carefully ordering the families:
332	// - Process families covering fewer tokens first.
333	// A reduce can't depend on a longer reduce!
334	// - For equal token ranges: if S := T, process T families before S families.
335	// Parsing T can't depend on an equal-length S, as the grammar is acyclic.
336	//
337	// This isn't quite enough: we don't know the token length of the reduction
338	// until we walk up the stack to perform the pop.
339	// So we perform the pop part upfront, and place the push specification on
340	// priority queues such that we can retrieve a family at a time.
341
342	// A reduction family is characterized by its token range and symbol produced.
343	// It is used as a key in the priority queues to group pushes by family.
344	struct Family {
345	// The start of the token range of the reduce.
346	Token::Index Start;
347	SymbolID Symbol;
348	// Rule must produce Symbol and can otherwise be arbitrary.
349	// RuleIDs have the topological order based on the acyclic grammar.
350	// FIXME: should SymbolIDs be so ordered instead?
351	RuleID Rule;
352
353	bool operator==(const Family &Other) const {
354	return Start == Other.Start && Symbol == Other.Symbol;
355	}
356	// The larger Family is the one that should be processed first.
357	bool operator<(const Family &Other) const {
358	if (Start != Other.Start)
359	return Start < Other.Start;
360	if (Symbol != Other.Symbol)
361	return Rule > Other.Rule;
362	assert(*this == Other);
363	return false;
364	}
365	};
366
367	// A sequence is the ForestNode payloads of the GSS nodes we are reducing.
368	using Sequence = llvm::SmallVector<const ForestNode *, Rule::MaxElements>;
369	// Like ArrayRef<const ForestNode>, but with the missing operator<.*
370	// (Sequences are big to move by value as the collections gets rearranged).
371	struct SequenceRef {
372	SequenceRef(const Sequence &S) : S (S) {}
373	llvm::ArrayRef<const ForestNode *> S;
374	friend bool operator==(SequenceRef A, SequenceRef B) { return A.S == B.S; }
375	friend bool operator<(const SequenceRef &A, const SequenceRef &B) {
376	return std::lexicographical_compare(first1: A.S.begin(), last1: A.S.end(), first2: B.S.begin(),
377	last2: B.S.end());
378	}
379	};
380	// Underlying storage for sequences pointed to by stored SequenceRefs.
381	std::deque<Sequence> SequenceStorage;
382	// We don't actually destroy the sequences between calls, to reuse storage.
383	// Everything SequenceStorage[ >=SequenceStorageCount ] is reusable scratch.
384	unsigned SequenceStorageCount;
385
386	// Halfway through a reduction (after the pop, before the push), we have
387	// collected nodes for the RHS of a rule, and reached a base node.
388	// They specify a sequence ForestNode we may build (but we dedup first).
389	// (The RuleID is not stored here, but rather in the Family).
390	struct PushSpec {
391	// The last node popped before pushing. Its parent is the reduction base(s).
392	// (Base is more fundamental, but this is cheaper to store).
393	const GSS::Node* LastPop = nullptr;
394	Sequence Seq = nullptr*;
395	};
396	KeyedQueue<Family, PushSpec> Sequences; // FIXME: rename => PendingPushes?
397
398	// We treat Heads as a queue of Pop operations still to be performed.
399	// PoppedHeads is our position within it.
400	std::vector<const GSS::Node > Heads;
401	unsigned NextPopHead;
402	SymbolID Lookahead;
403
404	Sequence TempSequence;
405	public:
406	GLRReduce(const ParseParams &Params, const Language &Lang)
407	: Params(Params), Lang(Lang) {}
408
409	// Reduce Heads, resulting in new nodes that are appended to Heads.
410	// The "consumed" nodes are not removed!
411	// Only reduce rules compatible with the Lookahead are applied, though
412	// tokenSymbol(tok::unknown) will match any rule.
413	void operator()(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead) {
414	assert(isToken(Lookahead));
415
416	NextPopHead = `0`;
417	this->Heads = &Heads;
418	this->Lookahead = Lookahead;
419	assert(Sequences.empty());
420	SequenceStorageCount = `0`;
421
422	popPending();
423	while (!Sequences.empty()) {
424	pushNext();
425	popPending();
426	}
427	}
428
429	private:
430	bool canReduce(const Rule &R, RuleID RID,
431	llvm::ArrayRef<const ForestNode > RHS) const* {
432	if (!R.Guarded)
433	return true;
434	if (auto Guard = Lang.Guards.lookup(Val: RID))
435	return Guard ({.RHS: RHS, .Tokens: Params.Code, .Lookahead: Lookahead});
436	LLVM_DEBUG(llvm::dbgs()
437	<< llvm::formatv("missing guard implementation for rule {0}\n",
438	Lang.G.dumpRule(RID)));
439	return true;
440	}
441	// pop walks up the parent chain(s) for a reduction from Head by to Rule.
442	// Once we reach the end, record the bases and sequences.
443	void pop(const GSS::Node Head, RuleID RID, const* Rule &Rule) {
444	LLVM_DEBUG(llvm::dbgs() << " Pop " << Lang.G.dumpRule(RID) << "\n");
445	Family F{/Start=/`0`, /Symbol=/Rule.Target, /Rule=/RID};
446	TempSequence.resize_for_overwrite(N: Rule.Size);
447	auto DFS = [&](const GSS::Node N, unsigned* I, auto &DFS) {
448	TempSequence [Rule.Size - `1` - I] = N->Payload;
449	if (I + `1` == Rule.Size) {
450	F.Start = TempSequence.front()->startTokenIndex();
451	LLVM_DEBUG({
452	for (const auto *B : N->parents())
453	llvm::dbgs() << " --> base at S" << B->State << "\n";
454	});
455	if (!canReduce(R: Rule, RID, RHS: TempSequence))
456	return;
457	// Copy the chain to stable storage so it can be enqueued.
458	if (SequenceStorageCount == SequenceStorage.size())
459	SequenceStorage.emplace_back();
460	SequenceStorage [SequenceStorageCount] = TempSequence;
461	Sequence *Seq = &SequenceStorage [SequenceStorageCount++];
462
463	Sequences.emplace(args&: F, args: PushSpec{.LastPop: N, .Seq: Seq});
464	return;
465	}
466	for (const GSS::Node *Parent : N->parents())
467	DFS(Parent, I + `1`, DFS);
468	};
469	DFS(Head, `0`, DFS);
470	}
471
472	// popPending pops every available reduction.
473	void popPending() {
474	for (; NextPopHead < Heads->size(); ++NextPopHead) {
475	// In trivial cases, we perform the complete reduce here!
476	if (popAndPushTrivial())
477	continue;
478	for (RuleID RID :
479	Lang.Table.getReduceRules(State: (*Heads)[NextPopHead]->State)) {
480	const auto &Rule = Lang.G.lookupRule(RID);
481	if (Lang.Table.canFollow(Nonterminal: Rule.Target, Terminal: Lookahead))
482	pop(Head: (*Heads)[NextPopHead], RID, Rule);
483	}
484	}
485	}
486
487	// Storage reused by each call to pushNext.
488	std::vector<std::pair</Goto/ StateID, const GSS::Node *>> FamilyBases;
489	std::vector<std::pair<RuleID, SequenceRef>> FamilySequences;
490	std::vector<const GSS::Node *> Parents;
491	std::vector<const ForestNode *> SequenceNodes;
492
493	// Process one push family, forming a forest node.
494	// This produces new GSS heads which may enable more pops.
495	void pushNext() {
496	assert(!Sequences.empty());
497	Family F = Sequences.top().first;
498
499	LLVM_DEBUG(llvm::dbgs() << " Push " << Lang.G.symbolName(F.Symbol)
500	<< " from token " << F.Start << "\n");
501
502	// Grab the sequences and bases for this family.
503	// We don't care which rule yielded each base. If Family.Symbol is S, the
504	// base includes an item X := ... • S ... and since the grammar is
505	// context-free, all* parses of S are valid here.*
506	FamilySequences.clear();
507	FamilyBases.clear();
508	do {
509	const PushSpec &Push = Sequences.top().second;
510	FamilySequences.emplace_back(args: Sequences.top().first.Rule, args&: *Push.Seq);
511	for (const GSS::Node *Base : Push.LastPop->parents()) {
512	auto NextState = Lang.Table.getGoToState(State: Base->State, Nonterminal: F.Symbol);
513	assert(NextState.has_value() && "goto must succeed after reduce!");
514	FamilyBases.emplace_back(args&: *NextState, args&: Base);
515	}
516
517	Sequences.pop();
518	} while (!Sequences.empty() && Sequences.top().first == F);
519	// Build a forest node for each unique sequence.
520	sortAndUnique(Vec&: FamilySequences);
521	SequenceNodes.clear();
522	for (const auto &SequenceSpec : FamilySequences)
523	SequenceNodes.push_back(x: &Params.Forest.createSequence(
524	SID: F.Symbol, RID: SequenceSpec.first, Elements: SequenceSpec.second.S));
525	// Wrap in an ambiguous node if needed.
526	const ForestNode *Parsed =
527	SequenceNodes.size() == `1`
528	? SequenceNodes.front()
529	: &Params.Forest.createAmbiguous(SID: F.Symbol, Alternatives: SequenceNodes);
530	LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Lang.G) << "\n");
531
532	// Bases for this family, deduplicate them, and group by the goTo State.
533	sortAndUnique(Vec&: FamilyBases);
534	// Create a GSS node for each unique goto state.
535	llvm::ArrayRef<decltype(FamilyBases)::value_type> BasesLeft = FamilyBases;
536	while (!BasesLeft.empty()) {
537	StateID NextState = BasesLeft.front().first;
538	Parents.clear();
539	for (const auto &Base : BasesLeft) {
540	if (Base.first != NextState)
541	break;
542	Parents.push_back(x: Base.second);
543	}
544	BasesLeft = BasesLeft.drop_front(N: Parents.size());
545	Heads->push_back(x: Params.GSStack.addNode(State: NextState, Symbol: Parsed, Parents));
546	}
547	}
548
549	// In general we split a reduce into a pop/push, so concurrently-available
550	// reductions can run in the correct order. The data structures are expensive.
551	//
552	// When only one reduction is possible at a time, we can skip this:
553	// we pop and immediately push, as an LR parser (as opposed to GLR) would.
554	// This is valid whenever there's only one concurrent PushSpec.
555	//
556	// This function handles a trivial but common subset of these cases:
557	// - there must be no pending pushes, and only one poppable head
558	// - the head must have only one reduction rule
559	// - the reduction path must be a straight line (no multiple parents)
560	// (Roughly this means there's no local ambiguity, so the LR algorithm works).
561	//
562	// Returns true if we successfully consumed the next unpopped head.
563	bool popAndPushTrivial() {
564	if (!Sequences.empty() \|\| Heads->size() != NextPopHead + `1`)
565	return false;
566	const GSS::Node *Head = Heads->back();
567	std::optional<RuleID> RID;
568	for (RuleID R : Lang.Table.getReduceRules(State: Head->State)) {
569	if (RID.has_value())
570	return false;
571	RID = R;
572	}
573	if (!RID)
574	return true; // no reductions available, but we've processed the head!
575	const auto &Rule = Lang.G.lookupRule(RID: *RID);
576	if (!Lang.Table.canFollow(Nonterminal: Rule.Target, Terminal: Lookahead))
577	return true; // reduction is not available
578	const GSS::Node *Base = Head;
579	TempSequence.resize_for_overwrite(N: Rule.Size);
580	for (unsigned I = `0`; I < Rule.Size; ++I) {
581	if (Base->parents().size() != `1`)
582	return false;
583	TempSequence [Rule.Size - `1` - I] = Base->Payload;
584	Base = Base->parents().front();
585	}
586	if (!canReduce(R: Rule, RID: *RID, RHS: TempSequence))
587	return true; // reduction is not available
588	const ForestNode *Parsed =
589	&Params.Forest.createSequence(SID: Rule.Target, RID: *RID, Elements: TempSequence);
590	auto NextState = Lang.Table.getGoToState(State: Base->State, Nonterminal: Rule.Target);
591	assert(NextState.has_value() && "goto must succeed after reduce!");
592	Heads->push_back(x: Params.GSStack.addNode(State: *NextState, Symbol: Parsed, Parents: {Base}));
593	LLVM_DEBUG(llvm::dbgs()
594	<< " Reduce (trivial) " << Lang.G.dumpRule(*RID) << "\n"
595	<< " --> S" << Heads->back()->State << "\n");
596	return true;
597	}
598	};
599
600	} // namespace
601
602	ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
603	const Language &Lang) {
604	GLRReduce Reduce(Params, Lang);
605	assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal");
606	llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Code: Params.Code);
607	auto &GSS = Params.GSStack;
608
609	StateID StartState = Lang.Table.getStartState(StartSymbol);
610	// Heads correspond to the parse of tokens [0, I), NextHeads to [0, I+1).
611	std::vector<const GSS::Node > Heads = {GSS.addNode(/State=/*StartState,
612	/ForestNode=/Symbol: nullptr,
613	Parents: {})};
614	// Invariant: Heads is partitioned by source: {shifted \| reduced}.
615	// HeadsPartition is the index of the first head formed by reduction.
616	// We use this to discard and recreate the reduced heads during recovery.
617	unsigned HeadsPartition = Heads.size();
618	std::vector<const GSS::Node *> NextHeads;
619	auto MaybeGC = [&, Roots(std::vector<const GSS::Node >{}), I(`0u`)]() mutable* {
620	assert(NextHeads.empty() && "Running GC at the wrong time!");
621	if (++I != `20`) // Run periodically to balance CPU and memory usage.
622	return;
623	I = `0`;
624
625	// We need to copy the list: Roots is consumed by the GC.
626	Roots = Heads;
627	GSS.gc(Roots: std::move(Roots));
628	};
629	// Each iteration fully processes a single token.
630	for (unsigned I = `0`; I < Terminals.size();) {
631	LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
632	"Next token {0} (id={1})\n",
633	Lang.G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
634	// Consume the token.
635	glrShift(OldHeads: Heads, NewTok: Terminals [I], Params, Lang, NewHeads&: NextHeads);
636
637	// If we weren't able to consume the token, try to skip over some tokens
638	// so we can keep parsing.
639	if (NextHeads.empty()) {
640	// The reduction in the previous round was constrained by lookahead.
641	// On valid code this only rejects dead ends, but on broken code we should
642	// consider all possibilities.
643	//
644	// We discard all heads formed by reduction, and recreate them without
645	// this constraint. This may duplicate some nodes, but it's rare.
646	LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. "
647	"Re-reducing without lookahead.\n");
648	Heads.resize(new_size: HeadsPartition);
649	Reduce (Heads, /allow all reductions/ tokenSymbol(TK: tok::unknown));
650
651	glrRecover(OldHeads: Heads, TokenIndex&: I, Params, Lang, NewHeads&: NextHeads);
652	if (NextHeads.empty())
653	// FIXME: Ensure the `_ := start-symbol` rules have a fallback
654	// error-recovery strategy attached. Then this condition can't happen.
655	return Params.Forest.createOpaque(SID: StartSymbol, /Token::Index=/Start: `0`);
656	} else
657	++I;
658
659	// Form nonterminals containing the token we just consumed.
660	SymbolID Lookahead =
661	I == Terminals.size() ? tokenSymbol(TK: tok::eof) : Terminals [I].symbol();
662	HeadsPartition = NextHeads.size();
663	Reduce (NextHeads, Lookahead);
664	// Prepare for the next token.
665	std::swap(x&: Heads, y&: NextHeads);
666	NextHeads.clear();
667	MaybeGC ();
668	}
669	LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
670
671	// The parse was successful if in state `_ := start-symbol EOF .`
672	// The GSS parent has `_ := start-symbol . EOF`; its payload is the parse.
673	auto AfterStart = Lang.Table.getGoToState(State: StartState, Nonterminal: StartSymbol);
674	assert(AfterStart.has_value() && "goto must succeed after start symbol!");
675	auto Accept = Lang.Table.getShiftState(State: *AfterStart, Terminal: tokenSymbol(TK: tok::eof));
676	assert(Accept.has_value() && "shift EOF must succeed!");
677	auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
678	const ForestNode Result = nullptr*;
679	for (const auto *Head : Heads) {
680	if (Head->State == *Accept) {
681	assert(Head->Payload->symbol() == tokenSymbol(tok::eof));
682	assert(Result == nullptr && "multiple results!");
683	Result = Head->parents().front()->Payload;
684	assert(Result->symbol() == StartSymbol);
685	}
686	}
687	return Result;
688	};
689	if (auto *Result = SearchForAccept (Heads))
690	return *const_cast<ForestNode >(Result); // Safe: we created all nodes.*
691	// We failed to parse the input, returning an opaque forest node for recovery.
692	// FIXME: as above, we can add fallback error handling so this is impossible.
693	return Params.Forest.createOpaque(SID: StartSymbol, /Token::Index=/Start: `0`);
694	}
695
696	void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
697	const ParseParams &Params, const Language &Lang) {
698	// Create a new GLRReduce each time for tests, performance doesn't matter.
699	GLRReduce {Params, Lang}(Heads, Lookahead);
700	}
701
702	const GSS::Node GSS::addNode(LRTable::StateID State, const* ForestNode *Symbol,
703	llvm::ArrayRef<const Node *> Parents) {
704	Node Result = new* (allocate(Parents: Parents.size())) Node ();
705	Result->State = State;
706	Result->GCParity = GCParity;
707	Result->ParentCount = Parents.size();
708	Alive.push_back(x: Result);
709	++NodesCreated;
710	Result->Payload = Symbol;
711	if (!Parents.empty())
712	llvm::copy(Range&: Parents, Out: reinterpret_cast<const Node **>(Result + `1`));
713	return Result;
714	}
715
716	GSS::Node GSS::allocate(unsigned* Parents) {
717	if (FreeList.size() <= Parents)
718	FreeList.resize(new_size: Parents + `1`);
719	auto &SizedList = FreeList [Parents];
720	if (!SizedList.empty()) {
721	auto *Result = SizedList.back();
722	SizedList.pop_back();
723	return Result;
724	}
725	return static_cast<Node *>(
726	Arena.Allocate(Size: sizeof(Node) + Parents * sizeof(Node ), Alignment: alignof*(Node)));
727	}
728
729	void GSS::destroy(Node *N) {
730	unsigned ParentCount = N->ParentCount;
731	N->~Node();
732	assert(FreeList.size() > ParentCount && "established on construction!");
733	FreeList [ParentCount].push_back(x: N);
734	}
735
736	unsigned GSS::gc(std::vector<const Node *> &&Queue) {
737	#ifndef NDEBUG
738	auto ParityMatches = [&](const Node N) { return* N->GCParity == GCParity; };
739	assert("Before GC" && llvm::all_of(Alive, ParityMatches));
740	auto Deferred = llvm::make_scope_exit(
741	F: [&] { assert("After GC" && llvm::all_of(Alive, ParityMatches)); });
742	assert(llvm::all_of(
743	Queue, [&](const Node R) { return* llvm::is_contained(Alive, R); }));
744	#endif
745	unsigned InitialCount = Alive.size();
746
747	// Mark
748	GCParity = !GCParity;
749	while (!Queue.empty()) {
750	Node N = const_cast<Node >(Queue.back()); // Safe: we created these nodes.
751	Queue.pop_back();
752	if (N->GCParity != GCParity) { // Not seen yet
753	N->GCParity = GCParity; // Mark as seen
754	for (const Node P : N->parents()) // And walk parents*
755	Queue.push_back(x: P);
756	}
757	}
758	// Sweep
759	llvm::erase_if(C&: Alive, P: [&](Node *N) {
760	if (N->GCParity == GCParity) // Walk reached this node.
761	return false;
762	destroy(N);
763	return true;
764	});
765
766	LLVM_DEBUG(llvm::dbgs() << "GC pruned " << (InitialCount - Alive.size())
767	<< "/" << InitialCount << " GSS nodes\n");
768	return InitialCount - Alive.size();
769	}
770
771	} // namespace pseudo
772	} // namespace clang
773

source code of clang-tools-extra/pseudo/lib/GLR.cpp