GLRTest.cpp source code [clang-tools-extra/pseudo/unittests/GLRTest.cpp]

1	//===--- GLRTest.cpp - Test the GLR parser ----------------------- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "clang-pseudo/GLR.h"
10	#include "clang-pseudo/Bracket.h"
11	#include "clang-pseudo/Language.h"
12	#include "clang-pseudo/Token.h"
13	#include "clang-pseudo/grammar/Grammar.h"
14	#include "clang/Basic/LangOptions.h"
15	#include "clang/Basic/TokenKinds.h"
16	#include "llvm/ADT/StringExtras.h"
17	#include "llvm/Support/FormatVariadic.h"
18	#include "gmock/gmock.h"
19	#include "gtest/gtest.h"
20	#include <memory>
21
22	namespace clang {
23	namespace pseudo {
24
25	llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
26	const std::vector<const GSS::Node *> &Heads) {
27	for (const auto *Head : Heads)
28	OS << *Head << "\n";
29	return OS;
30	}
31
32	namespace {
33
34	using StateID = LRTable::StateID;
35	using testing::AllOf;
36	using testing::ElementsAre;
37	using testing::IsEmpty;
38	using testing::UnorderedElementsAre;
39
40	MATCHER_P(state, StateID, "") { return arg->State == StateID; }
41	MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
42	MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
43	MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
44
45	testing::Matcher<const GSS::Node *>
46	parents(llvm::ArrayRef<const GSS::Node *> Parents) {
47	return testing::Property(property: &GSS::Node::parents,
48	matcher: testing::UnorderedElementsAreArray(container: Parents));
49	}
50
51	Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) {
52	EXPECT_GT(Begin, `0u`);
53	const Token &Left = Code.tokens()[Begin - `1`];
54	EXPECT_EQ(Left.Kind, tok::l_brace);
55	if (const auto* Right = Left.pair()) {
56	EXPECT_EQ(Right->Kind, tok::r_brace);
57	return Code.index(T: *Right);
58	}
59	return Token::Invalid;
60	}
61
62	class GLRTest : public ::testing::Test {
63	public:
64	void build(llvm::StringRef GrammarBNF) {
65	std::vector<std::string> Diags;
66	TestLang.G = Grammar::parseBNF(BNF: GrammarBNF, Diags);
67	}
68
69	TokenStream emptyTokenStream() {
70	TokenStream Empty;
71	Empty.finalize();
72	return Empty;
73	}
74
75	void buildGrammar(std::vector<std::string> Nonterminals,
76	std::vector<std::string> Rules) {
77	Nonterminals.push_back(x: "_");
78	llvm::sort(C&: Nonterminals);
79	Nonterminals.erase(first: std::unique(first: Nonterminals.begin(), last: Nonterminals.end()),
80	last: Nonterminals.end());
81	std::string FakeTestBNF;
82	for (const auto &NT : Nonterminals)
83	FakeTestBNF += llvm::formatv(Fmt: "{0} := {1}\n", Vals: "_", Vals: NT);
84	FakeTestBNF += llvm::join(R&: Rules, Separator: "\n");
85	build(GrammarBNF: FakeTestBNF);
86	}
87
88	SymbolID id(llvm::StringRef Name) const {
89	for (unsigned I = `0`; I < NumTerminals; ++I)
90	if (TestLang.G.table().Terminals [I] == Name)
91	return tokenSymbol(TK: static_cast<tok::TokenKind>(I));
92	for (SymbolID ID = `0`; ID < TestLang.G.table().Nonterminals.size(); ++ID)
93	if (TestLang.G.table().Nonterminals [ID].Name == Name)
94	return ID;
95	ADD_FAILURE() << "No such symbol found: " << Name;
96	return `0`;
97	}
98	ExtensionID extensionID(llvm::StringRef AttrValueName) const {
99	for (ExtensionID EID = `0`; EID < TestLang.G.table().AttributeValues.size();
100	++EID)
101	if (TestLang.G.table().AttributeValues [EID] == AttrValueName)
102	return EID;
103	ADD_FAILURE() << "No such attribute value found: " << AttrValueName;
104	return `0`;
105	}
106
107	RuleID ruleFor(llvm::StringRef NonterminalName) const {
108	auto RuleRange =
109	TestLang.G.table().Nonterminals [id(Name: NonterminalName)].RuleRange;
110	if (RuleRange.End - RuleRange.Start == `1`)
111	return TestLang.G.table()
112	.Nonterminals [id(Name: NonterminalName)]
113	.RuleRange.Start;
114	ADD_FAILURE() << "Expected a single rule for " << NonterminalName
115	<< ", but it has " << RuleRange.End - RuleRange.Start
116	<< " rule!\n";
117	return `0`;
118	}
119
120	protected:
121	Language TestLang;
122	ForestArena Arena;
123	GSS GSStack;
124	};
125
126	TEST_F(GLRTest, ShiftMergingHeads) {
127	// Given a test case where we have two heads 1, 2, 3 in the GSS, the heads 1,
128	// 2 have shift actions to reach state 4, and the head 3 has a shift action to
129	// reach state 5:
130	// 0--1
131	// └--2
132	// └--3
133	// After the shift action, the GSS (with new heads 4, 5) is:
134	// 0---1---4
135	// └---2---┘
136	// └---3---5
137	auto *GSSNode0 =
138	GSStack.addNode(/State=/`0`, /ForestNode=/Symbol: nullptr, /Parents=/{});
139	auto GSSNode1 = GSStack.addNode(/State=/`1`, /ForestNode=/Symbol: nullptr*,
140	/Parents=/{GSSNode0});
141	auto GSSNode2 = GSStack.addNode(/State=/`2`, /ForestNode=/Symbol: nullptr*,
142	/Parents=/{GSSNode0});
143	auto GSSNode3 = GSStack.addNode(/State=/`3`, /ForestNode=/Symbol: nullptr*,
144	/Parents=/{GSSNode0});
145
146	buildGrammar(Nonterminals: {}, Rules: {}); // Create a fake empty grammar.
147	LRTable::Builder B(TestLang.G);
148	B.Transition [{StateID{`1`}, tokenSymbol(TK: tok::semi)}] = StateID{`4`};
149	B.Transition [{StateID{`2`}, tokenSymbol(TK: tok::semi)}] = StateID{`4`};
150	B.Transition [{StateID{`3`}, tokenSymbol(TK: tok::semi)}] = StateID{`5`};
151	TestLang.Table = std::move(B).build();
152
153	ForestNode &SemiTerminal = Arena.createTerminal(TK: tok::semi, Start: `0`);
154	std::vector<const GSS::Node *> NewHeads;
155	glrShift(OldHeads: {GSSNode1, GSSNode2, GSSNode3}, NextTok: SemiTerminal,
156	Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack}, Lang: TestLang, NewHeads);
157
158	EXPECT_THAT(NewHeads,
159	UnorderedElementsAre(AllOf(state(`4`), parsedSymbol(&SemiTerminal),
160	parents({GSSNode1, GSSNode2})),
161	AllOf(state(`5`), parsedSymbol(&SemiTerminal),
162	parents({GSSNode3}))))
163	<< NewHeads;
164	}
165
166	TEST_F(GLRTest, ReduceConflictsSplitting) {
167	// Before (splitting due to R/R conflict):
168	// 0--1(IDENTIFIER)
169	// After reducing 1 by `class-name := IDENTIFIER` and
170	// `enum-name := IDENTIFIER`:
171	// 0--2(class-name) // 2 is goto(0, class-name)
172	// └--3(enum-name) // 3 is goto(0, enum-name)
173	buildGrammar(Nonterminals: {"class-name", "enum-name"},
174	Rules: {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
175	LRTable::Builder B(TestLang.G);
176	B.Transition [{StateID{`0`}, id(Name: "class-name")}] = StateID{`2`};
177	B.Transition [{StateID{`0`}, id(Name: "enum-name")}] = StateID{`3`};
178	B.Reduce [StateID{`1`}].insert(V: ruleFor(NonterminalName: "class-name"));
179	B.Reduce [StateID{`1`}].insert(V: ruleFor(NonterminalName: "enum-name"));
180	TestLang.Table = std::move(B).build();
181
182	const auto *GSSNode0 =
183	GSStack.addNode(/State=/`0`, /ForestNode=/Symbol: nullptr, /Parents=/{});
184	const auto *GSSNode1 =
185	GSStack.addNode(State: `1`, Symbol: &Arena.createTerminal(TK: tok::identifier, Start: `0`), Parents: {GSSNode0});
186
187	std::vector<const GSS::Node *> Heads = {GSSNode1};
188	glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::eof),
189	Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack}, Lang: TestLang);
190	EXPECT_THAT(Heads, UnorderedElementsAre(
191	GSSNode1,
192	AllOf(state(`2`), parsedSymbolID(id("class-name")),
193	parents({GSSNode0})),
194	AllOf(state(`3`), parsedSymbolID(id("enum-name")),
195	parents({GSSNode0}))))
196	<< Heads;
197	}
198
199	TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) {
200	// Before (splitting due to multiple bases):
201	// 2(class-name)--4()*
202	// 3(enum-name)---┘
203	// After reducing 4 by `ptr-operator := `:*
204	// 2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator)
205	// 3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator)
206	buildGrammar(Nonterminals: {"ptr-operator", "class-name", "enum-name"},
207	Rules: {"ptr-operator := *"});
208
209	auto ClassNameNode = &Arena.createOpaque(SID: id(Name: "class-name"), /TokenIndex=/*Start: `0`);
210	auto EnumNameNode = &Arena.createOpaque(SID: id(Name: "enum-name"), /TokenIndex=/*Start: `0`);
211
212	const auto *GSSNode2 =
213	GSStack.addNode(/State=/`2`, /ForestNode=/Symbol: ClassNameNode, /Parents=/{});
214	const auto *GSSNode3 =
215	GSStack.addNode(/State=/`3`, /ForestNode=/Symbol: EnumNameNode, /Parents=/{});
216	const auto *GSSNode4 = GSStack.addNode(
217	/State=/`4`, Symbol: &Arena.createTerminal(TK: tok::star, /TokenIndex=/Start: `1`),
218	/Parents=/{GSSNode2, GSSNode3});
219
220	LRTable::Builder B(TestLang.G);
221	B.Transition [{StateID{`2`}, id(Name: "ptr-operator")}] = StateID{`5`};
222	B.Transition [{StateID{`3`}, id(Name: "ptr-operator")}] = StateID{`6`};
223	B.Reduce [StateID{`4`}].insert(V: ruleFor(NonterminalName: "ptr-operator"));
224	TestLang.Table = std::move(B).build();
225
226	std::vector<const GSS::Node *> Heads = {GSSNode4};
227	glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::eof), Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack},
228	Lang: TestLang);
229
230	EXPECT_THAT(Heads, UnorderedElementsAre(
231	GSSNode4,
232	AllOf(state(`5`), parsedSymbolID(id("ptr-operator")),
233	parents({GSSNode2})),
234	AllOf(state(`6`), parsedSymbolID(id("ptr-operator")),
235	parents({GSSNode3}))))
236	<< Heads;
237	// Verify that the payload of the two new heads is shared, only a single
238	// ptr-operator node is created in the forest.
239	EXPECT_EQ(Heads[`1`]->Payload, Heads[`2`]->Payload);
240	}
241
242	TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
243	// Before (joining due to same goto state, multiple bases):
244	// 0--1(cv-qualifier)--3(class-name)
245	// └--2(cv-qualifier)--4(enum-name)
246	// After reducing 3 by `type-name := class-name` and
247	// 4 by `type-name := enum-name`:
248	// 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and
249	// └--2(cv-qualifier)--┘ // goto(2, type-name)
250	buildGrammar(Nonterminals: {"type-name", "class-name", "enum-name", "cv-qualifier"},
251	Rules: {"type-name := class-name", "type-name := enum-name"});
252
253	auto *CVQualifierNode =
254	&Arena.createOpaque(SID: id(Name: "cv-qualifier"), /TokenIndex=/Start: `0`);
255	auto ClassNameNode = &Arena.createOpaque(SID: id(Name: "class-name"), /TokenIndex=/*Start: `1`);
256	auto EnumNameNode = &Arena.createOpaque(SID: id(Name: "enum-name"), /TokenIndex=/*Start: `1`);
257
258	const auto *GSSNode0 =
259	GSStack.addNode(/State=/`0`, /ForestNode=/Symbol: nullptr, /Parents=/{});
260	const auto *GSSNode1 = GSStack.addNode(
261	/State=/`1`, /ForestNode=/Symbol: CVQualifierNode, /Parents=/{GSSNode0});
262	const auto *GSSNode2 = GSStack.addNode(
263	/State=/`2`, /ForestNode=/Symbol: CVQualifierNode, /Parents=/{GSSNode0});
264	const auto *GSSNode3 = GSStack.addNode(
265	/State=/`3`, /ForestNode=/Symbol: ClassNameNode,
266	/Parents=/{GSSNode1});
267	const auto *GSSNode4 =
268	GSStack.addNode(/State=/`4`, /ForestNode=/Symbol: EnumNameNode,
269	/Parents=/{GSSNode2});
270
271	// FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
272	LRTable::Builder B(TestLang.G);
273	B.Transition [{StateID{`1`}, id(Name: "type-name")}] = StateID{`5`};
274	B.Transition [{StateID{`2`}, id(Name: "type-name")}] = StateID{`5`};
275	B.Reduce [StateID{`3`}].insert(/ type-name := class-name / V: RuleID{`0`});
276	B.Reduce [StateID{`4`}].insert(/ type-name := enum-name / V: RuleID{`1`});
277	TestLang.Table = std::move(B).build();
278
279	std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
280	glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::eof), Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack},
281	Lang: TestLang);
282
283	// Verify that the stack heads are joint at state 5 after reduces.
284	EXPECT_THAT(Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
285	AllOf(state(`5`),
286	parsedSymbolID(id("type-name")),
287	parents({GSSNode1, GSSNode2}))))
288	<< Heads;
289	// Verify that we create an ambiguous ForestNode of two parses of `type-name`.
290	EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
291	"[ 1, end) type-name := <ambiguous>\n"
292	"[ 1, end) ├─type-name := class-name\n"
293	"[ 1, end) │ └─class-name := <opaque>\n"
294	"[ 1, end) └─type-name := enum-name\n"
295	"[ 1, end) └─enum-name := <opaque>\n");
296	}
297
298	TEST_F(GLRTest, ReduceJoiningWithSameBase) {
299	// Before (joining due to same goto state, the same base):
300	// 0--1(class-name)--3()*
301	// └--2(enum-name)--4()*
302	// After reducing 3 by `pointer := class-name ` and*
303	// 2 by `pointer := enum-name `:*
304	// 0--5(pointer) // 5 is goto(0, pointer)
305	buildGrammar(Nonterminals: {"pointer", "class-name", "enum-name"},
306	Rules: {"pointer := class-name ", "pointer := enum-name "});
307
308	auto ClassNameNode = &Arena.createOpaque(SID: id(Name: "class-name"), /TokenIndex=/*Start: `0`);
309	auto EnumNameNode = &Arena.createOpaque(SID: id(Name: "enum-name"), /TokenIndex=/*Start: `0`);
310	auto StartTerminal = &Arena.createTerminal(TK: tok::star, /TokenIndex=/*Start: `1`);
311
312	const auto *GSSNode0 =
313	GSStack.addNode(/State=/`0`, /ForestNode=/Symbol: nullptr, /Parents=/{});
314	const auto *GSSNode1 =
315	GSStack.addNode(/State=/`1`, /ForestNode=/Symbol: ClassNameNode,
316	/Parents=/{GSSNode0});
317	const auto *GSSNode2 =
318	GSStack.addNode(/State=/`2`, /ForestNode=/Symbol: EnumNameNode,
319	/Parents=/{GSSNode0});
320	const auto *GSSNode3 =
321	GSStack.addNode(/State=/`3`, /ForestNode=/Symbol: StartTerminal,
322	/Parents=/{GSSNode1});
323	const auto *GSSNode4 =
324	GSStack.addNode(/State=/`4`, /ForestNode=/Symbol: StartTerminal,
325	/Parents=/{GSSNode2});
326
327	// FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
328	LRTable::Builder B(TestLang.G);
329	B.Transition [{StateID{`0`}, id(Name: "pointer")}] = StateID{`5`};
330	B.Reduce [StateID{`3`}].insert(/ pointer := class-name / V: RuleID{`0`});
331	B.Reduce [StateID{`4`}].insert(/ pointer := enum-name / V: RuleID{`1`});
332	TestLang.Table = std::move(B).build();
333
334	std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
335	glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::eof),
336	Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack}, Lang: TestLang);
337
338	EXPECT_THAT(
339	Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
340	AllOf(state(`5`), parsedSymbolID(id("pointer")),
341	parents({GSSNode0}))))
342	<< Heads;
343	EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
344	"[ 0, end) pointer := <ambiguous>\n"
345	"[ 0, end) ├─pointer := class-name *\n"
346	"[ 0, 1) │ ├─class-name := <opaque>\n"
347	"[ 1, end) │ └─* := tok[1]\n"
348	"[ 0, end) └─pointer := enum-name *\n"
349	"[ 0, 1) ├─enum-name := <opaque>\n"
350	"[ 1, end) └─* := tok[1]\n");
351	}
352
353	TEST_F(GLRTest, ReduceLookahead) {
354	// A term can be followed by +, but not by -.
355	buildGrammar(Nonterminals: {"sum", "term"}, Rules: {"expr := term + term", "term := IDENTIFIER"});
356	LRTable::Builder B(TestLang.G);
357	B.Transition [{StateID{`0`}, id(Name: "term")}] = StateID{`2`};
358	B.Reduce [StateID{`1`}].insert(V: RuleID{`0`});
359	TestLang.Table = std::move(B).build();
360
361	auto Identifier = &Arena.createTerminal(TK: tok::identifier, /Start=/*`0`);
362
363	const auto *Root =
364	GSStack.addNode(/State=/`0`, /ForestNode=/Symbol: nullptr, /Parents=/{});
365	const auto *GSSNode1 =
366	GSStack.addNode(/State=/`1`, /ForestNode=/Symbol: Identifier, Parents: {Root});
367
368	// When the lookahead is +, reduce is performed.
369	std::vector<const GSS::Node *> Heads = {GSSNode1};
370	glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::plus), Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack},
371	Lang: TestLang);
372	EXPECT_THAT(Heads,
373	ElementsAre(GSSNode1, AllOf(state(`2`), parsedSymbolID(id("term")),
374	parents(Root))));
375
376	// When the lookahead is -, reduce is not performed.
377	Heads = {GSSNode1};
378	glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::minus),
379	Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack}, Lang: TestLang);
380	EXPECT_THAT(Heads, ElementsAre(GSSNode1));
381	}
382
383	TEST_F(GLRTest, Recover) {
384	// Recovery while parsing "word" inside braces.
385	// Before:
386	// 0--1({)--2(?)
387	// After recovering a `word` at state 1:
388	// 0--3(word) // 3 is goto(1, word)
389	buildGrammar(Nonterminals: {"word", "top"}, Rules: {"top := { word [recover=Braces] }"});
390	LRTable::Builder B(TestLang.G);
391	B.Transition [{StateID{`1`}, id(Name: "word")}] = StateID{`3`};
392	B.Recoveries.push_back(x: {StateID{`1`}, {.Strategy: extensionID(AttrValueName: "Braces"), .Result: id(Name: "word")}});
393	TestLang.Table = std::move(B).build();
394	TestLang.RecoveryStrategies.try_emplace(Key: extensionID(AttrValueName: "Braces"), Args&: recoverBraces);
395
396	auto *LBrace = &Arena.createTerminal(TK: tok::l_brace, Start: `0`);
397	auto *Question1 = &Arena.createTerminal(TK: tok::question, Start: `1`);
398	const auto Root = GSStack.addNode(State: `0`, Symbol: nullptr*, Parents: {});
399	const auto *OpenedBraces = GSStack.addNode(State: `1`, Symbol: LBrace, Parents: {Root});
400	const auto *AfterQuestion1 = GSStack.addNode(State: `2`, Symbol: Question1, Parents: {OpenedBraces});
401
402	// Need a token stream with paired braces so the strategy works.
403	clang::LangOptions LOptions;
404	TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
405	pairBrackets(Tokens);
406	std::vector<const GSS::Node *> NewHeads;
407
408	unsigned TokenIndex = `2`;
409	glrRecover(OldHeads: {AfterQuestion1}, TokenIndex, Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, Lang: TestLang,
410	NewHeads);
411	EXPECT_EQ(TokenIndex, `4u`) << "should skip ahead to matching brace";
412	EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(`3`), parsedSymbolID(id("word")),
413	parents({OpenedBraces}), start(`1u`))));
414	EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
415
416	// Test recovery failure: omit closing brace so strategy fails
417	TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
418	pairBrackets(NoRBrace);
419	NewHeads.clear();
420	TokenIndex = `2`;
421	glrRecover(OldHeads: {AfterQuestion1}, TokenIndex, Params: {.Code: NoRBrace, .Forest: Arena, .GSStack: GSStack}, Lang: TestLang,
422	NewHeads);
423	EXPECT_EQ(TokenIndex, `2u`) << "should not advance on failure";
424	EXPECT_THAT(NewHeads, IsEmpty());
425	}
426
427	TEST_F(GLRTest, RecoverRightmost) {
428	// In a nested block structure, we recover at the innermost possible block.
429	// Before:
430	// 0--1({)--1({)--1({)
431	// After recovering a `block` at inside the second braces:
432	// 0--1({)--2(body) // 2 is goto(1, body)
433	buildGrammar(Nonterminals: {"body", "top"}, Rules: {"top := { body [recover=Braces] }"});
434	LRTable::Builder B(TestLang.G);
435	B.Transition [{StateID{`1`}, id(Name: "body")}] = StateID{`2`};
436	B.Recoveries.push_back(x: {StateID{`1`}, {.Strategy: extensionID(AttrValueName: "Braces"), .Result: id(Name: "body")}});
437	TestLang.Table = std::move(B).build();
438	TestLang.RecoveryStrategies.try_emplace(Key: extensionID(AttrValueName: "Braces"), Args&: recoverBraces);
439
440	clang::LangOptions LOptions;
441	// Innermost brace is unmatched, to test fallback to next brace.
442	TokenStream Tokens = cook(lex("{ { { ? } }", LOptions), LOptions);
443	Tokens.tokens()[`0`].Pair = `5`;
444	Tokens.tokens()[`1`].Pair = `4`;
445	Tokens.tokens()[`4`].Pair = `1`;
446	Tokens.tokens()[`5`].Pair = `0`;
447
448	auto *Brace1 = &Arena.createTerminal(TK: tok::l_brace, Start: `0`);
449	auto *Brace2 = &Arena.createTerminal(TK: tok::l_brace, Start: `1`);
450	auto *Brace3 = &Arena.createTerminal(TK: tok::l_brace, Start: `2`);
451	const auto Root = GSStack.addNode(State: `0`, Symbol: nullptr*, Parents: {});
452	const auto *In1 = GSStack.addNode(State: `1`, Symbol: Brace1, Parents: {Root});
453	const auto *In2 = GSStack.addNode(State: `1`, Symbol: Brace2, Parents: {In1});
454	const auto *In3 = GSStack.addNode(State: `1`, Symbol: Brace3, Parents: {In2});
455
456	unsigned TokenIndex = `3`;
457	std::vector<const GSS::Node *> NewHeads;
458	glrRecover(OldHeads: {In3}, TokenIndex, Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, Lang: TestLang, NewHeads);
459	EXPECT_EQ(TokenIndex, `5u`);
460	EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(`2`), parsedSymbolID(id("body")),
461	parents({In2}), start(`2u`))));
462	}
463
464	TEST_F(GLRTest, RecoverAlternatives) {
465	// Recovery inside braces with multiple equally good options
466	// Before:
467	// 0--1({)
468	// After recovering either `word` or `number` inside the braces:
469	// 0--1({)--2(word) // 2 is goto(1, word)
470	// └--3(number) // 3 is goto(1, number)
471	buildGrammar(Nonterminals: {"number", "word", "top"},
472	Rules: {
473	"top := { number [recover=Braces] }",
474	"top := { word [recover=Braces] }",
475	});
476	LRTable::Builder B(TestLang.G);
477	B.Transition [{StateID{`1`}, id(Name: "number")}] = StateID{`2`};
478	B.Transition [{StateID{`1`}, id(Name: "word")}] = StateID{`3`};
479	B.Recoveries.push_back(x: {StateID{`1`}, {.Strategy: extensionID(AttrValueName: "Braces"), .Result: id(Name: "number")}});
480	B.Recoveries.push_back(x: {StateID{`1`}, {.Strategy: extensionID(AttrValueName: "Braces"), .Result: id(Name: "word")}});
481	TestLang.RecoveryStrategies.try_emplace(Key: extensionID(AttrValueName: "Braces"), Args&: recoverBraces);
482	TestLang.Table = std::move(B).build();
483	auto *LBrace = &Arena.createTerminal(TK: tok::l_brace, Start: `0`);
484	const auto Root = GSStack.addNode(State: `0`, Symbol: nullptr*, Parents: {});
485	const auto *OpenedBraces = GSStack.addNode(State: `1`, Symbol: LBrace, Parents: {Root});
486
487	clang::LangOptions LOptions;
488	TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
489	pairBrackets(Tokens);
490	std::vector<const GSS::Node *> NewHeads;
491	unsigned TokenIndex = `1`;
492
493	glrRecover(OldHeads: {OpenedBraces}, TokenIndex, Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, Lang: TestLang,
494	NewHeads);
495	EXPECT_EQ(TokenIndex, `2u`);
496	EXPECT_THAT(NewHeads,
497	UnorderedElementsAre(AllOf(state(`2`), parsedSymbolID(id("number")),
498	parents({OpenedBraces}), start(`1u`)),
499	AllOf(state(`3`), parsedSymbolID(id("word")),
500	parents({OpenedBraces}), start(`1u`))));
501	}
502
503	TEST_F(GLRTest, PerfectForestNodeSharing) {
504	// Run the GLR on a simple grammar and test that we build exactly one forest
505	// node per (SymbolID, token range).
506
507	// This is a grmammar where the original parsing-stack-based forest node
508	// sharing approach will fail. In its LR0 graph, it has two states containing
509	// item `expr := • IDENTIFIER`, and both have different goto states on the
510	// nonterminal `expr`.
511	build(GrammarBNF: R"bnf(
512	_ := test EOF
513
514	test := { expr
515	test := { IDENTIFIER
516	test := left-paren expr
517	left-paren := {
518	expr := IDENTIFIER
519	)bnf");
520	TestLang.Table = LRTable::buildSLR(G: TestLang.G);
521	clang::LangOptions LOptions;
522	const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);
523
524	const ForestNode &Parsed =
525	glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "test"), Lang: TestLang);
526	// Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
527	// in the forest, see the `#1` and `=#1` in the dump string.
528	EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
529	"[ 0, end) test := <ambiguous>\n"
530	"[ 0, end) ├─test := { expr\n"
531	"[ 0, 1) │ ├─{ := tok[0]\n"
532	"[ 1, end) │ └─expr := IDENTIFIER #1\n"
533	"[ 1, end) │ └─IDENTIFIER := tok[1]\n"
534	"[ 0, end) ├─test := { IDENTIFIER\n"
535	"[ 0, 1) │ ├─{ := tok[0]\n"
536	"[ 1, end) │ └─IDENTIFIER := tok[1]\n"
537	"[ 0, end) └─test := left-paren expr\n"
538	"[ 0, 1) ├─left-paren := {\n"
539	"[ 0, 1) │ └─{ := tok[0]\n"
540	"[ 1, end) └─expr =#1\n");
541	}
542
543	TEST_F(GLRTest, GLRReduceOrder) {
544	// Given the following grammar, and the input `IDENTIFIER`, reductions should
545	// be performed in the following order:
546	// 1. foo := IDENTIFIER
547	// 2. { test := IDENTIFIER, test := foo }
548	// foo should be reduced first, so that in step 2 we have completed reduces
549	// for test, and form an ambiguous forest node.
550	build(GrammarBNF: R"bnf(
551	_ := test EOF
552
553	test := IDENTIFIER
554	test := foo
555	foo := IDENTIFIER
556	)bnf");
557	clang::LangOptions LOptions;
558	const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
559	TestLang.Table = LRTable::buildSLR(G: TestLang.G);
560
561	const ForestNode &Parsed =
562	glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "test"), Lang: TestLang);
563	EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
564	"[ 0, end) test := <ambiguous>\n"
565	"[ 0, end) ├─test := IDENTIFIER\n"
566	"[ 0, end) │ └─IDENTIFIER := tok[0]\n"
567	"[ 0, end) └─test := foo\n"
568	"[ 0, end) └─foo := IDENTIFIER\n"
569	"[ 0, end) └─IDENTIFIER := tok[0]\n");
570	}
571
572	TEST_F(GLRTest, RecoveryEndToEnd) {
573	// Simple example of brace-based recovery showing:
574	// - recovered region includes tokens both ahead of and behind the cursor
575	// - multiple possible recovery rules
576	// - recovery from outer scopes is rejected
577	build(GrammarBNF: R"bnf(
578	_ := block EOF
579
580	block := { block [recover=Braces] }
581	block := { numbers [recover=Braces] }
582	numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
583	)bnf");
584	TestLang.Table = LRTable::buildSLR(G: TestLang.G);
585	TestLang.RecoveryStrategies.try_emplace(Key: extensionID(AttrValueName: "Braces"), Args&: recoverBraces);
586	clang::LangOptions LOptions;
587	TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
588	pairBrackets(Tokens);
589
590	const ForestNode &Parsed =
591	glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "block"), Lang: TestLang);
592	EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
593	"[ 0, end) block := { block [recover=Braces] }\n"
594	"[ 0, 1) ├─{ := tok[0]\n"
595	"[ 1, 5) ├─block := <ambiguous>\n"
596	"[ 1, 5) │ ├─block := { block [recover=Braces] }\n"
597	"[ 1, 2) │ │ ├─{ := tok[1]\n"
598	"[ 2, 4) │ │ ├─block := <opaque>\n"
599	"[ 4, 5) │ │ └─} := tok[4]\n"
600	"[ 1, 5) │ └─block := { numbers [recover=Braces] }\n"
601	"[ 1, 2) │ ├─{ := tok[1]\n"
602	"[ 2, 4) │ ├─numbers := <opaque>\n"
603	"[ 4, 5) │ └─} := tok[4]\n"
604	"[ 5, end) └─} := tok[5]\n");
605	}
606
607	TEST_F(GLRTest, RecoverTerminal) {
608	build(GrammarBNF: R"bnf(
609	_ := stmt EOF
610
611	stmt := IDENTIFIER ; [recover=Skip]
612	)bnf");
613	TestLang.Table = LRTable::buildSLR(G: TestLang.G);
614	TestLang.RecoveryStrategies.try_emplace(
615	Key: extensionID(AttrValueName: "Skip"),
616	Args: [](Token::Index Start, const TokenStream &) { return Start; });
617	clang::LangOptions LOptions;
618	TokenStream Tokens = cook(lex("foo", LOptions), LOptions);
619
620	const ForestNode &Parsed =
621	glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "stmt"), Lang: TestLang);
622	EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
623	"[ 0, end) stmt := IDENTIFIER ; [recover=Skip]\n"
624	"[ 0, 1) ├─IDENTIFIER := tok[0]\n"
625	"[ 1, end) └─; := <opaque>\n");
626	}
627
628	TEST_F(GLRTest, RecoverUnrestrictedReduce) {
629	// Here, ! is not in any rule and therefore not in the follow set of `word`.
630	// We would not normally reduce `word := IDENTIFIER`, but do so for recovery.
631
632	build(GrammarBNF: R"bnf(
633	_ := sentence EOF
634
635	word := IDENTIFIER
636	sentence := word word [recover=AcceptAnyTokenInstead]
637	)bnf");
638
639	clang::LangOptions LOptions;
640	const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions);
641	TestLang.Table = LRTable::buildSLR(G: TestLang.G);
642	TestLang.RecoveryStrategies.try_emplace(
643	Key: extensionID(AttrValueName: "AcceptAnyTokenInstead"),
644	Args: [](Token::Index Start, const TokenStream &Stream) { return Start + `1`; });
645
646	const ForestNode &Parsed =
647	glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "sentence"), Lang: TestLang);
648	EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
649	"[ 0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n"
650	"[ 0, 1) ├─word := IDENTIFIER\n"
651	"[ 0, 1) │ └─IDENTIFIER := tok[0]\n"
652	"[ 1, end) └─word := <opaque>\n");
653	}
654
655	TEST_F(GLRTest, RecoveryFromStartOfInput) {
656	build(GrammarBNF: R"bnf(
657	_ := start [recover=Fallback] EOF
658
659	start := IDENTIFIER
660	)bnf");
661	TestLang.Table = LRTable::buildSLR(G: TestLang.G);
662	bool fallback_recovered = false;
663	auto fallback = [&](Token::Index Start, const TokenStream & Code) {
664	fallback_recovered = true;
665	return Code.tokens().size();
666	};
667	TestLang.RecoveryStrategies.try_emplace(
668	Key: extensionID(AttrValueName: "Fallback"),
669	Args&: fallback);
670	clang::LangOptions LOptions;
671	TokenStream Tokens = cook(lex("?", LOptions), LOptions);
672
673	const ForestNode &Parsed =
674	glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "start"), Lang: TestLang);
675	EXPECT_TRUE(fallback_recovered);
676	EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
677	"[ 0, end) start := <opaque>\n");
678	}
679
680	TEST_F(GLRTest, RepeatedRecovery) {
681	// We require multiple steps of recovery at eof and then a reduction in order
682	// to successfully parse.
683	build(GrammarBNF: R"bnf(
684	_ := function EOF
685	# FIXME: this forces EOF to be in follow(signature).
686	# Remove it once we use unconstrained reduction for recovery.
687	_ := signature EOF
688
689	function := signature body [recover=Skip]
690	signature := IDENTIFIER params [recover=Skip]
691	params := ( )
692	body := { }
693	)bnf");
694	TestLang.Table = LRTable::buildSLR(G: TestLang.G);
695	TestLang.RecoveryStrategies.try_emplace(
696	Key: extensionID(AttrValueName: "Skip"),
697	Args: [](Token::Index Start, const TokenStream &) { return Start; });
698	clang::LangOptions LOptions;
699	TokenStream Tokens = cook(lex("main", LOptions), LOptions);
700
701	const ForestNode &Parsed =
702	glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "function"), Lang: TestLang);
703	EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
704	"[ 0, end) function := signature body [recover=Skip]\n"
705	"[ 0, 1) ├─signature := IDENTIFIER params [recover=Skip]\n"
706	"[ 0, 1) │ ├─IDENTIFIER := tok[0]\n"
707	"[ 1, 1) │ └─params := <opaque>\n"
708	"[ 1, end) └─body := <opaque>\n");
709	}
710
711	TEST_F(GLRTest, NoExplicitAccept) {
712	build(GrammarBNF: R"bnf(
713	_ := test EOF
714
715	test := IDENTIFIER test
716	test := IDENTIFIER
717	)bnf");
718	clang::LangOptions LOptions;
719	// Given the following input, and the grammar above, we perform two reductions
720	// of the nonterminal `test` when the next token is `eof`, verify that the
721	// parser stops at the right state.
722	const TokenStream &Tokens = cook(lex("id id", LOptions), LOptions);
723	TestLang.Table = LRTable::buildSLR(G: TestLang.G);
724
725	const ForestNode &Parsed =
726	glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "test"), Lang: TestLang);
727	EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
728	"[ 0, end) test := IDENTIFIER test\n"
729	"[ 0, 1) ├─IDENTIFIER := tok[0]\n"
730	"[ 1, end) └─test := IDENTIFIER\n"
731	"[ 1, end) └─IDENTIFIER := tok[1]\n");
732	}
733
734	TEST_F(GLRTest, GuardExtension) {
735	build(GrammarBNF: R"bnf(
736	_ := start EOF
737
738	start := IDENTIFIER [guard]
739	)bnf");
740	TestLang.Guards.try_emplace(
741	Key: ruleFor(NonterminalName: "start"), Args: [&](const GuardParams &P) {
742	assert(P.RHS.size() == `1` &&
743	P.RHS.front()->symbol() ==
744	tokenSymbol(clang::tok::identifier));
745	return P.Tokens.tokens()[P.RHS.front()->startTokenIndex()]
746	.text() == "test";
747	});
748	clang::LangOptions LOptions;
749	TestLang.Table = LRTable::buildSLR(G: TestLang.G);
750
751	std::string Input = "test";
752	const TokenStream &Succeeded = cook(lex(Input, LOptions), LOptions);
753	EXPECT_EQ(glrParse({Succeeded, Arena, GSStack}, id("start"), TestLang)
754	.dumpRecursive(TestLang.G),
755	"[ 0, end) start := IDENTIFIER [guard]\n"
756	"[ 0, end) └─IDENTIFIER := tok[0]\n");
757
758	Input = "notest";
759	const TokenStream &Failed = cook(lex(Input, LOptions), LOptions);
760	EXPECT_EQ(glrParse({Failed, Arena, GSStack}, id("start"), TestLang)
761	.dumpRecursive(TestLang.G),
762	"[ 0, end) start := <opaque>\n");
763	}
764
765	TEST(GSSTest, GC) {
766	// ┌-A-┬-AB
767	// ├-B-┘
768	// Root-+-C
769	// ├-D
770	// └-E
771	GSS GSStack;
772	auto Root = GSStack.addNode(State: `0`, Symbol: nullptr*, Parents: {});
773	auto A = GSStack.addNode(State: `0`, Symbol: nullptr*, Parents: {Root});
774	auto B = GSStack.addNode(State: `0`, Symbol: nullptr*, Parents: {Root});
775	auto C = GSStack.addNode(State: `0`, Symbol: nullptr*, Parents: {Root});
776	auto D = GSStack.addNode(State: `0`, Symbol: nullptr*, Parents: {Root});
777	auto AB = GSStack.addNode(State: `0`, Symbol: nullptr*, Parents: {A, B});
778
779	EXPECT_EQ(`1u`, GSStack.gc({AB, C})) << "D is destroyed";
780	EXPECT_EQ(`0u`, GSStack.gc({AB, C})) << "D is already gone";
781	auto E = GSStack.addNode(State: `0`, Symbol: nullptr*, Parents: {Root});
782	EXPECT_EQ(D, E) << "Storage of GCed node D is reused for E";
783	EXPECT_EQ(`3u`, GSStack.gc({A, E})) << "Destroys B, AB, C";
784	EXPECT_EQ(`1u`, GSStack.gc({E})) << "Destroys A";
785	}
786
787	} // namespace
788	} // namespace pseudo
789	} // namespace clang
790

source code of clang-tools-extra/pseudo/unittests/GLRTest.cpp