1//===--- GLRTest.cpp - Test the GLR parser ----------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "clang-pseudo/GLR.h"
10#include "clang-pseudo/Bracket.h"
11#include "clang-pseudo/Language.h"
12#include "clang-pseudo/Token.h"
13#include "clang-pseudo/grammar/Grammar.h"
14#include "clang/Basic/LangOptions.h"
15#include "clang/Basic/TokenKinds.h"
16#include "llvm/ADT/StringExtras.h"
17#include "llvm/Support/FormatVariadic.h"
18#include "gmock/gmock.h"
19#include "gtest/gtest.h"
20#include <memory>
21
22namespace clang {
23namespace pseudo {
24
25llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
26 const std::vector<const GSS::Node *> &Heads) {
27 for (const auto *Head : Heads)
28 OS << *Head << "\n";
29 return OS;
30}
31
32namespace {
33
34using StateID = LRTable::StateID;
35using testing::AllOf;
36using testing::ElementsAre;
37using testing::IsEmpty;
38using testing::UnorderedElementsAre;
39
40MATCHER_P(state, StateID, "") { return arg->State == StateID; }
41MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
42MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
43MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
44
45testing::Matcher<const GSS::Node *>
46parents(llvm::ArrayRef<const GSS::Node *> Parents) {
47 return testing::Property(property: &GSS::Node::parents,
48 matcher: testing::UnorderedElementsAreArray(container: Parents));
49}
50
51Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) {
52 EXPECT_GT(Begin, 0u);
53 const Token &Left = Code.tokens()[Begin - 1];
54 EXPECT_EQ(Left.Kind, tok::l_brace);
55 if (const auto* Right = Left.pair()) {
56 EXPECT_EQ(Right->Kind, tok::r_brace);
57 return Code.index(T: *Right);
58 }
59 return Token::Invalid;
60}
61
62class GLRTest : public ::testing::Test {
63public:
64 void build(llvm::StringRef GrammarBNF) {
65 std::vector<std::string> Diags;
66 TestLang.G = Grammar::parseBNF(BNF: GrammarBNF, Diags);
67 }
68
69 TokenStream emptyTokenStream() {
70 TokenStream Empty;
71 Empty.finalize();
72 return Empty;
73 }
74
75 void buildGrammar(std::vector<std::string> Nonterminals,
76 std::vector<std::string> Rules) {
77 Nonterminals.push_back(x: "_");
78 llvm::sort(C&: Nonterminals);
79 Nonterminals.erase(first: std::unique(first: Nonterminals.begin(), last: Nonterminals.end()),
80 last: Nonterminals.end());
81 std::string FakeTestBNF;
82 for (const auto &NT : Nonterminals)
83 FakeTestBNF += llvm::formatv(Fmt: "{0} := {1}\n", Vals: "_", Vals: NT);
84 FakeTestBNF += llvm::join(R&: Rules, Separator: "\n");
85 build(GrammarBNF: FakeTestBNF);
86 }
87
88 SymbolID id(llvm::StringRef Name) const {
89 for (unsigned I = 0; I < NumTerminals; ++I)
90 if (TestLang.G.table().Terminals[I] == Name)
91 return tokenSymbol(TK: static_cast<tok::TokenKind>(I));
92 for (SymbolID ID = 0; ID < TestLang.G.table().Nonterminals.size(); ++ID)
93 if (TestLang.G.table().Nonterminals[ID].Name == Name)
94 return ID;
95 ADD_FAILURE() << "No such symbol found: " << Name;
96 return 0;
97 }
98 ExtensionID extensionID(llvm::StringRef AttrValueName) const {
99 for (ExtensionID EID = 0; EID < TestLang.G.table().AttributeValues.size();
100 ++EID)
101 if (TestLang.G.table().AttributeValues[EID] == AttrValueName)
102 return EID;
103 ADD_FAILURE() << "No such attribute value found: " << AttrValueName;
104 return 0;
105 }
106
107 RuleID ruleFor(llvm::StringRef NonterminalName) const {
108 auto RuleRange =
109 TestLang.G.table().Nonterminals[id(Name: NonterminalName)].RuleRange;
110 if (RuleRange.End - RuleRange.Start == 1)
111 return TestLang.G.table()
112 .Nonterminals[id(Name: NonterminalName)]
113 .RuleRange.Start;
114 ADD_FAILURE() << "Expected a single rule for " << NonterminalName
115 << ", but it has " << RuleRange.End - RuleRange.Start
116 << " rule!\n";
117 return 0;
118 }
119
120protected:
121 Language TestLang;
122 ForestArena Arena;
123 GSS GSStack;
124};
125
126TEST_F(GLRTest, ShiftMergingHeads) {
127 // Given a test case where we have two heads 1, 2, 3 in the GSS, the heads 1,
128 // 2 have shift actions to reach state 4, and the head 3 has a shift action to
129 // reach state 5:
130 // 0--1
131 // └--2
132 // └--3
133 // After the shift action, the GSS (with new heads 4, 5) is:
134 // 0---1---4
135 // └---2---┘
136 // └---3---5
137 auto *GSSNode0 =
138 GSStack.addNode(/*State=*/0, /*ForestNode=*/Symbol: nullptr, /*Parents=*/{});
139 auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/Symbol: nullptr,
140 /*Parents=*/{GSSNode0});
141 auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/Symbol: nullptr,
142 /*Parents=*/{GSSNode0});
143 auto *GSSNode3 = GSStack.addNode(/*State=*/3, /*ForestNode=*/Symbol: nullptr,
144 /*Parents=*/{GSSNode0});
145
146 buildGrammar(Nonterminals: {}, Rules: {}); // Create a fake empty grammar.
147 LRTable::Builder B(TestLang.G);
148 B.Transition[{StateID{1}, tokenSymbol(TK: tok::semi)}] = StateID{4};
149 B.Transition[{StateID{2}, tokenSymbol(TK: tok::semi)}] = StateID{4};
150 B.Transition[{StateID{3}, tokenSymbol(TK: tok::semi)}] = StateID{5};
151 TestLang.Table = std::move(B).build();
152
153 ForestNode &SemiTerminal = Arena.createTerminal(TK: tok::semi, Start: 0);
154 std::vector<const GSS::Node *> NewHeads;
155 glrShift(OldHeads: {GSSNode1, GSSNode2, GSSNode3}, NextTok: SemiTerminal,
156 Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack}, Lang: TestLang, NewHeads);
157
158 EXPECT_THAT(NewHeads,
159 UnorderedElementsAre(AllOf(state(4), parsedSymbol(&SemiTerminal),
160 parents({GSSNode1, GSSNode2})),
161 AllOf(state(5), parsedSymbol(&SemiTerminal),
162 parents({GSSNode3}))))
163 << NewHeads;
164}
165
166TEST_F(GLRTest, ReduceConflictsSplitting) {
167 // Before (splitting due to R/R conflict):
168 // 0--1(IDENTIFIER)
169 // After reducing 1 by `class-name := IDENTIFIER` and
170 // `enum-name := IDENTIFIER`:
171 // 0--2(class-name) // 2 is goto(0, class-name)
172 // └--3(enum-name) // 3 is goto(0, enum-name)
173 buildGrammar(Nonterminals: {"class-name", "enum-name"},
174 Rules: {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
175 LRTable::Builder B(TestLang.G);
176 B.Transition[{StateID{0}, id(Name: "class-name")}] = StateID{2};
177 B.Transition[{StateID{0}, id(Name: "enum-name")}] = StateID{3};
178 B.Reduce[StateID{1}].insert(V: ruleFor(NonterminalName: "class-name"));
179 B.Reduce[StateID{1}].insert(V: ruleFor(NonterminalName: "enum-name"));
180 TestLang.Table = std::move(B).build();
181
182 const auto *GSSNode0 =
183 GSStack.addNode(/*State=*/0, /*ForestNode=*/Symbol: nullptr, /*Parents=*/{});
184 const auto *GSSNode1 =
185 GSStack.addNode(State: 1, Symbol: &Arena.createTerminal(TK: tok::identifier, Start: 0), Parents: {GSSNode0});
186
187 std::vector<const GSS::Node *> Heads = {GSSNode1};
188 glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::eof),
189 Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack}, Lang: TestLang);
190 EXPECT_THAT(Heads, UnorderedElementsAre(
191 GSSNode1,
192 AllOf(state(2), parsedSymbolID(id("class-name")),
193 parents({GSSNode0})),
194 AllOf(state(3), parsedSymbolID(id("enum-name")),
195 parents({GSSNode0}))))
196 << Heads;
197}
198
199TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) {
200 // Before (splitting due to multiple bases):
201 // 2(class-name)--4(*)
202 // 3(enum-name)---┘
203 // After reducing 4 by `ptr-operator := *`:
204 // 2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator)
205 // 3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator)
206 buildGrammar(Nonterminals: {"ptr-operator", "class-name", "enum-name"},
207 Rules: {"ptr-operator := *"});
208
209 auto *ClassNameNode = &Arena.createOpaque(SID: id(Name: "class-name"), /*TokenIndex=*/Start: 0);
210 auto *EnumNameNode = &Arena.createOpaque(SID: id(Name: "enum-name"), /*TokenIndex=*/Start: 0);
211
212 const auto *GSSNode2 =
213 GSStack.addNode(/*State=*/2, /*ForestNode=*/Symbol: ClassNameNode, /*Parents=*/{});
214 const auto *GSSNode3 =
215 GSStack.addNode(/*State=*/3, /*ForestNode=*/Symbol: EnumNameNode, /*Parents=*/{});
216 const auto *GSSNode4 = GSStack.addNode(
217 /*State=*/4, Symbol: &Arena.createTerminal(TK: tok::star, /*TokenIndex=*/Start: 1),
218 /*Parents=*/{GSSNode2, GSSNode3});
219
220 LRTable::Builder B(TestLang.G);
221 B.Transition[{StateID{2}, id(Name: "ptr-operator")}] = StateID{5};
222 B.Transition[{StateID{3}, id(Name: "ptr-operator")}] = StateID{6};
223 B.Reduce[StateID{4}].insert(V: ruleFor(NonterminalName: "ptr-operator"));
224 TestLang.Table = std::move(B).build();
225
226 std::vector<const GSS::Node *> Heads = {GSSNode4};
227 glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::eof), Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack},
228 Lang: TestLang);
229
230 EXPECT_THAT(Heads, UnorderedElementsAre(
231 GSSNode4,
232 AllOf(state(5), parsedSymbolID(id("ptr-operator")),
233 parents({GSSNode2})),
234 AllOf(state(6), parsedSymbolID(id("ptr-operator")),
235 parents({GSSNode3}))))
236 << Heads;
237 // Verify that the payload of the two new heads is shared, only a single
238 // ptr-operator node is created in the forest.
239 EXPECT_EQ(Heads[1]->Payload, Heads[2]->Payload);
240}
241
242TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
243 // Before (joining due to same goto state, multiple bases):
244 // 0--1(cv-qualifier)--3(class-name)
245 // └--2(cv-qualifier)--4(enum-name)
246 // After reducing 3 by `type-name := class-name` and
247 // 4 by `type-name := enum-name`:
248 // 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and
249 // └--2(cv-qualifier)--┘ // goto(2, type-name)
250 buildGrammar(Nonterminals: {"type-name", "class-name", "enum-name", "cv-qualifier"},
251 Rules: {"type-name := class-name", "type-name := enum-name"});
252
253 auto *CVQualifierNode =
254 &Arena.createOpaque(SID: id(Name: "cv-qualifier"), /*TokenIndex=*/Start: 0);
255 auto *ClassNameNode = &Arena.createOpaque(SID: id(Name: "class-name"), /*TokenIndex=*/Start: 1);
256 auto *EnumNameNode = &Arena.createOpaque(SID: id(Name: "enum-name"), /*TokenIndex=*/Start: 1);
257
258 const auto *GSSNode0 =
259 GSStack.addNode(/*State=*/0, /*ForestNode=*/Symbol: nullptr, /*Parents=*/{});
260 const auto *GSSNode1 = GSStack.addNode(
261 /*State=*/1, /*ForestNode=*/Symbol: CVQualifierNode, /*Parents=*/{GSSNode0});
262 const auto *GSSNode2 = GSStack.addNode(
263 /*State=*/2, /*ForestNode=*/Symbol: CVQualifierNode, /*Parents=*/{GSSNode0});
264 const auto *GSSNode3 = GSStack.addNode(
265 /*State=*/3, /*ForestNode=*/Symbol: ClassNameNode,
266 /*Parents=*/{GSSNode1});
267 const auto *GSSNode4 =
268 GSStack.addNode(/*State=*/4, /*ForestNode=*/Symbol: EnumNameNode,
269 /*Parents=*/{GSSNode2});
270
271 // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
272 LRTable::Builder B(TestLang.G);
273 B.Transition[{StateID{1}, id(Name: "type-name")}] = StateID{5};
274 B.Transition[{StateID{2}, id(Name: "type-name")}] = StateID{5};
275 B.Reduce[StateID{3}].insert(/* type-name := class-name */ V: RuleID{0});
276 B.Reduce[StateID{4}].insert(/* type-name := enum-name */ V: RuleID{1});
277 TestLang.Table = std::move(B).build();
278
279 std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
280 glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::eof), Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack},
281 Lang: TestLang);
282
283 // Verify that the stack heads are joint at state 5 after reduces.
284 EXPECT_THAT(Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
285 AllOf(state(5),
286 parsedSymbolID(id("type-name")),
287 parents({GSSNode1, GSSNode2}))))
288 << Heads;
289 // Verify that we create an ambiguous ForestNode of two parses of `type-name`.
290 EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
291 "[ 1, end) type-name := <ambiguous>\n"
292 "[ 1, end) ├─type-name := class-name\n"
293 "[ 1, end) │ └─class-name := <opaque>\n"
294 "[ 1, end) └─type-name := enum-name\n"
295 "[ 1, end) └─enum-name := <opaque>\n");
296}
297
298TEST_F(GLRTest, ReduceJoiningWithSameBase) {
299 // Before (joining due to same goto state, the same base):
300 // 0--1(class-name)--3(*)
301 // └--2(enum-name)--4(*)
302 // After reducing 3 by `pointer := class-name *` and
303 // 2 by `pointer := enum-name *`:
304 // 0--5(pointer) // 5 is goto(0, pointer)
305 buildGrammar(Nonterminals: {"pointer", "class-name", "enum-name"},
306 Rules: {"pointer := class-name *", "pointer := enum-name *"});
307
308 auto *ClassNameNode = &Arena.createOpaque(SID: id(Name: "class-name"), /*TokenIndex=*/Start: 0);
309 auto *EnumNameNode = &Arena.createOpaque(SID: id(Name: "enum-name"), /*TokenIndex=*/Start: 0);
310 auto *StartTerminal = &Arena.createTerminal(TK: tok::star, /*TokenIndex=*/Start: 1);
311
312 const auto *GSSNode0 =
313 GSStack.addNode(/*State=*/0, /*ForestNode=*/Symbol: nullptr, /*Parents=*/{});
314 const auto *GSSNode1 =
315 GSStack.addNode(/*State=*/1, /*ForestNode=*/Symbol: ClassNameNode,
316 /*Parents=*/{GSSNode0});
317 const auto *GSSNode2 =
318 GSStack.addNode(/*State=*/2, /*ForestNode=*/Symbol: EnumNameNode,
319 /*Parents=*/{GSSNode0});
320 const auto *GSSNode3 =
321 GSStack.addNode(/*State=*/3, /*ForestNode=*/Symbol: StartTerminal,
322 /*Parents=*/{GSSNode1});
323 const auto *GSSNode4 =
324 GSStack.addNode(/*State=*/4, /*ForestNode=*/Symbol: StartTerminal,
325 /*Parents=*/{GSSNode2});
326
327 // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
328 LRTable::Builder B(TestLang.G);
329 B.Transition[{StateID{0}, id(Name: "pointer")}] = StateID{5};
330 B.Reduce[StateID{3}].insert(/* pointer := class-name */ V: RuleID{0});
331 B.Reduce[StateID{4}].insert(/* pointer := enum-name */ V: RuleID{1});
332 TestLang.Table = std::move(B).build();
333
334 std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
335 glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::eof),
336 Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack}, Lang: TestLang);
337
338 EXPECT_THAT(
339 Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
340 AllOf(state(5), parsedSymbolID(id("pointer")),
341 parents({GSSNode0}))))
342 << Heads;
343 EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
344 "[ 0, end) pointer := <ambiguous>\n"
345 "[ 0, end) ├─pointer := class-name *\n"
346 "[ 0, 1) │ ├─class-name := <opaque>\n"
347 "[ 1, end) │ └─* := tok[1]\n"
348 "[ 0, end) └─pointer := enum-name *\n"
349 "[ 0, 1) ├─enum-name := <opaque>\n"
350 "[ 1, end) └─* := tok[1]\n");
351}
352
353TEST_F(GLRTest, ReduceLookahead) {
354 // A term can be followed by +, but not by -.
355 buildGrammar(Nonterminals: {"sum", "term"}, Rules: {"expr := term + term", "term := IDENTIFIER"});
356 LRTable::Builder B(TestLang.G);
357 B.Transition[{StateID{0}, id(Name: "term")}] = StateID{2};
358 B.Reduce[StateID{1}].insert(V: RuleID{0});
359 TestLang.Table = std::move(B).build();
360
361 auto *Identifier = &Arena.createTerminal(TK: tok::identifier, /*Start=*/0);
362
363 const auto *Root =
364 GSStack.addNode(/*State=*/0, /*ForestNode=*/Symbol: nullptr, /*Parents=*/{});
365 const auto *GSSNode1 =
366 GSStack.addNode(/*State=*/1, /*ForestNode=*/Symbol: Identifier, Parents: {Root});
367
368 // When the lookahead is +, reduce is performed.
369 std::vector<const GSS::Node *> Heads = {GSSNode1};
370 glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::plus), Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack},
371 Lang: TestLang);
372 EXPECT_THAT(Heads,
373 ElementsAre(GSSNode1, AllOf(state(2), parsedSymbolID(id("term")),
374 parents(Root))));
375
376 // When the lookahead is -, reduce is not performed.
377 Heads = {GSSNode1};
378 glrReduce(Heads, Lookahead: tokenSymbol(TK: tok::minus),
379 Params: {.Code: emptyTokenStream(), .Forest: Arena, .GSStack: GSStack}, Lang: TestLang);
380 EXPECT_THAT(Heads, ElementsAre(GSSNode1));
381}
382
383TEST_F(GLRTest, Recover) {
384 // Recovery while parsing "word" inside braces.
385 // Before:
386 // 0--1({)--2(?)
387 // After recovering a `word` at state 1:
388 // 0--3(word) // 3 is goto(1, word)
389 buildGrammar(Nonterminals: {"word", "top"}, Rules: {"top := { word [recover=Braces] }"});
390 LRTable::Builder B(TestLang.G);
391 B.Transition[{StateID{1}, id(Name: "word")}] = StateID{3};
392 B.Recoveries.push_back(x: {StateID{1}, {.Strategy: extensionID(AttrValueName: "Braces"), .Result: id(Name: "word")}});
393 TestLang.Table = std::move(B).build();
394 TestLang.RecoveryStrategies.try_emplace(Key: extensionID(AttrValueName: "Braces"), Args&: recoverBraces);
395
396 auto *LBrace = &Arena.createTerminal(TK: tok::l_brace, Start: 0);
397 auto *Question1 = &Arena.createTerminal(TK: tok::question, Start: 1);
398 const auto *Root = GSStack.addNode(State: 0, Symbol: nullptr, Parents: {});
399 const auto *OpenedBraces = GSStack.addNode(State: 1, Symbol: LBrace, Parents: {Root});
400 const auto *AfterQuestion1 = GSStack.addNode(State: 2, Symbol: Question1, Parents: {OpenedBraces});
401
402 // Need a token stream with paired braces so the strategy works.
403 clang::LangOptions LOptions;
404 TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
405 pairBrackets(Tokens);
406 std::vector<const GSS::Node *> NewHeads;
407
408 unsigned TokenIndex = 2;
409 glrRecover(OldHeads: {AfterQuestion1}, TokenIndex, Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, Lang: TestLang,
410 NewHeads);
411 EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
412 EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(3), parsedSymbolID(id("word")),
413 parents({OpenedBraces}), start(1u))));
414 EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
415
416 // Test recovery failure: omit closing brace so strategy fails
417 TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
418 pairBrackets(NoRBrace);
419 NewHeads.clear();
420 TokenIndex = 2;
421 glrRecover(OldHeads: {AfterQuestion1}, TokenIndex, Params: {.Code: NoRBrace, .Forest: Arena, .GSStack: GSStack}, Lang: TestLang,
422 NewHeads);
423 EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure";
424 EXPECT_THAT(NewHeads, IsEmpty());
425}
426
427TEST_F(GLRTest, RecoverRightmost) {
428 // In a nested block structure, we recover at the innermost possible block.
429 // Before:
430 // 0--1({)--1({)--1({)
431 // After recovering a `block` at inside the second braces:
432 // 0--1({)--2(body) // 2 is goto(1, body)
433 buildGrammar(Nonterminals: {"body", "top"}, Rules: {"top := { body [recover=Braces] }"});
434 LRTable::Builder B(TestLang.G);
435 B.Transition[{StateID{1}, id(Name: "body")}] = StateID{2};
436 B.Recoveries.push_back(x: {StateID{1}, {.Strategy: extensionID(AttrValueName: "Braces"), .Result: id(Name: "body")}});
437 TestLang.Table = std::move(B).build();
438 TestLang.RecoveryStrategies.try_emplace(Key: extensionID(AttrValueName: "Braces"), Args&: recoverBraces);
439
440 clang::LangOptions LOptions;
441 // Innermost brace is unmatched, to test fallback to next brace.
442 TokenStream Tokens = cook(lex("{ { { ? } }", LOptions), LOptions);
443 Tokens.tokens()[0].Pair = 5;
444 Tokens.tokens()[1].Pair = 4;
445 Tokens.tokens()[4].Pair = 1;
446 Tokens.tokens()[5].Pair = 0;
447
448 auto *Brace1 = &Arena.createTerminal(TK: tok::l_brace, Start: 0);
449 auto *Brace2 = &Arena.createTerminal(TK: tok::l_brace, Start: 1);
450 auto *Brace3 = &Arena.createTerminal(TK: tok::l_brace, Start: 2);
451 const auto *Root = GSStack.addNode(State: 0, Symbol: nullptr, Parents: {});
452 const auto *In1 = GSStack.addNode(State: 1, Symbol: Brace1, Parents: {Root});
453 const auto *In2 = GSStack.addNode(State: 1, Symbol: Brace2, Parents: {In1});
454 const auto *In3 = GSStack.addNode(State: 1, Symbol: Brace3, Parents: {In2});
455
456 unsigned TokenIndex = 3;
457 std::vector<const GSS::Node *> NewHeads;
458 glrRecover(OldHeads: {In3}, TokenIndex, Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, Lang: TestLang, NewHeads);
459 EXPECT_EQ(TokenIndex, 5u);
460 EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
461 parents({In2}), start(2u))));
462}
463
464TEST_F(GLRTest, RecoverAlternatives) {
465 // Recovery inside braces with multiple equally good options
466 // Before:
467 // 0--1({)
468 // After recovering either `word` or `number` inside the braces:
469 // 0--1({)--2(word) // 2 is goto(1, word)
470 // └--3(number) // 3 is goto(1, number)
471 buildGrammar(Nonterminals: {"number", "word", "top"},
472 Rules: {
473 "top := { number [recover=Braces] }",
474 "top := { word [recover=Braces] }",
475 });
476 LRTable::Builder B(TestLang.G);
477 B.Transition[{StateID{1}, id(Name: "number")}] = StateID{2};
478 B.Transition[{StateID{1}, id(Name: "word")}] = StateID{3};
479 B.Recoveries.push_back(x: {StateID{1}, {.Strategy: extensionID(AttrValueName: "Braces"), .Result: id(Name: "number")}});
480 B.Recoveries.push_back(x: {StateID{1}, {.Strategy: extensionID(AttrValueName: "Braces"), .Result: id(Name: "word")}});
481 TestLang.RecoveryStrategies.try_emplace(Key: extensionID(AttrValueName: "Braces"), Args&: recoverBraces);
482 TestLang.Table = std::move(B).build();
483 auto *LBrace = &Arena.createTerminal(TK: tok::l_brace, Start: 0);
484 const auto *Root = GSStack.addNode(State: 0, Symbol: nullptr, Parents: {});
485 const auto *OpenedBraces = GSStack.addNode(State: 1, Symbol: LBrace, Parents: {Root});
486
487 clang::LangOptions LOptions;
488 TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
489 pairBrackets(Tokens);
490 std::vector<const GSS::Node *> NewHeads;
491 unsigned TokenIndex = 1;
492
493 glrRecover(OldHeads: {OpenedBraces}, TokenIndex, Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, Lang: TestLang,
494 NewHeads);
495 EXPECT_EQ(TokenIndex, 2u);
496 EXPECT_THAT(NewHeads,
497 UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
498 parents({OpenedBraces}), start(1u)),
499 AllOf(state(3), parsedSymbolID(id("word")),
500 parents({OpenedBraces}), start(1u))));
501}
502
503TEST_F(GLRTest, PerfectForestNodeSharing) {
504 // Run the GLR on a simple grammar and test that we build exactly one forest
505 // node per (SymbolID, token range).
506
507 // This is a grmammar where the original parsing-stack-based forest node
508 // sharing approach will fail. In its LR0 graph, it has two states containing
509 // item `expr := • IDENTIFIER`, and both have different goto states on the
510 // nonterminal `expr`.
511 build(GrammarBNF: R"bnf(
512 _ := test EOF
513
514 test := { expr
515 test := { IDENTIFIER
516 test := left-paren expr
517 left-paren := {
518 expr := IDENTIFIER
519 )bnf");
520 TestLang.Table = LRTable::buildSLR(G: TestLang.G);
521 clang::LangOptions LOptions;
522 const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);
523
524 const ForestNode &Parsed =
525 glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "test"), Lang: TestLang);
526 // Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
527 // in the forest, see the `#1` and `=#1` in the dump string.
528 EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
529 "[ 0, end) test := <ambiguous>\n"
530 "[ 0, end) ├─test := { expr\n"
531 "[ 0, 1) │ ├─{ := tok[0]\n"
532 "[ 1, end) │ └─expr := IDENTIFIER #1\n"
533 "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
534 "[ 0, end) ├─test := { IDENTIFIER\n"
535 "[ 0, 1) │ ├─{ := tok[0]\n"
536 "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
537 "[ 0, end) └─test := left-paren expr\n"
538 "[ 0, 1) ├─left-paren := {\n"
539 "[ 0, 1) │ └─{ := tok[0]\n"
540 "[ 1, end) └─expr =#1\n");
541}
542
543TEST_F(GLRTest, GLRReduceOrder) {
544 // Given the following grammar, and the input `IDENTIFIER`, reductions should
545 // be performed in the following order:
546 // 1. foo := IDENTIFIER
547 // 2. { test := IDENTIFIER, test := foo }
548 // foo should be reduced first, so that in step 2 we have completed reduces
549 // for test, and form an ambiguous forest node.
550 build(GrammarBNF: R"bnf(
551 _ := test EOF
552
553 test := IDENTIFIER
554 test := foo
555 foo := IDENTIFIER
556 )bnf");
557 clang::LangOptions LOptions;
558 const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
559 TestLang.Table = LRTable::buildSLR(G: TestLang.G);
560
561 const ForestNode &Parsed =
562 glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "test"), Lang: TestLang);
563 EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
564 "[ 0, end) test := <ambiguous>\n"
565 "[ 0, end) ├─test := IDENTIFIER\n"
566 "[ 0, end) │ └─IDENTIFIER := tok[0]\n"
567 "[ 0, end) └─test := foo\n"
568 "[ 0, end) └─foo := IDENTIFIER\n"
569 "[ 0, end) └─IDENTIFIER := tok[0]\n");
570}
571
572TEST_F(GLRTest, RecoveryEndToEnd) {
573 // Simple example of brace-based recovery showing:
574 // - recovered region includes tokens both ahead of and behind the cursor
575 // - multiple possible recovery rules
576 // - recovery from outer scopes is rejected
577 build(GrammarBNF: R"bnf(
578 _ := block EOF
579
580 block := { block [recover=Braces] }
581 block := { numbers [recover=Braces] }
582 numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
583 )bnf");
584 TestLang.Table = LRTable::buildSLR(G: TestLang.G);
585 TestLang.RecoveryStrategies.try_emplace(Key: extensionID(AttrValueName: "Braces"), Args&: recoverBraces);
586 clang::LangOptions LOptions;
587 TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
588 pairBrackets(Tokens);
589
590 const ForestNode &Parsed =
591 glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "block"), Lang: TestLang);
592 EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
593 "[ 0, end) block := { block [recover=Braces] }\n"
594 "[ 0, 1) ├─{ := tok[0]\n"
595 "[ 1, 5) ├─block := <ambiguous>\n"
596 "[ 1, 5) │ ├─block := { block [recover=Braces] }\n"
597 "[ 1, 2) │ │ ├─{ := tok[1]\n"
598 "[ 2, 4) │ │ ├─block := <opaque>\n"
599 "[ 4, 5) │ │ └─} := tok[4]\n"
600 "[ 1, 5) │ └─block := { numbers [recover=Braces] }\n"
601 "[ 1, 2) │ ├─{ := tok[1]\n"
602 "[ 2, 4) │ ├─numbers := <opaque>\n"
603 "[ 4, 5) │ └─} := tok[4]\n"
604 "[ 5, end) └─} := tok[5]\n");
605}
606
607TEST_F(GLRTest, RecoverTerminal) {
608 build(GrammarBNF: R"bnf(
609 _ := stmt EOF
610
611 stmt := IDENTIFIER ; [recover=Skip]
612 )bnf");
613 TestLang.Table = LRTable::buildSLR(G: TestLang.G);
614 TestLang.RecoveryStrategies.try_emplace(
615 Key: extensionID(AttrValueName: "Skip"),
616 Args: [](Token::Index Start, const TokenStream &) { return Start; });
617 clang::LangOptions LOptions;
618 TokenStream Tokens = cook(lex("foo", LOptions), LOptions);
619
620 const ForestNode &Parsed =
621 glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "stmt"), Lang: TestLang);
622 EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
623 "[ 0, end) stmt := IDENTIFIER ; [recover=Skip]\n"
624 "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
625 "[ 1, end) └─; := <opaque>\n");
626}
627
628TEST_F(GLRTest, RecoverUnrestrictedReduce) {
629 // Here, ! is not in any rule and therefore not in the follow set of `word`.
630 // We would not normally reduce `word := IDENTIFIER`, but do so for recovery.
631
632 build(GrammarBNF: R"bnf(
633 _ := sentence EOF
634
635 word := IDENTIFIER
636 sentence := word word [recover=AcceptAnyTokenInstead]
637 )bnf");
638
639 clang::LangOptions LOptions;
640 const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions);
641 TestLang.Table = LRTable::buildSLR(G: TestLang.G);
642 TestLang.RecoveryStrategies.try_emplace(
643 Key: extensionID(AttrValueName: "AcceptAnyTokenInstead"),
644 Args: [](Token::Index Start, const TokenStream &Stream) { return Start + 1; });
645
646 const ForestNode &Parsed =
647 glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "sentence"), Lang: TestLang);
648 EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
649 "[ 0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n"
650 "[ 0, 1) ├─word := IDENTIFIER\n"
651 "[ 0, 1) │ └─IDENTIFIER := tok[0]\n"
652 "[ 1, end) └─word := <opaque>\n");
653}
654
655TEST_F(GLRTest, RecoveryFromStartOfInput) {
656 build(GrammarBNF: R"bnf(
657 _ := start [recover=Fallback] EOF
658
659 start := IDENTIFIER
660 )bnf");
661 TestLang.Table = LRTable::buildSLR(G: TestLang.G);
662 bool fallback_recovered = false;
663 auto fallback = [&](Token::Index Start, const TokenStream & Code) {
664 fallback_recovered = true;
665 return Code.tokens().size();
666 };
667 TestLang.RecoveryStrategies.try_emplace(
668 Key: extensionID(AttrValueName: "Fallback"),
669 Args&: fallback);
670 clang::LangOptions LOptions;
671 TokenStream Tokens = cook(lex("?", LOptions), LOptions);
672
673 const ForestNode &Parsed =
674 glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "start"), Lang: TestLang);
675 EXPECT_TRUE(fallback_recovered);
676 EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
677 "[ 0, end) start := <opaque>\n");
678}
679
680TEST_F(GLRTest, RepeatedRecovery) {
681 // We require multiple steps of recovery at eof and then a reduction in order
682 // to successfully parse.
683 build(GrammarBNF: R"bnf(
684 _ := function EOF
685 # FIXME: this forces EOF to be in follow(signature).
686 # Remove it once we use unconstrained reduction for recovery.
687 _ := signature EOF
688
689 function := signature body [recover=Skip]
690 signature := IDENTIFIER params [recover=Skip]
691 params := ( )
692 body := { }
693 )bnf");
694 TestLang.Table = LRTable::buildSLR(G: TestLang.G);
695 TestLang.RecoveryStrategies.try_emplace(
696 Key: extensionID(AttrValueName: "Skip"),
697 Args: [](Token::Index Start, const TokenStream &) { return Start; });
698 clang::LangOptions LOptions;
699 TokenStream Tokens = cook(lex("main", LOptions), LOptions);
700
701 const ForestNode &Parsed =
702 glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "function"), Lang: TestLang);
703 EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
704 "[ 0, end) function := signature body [recover=Skip]\n"
705 "[ 0, 1) ├─signature := IDENTIFIER params [recover=Skip]\n"
706 "[ 0, 1) │ ├─IDENTIFIER := tok[0]\n"
707 "[ 1, 1) │ └─params := <opaque>\n"
708 "[ 1, end) └─body := <opaque>\n");
709}
710
711TEST_F(GLRTest, NoExplicitAccept) {
712 build(GrammarBNF: R"bnf(
713 _ := test EOF
714
715 test := IDENTIFIER test
716 test := IDENTIFIER
717 )bnf");
718 clang::LangOptions LOptions;
719 // Given the following input, and the grammar above, we perform two reductions
720 // of the nonterminal `test` when the next token is `eof`, verify that the
721 // parser stops at the right state.
722 const TokenStream &Tokens = cook(lex("id id", LOptions), LOptions);
723 TestLang.Table = LRTable::buildSLR(G: TestLang.G);
724
725 const ForestNode &Parsed =
726 glrParse(Params: {.Code: Tokens, .Forest: Arena, .GSStack: GSStack}, StartSymbol: id(Name: "test"), Lang: TestLang);
727 EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
728 "[ 0, end) test := IDENTIFIER test\n"
729 "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
730 "[ 1, end) └─test := IDENTIFIER\n"
731 "[ 1, end) └─IDENTIFIER := tok[1]\n");
732}
733
734TEST_F(GLRTest, GuardExtension) {
735 build(GrammarBNF: R"bnf(
736 _ := start EOF
737
738 start := IDENTIFIER [guard]
739 )bnf");
740 TestLang.Guards.try_emplace(
741 Key: ruleFor(NonterminalName: "start"), Args: [&](const GuardParams &P) {
742 assert(P.RHS.size() == 1 &&
743 P.RHS.front()->symbol() ==
744 tokenSymbol(clang::tok::identifier));
745 return P.Tokens.tokens()[P.RHS.front()->startTokenIndex()]
746 .text() == "test";
747 });
748 clang::LangOptions LOptions;
749 TestLang.Table = LRTable::buildSLR(G: TestLang.G);
750
751 std::string Input = "test";
752 const TokenStream &Succeeded = cook(lex(Input, LOptions), LOptions);
753 EXPECT_EQ(glrParse({Succeeded, Arena, GSStack}, id("start"), TestLang)
754 .dumpRecursive(TestLang.G),
755 "[ 0, end) start := IDENTIFIER [guard]\n"
756 "[ 0, end) └─IDENTIFIER := tok[0]\n");
757
758 Input = "notest";
759 const TokenStream &Failed = cook(lex(Input, LOptions), LOptions);
760 EXPECT_EQ(glrParse({Failed, Arena, GSStack}, id("start"), TestLang)
761 .dumpRecursive(TestLang.G),
762 "[ 0, end) start := <opaque>\n");
763}
764
765TEST(GSSTest, GC) {
766 // ┌-A-┬-AB
767 // ├-B-┘
768 // Root-+-C
769 // ├-D
770 // └-E
771 GSS GSStack;
772 auto *Root = GSStack.addNode(State: 0, Symbol: nullptr, Parents: {});
773 auto *A = GSStack.addNode(State: 0, Symbol: nullptr, Parents: {Root});
774 auto *B = GSStack.addNode(State: 0, Symbol: nullptr, Parents: {Root});
775 auto *C = GSStack.addNode(State: 0, Symbol: nullptr, Parents: {Root});
776 auto *D = GSStack.addNode(State: 0, Symbol: nullptr, Parents: {Root});
777 auto *AB = GSStack.addNode(State: 0, Symbol: nullptr, Parents: {A, B});
778
779 EXPECT_EQ(1u, GSStack.gc({AB, C})) << "D is destroyed";
780 EXPECT_EQ(0u, GSStack.gc({AB, C})) << "D is already gone";
781 auto *E = GSStack.addNode(State: 0, Symbol: nullptr, Parents: {Root});
782 EXPECT_EQ(D, E) << "Storage of GCed node D is reused for E";
783 EXPECT_EQ(3u, GSStack.gc({A, E})) << "Destroys B, AB, C";
784 EXPECT_EQ(1u, GSStack.gc({E})) << "Destroys A";
785}
786
787} // namespace
788} // namespace pseudo
789} // namespace clang
790

source code of clang-tools-extra/pseudo/unittests/GLRTest.cpp