//===--- GLRTest.cpp - Test the GLR parser ----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/GLR.h"
#include "clang-pseudo/Bracket.h"
#include "clang-pseudo/Language.h"
#include "clang-pseudo/Token.h"
#include "clang-pseudo/grammar/Grammar.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/FormatVariadic.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include <memory>

namespace clang {
namespace pseudo {

llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                              const std::vector<const GSS::Node *> &Heads) {
  for (const auto *Head : Heads)
    OS << *Head << "\n";
  return OS;
}

namespace {

using StateID = LRTable::StateID;
using testing::AllOf;
using testing::ElementsAre;
using testing::IsEmpty;
using testing::UnorderedElementsAre;

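// Matchers for inspecting GSS nodes and their forest-node payloads.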
MATCHER_P(state, StateID, "") { return arg->State == StateID; }
MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }

testing::Matcher<const GSS::Node *>
parents(llvm::ArrayRef<const GSS::Node *> Parents) {
  return testing::Property(&GSS::Node::parents,
                           testing::UnorderedElementsAreArray(Parents));
}

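// Recovery strategy for braces: given the token index just after an l_brace,
// return the index of the matching r_brace, or Token::Invalid if it is
// unpaired.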
Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) {
  EXPECT_GT(Begin, 0u);
  const Token &Left = Code.tokens()[Begin - 1];
  EXPECT_EQ(Left.Kind, tok::l_brace);
  if (const auto *Right = Left.pair()) {
    EXPECT_EQ(Right->Kind, tok::r_brace);
    return Code.index(*Right);
  }
  return Token::Invalid;
}

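// Fixture providing helpers to build a Grammar/LRTable from BNF and to look up
// symbol, extension, and rule IDs by name.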
class GLRTest : public ::testing::Test {
public:
  void build(llvm::StringRef GrammarBNF) {
    std::vector<std::string> Diags;
    TestLang.G = Grammar::parseBNF(GrammarBNF, Diags);
  }

  TokenStream emptyTokenStream() {
    TokenStream Empty;
    Empty.finalize();
    return Empty;
  }

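  // Builds a grammar from the given rules, synthesizing a start rule
  // `_ := <nonterminal>` for each listed nonterminal.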
  void buildGrammar(std::vector<std::string> Nonterminals,
                    std::vector<std::string> Rules) {
    Nonterminals.push_back("_");
    llvm::sort(Nonterminals);
    Nonterminals.erase(std::unique(Nonterminals.begin(), Nonterminals.end()),
                       Nonterminals.end());
    std::string FakeTestBNF;
    for (const auto &NT : Nonterminals)
      FakeTestBNF += llvm::formatv("{0} := {1}\n", "_", NT);
    FakeTestBNF += llvm::join(Rules, "\n");
    build(FakeTestBNF);
  }

  SymbolID id(llvm::StringRef Name) const {
    for (unsigned I = 0; I < NumTerminals; ++I)
      if (TestLang.G.table().Terminals[I] == Name)
        return tokenSymbol(static_cast<tok::TokenKind>(I));
    for (SymbolID ID = 0; ID < TestLang.G.table().Nonterminals.size(); ++ID)
      if (TestLang.G.table().Nonterminals[ID].Name == Name)
        return ID;
    ADD_FAILURE() << "No such symbol found: " << Name;
    return 0;
  }
  ExtensionID extensionID(llvm::StringRef AttrValueName) const {
    for (ExtensionID EID = 0; EID < TestLang.G.table().AttributeValues.size();
         ++EID)
      if (TestLang.G.table().AttributeValues[EID] == AttrValueName)
        return EID;
    ADD_FAILURE() << "No such attribute value found: " << AttrValueName;
    return 0;
  }

  RuleID ruleFor(llvm::StringRef NonterminalName) const {
    auto RuleRange =
        TestLang.G.table().Nonterminals[id(NonterminalName)].RuleRange;
    if (RuleRange.End - RuleRange.Start == 1)
      return TestLang.G.table()
          .Nonterminals[id(NonterminalName)]
          .RuleRange.Start;
    ADD_FAILURE() << "Expected a single rule for " << NonterminalName
                  << ", but it has " << RuleRange.End - RuleRange.Start
                  << " rules!\n";
    return 0;
  }

protected:
  Language TestLang;
  ForestArena Arena;
  GSS GSStack;
};

TEST_F(GLRTest, ShiftMergingHeads) {
  // Given a test case where we have three heads 1, 2, 3 in the GSS: heads 1
  // and 2 have shift actions to reach state 4, and head 3 has a shift action
  // to reach state 5:
  //   0--1
  //   └--2
  //   └--3
  // After the shift action, the GSS (with new heads 4, 5) is:
  //   0---1---4
  //   └---2---┘
  //   └---3---5
  auto *GSSNode0 =
      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
  auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/nullptr,
                                   /*Parents=*/{GSSNode0});
  auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/nullptr,
                                   /*Parents=*/{GSSNode0});
  auto *GSSNode3 = GSStack.addNode(/*State=*/3, /*ForestNode=*/nullptr,
                                   /*Parents=*/{GSSNode0});

  buildGrammar({}, {}); // Create a fake empty grammar.
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{1}, tokenSymbol(tok::semi)}] = StateID{4};
  B.Transition[{StateID{2}, tokenSymbol(tok::semi)}] = StateID{4};
  B.Transition[{StateID{3}, tokenSymbol(tok::semi)}] = StateID{5};
  TestLang.Table = std::move(B).build();

  ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0);
  std::vector<const GSS::Node *> NewHeads;
  glrShift({GSSNode1, GSSNode2, GSSNode3}, SemiTerminal,
           {emptyTokenStream(), Arena, GSStack}, TestLang, NewHeads);

  EXPECT_THAT(NewHeads,
              UnorderedElementsAre(AllOf(state(4), parsedSymbol(&SemiTerminal),
                                         parents({GSSNode1, GSSNode2})),
                                   AllOf(state(5), parsedSymbol(&SemiTerminal),
                                         parents({GSSNode3}))))
      << NewHeads;
}

TEST_F(GLRTest, ReduceConflictsSplitting) {
  // Before (splitting due to R/R conflict):
  //   0--1(IDENTIFIER)
  // After reducing 1 by `class-name := IDENTIFIER` and
  //                     `enum-name := IDENTIFIER`:
  //   0--2(class-name)  // 2 is goto(0, class-name)
  //   └--3(enum-name)   // 3 is goto(0, enum-name)
  buildGrammar({"class-name", "enum-name"},
               {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{0}, id("class-name")}] = StateID{2};
  B.Transition[{StateID{0}, id("enum-name")}] = StateID{3};
  B.Reduce[StateID{1}].insert(ruleFor("class-name"));
  B.Reduce[StateID{1}].insert(ruleFor("enum-name"));
  TestLang.Table = std::move(B).build();

  const auto *GSSNode0 =
      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
  const auto *GSSNode1 =
      GSStack.addNode(1, &Arena.createTerminal(tok::identifier, 0), {GSSNode0});

  std::vector<const GSS::Node *> Heads = {GSSNode1};
  glrReduce(Heads, tokenSymbol(tok::eof),
            {emptyTokenStream(), Arena, GSStack}, TestLang);
  EXPECT_THAT(Heads, UnorderedElementsAre(
                         GSSNode1,
                         AllOf(state(2), parsedSymbolID(id("class-name")),
                               parents({GSSNode0})),
                         AllOf(state(3), parsedSymbolID(id("enum-name")),
                               parents({GSSNode0}))))
      << Heads;
}

TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) {
  // Before (splitting due to multiple bases):
  //   2(class-name)--4(*)
  //   3(enum-name)---┘
  // After reducing 4 by `ptr-operator := *`:
  //   2(class-name)--5(ptr-operator)  // 5 is goto(2, ptr-operator)
  //   3(enum-name)---6(ptr-operator)  // 6 is goto(3, ptr-operator)
  buildGrammar({"ptr-operator", "class-name", "enum-name"},
               {"ptr-operator := *"});

  auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
  auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);

  const auto *GSSNode2 =
      GSStack.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{});
  const auto *GSSNode3 =
      GSStack.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode, /*Parents=*/{});
  const auto *GSSNode4 = GSStack.addNode(
      /*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1),
      /*Parents=*/{GSSNode2, GSSNode3});

  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{2}, id("ptr-operator")}] = StateID{5};
  B.Transition[{StateID{3}, id("ptr-operator")}] = StateID{6};
  B.Reduce[StateID{4}].insert(ruleFor("ptr-operator"));
  TestLang.Table = std::move(B).build();

  std::vector<const GSS::Node *> Heads = {GSSNode4};
  glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack},
            TestLang);

  EXPECT_THAT(Heads, UnorderedElementsAre(
                         GSSNode4,
                         AllOf(state(5), parsedSymbolID(id("ptr-operator")),
                               parents({GSSNode2})),
                         AllOf(state(6), parsedSymbolID(id("ptr-operator")),
                               parents({GSSNode3}))))
      << Heads;
  // Verify that the payload of the two new heads is shared: only a single
  // ptr-operator node is created in the forest.
  EXPECT_EQ(Heads[1]->Payload, Heads[2]->Payload);
}

TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
  // Before (joining due to same goto state, multiple bases):
  //   0--1(cv-qualifier)--3(class-name)
  //   └--2(cv-qualifier)--4(enum-name)
  // After reducing 3 by `type-name := class-name` and
  //                 4 by `type-name := enum-name`:
  //   0--1(cv-qualifier)--5(type-name)  // 5 is goto(1, type-name) and
  //   └--2(cv-qualifier)--┘             //      goto(2, type-name)
  buildGrammar({"type-name", "class-name", "enum-name", "cv-qualifier"},
               {"type-name := class-name", "type-name := enum-name"});

  auto *CVQualifierNode =
      &Arena.createOpaque(id("cv-qualifier"), /*TokenIndex=*/0);
  auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/1);
  auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/1);

  const auto *GSSNode0 =
      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
  const auto *GSSNode1 = GSStack.addNode(
      /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
  const auto *GSSNode2 = GSStack.addNode(
      /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
  const auto *GSSNode3 = GSStack.addNode(
      /*State=*/3, /*ForestNode=*/ClassNameNode,
      /*Parents=*/{GSSNode1});
  const auto *GSSNode4 =
      GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
                      /*Parents=*/{GSSNode2});

  // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{1}, id("type-name")}] = StateID{5};
  B.Transition[{StateID{2}, id("type-name")}] = StateID{5};
  B.Reduce[StateID{3}].insert(/* type-name := class-name */ RuleID{0});
  B.Reduce[StateID{4}].insert(/* type-name := enum-name */ RuleID{1});
  TestLang.Table = std::move(B).build();

  std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
  glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack},
            TestLang);

  // Verify that the stack heads are joined at state 5 after the reduces.
  EXPECT_THAT(Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
                                          AllOf(state(5),
                                                parsedSymbolID(id("type-name")),
                                                parents({GSSNode1, GSSNode2}))))
      << Heads;
  // Verify that we create an ambiguous ForestNode covering both parses of
  // `type-name`.
  EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
            "[ 1, end) type-name := <ambiguous>\n"
            "[ 1, end) ├─type-name := class-name\n"
            "[ 1, end) │ └─class-name := <opaque>\n"
            "[ 1, end) └─type-name := enum-name\n"
            "[ 1, end) └─enum-name := <opaque>\n");
}

TEST_F(GLRTest, ReduceJoiningWithSameBase) {
  // Before (joining due to same goto state, the same base):
  //   0--1(class-name)--3(*)
  //   └--2(enum-name)---4(*)
  // After reducing 3 by `pointer := class-name *` and
  //                 4 by `pointer := enum-name *`:
  //   0--5(pointer)  // 5 is goto(0, pointer)
  buildGrammar({"pointer", "class-name", "enum-name"},
               {"pointer := class-name *", "pointer := enum-name *"});

  auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
  auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
  auto *StarTerminal = &Arena.createTerminal(tok::star, /*TokenIndex=*/1);

  const auto *GSSNode0 =
      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
  const auto *GSSNode1 =
      GSStack.addNode(/*State=*/1, /*ForestNode=*/ClassNameNode,
                      /*Parents=*/{GSSNode0});
  const auto *GSSNode2 =
      GSStack.addNode(/*State=*/2, /*ForestNode=*/EnumNameNode,
                      /*Parents=*/{GSSNode0});
  const auto *GSSNode3 =
      GSStack.addNode(/*State=*/3, /*ForestNode=*/StarTerminal,
                      /*Parents=*/{GSSNode1});
  const auto *GSSNode4 =
      GSStack.addNode(/*State=*/4, /*ForestNode=*/StarTerminal,
                      /*Parents=*/{GSSNode2});

  // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{0}, id("pointer")}] = StateID{5};
  B.Reduce[StateID{3}].insert(/* pointer := class-name */ RuleID{0});
  B.Reduce[StateID{4}].insert(/* pointer := enum-name */ RuleID{1});
  TestLang.Table = std::move(B).build();

  std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
  glrReduce(Heads, tokenSymbol(tok::eof),
            {emptyTokenStream(), Arena, GSStack}, TestLang);

  EXPECT_THAT(
      Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
                                  AllOf(state(5), parsedSymbolID(id("pointer")),
                                        parents({GSSNode0}))))
      << Heads;
  EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
            "[ 0, end) pointer := <ambiguous>\n"
            "[ 0, end) ├─pointer := class-name *\n"
            "[ 0, 1) │ ├─class-name := <opaque>\n"
            "[ 1, end) │ └─* := tok[1]\n"
            "[ 0, end) └─pointer := enum-name *\n"
            "[ 0, 1) ├─enum-name := <opaque>\n"
            "[ 1, end) └─* := tok[1]\n");
}

TEST_F(GLRTest, ReduceLookahead) {
  // A term can be followed by +, but not by -.
  buildGrammar({"sum", "term"}, {"sum := term + term", "term := IDENTIFIER"});
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{0}, id("term")}] = StateID{2};
  B.Reduce[StateID{1}].insert(RuleID{0});
  TestLang.Table = std::move(B).build();

  auto *Identifier = &Arena.createTerminal(tok::identifier, /*Start=*/0);

  const auto *Root =
      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
  const auto *GSSNode1 =
      GSStack.addNode(/*State=*/1, /*ForestNode=*/Identifier, {Root});

  // When the lookahead is +, reduce is performed.
  std::vector<const GSS::Node *> Heads = {GSSNode1};
  glrReduce(Heads, tokenSymbol(tok::plus), {emptyTokenStream(), Arena, GSStack},
            TestLang);
  EXPECT_THAT(Heads,
              ElementsAre(GSSNode1, AllOf(state(2), parsedSymbolID(id("term")),
                                          parents(Root))));

  // When the lookahead is -, reduce is not performed.
  Heads = {GSSNode1};
  glrReduce(Heads, tokenSymbol(tok::minus),
            {emptyTokenStream(), Arena, GSStack}, TestLang);
  EXPECT_THAT(Heads, ElementsAre(GSSNode1));
}

TEST_F(GLRTest, Recover) {
  // Recovery while parsing "word" inside braces.
  // Before:
  //   0--1({)--2(?)
  // After recovering a `word` at state 1:
  //   0--3(word)  // 3 is goto(1, word)
  buildGrammar({"word", "top"}, {"top := { word [recover=Braces] }"});
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{1}, id("word")}] = StateID{3};
  B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}});
  TestLang.Table = std::move(B).build();
  TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);

  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
  auto *Question1 = &Arena.createTerminal(tok::question, 1);
  const auto *Root = GSStack.addNode(0, nullptr, {});
  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});

  // Need a token stream with paired braces so the strategy works.
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
  pairBrackets(Tokens);
  std::vector<const GSS::Node *> NewHeads;

  unsigned TokenIndex = 2;
  glrRecover({AfterQuestion1}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
             NewHeads);
  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(3), parsedSymbolID(id("word")),
                                          parents({OpenedBraces}), start(1u))));
  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);

  // Test recovery failure: omit closing brace so strategy fails.
  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
  pairBrackets(NoRBrace);
  NewHeads.clear();
  TokenIndex = 2;
  glrRecover({AfterQuestion1}, TokenIndex, {NoRBrace, Arena, GSStack}, TestLang,
             NewHeads);
  EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure";
  EXPECT_THAT(NewHeads, IsEmpty());
}

TEST_F(GLRTest, RecoverRightmost) {
  // In a nested block structure, we recover at the innermost possible block.
  // Before:
  //   0--1({)--1({)--1({)
  // After recovering a `body` inside the second braces:
  //   0--1({)--1({)--2(body)  // 2 is goto(1, body)
  buildGrammar({"body", "top"}, {"top := { body [recover=Braces] }"});
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{1}, id("body")}] = StateID{2};
  B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("body")}});
  TestLang.Table = std::move(B).build();
  TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);

  clang::LangOptions LOptions;
  // The innermost brace is unmatched, to test fallback to the next brace.
  TokenStream Tokens = cook(lex("{ { { ? } }", LOptions), LOptions);
  Tokens.tokens()[0].Pair = 5;
  Tokens.tokens()[1].Pair = 4;
  Tokens.tokens()[4].Pair = 1;
  Tokens.tokens()[5].Pair = 0;

  auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0);
  auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1);
  auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2);
  const auto *Root = GSStack.addNode(0, nullptr, {});
  const auto *In1 = GSStack.addNode(1, Brace1, {Root});
  const auto *In2 = GSStack.addNode(1, Brace2, {In1});
  const auto *In3 = GSStack.addNode(1, Brace3, {In2});

  unsigned TokenIndex = 3;
  std::vector<const GSS::Node *> NewHeads;
  glrRecover({In3}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, NewHeads);
  EXPECT_EQ(TokenIndex, 5u);
  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
                                          parents({In2}), start(2u))));
}

TEST_F(GLRTest, RecoverAlternatives) {
  // Recovery inside braces with multiple equally good options.
  // Before:
  //   0--1({)
  // After recovering either `word` or `number` inside the braces:
  //   0--1({)--2(number)  // 2 is goto(1, number)
  //        └--3(word)     // 3 is goto(1, word)
  buildGrammar({"number", "word", "top"},
               {
                   "top := { number [recover=Braces] }",
                   "top := { word [recover=Braces] }",
               });
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{1}, id("number")}] = StateID{2};
  B.Transition[{StateID{1}, id("word")}] = StateID{3};
  B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("number")}});
  B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}});
  TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
  TestLang.Table = std::move(B).build();
  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
  const auto *Root = GSStack.addNode(0, nullptr, {});
  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});

  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
  pairBrackets(Tokens);
  std::vector<const GSS::Node *> NewHeads;
  unsigned TokenIndex = 1;

  glrRecover({OpenedBraces}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
             NewHeads);
  EXPECT_EQ(TokenIndex, 2u);
  EXPECT_THAT(NewHeads,
              UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
                                         parents({OpenedBraces}), start(1u)),
                                   AllOf(state(3), parsedSymbolID(id("word")),
                                         parents({OpenedBraces}), start(1u))));
}

TEST_F(GLRTest, PerfectForestNodeSharing) {
  // Run the GLR on a simple grammar and test that we build exactly one forest
  // node per (SymbolID, token range).

  // This is a grammar where the original parsing-stack-based forest node
  // sharing approach will fail. In its LR0 graph, it has two states containing
  // the item `expr := • IDENTIFIER`, and both have different goto states on
  // the nonterminal `expr`.
  build(R"bnf(
    _ := test EOF

    test := { expr
    test := { IDENTIFIER
    test := left-paren expr
    left-paren := {
    expr := IDENTIFIER
  )bnf");
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  clang::LangOptions LOptions;
  const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
  // Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
  // in the forest; see the `#1` and `=#1` in the dump string.
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) test := <ambiguous>\n"
            "[ 0, end) ├─test := { expr\n"
            "[ 0, 1) │ ├─{ := tok[0]\n"
            "[ 1, end) │ └─expr := IDENTIFIER #1\n"
            "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
            "[ 0, end) ├─test := { IDENTIFIER\n"
            "[ 0, 1) │ ├─{ := tok[0]\n"
            "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
            "[ 0, end) └─test := left-paren expr\n"
            "[ 0, 1) ├─left-paren := {\n"
            "[ 0, 1) │ └─{ := tok[0]\n"
            "[ 1, end) └─expr =#1\n");
}

TEST_F(GLRTest, GLRReduceOrder) {
  // Given the following grammar, and the input `IDENTIFIER`, reductions should
  // be performed in the following order:
  //  1. foo := IDENTIFIER
  //  2. { test := IDENTIFIER, test := foo }
  // foo should be reduced first, so that in step 2 we have completed reduces
  // for test, and form an ambiguous forest node.
  build(R"bnf(
    _ := test EOF

    test := IDENTIFIER
    test := foo
    foo := IDENTIFIER
  )bnf");
  clang::LangOptions LOptions;
  const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
  TestLang.Table = LRTable::buildSLR(TestLang.G);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) test := <ambiguous>\n"
            "[ 0, end) ├─test := IDENTIFIER\n"
            "[ 0, end) │ └─IDENTIFIER := tok[0]\n"
            "[ 0, end) └─test := foo\n"
            "[ 0, end) └─foo := IDENTIFIER\n"
            "[ 0, end) └─IDENTIFIER := tok[0]\n");
}

TEST_F(GLRTest, RecoveryEndToEnd) {
  // Simple example of brace-based recovery showing:
  //  - recovered region includes tokens both ahead of and behind the cursor
  //  - multiple possible recovery rules
  //  - recovery from outer scopes is rejected
  build(R"bnf(
    _ := block EOF

    block := { block [recover=Braces] }
    block := { numbers [recover=Braces] }
    numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
  )bnf");
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
  pairBrackets(Tokens);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("block"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) block := { block [recover=Braces] }\n"
            "[ 0, 1) ├─{ := tok[0]\n"
            "[ 1, 5) ├─block := <ambiguous>\n"
            "[ 1, 5) │ ├─block := { block [recover=Braces] }\n"
            "[ 1, 2) │ │ ├─{ := tok[1]\n"
            "[ 2, 4) │ │ ├─block := <opaque>\n"
            "[ 4, 5) │ │ └─} := tok[4]\n"
            "[ 1, 5) │ └─block := { numbers [recover=Braces] }\n"
            "[ 1, 2) │ ├─{ := tok[1]\n"
            "[ 2, 4) │ ├─numbers := <opaque>\n"
            "[ 4, 5) │ └─} := tok[4]\n"
            "[ 5, end) └─} := tok[5]\n");
}

TEST_F(GLRTest, RecoverTerminal) {
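  // The input is missing the trailing `;`. The Skip strategy consumes no
  // tokens, so the `;` is recovered as an opaque forest node.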
  build(R"bnf(
    _ := stmt EOF

    stmt := IDENTIFIER ; [recover=Skip]
  )bnf");
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  TestLang.RecoveryStrategies.try_emplace(
      extensionID("Skip"),
      [](Token::Index Start, const TokenStream &) { return Start; });
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("foo", LOptions), LOptions);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("stmt"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) stmt := IDENTIFIER ; [recover=Skip]\n"
            "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
            "[ 1, end) └─; := <opaque>\n");
}

TEST_F(GLRTest, RecoverUnrestrictedReduce) {
  // Here, ! is not in any rule and therefore not in the follow set of `word`.
  // We would not normally reduce `word := IDENTIFIER`, but do so for recovery.

  build(R"bnf(
    _ := sentence EOF

    word := IDENTIFIER
    sentence := word word [recover=AcceptAnyTokenInstead]
  )bnf");

  clang::LangOptions LOptions;
  const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions);
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  TestLang.RecoveryStrategies.try_emplace(
      extensionID("AcceptAnyTokenInstead"),
      [](Token::Index Start, const TokenStream &Stream) { return Start + 1; });

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n"
            "[ 0, 1) ├─word := IDENTIFIER\n"
            "[ 0, 1) │ └─IDENTIFIER := tok[0]\n"
            "[ 1, end) └─word := <opaque>\n");
}

TEST_F(GLRTest, RecoveryFromStartOfInput) {
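  // Recovery attached to the start rule fires even when the very first token
  // fails to parse; the Fallback strategy then consumes the whole stream.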
  build(R"bnf(
    _ := start [recover=Fallback] EOF

    start := IDENTIFIER
  )bnf");
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  bool fallback_recovered = false;
  auto fallback = [&](Token::Index Start, const TokenStream &Code) {
    fallback_recovered = true;
    return Code.tokens().size();
  };
  TestLang.RecoveryStrategies.try_emplace(extensionID("Fallback"), fallback);
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("?", LOptions), LOptions);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("start"), TestLang);
  EXPECT_TRUE(fallback_recovered);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) start := <opaque>\n");
}

TEST_F(GLRTest, RepeatedRecovery) {
  // We require multiple steps of recovery at eof and then a reduction in order
  // to successfully parse.
  build(R"bnf(
    _ := function EOF
    # FIXME: this forces EOF to be in follow(signature).
    # Remove it once we use unconstrained reduction for recovery.
    _ := signature EOF

    function := signature body [recover=Skip]
    signature := IDENTIFIER params [recover=Skip]
    params := ( )
    body := { }
  )bnf");
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  TestLang.RecoveryStrategies.try_emplace(
      extensionID("Skip"),
      [](Token::Index Start, const TokenStream &) { return Start; });
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("main", LOptions), LOptions);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("function"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) function := signature body [recover=Skip]\n"
            "[ 0, 1) ├─signature := IDENTIFIER params [recover=Skip]\n"
            "[ 0, 1) │ ├─IDENTIFIER := tok[0]\n"
            "[ 1, 1) │ └─params := <opaque>\n"
            "[ 1, end) └─body := <opaque>\n");
}

TEST_F(GLRTest, NoExplicitAccept) {
  build(R"bnf(
    _ := test EOF

    test := IDENTIFIER test
    test := IDENTIFIER
  )bnf");
  clang::LangOptions LOptions;
  // Given the following input and the grammar above, we perform two reductions
  // of the nonterminal `test` when the next token is `eof`; verify that the
  // parser stops at the right state.
  const TokenStream &Tokens = cook(lex("id id", LOptions), LOptions);
  TestLang.Table = LRTable::buildSLR(TestLang.G);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) test := IDENTIFIER test\n"
            "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
            "[ 1, end) └─test := IDENTIFIER\n"
            "[ 1, end) └─IDENTIFIER := tok[1]\n");
}

TEST_F(GLRTest, GuardExtension) {
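  // The guard accepts the rule only when the IDENTIFIER is spelled `test`;
  // any other spelling makes the parse fail and fall back to an opaque node.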
  build(R"bnf(
    _ := start EOF

    start := IDENTIFIER [guard]
  )bnf");
  TestLang.Guards.try_emplace(
      ruleFor("start"), [&](const GuardParams &P) {
        assert(P.RHS.size() == 1 &&
               P.RHS.front()->symbol() ==
                   tokenSymbol(clang::tok::identifier));
        return P.Tokens.tokens()[P.RHS.front()->startTokenIndex()]
                   .text() == "test";
      });
  clang::LangOptions LOptions;
  TestLang.Table = LRTable::buildSLR(TestLang.G);

  std::string Input = "test";
  const TokenStream &Succeeded = cook(lex(Input, LOptions), LOptions);
  EXPECT_EQ(glrParse({Succeeded, Arena, GSStack}, id("start"), TestLang)
                .dumpRecursive(TestLang.G),
            "[ 0, end) start := IDENTIFIER [guard]\n"
            "[ 0, end) └─IDENTIFIER := tok[0]\n");

  Input = "notest";
  const TokenStream &Failed = cook(lex(Input, LOptions), LOptions);
  EXPECT_EQ(glrParse({Failed, Arena, GSStack}, id("start"), TestLang)
                .dumpRecursive(TestLang.G),
            "[ 0, end) start := <opaque>\n");
}

TEST(GSSTest, GC) {
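  // gc() destroys nodes that are unreachable from the given roots; their
  // storage is reused by later addNode() calls.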
  //      ┌-A-┬-AB
  //      ├-B-┘
  // Root-+-C
  //      ├-D
  //      └-E
  GSS GSStack;
  auto *Root = GSStack.addNode(0, nullptr, {});
  auto *A = GSStack.addNode(0, nullptr, {Root});
  auto *B = GSStack.addNode(0, nullptr, {Root});
  auto *C = GSStack.addNode(0, nullptr, {Root});
  auto *D = GSStack.addNode(0, nullptr, {Root});
  auto *AB = GSStack.addNode(0, nullptr, {A, B});

  EXPECT_EQ(1u, GSStack.gc({AB, C})) << "D is destroyed";
  EXPECT_EQ(0u, GSStack.gc({AB, C})) << "D is already gone";
  auto *E = GSStack.addNode(0, nullptr, {Root});
  EXPECT_EQ(D, E) << "Storage of GCed node D is reused for E";
  EXPECT_EQ(3u, GSStack.gc({A, E})) << "Destroys B, AB, C";
  EXPECT_EQ(1u, GSStack.gc({E})) << "Destroys A";
}

} // namespace
} // namespace pseudo
} // namespace clang
790 | |