1 | //===--- GLR.cpp -----------------------------------------------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "clang-pseudo/GLR.h" |
10 | #include "clang-pseudo/Language.h" |
11 | #include "clang-pseudo/grammar/Grammar.h" |
12 | #include "clang-pseudo/grammar/LRTable.h" |
13 | #include "clang/Basic/TokenKinds.h" |
14 | #include "llvm/ADT/ArrayRef.h" |
15 | #include "llvm/ADT/STLExtras.h" |
16 | #include "llvm/ADT/ScopeExit.h" |
17 | #include "llvm/ADT/StringExtras.h" |
18 | #include "llvm/Support/Debug.h" |
19 | #include "llvm/Support/FormatVariadic.h" |
20 | #include <algorithm> |
21 | #include <memory> |
22 | #include <optional> |
23 | #include <queue> |
24 | |
25 | #define DEBUG_TYPE "GLR.cpp" |
26 | |
27 | namespace clang { |
28 | namespace pseudo { |
29 | namespace { |
30 | |
31 | Token::Index findRecoveryEndpoint(ExtensionID Strategy, Token::Index Begin, |
32 | const TokenStream &Tokens, |
33 | const Language &Lang) { |
34 | assert(Strategy != 0); |
35 | if (auto S = Lang.RecoveryStrategies.lookup(Val: Strategy)) |
36 | return S(Begin, Tokens); |
37 | return Token::Invalid; |
38 | } |
39 | |
40 | } // namespace |
41 | |
42 | void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads, |
43 | unsigned &TokenIndex, const ParseParams &Params, |
44 | const Language &Lang, |
45 | std::vector<const GSS::Node *> &NewHeads) { |
46 | LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n" ); |
47 | // Describes a possibility to recover by forcibly interpreting a range of |
48 | // tokens around the cursor as a nonterminal that we expected to see. |
49 | struct PlaceholderRecovery { |
50 | // The token prior to the nonterminal which is being recovered. |
51 | // This starts of the region we're skipping, so higher Position is better. |
52 | Token::Index Position; |
53 | // The nonterminal which will be created in order to recover. |
54 | SymbolID Symbol; |
55 | // The heuristic used to choose the bounds of the nonterminal to recover. |
56 | ExtensionID Strategy; |
57 | |
58 | // The GSS head where we are expecting the recovered nonterminal. |
59 | const GSS::Node *RecoveryNode; |
60 | // Payload of nodes on the way back from the OldHead to the recovery node. |
61 | // These represent the partial parse that is being discarded. |
62 | // They should become the children of the opaque recovery node. |
63 | // FIXME: internal structure of opaque nodes is not implemented. |
64 | // |
65 | // There may be multiple paths leading to the same recovery node, we choose |
66 | // one arbitrarily. |
67 | std::vector<const ForestNode *> DiscardedParse; |
68 | }; |
69 | std::vector<PlaceholderRecovery> Options; |
70 | |
71 | // Find recovery options by walking up the stack. |
72 | // |
73 | // This is similar to exception handling: we walk up the "frames" of nested |
74 | // rules being parsed until we find one that has a "handler" which allows us |
75 | // to determine the node bounds without parsing it. |
76 | // |
77 | // Unfortunately there's a significant difference: the stack contains both |
78 | // "upward" nodes (ancestor parses) and "leftward" ones. |
79 | // e.g. when parsing `{ if (1) ? }` as compound-stmt, the stack contains: |
80 | // stmt := IF ( expr ) . stmt - current state, we should recover here! |
81 | // stmt := IF ( expr . ) stmt - (left, no recovery here) |
82 | // stmt := IF ( . expr ) stmt - left, we should NOT recover here! |
83 | // stmt := IF . ( expr ) stmt - (left, no recovery here) |
84 | // stmt-seq := . stmt - up, we might recover here |
85 | // compound-stmt := { . stmt-seq } - up, we should recover here! |
86 | // |
87 | // It's not obvious how to avoid collecting "leftward" recovery options. |
88 | // I think the distinction is ill-defined after merging items into states. |
89 | // For now, we have to take this into account when defining recovery rules. |
90 | // (e.g. in the expr recovery above, stay inside the parentheses). |
91 | // FIXME: find a more satisfying way to avoid such false recovery. |
92 | // FIXME: Add a test for spurious recovery once tests can define strategies. |
93 | std::vector<const ForestNode *> Path; |
94 | llvm::DenseSet<const GSS::Node *> Seen; |
95 | auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) { |
96 | if (!Seen.insert(V: N).second) |
97 | return; |
98 | if (!N->Recovered) { // Don't recover the same way twice! |
99 | for (auto Strategy : Lang.Table.getRecovery(State: N->State)) { |
100 | Options.push_back(x: PlaceholderRecovery{ |
101 | .Position: NextTok, |
102 | .Symbol: Strategy.Result, |
103 | .Strategy: Strategy.Strategy, |
104 | .RecoveryNode: N, |
105 | .DiscardedParse: Path, |
106 | }); |
107 | LLVM_DEBUG(llvm::dbgs() |
108 | << "Option: recover " << Lang.G.symbolName(Strategy.Result) |
109 | << " at token " << NextTok << "\n" ); |
110 | } |
111 | } |
112 | Path.push_back(x: N->Payload); |
113 | for (const GSS::Node *Parent : N->parents()) |
114 | WalkUp(Parent, N->Payload->startTokenIndex(), WalkUp); |
115 | Path.pop_back(); |
116 | }; |
117 | for (auto *N : OldHeads) |
118 | WalkUp(N, TokenIndex, WalkUp); |
119 | |
120 | // Now we select the option(s) we will use to recover. |
121 | // |
122 | // We prefer options starting further right, as these discard less code |
123 | // (e.g. we prefer to recover inner scopes rather than outer ones). |
124 | // The options also need to agree on an endpoint, so the parser has a |
125 | // consistent position afterwards. |
126 | // |
127 | // So conceptually we're sorting by the tuple (start, end), though we avoid |
128 | // computing `end` for options that can't be winners. |
129 | |
130 | // Consider options starting further right first. |
131 | // Don't drop the others yet though, we may still use them if preferred fails. |
132 | llvm::stable_sort(Range&: Options, C: [&](const auto &L, const auto &R) { |
133 | return L.Position > R.Position; |
134 | }); |
135 | |
136 | // We may find multiple winners, but they will have the same range. |
137 | std::optional<Token::Range> RecoveryRange; |
138 | std::vector<const PlaceholderRecovery *> BestOptions; |
139 | for (const PlaceholderRecovery &Option : Options) { |
140 | // If this starts further left than options we've already found, then |
141 | // we'll never find anything better. Skip computing End for the rest. |
142 | if (RecoveryRange && Option.Position < RecoveryRange->Begin) |
143 | break; |
144 | |
145 | auto End = findRecoveryEndpoint(Strategy: Option.Strategy, Begin: Option.Position, |
146 | Tokens: Params.Code, Lang); |
147 | // Recovery may not take the parse backwards. |
148 | if (End == Token::Invalid || End < TokenIndex) |
149 | continue; |
150 | if (RecoveryRange) { |
151 | // If this is worse than our previous options, ignore it. |
152 | if (RecoveryRange->End < End) |
153 | continue; |
154 | // If this is an improvement over our previous options, then drop them. |
155 | if (RecoveryRange->End > End) |
156 | BestOptions.clear(); |
157 | } |
158 | // Create recovery nodes and heads for them in the GSS. These may be |
159 | // discarded if a better recovery is later found, but this path isn't hot. |
160 | RecoveryRange = {.Begin: Option.Position, .End: End}; |
161 | BestOptions.push_back(x: &Option); |
162 | } |
163 | |
164 | if (BestOptions.empty()) { |
165 | LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size() |
166 | << " strategies\n" ); |
167 | return; |
168 | } |
169 | |
170 | // We've settled on a set of recovery options, so create their nodes and |
171 | // advance the cursor. |
172 | LLVM_DEBUG({ |
173 | llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":" ; |
174 | for (const auto *Option : BestOptions) |
175 | llvm::dbgs() << " " << Lang.G.symbolName(Option->Symbol); |
176 | llvm::dbgs() << "\n" ; |
177 | }); |
178 | // FIXME: in general, we might have the same Option->Symbol multiple times, |
179 | // and we risk creating redundant Forest and GSS nodes. |
180 | // We also may inadvertently set up the next glrReduce to create a sequence |
181 | // node duplicating an opaque node that we're creating here. |
182 | // There are various options, including simply breaking ties between options. |
183 | // For now it's obscure enough to ignore. |
184 | for (const PlaceholderRecovery *Option : BestOptions) { |
185 | Option->RecoveryNode->Recovered = true; |
186 | const ForestNode &Placeholder = |
187 | Params.Forest.createOpaque(SID: Option->Symbol, Start: RecoveryRange->Begin); |
188 | LRTable::StateID OldState = Option->RecoveryNode->State; |
189 | LRTable::StateID NewState = |
190 | isToken(ID: Option->Symbol) |
191 | ? *Lang.Table.getShiftState(State: OldState, Terminal: Option->Symbol) |
192 | : *Lang.Table.getGoToState(State: OldState, Nonterminal: Option->Symbol); |
193 | const GSS::Node *NewHead = |
194 | Params.GSStack.addNode(State: NewState, Symbol: &Placeholder, Parents: {Option->RecoveryNode}); |
195 | NewHeads.push_back(x: NewHead); |
196 | } |
197 | TokenIndex = RecoveryRange->End; |
198 | } |
199 | |
200 | using StateID = LRTable::StateID; |
201 | |
202 | llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) { |
203 | std::vector<std::string> ParentStates; |
204 | for (const auto *Parent : N.parents()) |
205 | ParentStates.push_back(x: llvm::formatv(Fmt: "{0}" , Vals: Parent->State)); |
206 | OS << llvm::formatv(Fmt: "state {0}, parsed symbol {1}, parents {3}" , Vals: N.State, |
207 | Vals: N.Payload ? N.Payload->symbol() : 0, |
208 | Vals: llvm::join(R&: ParentStates, Separator: ", " )); |
209 | return OS; |
210 | } |
211 | |
212 | // Apply all pending shift actions. |
213 | // In theory, LR parsing doesn't have shift/shift conflicts on a single head. |
214 | // But we may have multiple active heads, and each head has a shift action. |
215 | // |
216 | // We merge the stack -- if multiple heads will reach the same state after |
217 | // shifting a token, we shift only once by combining these heads. |
218 | // |
219 | // E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4: |
220 | // 0---1---2 |
221 | // â””---3 |
222 | // After the shift action, the GSS is: |
223 | // 0---1---2---4 |
224 | // └---3---┘ |
225 | void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads, |
226 | const ForestNode &NewTok, const ParseParams &Params, |
227 | const Language &Lang, std::vector<const GSS::Node *> &NewHeads) { |
228 | assert(NewTok.kind() == ForestNode::Terminal); |
229 | LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n" , |
230 | Lang.G.symbolName(NewTok.symbol()), |
231 | OldHeads.size())); |
232 | |
233 | // We group pending shifts by their target state so we can merge them. |
234 | llvm::SmallVector<std::pair<StateID, const GSS::Node *>, 8> Shifts; |
235 | for (const auto *H : OldHeads) |
236 | if (auto S = Lang.Table.getShiftState(State: H->State, Terminal: NewTok.symbol())) |
237 | Shifts.push_back(Elt: {*S, H}); |
238 | llvm::stable_sort(Range&: Shifts, C: llvm::less_first{}); |
239 | |
240 | auto Rest = llvm::ArrayRef(Shifts); |
241 | llvm::SmallVector<const GSS::Node *> Parents; |
242 | while (!Rest.empty()) { |
243 | // Collect the batch of PendingShift that have compatible shift states. |
244 | // Their heads become TempParents, the parents of the new GSS node. |
245 | StateID NextState = Rest.front().first; |
246 | |
247 | Parents.clear(); |
248 | for (const auto &Base : Rest) { |
249 | if (Base.first != NextState) |
250 | break; |
251 | Parents.push_back(Elt: Base.second); |
252 | } |
253 | Rest = Rest.drop_front(N: Parents.size()); |
254 | |
255 | LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)\n" , |
256 | NextState, Parents.size())); |
257 | NewHeads.push_back(x: Params.GSStack.addNode(State: NextState, Symbol: &NewTok, Parents)); |
258 | } |
259 | } |
260 | |
261 | namespace { |
262 | // A KeyedQueue yields pairs of keys and values in order of the keys. |
263 | template <typename Key, typename Value> |
264 | using KeyedQueue = |
265 | std::priority_queue<std::pair<Key, Value>, |
266 | std::vector<std::pair<Key, Value>>, llvm::less_first>; |
267 | |
268 | template <typename T> void sortAndUnique(std::vector<T> &Vec) { |
269 | llvm::sort(Vec); |
270 | Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end()); |
271 | } |
272 | |
273 | // Perform reduces until no more are possible. |
274 | // |
275 | // Generally this means walking up from the heads gathering ForestNodes that |
276 | // will match the RHS of the rule we're reducing into a sequence ForestNode, |
277 | // and ending up at a base node. |
278 | // Then we push a new GSS node onto that base, taking care to: |
279 | // - pack alternative sequence ForestNodes into an ambiguous ForestNode. |
280 | // - use the same GSS node for multiple heads if the parse state matches. |
281 | // |
282 | // Examples of reduction: |
283 | // Before (simple): |
284 | // 0--1(expr)--2(semi) |
285 | // After reducing 2 by `stmt := expr semi`: |
286 | // 0--3(stmt) // 3 is goto(0, stmt) |
287 | // |
288 | // Before (splitting due to R/R conflict): |
289 | // 0--1(IDENTIFIER) |
290 | // After reducing 1 by `class-name := IDENTIFIER` & `enum-name := IDENTIFIER`: |
291 | // 0--2(class-name) // 2 is goto(0, class-name) |
292 | // â””--3(enum-name) // 3 is goto(0, enum-name) |
293 | // |
294 | // Before (splitting due to multiple bases): |
295 | // 0--2(class-name)--4(STAR) |
296 | // └--3(enum-name)---┘ |
297 | // After reducing 4 by `ptr-operator := STAR`: |
298 | // 0--2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator) |
299 | // â””--3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator) |
300 | // |
301 | // Before (joining due to same goto state, multiple bases): |
302 | // 0--1(cv-qualifier)--3(class-name) |
303 | // â””--2(cv-qualifier)--4(enum-name) |
304 | // After reducing 3 by `type-name := class-name` and |
305 | // 4 by `type-name := enum-name`: |
306 | // 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and |
307 | // └--2(cv-qualifier)--┘ // goto(2, type-name) |
308 | // |
309 | // Before (joining due to same goto state, the same base): |
310 | // 0--1(class-name)--3(STAR) |
311 | // â””--2(enum-name)--4(STAR) |
312 | // After reducing 3 by `pointer := class-name STAR` and |
313 | // 2 by`enum-name := class-name STAR`: |
314 | // 0--5(pointer) // 5 is goto(0, pointer) |
315 | // |
316 | // (This is a functor rather than a function to allow it to reuse scratch |
317 | // storage across calls). |
318 | class GLRReduce { |
319 | const ParseParams &Params; |
320 | const Language& Lang; |
321 | // There are two interacting complications: |
322 | // 1. Performing one reduce can unlock new reduces on the newly-created head. |
323 | // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes). |
324 | // This means we must have unlocked all the reduces that contribute to it. |
325 | // 2b. Similarly, the new GSS nodes must be complete (have all parents). |
326 | // |
327 | // We define a "family" of reduces as those that produce the same symbol and |
328 | // cover the same range of tokens. These are exactly the set of reductions |
329 | // whose sequence nodes would be covered by the same ambiguous node. |
330 | // We wish to process a whole family at a time (to satisfy complication 2), |
331 | // and can address complication 1 by carefully ordering the families: |
332 | // - Process families covering fewer tokens first. |
333 | // A reduce can't depend on a longer reduce! |
334 | // - For equal token ranges: if S := T, process T families before S families. |
335 | // Parsing T can't depend on an equal-length S, as the grammar is acyclic. |
336 | // |
337 | // This isn't quite enough: we don't know the token length of the reduction |
338 | // until we walk up the stack to perform the pop. |
339 | // So we perform the pop part upfront, and place the push specification on |
340 | // priority queues such that we can retrieve a family at a time. |
341 | |
342 | // A reduction family is characterized by its token range and symbol produced. |
343 | // It is used as a key in the priority queues to group pushes by family. |
344 | struct Family { |
345 | // The start of the token range of the reduce. |
346 | Token::Index Start; |
347 | SymbolID Symbol; |
348 | // Rule must produce Symbol and can otherwise be arbitrary. |
349 | // RuleIDs have the topological order based on the acyclic grammar. |
350 | // FIXME: should SymbolIDs be so ordered instead? |
351 | RuleID Rule; |
352 | |
353 | bool operator==(const Family &Other) const { |
354 | return Start == Other.Start && Symbol == Other.Symbol; |
355 | } |
356 | // The larger Family is the one that should be processed first. |
357 | bool operator<(const Family &Other) const { |
358 | if (Start != Other.Start) |
359 | return Start < Other.Start; |
360 | if (Symbol != Other.Symbol) |
361 | return Rule > Other.Rule; |
362 | assert(*this == Other); |
363 | return false; |
364 | } |
365 | }; |
366 | |
367 | // A sequence is the ForestNode payloads of the GSS nodes we are reducing. |
368 | using Sequence = llvm::SmallVector<const ForestNode *, Rule::MaxElements>; |
369 | // Like ArrayRef<const ForestNode*>, but with the missing operator<. |
370 | // (Sequences are big to move by value as the collections gets rearranged). |
371 | struct SequenceRef { |
372 | SequenceRef(const Sequence &S) : S(S) {} |
373 | llvm::ArrayRef<const ForestNode *> S; |
374 | friend bool operator==(SequenceRef A, SequenceRef B) { return A.S == B.S; } |
375 | friend bool operator<(const SequenceRef &A, const SequenceRef &B) { |
376 | return std::lexicographical_compare(first1: A.S.begin(), last1: A.S.end(), first2: B.S.begin(), |
377 | last2: B.S.end()); |
378 | } |
379 | }; |
380 | // Underlying storage for sequences pointed to by stored SequenceRefs. |
381 | std::deque<Sequence> SequenceStorage; |
382 | // We don't actually destroy the sequences between calls, to reuse storage. |
383 | // Everything SequenceStorage[ >=SequenceStorageCount ] is reusable scratch. |
384 | unsigned SequenceStorageCount; |
385 | |
386 | // Halfway through a reduction (after the pop, before the push), we have |
387 | // collected nodes for the RHS of a rule, and reached a base node. |
388 | // They specify a sequence ForestNode we may build (but we dedup first). |
389 | // (The RuleID is not stored here, but rather in the Family). |
390 | struct PushSpec { |
391 | // The last node popped before pushing. Its parent is the reduction base(s). |
392 | // (Base is more fundamental, but this is cheaper to store). |
393 | const GSS::Node* LastPop = nullptr; |
394 | Sequence *Seq = nullptr; |
395 | }; |
396 | KeyedQueue<Family, PushSpec> Sequences; // FIXME: rename => PendingPushes? |
397 | |
398 | // We treat Heads as a queue of Pop operations still to be performed. |
399 | // PoppedHeads is our position within it. |
400 | std::vector<const GSS::Node *> *Heads; |
401 | unsigned NextPopHead; |
402 | SymbolID Lookahead; |
403 | |
404 | Sequence TempSequence; |
405 | public: |
406 | GLRReduce(const ParseParams &Params, const Language &Lang) |
407 | : Params(Params), Lang(Lang) {} |
408 | |
409 | // Reduce Heads, resulting in new nodes that are appended to Heads. |
410 | // The "consumed" nodes are not removed! |
411 | // Only reduce rules compatible with the Lookahead are applied, though |
412 | // tokenSymbol(tok::unknown) will match any rule. |
413 | void operator()(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead) { |
414 | assert(isToken(Lookahead)); |
415 | |
416 | NextPopHead = 0; |
417 | this->Heads = &Heads; |
418 | this->Lookahead = Lookahead; |
419 | assert(Sequences.empty()); |
420 | SequenceStorageCount = 0; |
421 | |
422 | popPending(); |
423 | while (!Sequences.empty()) { |
424 | pushNext(); |
425 | popPending(); |
426 | } |
427 | } |
428 | |
429 | private: |
430 | bool canReduce(const Rule &R, RuleID RID, |
431 | llvm::ArrayRef<const ForestNode *> RHS) const { |
432 | if (!R.Guarded) |
433 | return true; |
434 | if (auto Guard = Lang.Guards.lookup(Val: RID)) |
435 | return Guard({.RHS: RHS, .Tokens: Params.Code, .Lookahead: Lookahead}); |
436 | LLVM_DEBUG(llvm::dbgs() |
437 | << llvm::formatv("missing guard implementation for rule {0}\n" , |
438 | Lang.G.dumpRule(RID))); |
439 | return true; |
440 | } |
441 | // pop walks up the parent chain(s) for a reduction from Head by to Rule. |
442 | // Once we reach the end, record the bases and sequences. |
443 | void pop(const GSS::Node *Head, RuleID RID, const Rule &Rule) { |
444 | LLVM_DEBUG(llvm::dbgs() << " Pop " << Lang.G.dumpRule(RID) << "\n" ); |
445 | Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID}; |
446 | TempSequence.resize_for_overwrite(N: Rule.Size); |
447 | auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) { |
448 | TempSequence[Rule.Size - 1 - I] = N->Payload; |
449 | if (I + 1 == Rule.Size) { |
450 | F.Start = TempSequence.front()->startTokenIndex(); |
451 | LLVM_DEBUG({ |
452 | for (const auto *B : N->parents()) |
453 | llvm::dbgs() << " --> base at S" << B->State << "\n" ; |
454 | }); |
455 | if (!canReduce(R: Rule, RID, RHS: TempSequence)) |
456 | return; |
457 | // Copy the chain to stable storage so it can be enqueued. |
458 | if (SequenceStorageCount == SequenceStorage.size()) |
459 | SequenceStorage.emplace_back(); |
460 | SequenceStorage[SequenceStorageCount] = TempSequence; |
461 | Sequence *Seq = &SequenceStorage[SequenceStorageCount++]; |
462 | |
463 | Sequences.emplace(args&: F, args: PushSpec{.LastPop: N, .Seq: Seq}); |
464 | return; |
465 | } |
466 | for (const GSS::Node *Parent : N->parents()) |
467 | DFS(Parent, I + 1, DFS); |
468 | }; |
469 | DFS(Head, 0, DFS); |
470 | } |
471 | |
472 | // popPending pops every available reduction. |
473 | void popPending() { |
474 | for (; NextPopHead < Heads->size(); ++NextPopHead) { |
475 | // In trivial cases, we perform the complete reduce here! |
476 | if (popAndPushTrivial()) |
477 | continue; |
478 | for (RuleID RID : |
479 | Lang.Table.getReduceRules(State: (*Heads)[NextPopHead]->State)) { |
480 | const auto &Rule = Lang.G.lookupRule(RID); |
481 | if (Lang.Table.canFollow(Nonterminal: Rule.Target, Terminal: Lookahead)) |
482 | pop(Head: (*Heads)[NextPopHead], RID, Rule); |
483 | } |
484 | } |
485 | } |
486 | |
487 | // Storage reused by each call to pushNext. |
488 | std::vector<std::pair</*Goto*/ StateID, const GSS::Node *>> FamilyBases; |
489 | std::vector<std::pair<RuleID, SequenceRef>> FamilySequences; |
490 | std::vector<const GSS::Node *> Parents; |
491 | std::vector<const ForestNode *> SequenceNodes; |
492 | |
493 | // Process one push family, forming a forest node. |
494 | // This produces new GSS heads which may enable more pops. |
495 | void pushNext() { |
496 | assert(!Sequences.empty()); |
497 | Family F = Sequences.top().first; |
498 | |
499 | LLVM_DEBUG(llvm::dbgs() << " Push " << Lang.G.symbolName(F.Symbol) |
500 | << " from token " << F.Start << "\n" ); |
501 | |
502 | // Grab the sequences and bases for this family. |
503 | // We don't care which rule yielded each base. If Family.Symbol is S, the |
504 | // base includes an item X := ... • S ... and since the grammar is |
505 | // context-free, *all* parses of S are valid here. |
506 | FamilySequences.clear(); |
507 | FamilyBases.clear(); |
508 | do { |
509 | const PushSpec &Push = Sequences.top().second; |
510 | FamilySequences.emplace_back(args: Sequences.top().first.Rule, args&: *Push.Seq); |
511 | for (const GSS::Node *Base : Push.LastPop->parents()) { |
512 | auto NextState = Lang.Table.getGoToState(State: Base->State, Nonterminal: F.Symbol); |
513 | assert(NextState.has_value() && "goto must succeed after reduce!" ); |
514 | FamilyBases.emplace_back(args&: *NextState, args&: Base); |
515 | } |
516 | |
517 | Sequences.pop(); |
518 | } while (!Sequences.empty() && Sequences.top().first == F); |
519 | // Build a forest node for each unique sequence. |
520 | sortAndUnique(Vec&: FamilySequences); |
521 | SequenceNodes.clear(); |
522 | for (const auto &SequenceSpec : FamilySequences) |
523 | SequenceNodes.push_back(x: &Params.Forest.createSequence( |
524 | SID: F.Symbol, RID: SequenceSpec.first, Elements: SequenceSpec.second.S)); |
525 | // Wrap in an ambiguous node if needed. |
526 | const ForestNode *Parsed = |
527 | SequenceNodes.size() == 1 |
528 | ? SequenceNodes.front() |
529 | : &Params.Forest.createAmbiguous(SID: F.Symbol, Alternatives: SequenceNodes); |
530 | LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Lang.G) << "\n" ); |
531 | |
532 | // Bases for this family, deduplicate them, and group by the goTo State. |
533 | sortAndUnique(Vec&: FamilyBases); |
534 | // Create a GSS node for each unique goto state. |
535 | llvm::ArrayRef<decltype(FamilyBases)::value_type> BasesLeft = FamilyBases; |
536 | while (!BasesLeft.empty()) { |
537 | StateID NextState = BasesLeft.front().first; |
538 | Parents.clear(); |
539 | for (const auto &Base : BasesLeft) { |
540 | if (Base.first != NextState) |
541 | break; |
542 | Parents.push_back(x: Base.second); |
543 | } |
544 | BasesLeft = BasesLeft.drop_front(N: Parents.size()); |
545 | Heads->push_back(x: Params.GSStack.addNode(State: NextState, Symbol: Parsed, Parents)); |
546 | } |
547 | } |
548 | |
549 | // In general we split a reduce into a pop/push, so concurrently-available |
550 | // reductions can run in the correct order. The data structures are expensive. |
551 | // |
552 | // When only one reduction is possible at a time, we can skip this: |
553 | // we pop and immediately push, as an LR parser (as opposed to GLR) would. |
554 | // This is valid whenever there's only one concurrent PushSpec. |
555 | // |
556 | // This function handles a trivial but common subset of these cases: |
557 | // - there must be no pending pushes, and only one poppable head |
558 | // - the head must have only one reduction rule |
559 | // - the reduction path must be a straight line (no multiple parents) |
560 | // (Roughly this means there's no local ambiguity, so the LR algorithm works). |
561 | // |
562 | // Returns true if we successfully consumed the next unpopped head. |
563 | bool popAndPushTrivial() { |
564 | if (!Sequences.empty() || Heads->size() != NextPopHead + 1) |
565 | return false; |
566 | const GSS::Node *Head = Heads->back(); |
567 | std::optional<RuleID> RID; |
568 | for (RuleID R : Lang.Table.getReduceRules(State: Head->State)) { |
569 | if (RID.has_value()) |
570 | return false; |
571 | RID = R; |
572 | } |
573 | if (!RID) |
574 | return true; // no reductions available, but we've processed the head! |
575 | const auto &Rule = Lang.G.lookupRule(RID: *RID); |
576 | if (!Lang.Table.canFollow(Nonterminal: Rule.Target, Terminal: Lookahead)) |
577 | return true; // reduction is not available |
578 | const GSS::Node *Base = Head; |
579 | TempSequence.resize_for_overwrite(N: Rule.Size); |
580 | for (unsigned I = 0; I < Rule.Size; ++I) { |
581 | if (Base->parents().size() != 1) |
582 | return false; |
583 | TempSequence[Rule.Size - 1 - I] = Base->Payload; |
584 | Base = Base->parents().front(); |
585 | } |
586 | if (!canReduce(R: Rule, RID: *RID, RHS: TempSequence)) |
587 | return true; // reduction is not available |
588 | const ForestNode *Parsed = |
589 | &Params.Forest.createSequence(SID: Rule.Target, RID: *RID, Elements: TempSequence); |
590 | auto NextState = Lang.Table.getGoToState(State: Base->State, Nonterminal: Rule.Target); |
591 | assert(NextState.has_value() && "goto must succeed after reduce!" ); |
592 | Heads->push_back(x: Params.GSStack.addNode(State: *NextState, Symbol: Parsed, Parents: {Base})); |
593 | LLVM_DEBUG(llvm::dbgs() |
594 | << " Reduce (trivial) " << Lang.G.dumpRule(*RID) << "\n" |
595 | << " --> S" << Heads->back()->State << "\n" ); |
596 | return true; |
597 | } |
598 | }; |
599 | |
600 | } // namespace |
601 | |
602 | ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol, |
603 | const Language &Lang) { |
604 | GLRReduce Reduce(Params, Lang); |
605 | assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal" ); |
606 | llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Code: Params.Code); |
607 | auto &GSS = Params.GSStack; |
608 | |
609 | StateID StartState = Lang.Table.getStartState(StartSymbol); |
610 | // Heads correspond to the parse of tokens [0, I), NextHeads to [0, I+1). |
611 | std::vector<const GSS::Node *> Heads = {GSS.addNode(/*State=*/StartState, |
612 | /*ForestNode=*/Symbol: nullptr, |
613 | Parents: {})}; |
614 | // Invariant: Heads is partitioned by source: {shifted | reduced}. |
615 | // HeadsPartition is the index of the first head formed by reduction. |
616 | // We use this to discard and recreate the reduced heads during recovery. |
617 | unsigned HeadsPartition = Heads.size(); |
618 | std::vector<const GSS::Node *> NextHeads; |
619 | auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable { |
620 | assert(NextHeads.empty() && "Running GC at the wrong time!" ); |
621 | if (++I != 20) // Run periodically to balance CPU and memory usage. |
622 | return; |
623 | I = 0; |
624 | |
625 | // We need to copy the list: Roots is consumed by the GC. |
626 | Roots = Heads; |
627 | GSS.gc(Roots: std::move(Roots)); |
628 | }; |
629 | // Each iteration fully processes a single token. |
630 | for (unsigned I = 0; I < Terminals.size();) { |
631 | LLVM_DEBUG(llvm::dbgs() << llvm::formatv( |
632 | "Next token {0} (id={1})\n" , |
633 | Lang.G.symbolName(Terminals[I].symbol()), Terminals[I].symbol())); |
634 | // Consume the token. |
635 | glrShift(OldHeads: Heads, NewTok: Terminals[I], Params, Lang, NewHeads&: NextHeads); |
636 | |
637 | // If we weren't able to consume the token, try to skip over some tokens |
638 | // so we can keep parsing. |
639 | if (NextHeads.empty()) { |
640 | // The reduction in the previous round was constrained by lookahead. |
641 | // On valid code this only rejects dead ends, but on broken code we should |
642 | // consider all possibilities. |
643 | // |
644 | // We discard all heads formed by reduction, and recreate them without |
645 | // this constraint. This may duplicate some nodes, but it's rare. |
646 | LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. " |
647 | "Re-reducing without lookahead.\n" ); |
648 | Heads.resize(new_size: HeadsPartition); |
649 | Reduce(Heads, /*allow all reductions*/ tokenSymbol(TK: tok::unknown)); |
650 | |
651 | glrRecover(OldHeads: Heads, TokenIndex&: I, Params, Lang, NewHeads&: NextHeads); |
652 | if (NextHeads.empty()) |
653 | // FIXME: Ensure the `_ := start-symbol` rules have a fallback |
654 | // error-recovery strategy attached. Then this condition can't happen. |
655 | return Params.Forest.createOpaque(SID: StartSymbol, /*Token::Index=*/Start: 0); |
656 | } else |
657 | ++I; |
658 | |
659 | // Form nonterminals containing the token we just consumed. |
660 | SymbolID Lookahead = |
661 | I == Terminals.size() ? tokenSymbol(TK: tok::eof) : Terminals[I].symbol(); |
662 | HeadsPartition = NextHeads.size(); |
663 | Reduce(NextHeads, Lookahead); |
664 | // Prepare for the next token. |
665 | std::swap(x&: Heads, y&: NextHeads); |
666 | NextHeads.clear(); |
667 | MaybeGC(); |
668 | } |
669 | LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n" )); |
670 | |
671 | // The parse was successful if in state `_ := start-symbol EOF .` |
672 | // The GSS parent has `_ := start-symbol . EOF`; its payload is the parse. |
673 | auto AfterStart = Lang.Table.getGoToState(State: StartState, Nonterminal: StartSymbol); |
674 | assert(AfterStart.has_value() && "goto must succeed after start symbol!" ); |
675 | auto Accept = Lang.Table.getShiftState(State: *AfterStart, Terminal: tokenSymbol(TK: tok::eof)); |
676 | assert(Accept.has_value() && "shift EOF must succeed!" ); |
677 | auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) { |
678 | const ForestNode *Result = nullptr; |
679 | for (const auto *Head : Heads) { |
680 | if (Head->State == *Accept) { |
681 | assert(Head->Payload->symbol() == tokenSymbol(tok::eof)); |
682 | assert(Result == nullptr && "multiple results!" ); |
683 | Result = Head->parents().front()->Payload; |
684 | assert(Result->symbol() == StartSymbol); |
685 | } |
686 | } |
687 | return Result; |
688 | }; |
689 | if (auto *Result = SearchForAccept(Heads)) |
690 | return *const_cast<ForestNode *>(Result); // Safe: we created all nodes. |
691 | // We failed to parse the input, returning an opaque forest node for recovery. |
692 | // FIXME: as above, we can add fallback error handling so this is impossible. |
693 | return Params.Forest.createOpaque(SID: StartSymbol, /*Token::Index=*/Start: 0); |
694 | } |
695 | |
696 | void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead, |
697 | const ParseParams &Params, const Language &Lang) { |
698 | // Create a new GLRReduce each time for tests, performance doesn't matter. |
699 | GLRReduce{Params, Lang}(Heads, Lookahead); |
700 | } |
701 | |
702 | const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol, |
703 | llvm::ArrayRef<const Node *> Parents) { |
704 | Node *Result = new (allocate(Parents: Parents.size())) Node(); |
705 | Result->State = State; |
706 | Result->GCParity = GCParity; |
707 | Result->ParentCount = Parents.size(); |
708 | Alive.push_back(x: Result); |
709 | ++NodesCreated; |
710 | Result->Payload = Symbol; |
711 | if (!Parents.empty()) |
712 | llvm::copy(Range&: Parents, Out: reinterpret_cast<const Node **>(Result + 1)); |
713 | return Result; |
714 | } |
715 | |
716 | GSS::Node *GSS::allocate(unsigned Parents) { |
717 | if (FreeList.size() <= Parents) |
718 | FreeList.resize(new_size: Parents + 1); |
719 | auto &SizedList = FreeList[Parents]; |
720 | if (!SizedList.empty()) { |
721 | auto *Result = SizedList.back(); |
722 | SizedList.pop_back(); |
723 | return Result; |
724 | } |
725 | return static_cast<Node *>( |
726 | Arena.Allocate(Size: sizeof(Node) + Parents * sizeof(Node *), Alignment: alignof(Node))); |
727 | } |
728 | |
729 | void GSS::destroy(Node *N) { |
730 | unsigned ParentCount = N->ParentCount; |
731 | N->~Node(); |
732 | assert(FreeList.size() > ParentCount && "established on construction!" ); |
733 | FreeList[ParentCount].push_back(x: N); |
734 | } |
735 | |
736 | unsigned GSS::gc(std::vector<const Node *> &&Queue) { |
737 | #ifndef NDEBUG |
738 | auto ParityMatches = [&](const Node *N) { return N->GCParity == GCParity; }; |
739 | assert("Before GC" && llvm::all_of(Alive, ParityMatches)); |
740 | auto Deferred = llvm::make_scope_exit( |
741 | F: [&] { assert("After GC" && llvm::all_of(Alive, ParityMatches)); }); |
742 | assert(llvm::all_of( |
743 | Queue, [&](const Node *R) { return llvm::is_contained(Alive, R); })); |
744 | #endif |
745 | unsigned InitialCount = Alive.size(); |
746 | |
747 | // Mark |
748 | GCParity = !GCParity; |
749 | while (!Queue.empty()) { |
750 | Node *N = const_cast<Node *>(Queue.back()); // Safe: we created these nodes. |
751 | Queue.pop_back(); |
752 | if (N->GCParity != GCParity) { // Not seen yet |
753 | N->GCParity = GCParity; // Mark as seen |
754 | for (const Node *P : N->parents()) // And walk parents |
755 | Queue.push_back(x: P); |
756 | } |
757 | } |
758 | // Sweep |
759 | llvm::erase_if(C&: Alive, P: [&](Node *N) { |
760 | if (N->GCParity == GCParity) // Walk reached this node. |
761 | return false; |
762 | destroy(N); |
763 | return true; |
764 | }); |
765 | |
766 | LLVM_DEBUG(llvm::dbgs() << "GC pruned " << (InitialCount - Alive.size()) |
767 | << "/" << InitialCount << " GSS nodes\n" ); |
768 | return InitialCount - Alive.size(); |
769 | } |
770 | |
771 | } // namespace pseudo |
772 | } // namespace clang |
773 | |