1//===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This implements a standard Generalized LR (GLR) parsing algorithm.
10//
11// The GLR parser behaves as a normal LR parser until it encounters a conflict.
12// To handle a conflict (where there are multiple actions could perform), the
13// parser will simulate nondeterminism by doing a breadth-first search
14// over all the possibilities.
15//
16// Basic mechanisims of the GLR parser:
17// - A number of processes are operated in parallel.
18// - Each process has its own parsing stack and behaves as a standard
19// determinism LR parser.
20// - When a process encounters a conflict, it will be fork (one for each
21// avaiable action).
22// - When a process encounters an error, it is abandoned.
23// - All process are synchronized by the lookahead token: they perfrom shift
24// action at the same time, which means some processes need wait until other
25// processes have performed all reduce actions.
26//
27//===----------------------------------------------------------------------===//
28
29#ifndef CLANG_PSEUDO_GLR_H
30#define CLANG_PSEUDO_GLR_H
31
32#include "clang-pseudo/Forest.h"
33#include "clang-pseudo/Language.h"
34#include "clang-pseudo/grammar/Grammar.h"
35#include "clang-pseudo/grammar/LRTable.h"
36#include "llvm/Support/Allocator.h"
37#include <vector>
38
39namespace clang {
40namespace pseudo {
41
42// A Graph-Structured Stack efficiently represents all parse stacks of a GLR
43// parser.
44//
45// Each node stores a parse state, the last parsed ForestNode, and the parent
46// node. There may be several heads (top of stack), and the parser operates by:
47// - shift: pushing terminal symbols on top of the stack
48// - reduce: replace N symbols on top of the stack with one nonterminal
49//
50// The structure is a DAG rather than a linear stack:
51// - GLR allows multiple actions (conflicts) on the same head, producing forks
52// where several nodes have the same parent
53// - The parser merges nodes with the same (state, ForestNode), producing joins
54// where one node has multiple parents
55//
56// The parser is responsible for creating nodes and keeping track of the set of
57// heads. The GSS class is mostly an arena for them.
58struct GSS {
59 // A node represents a partial parse of the input up to some point.
60 //
61 // It is the equivalent of a frame in an LR parse stack.
62 // Like such a frame, it has an LR parse state and a syntax-tree node
63 // representing the last parsed symbol (a ForestNode in our case).
64 // Unlike a regular LR stack frame, it may have multiple parents.
65 //
66 // Nodes are not exactly pushed and popped on the stack: pushing is just
67 // allocating a new head node with a parent pointer to the old head. Popping
68 // is just forgetting about a node and remembering its parent instead.
69 struct alignas(struct Node *) Node {
70 // LR state describing how parsing should continue from this head.
71 LRTable::StateID State;
72 // Used internally to track reachability during garbage collection.
73 bool GCParity;
74 // Have we already used this node for error recovery? (prevents loops)
75 mutable bool Recovered = false;
76 // Number of the parents of this node.
77 // The parents hold previous parsed symbols, and may resume control after
78 // this node is reduced.
79 unsigned ParentCount;
80 // The parse node for the last parsed symbol.
81 // This symbol appears on the left of the dot in the parse state's items.
82 // (In the literature, the node is attached to the *edge* to the parent).
83 const ForestNode *Payload = nullptr;
84
85 llvm::ArrayRef<const Node *> parents() const {
86 return llvm::ArrayRef(reinterpret_cast<const Node *const *>(this + 1),
87 ParentCount);
88 };
89 // Parents are stored as a trailing array of Node*.
90 };
91
92 // Allocates a new node in the graph.
93 const Node *addNode(LRTable::StateID State, const ForestNode *Symbol,
94 llvm::ArrayRef<const Node *> Parents);
95 // Frees all nodes not reachable as ancestors of Roots, and returns the count.
96 // Calling this periodically prevents steady memory growth of the GSS.
97 unsigned gc(std::vector<const Node *> &&Roots);
98
99 size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
100 size_t nodesCreated() const { return NodesCreated; }
101
102private:
103 // Nodes are recycled using freelists.
104 // They are variable size, so use one free-list per distinct #parents.
105 std::vector<std::vector<Node *>> FreeList;
106 Node *allocate(unsigned Parents);
107 void destroy(Node *N);
108 // The list of nodes created and not destroyed - our candidates for gc().
109 std::vector<Node *> Alive;
110 bool GCParity = false; // All nodes should match this, except during GC.
111
112 llvm::BumpPtrAllocator Arena;
113 unsigned NodesCreated = 0;
114};
115llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &);
116
117// Parameters for the GLR parsing.
118struct ParseParams {
119 // The token stream to parse.
120 const TokenStream &Code;
121
122 // Arena for data structure used by the GLR algorithm.
123 ForestArena &Forest; // Storage for the output forest.
124 GSS &GSStack; // Storage for parsing stacks.
125};
126
127// Parses the given token stream as the start symbol with the GLR algorithm,
128// and returns a forest node of the start symbol.
129//
130// A rule `_ := StartSymbol` must exit for the chosen start symbol.
131//
132// If the parsing fails, we model it as an opaque node in the forest.
133ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
134 const Language &Lang);
135
136// Shift a token onto all OldHeads, placing the results into NewHeads.
137//
138// Exposed for testing only.
139void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
140 const ForestNode &NextTok, const ParseParams &Params,
141 const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
142// Applies available reductions on Heads, appending resulting heads to the list.
143//
144// Exposed for testing only.
145void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
146 const ParseParams &Params, const Language &Lang);
147
148// Heuristically recover from a state where no further parsing is possible.
149//
150// OldHeads is the parse state at TokenIndex.
151// This function consumes zero or more tokens by advancing TokenIndex,
152// and places any recovery states created in NewHeads.
153//
154// On failure, NewHeads is empty and TokenIndex is unchanged.
155//
156// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
157// there is a risk of the parser falling into an infinite loop, creating an
158// endless sequence of recovery nodes.
159// Generally it is safe for recovery to match 0 tokens against sequence symbols
160// like `statement-seq`, as the grammar won't permit another statement-seq
161// immediately afterwards. However recovery strategies for `statement` should
162// consume at least one token, as statements may be adjacent in the input.
163void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
164 unsigned &TokenIndex, const ParseParams &Params,
165 const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
166
167} // namespace pseudo
168} // namespace clang
169
170#endif // CLANG_PSEUDO_GLR_H
171

source code of clang-tools-extra/pseudo/include/clang-pseudo/GLR.h