1 | //===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This implements a standard Generalized LR (GLR) parsing algorithm. |
10 | // |
11 | // The GLR parser behaves as a normal LR parser until it encounters a conflict. |
12 | // To handle a conflict (where there are multiple actions could perform), the |
13 | // parser will simulate nondeterminism by doing a breadth-first search |
14 | // over all the possibilities. |
15 | // |
16 | // Basic mechanisims of the GLR parser: |
17 | // - A number of processes are operated in parallel. |
18 | // - Each process has its own parsing stack and behaves as a standard |
19 | // determinism LR parser. |
20 | // - When a process encounters a conflict, it will be fork (one for each |
21 | // avaiable action). |
22 | // - When a process encounters an error, it is abandoned. |
23 | // - All process are synchronized by the lookahead token: they perfrom shift |
24 | // action at the same time, which means some processes need wait until other |
25 | // processes have performed all reduce actions. |
26 | // |
27 | //===----------------------------------------------------------------------===// |
28 | |
29 | #ifndef CLANG_PSEUDO_GLR_H |
30 | #define CLANG_PSEUDO_GLR_H |
31 | |
32 | #include "clang-pseudo/Forest.h" |
33 | #include "clang-pseudo/Language.h" |
34 | #include "clang-pseudo/grammar/Grammar.h" |
35 | #include "clang-pseudo/grammar/LRTable.h" |
36 | #include "llvm/Support/Allocator.h" |
37 | #include <vector> |
38 | |
39 | namespace clang { |
40 | namespace pseudo { |
41 | |
42 | // A Graph-Structured Stack efficiently represents all parse stacks of a GLR |
43 | // parser. |
44 | // |
45 | // Each node stores a parse state, the last parsed ForestNode, and the parent |
46 | // node. There may be several heads (top of stack), and the parser operates by: |
47 | // - shift: pushing terminal symbols on top of the stack |
48 | // - reduce: replace N symbols on top of the stack with one nonterminal |
49 | // |
50 | // The structure is a DAG rather than a linear stack: |
51 | // - GLR allows multiple actions (conflicts) on the same head, producing forks |
52 | // where several nodes have the same parent |
53 | // - The parser merges nodes with the same (state, ForestNode), producing joins |
54 | // where one node has multiple parents |
55 | // |
56 | // The parser is responsible for creating nodes and keeping track of the set of |
57 | // heads. The GSS class is mostly an arena for them. |
58 | struct GSS { |
59 | // A node represents a partial parse of the input up to some point. |
60 | // |
61 | // It is the equivalent of a frame in an LR parse stack. |
62 | // Like such a frame, it has an LR parse state and a syntax-tree node |
63 | // representing the last parsed symbol (a ForestNode in our case). |
64 | // Unlike a regular LR stack frame, it may have multiple parents. |
65 | // |
66 | // Nodes are not exactly pushed and popped on the stack: pushing is just |
67 | // allocating a new head node with a parent pointer to the old head. Popping |
68 | // is just forgetting about a node and remembering its parent instead. |
69 | struct alignas(struct Node *) Node { |
70 | // LR state describing how parsing should continue from this head. |
71 | LRTable::StateID State; |
72 | // Used internally to track reachability during garbage collection. |
73 | bool GCParity; |
74 | // Have we already used this node for error recovery? (prevents loops) |
75 | mutable bool Recovered = false; |
76 | // Number of the parents of this node. |
77 | // The parents hold previous parsed symbols, and may resume control after |
78 | // this node is reduced. |
79 | unsigned ParentCount; |
80 | // The parse node for the last parsed symbol. |
81 | // This symbol appears on the left of the dot in the parse state's items. |
82 | // (In the literature, the node is attached to the *edge* to the parent). |
83 | const ForestNode *Payload = nullptr; |
84 | |
85 | llvm::ArrayRef<const Node *> parents() const { |
86 | return llvm::ArrayRef(reinterpret_cast<const Node *const *>(this + 1), |
87 | ParentCount); |
88 | }; |
89 | // Parents are stored as a trailing array of Node*. |
90 | }; |
91 | |
92 | // Allocates a new node in the graph. |
93 | const Node *addNode(LRTable::StateID State, const ForestNode *Symbol, |
94 | llvm::ArrayRef<const Node *> Parents); |
95 | // Frees all nodes not reachable as ancestors of Roots, and returns the count. |
96 | // Calling this periodically prevents steady memory growth of the GSS. |
97 | unsigned gc(std::vector<const Node *> &&Roots); |
98 | |
99 | size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); } |
100 | size_t nodesCreated() const { return NodesCreated; } |
101 | |
102 | private: |
103 | // Nodes are recycled using freelists. |
104 | // They are variable size, so use one free-list per distinct #parents. |
105 | std::vector<std::vector<Node *>> FreeList; |
106 | Node *allocate(unsigned Parents); |
107 | void destroy(Node *N); |
108 | // The list of nodes created and not destroyed - our candidates for gc(). |
109 | std::vector<Node *> Alive; |
110 | bool GCParity = false; // All nodes should match this, except during GC. |
111 | |
112 | llvm::BumpPtrAllocator Arena; |
113 | unsigned NodesCreated = 0; |
114 | }; |
115 | llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &); |
116 | |
117 | // Parameters for the GLR parsing. |
118 | struct ParseParams { |
119 | // The token stream to parse. |
120 | const TokenStream &Code; |
121 | |
122 | // Arena for data structure used by the GLR algorithm. |
123 | ForestArena &Forest; // Storage for the output forest. |
124 | GSS &GSStack; // Storage for parsing stacks. |
125 | }; |
126 | |
127 | // Parses the given token stream as the start symbol with the GLR algorithm, |
128 | // and returns a forest node of the start symbol. |
129 | // |
130 | // A rule `_ := StartSymbol` must exit for the chosen start symbol. |
131 | // |
132 | // If the parsing fails, we model it as an opaque node in the forest. |
133 | ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol, |
134 | const Language &Lang); |
135 | |
136 | // Shift a token onto all OldHeads, placing the results into NewHeads. |
137 | // |
138 | // Exposed for testing only. |
139 | void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads, |
140 | const ForestNode &NextTok, const ParseParams &Params, |
141 | const Language &Lang, std::vector<const GSS::Node *> &NewHeads); |
142 | // Applies available reductions on Heads, appending resulting heads to the list. |
143 | // |
144 | // Exposed for testing only. |
145 | void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead, |
146 | const ParseParams &Params, const Language &Lang); |
147 | |
148 | // Heuristically recover from a state where no further parsing is possible. |
149 | // |
150 | // OldHeads is the parse state at TokenIndex. |
151 | // This function consumes zero or more tokens by advancing TokenIndex, |
152 | // and places any recovery states created in NewHeads. |
153 | // |
154 | // On failure, NewHeads is empty and TokenIndex is unchanged. |
155 | // |
156 | // WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens, |
157 | // there is a risk of the parser falling into an infinite loop, creating an |
158 | // endless sequence of recovery nodes. |
159 | // Generally it is safe for recovery to match 0 tokens against sequence symbols |
160 | // like `statement-seq`, as the grammar won't permit another statement-seq |
161 | // immediately afterwards. However recovery strategies for `statement` should |
162 | // consume at least one token, as statements may be adjacent in the input. |
163 | void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads, |
164 | unsigned &TokenIndex, const ParseParams &Params, |
165 | const Language &Lang, std::vector<const GSS::Node *> &NewHeads); |
166 | |
167 | } // namespace pseudo |
168 | } // namespace clang |
169 | |
170 | #endif // CLANG_PSEUDO_GLR_H |
171 | |