1 | //===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // A parse forest represents a set of possible parse trees efficiently, it is |
10 | // produced by the GLR parser. |
11 | // |
12 | // Despite the name, its data structure is a tree-like DAG with a single root. |
13 | // Multiple ways to parse the same tokens are presented as an ambiguous node |
14 | // with all possible interpretations as children. |
15 | // Common sub-parses are shared: if two interpretations both parse "1 + 1" as |
16 | // "expr := expr + expr", they will share a Sequence node representing the expr. |
17 | // |
18 | //===----------------------------------------------------------------------===// |
19 | |
20 | #ifndef CLANG_PSEUDO_FOREST_H |
21 | #define CLANG_PSEUDO_FOREST_H |
22 | |
23 | #include "clang-pseudo/Token.h" |
24 | #include "clang-pseudo/grammar/Grammar.h" |
25 | #include "llvm/ADT/ArrayRef.h" |
26 | #include "llvm/ADT/STLExtras.h" |
27 | #include "llvm/Support/Allocator.h" |
28 | #include <cstdint> |
29 | |
30 | namespace clang { |
31 | namespace pseudo { |
32 | |
33 | // A node represents ways to parse a sequence of tokens, it interprets a fixed |
34 | // range of tokens as a fixed grammar symbol. |
35 | // |
36 | // There are different kinds of nodes, some nodes have "children" (stored in a |
37 | // trailing array) and have pointers to them. "Children" has different semantics |
38 | // depending on the node kinds. For an Ambiguous node, it means all |
39 | // possible interpretations; for a Sequence node, it means each symbol on the |
40 | // right hand side of the production rule. |
41 | // |
42 | // Since this is a node in a DAG, a node may have multiple parents. And a node |
43 | // doesn't have parent pointers. |
44 | class alignas(class ForestNode *) ForestNode { |
45 | public: |
46 | class RecursiveIterator; |
47 | enum Kind { |
48 | // A Terminal node is a single terminal symbol bound to a token. |
49 | Terminal, |
50 | // A Sequence node is a nonterminal symbol parsed from a grammar rule, |
51 | // elements() are the parses of each symbol on the RHS of the rule. |
52 | // If the rule is A := X Y Z, the node is for nonterminal A, and elements() |
53 | // are [X, Y, Z]. |
54 | Sequence, |
55 | // An Ambiguous node exposes multiple ways to interpret the code as the |
56 | // same symbol, alternatives() are all possible parses. |
57 | Ambiguous, |
58 | // An Opaque node is a placeholder. It asserts that tokens match a symbol, |
59 | // without saying how. |
60 | // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid |
61 | // code). |
62 | Opaque, |
63 | }; |
64 | Kind kind() const { return K; } |
65 | |
66 | SymbolID symbol() const { return Symbol; } |
67 | |
68 | // The start of the token range, it is a poistion within a token stream. |
69 | Token::Index startTokenIndex() const { return StartIndex; } |
70 | |
71 | // Returns the corresponding grammar rule. |
72 | // REQUIRES: this is a Sequence node. |
73 | RuleID rule() const { |
74 | assert(kind() == Sequence); |
75 | return Data & ((1 << RuleBits) - 1); |
76 | } |
77 | // Returns the parses of each element on the RHS of the rule. |
78 | // REQUIRES: this is a Sequence node; |
79 | llvm::ArrayRef<const ForestNode *> elements() const { |
80 | assert(kind() == Sequence); |
81 | return children(Num: Data >> RuleBits); |
82 | } |
83 | llvm::MutableArrayRef<ForestNode *> elements() { |
84 | assert(kind() == Sequence); |
85 | return children(Num: Data >> RuleBits); |
86 | } |
87 | |
88 | // Returns all possible interpretations of the code. |
89 | // REQUIRES: this is an Ambiguous node. |
90 | llvm::ArrayRef<const ForestNode *> alternatives() const { |
91 | assert(kind() == Ambiguous); |
92 | return children(Num: Data); |
93 | } |
94 | llvm::MutableArrayRef<ForestNode *> alternatives() { |
95 | assert(kind() == Ambiguous); |
96 | return children(Num: Data); |
97 | } |
98 | |
99 | llvm::ArrayRef<const ForestNode *> children() const { |
100 | switch (kind()) { |
101 | case Sequence: |
102 | return elements(); |
103 | case Ambiguous: |
104 | return alternatives(); |
105 | case Terminal: |
106 | case Opaque: |
107 | return {}; |
108 | } |
109 | llvm_unreachable("Bad kind" ); |
110 | } |
111 | |
112 | // Iteration over all nodes in the forest, including this. |
113 | llvm::iterator_range<RecursiveIterator> descendants() const; |
114 | |
115 | std::string dump(const Grammar &) const; |
116 | std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const; |
117 | |
118 | private: |
119 | friend class ForestArena; |
120 | |
121 | ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data) |
122 | : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {} |
123 | |
124 | ForestNode(const ForestNode &) = delete; |
125 | ForestNode &operator=(const ForestNode &) = delete; |
126 | ForestNode(ForestNode &&) = delete; |
127 | ForestNode &operator=(ForestNode &&) = delete; |
128 | |
129 | static uint16_t sequenceData(RuleID Rule, |
130 | llvm::ArrayRef<const ForestNode *> Elements) { |
131 | assert(Rule < (1 << RuleBits)); |
132 | assert(Elements.size() < (1 << (16 - RuleBits))); |
133 | return Rule | Elements.size() << RuleBits; |
134 | } |
135 | static uint16_t |
136 | ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) { |
137 | return Alternatives.size(); |
138 | } |
139 | |
140 | // Retrieves the trailing array. |
141 | llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const { |
142 | return llvm::ArrayRef(reinterpret_cast<ForestNode *const *>(this + 1), Num); |
143 | } |
144 | llvm::MutableArrayRef<ForestNode *> children(uint16_t Num) { |
145 | return llvm::MutableArrayRef(reinterpret_cast<ForestNode **>(this + 1), |
146 | Num); |
147 | } |
148 | |
149 | Token::Index StartIndex; |
150 | Kind K : 4; |
151 | SymbolID Symbol : SymbolBits; |
152 | // Sequence - child count : 4 | RuleID : RuleBits (12) |
153 | // Ambiguous - child count : 16 |
154 | // Terminal, Opaque - unused |
155 | uint16_t Data; |
156 | // An array of ForestNode* following the object. |
157 | }; |
158 | // ForestNode may not be destroyed (for BumpPtrAllocator). |
159 | static_assert(std::is_trivially_destructible<ForestNode>()); |
160 | |
161 | // A memory arena for the parse forest. |
162 | class ForestArena { |
163 | public: |
164 | llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code); |
165 | ForestNode &createSequence(SymbolID SID, RuleID RID, |
166 | llvm::ArrayRef<const ForestNode *> Elements) { |
167 | assert(!Elements.empty()); |
168 | return create(K: ForestNode::Sequence, SID, |
169 | Start: Elements.front()->startTokenIndex(), |
170 | Data: ForestNode::sequenceData(Rule: RID, Elements), Elements); |
171 | } |
172 | ForestNode &createAmbiguous(SymbolID SID, |
173 | llvm::ArrayRef<const ForestNode *> Alternatives) { |
174 | assert(!Alternatives.empty()); |
175 | assert(llvm::all_of(Alternatives, |
176 | [SID](const ForestNode *Alternative) { |
177 | return SID == Alternative->symbol(); |
178 | }) && |
179 | "Ambiguous alternatives must represent the same symbol!" ); |
180 | return create(K: ForestNode::Ambiguous, SID, |
181 | Start: Alternatives.front()->startTokenIndex(), |
182 | Data: ForestNode::ambiguousData(Alternatives), Elements: Alternatives); |
183 | } |
184 | ForestNode &createOpaque(SymbolID SID, Token::Index Start) { |
185 | return create(K: ForestNode::Opaque, SID, Start, Data: 0, Elements: {}); |
186 | } |
187 | |
188 | ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) { |
189 | return create(K: ForestNode::Terminal, SID: tokenSymbol(TK), Start, Data: 0, Elements: {}); |
190 | } |
191 | |
192 | size_t nodeCount() const { return NodeCount; } |
193 | size_t bytes() const { return Arena.getBytesAllocated() + sizeof(*this); } |
194 | |
195 | private: |
196 | ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start, |
197 | uint16_t Data, |
198 | llvm::ArrayRef<const ForestNode *> Elements) { |
199 | ++NodeCount; |
200 | ForestNode *New = new (Arena.Allocate( |
201 | Size: sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *), |
202 | Alignment: alignof(ForestNode))) ForestNode(K, SID, Start, Data); |
203 | if (!Elements.empty()) |
204 | llvm::copy(Range&: Elements, Out: reinterpret_cast<const ForestNode **>(New + 1)); |
205 | return *New; |
206 | } |
207 | |
208 | llvm::BumpPtrAllocator Arena; |
209 | uint32_t NodeCount = 0; |
210 | }; |
211 | |
212 | class ForestNode::RecursiveIterator |
213 | : public llvm::iterator_facade_base<ForestNode::RecursiveIterator, |
214 | std::input_iterator_tag, |
215 | const ForestNode> { |
216 | llvm::DenseSet<const ForestNode *> Seen; |
217 | struct StackFrame { |
218 | const ForestNode *Parent; |
219 | unsigned ChildIndex; |
220 | }; |
221 | std::vector<StackFrame> Stack; |
222 | const ForestNode *Cur; |
223 | |
224 | public: |
225 | RecursiveIterator(const ForestNode *N = nullptr) : Cur(N) {} |
226 | |
227 | const ForestNode &operator*() const { return *Cur; } |
228 | void operator++(); |
229 | bool operator==(const RecursiveIterator &I) const { return Cur == I.Cur; } |
230 | bool operator!=(const RecursiveIterator &I) const { return !(*this == I); } |
231 | }; |
232 | |
233 | } // namespace pseudo |
234 | } // namespace clang |
235 | |
236 | #endif // CLANG_PSEUDO_FOREST_H |
237 | |