1//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// A parse forest represents a set of possible parse trees efficiently, it is
10// produced by the GLR parser.
11//
12// Despite the name, its data structure is a tree-like DAG with a single root.
13// Multiple ways to parse the same tokens are presented as an ambiguous node
14// with all possible interpretations as children.
15// Common sub-parses are shared: if two interpretations both parse "1 + 1" as
16// "expr := expr + expr", they will share a Sequence node representing the expr.
17//
18//===----------------------------------------------------------------------===//
19
20#ifndef CLANG_PSEUDO_FOREST_H
21#define CLANG_PSEUDO_FOREST_H
22
23#include "clang-pseudo/Token.h"
24#include "clang-pseudo/grammar/Grammar.h"
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/Support/Allocator.h"
28#include <cstdint>
29
30namespace clang {
31namespace pseudo {
32
33// A node represents ways to parse a sequence of tokens, it interprets a fixed
34// range of tokens as a fixed grammar symbol.
35//
36// There are different kinds of nodes, some nodes have "children" (stored in a
37// trailing array) and have pointers to them. "Children" has different semantics
38// depending on the node kinds. For an Ambiguous node, it means all
39// possible interpretations; for a Sequence node, it means each symbol on the
40// right hand side of the production rule.
41//
42// Since this is a node in a DAG, a node may have multiple parents. And a node
43// doesn't have parent pointers.
44class alignas(class ForestNode *) ForestNode {
45public:
46 class RecursiveIterator;
47 enum Kind {
48 // A Terminal node is a single terminal symbol bound to a token.
49 Terminal,
50 // A Sequence node is a nonterminal symbol parsed from a grammar rule,
51 // elements() are the parses of each symbol on the RHS of the rule.
52 // If the rule is A := X Y Z, the node is for nonterminal A, and elements()
53 // are [X, Y, Z].
54 Sequence,
55 // An Ambiguous node exposes multiple ways to interpret the code as the
56 // same symbol, alternatives() are all possible parses.
57 Ambiguous,
58 // An Opaque node is a placeholder. It asserts that tokens match a symbol,
59 // without saying how.
60 // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid
61 // code).
62 Opaque,
63 };
64 Kind kind() const { return K; }
65
66 SymbolID symbol() const { return Symbol; }
67
68 // The start of the token range, it is a poistion within a token stream.
69 Token::Index startTokenIndex() const { return StartIndex; }
70
71 // Returns the corresponding grammar rule.
72 // REQUIRES: this is a Sequence node.
73 RuleID rule() const {
74 assert(kind() == Sequence);
75 return Data & ((1 << RuleBits) - 1);
76 }
77 // Returns the parses of each element on the RHS of the rule.
78 // REQUIRES: this is a Sequence node;
79 llvm::ArrayRef<const ForestNode *> elements() const {
80 assert(kind() == Sequence);
81 return children(Num: Data >> RuleBits);
82 }
83 llvm::MutableArrayRef<ForestNode *> elements() {
84 assert(kind() == Sequence);
85 return children(Num: Data >> RuleBits);
86 }
87
88 // Returns all possible interpretations of the code.
89 // REQUIRES: this is an Ambiguous node.
90 llvm::ArrayRef<const ForestNode *> alternatives() const {
91 assert(kind() == Ambiguous);
92 return children(Num: Data);
93 }
94 llvm::MutableArrayRef<ForestNode *> alternatives() {
95 assert(kind() == Ambiguous);
96 return children(Num: Data);
97 }
98
99 llvm::ArrayRef<const ForestNode *> children() const {
100 switch (kind()) {
101 case Sequence:
102 return elements();
103 case Ambiguous:
104 return alternatives();
105 case Terminal:
106 case Opaque:
107 return {};
108 }
109 llvm_unreachable("Bad kind");
110 }
111
112 // Iteration over all nodes in the forest, including this.
113 llvm::iterator_range<RecursiveIterator> descendants() const;
114
115 std::string dump(const Grammar &) const;
116 std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;
117
118private:
119 friend class ForestArena;
120
121 ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data)
122 : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {}
123
124 ForestNode(const ForestNode &) = delete;
125 ForestNode &operator=(const ForestNode &) = delete;
126 ForestNode(ForestNode &&) = delete;
127 ForestNode &operator=(ForestNode &&) = delete;
128
129 static uint16_t sequenceData(RuleID Rule,
130 llvm::ArrayRef<const ForestNode *> Elements) {
131 assert(Rule < (1 << RuleBits));
132 assert(Elements.size() < (1 << (16 - RuleBits)));
133 return Rule | Elements.size() << RuleBits;
134 }
135 static uint16_t
136 ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) {
137 return Alternatives.size();
138 }
139
140 // Retrieves the trailing array.
141 llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
142 return llvm::ArrayRef(reinterpret_cast<ForestNode *const *>(this + 1), Num);
143 }
144 llvm::MutableArrayRef<ForestNode *> children(uint16_t Num) {
145 return llvm::MutableArrayRef(reinterpret_cast<ForestNode **>(this + 1),
146 Num);
147 }
148
149 Token::Index StartIndex;
150 Kind K : 4;
151 SymbolID Symbol : SymbolBits;
152 // Sequence - child count : 4 | RuleID : RuleBits (12)
153 // Ambiguous - child count : 16
154 // Terminal, Opaque - unused
155 uint16_t Data;
156 // An array of ForestNode* following the object.
157};
158// ForestNode may not be destroyed (for BumpPtrAllocator).
159static_assert(std::is_trivially_destructible<ForestNode>());
160
161// A memory arena for the parse forest.
162class ForestArena {
163public:
164 llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code);
165 ForestNode &createSequence(SymbolID SID, RuleID RID,
166 llvm::ArrayRef<const ForestNode *> Elements) {
167 assert(!Elements.empty());
168 return create(K: ForestNode::Sequence, SID,
169 Start: Elements.front()->startTokenIndex(),
170 Data: ForestNode::sequenceData(Rule: RID, Elements), Elements);
171 }
172 ForestNode &createAmbiguous(SymbolID SID,
173 llvm::ArrayRef<const ForestNode *> Alternatives) {
174 assert(!Alternatives.empty());
175 assert(llvm::all_of(Alternatives,
176 [SID](const ForestNode *Alternative) {
177 return SID == Alternative->symbol();
178 }) &&
179 "Ambiguous alternatives must represent the same symbol!");
180 return create(K: ForestNode::Ambiguous, SID,
181 Start: Alternatives.front()->startTokenIndex(),
182 Data: ForestNode::ambiguousData(Alternatives), Elements: Alternatives);
183 }
184 ForestNode &createOpaque(SymbolID SID, Token::Index Start) {
185 return create(K: ForestNode::Opaque, SID, Start, Data: 0, Elements: {});
186 }
187
188 ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) {
189 return create(K: ForestNode::Terminal, SID: tokenSymbol(TK), Start, Data: 0, Elements: {});
190 }
191
192 size_t nodeCount() const { return NodeCount; }
193 size_t bytes() const { return Arena.getBytesAllocated() + sizeof(*this); }
194
195private:
196 ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
197 uint16_t Data,
198 llvm::ArrayRef<const ForestNode *> Elements) {
199 ++NodeCount;
200 ForestNode *New = new (Arena.Allocate(
201 Size: sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
202 Alignment: alignof(ForestNode))) ForestNode(K, SID, Start, Data);
203 if (!Elements.empty())
204 llvm::copy(Range&: Elements, Out: reinterpret_cast<const ForestNode **>(New + 1));
205 return *New;
206 }
207
208 llvm::BumpPtrAllocator Arena;
209 uint32_t NodeCount = 0;
210};
211
212class ForestNode::RecursiveIterator
213 : public llvm::iterator_facade_base<ForestNode::RecursiveIterator,
214 std::input_iterator_tag,
215 const ForestNode> {
216 llvm::DenseSet<const ForestNode *> Seen;
217 struct StackFrame {
218 const ForestNode *Parent;
219 unsigned ChildIndex;
220 };
221 std::vector<StackFrame> Stack;
222 const ForestNode *Cur;
223
224public:
225 RecursiveIterator(const ForestNode *N = nullptr) : Cur(N) {}
226
227 const ForestNode &operator*() const { return *Cur; }
228 void operator++();
229 bool operator==(const RecursiveIterator &I) const { return Cur == I.Cur; }
230 bool operator!=(const RecursiveIterator &I) const { return !(*this == I); }
231};
232
233} // namespace pseudo
234} // namespace clang
235
236#endif // CLANG_PSEUDO_FOREST_H
237

source code of clang-tools-extra/pseudo/include/clang-pseudo/Forest.h