LRGraph.h source code [clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h]

1	//===--- LRGraph.h - Build an LR automaton ------------------- C++--===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// LR parsers are bottom-up parsers -- they scan the input from left to right,
10	// and collect the right-hand side of a production rule (called handle) on top
11	// of the stack, then replace (reduce) the handle with the nonterminal defined
12	// by the production rule.
13	//
14	// This file defines LRGraph, a deterministic handle-finding finite-state
15	// automaton, which is a key component in LR parsers to recognize any of
16	// handles in the grammar efficiently. We build the LR table (ACTION and GOTO
17	// Table) based on the LRGraph.
18	//
19	// LRGraph can be constructed for any context-free grammars.
20	// Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but
21	// interpretation of the FSA is nondeterministic -- we might in a state where
22	// we can continue searching an handle and identify a handle (called
23	// shift/reduce conflicts), or identify more than one handle (callled
24	// reduce/reduce conflicts).
25	//
26	// LRGraph is a common model for all variants of LR automatons, from the most
27	// basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead
28	// in making decisions.
29	//===----------------------------------------------------------------------===//
30
31	#ifndef CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
32	#define CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
33
34	#include "clang-pseudo/grammar/Grammar.h"
35	#include "llvm/ADT/Hashing.h"
36	#include <vector>
37
38	namespace clang {
39	namespace pseudo {
40
41	// An LR item -- a grammar rule with a dot at some position of the body.
42	// e.g. a production rule A := X Y yields 3 items:
43	// A := . X Y
44	// A := X . Y
45	// A := X Y .
46	// An item indicates how much of a production rule has been recognized at a
47	// position (described by dot), for example, A := X . Y indicates that we have
48	// recognized the X part from the input, and we hope next to see the input
49	// derivable from Y.
50	class Item {
51	public:
52	static Item start(RuleID ID, const Grammar &G) {
53	Item I;
54	I.RID = ID;
55	I.RuleLength = G.lookupRule(RID: ID).Size;
56	return I;
57	}
58	static Item sentinel(RuleID ID) {
59	Item I;
60	I.RID = ID;
61	return I;
62	}
63
64	RuleID rule() const { return RID; }
65	uint8_t dot() const { return DotPos; }
66
67	bool hasNext() const { return DotPos < RuleLength; }
68	SymbolID next(const Grammar &G) const {
69	assert(hasNext());
70	return G.lookupRule(RID).Sequence[DotPos];
71	}
72
73	Item advance() const {
74	assert(hasNext());
75	Item I = *this;
76	++I.DotPos;
77	return I;
78	}
79
80	std::string dump(const Grammar &G) const;
81
82	bool operator==(const Item &I) const {
83	return DotPos == I.DotPos && RID == I.RID;
84	}
85	bool operator<(const Item &I) const {
86	return std::tie(args: RID, args: DotPos) < std::tie(args: I.RID, args: I.DotPos);
87	}
88	friend llvm::hash_code hash_value(const Item &I) {
89	return llvm::hash_combine(args: I.RID, args: I.DotPos);
90	}
91
92	private:
93	RuleID RID = `0`;
94	uint8_t DotPos = `0`;
95	uint8_t RuleLength = `0`; // the length of rule body.
96	};
97
98	// A state represents a node in the LR automaton graph. It is an item set, which
99	// contains all possible rules that the LR parser may be parsing in that state.
100	//
101	// Conceptually, If we knew in advance what we're parsing, at any point we're
102	// partway through parsing a production, sitting on a stack of partially parsed
103	// productions. But because we don't know, there could be several* productions*
104	// we're partway through. The set of possibilities is the parser state, and we
105	// precompute all the transitions between these states.
106	struct State {
107	// A full set of items (including non-kernel items) representing the state,
108	// in a canonical order (see SortByNextSymbol in the cpp file).
109	std::vector<Item> Items;
110
111	std::string dump(const Grammar &G, unsigned Indent = `0`) const;
112	};
113
114	// LRGraph is a deterministic finite state automaton for LR parsing.
115	//
116	// Intuitively, an LR automaton is a transition graph. The graph has a
117	// collection of nodes, called States. Each state corresponds to a particular
118	// item set, which represents a condition that could occur during the process of
119	// parsing a production. Edges are directed from one state to another. Each edge
120	// is labeled by a grammar symbol (terminal or nonterminal).
121	//
122	// LRGraph is used to construct the LR parsing table which is a core
123	// data-structure driving the LR parser.
124	class LRGraph {
125	public:
126	// StateID is the index in States table.
127	using StateID = uint16_t;
128
129	// Constructs an LR(0) automaton.
130	static LRGraph buildLR0(const Grammar &);
131
132	// An edge in the LR graph, it represents a transition in the LR automaton.
133	// If the parser is at state Src, with a lookahead Label, then it
134	// transits to state Dst.
135	struct Edge {
136	StateID Src, Dst;
137	SymbolID Label;
138	};
139
140	// A possible error recovery: choose to match some tokens against a symbol.
141	//
142	// e.g. a state that contains
143	// stmt := { . stmt-seq [recover=braces] }
144	// has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
145	struct Recovery {
146	StateID Src; // The state we are in when encountering the error.
147	ExtensionID Strategy; // Heuristic choosing the tokens to match.
148	SymbolID Result; // The symbol that is produced.
149	};
150
151	llvm::ArrayRef<State> states() const { return States; }
152	llvm::ArrayRef<Edge> edges() const { return Edges; }
153	llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
154	llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
155	return StartStates;
156	}
157
158	std::string dumpForTests(const Grammar &) const;
159
160	private:
161	LRGraph(std::vector<State> States, std::vector<Edge> Edges,
162	std::vector<Recovery> Recoveries,
163	std::vector<std::pair<SymbolID, StateID>> StartStates)
164	: States (std::move(States)), Edges (std::move(Edges)),
165	Recoveries (std::move(Recoveries)), StartStates (std::move(StartStates)) {
166	}
167
168	std::vector<State> States;
169	std::vector<Edge> Edges;
170	std::vector<Recovery> Recoveries;
171	std::vector<std::pair<SymbolID, StateID>> StartStates;
172	};
173
174	} // namespace pseudo
175	} // namespace clang
176
177	namespace llvm {
178	// Support clang::pseudo::Item as DenseMap keys.
179	template <> struct DenseMapInfo<clang::pseudo::Item> {
180	static inline clang::pseudo::Item getEmptyKey() {
181	return clang::pseudo::Item::sentinel(ID: -`1`);
182	}
183	static inline clang::pseudo::Item getTombstoneKey() {
184	return clang::pseudo::Item::sentinel(ID: -`2`);
185	}
186	static unsigned getHashValue(const clang::pseudo::Item &I) {
187	return hash_value(I);
188	}
189	static bool isEqual(const clang::pseudo::Item &LHS,
190	const clang::pseudo::Item &RHS) {
191	return LHS == RHS;
192	}
193	};
194	} // namespace llvm
195
196	#endif // CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
197

source code of clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h