1//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// LR parsers are bottom-up parsers -- they scan the input from left to right,
10// and collect the right-hand side of a production rule (called handle) on top
11// of the stack, then replace (reduce) the handle with the nonterminal defined
12// by the production rule.
13//
14// This file defines LRGraph, a deterministic handle-finding finite-state
15// automaton, which is a key component in LR parsers to recognize any of
16// handles in the grammar efficiently. We build the LR table (ACTION and GOTO
17// Table) based on the LRGraph.
18//
19// LRGraph can be constructed for any context-free grammars.
20// Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but
21// interpretation of the FSA is nondeterministic -- we might in a state where
22// we can continue searching an handle and identify a handle (called
23// shift/reduce conflicts), or identify more than one handle (callled
24// reduce/reduce conflicts).
25//
26// LRGraph is a common model for all variants of LR automatons, from the most
27// basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead
28// in making decisions.
29//===----------------------------------------------------------------------===//
30
31#ifndef CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
32#define CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
33
34#include "clang-pseudo/grammar/Grammar.h"
35#include "llvm/ADT/Hashing.h"
36#include <vector>
37
38namespace clang {
39namespace pseudo {
40
41// An LR item -- a grammar rule with a dot at some position of the body.
42// e.g. a production rule A := X Y yields 3 items:
43// A := . X Y
44// A := X . Y
45// A := X Y .
46// An item indicates how much of a production rule has been recognized at a
47// position (described by dot), for example, A := X . Y indicates that we have
48// recognized the X part from the input, and we hope next to see the input
49// derivable from Y.
50class Item {
51public:
52 static Item start(RuleID ID, const Grammar &G) {
53 Item I;
54 I.RID = ID;
55 I.RuleLength = G.lookupRule(RID: ID).Size;
56 return I;
57 }
58 static Item sentinel(RuleID ID) {
59 Item I;
60 I.RID = ID;
61 return I;
62 }
63
64 RuleID rule() const { return RID; }
65 uint8_t dot() const { return DotPos; }
66
67 bool hasNext() const { return DotPos < RuleLength; }
68 SymbolID next(const Grammar &G) const {
69 assert(hasNext());
70 return G.lookupRule(RID).Sequence[DotPos];
71 }
72
73 Item advance() const {
74 assert(hasNext());
75 Item I = *this;
76 ++I.DotPos;
77 return I;
78 }
79
80 std::string dump(const Grammar &G) const;
81
82 bool operator==(const Item &I) const {
83 return DotPos == I.DotPos && RID == I.RID;
84 }
85 bool operator<(const Item &I) const {
86 return std::tie(args: RID, args: DotPos) < std::tie(args: I.RID, args: I.DotPos);
87 }
88 friend llvm::hash_code hash_value(const Item &I) {
89 return llvm::hash_combine(args: I.RID, args: I.DotPos);
90 }
91
92private:
93 RuleID RID = 0;
94 uint8_t DotPos = 0;
95 uint8_t RuleLength = 0; // the length of rule body.
96};
97
98// A state represents a node in the LR automaton graph. It is an item set, which
99// contains all possible rules that the LR parser may be parsing in that state.
100//
101// Conceptually, If we knew in advance what we're parsing, at any point we're
102// partway through parsing a production, sitting on a stack of partially parsed
103// productions. But because we don't know, there could be *several* productions
104// we're partway through. The set of possibilities is the parser state, and we
105// precompute all the transitions between these states.
106struct State {
107 // A full set of items (including non-kernel items) representing the state,
108 // in a canonical order (see SortByNextSymbol in the cpp file).
109 std::vector<Item> Items;
110
111 std::string dump(const Grammar &G, unsigned Indent = 0) const;
112};
113
114// LRGraph is a deterministic finite state automaton for LR parsing.
115//
116// Intuitively, an LR automaton is a transition graph. The graph has a
117// collection of nodes, called States. Each state corresponds to a particular
118// item set, which represents a condition that could occur during the process of
119// parsing a production. Edges are directed from one state to another. Each edge
120// is labeled by a grammar symbol (terminal or nonterminal).
121//
122// LRGraph is used to construct the LR parsing table which is a core
123// data-structure driving the LR parser.
124class LRGraph {
125public:
126 // StateID is the index in States table.
127 using StateID = uint16_t;
128
129 // Constructs an LR(0) automaton.
130 static LRGraph buildLR0(const Grammar &);
131
132 // An edge in the LR graph, it represents a transition in the LR automaton.
133 // If the parser is at state Src, with a lookahead Label, then it
134 // transits to state Dst.
135 struct Edge {
136 StateID Src, Dst;
137 SymbolID Label;
138 };
139
140 // A possible error recovery: choose to match some tokens against a symbol.
141 //
142 // e.g. a state that contains
143 // stmt := { . stmt-seq [recover=braces] }
144 // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
145 struct Recovery {
146 StateID Src; // The state we are in when encountering the error.
147 ExtensionID Strategy; // Heuristic choosing the tokens to match.
148 SymbolID Result; // The symbol that is produced.
149 };
150
151 llvm::ArrayRef<State> states() const { return States; }
152 llvm::ArrayRef<Edge> edges() const { return Edges; }
153 llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
154 llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
155 return StartStates;
156 }
157
158 std::string dumpForTests(const Grammar &) const;
159
160private:
161 LRGraph(std::vector<State> States, std::vector<Edge> Edges,
162 std::vector<Recovery> Recoveries,
163 std::vector<std::pair<SymbolID, StateID>> StartStates)
164 : States(std::move(States)), Edges(std::move(Edges)),
165 Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
166 }
167
168 std::vector<State> States;
169 std::vector<Edge> Edges;
170 std::vector<Recovery> Recoveries;
171 std::vector<std::pair<SymbolID, StateID>> StartStates;
172};
173
174} // namespace pseudo
175} // namespace clang
176
177namespace llvm {
178// Support clang::pseudo::Item as DenseMap keys.
179template <> struct DenseMapInfo<clang::pseudo::Item> {
180 static inline clang::pseudo::Item getEmptyKey() {
181 return clang::pseudo::Item::sentinel(ID: -1);
182 }
183 static inline clang::pseudo::Item getTombstoneKey() {
184 return clang::pseudo::Item::sentinel(ID: -2);
185 }
186 static unsigned getHashValue(const clang::pseudo::Item &I) {
187 return hash_value(I);
188 }
189 static bool isEqual(const clang::pseudo::Item &LHS,
190 const clang::pseudo::Item &RHS) {
191 return LHS == RHS;
192 }
193};
194} // namespace llvm
195
196#endif // CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
197

source code of clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h