1 | //===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // LR parsers are bottom-up parsers -- they scan the input from left to right, |
10 | // and collect the right-hand side of a production rule (called handle) on top |
11 | // of the stack, then replace (reduce) the handle with the nonterminal defined |
12 | // by the production rule. |
13 | // |
14 | // This file defines LRGraph, a deterministic handle-finding finite-state |
15 | // automaton, which is a key component in LR parsers to recognize any of |
16 | // handles in the grammar efficiently. We build the LR table (ACTION and GOTO |
17 | // Table) based on the LRGraph. |
18 | // |
19 | // LRGraph can be constructed for any context-free grammars. |
20 | // Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but |
21 | // interpretation of the FSA is nondeterministic -- we might in a state where |
22 | // we can continue searching an handle and identify a handle (called |
23 | // shift/reduce conflicts), or identify more than one handle (callled |
24 | // reduce/reduce conflicts). |
25 | // |
26 | // LRGraph is a common model for all variants of LR automatons, from the most |
27 | // basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead |
28 | // in making decisions. |
29 | //===----------------------------------------------------------------------===// |
30 | |
31 | #ifndef CLANG_PSEUDO_GRAMMAR_LRGRAPH_H |
32 | #define CLANG_PSEUDO_GRAMMAR_LRGRAPH_H |
33 | |
34 | #include "clang-pseudo/grammar/Grammar.h" |
35 | #include "llvm/ADT/Hashing.h" |
36 | #include <vector> |
37 | |
38 | namespace clang { |
39 | namespace pseudo { |
40 | |
41 | // An LR item -- a grammar rule with a dot at some position of the body. |
42 | // e.g. a production rule A := X Y yields 3 items: |
43 | // A := . X Y |
44 | // A := X . Y |
45 | // A := X Y . |
46 | // An item indicates how much of a production rule has been recognized at a |
47 | // position (described by dot), for example, A := X . Y indicates that we have |
48 | // recognized the X part from the input, and we hope next to see the input |
49 | // derivable from Y. |
50 | class Item { |
51 | public: |
52 | static Item start(RuleID ID, const Grammar &G) { |
53 | Item I; |
54 | I.RID = ID; |
55 | I.RuleLength = G.lookupRule(RID: ID).Size; |
56 | return I; |
57 | } |
58 | static Item sentinel(RuleID ID) { |
59 | Item I; |
60 | I.RID = ID; |
61 | return I; |
62 | } |
63 | |
64 | RuleID rule() const { return RID; } |
65 | uint8_t dot() const { return DotPos; } |
66 | |
67 | bool hasNext() const { return DotPos < RuleLength; } |
68 | SymbolID next(const Grammar &G) const { |
69 | assert(hasNext()); |
70 | return G.lookupRule(RID).Sequence[DotPos]; |
71 | } |
72 | |
73 | Item advance() const { |
74 | assert(hasNext()); |
75 | Item I = *this; |
76 | ++I.DotPos; |
77 | return I; |
78 | } |
79 | |
80 | std::string dump(const Grammar &G) const; |
81 | |
82 | bool operator==(const Item &I) const { |
83 | return DotPos == I.DotPos && RID == I.RID; |
84 | } |
85 | bool operator<(const Item &I) const { |
86 | return std::tie(args: RID, args: DotPos) < std::tie(args: I.RID, args: I.DotPos); |
87 | } |
88 | friend llvm::hash_code hash_value(const Item &I) { |
89 | return llvm::hash_combine(args: I.RID, args: I.DotPos); |
90 | } |
91 | |
92 | private: |
93 | RuleID RID = 0; |
94 | uint8_t DotPos = 0; |
95 | uint8_t RuleLength = 0; // the length of rule body. |
96 | }; |
97 | |
98 | // A state represents a node in the LR automaton graph. It is an item set, which |
99 | // contains all possible rules that the LR parser may be parsing in that state. |
100 | // |
101 | // Conceptually, If we knew in advance what we're parsing, at any point we're |
102 | // partway through parsing a production, sitting on a stack of partially parsed |
103 | // productions. But because we don't know, there could be *several* productions |
104 | // we're partway through. The set of possibilities is the parser state, and we |
105 | // precompute all the transitions between these states. |
106 | struct State { |
107 | // A full set of items (including non-kernel items) representing the state, |
108 | // in a canonical order (see SortByNextSymbol in the cpp file). |
109 | std::vector<Item> Items; |
110 | |
111 | std::string dump(const Grammar &G, unsigned Indent = 0) const; |
112 | }; |
113 | |
114 | // LRGraph is a deterministic finite state automaton for LR parsing. |
115 | // |
116 | // Intuitively, an LR automaton is a transition graph. The graph has a |
117 | // collection of nodes, called States. Each state corresponds to a particular |
118 | // item set, which represents a condition that could occur during the process of |
119 | // parsing a production. Edges are directed from one state to another. Each edge |
120 | // is labeled by a grammar symbol (terminal or nonterminal). |
121 | // |
122 | // LRGraph is used to construct the LR parsing table which is a core |
123 | // data-structure driving the LR parser. |
124 | class LRGraph { |
125 | public: |
126 | // StateID is the index in States table. |
127 | using StateID = uint16_t; |
128 | |
129 | // Constructs an LR(0) automaton. |
130 | static LRGraph buildLR0(const Grammar &); |
131 | |
132 | // An edge in the LR graph, it represents a transition in the LR automaton. |
133 | // If the parser is at state Src, with a lookahead Label, then it |
134 | // transits to state Dst. |
135 | struct Edge { |
136 | StateID Src, Dst; |
137 | SymbolID Label; |
138 | }; |
139 | |
140 | // A possible error recovery: choose to match some tokens against a symbol. |
141 | // |
142 | // e.g. a state that contains |
143 | // stmt := { . stmt-seq [recover=braces] } |
144 | // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }. |
145 | struct Recovery { |
146 | StateID Src; // The state we are in when encountering the error. |
147 | ExtensionID Strategy; // Heuristic choosing the tokens to match. |
148 | SymbolID Result; // The symbol that is produced. |
149 | }; |
150 | |
151 | llvm::ArrayRef<State> states() const { return States; } |
152 | llvm::ArrayRef<Edge> edges() const { return Edges; } |
153 | llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; } |
154 | llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const { |
155 | return StartStates; |
156 | } |
157 | |
158 | std::string dumpForTests(const Grammar &) const; |
159 | |
160 | private: |
161 | LRGraph(std::vector<State> States, std::vector<Edge> Edges, |
162 | std::vector<Recovery> Recoveries, |
163 | std::vector<std::pair<SymbolID, StateID>> StartStates) |
164 | : States(std::move(States)), Edges(std::move(Edges)), |
165 | Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) { |
166 | } |
167 | |
168 | std::vector<State> States; |
169 | std::vector<Edge> Edges; |
170 | std::vector<Recovery> Recoveries; |
171 | std::vector<std::pair<SymbolID, StateID>> StartStates; |
172 | }; |
173 | |
174 | } // namespace pseudo |
175 | } // namespace clang |
176 | |
177 | namespace llvm { |
178 | // Support clang::pseudo::Item as DenseMap keys. |
179 | template <> struct DenseMapInfo<clang::pseudo::Item> { |
180 | static inline clang::pseudo::Item getEmptyKey() { |
181 | return clang::pseudo::Item::sentinel(ID: -1); |
182 | } |
183 | static inline clang::pseudo::Item getTombstoneKey() { |
184 | return clang::pseudo::Item::sentinel(ID: -2); |
185 | } |
186 | static unsigned getHashValue(const clang::pseudo::Item &I) { |
187 | return hash_value(I); |
188 | } |
189 | static bool isEqual(const clang::pseudo::Item &LHS, |
190 | const clang::pseudo::Item &RHS) { |
191 | return LHS == RHS; |
192 | } |
193 | }; |
194 | } // namespace llvm |
195 | |
196 | #endif // CLANG_PSEUDO_GRAMMAR_LRGRAPH_H |
197 | |