1 | //===--- LRGraph.cpp - -------------------------------------------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "clang-pseudo/grammar/LRGraph.h" |
10 | #include "clang-pseudo/grammar/Grammar.h" |
11 | #include "llvm/ADT/DenseSet.h" |
12 | #include "llvm/ADT/Hashing.h" |
13 | #include "llvm/ADT/STLExtras.h" |
14 | #include "llvm/ADT/StringExtras.h" |
15 | #include "llvm/Support/FormatVariadic.h" |
16 | #include "llvm/Support/raw_ostream.h" |
17 | |
18 | using ItemSet = std::vector<clang::pseudo::Item>; |
19 | |
20 | namespace llvm { |
21 | // Support clang::pseudo::Item as DenseMap keys. |
22 | template <> struct DenseMapInfo<ItemSet> { |
23 | static inline ItemSet getEmptyKey() { |
24 | return {DenseMapInfo<clang::pseudo::Item>::getEmptyKey()}; |
25 | } |
26 | static inline ItemSet getTombstoneKey() { |
27 | return {DenseMapInfo<clang::pseudo::Item>::getTombstoneKey()}; |
28 | } |
29 | static unsigned getHashValue(const ItemSet &I) { |
30 | return llvm::hash_combine_range(first: I.begin(), last: I.end()); |
31 | } |
32 | static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) { |
33 | return LHS == RHS; |
34 | } |
35 | }; |
36 | } // namespace llvm |
37 | |
38 | namespace clang { |
39 | namespace pseudo { |
40 | namespace { |
41 | |
42 | struct SortByNextSymbol { |
43 | SortByNextSymbol(const Grammar &G) : G(G) {} |
44 | bool operator()(const Item &L, const Item &R) { |
45 | if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G)) |
46 | return L.next(G) < R.next(G); |
47 | if (L.hasNext() != R.hasNext()) |
48 | return L.hasNext() < R.hasNext(); // a trailing dot is minimal. |
49 | return L < R; |
50 | } |
51 | const Grammar &G; |
52 | }; |
53 | |
54 | // Computes a closure of the given item set S: |
55 | // - extends the given S to contain all options for parsing next token; |
56 | // - nonterminals after a dot are recursively expanded into the begin-state |
57 | // of all production rules that produce that nonterminal; |
58 | // |
59 | // Given |
60 | // Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ] |
61 | // Input = [ E := . T ] |
62 | // returns [ E := . T, T := . n, T := . ( E ) ] |
63 | State closure(ItemSet Queue, const Grammar &G) { |
64 | llvm::DenseSet<Item> InQueue = {Queue.begin(), Queue.end()}; |
65 | // We reuse the passed-by-value Queue as the final result, as it's already |
66 | // initialized to the right elements. |
67 | size_t ItIndex = 0; |
68 | while (ItIndex < Queue.size()) { |
69 | const Item &ExpandingItem = Queue[ItIndex]; |
70 | ++ItIndex; |
71 | if (!ExpandingItem.hasNext()) |
72 | continue; |
73 | |
74 | SymbolID NextSym = ExpandingItem.next(G); |
75 | if (pseudo::isToken(ID: NextSym)) |
76 | continue; |
77 | auto RRange = G.table().Nonterminals[NextSym].RuleRange; |
78 | for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) { |
79 | Item NewItem = Item::start(ID: RID, G); |
80 | if (InQueue.insert(V: NewItem).second) // new |
81 | Queue.push_back(x: std::move(NewItem)); |
82 | } |
83 | } |
84 | Queue.shrink_to_fit(); |
85 | llvm::sort(C&: Queue, Comp: SortByNextSymbol(G)); |
86 | return {.Items: std::move(Queue)}; |
87 | } |
88 | |
89 | // Returns all next (with a dot advanced) kernel item sets, partitioned by the |
90 | // advanced symbol. |
91 | // |
92 | // Given |
93 | // S = [ E := . a b, E := E . - T ] |
94 | // returns [ |
95 | // {id(a), [ E := a . b ]}, |
96 | // {id(-), [ E := E - . T ]} |
97 | // ] |
98 | std::vector<std::pair<SymbolID, ItemSet>> |
99 | nextAvailableKernelItems(const State &S, const Grammar &G) { |
100 | std::vector<std::pair<SymbolID, ItemSet>> Results; |
101 | llvm::ArrayRef<Item> AllItems = S.Items; |
102 | AllItems = AllItems.drop_while(Pred: [](const Item &I) { return !I.hasNext(); }); |
103 | while (!AllItems.empty()) { |
104 | SymbolID AdvancedSymbol = AllItems.front().next(G); |
105 | auto Batch = AllItems.take_while(Pred: [AdvancedSymbol, &G](const Item &I) { |
106 | assert(I.hasNext()); |
107 | return I.next(G) == AdvancedSymbol; |
108 | }); |
109 | assert(!Batch.empty()); |
110 | AllItems = AllItems.drop_front(N: Batch.size()); |
111 | |
112 | // Advance a dot over the Symbol. |
113 | ItemSet Next; |
114 | for (const Item &I : Batch) |
115 | Next.push_back(x: I.advance()); |
116 | // sort the set to keep order determinism for hash computation. |
117 | llvm::sort(C&: Next); |
118 | Results.push_back(x: {AdvancedSymbol, std::move(Next)}); |
119 | } |
120 | return Results; |
121 | } |
122 | |
123 | std::vector<std::pair<ExtensionID, SymbolID>> |
124 | availableRecovery(const State &S, const Grammar &G) { |
125 | std::vector<std::pair<ExtensionID, SymbolID>> Result; |
126 | for (const Item &I : S.Items) { |
127 | const auto &Rule = G.lookupRule(RID: I.rule()); |
128 | if (I.dot() != Rule.RecoveryIndex) |
129 | continue; |
130 | Result.push_back(x: {Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]}); |
131 | } |
132 | llvm::sort(C&: Result); |
133 | Result.erase(first: std::unique(first: Result.begin(), last: Result.end()), last: Result.end()); |
134 | return Result; |
135 | } |
136 | |
137 | } // namespace |
138 | |
139 | std::string Item::dump(const Grammar &G) const { |
140 | const auto &Rule = G.lookupRule(RID); |
141 | auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) { |
142 | std::vector<llvm::StringRef> Results; |
143 | for (auto SID : Syms) |
144 | Results.push_back(x: G.symbolName(SID)); |
145 | return Results; |
146 | }; |
147 | return llvm::formatv(Fmt: "{0} := {1} • {2}{3}" , Vals: G.symbolName(Rule.Target), |
148 | Vals: llvm::join(R: ToNames(Rule.seq().take_front(N: DotPos)), Separator: " " ), |
149 | Vals: llvm::join(R: ToNames(Rule.seq().drop_front(N: DotPos)), Separator: " " ), |
150 | Vals: Rule.RecoveryIndex == DotPos ? " [recovery]" : "" ) |
151 | .str(); |
152 | } |
153 | |
154 | std::string State::dump(const Grammar &G, unsigned Indent) const { |
155 | std::string Result; |
156 | llvm::raw_string_ostream OS(Result); |
157 | for (const auto &Item : Items) |
158 | OS.indent(NumSpaces: Indent) << llvm::formatv(Fmt: "{0}\n" , Vals: Item.dump(G)); |
159 | return OS.str(); |
160 | } |
161 | |
162 | std::string LRGraph::dumpForTests(const Grammar &G) const { |
163 | std::string Result; |
164 | llvm::raw_string_ostream OS(Result); |
165 | OS << "States:\n" ; |
166 | for (StateID ID = 0; ID < States.size(); ++ID) { |
167 | OS << llvm::formatv(Fmt: "State {0}\n" , Vals&: ID); |
168 | OS << States[ID].dump(G, /*Indent*/ 4); |
169 | } |
170 | for (const auto &E : Edges) { |
171 | OS << llvm::formatv(Fmt: "{0} ->[{1}] {2}\n" , Vals: E.Src, Vals: G.symbolName(E.Label), |
172 | Vals: E.Dst); |
173 | } |
174 | return OS.str(); |
175 | } |
176 | |
177 | LRGraph LRGraph::buildLR0(const Grammar &G) { |
178 | class Builder { |
179 | public: |
180 | Builder(const Grammar &G) : G(G) {} |
181 | |
182 | // Adds a given state if not existed. |
183 | std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) { |
184 | assert(llvm::is_sorted(KernelItems) && |
185 | "Item must be sorted before inserting to a hash map!" ); |
186 | auto It = StatesIndex.find(Val: KernelItems); |
187 | if (It != StatesIndex.end()) |
188 | return {It->second, false}; |
189 | States.push_back(x: closure(Queue: KernelItems, G)); |
190 | StateID NextStateID = States.size() - 1; |
191 | StatesIndex.insert(KV: {std::move(KernelItems), NextStateID}); |
192 | return {NextStateID, true}; |
193 | } |
194 | |
195 | void insertEdge(StateID Src, StateID Dst, SymbolID Label) { |
196 | Edges.push_back(x: {.Src: Src, .Dst: Dst, .Label: Label}); |
197 | } |
198 | |
199 | void insertRecovery(StateID Src, ExtensionID Strategy, SymbolID Result) { |
200 | Recoveries.push_back(x: {.Src: Src, .Strategy: Strategy, .Result: Result}); |
201 | } |
202 | |
203 | // Returns a state with the given id. |
204 | const State &find(StateID ID) const { |
205 | assert(ID < States.size()); |
206 | return States[ID]; |
207 | } |
208 | |
209 | void addStartState(SymbolID Sym, StateID State) { |
210 | StartStates.push_back(x: {Sym, State}); |
211 | } |
212 | |
213 | LRGraph build() && { |
214 | States.shrink_to_fit(); |
215 | Edges.shrink_to_fit(); |
216 | Recoveries.shrink_to_fit(); |
217 | llvm::sort(C&: StartStates); |
218 | StartStates.shrink_to_fit(); |
219 | return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries), |
220 | std::move(StartStates)); |
221 | } |
222 | |
223 | private: |
224 | // Key is the **kernel** item sets. |
225 | llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex; |
226 | std::vector<State> States; |
227 | std::vector<Edge> Edges; |
228 | std::vector<Recovery> Recoveries; |
229 | const Grammar &G; |
230 | std::vector<std::pair<SymbolID, StateID>> StartStates; |
231 | } Builder(G); |
232 | |
233 | std::vector<StateID> PendingStates; |
234 | // Initialize states with the start symbol. |
235 | auto RRange = G.table().Nonterminals[G.underscore()].RuleRange; |
236 | for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) { |
237 | auto StartState = std::vector<Item>{Item::start(ID: RID, G)}; |
238 | auto Result = Builder.insert(KernelItems: std::move(StartState)); |
239 | assert(Result.second && "State must be new" ); |
240 | PendingStates.push_back(x: Result.first); |
241 | |
242 | const Rule &StartRule = G.lookupRule(RID); |
243 | assert(StartRule.Size == 2 && |
244 | StartRule.seq().back() == tokenSymbol(tok::eof) && |
245 | "Start rule must be of the form `_ := start-symbol EOF`!" ); |
246 | Builder.addStartState(Sym: StartRule.seq().front(), State: Result.first); |
247 | } |
248 | |
249 | while (!PendingStates.empty()) { |
250 | auto StateID = PendingStates.back(); |
251 | PendingStates.pop_back(); |
252 | for (auto Next : nextAvailableKernelItems(S: Builder.find(ID: StateID), G)) { |
253 | auto Insert = Builder.insert(KernelItems: Next.second); |
254 | if (Insert.second) // new state, insert to the pending queue. |
255 | PendingStates.push_back(x: Insert.first); |
256 | Builder.insertEdge(Src: StateID, Dst: Insert.first, Label: Next.first); |
257 | } |
258 | for (auto Recovery : availableRecovery(S: Builder.find(ID: StateID), G)) |
259 | Builder.insertRecovery(Src: StateID, Strategy: Recovery.first, Result: Recovery.second); |
260 | } |
261 | return std::move(Builder).build(); |
262 | } |
263 | |
264 | } // namespace pseudo |
265 | } // namespace clang |
266 | |