1 | //===--- GrammarBNF.cpp - build grammar from BNF files ----------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "clang-pseudo/grammar/Grammar.h" |
10 | #include "clang/Basic/TokenKinds.h" |
11 | #include "llvm/ADT/DenseSet.h" |
12 | #include "llvm/ADT/STLExtras.h" |
13 | #include "llvm/ADT/StringExtras.h" |
14 | #include "llvm/Support/FormatVariadic.h" |
15 | #include <memory> |
16 | #include <utility> |
17 | |
18 | namespace clang { |
19 | namespace pseudo { |
20 | |
21 | namespace { |
22 | static const llvm::StringRef OptSuffix = "_opt" ; |
23 | static const llvm::StringRef StartSymbol = "_" ; |
24 | |
25 | // Builds grammar from BNF files. |
26 | class GrammarBuilder { |
27 | public: |
28 | GrammarBuilder(std::vector<std::string> &Diagnostics) |
29 | : Diagnostics(Diagnostics) {} |
30 | |
31 | Grammar build(llvm::StringRef BNF) { |
32 | auto Specs = eliminateOptional(Input: parse(Lines: BNF)); |
33 | |
34 | assert(llvm::all_of(Specs, |
35 | [](const RuleSpec &R) { |
36 | if (R.Target.ends_with(OptSuffix)) |
37 | return false; |
38 | return llvm::all_of( |
39 | R.Sequence, [](const RuleSpec::Element &E) { |
40 | return !E.Symbol.ends_with(OptSuffix); |
41 | }); |
42 | }) && |
43 | "Optional symbols should be eliminated!" ); |
44 | |
45 | auto T = std::make_unique<GrammarTable>(); |
46 | |
47 | // Assemble the name->ID and ID->nonterminal name maps. |
48 | llvm::DenseSet<llvm::StringRef> UniqueNonterminals; |
49 | llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds; |
50 | |
51 | llvm::DenseSet<llvm::StringRef> UniqueAttributeValues; |
52 | |
53 | for (uint16_t I = 0; I < NumTerminals; ++I) |
54 | SymbolIds.try_emplace(Key: T->Terminals[I], Args: tokenSymbol(TK: tok::TokenKind(I))); |
55 | auto Consider = [&](llvm::StringRef Name) { |
56 | if (!SymbolIds.count(Val: Name)) |
57 | UniqueNonterminals.insert(V: Name); |
58 | }; |
59 | for (const auto &Spec : Specs) { |
60 | Consider(Spec.Target); |
61 | for (const RuleSpec::Element &Elt : Spec.Sequence) { |
62 | Consider(Elt.Symbol); |
63 | for (const auto& KV : Elt.Attributes) |
64 | UniqueAttributeValues.insert(V: KV.second); |
65 | } |
66 | } |
67 | for (llvm::StringRef Name : UniqueNonterminals) { |
68 | T->Nonterminals.emplace_back(); |
69 | T->Nonterminals.back().Name = Name.str(); |
70 | } |
71 | assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) && |
72 | "Too many nonterminals to fit in SymbolID bits!" ); |
73 | llvm::sort(C&: T->Nonterminals, Comp: [](const GrammarTable::Nonterminal &L, |
74 | const GrammarTable::Nonterminal &R) { |
75 | return L.Name < R.Name; |
76 | }); |
77 | // Add an empty string for the corresponding sentinel unset attribute. |
78 | T->AttributeValues.push_back(x: "" ); |
79 | UniqueAttributeValues.erase(V: "" ); |
80 | for (llvm::StringRef Name : UniqueAttributeValues) { |
81 | T->AttributeValues.emplace_back(); |
82 | T->AttributeValues.back() = Name.str(); |
83 | } |
84 | llvm::sort(C&: T->AttributeValues); |
85 | assert(T->AttributeValues.front() == "" ); |
86 | |
87 | // Build name -> ID maps for nonterminals. |
88 | for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) |
89 | SymbolIds.try_emplace(Key: T->Nonterminals[SID].Name, Args&: SID); |
90 | |
91 | // Convert the rules. |
92 | T->Rules.reserve(n: Specs.size()); |
93 | std::vector<SymbolID> Symbols; |
94 | auto Lookup = [SymbolIds](llvm::StringRef Name) { |
95 | auto It = SymbolIds.find(Val: Name); |
96 | assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!" ); |
97 | return It->second; |
98 | }; |
99 | for (const auto &Spec : Specs) { |
100 | assert(Spec.Sequence.size() <= Rule::MaxElements); |
101 | Symbols.clear(); |
102 | for (const RuleSpec::Element &Elt : Spec.Sequence) |
103 | Symbols.push_back(x: Lookup(Elt.Symbol)); |
104 | T->Rules.push_back(x: Rule(Lookup(Spec.Target), Symbols)); |
105 | applyAttributes(Spec, T: *T, R&: T->Rules.back()); |
106 | } |
107 | |
108 | assert(T->Rules.size() < (1 << RuleBits) && |
109 | "Too many rules to fit in RuleID bits!" ); |
110 | const auto &SymbolOrder = getTopologicalOrder(T: T.get()); |
111 | llvm::stable_sort( |
112 | Range&: T->Rules, C: [&SymbolOrder](const Rule &Left, const Rule &Right) { |
113 | // Sorted by the topological order of the nonterminal Target. |
114 | return SymbolOrder[Left.Target] < SymbolOrder[Right.Target]; |
115 | }); |
116 | for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) { |
117 | auto StartIt = llvm::partition_point(Range&: T->Rules, P: [&](const Rule &R) { |
118 | return SymbolOrder[R.Target] < SymbolOrder[SID]; |
119 | }); |
120 | RuleID Start = StartIt - T->Rules.begin(); |
121 | RuleID End = Start; |
122 | while (End < T->Rules.size() && T->Rules[End].Target == SID) |
123 | ++End; |
124 | T->Nonterminals[SID].RuleRange = {.Start: Start, .End: End}; |
125 | } |
126 | Grammar G(std::move(T)); |
127 | diagnoseGrammar(G); |
128 | return G; |
129 | } |
130 | |
131 | // Gets topological order for nonterminal symbols. |
132 | // |
133 | // The topological order is defined as: if a *single* nonterminal A produces |
134 | // (or transitively) a nonterminal B (that said, there is a production rule |
135 | // B := A), then A is less than B. |
136 | // |
137 | // It returns the sort key for each symbol, the array is indexed by SymbolID. |
138 | std::vector<unsigned> getTopologicalOrder(GrammarTable *T) { |
139 | std::vector<std::pair<SymbolID, SymbolID>> Dependencies; |
140 | for (const auto &Rule : T->Rules) { |
141 | // if A := B, A depends on B. |
142 | if (Rule.Size == 1 && pseudo::isNonterminal(ID: Rule.Sequence[0])) |
143 | Dependencies.push_back(x: {Rule.Target, Rule.Sequence[0]}); |
144 | } |
145 | llvm::sort(C&: Dependencies); |
146 | std::vector<SymbolID> Order; |
147 | // Each nonterminal state flows: NotVisited -> Visiting -> Visited. |
148 | enum State { |
149 | NotVisited, |
150 | Visiting, |
151 | Visited, |
152 | }; |
153 | std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited); |
154 | std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void { |
155 | if (VisitStates[SID] == Visited) |
156 | return; |
157 | if (VisitStates[SID] == Visiting) { |
158 | Diagnostics.push_back( |
159 | x: llvm::formatv(Fmt: "The grammar contains a cycle involving symbol {0}" , |
160 | Vals&: T->Nonterminals[SID].Name)); |
161 | return; |
162 | } |
163 | VisitStates[SID] = Visiting; |
164 | for (auto It = llvm::lower_bound(Range&: Dependencies, |
165 | Value: std::pair<SymbolID, SymbolID>{SID, 0}); |
166 | It != Dependencies.end() && It->first == SID; ++It) |
167 | DFS(It->second); |
168 | VisitStates[SID] = Visited; |
169 | Order.push_back(x: SID); |
170 | }; |
171 | for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID) |
172 | DFS(ID); |
173 | std::vector<unsigned> Result(T->Nonterminals.size(), 0); |
174 | for (size_t I = 0; I < Order.size(); ++I) |
175 | Result[Order[I]] = I; |
176 | return Result; |
177 | } |
178 | |
179 | private: |
180 | // Text representation of a BNF grammar rule. |
181 | struct RuleSpec { |
182 | llvm::StringRef Target; |
183 | struct Element { |
184 | llvm::StringRef Symbol; // Name of the symbol |
185 | // Attributes that are associated to the sequence symbol or rule. |
186 | std::vector<std::pair<llvm::StringRef/*Key*/, llvm::StringRef/*Value*/>> |
187 | Attributes; |
188 | }; |
189 | std::vector<Element> Sequence; |
190 | |
191 | std::string toString() const { |
192 | std::vector<llvm::StringRef> Body; |
193 | for (const auto &E : Sequence) |
194 | Body.push_back(x: E.Symbol); |
195 | return llvm::formatv(Fmt: "{0} := {1}" , Vals: Target, Vals: llvm::join(R&: Body, Separator: " " )); |
196 | } |
197 | }; |
198 | |
199 | std::vector<RuleSpec> parse(llvm::StringRef Lines) { |
200 | std::vector<RuleSpec> Specs; |
201 | for (llvm::StringRef Line : llvm::split(Str: Lines, Separator: '\n')) { |
202 | Line = Line.trim(); |
203 | // Strip anything coming after the '#' (comment). |
204 | Line = Line.take_while(F: [](char C) { return C != '#'; }); |
205 | if (Line.empty()) |
206 | continue; |
207 | RuleSpec Rule; |
208 | if (parseLine(Line, Out&: Rule)) |
209 | Specs.push_back(x: std::move(Rule)); |
210 | } |
211 | return Specs; |
212 | } |
213 | |
214 | bool parseLine(llvm::StringRef Line, RuleSpec &Out) { |
215 | auto Parts = Line.split(Separator: ":=" ); |
216 | if (Parts.first == Line) { // no separator in Line |
217 | Diagnostics.push_back( |
218 | x: llvm::formatv(Fmt: "Failed to parse '{0}': no separator :=" , Vals&: Line).str()); |
219 | return false; |
220 | } |
221 | |
222 | Out.Target = Parts.first.trim(); |
223 | Out.Sequence.clear(); |
224 | for (llvm::StringRef Chunk : llvm::split(Str: Parts.second, Separator: ' ')) { |
225 | Chunk = Chunk.trim(); |
226 | if (Chunk.empty()) |
227 | continue; // skip empty |
228 | if (Chunk.starts_with(Prefix: "[" ) && Chunk.ends_with(Suffix: "]" )) { |
229 | if (Out.Sequence.empty()) |
230 | continue; |
231 | |
232 | parseAttributes(Content: Chunk, Out&: Out.Sequence.back().Attributes); |
233 | continue; |
234 | } |
235 | |
236 | Out.Sequence.push_back(x: {.Symbol: Chunk, /*Attributes=*/{}}); |
237 | } |
238 | return true; |
239 | } |
240 | |
241 | bool parseAttributes( |
242 | llvm::StringRef Content, |
243 | std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) { |
244 | assert(Content.starts_with("[" ) && Content.ends_with("]" )); |
245 | auto KV = Content.drop_front().drop_back().split(Separator: '='); |
246 | Out.push_back(x: {KV.first, KV.second.trim()}); |
247 | |
248 | return true; |
249 | } |
250 | // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule. |
251 | void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) { |
252 | auto LookupExtensionID = [&T](llvm::StringRef Name) { |
253 | const auto It = llvm::partition_point( |
254 | Range: T.AttributeValues, P: [&](llvm::StringRef X) { return X < Name; }); |
255 | assert(It != T.AttributeValues.end() && *It == Name && |
256 | "Didn't find the attribute in AttrValues!" ); |
257 | return It - T.AttributeValues.begin(); |
258 | }; |
259 | for (unsigned I = 0; I < Spec.Sequence.size(); ++I) { |
260 | for (const auto &KV : Spec.Sequence[I].Attributes) { |
261 | if (KV.first == "guard" ) { |
262 | R.Guarded = true; |
263 | } else if (KV.first == "recover" ) { |
264 | R.Recovery = LookupExtensionID(KV.second); |
265 | R.RecoveryIndex = I; |
266 | } else { |
267 | Diagnostics.push_back( |
268 | x: llvm::formatv(Fmt: "Unknown attribute '{0}'" , Vals: KV.first).str()); |
269 | } |
270 | } |
271 | } |
272 | } |
273 | |
274 | // Inlines all _opt symbols. |
275 | // For example, a rule E := id +_opt id, after elimination, we have two |
276 | // equivalent rules: |
277 | // 1) E := id + id |
278 | // 2) E := id id |
279 | std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) { |
280 | std::vector<RuleSpec> Results; |
281 | std::vector<RuleSpec::Element> Storage; |
282 | for (const auto &R : Input) { |
283 | eliminateOptionalTail( |
284 | Elements: R.Sequence, Result&: Storage, CB: [&Results, &Storage, &R, this]() { |
285 | if (Storage.empty()) { |
286 | Diagnostics.push_back( |
287 | x: llvm::formatv(Fmt: "Rule '{0}' has a nullable RHS" , Vals: R.toString())); |
288 | return; |
289 | } |
290 | Results.push_back(x: {.Target: R.Target, .Sequence: Storage}); |
291 | }); |
292 | assert(Storage.empty()); |
293 | } |
294 | return Results; |
295 | } |
296 | void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements, |
297 | std::vector<RuleSpec::Element> &Result, |
298 | llvm::function_ref<void()> CB) { |
299 | if (Elements.empty()) |
300 | return CB(); |
301 | auto Front = Elements.front(); |
302 | if (!Front.Symbol.ends_with(Suffix: OptSuffix)) { |
303 | Result.push_back(x: std::move(Front)); |
304 | eliminateOptionalTail(Elements: Elements.drop_front(N: 1), Result, CB); |
305 | Result.pop_back(); |
306 | return; |
307 | } |
308 | // Enumerate two options: skip the opt symbol, or inline the symbol. |
309 | eliminateOptionalTail(Elements: Elements.drop_front(N: 1), Result, CB); // skip |
310 | Front.Symbol = Front.Symbol.drop_back(N: OptSuffix.size()); // drop "_opt" |
311 | Result.push_back(x: std::move(Front)); |
312 | eliminateOptionalTail(Elements: Elements.drop_front(N: 1), Result, CB); |
313 | Result.pop_back(); |
314 | } |
315 | |
316 | // Diagnoses the grammar and emit warnings if any. |
317 | void diagnoseGrammar(const Grammar &G) { |
318 | const auto &T = G.table(); |
319 | for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) { |
320 | auto Range = T.Nonterminals[SID].RuleRange; |
321 | if (Range.Start == Range.End) |
322 | Diagnostics.push_back( |
323 | x: llvm::formatv(Fmt: "No rules for nonterminal: {0}" , Vals: G.symbolName(SID))); |
324 | llvm::StringRef NameRef = T.Nonterminals[SID].Name; |
325 | if (llvm::all_of(Range&: NameRef, P: llvm::isAlpha) && NameRef.upper() == NameRef) { |
326 | Diagnostics.push_back(x: llvm::formatv( |
327 | Fmt: "Token-like name {0} is used as a nonterminal" , Vals: G.symbolName(SID))); |
328 | } |
329 | } |
330 | llvm::DenseSet<llvm::hash_code> VisitedRules; |
331 | for (RuleID RID = 0; RID < T.Rules.size(); ++RID) { |
332 | const auto &R = T.Rules[RID]; |
333 | auto Code = llvm::hash_combine( |
334 | args: R.Target, args: llvm::hash_combine_range(first: R.seq().begin(), last: R.seq().end())); |
335 | auto [_, New] = VisitedRules.insert(V: Code); |
336 | if (!New) |
337 | Diagnostics.push_back( |
338 | x: llvm::formatv(Fmt: "Duplicate rule: `{0}`" , Vals: G.dumpRule(RID))); |
339 | } |
340 | // symbol-id -> used counts |
341 | std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0); |
342 | for (const Rule &R : T.Rules) |
343 | for (SymbolID SID : R.seq()) |
344 | if (isNonterminal(ID: SID)) |
345 | ++UseCounts[SID]; |
346 | for (SymbolID SID = 0; SID < UseCounts.size(); ++SID) |
347 | if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol) |
348 | Diagnostics.push_back( |
349 | x: llvm::formatv(Fmt: "Nonterminal never used: {0}" , Vals: G.symbolName(SID))); |
350 | } |
351 | std::vector<std::string> &Diagnostics; |
352 | }; |
353 | } // namespace |
354 | |
355 | Grammar Grammar::parseBNF(llvm::StringRef BNF, |
356 | std::vector<std::string> &Diagnostics) { |
357 | Diagnostics.clear(); |
358 | return GrammarBuilder(Diagnostics).build(BNF); |
359 | } |
360 | |
361 | } // namespace pseudo |
362 | } // namespace clang |
363 | |