1//===--- GrammarBNF.cpp - build grammar from BNF files ----------*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "clang-pseudo/grammar/Grammar.h"
10#include "clang/Basic/TokenKinds.h"
11#include "llvm/ADT/DenseSet.h"
12#include "llvm/ADT/STLExtras.h"
13#include "llvm/ADT/StringExtras.h"
14#include "llvm/Support/FormatVariadic.h"
15#include <memory>
16#include <utility>
17
18namespace clang {
19namespace pseudo {
20
21namespace {
22static const llvm::StringRef OptSuffix = "_opt";
23static const llvm::StringRef StartSymbol = "_";
24
25// Builds grammar from BNF files.
26class GrammarBuilder {
27public:
28 GrammarBuilder(std::vector<std::string> &Diagnostics)
29 : Diagnostics(Diagnostics) {}
30
31 Grammar build(llvm::StringRef BNF) {
32 auto Specs = eliminateOptional(Input: parse(Lines: BNF));
33
34 assert(llvm::all_of(Specs,
35 [](const RuleSpec &R) {
36 if (R.Target.ends_with(OptSuffix))
37 return false;
38 return llvm::all_of(
39 R.Sequence, [](const RuleSpec::Element &E) {
40 return !E.Symbol.ends_with(OptSuffix);
41 });
42 }) &&
43 "Optional symbols should be eliminated!");
44
45 auto T = std::make_unique<GrammarTable>();
46
47 // Assemble the name->ID and ID->nonterminal name maps.
48 llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
49 llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
50
51 llvm::DenseSet<llvm::StringRef> UniqueAttributeValues;
52
53 for (uint16_t I = 0; I < NumTerminals; ++I)
54 SymbolIds.try_emplace(Key: T->Terminals[I], Args: tokenSymbol(TK: tok::TokenKind(I)));
55 auto Consider = [&](llvm::StringRef Name) {
56 if (!SymbolIds.count(Val: Name))
57 UniqueNonterminals.insert(V: Name);
58 };
59 for (const auto &Spec : Specs) {
60 Consider(Spec.Target);
61 for (const RuleSpec::Element &Elt : Spec.Sequence) {
62 Consider(Elt.Symbol);
63 for (const auto& KV : Elt.Attributes)
64 UniqueAttributeValues.insert(V: KV.second);
65 }
66 }
67 for (llvm::StringRef Name : UniqueNonterminals) {
68 T->Nonterminals.emplace_back();
69 T->Nonterminals.back().Name = Name.str();
70 }
71 assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
72 "Too many nonterminals to fit in SymbolID bits!");
73 llvm::sort(C&: T->Nonterminals, Comp: [](const GrammarTable::Nonterminal &L,
74 const GrammarTable::Nonterminal &R) {
75 return L.Name < R.Name;
76 });
77 // Add an empty string for the corresponding sentinel unset attribute.
78 T->AttributeValues.push_back(x: "");
79 UniqueAttributeValues.erase(V: "");
80 for (llvm::StringRef Name : UniqueAttributeValues) {
81 T->AttributeValues.emplace_back();
82 T->AttributeValues.back() = Name.str();
83 }
84 llvm::sort(C&: T->AttributeValues);
85 assert(T->AttributeValues.front() == "");
86
87 // Build name -> ID maps for nonterminals.
88 for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
89 SymbolIds.try_emplace(Key: T->Nonterminals[SID].Name, Args&: SID);
90
91 // Convert the rules.
92 T->Rules.reserve(n: Specs.size());
93 std::vector<SymbolID> Symbols;
94 auto Lookup = [SymbolIds](llvm::StringRef Name) {
95 auto It = SymbolIds.find(Val: Name);
96 assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
97 return It->second;
98 };
99 for (const auto &Spec : Specs) {
100 assert(Spec.Sequence.size() <= Rule::MaxElements);
101 Symbols.clear();
102 for (const RuleSpec::Element &Elt : Spec.Sequence)
103 Symbols.push_back(x: Lookup(Elt.Symbol));
104 T->Rules.push_back(x: Rule(Lookup(Spec.Target), Symbols));
105 applyAttributes(Spec, T: *T, R&: T->Rules.back());
106 }
107
108 assert(T->Rules.size() < (1 << RuleBits) &&
109 "Too many rules to fit in RuleID bits!");
110 const auto &SymbolOrder = getTopologicalOrder(T: T.get());
111 llvm::stable_sort(
112 Range&: T->Rules, C: [&SymbolOrder](const Rule &Left, const Rule &Right) {
113 // Sorted by the topological order of the nonterminal Target.
114 return SymbolOrder[Left.Target] < SymbolOrder[Right.Target];
115 });
116 for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
117 auto StartIt = llvm::partition_point(Range&: T->Rules, P: [&](const Rule &R) {
118 return SymbolOrder[R.Target] < SymbolOrder[SID];
119 });
120 RuleID Start = StartIt - T->Rules.begin();
121 RuleID End = Start;
122 while (End < T->Rules.size() && T->Rules[End].Target == SID)
123 ++End;
124 T->Nonterminals[SID].RuleRange = {.Start: Start, .End: End};
125 }
126 Grammar G(std::move(T));
127 diagnoseGrammar(G);
128 return G;
129 }
130
131 // Gets topological order for nonterminal symbols.
132 //
133 // The topological order is defined as: if a *single* nonterminal A produces
134 // (or transitively) a nonterminal B (that said, there is a production rule
135 // B := A), then A is less than B.
136 //
137 // It returns the sort key for each symbol, the array is indexed by SymbolID.
138 std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
139 std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
140 for (const auto &Rule : T->Rules) {
141 // if A := B, A depends on B.
142 if (Rule.Size == 1 && pseudo::isNonterminal(ID: Rule.Sequence[0]))
143 Dependencies.push_back(x: {Rule.Target, Rule.Sequence[0]});
144 }
145 llvm::sort(C&: Dependencies);
146 std::vector<SymbolID> Order;
147 // Each nonterminal state flows: NotVisited -> Visiting -> Visited.
148 enum State {
149 NotVisited,
150 Visiting,
151 Visited,
152 };
153 std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
154 std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
155 if (VisitStates[SID] == Visited)
156 return;
157 if (VisitStates[SID] == Visiting) {
158 Diagnostics.push_back(
159 x: llvm::formatv(Fmt: "The grammar contains a cycle involving symbol {0}",
160 Vals&: T->Nonterminals[SID].Name));
161 return;
162 }
163 VisitStates[SID] = Visiting;
164 for (auto It = llvm::lower_bound(Range&: Dependencies,
165 Value: std::pair<SymbolID, SymbolID>{SID, 0});
166 It != Dependencies.end() && It->first == SID; ++It)
167 DFS(It->second);
168 VisitStates[SID] = Visited;
169 Order.push_back(x: SID);
170 };
171 for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID)
172 DFS(ID);
173 std::vector<unsigned> Result(T->Nonterminals.size(), 0);
174 for (size_t I = 0; I < Order.size(); ++I)
175 Result[Order[I]] = I;
176 return Result;
177 }
178
179private:
180 // Text representation of a BNF grammar rule.
181 struct RuleSpec {
182 llvm::StringRef Target;
183 struct Element {
184 llvm::StringRef Symbol; // Name of the symbol
185 // Attributes that are associated to the sequence symbol or rule.
186 std::vector<std::pair<llvm::StringRef/*Key*/, llvm::StringRef/*Value*/>>
187 Attributes;
188 };
189 std::vector<Element> Sequence;
190
191 std::string toString() const {
192 std::vector<llvm::StringRef> Body;
193 for (const auto &E : Sequence)
194 Body.push_back(x: E.Symbol);
195 return llvm::formatv(Fmt: "{0} := {1}", Vals: Target, Vals: llvm::join(R&: Body, Separator: " "));
196 }
197 };
198
199 std::vector<RuleSpec> parse(llvm::StringRef Lines) {
200 std::vector<RuleSpec> Specs;
201 for (llvm::StringRef Line : llvm::split(Str: Lines, Separator: '\n')) {
202 Line = Line.trim();
203 // Strip anything coming after the '#' (comment).
204 Line = Line.take_while(F: [](char C) { return C != '#'; });
205 if (Line.empty())
206 continue;
207 RuleSpec Rule;
208 if (parseLine(Line, Out&: Rule))
209 Specs.push_back(x: std::move(Rule));
210 }
211 return Specs;
212 }
213
214 bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
215 auto Parts = Line.split(Separator: ":=");
216 if (Parts.first == Line) { // no separator in Line
217 Diagnostics.push_back(
218 x: llvm::formatv(Fmt: "Failed to parse '{0}': no separator :=", Vals&: Line).str());
219 return false;
220 }
221
222 Out.Target = Parts.first.trim();
223 Out.Sequence.clear();
224 for (llvm::StringRef Chunk : llvm::split(Str: Parts.second, Separator: ' ')) {
225 Chunk = Chunk.trim();
226 if (Chunk.empty())
227 continue; // skip empty
228 if (Chunk.starts_with(Prefix: "[") && Chunk.ends_with(Suffix: "]")) {
229 if (Out.Sequence.empty())
230 continue;
231
232 parseAttributes(Content: Chunk, Out&: Out.Sequence.back().Attributes);
233 continue;
234 }
235
236 Out.Sequence.push_back(x: {.Symbol: Chunk, /*Attributes=*/{}});
237 }
238 return true;
239 }
240
241 bool parseAttributes(
242 llvm::StringRef Content,
243 std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) {
244 assert(Content.starts_with("[") && Content.ends_with("]"));
245 auto KV = Content.drop_front().drop_back().split(Separator: '=');
246 Out.push_back(x: {KV.first, KV.second.trim()});
247
248 return true;
249 }
250 // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule.
251 void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) {
252 auto LookupExtensionID = [&T](llvm::StringRef Name) {
253 const auto It = llvm::partition_point(
254 Range: T.AttributeValues, P: [&](llvm::StringRef X) { return X < Name; });
255 assert(It != T.AttributeValues.end() && *It == Name &&
256 "Didn't find the attribute in AttrValues!");
257 return It - T.AttributeValues.begin();
258 };
259 for (unsigned I = 0; I < Spec.Sequence.size(); ++I) {
260 for (const auto &KV : Spec.Sequence[I].Attributes) {
261 if (KV.first == "guard") {
262 R.Guarded = true;
263 } else if (KV.first == "recover") {
264 R.Recovery = LookupExtensionID(KV.second);
265 R.RecoveryIndex = I;
266 } else {
267 Diagnostics.push_back(
268 x: llvm::formatv(Fmt: "Unknown attribute '{0}'", Vals: KV.first).str());
269 }
270 }
271 }
272 }
273
274 // Inlines all _opt symbols.
275 // For example, a rule E := id +_opt id, after elimination, we have two
276 // equivalent rules:
277 // 1) E := id + id
278 // 2) E := id id
279 std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
280 std::vector<RuleSpec> Results;
281 std::vector<RuleSpec::Element> Storage;
282 for (const auto &R : Input) {
283 eliminateOptionalTail(
284 Elements: R.Sequence, Result&: Storage, CB: [&Results, &Storage, &R, this]() {
285 if (Storage.empty()) {
286 Diagnostics.push_back(
287 x: llvm::formatv(Fmt: "Rule '{0}' has a nullable RHS", Vals: R.toString()));
288 return;
289 }
290 Results.push_back(x: {.Target: R.Target, .Sequence: Storage});
291 });
292 assert(Storage.empty());
293 }
294 return Results;
295 }
296 void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
297 std::vector<RuleSpec::Element> &Result,
298 llvm::function_ref<void()> CB) {
299 if (Elements.empty())
300 return CB();
301 auto Front = Elements.front();
302 if (!Front.Symbol.ends_with(Suffix: OptSuffix)) {
303 Result.push_back(x: std::move(Front));
304 eliminateOptionalTail(Elements: Elements.drop_front(N: 1), Result, CB);
305 Result.pop_back();
306 return;
307 }
308 // Enumerate two options: skip the opt symbol, or inline the symbol.
309 eliminateOptionalTail(Elements: Elements.drop_front(N: 1), Result, CB); // skip
310 Front.Symbol = Front.Symbol.drop_back(N: OptSuffix.size()); // drop "_opt"
311 Result.push_back(x: std::move(Front));
312 eliminateOptionalTail(Elements: Elements.drop_front(N: 1), Result, CB);
313 Result.pop_back();
314 }
315
316 // Diagnoses the grammar and emit warnings if any.
317 void diagnoseGrammar(const Grammar &G) {
318 const auto &T = G.table();
319 for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
320 auto Range = T.Nonterminals[SID].RuleRange;
321 if (Range.Start == Range.End)
322 Diagnostics.push_back(
323 x: llvm::formatv(Fmt: "No rules for nonterminal: {0}", Vals: G.symbolName(SID)));
324 llvm::StringRef NameRef = T.Nonterminals[SID].Name;
325 if (llvm::all_of(Range&: NameRef, P: llvm::isAlpha) && NameRef.upper() == NameRef) {
326 Diagnostics.push_back(x: llvm::formatv(
327 Fmt: "Token-like name {0} is used as a nonterminal", Vals: G.symbolName(SID)));
328 }
329 }
330 llvm::DenseSet<llvm::hash_code> VisitedRules;
331 for (RuleID RID = 0; RID < T.Rules.size(); ++RID) {
332 const auto &R = T.Rules[RID];
333 auto Code = llvm::hash_combine(
334 args: R.Target, args: llvm::hash_combine_range(first: R.seq().begin(), last: R.seq().end()));
335 auto [_, New] = VisitedRules.insert(V: Code);
336 if (!New)
337 Diagnostics.push_back(
338 x: llvm::formatv(Fmt: "Duplicate rule: `{0}`", Vals: G.dumpRule(RID)));
339 }
340 // symbol-id -> used counts
341 std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
342 for (const Rule &R : T.Rules)
343 for (SymbolID SID : R.seq())
344 if (isNonterminal(ID: SID))
345 ++UseCounts[SID];
346 for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
347 if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
348 Diagnostics.push_back(
349 x: llvm::formatv(Fmt: "Nonterminal never used: {0}", Vals: G.symbolName(SID)));
350 }
351 std::vector<std::string> &Diagnostics;
352};
353} // namespace
354
355Grammar Grammar::parseBNF(llvm::StringRef BNF,
356 std::vector<std::string> &Diagnostics) {
357 Diagnostics.clear();
358 return GrammarBuilder(Diagnostics).build(BNF);
359}
360
361} // namespace pseudo
362} // namespace clang
363

source code of clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp