GrammarBNF.cpp source code [clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp]

1	//===--- GrammarBNF.cpp - build grammar from BNF files ----------- C++--===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "clang-pseudo/grammar/Grammar.h"
10	#include "clang/Basic/TokenKinds.h"
11	#include "llvm/ADT/DenseSet.h"
12	#include "llvm/ADT/STLExtras.h"
13	#include "llvm/ADT/StringExtras.h"
14	#include "llvm/Support/FormatVariadic.h"
15	#include <memory>
16	#include <utility>
17
18	namespace clang {
19	namespace pseudo {
20
21	namespace {
22	static const llvm::StringRef OptSuffix = "_opt";
23	static const llvm::StringRef StartSymbol = "_";
24
25	// Builds grammar from BNF files.
26	class GrammarBuilder {
27	public:
28	GrammarBuilder(std::vector<std::string> &Diagnostics)
29	: Diagnostics(Diagnostics) {}
30
31	Grammar build(llvm::StringRef BNF) {
32	auto Specs = eliminateOptional(Input: parse(Lines: BNF));
33
34	assert(llvm::all_of(Specs,
35	[](const RuleSpec &R) {
36	if (R.Target.ends_with(OptSuffix))
37	return false;
38	return llvm::all_of(
39	R.Sequence, [](const RuleSpec::Element &E) {
40	return !E.Symbol.ends_with(OptSuffix);
41	});
42	}) &&
43	"Optional symbols should be eliminated!");
44
45	auto T = std::make_unique<GrammarTable>();
46
47	// Assemble the name->ID and ID->nonterminal name maps.
48	llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
49	llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
50
51	llvm::DenseSet<llvm::StringRef> UniqueAttributeValues;
52
53	for (uint16_t I = `0`; I < NumTerminals; ++I)
54	SymbolIds.try_emplace(Key: T ->Terminals [I], Args: tokenSymbol(TK: tok::TokenKind(I)));
55	auto Consider = [&](llvm::StringRef Name) {
56	if (!SymbolIds.count(Val: Name))
57	UniqueNonterminals.insert(V: Name);
58	};
59	for (const auto &Spec : Specs) {
60	Consider(Spec.Target);
61	for (const RuleSpec::Element &Elt : Spec.Sequence) {
62	Consider(Elt.Symbol);
63	for (const auto& KV : Elt.Attributes)
64	UniqueAttributeValues.insert(V: KV.second);
65	}
66	}
67	for (llvm::StringRef Name : UniqueNonterminals) {
68	T ->Nonterminals.emplace_back();
69	T ->Nonterminals.back().Name = Name.str();
70	}
71	assert(T ->Nonterminals.size() < (`1` << (SymbolBits - `1`)) &&
72	"Too many nonterminals to fit in SymbolID bits!");
73	llvm::sort(C&: T ->Nonterminals, Comp: [](const GrammarTable::Nonterminal &L,
74	const GrammarTable::Nonterminal &R) {
75	return L.Name < R.Name;
76	});
77	// Add an empty string for the corresponding sentinel unset attribute.
78	T ->AttributeValues.push_back(x: "");
79	UniqueAttributeValues.erase(V: "");
80	for (llvm::StringRef Name : UniqueAttributeValues) {
81	T ->AttributeValues.emplace_back();
82	T ->AttributeValues.back() = Name.str();
83	}
84	llvm::sort(C&: T ->AttributeValues);
85	assert(T ->AttributeValues.front() == "");
86
87	// Build name -> ID maps for nonterminals.
88	for (SymbolID SID = `0`; SID < T ->Nonterminals.size(); ++SID)
89	SymbolIds.try_emplace(Key: T ->Nonterminals [SID].Name, Args&: SID);
90
91	// Convert the rules.
92	T ->Rules.reserve(n: Specs.size());
93	std::vector<SymbolID> Symbols;
94	auto Lookup = [SymbolIds](llvm::StringRef Name) {
95	auto It = SymbolIds.find(Val: Name);
96	assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
97	return It ->second;
98	};
99	for (const auto &Spec : Specs) {
100	assert(Spec.Sequence.size() <= Rule::MaxElements);
101	Symbols.clear();
102	for (const RuleSpec::Element &Elt : Spec.Sequence)
103	Symbols.push_back(x: Lookup(Elt.Symbol));
104	T ->Rules.push_back(x: Rule (Lookup(Spec.Target), Symbols));
105	applyAttributes(Spec, T: *T, R&: T ->Rules.back());
106	}
107
108	assert(T ->Rules.size() < (`1` << RuleBits) &&
109	"Too many rules to fit in RuleID bits!");
110	const auto &SymbolOrder = getTopologicalOrder(T: T.get());
111	llvm::stable_sort(
112	Range&: T ->Rules, C: [&SymbolOrder](const Rule &Left, const Rule &Right) {
113	// Sorted by the topological order of the nonterminal Target.
114	return SymbolOrder [Left.Target] < SymbolOrder [Right.Target];
115	});
116	for (SymbolID SID = `0`; SID < T ->Nonterminals.size(); ++SID) {
117	auto StartIt = llvm::partition_point(Range&: T ->Rules, P: [&](const Rule &R) {
118	return SymbolOrder [R.Target] < SymbolOrder [SID];
119	});
120	RuleID Start = StartIt - T ->Rules.begin();
121	RuleID End = Start;
122	while (End < T ->Rules.size() && T ->Rules [End].Target == SID)
123	++End;
124	T ->Nonterminals [SID].RuleRange = {.Start: Start, .End: End};
125	}
126	Grammar G(std::move(T));
127	diagnoseGrammar(G);
128	return G;
129	}
130
131	// Gets topological order for nonterminal symbols.
132	//
133	// The topological order is defined as: if a single* nonterminal A produces*
134	// (or transitively) a nonterminal B (that said, there is a production rule
135	// B := A), then A is less than B.
136	//
137	// It returns the sort key for each symbol, the array is indexed by SymbolID.
138	std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
139	std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
140	for (const auto &Rule : T->Rules) {
141	// if A := B, A depends on B.
142	if (Rule.Size == `1` && pseudo::isNonterminal(ID: Rule.Sequence[`0`]))
143	Dependencies.push_back(x: {Rule.Target, Rule.Sequence[`0`]});
144	}
145	llvm::sort(C&: Dependencies);
146	std::vector<SymbolID> Order;
147	// Each nonterminal state flows: NotVisited -> Visiting -> Visited.
148	enum State {
149	NotVisited,
150	Visiting,
151	Visited,
152	};
153	std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
154	std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
155	if (VisitStates [SID] == Visited)
156	return;
157	if (VisitStates [SID] == Visiting) {
158	Diagnostics.push_back(
159	x: llvm::formatv(Fmt: "The grammar contains a cycle involving symbol {0}",
160	Vals&: T->Nonterminals [SID].Name));
161	return;
162	}
163	VisitStates [SID] = Visiting;
164	for (auto It = llvm::lower_bound(Range&: Dependencies,
165	Value: std::pair<SymbolID, SymbolID>{SID, `0`});
166	It != Dependencies.end() && It ->first == SID; ++It)
167	DFS (It ->second);
168	VisitStates [SID] = Visited;
169	Order.push_back(x: SID);
170	};
171	for (SymbolID ID = `0`; ID != T->Nonterminals.size(); ++ID)
172	DFS (ID);
173	std::vector<unsigned> Result(T->Nonterminals.size(), `0`);
174	for (size_t I = `0`; I < Order.size(); ++I)
175	Result [Order [I]] = I;
176	return Result;
177	}
178
179	private:
180	// Text representation of a BNF grammar rule.
181	struct RuleSpec {
182	llvm::StringRef Target;
183	struct Element {
184	llvm::StringRef Symbol; // Name of the symbol
185	// Attributes that are associated to the sequence symbol or rule.
186	std::vector<std::pair<llvm::StringRef/Key/, llvm::StringRef/Value/>>
187	Attributes;
188	};
189	std::vector<Element> Sequence;
190
191	std::string toString() const {
192	std::vector<llvm::StringRef> Body;
193	for (const auto &E : Sequence)
194	Body.push_back(x: E.Symbol);
195	return llvm::formatv(Fmt: "{0} := {1}", Vals: Target, Vals: llvm::join(R&: Body, Separator: " "));
196	}
197	};
198
199	std::vector<RuleSpec> parse(llvm::StringRef Lines) {
200	std::vector<RuleSpec> Specs;
201	for (llvm::StringRef Line : llvm::split(Str: Lines, Separator: `'\n'`)) {
202	Line = Line.trim();
203	// Strip anything coming after the '#' (comment).
204	Line = Line.take_while(F: [](char C) { return C != `'#'`; });
205	if (Line.empty())
206	continue;
207	RuleSpec Rule;
208	if (parseLine(Line, Out&: Rule))
209	Specs.push_back(x: std::move(Rule));
210	}
211	return Specs;
212	}
213
214	bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
215	auto Parts = Line.split(Separator: ":=");
216	if (Parts.first == Line) { // no separator in Line
217	Diagnostics.push_back(
218	x: llvm::formatv(Fmt: "Failed to parse '{0}': no separator :=", Vals&: Line).str());
219	return false;
220	}
221
222	Out.Target = Parts.first.trim();
223	Out.Sequence.clear();
224	for (llvm::StringRef Chunk : llvm::split(Str: Parts.second, Separator: `' '`)) {
225	Chunk = Chunk.trim();
226	if (Chunk.empty())
227	continue; // skip empty
228	if (Chunk.starts_with(Prefix: "[") && Chunk.ends_with(Suffix: "]")) {
229	if (Out.Sequence.empty())
230	continue;
231
232	parseAttributes(Content: Chunk, Out&: Out.Sequence.back().Attributes);
233	continue;
234	}
235
236	Out.Sequence.push_back(x: {.Symbol: Chunk, /Attributes=/{}});
237	}
238	return true;
239	}
240
241	bool parseAttributes(
242	llvm::StringRef Content,
243	std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) {
244	assert(Content.starts_with("[") && Content.ends_with("]"));
245	auto KV = Content.drop_front().drop_back().split(Separator: `'='`);
246	Out.push_back(x: {KV.first, KV.second.trim()});
247
248	return true;
249	}
250	// Apply the parsed extensions (stored in RuleSpec) to the grammar Rule.
251	void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) {
252	auto LookupExtensionID = [&T](llvm::StringRef Name) {
253	const auto It = llvm::partition_point(
254	Range: T.AttributeValues, P: [&](llvm::StringRef X) { return X < Name; });
255	assert(It != T.AttributeValues.end() && *It == Name &&
256	"Didn't find the attribute in AttrValues!");
257	return It - T.AttributeValues.begin();
258	};
259	for (unsigned I = `0`; I < Spec.Sequence.size(); ++I) {
260	for (const auto &KV : Spec.Sequence [I].Attributes) {
261	if (KV.first == "guard") {
262	R.Guarded = true;
263	} else if (KV.first == "recover") {
264	R.Recovery = LookupExtensionID(KV.second);
265	R.RecoveryIndex = I;
266	} else {
267	Diagnostics.push_back(
268	x: llvm::formatv(Fmt: "Unknown attribute '{0}'", Vals: KV.first).str());
269	}
270	}
271	}
272	}
273
274	// Inlines all _opt symbols.
275	// For example, a rule E := id +_opt id, after elimination, we have two
276	// equivalent rules:
277	// 1) E := id + id
278	// 2) E := id id
279	std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
280	std::vector<RuleSpec> Results;
281	std::vector<RuleSpec::Element> Storage;
282	for (const auto &R : Input) {
283	eliminateOptionalTail(
284	Elements: R.Sequence, Result&: Storage, CB: [&Results, &Storage, &R, this]() {
285	if (Storage.empty()) {
286	Diagnostics.push_back(
287	x: llvm::formatv(Fmt: "Rule '{0}' has a nullable RHS", Vals: R.toString()));
288	return;
289	}
290	Results.push_back(x: {.Target: R.Target, .Sequence: Storage});
291	});
292	assert(Storage.empty());
293	}
294	return Results;
295	}
296	void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
297	std::vector<RuleSpec::Element> &Result,
298	llvm::function_ref<void()> CB) {
299	if (Elements.empty())
300	return CB ();
301	auto Front = Elements.front();
302	if (!Front.Symbol.ends_with(Suffix: OptSuffix)) {
303	Result.push_back(x: std::move(Front));
304	eliminateOptionalTail(Elements: Elements.drop_front(N: `1`), Result, CB);
305	Result.pop_back();
306	return;
307	}
308	// Enumerate two options: skip the opt symbol, or inline the symbol.
309	eliminateOptionalTail(Elements: Elements.drop_front(N: `1`), Result, CB); // skip
310	Front.Symbol = Front.Symbol.drop_back(N: OptSuffix.size()); // drop "_opt"
311	Result.push_back(x: std::move(Front));
312	eliminateOptionalTail(Elements: Elements.drop_front(N: `1`), Result, CB);
313	Result.pop_back();
314	}
315
316	// Diagnoses the grammar and emit warnings if any.
317	void diagnoseGrammar(const Grammar &G) {
318	const auto &T = G.table();
319	for (SymbolID SID = `0`; SID < T.Nonterminals.size(); ++SID) {
320	auto Range = T.Nonterminals [SID].RuleRange;
321	if (Range.Start == Range.End)
322	Diagnostics.push_back(
323	x: llvm::formatv(Fmt: "No rules for nonterminal: {0}", Vals: G.symbolName(SID)));
324	llvm::StringRef NameRef = T.Nonterminals [SID].Name;
325	if (llvm::all_of(Range&: NameRef, P: llvm::isAlpha) && NameRef.upper() == NameRef) {
326	Diagnostics.push_back(x: llvm::formatv(
327	Fmt: "Token-like name {0} is used as a nonterminal", Vals: G.symbolName(SID)));
328	}
329	}
330	llvm::DenseSet<llvm::hash_code> VisitedRules;
331	for (RuleID RID = `0`; RID < T.Rules.size(); ++RID) {
332	const auto &R = T.Rules [RID];
333	auto Code = llvm::hash_combine(
334	args: R.Target, args: llvm::hash_combine_range(first: R.seq().begin(), last: R.seq().end()));
335	auto [_, New] = VisitedRules.insert(V: Code);
336	if (!New)
337	Diagnostics.push_back(
338	x: llvm::formatv(Fmt: "Duplicate rule: `{0}`", Vals: G.dumpRule(RID)));
339	}
340	// symbol-id -> used counts
341	std::vector<unsigned> UseCounts(T.Nonterminals.size(), `0`);
342	for (const Rule &R : T.Rules)
343	for (SymbolID SID : R.seq())
344	if (isNonterminal(ID: SID))
345	++UseCounts [SID];
346	for (SymbolID SID = `0`; SID < UseCounts.size(); ++SID)
347	if (UseCounts [SID] == `0` && T.Nonterminals [SID].Name != StartSymbol)
348	Diagnostics.push_back(
349	x: llvm::formatv(Fmt: "Nonterminal never used: {0}", Vals: G.symbolName(SID)));
350	}
351	std::vector<std::string> &Diagnostics;
352	};
353	} // namespace
354
355	Grammar Grammar::parseBNF(llvm::StringRef BNF,
356	std::vector<std::string> &Diagnostics) {
357	Diagnostics.clear();
358	return GrammarBuilder (Diagnostics).build(BNF);
359	}
360
361	} // namespace pseudo
362	} // namespace clang
363

source code of clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp