Grammar.cpp source code [clang-tools-extra/pseudo/lib/grammar/Grammar.cpp]

1	//===--- Grammar.cpp - Grammar for clang pseudoparser ------------ C++--===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "clang-pseudo/grammar/Grammar.h"
10	#include "clang/Basic/TokenKinds.h"
11	#include "llvm/ADT/ArrayRef.h"
12	#include "llvm/ADT/STLExtras.h"
13	#include "llvm/ADT/StringRef.h"
14	#include "llvm/Support/FormatVariadic.h"
15	#include "llvm/Support/raw_ostream.h"
16	#include <optional>
17
18	namespace clang {
19	namespace pseudo {
20
21	Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
22	: Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
23	assert(Sequence.size() <= Rule::MaxElements);
24	llvm::copy(Range&: Sequence, Out: this->Sequence);
25	}
26
27	Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T (std::move(Table)) {
28	Underscore = *findNonterminal(Name: "_");
29	}
30
31	llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
32	assert(isNonterminal(SID));
33	const auto &R = T ->Nonterminals [SID].RuleRange;
34	assert(R.End <= T ->Rules.size());
35	return llvm::ArrayRef(&T ->Rules [R.Start], R.End - R.Start);
36	}
37
38	const Rule &Grammar::lookupRule(RuleID RID) const {
39	assert(RID < T ->Rules.size());
40	return T ->Rules [RID];
41	}
42
43	llvm::StringRef Grammar::symbolName(SymbolID SID) const {
44	if (isToken(ID: SID))
45	return T ->Terminals [symbolToToken(SID)];
46	return T ->Nonterminals [SID].Name;
47	}
48
49	std::optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const {
50	auto It = llvm::partition_point(
51	Range&: T ->Nonterminals,
52	P: [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; });
53	if (It != T ->Nonterminals.end() && It ->Name == Name)
54	return It - T ->Nonterminals.begin();
55	return std::nullopt;
56	}
57
58	std::string Grammar::dumpRule(RuleID RID) const {
59	std::string Result;
60	llvm::raw_string_ostream OS(Result);
61	const Rule &R = T ->Rules [RID];
62	OS << symbolName(SID: R.Target) << " :=";
63	for (unsigned I = `0`; I < R.Size; ++I) {
64	OS << " " << symbolName(SID: R.Sequence[I]);
65	if (R.RecoveryIndex == I)
66	OS << " [recover=" << T ->AttributeValues [R.Recovery] << "]";
67	}
68	if (R.Guarded)
69	OS << " [guard]";
70	return Result;
71	}
72
73	std::string Grammar::dumpRules(SymbolID SID) const {
74	assert(isNonterminal(SID));
75	std::string Result;
76	const auto &Range = T ->Nonterminals [SID].RuleRange;
77	for (RuleID RID = Range.Start; RID < Range.End; ++RID)
78	Result.append(str: dumpRule(RID)).push_back(c: `'\n'`);
79	return Result;
80	}
81
82	std::string Grammar::dump() const {
83	std::string Result;
84	llvm::raw_string_ostream OS(Result);
85	OS << "Nonterminals:\n";
86	for (SymbolID SID = `0`; SID < T ->Nonterminals.size(); ++SID)
87	OS << llvm::formatv(Fmt: " {0} {1}\n", Vals&: SID, Vals: symbolName(SID));
88	OS << "Rules:\n";
89	for (RuleID RID = `0`; RID < T ->Rules.size(); ++RID)
90	OS << llvm::formatv(Fmt: " {0} {1}\n", Vals&: RID, Vals: dumpRule(RID));
91	return OS.str();
92	}
93
94	std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
95	std::vector<llvm::DenseSet<SymbolID>> FirstSets(
96	G.table().Nonterminals.size());
97	auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
98	assert(isNonterminal(Target));
99	if (isToken(ID: First))
100	return FirstSets [Target].insert(V: First).second;
101	bool Changed = false;
102	for (SymbolID SID : FirstSets [First])
103	Changed \|= FirstSets [Target].insert(V: SID).second;
104	return Changed;
105	};
106
107	// A rule S := T ... implies elements in FIRST(S):
108	// - if T is a terminal, FIRST(S) contains T
109	// - if T is a nonterminal, FIRST(S) contains FIRST(T)
110	// Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
111	// end up being incomplete.
112	// We iterate until we hit a fixed point.
113	// (This isn't particularly efficient, but table building isn't on the
114	// critical path).
115	bool Changed = true;
116	while (Changed) {
117	Changed = false;
118	for (const auto &R : G.table().Rules)
119	// We only need to consider the first element because symbols are
120	// non-nullable.
121	Changed \|= ExpandFirstSet (R.Target, R.seq().front());
122	}
123	return FirstSets;
124	}
125
126	std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
127	auto FirstSets = firstSets(G);
128	std::vector<llvm::DenseSet<SymbolID>> FollowSets(
129	G.table().Nonterminals.size());
130	// Expand the follow set of a nonterminal symbol Y by adding all from the
131	// given symbol set.
132	auto ExpandFollowSet = [&FollowSets](SymbolID Y,
133	const llvm::DenseSet<SymbolID> &ToAdd) {
134	assert(isNonterminal(Y));
135	bool Changed = false;
136	for (SymbolID F : ToAdd)
137	Changed \|= FollowSets [Y].insert(V: F).second;
138	return Changed;
139	};
140	// Follow sets is computed based on the following 3 rules, the computation
141	// is completed at a fixed point where there is no more new symbols can be
142	// added to any of the follow sets.
143	//
144	// Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the
145	// augmented grammar, in our case it is '_'.
146	FollowSets [G.underscore()].insert(V: tokenSymbol(TK: tok::eof));
147	bool Changed = true;
148	while (Changed) {
149	Changed = false;
150	for (const auto &R : G.table().Rules) {
151	// Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
152	// FOLLOW(Y).
153	for (size_t I = `0`; I + `1` < R.seq().size(); ++I) {
154	if (isToken(ID: R.seq()[I]))
155	continue;
156	// We only need to consider the next symbol because symbols are
157	// non-nullable.
158	SymbolID Next = R.seq()[I + `1`];
159	if (isToken(ID: Next))
160	// First set for a terminal is itself.
161	Changed \|= ExpandFollowSet (R.seq()[I], {Next});
162	else
163	Changed \|= ExpandFollowSet (R.seq()[I], FirstSets [Next]);
164	}
165	// Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
166	// FOLLOW(Z).
167	SymbolID Z = R.seq().back();
168	if (isNonterminal(ID: Z))
169	Changed \|= ExpandFollowSet (Z, FollowSets [R.Target]);
170	}
171	}
172	return FollowSets;
173	}
174
175	static llvm::ArrayRef<std::string> getTerminalNames() {
176	static const auto &TerminalNames = []() {
177	auto TerminalNames = new std::string[NumTerminals];
178	#define PUNCTUATOR(Tok, Spelling) TerminalNames[tok::Tok] = Spelling;
179	#define KEYWORD(Keyword, Condition) \
180	TerminalNames[tok::kw_##Keyword] = llvm::StringRef(#Keyword).upper();
181	#define TOK(Tok) TerminalNames[tok::Tok] = llvm::StringRef(#Tok).upper();
182	#include "clang/Basic/TokenKinds.def"
183	return llvm::ArrayRef(TerminalNames, NumTerminals);
184	}();
185	return TerminalNames;
186	}
187	GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}
188
189	} // namespace pseudo
190	} // namespace clang
191

source code of clang-tools-extra/pseudo/lib/grammar/Grammar.cpp