1//===--- Grammar.cpp - Grammar for clang pseudoparser -----------*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "clang-pseudo/grammar/Grammar.h"
10#include "clang/Basic/TokenKinds.h"
11#include "llvm/ADT/ArrayRef.h"
12#include "llvm/ADT/STLExtras.h"
13#include "llvm/ADT/StringRef.h"
14#include "llvm/Support/FormatVariadic.h"
15#include "llvm/Support/raw_ostream.h"
16#include <optional>
17
18namespace clang {
19namespace pseudo {
20
21Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
22 : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
23 assert(Sequence.size() <= Rule::MaxElements);
24 llvm::copy(Range&: Sequence, Out: this->Sequence);
25}
26
27Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
28 Underscore = *findNonterminal(Name: "_");
29}
30
31llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
32 assert(isNonterminal(SID));
33 const auto &R = T->Nonterminals[SID].RuleRange;
34 assert(R.End <= T->Rules.size());
35 return llvm::ArrayRef(&T->Rules[R.Start], R.End - R.Start);
36}
37
38const Rule &Grammar::lookupRule(RuleID RID) const {
39 assert(RID < T->Rules.size());
40 return T->Rules[RID];
41}
42
43llvm::StringRef Grammar::symbolName(SymbolID SID) const {
44 if (isToken(ID: SID))
45 return T->Terminals[symbolToToken(SID)];
46 return T->Nonterminals[SID].Name;
47}
48
49std::optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const {
50 auto It = llvm::partition_point(
51 Range&: T->Nonterminals,
52 P: [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; });
53 if (It != T->Nonterminals.end() && It->Name == Name)
54 return It - T->Nonterminals.begin();
55 return std::nullopt;
56}
57
58std::string Grammar::dumpRule(RuleID RID) const {
59 std::string Result;
60 llvm::raw_string_ostream OS(Result);
61 const Rule &R = T->Rules[RID];
62 OS << symbolName(SID: R.Target) << " :=";
63 for (unsigned I = 0; I < R.Size; ++I) {
64 OS << " " << symbolName(SID: R.Sequence[I]);
65 if (R.RecoveryIndex == I)
66 OS << " [recover=" << T->AttributeValues[R.Recovery] << "]";
67 }
68 if (R.Guarded)
69 OS << " [guard]";
70 return Result;
71}
72
73std::string Grammar::dumpRules(SymbolID SID) const {
74 assert(isNonterminal(SID));
75 std::string Result;
76 const auto &Range = T->Nonterminals[SID].RuleRange;
77 for (RuleID RID = Range.Start; RID < Range.End; ++RID)
78 Result.append(str: dumpRule(RID)).push_back(c: '\n');
79 return Result;
80}
81
82std::string Grammar::dump() const {
83 std::string Result;
84 llvm::raw_string_ostream OS(Result);
85 OS << "Nonterminals:\n";
86 for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
87 OS << llvm::formatv(Fmt: " {0} {1}\n", Vals&: SID, Vals: symbolName(SID));
88 OS << "Rules:\n";
89 for (RuleID RID = 0; RID < T->Rules.size(); ++RID)
90 OS << llvm::formatv(Fmt: " {0} {1}\n", Vals&: RID, Vals: dumpRule(RID));
91 return OS.str();
92}
93
94std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
95 std::vector<llvm::DenseSet<SymbolID>> FirstSets(
96 G.table().Nonterminals.size());
97 auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
98 assert(isNonterminal(Target));
99 if (isToken(ID: First))
100 return FirstSets[Target].insert(V: First).second;
101 bool Changed = false;
102 for (SymbolID SID : FirstSets[First])
103 Changed |= FirstSets[Target].insert(V: SID).second;
104 return Changed;
105 };
106
107 // A rule S := T ... implies elements in FIRST(S):
108 // - if T is a terminal, FIRST(S) contains T
109 // - if T is a nonterminal, FIRST(S) contains FIRST(T)
110 // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
111 // end up being incomplete.
112 // We iterate until we hit a fixed point.
113 // (This isn't particularly efficient, but table building isn't on the
114 // critical path).
115 bool Changed = true;
116 while (Changed) {
117 Changed = false;
118 for (const auto &R : G.table().Rules)
119 // We only need to consider the first element because symbols are
120 // non-nullable.
121 Changed |= ExpandFirstSet(R.Target, R.seq().front());
122 }
123 return FirstSets;
124}
125
126std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
127 auto FirstSets = firstSets(G);
128 std::vector<llvm::DenseSet<SymbolID>> FollowSets(
129 G.table().Nonterminals.size());
130 // Expand the follow set of a nonterminal symbol Y by adding all from the
131 // given symbol set.
132 auto ExpandFollowSet = [&FollowSets](SymbolID Y,
133 const llvm::DenseSet<SymbolID> &ToAdd) {
134 assert(isNonterminal(Y));
135 bool Changed = false;
136 for (SymbolID F : ToAdd)
137 Changed |= FollowSets[Y].insert(V: F).second;
138 return Changed;
139 };
140 // Follow sets is computed based on the following 3 rules, the computation
141 // is completed at a fixed point where there is no more new symbols can be
142 // added to any of the follow sets.
143 //
144 // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the
145 // augmented grammar, in our case it is '_'.
146 FollowSets[G.underscore()].insert(V: tokenSymbol(TK: tok::eof));
147 bool Changed = true;
148 while (Changed) {
149 Changed = false;
150 for (const auto &R : G.table().Rules) {
151 // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
152 // FOLLOW(Y).
153 for (size_t I = 0; I + 1 < R.seq().size(); ++I) {
154 if (isToken(ID: R.seq()[I]))
155 continue;
156 // We only need to consider the next symbol because symbols are
157 // non-nullable.
158 SymbolID Next = R.seq()[I + 1];
159 if (isToken(ID: Next))
160 // First set for a terminal is itself.
161 Changed |= ExpandFollowSet(R.seq()[I], {Next});
162 else
163 Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]);
164 }
165 // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
166 // FOLLOW(Z).
167 SymbolID Z = R.seq().back();
168 if (isNonterminal(ID: Z))
169 Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
170 }
171 }
172 return FollowSets;
173}
174
175static llvm::ArrayRef<std::string> getTerminalNames() {
176 static const auto &TerminalNames = []() {
177 auto TerminalNames = new std::string[NumTerminals];
178#define PUNCTUATOR(Tok, Spelling) TerminalNames[tok::Tok] = Spelling;
179#define KEYWORD(Keyword, Condition) \
180 TerminalNames[tok::kw_##Keyword] = llvm::StringRef(#Keyword).upper();
181#define TOK(Tok) TerminalNames[tok::Tok] = llvm::StringRef(#Tok).upper();
182#include "clang/Basic/TokenKinds.def"
183 return llvm::ArrayRef(TerminalNames, NumTerminals);
184 }();
185 return TerminalNames;
186}
187GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}
188
189} // namespace pseudo
190} // namespace clang
191

source code of clang-tools-extra/pseudo/lib/grammar/Grammar.cpp