1 | //===--- Grammar.cpp - Grammar for clang pseudoparser -----------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "clang-pseudo/grammar/Grammar.h" |
10 | #include "clang/Basic/TokenKinds.h" |
11 | #include "llvm/ADT/ArrayRef.h" |
12 | #include "llvm/ADT/STLExtras.h" |
13 | #include "llvm/ADT/StringRef.h" |
14 | #include "llvm/Support/FormatVariadic.h" |
15 | #include "llvm/Support/raw_ostream.h" |
16 | #include <optional> |
17 | |
18 | namespace clang { |
19 | namespace pseudo { |
20 | |
21 | Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence) |
22 | : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) { |
23 | assert(Sequence.size() <= Rule::MaxElements); |
24 | llvm::copy(Range&: Sequence, Out: this->Sequence); |
25 | } |
26 | |
27 | Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) { |
28 | Underscore = *findNonterminal(Name: "_" ); |
29 | } |
30 | |
31 | llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const { |
32 | assert(isNonterminal(SID)); |
33 | const auto &R = T->Nonterminals[SID].RuleRange; |
34 | assert(R.End <= T->Rules.size()); |
35 | return llvm::ArrayRef(&T->Rules[R.Start], R.End - R.Start); |
36 | } |
37 | |
38 | const Rule &Grammar::lookupRule(RuleID RID) const { |
39 | assert(RID < T->Rules.size()); |
40 | return T->Rules[RID]; |
41 | } |
42 | |
43 | llvm::StringRef Grammar::symbolName(SymbolID SID) const { |
44 | if (isToken(ID: SID)) |
45 | return T->Terminals[symbolToToken(SID)]; |
46 | return T->Nonterminals[SID].Name; |
47 | } |
48 | |
49 | std::optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const { |
50 | auto It = llvm::partition_point( |
51 | Range&: T->Nonterminals, |
52 | P: [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; }); |
53 | if (It != T->Nonterminals.end() && It->Name == Name) |
54 | return It - T->Nonterminals.begin(); |
55 | return std::nullopt; |
56 | } |
57 | |
58 | std::string Grammar::dumpRule(RuleID RID) const { |
59 | std::string Result; |
60 | llvm::raw_string_ostream OS(Result); |
61 | const Rule &R = T->Rules[RID]; |
62 | OS << symbolName(SID: R.Target) << " :=" ; |
63 | for (unsigned I = 0; I < R.Size; ++I) { |
64 | OS << " " << symbolName(SID: R.Sequence[I]); |
65 | if (R.RecoveryIndex == I) |
66 | OS << " [recover=" << T->AttributeValues[R.Recovery] << "]" ; |
67 | } |
68 | if (R.Guarded) |
69 | OS << " [guard]" ; |
70 | return Result; |
71 | } |
72 | |
73 | std::string Grammar::dumpRules(SymbolID SID) const { |
74 | assert(isNonterminal(SID)); |
75 | std::string Result; |
76 | const auto &Range = T->Nonterminals[SID].RuleRange; |
77 | for (RuleID RID = Range.Start; RID < Range.End; ++RID) |
78 | Result.append(str: dumpRule(RID)).push_back(c: '\n'); |
79 | return Result; |
80 | } |
81 | |
82 | std::string Grammar::dump() const { |
83 | std::string Result; |
84 | llvm::raw_string_ostream OS(Result); |
85 | OS << "Nonterminals:\n" ; |
86 | for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) |
87 | OS << llvm::formatv(Fmt: " {0} {1}\n" , Vals&: SID, Vals: symbolName(SID)); |
88 | OS << "Rules:\n" ; |
89 | for (RuleID RID = 0; RID < T->Rules.size(); ++RID) |
90 | OS << llvm::formatv(Fmt: " {0} {1}\n" , Vals&: RID, Vals: dumpRule(RID)); |
91 | return OS.str(); |
92 | } |
93 | |
94 | std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) { |
95 | std::vector<llvm::DenseSet<SymbolID>> FirstSets( |
96 | G.table().Nonterminals.size()); |
97 | auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) { |
98 | assert(isNonterminal(Target)); |
99 | if (isToken(ID: First)) |
100 | return FirstSets[Target].insert(V: First).second; |
101 | bool Changed = false; |
102 | for (SymbolID SID : FirstSets[First]) |
103 | Changed |= FirstSets[Target].insert(V: SID).second; |
104 | return Changed; |
105 | }; |
106 | |
107 | // A rule S := T ... implies elements in FIRST(S): |
108 | // - if T is a terminal, FIRST(S) contains T |
109 | // - if T is a nonterminal, FIRST(S) contains FIRST(T) |
110 | // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may |
111 | // end up being incomplete. |
112 | // We iterate until we hit a fixed point. |
113 | // (This isn't particularly efficient, but table building isn't on the |
114 | // critical path). |
115 | bool Changed = true; |
116 | while (Changed) { |
117 | Changed = false; |
118 | for (const auto &R : G.table().Rules) |
119 | // We only need to consider the first element because symbols are |
120 | // non-nullable. |
121 | Changed |= ExpandFirstSet(R.Target, R.seq().front()); |
122 | } |
123 | return FirstSets; |
124 | } |
125 | |
126 | std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) { |
127 | auto FirstSets = firstSets(G); |
128 | std::vector<llvm::DenseSet<SymbolID>> FollowSets( |
129 | G.table().Nonterminals.size()); |
130 | // Expand the follow set of a nonterminal symbol Y by adding all from the |
131 | // given symbol set. |
132 | auto ExpandFollowSet = [&FollowSets](SymbolID Y, |
133 | const llvm::DenseSet<SymbolID> &ToAdd) { |
134 | assert(isNonterminal(Y)); |
135 | bool Changed = false; |
136 | for (SymbolID F : ToAdd) |
137 | Changed |= FollowSets[Y].insert(V: F).second; |
138 | return Changed; |
139 | }; |
140 | // Follow sets is computed based on the following 3 rules, the computation |
141 | // is completed at a fixed point where there is no more new symbols can be |
142 | // added to any of the follow sets. |
143 | // |
144 | // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the |
145 | // augmented grammar, in our case it is '_'. |
146 | FollowSets[G.underscore()].insert(V: tokenSymbol(TK: tok::eof)); |
147 | bool Changed = true; |
148 | while (Changed) { |
149 | Changed = false; |
150 | for (const auto &R : G.table().Rules) { |
151 | // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to |
152 | // FOLLOW(Y). |
153 | for (size_t I = 0; I + 1 < R.seq().size(); ++I) { |
154 | if (isToken(ID: R.seq()[I])) |
155 | continue; |
156 | // We only need to consider the next symbol because symbols are |
157 | // non-nullable. |
158 | SymbolID Next = R.seq()[I + 1]; |
159 | if (isToken(ID: Next)) |
160 | // First set for a terminal is itself. |
161 | Changed |= ExpandFollowSet(R.seq()[I], {Next}); |
162 | else |
163 | Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]); |
164 | } |
165 | // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to |
166 | // FOLLOW(Z). |
167 | SymbolID Z = R.seq().back(); |
168 | if (isNonterminal(ID: Z)) |
169 | Changed |= ExpandFollowSet(Z, FollowSets[R.Target]); |
170 | } |
171 | } |
172 | return FollowSets; |
173 | } |
174 | |
175 | static llvm::ArrayRef<std::string> getTerminalNames() { |
176 | static const auto &TerminalNames = []() { |
177 | auto TerminalNames = new std::string[NumTerminals]; |
178 | #define PUNCTUATOR(Tok, Spelling) TerminalNames[tok::Tok] = Spelling; |
179 | #define KEYWORD(Keyword, Condition) \ |
180 | TerminalNames[tok::kw_##Keyword] = llvm::StringRef(#Keyword).upper(); |
181 | #define TOK(Tok) TerminalNames[tok::Tok] = llvm::StringRef(#Tok).upper(); |
182 | #include "clang/Basic/TokenKinds.def" |
183 | return llvm::ArrayRef(TerminalNames, NumTerminals); |
184 | }(); |
185 | return TerminalNames; |
186 | } |
187 | GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {} |
188 | |
189 | } // namespace pseudo |
190 | } // namespace clang |
191 | |