1 | //===--- Grammar.h - grammar used by clang pseudoparser ---------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines base structures for parsing & modeling a grammar for a |
10 | // programming language: |
11 | // |
12 | // # This is a fake C++ BNF grammar |
13 | // _ := translation-unit |
14 | // translation-unit := declaration-seq_opt |
15 | // declaration-seq := declaration |
16 | // declaration-seq := declaration-seq declaration |
17 | // |
18 | // A grammar formally describes a language, and it is constructed by a set of |
19 | // production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either |
20 | // nonterminal or terminal, identified by a SymbolID. |
21 | // |
22 | // Annotations are supported in a syntax form of [key=value]. They specify |
23 | // attributes which are associated with either a grammar symbol (on the |
24 | // right-hand side of the symbol) or a grammar rule (at the end of the rule |
25 | // body). |
26 | // Attributes provide a way to inject custom code into the GLR parser. Each |
27 | // unique attribute value creates an extension point (identified by ExtensionID |
28 | // ), and an extension point corresponds to a piece of native code. For |
29 | // example, C++ grammar has a rule: |
30 | // |
31 | // compound_statement := { statement-seq [recover=Brackets] } |
32 | // |
33 | // The `recover` attribute instructs the parser that we should perform error |
34 | // recovery if parsing the statement-seq fails. The `Brackets` recovery |
35 | // heuristic is implemented in CXX.cpp by binding the ExtensionID for the |
36 | // `Recovery` value to a specific C++ function that finds the recovery point. |
37 | // |
38 | // Notions about the BNF grammar: |
39 | // - "_" is the start symbol of the augmented grammar; |
40 | // - single-line comment is supported, starting with a # |
41 | // - A rule describes how a nonterminal (left side of :=) is constructed, and |
42 | // it is *per line* in the grammar file |
43 | // - Terminals (also called tokens) correspond to the clang::TokenKind; they |
44 | // are written in the grammar like "IDENTIFIER", "USING", "+" |
45 | // - Nonterminals are specified with "lower-case" names in the grammar; they |
46 | // shouldn't be nullable (has an empty sequence) |
47 | // - optional symbols are supported (specified with a _opt suffix), and they |
48 | // will be eliminated during the grammar parsing stage |
49 | // |
50 | //===----------------------------------------------------------------------===// |
51 | |
52 | #ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H |
53 | #define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H |
54 | |
55 | #include "clang/Basic/TokenKinds.h" |
56 | #include "llvm/ADT/ArrayRef.h" |
57 | #include "llvm/ADT/DenseSet.h" |
58 | #include "llvm/ADT/StringRef.h" |
59 | #include "llvm/Support/raw_ostream.h" |
60 | #include <cstdint> |
61 | #include <optional> |
62 | #include <vector> |
63 | |
64 | namespace clang { |
65 | namespace pseudo { |
66 | // A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar. |
67 | // nonterminal IDs are indexes into a table of nonterminal symbols. |
68 | // Terminal IDs correspond to the clang TokenKind enum. |
69 | using SymbolID = uint16_t; |
70 | // SymbolID is only 12 bits wide. |
71 | // There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals. |
72 | static constexpr uint16_t SymbolBits = 12; |
73 | static constexpr uint16_t NumTerminals = tok::NUM_TOKENS; |
74 | // SymbolIDs with the top bit set are tokens/terminals. |
75 | static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1); |
76 | inline bool isToken(SymbolID ID) { return ID & TokenFlag; } |
77 | inline bool isNonterminal(SymbolID ID) { return !isToken(ID); } |
78 | // The terminals are always the clang tok::TokenKind (not all are used). |
79 | inline tok::TokenKind symbolToToken(SymbolID SID) { |
80 | assert(isToken(SID)); |
81 | SID &= ~TokenFlag; |
82 | assert(SID < NumTerminals); |
83 | return static_cast<tok::TokenKind>(SID); |
84 | } |
85 | inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) { |
86 | return TokenFlag | static_cast<SymbolID>(TK); |
87 | } |
88 | |
89 | // An extension is a piece of native code specific to a grammar that modifies |
90 | // the behavior of annotated rules. One ExtensionID is assigned for each unique |
91 | // attribute value (all attributes share a namespace). |
92 | using ExtensionID = uint16_t; |
93 | |
94 | // A RuleID uniquely identifies a production rule in a grammar. |
95 | // It is an index into a table of rules. |
96 | using RuleID = uint16_t; |
97 | // There are maximum 2^12 rules. |
98 | static constexpr unsigned RuleBits = 12; |
99 | |
100 | // Represent a production rule in the grammar, e.g. |
101 | // expression := a b c |
102 | // ^Target ^Sequence |
103 | struct Rule { |
104 | Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq); |
105 | |
106 | // We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens |
107 | // long, however, we're stricter in order to reduce the size, we limit the max |
108 | // length to 9 (this is the longest sequence in cxx grammar). |
109 | static constexpr unsigned SizeBits = 4; |
110 | static constexpr unsigned MaxElements = 9; |
111 | static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit" ); |
112 | static_assert(SizeBits + SymbolBits <= 16, |
113 | "Must be able to store symbol ID + size efficiently" ); |
114 | |
115 | // 16 bits for target symbol and size of sequence: |
116 | // SymbolID : 12 | Size : 4 |
117 | SymbolID Target : SymbolBits; |
118 | uint8_t Size : SizeBits; // Size of the Sequence |
119 | SymbolID Sequence[MaxElements]; |
120 | |
121 | // A guarded rule has extra logic to determine whether the RHS is eligible. |
122 | bool Guarded = false; |
123 | |
124 | // Specifies the index within Sequence eligible for error recovery. |
125 | // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we |
126 | // should recover by finding the matching brace, and forcing stmt-seq to match |
127 | // everything between braces. |
128 | // For now, only a single strategy at a single point is possible. |
129 | uint8_t RecoveryIndex = -1; |
130 | ExtensionID Recovery = 0; |
131 | |
132 | llvm::ArrayRef<SymbolID> seq() const { |
133 | return llvm::ArrayRef<SymbolID>(Sequence, Size); |
134 | } |
135 | friend bool operator==(const Rule &L, const Rule &R) { |
136 | return L.Target == R.Target && L.seq() == R.seq() && L.Guarded == R.Guarded; |
137 | } |
138 | }; |
139 | |
140 | struct GrammarTable; |
141 | |
142 | // Grammar that describes a programming language, e.g. C++. It represents the |
143 | // contents of the specified grammar. |
144 | // It is a building block for constructing a table-based parser. |
145 | class Grammar { |
146 | public: |
147 | Grammar() = default; // Creates an invalid dummy grammar. |
148 | explicit Grammar(std::unique_ptr<GrammarTable>); |
149 | |
150 | // Parses grammar from a BNF file. |
151 | // Diagnostics emitted during parsing are stored in Diags. |
152 | static Grammar parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diags); |
153 | |
154 | // Returns the SymbolID of the symbol '_'. |
155 | SymbolID underscore() const { return Underscore; }; |
156 | |
157 | // Returns all rules of the given nonterminal symbol. |
158 | llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const; |
159 | const Rule &lookupRule(RuleID RID) const; |
160 | |
161 | // Gets symbol (terminal or nonterminal) name. |
162 | // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator). |
163 | llvm::StringRef symbolName(SymbolID) const; |
164 | |
165 | // Lookup the SymbolID of the nonterminal symbol by Name. |
166 | std::optional<SymbolID> findNonterminal(llvm::StringRef Name) const; |
167 | |
168 | // Dumps the whole grammar. |
169 | std::string dump() const; |
170 | // Dumps a particular rule. |
171 | std::string dumpRule(RuleID) const; |
172 | // Dumps all rules of the given nonterminal symbol. |
173 | std::string dumpRules(SymbolID) const; |
174 | |
175 | const GrammarTable &table() const { return *T; } |
176 | |
177 | private: |
178 | std::unique_ptr<GrammarTable> T; |
179 | // The symbol ID of '_'. (In the LR literature, this is the start symbol of |
180 | // the augmented grammar.) |
181 | SymbolID Underscore; |
182 | }; |
183 | // For each nonterminal X, computes the set of terminals that begin strings |
184 | // derived from X. (Known as FIRST sets in grammar-based parsers). |
185 | std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &); |
186 | // For each nonterminal X, computes the set of terminals that could immediately |
187 | // follow X. (Known as FOLLOW sets in grammar-based parsers). |
188 | std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &); |
189 | |
190 | // Storage for the underlying data of the Grammar. |
191 | // It can be constructed dynamically (from compiling BNF file) or statically |
192 | // (a compiled data-source). |
193 | struct GrammarTable { |
194 | GrammarTable(); |
195 | |
196 | struct Nonterminal { |
197 | std::string Name; |
198 | // Corresponding rules that construct the nonterminal, it is a [Start, End) |
199 | // index range of the Rules table. |
200 | struct { |
201 | RuleID Start; |
202 | RuleID End; |
203 | } RuleRange; |
204 | }; |
205 | |
206 | // RuleID is an index into this table of rule definitions. |
207 | // |
208 | // Rules with the same target symbol (LHS) are grouped into a single range. |
209 | // The relative order of different target symbols is *not* by SymbolID, but |
210 | // rather a topological sort: if S := T then the rules producing T have lower |
211 | // RuleIDs than rules producing S. |
212 | // (This strange order simplifies the GLR parser: for a given token range, if |
213 | // we reduce in increasing RuleID order then we need never backtrack -- |
214 | // prerequisite reductions are reached before dependent ones). |
215 | std::vector<Rule> Rules; |
216 | // A table of terminals (aka tokens). It corresponds to the clang::Token. |
217 | // clang::tok::TokenKind is the index of the table. |
218 | llvm::ArrayRef<std::string> Terminals; |
219 | // A table of nonterminals, sorted by name. |
220 | // SymbolID is the index of the table. |
221 | std::vector<Nonterminal> Nonterminals; |
222 | // A table of attribute values, sorted by name. |
223 | // ExtensionID is the index of the table. |
224 | std::vector<std::string> AttributeValues; |
225 | }; |
226 | |
227 | } // namespace pseudo |
228 | } // namespace clang |
229 | |
230 | #endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H |
231 | |