Grammar.h source code [clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h]

1	//===--- Grammar.h - grammar used by clang pseudoparser ---------- C++--===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file defines base structures for parsing & modeling a grammar for a
10	// programming language:
11	//
12	// # This is a fake C++ BNF grammar
13	// _ := translation-unit
14	// translation-unit := declaration-seq_opt
15	// declaration-seq := declaration
16	// declaration-seq := declaration-seq declaration
17	//
18	// A grammar formally describes a language, and it is constructed by a set of
19	// production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
20	// nonterminal or terminal, identified by a SymbolID.
21	//
22	// Annotations are supported in a syntax form of [key=value]. They specify
23	// attributes which are associated with either a grammar symbol (on the
24	// right-hand side of the symbol) or a grammar rule (at the end of the rule
25	// body).
26	// Attributes provide a way to inject custom code into the GLR parser. Each
27	// unique attribute value creates an extension point (identified by ExtensionID
28	// ), and an extension point corresponds to a piece of native code. For
29	// example, C++ grammar has a rule:
30	//
31	// compound_statement := { statement-seq [recover=Brackets] }
32	//
33	// The `recover` attribute instructs the parser that we should perform error
34	// recovery if parsing the statement-seq fails. The `Brackets` recovery
35	// heuristic is implemented in CXX.cpp by binding the ExtensionID for the
36	// `Recovery` value to a specific C++ function that finds the recovery point.
37	//
38	// Notions about the BNF grammar:
39	// - "_" is the start symbol of the augmented grammar;
40	// - single-line comment is supported, starting with a #
41	// - A rule describes how a nonterminal (left side of :=) is constructed, and
42	// it is per line* in the grammar file*
43	// - Terminals (also called tokens) correspond to the clang::TokenKind; they
44	// are written in the grammar like "IDENTIFIER", "USING", "+"
45	// - Nonterminals are specified with "lower-case" names in the grammar; they
46	// shouldn't be nullable (has an empty sequence)
47	// - optional symbols are supported (specified with a _opt suffix), and they
48	// will be eliminated during the grammar parsing stage
49	//
50	//===----------------------------------------------------------------------===//
51
52	#ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
53	#define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
54
55	#include "clang/Basic/TokenKinds.h"
56	#include "llvm/ADT/ArrayRef.h"
57	#include "llvm/ADT/DenseSet.h"
58	#include "llvm/ADT/StringRef.h"
59	#include "llvm/Support/raw_ostream.h"
60	#include <cstdint>
61	#include <optional>
62	#include <vector>
63
64	namespace clang {
65	namespace pseudo {
66	// A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar.
67	// nonterminal IDs are indexes into a table of nonterminal symbols.
68	// Terminal IDs correspond to the clang TokenKind enum.
69	using SymbolID = uint16_t;
70	// SymbolID is only 12 bits wide.
71	// There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals.
72	static constexpr uint16_t SymbolBits = `12`;
73	static constexpr uint16_t NumTerminals = tok::NUM_TOKENS;
74	// SymbolIDs with the top bit set are tokens/terminals.
75	static constexpr SymbolID TokenFlag = `1` << (SymbolBits - `1`);
76	inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
77	inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
78	// The terminals are always the clang tok::TokenKind (not all are used).
79	inline tok::TokenKind symbolToToken(SymbolID SID) {
80	assert(isToken(SID));
81	SID &= ~TokenFlag;
82	assert(SID < NumTerminals);
83	return static_cast<tok::TokenKind>(SID);
84	}
85	inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
86	return TokenFlag \| static_cast<SymbolID>(TK);
87	}
88
89	// An extension is a piece of native code specific to a grammar that modifies
90	// the behavior of annotated rules. One ExtensionID is assigned for each unique
91	// attribute value (all attributes share a namespace).
92	using ExtensionID = uint16_t;
93
94	// A RuleID uniquely identifies a production rule in a grammar.
95	// It is an index into a table of rules.
96	using RuleID = uint16_t;
97	// There are maximum 2^12 rules.
98	static constexpr unsigned RuleBits = `12`;
99
100	// Represent a production rule in the grammar, e.g.
101	// expression := a b c
102	// ^Target ^Sequence
103	struct Rule {
104	Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
105
106	// We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens
107	// long, however, we're stricter in order to reduce the size, we limit the max
108	// length to 9 (this is the longest sequence in cxx grammar).
109	static constexpr unsigned SizeBits = `4`;
110	static constexpr unsigned MaxElements = `9`;
111	static_assert(MaxElements < (`1` << SizeBits), "Exceeds the maximum limit");
112	static_assert(SizeBits + SymbolBits <= `16`,
113	"Must be able to store symbol ID + size efficiently");
114
115	// 16 bits for target symbol and size of sequence:
116	// SymbolID : 12 \| Size : 4
117	SymbolID Target : SymbolBits;
118	uint8_t Size : SizeBits; // Size of the Sequence
119	SymbolID Sequence[MaxElements];
120
121	// A guarded rule has extra logic to determine whether the RHS is eligible.
122	bool Guarded = false;
123
124	// Specifies the index within Sequence eligible for error recovery.
125	// Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
126	// should recover by finding the matching brace, and forcing stmt-seq to match
127	// everything between braces.
128	// For now, only a single strategy at a single point is possible.
129	uint8_t RecoveryIndex = -`1`;
130	ExtensionID Recovery = `0`;
131
132	llvm::ArrayRef<SymbolID> seq() const {
133	return llvm::ArrayRef<SymbolID>(Sequence, Size);
134	}
135	friend bool operator==(const Rule &L, const Rule &R) {
136	return L.Target == R.Target && L.seq() == R.seq() && L.Guarded == R.Guarded;
137	}
138	};
139
140	struct GrammarTable;
141
142	// Grammar that describes a programming language, e.g. C++. It represents the
143	// contents of the specified grammar.
144	// It is a building block for constructing a table-based parser.
145	class Grammar {
146	public:
147	Grammar() = default; // Creates an invalid dummy grammar.
148	explicit Grammar(std::unique_ptr<GrammarTable>);
149
150	// Parses grammar from a BNF file.
151	// Diagnostics emitted during parsing are stored in Diags.
152	static Grammar parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diags);
153
154	// Returns the SymbolID of the symbol '_'.
155	SymbolID underscore() const { return Underscore; };
156
157	// Returns all rules of the given nonterminal symbol.
158	llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
159	const Rule &lookupRule(RuleID RID) const;
160
161	// Gets symbol (terminal or nonterminal) name.
162	// Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator).
163	llvm::StringRef symbolName(SymbolID) const;
164
165	// Lookup the SymbolID of the nonterminal symbol by Name.
166	std::optional<SymbolID> findNonterminal(llvm::StringRef Name) const;
167
168	// Dumps the whole grammar.
169	std::string dump() const;
170	// Dumps a particular rule.
171	std::string dumpRule(RuleID) const;
172	// Dumps all rules of the given nonterminal symbol.
173	std::string dumpRules(SymbolID) const;
174
175	const GrammarTable &table() const { return *T; }
176
177	private:
178	std::unique_ptr<GrammarTable> T;
179	// The symbol ID of '_'. (In the LR literature, this is the start symbol of
180	// the augmented grammar.)
181	SymbolID Underscore;
182	};
183	// For each nonterminal X, computes the set of terminals that begin strings
184	// derived from X. (Known as FIRST sets in grammar-based parsers).
185	std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
186	// For each nonterminal X, computes the set of terminals that could immediately
187	// follow X. (Known as FOLLOW sets in grammar-based parsers).
188	std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
189
190	// Storage for the underlying data of the Grammar.
191	// It can be constructed dynamically (from compiling BNF file) or statically
192	// (a compiled data-source).
193	struct GrammarTable {
194	GrammarTable();
195
196	struct Nonterminal {
197	std::string Name;
198	// Corresponding rules that construct the nonterminal, it is a [Start, End)
199	// index range of the Rules table.
200	struct {
201	RuleID Start;
202	RuleID End;
203	} RuleRange;
204	};
205
206	// RuleID is an index into this table of rule definitions.
207	//
208	// Rules with the same target symbol (LHS) are grouped into a single range.
209	// The relative order of different target symbols is not* by SymbolID, but*
210	// rather a topological sort: if S := T then the rules producing T have lower
211	// RuleIDs than rules producing S.
212	// (This strange order simplifies the GLR parser: for a given token range, if
213	// we reduce in increasing RuleID order then we need never backtrack --
214	// prerequisite reductions are reached before dependent ones).
215	std::vector<Rule> Rules;
216	// A table of terminals (aka tokens). It corresponds to the clang::Token.
217	// clang::tok::TokenKind is the index of the table.
218	llvm::ArrayRef<std::string> Terminals;
219	// A table of nonterminals, sorted by name.
220	// SymbolID is the index of the table.
221	std::vector<Nonterminal> Nonterminals;
222	// A table of attribute values, sorted by name.
223	// ExtensionID is the index of the table.
224	std::vector<std::string> AttributeValues;
225	};
226
227	} // namespace pseudo
228	} // namespace clang
229
230	#endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
231

source code of clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h