1 | //===--- CXX.cpp - Define public interfaces for C++ grammar ---------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "clang-pseudo/cxx/CXX.h" |
10 | #include "clang-pseudo/Forest.h" |
11 | #include "clang-pseudo/Language.h" |
12 | #include "clang-pseudo/grammar/Grammar.h" |
13 | #include "clang-pseudo/grammar/LRTable.h" |
14 | #include "clang/Basic/CharInfo.h" |
15 | #include "clang/Basic/TokenKinds.h" |
16 | #include "llvm/ADT/StringSwitch.h" |
17 | #include "llvm/Support/Debug.h" |
18 | #include <utility> |
19 | #define DEBUG_TYPE "CXX.cpp" |
20 | |
21 | namespace clang { |
22 | namespace pseudo { |
23 | namespace cxx { |
24 | namespace { |
25 | static const char *CXXBNF = |
26 | #include "CXXBNF.inc" |
27 | ; |
28 | |
29 | // User-defined string literals look like `""suffix`. |
30 | bool isStringUserDefined(const Token &Tok) { |
31 | return !Tok.text().ends_with(Suffix: "\"" ); |
32 | } |
33 | bool isCharUserDefined(const Token &Tok) { return !Tok.text().ends_with(Suffix: "'" ); } |
34 | |
// Bit flags describing a numeric literal; OR them together to combine.
// Clang lexes every number as one numeric_constant token kind, whereas the
// grammar tells four kinds apart: {plain, user-defined} x {integer, floating}.
enum NumericKind {
  Integer = 0x0,     // no flag set: a plain integer
  Floating = 0x1,    // floating-point rather than integer
  UserDefined = 0x2, // carries a user-defined-literal suffix
};
42 | // Determine the kind of numeric_constant we have. |
43 | // We can assume it's something valid, as it has been lexed. |
44 | // FIXME: is this expensive enough that we should set flags on the token |
45 | // and reuse them rather than computing it for each guard? |
46 | unsigned numKind(const Token &Tok) { |
47 | assert(Tok.Kind == tok::numeric_constant); |
48 | llvm::StringRef Text = Tok.text(); |
49 | if (Text.size() <= 1) |
50 | return Integer; |
51 | bool Hex = |
52 | Text.size() > 2 && Text[0] == '0' && (Text[1] == 'x' || Text[1] == 'X'); |
53 | uint8_t K = Integer; |
54 | |
55 | for (char C : Text) { |
56 | switch (C) { |
57 | case '.': |
58 | K |= Floating; |
59 | break; |
60 | case 'e': |
61 | case 'E': |
62 | if (!Hex) |
63 | K |= Floating; |
64 | break; |
65 | case 'p': |
66 | case 'P': |
67 | if (Hex) |
68 | K |= Floating; |
69 | break; |
70 | case '_': |
71 | K |= UserDefined; |
72 | break; |
73 | default: |
74 | break; |
75 | } |
76 | } |
77 | |
78 | // We would be done here, but there are stdlib UDLs that lack _. |
79 | // We must distinguish these from the builtin suffixes. |
80 | unsigned LastLetter = Text.size(); |
81 | while (LastLetter > 0 && isLetter(c: Text[LastLetter - 1])) |
82 | --LastLetter; |
83 | if (LastLetter == Text.size()) // Common case |
84 | return NumericKind(K); |
85 | // Trailing d/e/f are not part of the suffix in hex numbers. |
86 | while (Hex && LastLetter < Text.size() && isHexDigit(c: Text[LastLetter])) |
87 | ++LastLetter; |
88 | return llvm::StringSwitch<int, unsigned>(Text.substr(Start: LastLetter)) |
89 | // std::chrono |
90 | .Cases("h" , "min" , "s" , "ms" , "us" , "ns" , "d" , "y" , K | UserDefined) |
91 | // complex |
92 | .Cases("il" , "i" , "if" , K | UserDefined) |
93 | .Default(K); |
94 | } |
95 | |
96 | // RHS is expected to contain a single terminal. |
97 | // Returns the corresponding token. |
98 | const Token &onlyToken(tok::TokenKind Kind, |
99 | const ArrayRef<const ForestNode *> RHS, |
100 | const TokenStream &Tokens) { |
101 | assert(RHS.size() == 1 && RHS.front()->symbol() == tokenSymbol(Kind)); |
102 | return Tokens.tokens()[RHS.front()->startTokenIndex()]; |
103 | } |
104 | // RHS is expected to contain a single symbol. |
105 | // Returns the corresponding ForestNode. |
106 | const ForestNode &onlySymbol(SymbolID Kind, |
107 | const ArrayRef<const ForestNode *> RHS, |
108 | const TokenStream &Tokens) { |
109 | assert(RHS.size() == 1 && RHS.front()->symbol() == Kind); |
110 | return *RHS.front(); |
111 | } |
112 | |
113 | bool isFunctionDeclarator(const ForestNode *Declarator) { |
114 | assert(Declarator->symbol() == cxx::Symbol::declarator); |
115 | bool IsFunction = false; |
116 | while (true) { |
117 | // not well-formed code, return the best guess. |
118 | if (Declarator->kind() != ForestNode::Sequence) |
119 | return IsFunction; |
120 | |
121 | switch (Declarator->rule()) { |
122 | case rule::noptr_declarator::declarator_id: // reached the bottom |
123 | return IsFunction; |
124 | // *X is a nonfunction (unless X is a function). |
125 | case rule::ptr_declarator::ptr_operator__ptr_declarator: |
126 | Declarator = Declarator->elements()[1]; |
127 | IsFunction = false; |
128 | continue; |
129 | // X() is a function (unless X is a pointer or similar). |
130 | case rule::declarator:: |
131 | noptr_declarator__parameters_and_qualifiers__trailing_return_type: |
132 | case rule::noptr_declarator::noptr_declarator__parameters_and_qualifiers: |
133 | Declarator = Declarator->elements()[0]; |
134 | IsFunction = true; |
135 | continue; |
136 | // X[] is an array (unless X is a pointer or function). |
137 | case rule::noptr_declarator:: |
138 | noptr_declarator__L_SQUARE__constant_expression__R_SQUARE: |
139 | case rule::noptr_declarator::noptr_declarator__L_SQUARE__R_SQUARE: |
140 | Declarator = Declarator->elements()[0]; |
141 | IsFunction = false; |
142 | continue; |
143 | // (X) is whatever X is. |
144 | case rule::noptr_declarator::L_PAREN__ptr_declarator__R_PAREN: |
145 | Declarator = Declarator->elements()[1]; |
146 | continue; |
147 | case rule::ptr_declarator::noptr_declarator: |
148 | case rule::declarator::ptr_declarator: |
149 | Declarator = Declarator->elements()[0]; |
150 | continue; |
151 | |
152 | default: |
153 | assert(false && "unhandled declarator for IsFunction" ); |
154 | return IsFunction; |
155 | } |
156 | } |
157 | llvm_unreachable("unreachable" ); |
158 | } |
159 | |
160 | bool guardNextTokenNotElse(const GuardParams &P) { |
161 | return symbolToToken(SID: P.Lookahead) != tok::kw_else; |
162 | } |
163 | |
164 | bool specifiesStructuredBinding(const GuardParams &P) { |
165 | const auto DSS = P.RHS[0]; |
166 | assert(DSS->symbol() == Symbol::decl_specifier_seq); |
167 | |
168 | auto Length = P.RHS[1]->startTokenIndex() - DSS->startTokenIndex(); |
169 | for (const auto &T : |
170 | P.Tokens.tokens().slice(N: DSS->startTokenIndex(), M: Length)) { |
171 | switch (T.Kind) { |
172 | case clang::tok::kw_static: |
173 | case clang::tok::kw_thread_local: |
174 | case clang::tok::kw_auto: |
175 | case clang::tok::kw_const: |
176 | case clang::tok::kw_volatile: |
177 | break; |
178 | default: |
179 | return false; |
180 | } |
181 | } |
182 | return true; |
183 | } |
184 | |
185 | // Whether this e.g. decl-specifier contains an "exclusive" type such as a class |
186 | // name, and thus can't combine with a second exclusive type. |
187 | // |
188 | // Returns false for |
189 | // - non-types |
190 | // - "unsigned" etc that may suffice as types but may modify others |
191 | // - cases of uncertainty (e.g. due to ambiguity) |
192 | bool hasExclusiveType(const ForestNode *N) { |
193 | // FIXME: every time we apply this check, we walk the whole subtree. |
194 | // Add per-node caching instead. |
195 | while (true) { |
196 | assert(N->symbol() == Symbol::decl_specifier_seq || |
197 | N->symbol() == Symbol::type_specifier_seq || |
198 | N->symbol() == Symbol::defining_type_specifier_seq || |
199 | N->symbol() == Symbol::decl_specifier || |
200 | N->symbol() == Symbol::type_specifier || |
201 | N->symbol() == Symbol::defining_type_specifier || |
202 | N->symbol() == Symbol::simple_type_specifier); |
203 | if (N->kind() == ForestNode::Opaque) |
204 | return false; // conservative |
205 | if (N->kind() == ForestNode::Ambiguous) |
206 | return llvm::all_of(N->alternatives(), hasExclusiveType); // conservative |
207 | // All supported symbols are nonterminals. |
208 | assert(N->kind() == ForestNode::Sequence); |
209 | switch (N->rule()) { |
210 | // seq := element seq: check element then continue into seq |
211 | case rule::decl_specifier_seq::decl_specifier__decl_specifier_seq: |
212 | case rule::defining_type_specifier_seq::defining_type_specifier__defining_type_specifier_seq: |
213 | case rule::type_specifier_seq::type_specifier__type_specifier_seq: |
214 | if (hasExclusiveType(N: N->children()[0])) |
215 | return true; |
216 | N = N->children()[1]; |
217 | continue; |
218 | // seq := element: continue into element |
219 | case rule::decl_specifier_seq::decl_specifier: |
220 | case rule::type_specifier_seq::type_specifier: |
221 | case rule::defining_type_specifier_seq::defining_type_specifier: |
222 | N = N->children()[0]; |
223 | continue; |
224 | |
225 | // defining-type-specifier |
226 | case rule::defining_type_specifier::type_specifier: |
227 | N = N->children()[0]; |
228 | continue; |
229 | case rule::defining_type_specifier::class_specifier: |
230 | case rule::defining_type_specifier::enum_specifier: |
231 | return true; |
232 | |
233 | // decl-specifier |
234 | case rule::decl_specifier::defining_type_specifier: |
235 | N = N->children()[0]; |
236 | continue; |
237 | case rule::decl_specifier::CONSTEVAL: |
238 | case rule::decl_specifier::CONSTEXPR: |
239 | case rule::decl_specifier::CONSTINIT: |
240 | case rule::decl_specifier::INLINE: |
241 | case rule::decl_specifier::FRIEND: |
242 | case rule::decl_specifier::storage_class_specifier: |
243 | case rule::decl_specifier::TYPEDEF: |
244 | case rule::decl_specifier::function_specifier: |
245 | return false; |
246 | |
247 | // type-specifier |
248 | case rule::type_specifier::elaborated_type_specifier: |
249 | case rule::type_specifier::typename_specifier: |
250 | return true; |
251 | case rule::type_specifier::simple_type_specifier: |
252 | N = N->children()[0]; |
253 | continue; |
254 | case rule::type_specifier::cv_qualifier: |
255 | return false; |
256 | |
257 | // simple-type-specifier |
258 | case rule::simple_type_specifier::type_name: |
259 | case rule::simple_type_specifier::template_name: |
260 | case rule::simple_type_specifier::builtin_type: |
261 | case rule::simple_type_specifier::nested_name_specifier__TEMPLATE__simple_template_id: |
262 | case rule::simple_type_specifier::nested_name_specifier__template_name: |
263 | case rule::simple_type_specifier::nested_name_specifier__type_name: |
264 | case rule::simple_type_specifier::decltype_specifier: |
265 | case rule::simple_type_specifier::placeholder_type_specifier: |
266 | return true; |
267 | case rule::simple_type_specifier::LONG: |
268 | case rule::simple_type_specifier::SHORT: |
269 | case rule::simple_type_specifier::SIGNED: |
270 | case rule::simple_type_specifier::UNSIGNED: |
271 | return false; |
272 | |
273 | default: |
274 | LLVM_DEBUG(llvm::errs() << "Unhandled rule " << N->rule() << "\n" ); |
275 | llvm_unreachable("hasExclusiveType be exhaustive!" ); |
276 | } |
277 | } |
278 | } |
279 | |
280 | llvm::DenseMap<ExtensionID, RuleGuard> buildGuards() { |
281 | #define GUARD(cond) \ |
282 | { \ |
283 | [](const GuardParams &P) { return cond; } \ |
284 | } |
285 | #define TOKEN_GUARD(kind, cond) \ |
286 | [](const GuardParams& P) { \ |
287 | const Token &Tok = onlyToken(tok::kind, P.RHS, P.Tokens); \ |
288 | return cond; \ |
289 | } |
290 | #define SYMBOL_GUARD(kind, cond) \ |
291 | [](const GuardParams& P) { \ |
292 | const ForestNode &N = onlySymbol(Symbol::kind, P.RHS, P.Tokens); \ |
293 | return cond; \ |
294 | } |
295 | return { |
296 | {rule::function_declarator::declarator, |
297 | SYMBOL_GUARD(declarator, isFunctionDeclarator(&N))}, |
298 | {rule::non_function_declarator::declarator, |
299 | SYMBOL_GUARD(declarator, !isFunctionDeclarator(&N))}, |
300 | |
301 | // A {decl,type,defining-type}-specifier-sequence cannot have multiple |
302 | // "exclusive" types (like class names): a value has only one type. |
303 | {rule::defining_type_specifier_seq:: |
304 | defining_type_specifier__defining_type_specifier_seq, |
305 | GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, |
306 | {rule::type_specifier_seq::type_specifier__type_specifier_seq, |
307 | GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, |
308 | {rule::decl_specifier_seq::decl_specifier__decl_specifier_seq, |
309 | GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, |
310 | |
311 | {rule::contextual_override::IDENTIFIER, |
312 | TOKEN_GUARD(identifier, Tok.text() == "override" )}, |
313 | {rule::contextual_final::IDENTIFIER, |
314 | TOKEN_GUARD(identifier, Tok.text() == "final" )}, |
315 | {rule::import_keyword::IDENTIFIER, |
316 | TOKEN_GUARD(identifier, Tok.text() == "import" )}, |
317 | {rule::export_keyword::IDENTIFIER, |
318 | TOKEN_GUARD(identifier, Tok.text() == "export" )}, |
319 | {rule::module_keyword::IDENTIFIER, |
320 | TOKEN_GUARD(identifier, Tok.text() == "module" )}, |
321 | {rule::contextual_zero::NUMERIC_CONSTANT, |
322 | TOKEN_GUARD(numeric_constant, Tok.text() == "0" )}, |
323 | |
324 | {rule::selection_statement::IF__L_PAREN__condition__R_PAREN__statement, |
325 | guardNextTokenNotElse}, |
326 | {rule::selection_statement:: |
327 | IF__L_PAREN__init_statement__condition__R_PAREN__statement, |
328 | guardNextTokenNotElse}, |
329 | {rule::selection_statement:: |
330 | IF__CONSTEXPR__L_PAREN__condition__R_PAREN__statement, |
331 | guardNextTokenNotElse}, |
332 | {rule::selection_statement:: |
333 | IF__CONSTEXPR__L_PAREN__init_statement__condition__R_PAREN__statement, |
334 | guardNextTokenNotElse}, |
335 | |
336 | // Implement C++ [basic.lookup.qual.general]: |
337 | // If a name, template-id, or decltype-specifier is followed by a |
338 | // ::, it shall designate a namespace, class, enumeration, or |
339 | // dependent type, and the :: is never interpreted as a complete |
340 | // nested-name-specifier. |
341 | {rule::nested_name_specifier::COLONCOLON, |
342 | TOKEN_GUARD(coloncolon, Tok.prev().Kind != tok::identifier)}, |
343 | |
344 | // Implement C++ [dcl.pre#6]: |
345 | // A simple-declaration with an identifier-list is called a structured |
346 | // binding declaration ([dcl.struct.bind]). If the decl-specifier-seq |
347 | // contains any decl-specifier other than static, thread_local, auto, |
348 | // or cv-qualifiers, the program is ill-formed. |
349 | {rule::simple_declaration:: |
350 | decl_specifier_seq__ref_qualifier__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI, |
351 | specifiesStructuredBinding}, |
352 | {rule::simple_declaration:: |
353 | decl_specifier_seq__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI, |
354 | specifiesStructuredBinding}, |
355 | |
356 | // The grammar distinguishes (only) user-defined vs plain string literals, |
357 | // where the clang lexer distinguishes (only) encoding types. |
358 | {rule::user_defined_string_literal_chunk::STRING_LITERAL, |
359 | TOKEN_GUARD(string_literal, isStringUserDefined(Tok))}, |
360 | {rule::user_defined_string_literal_chunk::UTF8_STRING_LITERAL, |
361 | TOKEN_GUARD(utf8_string_literal, isStringUserDefined(Tok))}, |
362 | {rule::user_defined_string_literal_chunk::UTF16_STRING_LITERAL, |
363 | TOKEN_GUARD(utf16_string_literal, isStringUserDefined(Tok))}, |
364 | {rule::user_defined_string_literal_chunk::UTF32_STRING_LITERAL, |
365 | TOKEN_GUARD(utf32_string_literal, isStringUserDefined(Tok))}, |
366 | {rule::user_defined_string_literal_chunk::WIDE_STRING_LITERAL, |
367 | TOKEN_GUARD(wide_string_literal, isStringUserDefined(Tok))}, |
368 | {rule::string_literal_chunk::STRING_LITERAL, |
369 | TOKEN_GUARD(string_literal, !isStringUserDefined(Tok))}, |
370 | {rule::string_literal_chunk::UTF8_STRING_LITERAL, |
371 | TOKEN_GUARD(utf8_string_literal, !isStringUserDefined(Tok))}, |
372 | {rule::string_literal_chunk::UTF16_STRING_LITERAL, |
373 | TOKEN_GUARD(utf16_string_literal, !isStringUserDefined(Tok))}, |
374 | {rule::string_literal_chunk::UTF32_STRING_LITERAL, |
375 | TOKEN_GUARD(utf32_string_literal, !isStringUserDefined(Tok))}, |
376 | {rule::string_literal_chunk::WIDE_STRING_LITERAL, |
377 | TOKEN_GUARD(wide_string_literal, !isStringUserDefined(Tok))}, |
378 | // And the same for chars. |
379 | {rule::user_defined_character_literal::CHAR_CONSTANT, |
380 | TOKEN_GUARD(char_constant, isCharUserDefined(Tok))}, |
381 | {rule::user_defined_character_literal::UTF8_CHAR_CONSTANT, |
382 | TOKEN_GUARD(utf8_char_constant, isCharUserDefined(Tok))}, |
383 | {rule::user_defined_character_literal::UTF16_CHAR_CONSTANT, |
384 | TOKEN_GUARD(utf16_char_constant, isCharUserDefined(Tok))}, |
385 | {rule::user_defined_character_literal::UTF32_CHAR_CONSTANT, |
386 | TOKEN_GUARD(utf32_char_constant, isCharUserDefined(Tok))}, |
387 | {rule::user_defined_character_literal::WIDE_CHAR_CONSTANT, |
388 | TOKEN_GUARD(wide_char_constant, isCharUserDefined(Tok))}, |
389 | {rule::character_literal::CHAR_CONSTANT, |
390 | TOKEN_GUARD(char_constant, !isCharUserDefined(Tok))}, |
391 | {rule::character_literal::UTF8_CHAR_CONSTANT, |
392 | TOKEN_GUARD(utf8_char_constant, !isCharUserDefined(Tok))}, |
393 | {rule::character_literal::UTF16_CHAR_CONSTANT, |
394 | TOKEN_GUARD(utf16_char_constant, !isCharUserDefined(Tok))}, |
395 | {rule::character_literal::UTF32_CHAR_CONSTANT, |
396 | TOKEN_GUARD(utf32_char_constant, !isCharUserDefined(Tok))}, |
397 | {rule::character_literal::WIDE_CHAR_CONSTANT, |
398 | TOKEN_GUARD(wide_char_constant, !isCharUserDefined(Tok))}, |
399 | // clang just has one NUMERIC_CONSTANT token for {ud,plain}x{float,int} |
400 | {rule::user_defined_integer_literal::NUMERIC_CONSTANT, |
401 | TOKEN_GUARD(numeric_constant, numKind(Tok) == (Integer | UserDefined))}, |
402 | {rule::user_defined_floating_point_literal::NUMERIC_CONSTANT, |
403 | TOKEN_GUARD(numeric_constant, numKind(Tok) == (Floating | UserDefined))}, |
404 | {rule::integer_literal::NUMERIC_CONSTANT, |
405 | TOKEN_GUARD(numeric_constant, numKind(Tok) == Integer)}, |
406 | {rule::floating_point_literal::NUMERIC_CONSTANT, |
407 | TOKEN_GUARD(numeric_constant, numKind(Tok) == Floating)}, |
408 | }; |
409 | #undef TOKEN_GUARD |
410 | #undef SYMBOL_GUARD |
411 | } |
412 | |
413 | Token::Index recoverBrackets(Token::Index Begin, const TokenStream &Tokens) { |
414 | assert(Begin > 0); |
415 | const Token &Left = Tokens.tokens()[Begin - 1]; |
416 | assert(Left.Kind == tok::l_brace || Left.Kind == tok::l_paren || |
417 | Left.Kind == tok::l_square); |
418 | if (const Token *Right = Left.pair()) { |
419 | assert(Tokens.index(*Right) > Begin - 1); |
420 | return Tokens.index(T: *Right); |
421 | } |
422 | return Token::Invalid; |
423 | } |
424 | |
425 | llvm::DenseMap<ExtensionID, RecoveryStrategy> buildRecoveryStrategies() { |
426 | return { |
427 | {Extension::Brackets, recoverBrackets}, |
428 | }; |
429 | } |
430 | |
431 | } // namespace |
432 | |
433 | const Language &getLanguage() { |
434 | static const auto &CXXLanguage = []() -> const Language & { |
435 | std::vector<std::string> Diags; |
436 | auto G = Grammar::parseBNF(BNF: CXXBNF, Diags&: Diags); |
437 | assert(Diags.empty()); |
438 | LRTable Table = LRTable::buildSLR(G: G); |
439 | const Language *PL = new Language{ |
440 | std::move(G), |
441 | std::move(Table), |
442 | buildGuards(), |
443 | buildRecoveryStrategies(), |
444 | }; |
445 | return *PL; |
446 | }(); |
447 | return CXXLanguage; |
448 | } |
449 | |
450 | } // namespace cxx |
451 | } // namespace pseudo |
452 | } // namespace clang |
453 | |