1//===- Parser.cpp - Matcher expression parser -----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Recursive parser implementation for the matcher expression grammar.
10//
11//===----------------------------------------------------------------------===//
12
13#include "Parser.h"
14
15#include <vector>
16
17namespace mlir::query::matcher::internal {
18
19// Simple structure to hold information for one token from the parser.
20struct Parser::TokenInfo {
21 TokenInfo() = default;
22
23 // Method to set the kind and text of the token
24 void set(TokenKind newKind, llvm::StringRef newText) {
25 kind = newKind;
26 text = newText;
27 }
28
29 // Known identifiers.
30 static const char *const ID_Extract;
31
32 llvm::StringRef text;
33 TokenKind kind = TokenKind::Eof;
34 SourceRange range;
35 VariantValue value;
36};
37
38const char *const Parser::TokenInfo::ID_Extract = "extract";
39
40class Parser::CodeTokenizer {
41public:
42 // Constructor with matcherCode and error
43 explicit CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error)
44 : code(matcherCode), startOfLine(matcherCode), error(error) {
45 nextToken = getNextToken();
46 }
47
48 // Constructor with matcherCode, error, and codeCompletionOffset
49 CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error,
50 unsigned codeCompletionOffset)
51 : code(matcherCode), startOfLine(matcherCode), error(error),
52 codeCompletionLocation(matcherCode.data() + codeCompletionOffset) {
53 nextToken = getNextToken();
54 }
55
56 // Peek at next token without consuming it
57 const TokenInfo &peekNextToken() const { return nextToken; }
58
59 // Consume and return the next token
60 TokenInfo consumeNextToken() {
61 TokenInfo thisToken = nextToken;
62 nextToken = getNextToken();
63 return thisToken;
64 }
65
66 // Skip any newline tokens
67 TokenInfo skipNewlines() {
68 while (nextToken.kind == TokenKind::NewLine)
69 nextToken = getNextToken();
70 return nextToken;
71 }
72
73 // Consume and return next token, ignoring newlines
74 TokenInfo consumeNextTokenIgnoreNewlines() {
75 skipNewlines();
76 return nextToken.kind == TokenKind::Eof ? nextToken : consumeNextToken();
77 }
78
79 // Return kind of next token
80 TokenKind nextTokenKind() const { return nextToken.kind; }
81
82private:
83 // Helper function to get the first character as a new StringRef and drop it
84 // from the original string
85 llvm::StringRef firstCharacterAndDrop(llvm::StringRef &str) {
86 assert(!str.empty());
87 llvm::StringRef firstChar = str.substr(Start: 0, N: 1);
88 str = str.drop_front();
89 return firstChar;
90 }
91
92 // Get next token, consuming whitespaces and handling different token types
93 TokenInfo getNextToken() {
94 consumeWhitespace();
95 TokenInfo result;
96 result.range.start = currentLocation();
97
98 // Code completion case
99 if (codeCompletionLocation && codeCompletionLocation <= code.data()) {
100 result.set(newKind: TokenKind::CodeCompletion,
101 newText: llvm::StringRef(codeCompletionLocation, 0));
102 codeCompletionLocation = nullptr;
103 return result;
104 }
105
106 // End of file case
107 if (code.empty()) {
108 result.set(newKind: TokenKind::Eof, newText: "");
109 return result;
110 }
111
112 // Switch to handle specific characters
113 switch (code[0]) {
114 case '#':
115 code = code.drop_until(F: [](char c) { return c == '\n'; });
116 return getNextToken();
117 case ',':
118 result.set(newKind: TokenKind::Comma, newText: firstCharacterAndDrop(str&: code));
119 break;
120 case '.':
121 result.set(newKind: TokenKind::Period, newText: firstCharacterAndDrop(str&: code));
122 break;
123 case '\n':
124 ++line;
125 startOfLine = code.drop_front();
126 result.set(newKind: TokenKind::NewLine, newText: firstCharacterAndDrop(str&: code));
127 break;
128 case '(':
129 result.set(newKind: TokenKind::OpenParen, newText: firstCharacterAndDrop(str&: code));
130 break;
131 case ')':
132 result.set(newKind: TokenKind::CloseParen, newText: firstCharacterAndDrop(str&: code));
133 break;
134 case '"':
135 case '\'':
136 consumeStringLiteral(result: &result);
137 break;
138 case '0':
139 case '1':
140 case '2':
141 case '3':
142 case '4':
143 case '5':
144 case '6':
145 case '7':
146 case '8':
147 case '9':
148 consumeNumberLiteral(result: &result);
149 break;
150 default:
151 parseIdentifierOrInvalid(result: &result);
152 break;
153 }
154
155 result.range.end = currentLocation();
156 return result;
157 }
158
159 void consumeNumberLiteral(TokenInfo *result) {
160 StringRef original = code;
161 unsigned value = 0;
162 if (!code.consumeInteger(Radix: 0, Result&: value)) {
163 size_t numConsumed = original.size() - code.size();
164 result->text = original.take_front(N: numConsumed);
165 result->kind = TokenKind::Literal;
166 result->value = static_cast<int64_t>(value);
167 return;
168 }
169 }
170
171 // Consume a string literal, handle escape sequences and missing closing
172 // quote.
173 void consumeStringLiteral(TokenInfo *result) {
174 bool inEscape = false;
175 const char marker = code[0];
176 for (size_t length = 1; length < code.size(); ++length) {
177 if (inEscape) {
178 inEscape = false;
179 continue;
180 }
181 if (code[length] == '\\') {
182 inEscape = true;
183 continue;
184 }
185 if (code[length] == marker) {
186 result->kind = TokenKind::Literal;
187 result->text = code.substr(Start: 0, N: length + 1);
188 result->value = code.substr(Start: 1, N: length - 1);
189 code = code.drop_front(N: length + 1);
190 return;
191 }
192 }
193 llvm::StringRef errorText = code;
194 code = code.drop_front(N: code.size());
195 SourceRange range;
196 range.start = result->range.start;
197 range.end = currentLocation();
198 error->addError(range, error: ErrorType::ParserStringError) << errorText;
199 result->kind = TokenKind::Error;
200 }
201
202 void parseIdentifierOrInvalid(TokenInfo *result) {
203 if (isalnum(code[0])) {
204 // Parse an identifier
205 size_t tokenLength = 1;
206
207 while (true) {
208 // A code completion location in/immediately after an identifier will
209 // cause the portion of the identifier before the code completion
210 // location to become a code completion token.
211 if (codeCompletionLocation == code.data() + tokenLength) {
212 codeCompletionLocation = nullptr;
213 result->kind = TokenKind::CodeCompletion;
214 result->text = code.substr(Start: 0, N: tokenLength);
215 code = code.drop_front(N: tokenLength);
216 return;
217 }
218 if (tokenLength == code.size() || !(isalnum(code[tokenLength])))
219 break;
220 ++tokenLength;
221 }
222 llvm::StringRef token = code.substr(Start: 0, N: tokenLength);
223 code = code.drop_front(N: tokenLength);
224 // Check if the identifier is a boolean literal
225 if (token == "true") {
226 result->text = "false";
227 result->kind = TokenKind::Literal;
228 result->value = true;
229 } else if (token == "false") {
230 result->text = "false";
231 result->kind = TokenKind::Literal;
232 result->value = false;
233 } else {
234 // Otherwise it is treated as a normal identifier
235 result->kind = TokenKind::Ident;
236 result->text = token;
237 }
238 } else {
239 result->kind = TokenKind::InvalidChar;
240 result->text = code.substr(Start: 0, N: 1);
241 code = code.drop_front(N: 1);
242 }
243 }
244
245 // Consume all leading whitespace from code, except newlines
246 void consumeWhitespace() { code = code.ltrim(Chars: " \t\v\f\r"); }
247
248 // Returns the current location in the source code
249 SourceLocation currentLocation() {
250 SourceLocation location;
251 location.line = line;
252 location.column = code.data() - startOfLine.data() + 1;
253 return location;
254 }
255
256 llvm::StringRef code;
257 llvm::StringRef startOfLine;
258 unsigned line = 1;
259 Diagnostics *error;
260 TokenInfo nextToken;
261 const char *codeCompletionLocation = nullptr;
262};
263
264Parser::Sema::~Sema() = default;
265
266std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes(
267 llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) {
268 return {};
269}
270
271std::vector<MatcherCompletion>
272Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> acceptedTypes) {
273 return {};
274}
275
276// Entry for the scope of a parser
277struct Parser::ScopedContextEntry {
278 Parser *parser;
279
280 ScopedContextEntry(Parser *parser, MatcherCtor c) : parser(parser) {
281 parser->contextStack.emplace_back(args&: c, args: 0u);
282 }
283
284 ~ScopedContextEntry() { parser->contextStack.pop_back(); }
285
286 void nextArg() { ++parser->contextStack.back().second; }
287};
288
289// Parse and validate expressions starting with an identifier.
290// This function can parse named values and matchers. In case of failure, it
291// will try to determine the user's intent to give an appropriate error message.
292bool Parser::parseIdentifierPrefixImpl(VariantValue *value) {
293 const TokenInfo nameToken = tokenizer->consumeNextToken();
294
295 if (tokenizer->nextTokenKind() != TokenKind::OpenParen) {
296 // Parse as a named value.
297 if (auto namedValue = namedValues ? namedValues->lookup(Key: nameToken.text)
298 : VariantValue()) {
299
300 if (tokenizer->nextTokenKind() != TokenKind::Period) {
301 *value = namedValue;
302 return true;
303 }
304
305 if (!namedValue.isMatcher()) {
306 error->addError(range: tokenizer->peekNextToken().range,
307 error: ErrorType::ParserNotAMatcher);
308 return false;
309 }
310 }
311
312 if (tokenizer->nextTokenKind() == TokenKind::NewLine) {
313 error->addError(range: tokenizer->peekNextToken().range,
314 error: ErrorType::ParserNoOpenParen)
315 << "NewLine";
316 return false;
317 }
318
319 // If the syntax is correct and the name is not a matcher either, report
320 // an unknown named value.
321 if ((tokenizer->nextTokenKind() == TokenKind::Comma ||
322 tokenizer->nextTokenKind() == TokenKind::CloseParen ||
323 tokenizer->nextTokenKind() == TokenKind::NewLine ||
324 tokenizer->nextTokenKind() == TokenKind::Eof) &&
325 !sema->lookupMatcherCtor(matcherName: nameToken.text)) {
326 error->addError(range: nameToken.range, error: ErrorType::RegistryValueNotFound)
327 << nameToken.text;
328 return false;
329 }
330 // Otherwise, fallback to the matcher parser.
331 }
332
333 tokenizer->skipNewlines();
334
335 assert(nameToken.kind == TokenKind::Ident);
336 TokenInfo openToken = tokenizer->consumeNextToken();
337 if (openToken.kind != TokenKind::OpenParen) {
338 error->addError(range: openToken.range, error: ErrorType::ParserNoOpenParen)
339 << openToken.text;
340 return false;
341 }
342
343 std::optional<MatcherCtor> ctor = sema->lookupMatcherCtor(matcherName: nameToken.text);
344
345 // Parse as a matcher expression.
346 return parseMatcherExpressionImpl(nameToken, openToken, ctor, value);
347}
348
349bool Parser::parseChainedExpression(std::string &argument) {
350 // Parse the parenthesized argument to .extract("foo")
351 // Note: EOF is handled inside the consume functions and would fail below when
352 // checking token kind.
353 const TokenInfo openToken = tokenizer->consumeNextToken();
354 const TokenInfo argumentToken = tokenizer->consumeNextTokenIgnoreNewlines();
355 const TokenInfo closeToken = tokenizer->consumeNextTokenIgnoreNewlines();
356
357 if (openToken.kind != TokenKind::OpenParen) {
358 error->addError(range: openToken.range, error: ErrorType::ParserChainedExprNoOpenParen);
359 return false;
360 }
361
362 if (argumentToken.kind != TokenKind::Literal ||
363 !argumentToken.value.isString()) {
364 error->addError(range: argumentToken.range,
365 error: ErrorType::ParserChainedExprInvalidArg);
366 return false;
367 }
368
369 if (closeToken.kind != TokenKind::CloseParen) {
370 error->addError(range: closeToken.range, error: ErrorType::ParserChainedExprNoCloseParen);
371 return false;
372 }
373
374 // If all checks passed, extract the argument and return true.
375 argument = argumentToken.value.getString();
376 return true;
377}
378
379// Parse the arguments of a matcher
380bool Parser::parseMatcherArgs(std::vector<ParserValue> &args, MatcherCtor ctor,
381 const TokenInfo &nameToken, TokenInfo &endToken) {
382 ScopedContextEntry sce(this, ctor);
383
384 while (tokenizer->nextTokenKind() != TokenKind::Eof) {
385 if (tokenizer->nextTokenKind() == TokenKind::CloseParen) {
386 // end of args.
387 endToken = tokenizer->consumeNextToken();
388 break;
389 }
390
391 if (!args.empty()) {
392 // We must find a , token to continue.
393 TokenInfo commaToken = tokenizer->consumeNextToken();
394 if (commaToken.kind != TokenKind::Comma) {
395 error->addError(range: commaToken.range, error: ErrorType::ParserNoComma)
396 << commaToken.text;
397 return false;
398 }
399 }
400
401 ParserValue argValue;
402 tokenizer->skipNewlines();
403
404 argValue.text = tokenizer->peekNextToken().text;
405 argValue.range = tokenizer->peekNextToken().range;
406 if (!parseExpressionImpl(value: &argValue.value)) {
407 return false;
408 }
409
410 tokenizer->skipNewlines();
411 args.push_back(x: argValue);
412 sce.nextArg();
413 }
414
415 return true;
416}
417
418// Parse and validate a matcher expression.
419bool Parser::parseMatcherExpressionImpl(const TokenInfo &nameToken,
420 const TokenInfo &openToken,
421 std::optional<MatcherCtor> ctor,
422 VariantValue *value) {
423 if (!ctor) {
424 error->addError(range: nameToken.range, error: ErrorType::RegistryMatcherNotFound)
425 << nameToken.text;
426 // Do not return here. We need to continue to give completion suggestions.
427 }
428
429 std::vector<ParserValue> args;
430 TokenInfo endToken;
431
432 tokenizer->skipNewlines();
433
434 if (!parseMatcherArgs(args, ctor: ctor.value_or(u: nullptr), nameToken, endToken)) {
435 return false;
436 }
437
438 // Check for the missing closing parenthesis
439 if (endToken.kind != TokenKind::CloseParen) {
440 error->addError(range: openToken.range, error: ErrorType::ParserNoCloseParen)
441 << nameToken.text;
442 return false;
443 }
444
445 std::string functionName;
446 if (tokenizer->peekNextToken().kind == TokenKind::Period) {
447 tokenizer->consumeNextToken();
448 TokenInfo chainCallToken = tokenizer->consumeNextToken();
449 if (chainCallToken.kind == TokenKind::CodeCompletion) {
450 addCompletion(compToken: chainCallToken, completion: MatcherCompletion("extract(\"", "extract"));
451 return false;
452 }
453
454 if (chainCallToken.kind != TokenKind::Ident ||
455 chainCallToken.text != TokenInfo::ID_Extract) {
456 error->addError(range: chainCallToken.range,
457 error: ErrorType::ParserMalformedChainedExpr);
458 return false;
459 }
460
461 if (chainCallToken.text == TokenInfo::ID_Extract &&
462 !parseChainedExpression(argument&: functionName))
463 return false;
464 }
465
466 if (!ctor)
467 return false;
468 // Merge the start and end infos.
469 SourceRange matcherRange = nameToken.range;
470 matcherRange.end = endToken.range.end;
471 VariantMatcher result = sema->actOnMatcherExpression(
472 Ctor: *ctor, NameRange: matcherRange, functionName, Args: args, Error: error);
473 if (result.isNull())
474 return false;
475 *value = result;
476 return true;
477}
478
479// If the prefix of this completion matches the completion token, add it to
480// completions minus the prefix.
481void Parser::addCompletion(const TokenInfo &compToken,
482 const MatcherCompletion &completion) {
483 if (llvm::StringRef(completion.typedText).starts_with(Prefix: compToken.text)) {
484 completions.emplace_back(args: completion.typedText.substr(pos: compToken.text.size()),
485 args: completion.matcherDecl);
486 }
487}
488
489std::vector<MatcherCompletion>
490Parser::getNamedValueCompletions(llvm::ArrayRef<ArgKind> acceptedTypes) {
491 if (!namedValues)
492 return {};
493
494 std::vector<MatcherCompletion> result;
495 for (const auto &entry : *namedValues) {
496 std::string decl =
497 (entry.getValue().getTypeAsString() + " " + entry.getKey()).str();
498 result.emplace_back(args: entry.getKey(), args&: decl);
499 }
500 return result;
501}
502
503void Parser::addExpressionCompletions() {
504 const TokenInfo compToken = tokenizer->consumeNextTokenIgnoreNewlines();
505 assert(compToken.kind == TokenKind::CodeCompletion);
506
507 // We cannot complete code if there is an invalid element on the context
508 // stack.
509 for (const auto &entry : contextStack) {
510 if (!entry.first)
511 return;
512 }
513
514 auto acceptedTypes = sema->getAcceptedCompletionTypes(context: contextStack);
515 for (const auto &completion : sema->getMatcherCompletions(acceptedTypes)) {
516 addCompletion(compToken, completion);
517 }
518
519 for (const auto &completion : getNamedValueCompletions(acceptedTypes)) {
520 addCompletion(compToken, completion);
521 }
522}
523
524// Parse an <Expresssion>
525bool Parser::parseExpressionImpl(VariantValue *value) {
526 switch (tokenizer->nextTokenKind()) {
527 case TokenKind::Literal:
528 *value = tokenizer->consumeNextToken().value;
529 return true;
530 case TokenKind::Ident:
531 return parseIdentifierPrefixImpl(value);
532 case TokenKind::CodeCompletion:
533 addExpressionCompletions();
534 return false;
535 case TokenKind::Eof:
536 error->addError(range: tokenizer->consumeNextToken().range,
537 error: ErrorType::ParserNoCode);
538 return false;
539
540 case TokenKind::Error:
541 // This error was already reported by the tokenizer.
542 return false;
543 case TokenKind::NewLine:
544 case TokenKind::OpenParen:
545 case TokenKind::CloseParen:
546 case TokenKind::Comma:
547 case TokenKind::Period:
548 case TokenKind::InvalidChar:
549 const TokenInfo token = tokenizer->consumeNextToken();
550 error->addError(range: token.range, error: ErrorType::ParserInvalidToken)
551 << (token.kind == TokenKind::NewLine ? "NewLine" : token.text);
552 return false;
553 }
554
555 llvm_unreachable("Unknown token kind.");
556}
557
558Parser::Parser(CodeTokenizer *tokenizer, const Registry &matcherRegistry,
559 const NamedValueMap *namedValues, Diagnostics *error)
560 : tokenizer(tokenizer),
561 sema(std::make_unique<RegistrySema>(args: matcherRegistry)),
562 namedValues(namedValues), error(error) {}
563
564Parser::RegistrySema::~RegistrySema() = default;
565
566std::optional<MatcherCtor>
567Parser::RegistrySema::lookupMatcherCtor(llvm::StringRef matcherName) {
568 return RegistryManager::lookupMatcherCtor(matcherName, matcherRegistry);
569}
570
571VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
572 MatcherCtor ctor, SourceRange nameRange, llvm::StringRef functionName,
573 llvm::ArrayRef<ParserValue> args, Diagnostics *error) {
574 return RegistryManager::constructMatcher(ctor, nameRange, functionName, args,
575 error);
576}
577
578std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes(
579 llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) {
580 return RegistryManager::getAcceptedCompletionTypes(context);
581}
582
583std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions(
584 llvm::ArrayRef<ArgKind> acceptedTypes) {
585 return RegistryManager::getMatcherCompletions(acceptedTypes, matcherRegistry);
586}
587
588bool Parser::parseExpression(llvm::StringRef &code,
589 const Registry &matcherRegistry,
590 const NamedValueMap *namedValues,
591 VariantValue *value, Diagnostics *error) {
592 CodeTokenizer tokenizer(code, error);
593 Parser parser(&tokenizer, matcherRegistry, namedValues, error);
594 if (!parser.parseExpressionImpl(value))
595 return false;
596 auto nextToken = tokenizer.peekNextToken();
597 if (nextToken.kind != TokenKind::Eof &&
598 nextToken.kind != TokenKind::NewLine) {
599 error->addError(range: tokenizer.peekNextToken().range,
600 error: ErrorType::ParserTrailingCode);
601 return false;
602 }
603 return true;
604}
605
606std::vector<MatcherCompletion>
607Parser::completeExpression(llvm::StringRef &code, unsigned completionOffset,
608 const Registry &matcherRegistry,
609 const NamedValueMap *namedValues) {
610 Diagnostics error;
611 CodeTokenizer tokenizer(code, &error, completionOffset);
612 Parser parser(&tokenizer, matcherRegistry, namedValues, &error);
613 VariantValue dummy;
614 parser.parseExpressionImpl(value: &dummy);
615
616 return parser.completions;
617}
618
619std::optional<DynMatcher> Parser::parseMatcherExpression(
620 llvm::StringRef &code, const Registry &matcherRegistry,
621 const NamedValueMap *namedValues, Diagnostics *error) {
622 VariantValue value;
623 if (!parseExpression(code, matcherRegistry, namedValues, value: &value, error))
624 return std::nullopt;
625 if (!value.isMatcher()) {
626 error->addError(range: SourceRange(), error: ErrorType::ParserNotAMatcher);
627 return std::nullopt;
628 }
629 std::optional<DynMatcher> result = value.getMatcher().getDynMatcher();
630 if (!result) {
631 error->addError(range: SourceRange(), error: ErrorType::ParserOverloadedType)
632 << value.getTypeAsString();
633 }
634 return result;
635}
636
637} // namespace mlir::query::matcher::internal
638

Provided by KDAB

Privacy Policy
Improve your Profiling and Debugging skills
Find out more

source code of mlir/lib/Query/Matcher/Parser.cpp