1//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the lexer for the MLIR textual form.
10//
11//===----------------------------------------------------------------------===//
12
13#include "Lexer.h"
14#include "Token.h"
15#include "mlir/AsmParser/CodeComplete.h"
16#include "mlir/IR/Diagnostics.h"
17#include "mlir/IR/Location.h"
18#include "mlir/IR/MLIRContext.h"
19#include "mlir/Support/LLVM.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/StringExtras.h"
22#include "llvm/ADT/StringSwitch.h"
23#include "llvm/Support/ErrorHandling.h"
24#include "llvm/Support/SourceMgr.h"
25#include <cassert>
26#include <cctype>
27
28using namespace mlir;
29
/// Tests whether `c` is one of the punctuation characters accepted inside a
/// prefixed identifier suffix, i.e. one of `$`, `.`, `_` or `-`.
///
/// Returns true for exactly those four characters and false for anything else.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
35
36Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
37 AsmParserCodeCompleteContext *codeCompleteContext)
38 : sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
39 auto bufferID = sourceMgr.getMainFileID();
40 curBuffer = sourceMgr.getMemoryBuffer(i: bufferID)->getBuffer();
41 curPtr = curBuffer.begin();
42
43 // Set the code completion location if it was provided.
44 if (codeCompleteContext)
45 codeCompleteLoc = codeCompleteContext->getCodeCompleteLoc().getPointer();
46}
47
48/// Encode the specified source location information into an attribute for
49/// attachment to the IR.
50Location Lexer::getEncodedSourceLocation(SMLoc loc) {
51 auto &sourceMgr = getSourceMgr();
52 unsigned mainFileID = sourceMgr.getMainFileID();
53
54 // TODO: Fix performance issues in SourceMgr::getLineAndColumn so that we can
55 // use it here.
56 auto &bufferInfo = sourceMgr.getBufferInfo(i: mainFileID);
57 unsigned lineNo = bufferInfo.getLineNumber(Ptr: loc.getPointer());
58 unsigned column =
59 (loc.getPointer() - bufferInfo.getPointerForLineNumber(LineNo: lineNo)) + 1;
60 auto *buffer = sourceMgr.getMemoryBuffer(i: mainFileID);
61
62 return FileLineColLoc::get(context, buffer->getBufferIdentifier(), lineNo,
63 column);
64}
65
66/// emitError - Emit an error message and return an Token::error token.
67Token Lexer::emitError(const char *loc, const Twine &message) {
68 mlir::emitError(loc: getEncodedSourceLocation(loc: SMLoc::getFromPointer(Ptr: loc)),
69 message);
70 return formToken(kind: Token::error, tokStart: loc);
71}
72
73Token Lexer::lexToken() {
74 while (true) {
75 const char *tokStart = curPtr;
76
77 // Check to see if the current token is at the code completion location.
78 if (tokStart == codeCompleteLoc)
79 return formToken(kind: Token::code_complete, tokStart);
80
81 // Lex the next token.
82 switch (*curPtr++) {
83 default:
84 // Handle bare identifiers.
85 if (isalpha(curPtr[-1]))
86 return lexBareIdentifierOrKeyword(tokStart);
87
88 // Unknown character, emit an error.
89 return emitError(loc: tokStart, message: "unexpected character");
90
91 case ' ':
92 case '\t':
93 case '\n':
94 case '\r':
95 // Handle whitespace.
96 continue;
97
98 case '_':
99 // Handle bare identifiers.
100 return lexBareIdentifierOrKeyword(tokStart);
101
102 case 0:
103 // This may either be a nul character in the source file or may be the EOF
104 // marker that llvm::MemoryBuffer guarantees will be there.
105 if (curPtr - 1 == curBuffer.end())
106 return formToken(kind: Token::eof, tokStart);
107 continue;
108
109 case ':':
110 return formToken(kind: Token::colon, tokStart);
111 case ',':
112 return formToken(kind: Token::comma, tokStart);
113 case '.':
114 return lexEllipsis(tokStart);
115 case '(':
116 return formToken(kind: Token::l_paren, tokStart);
117 case ')':
118 return formToken(kind: Token::r_paren, tokStart);
119 case '{':
120 if (*curPtr == '-' && *(curPtr + 1) == '#') {
121 curPtr += 2;
122 return formToken(kind: Token::file_metadata_begin, tokStart);
123 }
124 return formToken(kind: Token::l_brace, tokStart);
125 case '}':
126 return formToken(kind: Token::r_brace, tokStart);
127 case '[':
128 return formToken(kind: Token::l_square, tokStart);
129 case ']':
130 return formToken(kind: Token::r_square, tokStart);
131 case '<':
132 return formToken(kind: Token::less, tokStart);
133 case '>':
134 return formToken(kind: Token::greater, tokStart);
135 case '=':
136 return formToken(kind: Token::equal, tokStart);
137
138 case '+':
139 return formToken(kind: Token::plus, tokStart);
140 case '*':
141 return formToken(kind: Token::star, tokStart);
142 case '-':
143 if (*curPtr == '>') {
144 ++curPtr;
145 return formToken(kind: Token::arrow, tokStart);
146 }
147 return formToken(kind: Token::minus, tokStart);
148
149 case '?':
150 return formToken(kind: Token::question, tokStart);
151
152 case '|':
153 return formToken(kind: Token::vertical_bar, tokStart);
154
155 case '/':
156 if (*curPtr == '/') {
157 skipComment();
158 continue;
159 }
160 return emitError(loc: tokStart, message: "unexpected character");
161
162 case '@':
163 return lexAtIdentifier(tokStart);
164
165 case '#':
166 if (*curPtr == '-' && *(curPtr + 1) == '}') {
167 curPtr += 2;
168 return formToken(kind: Token::file_metadata_end, tokStart);
169 }
170 [[fallthrough]];
171 case '!':
172 case '^':
173 case '%':
174 return lexPrefixedIdentifier(tokStart);
175 case '"':
176 return lexString(tokStart);
177
178 case '0':
179 case '1':
180 case '2':
181 case '3':
182 case '4':
183 case '5':
184 case '6':
185 case '7':
186 case '8':
187 case '9':
188 return lexNumber(tokStart);
189 }
190 }
191}
192
193/// Lex an '@foo' identifier.
194///
195/// symbol-ref-id ::= `@` (bare-id | string-literal)
196///
197Token Lexer::lexAtIdentifier(const char *tokStart) {
198 char cur = *curPtr++;
199
200 // Try to parse a string literal, if present.
201 if (cur == '"') {
202 Token stringIdentifier = lexString(tokStart: curPtr);
203 if (stringIdentifier.is(k: Token::error))
204 return stringIdentifier;
205 return formToken(kind: Token::at_identifier, tokStart);
206 }
207
208 // Otherwise, these always start with a letter or underscore.
209 if (!isalpha(cur) && cur != '_')
210 return emitError(loc: curPtr - 1,
211 message: "@ identifier expected to start with letter or '_'");
212
213 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
214 *curPtr == '$' || *curPtr == '.')
215 ++curPtr;
216 return formToken(kind: Token::at_identifier, tokStart);
217}
218
219/// Lex a bare identifier or keyword that starts with a letter.
220///
221/// bare-id ::= (letter|[_]) (letter|digit|[_$.])*
222/// integer-type ::= `[su]?i[1-9][0-9]*`
223///
224Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
225 // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
226 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
227 *curPtr == '$' || *curPtr == '.')
228 ++curPtr;
229
230 // Check to see if this identifier is a keyword.
231 StringRef spelling(tokStart, curPtr - tokStart);
232
233 auto isAllDigit = [](StringRef str) {
234 return llvm::all_of(Range&: str, P: llvm::isDigit);
235 };
236
237 // Check for i123, si456, ui789.
238 if ((spelling.size() > 1 && tokStart[0] == 'i' &&
239 isAllDigit(spelling.drop_front())) ||
240 ((spelling.size() > 2 && tokStart[1] == 'i' &&
241 (tokStart[0] == 's' || tokStart[0] == 'u')) &&
242 isAllDigit(spelling.drop_front(N: 2))))
243 return Token(Token::inttype, spelling);
244
245 Token::Kind kind = StringSwitch<Token::Kind>(spelling)
246#define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
247#include "TokenKinds.def"
248 .Default(Value: Token::bare_identifier);
249
250 return Token(kind, spelling);
251}
252
253/// Skip a comment line, starting with a '//'.
254///
255/// TODO: add a regex for comments here and to the spec.
256///
257void Lexer::skipComment() {
258 // Advance over the second '/' in a '//' comment.
259 assert(*curPtr == '/');
260 ++curPtr;
261
262 while (true) {
263 switch (*curPtr++) {
264 case '\n':
265 case '\r':
266 // Newline is end of comment.
267 return;
268 case 0:
269 // If this is the end of the buffer, end the comment.
270 if (curPtr - 1 == curBuffer.end()) {
271 --curPtr;
272 return;
273 }
274 [[fallthrough]];
275 default:
276 // Skip over other characters.
277 break;
278 }
279 }
280}
281
282/// Lex an ellipsis.
283///
284/// ellipsis ::= '...'
285///
286Token Lexer::lexEllipsis(const char *tokStart) {
287 assert(curPtr[-1] == '.');
288
289 if (curPtr == curBuffer.end() || *curPtr != '.' || *(curPtr + 1) != '.')
290 return emitError(loc: curPtr, message: "expected three consecutive dots for an ellipsis");
291
292 curPtr += 2;
293 return formToken(kind: Token::ellipsis, tokStart);
294}
295
296/// Lex a number literal.
297///
298/// integer-literal ::= digit+ | `0x` hex_digit+
299/// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
300///
301Token Lexer::lexNumber(const char *tokStart) {
302 assert(isdigit(curPtr[-1]));
303
304 // Handle the hexadecimal case.
305 if (curPtr[-1] == '0' && *curPtr == 'x') {
306 // If we see stuff like 0xi32, this is a literal `0` followed by an
307 // identifier `xi32`, stop after `0`.
308 if (!isxdigit(curPtr[1]))
309 return formToken(kind: Token::integer, tokStart);
310
311 curPtr += 2;
312 while (isxdigit(*curPtr))
313 ++curPtr;
314
315 return formToken(kind: Token::integer, tokStart);
316 }
317
318 // Handle the normal decimal case.
319 while (isdigit(*curPtr))
320 ++curPtr;
321
322 if (*curPtr != '.')
323 return formToken(kind: Token::integer, tokStart);
324 ++curPtr;
325
326 // Skip over [0-9]*([eE][-+]?[0-9]+)?
327 while (isdigit(*curPtr))
328 ++curPtr;
329
330 if (*curPtr == 'e' || *curPtr == 'E') {
331 if (isdigit(static_cast<unsigned char>(curPtr[1])) ||
332 ((curPtr[1] == '-' || curPtr[1] == '+') &&
333 isdigit(static_cast<unsigned char>(curPtr[2])))) {
334 curPtr += 2;
335 while (isdigit(*curPtr))
336 ++curPtr;
337 }
338 }
339 return formToken(kind: Token::floatliteral, tokStart);
340}
341
342/// Lex an identifier that starts with a prefix followed by suffix-id.
343///
344/// attribute-id ::= `#` suffix-id
345/// ssa-id ::= '%' suffix-id
346/// block-id ::= '^' suffix-id
347/// type-id ::= '!' suffix-id
348/// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
349/// id-punct ::= `$` | `.` | `_` | `-`
350///
351Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
352 Token::Kind kind;
353 StringRef errorKind;
354 switch (*tokStart) {
355 case '#':
356 kind = Token::hash_identifier;
357 errorKind = "invalid attribute name";
358 break;
359 case '%':
360 kind = Token::percent_identifier;
361 errorKind = "invalid SSA name";
362 break;
363 case '^':
364 kind = Token::caret_identifier;
365 errorKind = "invalid block name";
366 break;
367 case '!':
368 kind = Token::exclamation_identifier;
369 errorKind = "invalid type identifier";
370 break;
371 default:
372 llvm_unreachable("invalid caller");
373 }
374
375 // Parse suffix-id.
376 if (isdigit(*curPtr)) {
377 // If suffix-id starts with a digit, the rest must be digits.
378 while (isdigit(*curPtr))
379 ++curPtr;
380 } else if (isalpha(*curPtr) || isPunct(c: *curPtr)) {
381 do {
382 ++curPtr;
383 } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(c: *curPtr));
384 } else if (curPtr == codeCompleteLoc) {
385 return formToken(kind: Token::code_complete, tokStart);
386 } else {
387 return emitError(loc: curPtr - 1, message: errorKind);
388 }
389
390 // Check for a code completion within the identifier.
391 if (codeCompleteLoc && codeCompleteLoc >= tokStart &&
392 codeCompleteLoc <= curPtr) {
393 return Token(Token::code_complete,
394 StringRef(tokStart, codeCompleteLoc - tokStart));
395 }
396
397 return formToken(kind, tokStart);
398}
399
400/// Lex a string literal.
401///
402/// string-literal ::= '"' [^"\n\f\v\r]* '"'
403///
404/// TODO: define escaping rules.
405Token Lexer::lexString(const char *tokStart) {
406 assert(curPtr[-1] == '"');
407
408 while (true) {
409 // Check to see if there is a code completion location within the string. In
410 // these cases we generate a completion location and place the currently
411 // lexed string within the token. This allows for the parser to use the
412 // partially lexed string when computing the completion results.
413 if (curPtr == codeCompleteLoc)
414 return formToken(kind: Token::code_complete, tokStart);
415
416 switch (*curPtr++) {
417 case '"':
418 return formToken(kind: Token::string, tokStart);
419 case 0:
420 // If this is a random nul character in the middle of a string, just
421 // include it. If it is the end of file, then it is an error.
422 if (curPtr - 1 != curBuffer.end())
423 continue;
424 [[fallthrough]];
425 case '\n':
426 case '\v':
427 case '\f':
428 return emitError(loc: curPtr - 1, message: "expected '\"' in string literal");
429 case '\\':
430 // Handle explicitly a few escapes.
431 if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't')
432 ++curPtr;
433 else if (llvm::isHexDigit(C: *curPtr) && llvm::isHexDigit(C: curPtr[1]))
434 // Support \xx for two hex digits.
435 curPtr += 2;
436 else
437 return emitError(loc: curPtr - 1, message: "unknown escape in string literal");
438 continue;
439
440 default:
441 continue;
442 }
443 }
444}
445

source code of mlir/lib/AsmParser/Lexer.cpp