//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the lexer for the MLIR textual form.
//
//===----------------------------------------------------------------------===//

#include "Lexer.h"
#include "Token.h"
#include "mlir/AsmParser/CodeComplete.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
#include <cassert>
#include <cctype>

using namespace mlir;

// Returns true if 'c' is an allowable punctuation character: [$._-]
// Returns false otherwise.
static bool isPunct(char c) {
  return c == '$' || c == '.' || c == '_' || c == '-';
}

Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
             AsmParserCodeCompleteContext *codeCompleteContext)
    : sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
  auto bufferID = sourceMgr.getMainFileID();
  curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
  curPtr = curBuffer.begin();

  // Set the code completion location if it was provided.
  if (codeCompleteContext)
    codeCompleteLoc = codeCompleteContext->getCodeCompleteLoc().getPointer();
}

/// Encode the specified source location information into an attribute for
/// attachment to the IR.
Location Lexer::getEncodedSourceLocation(SMLoc loc) {
  auto &sourceMgr = getSourceMgr();
  unsigned mainFileID = sourceMgr.getMainFileID();

  // TODO: Fix performance issues in SourceMgr::getLineAndColumn so that we can
  // use it here.
  auto &bufferInfo = sourceMgr.getBufferInfo(mainFileID);
  unsigned lineNo = bufferInfo.getLineNumber(loc.getPointer());
  unsigned column =
      (loc.getPointer() - bufferInfo.getPointerForLineNumber(lineNo)) + 1;
  auto *buffer = sourceMgr.getMemoryBuffer(mainFileID);

  return FileLineColLoc::get(context, buffer->getBufferIdentifier(), lineNo,
                             column);
}

/// emitError - Emit an error message and return a Token::error token.
Token Lexer::emitError(const char *loc, const Twine &message) {
  mlir::emitError(getEncodedSourceLocation(SMLoc::getFromPointer(loc)),
                  message);
  return formToken(Token::error, loc);
}

Token Lexer::lexToken() {
  while (true) {
    const char *tokStart = curPtr;

    // Check to see if the current token is at the code completion location.
    if (tokStart == codeCompleteLoc)
      return formToken(Token::code_complete, tokStart);

    // Lex the next token.
    switch (*curPtr++) {
    default:
      // Handle bare identifiers.
      if (isalpha(curPtr[-1]))
        return lexBareIdentifierOrKeyword(tokStart);

      // Unknown character, emit an error.
      return emitError(tokStart, "unexpected character");

    case ' ':
    case '\t':
    case '\n':
    case '\r':
      // Handle whitespace.
      continue;

    case '_':
      // Handle bare identifiers.
      return lexBareIdentifierOrKeyword(tokStart);

    case 0:
      // This may either be a nul character in the source file or may be the
      // EOF marker that llvm::MemoryBuffer guarantees will be there.
      if (curPtr - 1 == curBuffer.end())
        return formToken(Token::eof, tokStart);
      continue;

    case ':':
      return formToken(Token::colon, tokStart);
    case ',':
      return formToken(Token::comma, tokStart);
    case '.':
      return lexEllipsis(tokStart);
    case '(':
      return formToken(Token::l_paren, tokStart);
    case ')':
      return formToken(Token::r_paren, tokStart);
    case '{':
      if (*curPtr == '-' && *(curPtr + 1) == '#') {
        curPtr += 2;
        return formToken(Token::file_metadata_begin, tokStart);
      }
      return formToken(Token::l_brace, tokStart);
    case '}':
      return formToken(Token::r_brace, tokStart);
    case '[':
      return formToken(Token::l_square, tokStart);
    case ']':
      return formToken(Token::r_square, tokStart);
    case '<':
      return formToken(Token::less, tokStart);
    case '>':
      return formToken(Token::greater, tokStart);
    case '=':
      return formToken(Token::equal, tokStart);

    case '+':
      return formToken(Token::plus, tokStart);
    case '*':
      return formToken(Token::star, tokStart);
    case '-':
      if (*curPtr == '>') {
        ++curPtr;
        return formToken(Token::arrow, tokStart);
      }
      return formToken(Token::minus, tokStart);

    case '?':
      return formToken(Token::question, tokStart);

    case '|':
      return formToken(Token::vertical_bar, tokStart);

    case '/':
      if (*curPtr == '/') {
        skipComment();
        continue;
      }
      return formToken(Token::slash, tokStart);

    case '@':
      return lexAtIdentifier(tokStart);

    case '#':
      if (*curPtr == '-' && *(curPtr + 1) == '}') {
        curPtr += 2;
        return formToken(Token::file_metadata_end, tokStart);
      }
      [[fallthrough]];
    case '!':
    case '^':
    case '%':
      return lexPrefixedIdentifier(tokStart);
    case '"':
      return lexString(tokStart);

    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      return lexNumber(tokStart);
    }
  }
}

/// Lex an '@foo' identifier.
///
///   symbol-ref-id ::= `@` (bare-id | string-literal)
///
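/// For example, `@foo` and `@"a quoted name"` both lex as at_identifier
/// tokens.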
Token Lexer::lexAtIdentifier(const char *tokStart) {
  char cur = *curPtr++;

  // Try to parse a string literal, if present.
  if (cur == '"') {
    Token stringIdentifier = lexString(curPtr);
    if (stringIdentifier.is(Token::error))
      return stringIdentifier;
    return formToken(Token::at_identifier, tokStart);
  }

  // Otherwise, these always start with a letter or underscore.
  if (!isalpha(cur) && cur != '_')
    return emitError(curPtr - 1,
                     "@ identifier expected to start with letter or '_'");

  while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
         *curPtr == '$' || *curPtr == '.')
    ++curPtr;
  return formToken(Token::at_identifier, tokStart);
}

/// Lex a bare identifier or keyword that starts with a letter.
///
///   bare-id ::= (letter|[_]) (letter|digit|[_$.])*
///   integer-type ::= `[su]?i[1-9][0-9]*`
///
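/// For example, `foo`, `value_0`, and `has.dots` lex as bare identifiers (or
/// as keywords when they match a TOK_KEYWORD spelling), while `i32`, `si8`,
/// and `ui64` lex as integer-type tokens.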
Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
  // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
  while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
         *curPtr == '$' || *curPtr == '.')
    ++curPtr;

  // Check to see if this identifier is a keyword.
  StringRef spelling(tokStart, curPtr - tokStart);

  auto isAllDigit = [](StringRef str) {
    return llvm::all_of(str, llvm::isDigit);
  };

  // Check for i123, si456, ui789.
  if ((spelling.size() > 1 && tokStart[0] == 'i' &&
       isAllDigit(spelling.drop_front())) ||
      ((spelling.size() > 2 && tokStart[1] == 'i' &&
        (tokStart[0] == 's' || tokStart[0] == 'u')) &&
       isAllDigit(spelling.drop_front(2))))
    return Token(Token::inttype, spelling);

  Token::Kind kind = StringSwitch<Token::Kind>(spelling)
#define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
#include "TokenKinds.def"
                         .Default(Token::bare_identifier);

  return Token(kind, spelling);
}

/// Skip a comment line, starting with a '//'.
///
/// TODO: add a regex for comments here and to the spec.
///
void Lexer::skipComment() {
  // Advance over the second '/' in a '//' comment.
  assert(*curPtr == '/');
  ++curPtr;

  while (true) {
    switch (*curPtr++) {
    case '\n':
    case '\r':
      // Newline is end of comment.
      return;
    case 0:
      // If this is the end of the buffer, end the comment.
      if (curPtr - 1 == curBuffer.end()) {
        --curPtr;
        return;
      }
      [[fallthrough]];
    default:
      // Skip over other characters.
      break;
    }
  }
}

/// Lex an ellipsis.
///
///   ellipsis ::= '...'
///
Token Lexer::lexEllipsis(const char *tokStart) {
  assert(curPtr[-1] == '.');

  if (curPtr == curBuffer.end() || *curPtr != '.' || *(curPtr + 1) != '.')
    return emitError(curPtr, "expected three consecutive dots for an ellipsis");

  curPtr += 2;
  return formToken(Token::ellipsis, tokStart);
}

/// Lex a number literal.
///
///   integer-literal ::= digit+ | `0x` hex_digit+
///   float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
///
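/// For example, `42` and `0x1F` lex as integer tokens, while `1.0` and
/// `2.5e-3` lex as float literal tokens. A leading sign is not consumed here;
/// it is lexed separately as a minus or plus token.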
Token Lexer::lexNumber(const char *tokStart) {
  assert(isdigit(curPtr[-1]));

  // Handle the hexadecimal case.
  if (curPtr[-1] == '0' && *curPtr == 'x') {
    // If we see stuff like 0xi32, this is a literal `0` followed by an
    // identifier `xi32`, stop after `0`.
    if (!isxdigit(curPtr[1]))
      return formToken(Token::integer, tokStart);

    curPtr += 2;
    while (isxdigit(*curPtr))
      ++curPtr;

    return formToken(Token::integer, tokStart);
  }

  // Handle the normal decimal case.
  while (isdigit(*curPtr))
    ++curPtr;

  if (*curPtr != '.')
    return formToken(Token::integer, tokStart);
  ++curPtr;

  // Skip over [0-9]*([eE][-+]?[0-9]+)?
  while (isdigit(*curPtr))
    ++curPtr;

  if (*curPtr == 'e' || *curPtr == 'E') {
    if (isdigit(static_cast<unsigned char>(curPtr[1])) ||
        ((curPtr[1] == '-' || curPtr[1] == '+') &&
         isdigit(static_cast<unsigned char>(curPtr[2])))) {
      curPtr += 2;
      while (isdigit(*curPtr))
        ++curPtr;
    }
  }
  return formToken(Token::floatliteral, tokStart);
}

/// Lex an identifier that starts with a prefix followed by suffix-id.
///
///   attribute-id ::= `#` suffix-id
///   ssa-id       ::= '%' suffix-id
///   block-id     ::= '^' suffix-id
///   type-id      ::= '!' suffix-id
///   suffix-id    ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
///   id-punct     ::= `$` | `.` | `_` | `-`
///
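/// For example, `#map0` lexes as a hash_identifier, `%arg1` as a
/// percent_identifier, `^bb0` as a caret_identifier, and `!my_type` as an
/// exclamation_identifier.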
Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
  Token::Kind kind;
  StringRef errorKind;
  switch (*tokStart) {
  case '#':
    kind = Token::hash_identifier;
    errorKind = "invalid attribute name";
    break;
  case '%':
    kind = Token::percent_identifier;
    errorKind = "invalid SSA name";
    break;
  case '^':
    kind = Token::caret_identifier;
    errorKind = "invalid block name";
    break;
  case '!':
    kind = Token::exclamation_identifier;
    errorKind = "invalid type identifier";
    break;
  default:
    llvm_unreachable("invalid caller");
  }

  // Parse suffix-id.
  if (isdigit(*curPtr)) {
    // If suffix-id starts with a digit, the rest must be digits.
    while (isdigit(*curPtr))
      ++curPtr;
  } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
    do {
      ++curPtr;
    } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
  } else if (curPtr == codeCompleteLoc) {
    return formToken(Token::code_complete, tokStart);
  } else {
    return emitError(curPtr - 1, errorKind);
  }

  // Check for a code completion within the identifier.
  if (codeCompleteLoc && codeCompleteLoc >= tokStart &&
      codeCompleteLoc <= curPtr) {
    return Token(Token::code_complete,
                 StringRef(tokStart, codeCompleteLoc - tokStart));
  }

  return formToken(kind, tokStart);
}

/// Lex a string literal.
///
///   string-literal ::= '"' [^"\n\f\v\r]* '"'
///
/// TODO: define escaping rules.
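/// The escapes currently accepted below are `\"`, `\\`, `\n`, `\t`, and a
/// two-hex-digit form such as `\41`; any other character after a backslash is
/// rejected with an error.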
Token Lexer::lexString(const char *tokStart) {
  assert(curPtr[-1] == '"');

  while (true) {
    // Check to see if there is a code completion location within the string.
    // In these cases we generate a completion location and place the currently
    // lexed string within the token. This allows the parser to use the
    // partially lexed string when computing the completion results.
    if (curPtr == codeCompleteLoc)
      return formToken(Token::code_complete, tokStart);

    switch (*curPtr++) {
    case '"':
      return formToken(Token::string, tokStart);
    case 0:
      // If this is a random nul character in the middle of a string, just
      // include it. If it is the end of file, then it is an error.
      if (curPtr - 1 != curBuffer.end())
        continue;
      [[fallthrough]];
    case '\n':
    case '\v':
    case '\f':
      return emitError(curPtr - 1, "expected '\"' in string literal");
    case '\\':
      // Handle explicitly a few escapes.
      if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't')
        ++curPtr;
      else if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1]))
        // Support \xx for two hex digits.
        curPtr += 2;
      else
        return emitError(curPtr - 1, "unknown escape in string literal");
      continue;

    default:
      continue;
    }
  }
}