| 1 | //===- Lexer.cpp ----------------------------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "Lexer.h" |
| 10 | #include "mlir/Tools/PDLL/AST/Diagnostic.h" |
| 11 | #include "mlir/Tools/PDLL/Parser/CodeComplete.h" |
| 12 | #include "llvm/ADT/StringExtras.h" |
| 13 | #include "llvm/ADT/StringSwitch.h" |
| 14 | #include "llvm/Support/SourceMgr.h" |
| 15 | |
| 16 | using namespace mlir; |
| 17 | using namespace mlir::pdll; |
| 18 | |
| 19 | //===----------------------------------------------------------------------===// |
| 20 | // Token |
| 21 | //===----------------------------------------------------------------------===// |
| 22 | |
| 23 | std::string Token::getStringValue() const { |
| 24 | assert(getKind() == string || getKind() == string_block || |
| 25 | getKind() == code_complete_string); |
| 26 | |
| 27 | // Start by dropping the quotes. |
| 28 | StringRef bytes = getSpelling(); |
| 29 | if (is(k: string)) |
| 30 | bytes = bytes.drop_front().drop_back(); |
| 31 | else if (is(k: string_block)) |
| 32 | bytes = bytes.drop_front(N: 2).drop_back(N: 2); |
| 33 | |
| 34 | std::string result; |
| 35 | result.reserve(res: bytes.size()); |
| 36 | for (unsigned i = 0, e = bytes.size(); i != e;) { |
| 37 | auto c = bytes[i++]; |
| 38 | if (c != '\\') { |
| 39 | result.push_back(c: c); |
| 40 | continue; |
| 41 | } |
| 42 | |
| 43 | assert(i + 1 <= e && "invalid string should be caught by lexer" ); |
| 44 | auto c1 = bytes[i++]; |
| 45 | switch (c1) { |
| 46 | case '"': |
| 47 | case '\\': |
| 48 | result.push_back(c: c1); |
| 49 | continue; |
| 50 | case 'n': |
| 51 | result.push_back(c: '\n'); |
| 52 | continue; |
| 53 | case 't': |
| 54 | result.push_back(c: '\t'); |
| 55 | continue; |
| 56 | default: |
| 57 | break; |
| 58 | } |
| 59 | |
| 60 | assert(i + 1 <= e && "invalid string should be caught by lexer" ); |
| 61 | auto c2 = bytes[i++]; |
| 62 | |
| 63 | assert(llvm::isHexDigit(c1) && llvm::isHexDigit(c2) && "invalid escape" ); |
| 64 | result.push_back(c: (llvm::hexDigitValue(C: c1) << 4) | llvm::hexDigitValue(C: c2)); |
| 65 | } |
| 66 | |
| 67 | return result; |
| 68 | } |
| 69 | |
| 70 | //===----------------------------------------------------------------------===// |
| 71 | // Lexer |
| 72 | //===----------------------------------------------------------------------===// |
| 73 | |
| 74 | Lexer::Lexer(llvm::SourceMgr &mgr, ast::DiagnosticEngine &diagEngine, |
| 75 | CodeCompleteContext *codeCompleteContext) |
| 76 | : srcMgr(mgr), diagEngine(diagEngine), addedHandlerToDiagEngine(false), |
| 77 | codeCompletionLocation(nullptr) { |
| 78 | curBufferID = mgr.getMainFileID(); |
| 79 | curBuffer = srcMgr.getMemoryBuffer(i: curBufferID)->getBuffer(); |
| 80 | curPtr = curBuffer.begin(); |
| 81 | |
| 82 | // Set the code completion location if necessary. |
| 83 | if (codeCompleteContext) { |
| 84 | codeCompletionLocation = |
| 85 | codeCompleteContext->getCodeCompleteLoc().getPointer(); |
| 86 | } |
| 87 | |
| 88 | // If the diag engine has no handler, add a default that emits to the |
| 89 | // SourceMgr. |
| 90 | if (!diagEngine.getHandlerFn()) { |
| 91 | diagEngine.setHandlerFn([&](const ast::Diagnostic &diag) { |
| 92 | srcMgr.PrintMessage(Loc: diag.getLocation().Start, Kind: diag.getSeverity(), |
| 93 | Msg: diag.getMessage()); |
| 94 | for (const ast::Diagnostic ¬e : diag.getNotes()) |
| 95 | srcMgr.PrintMessage(Loc: note.getLocation().Start, Kind: note.getSeverity(), |
| 96 | Msg: note.getMessage()); |
| 97 | }); |
| 98 | addedHandlerToDiagEngine = true; |
| 99 | } |
| 100 | } |
| 101 | |
| 102 | Lexer::~Lexer() { |
| 103 | if (addedHandlerToDiagEngine) |
| 104 | diagEngine.setHandlerFn(nullptr); |
| 105 | } |
| 106 | |
| 107 | LogicalResult Lexer::pushInclude(StringRef filename, SMRange includeLoc) { |
| 108 | std::string includedFile; |
| 109 | int bufferID = |
| 110 | srcMgr.AddIncludeFile(Filename: filename.str(), IncludeLoc: includeLoc.End, IncludedFile&: includedFile); |
| 111 | if (!bufferID) |
| 112 | return failure(); |
| 113 | |
| 114 | curBufferID = bufferID; |
| 115 | curBuffer = srcMgr.getMemoryBuffer(i: curBufferID)->getBuffer(); |
| 116 | curPtr = curBuffer.begin(); |
| 117 | return success(); |
| 118 | } |
| 119 | |
| 120 | Token Lexer::emitError(SMRange loc, const Twine &msg) { |
| 121 | diagEngine.emitError(loc, msg); |
| 122 | return formToken(kind: Token::error, tokStart: loc.Start.getPointer()); |
| 123 | } |
| 124 | Token Lexer::emitErrorAndNote(SMRange loc, const Twine &msg, SMRange noteLoc, |
| 125 | const Twine ¬e) { |
| 126 | diagEngine.emitError(loc, msg)->attachNote(msg: note, noteLoc); |
| 127 | return formToken(kind: Token::error, tokStart: loc.Start.getPointer()); |
| 128 | } |
| 129 | Token Lexer::emitError(const char *loc, const Twine &msg) { |
| 130 | return emitError( |
| 131 | loc: SMRange(SMLoc::getFromPointer(Ptr: loc), SMLoc::getFromPointer(Ptr: loc + 1)), msg); |
| 132 | } |
| 133 | |
| 134 | int Lexer::getNextChar() { |
| 135 | char curChar = *curPtr++; |
| 136 | switch (curChar) { |
| 137 | default: |
| 138 | return static_cast<unsigned char>(curChar); |
| 139 | case 0: { |
| 140 | // A nul character in the stream is either the end of the current buffer |
| 141 | // or a random nul in the file. Disambiguate that here. |
| 142 | if (curPtr - 1 != curBuffer.end()) |
| 143 | return 0; |
| 144 | |
| 145 | // Otherwise, return end of file. |
| 146 | --curPtr; |
| 147 | return EOF; |
| 148 | } |
| 149 | case '\n': |
| 150 | case '\r': |
| 151 | // Handle the newline character by ignoring it and incrementing the line |
| 152 | // count. However, be careful about 'dos style' files with \n\r in them. |
| 153 | // Only treat a \n\r or \r\n as a single line. |
| 154 | if ((*curPtr == '\n' || (*curPtr == '\r')) && *curPtr != curChar) |
| 155 | ++curPtr; |
| 156 | return '\n'; |
| 157 | } |
| 158 | } |
| 159 | |
| 160 | Token Lexer::lexToken() { |
| 161 | while (true) { |
| 162 | const char *tokStart = curPtr; |
| 163 | |
| 164 | // Check to see if this token is at the code completion location. |
| 165 | if (tokStart == codeCompletionLocation) |
| 166 | return formToken(kind: Token::code_complete, tokStart); |
| 167 | |
| 168 | // This always consumes at least one character. |
| 169 | int curChar = getNextChar(); |
| 170 | switch (curChar) { |
| 171 | default: |
| 172 | // Handle identifiers: [a-zA-Z_] |
| 173 | if (isalpha(curChar) || curChar == '_') |
| 174 | return lexIdentifier(tokStart); |
| 175 | |
| 176 | // Unknown character, emit an error. |
| 177 | return emitError(loc: tokStart, msg: "unexpected character" ); |
| 178 | case EOF: { |
| 179 | // Return EOF denoting the end of lexing. |
| 180 | Token eof = formToken(kind: Token::eof, tokStart); |
| 181 | |
| 182 | // Check to see if we are in an included file. |
| 183 | SMLoc parentIncludeLoc = srcMgr.getParentIncludeLoc(i: curBufferID); |
| 184 | if (parentIncludeLoc.isValid()) { |
| 185 | curBufferID = srcMgr.FindBufferContainingLoc(Loc: parentIncludeLoc); |
| 186 | curBuffer = srcMgr.getMemoryBuffer(i: curBufferID)->getBuffer(); |
| 187 | curPtr = parentIncludeLoc.getPointer(); |
| 188 | } |
| 189 | |
| 190 | return eof; |
| 191 | } |
| 192 | |
| 193 | // Lex punctuation. |
| 194 | case '-': |
| 195 | if (*curPtr == '>') { |
| 196 | ++curPtr; |
| 197 | return formToken(kind: Token::arrow, tokStart); |
| 198 | } |
| 199 | return emitError(loc: tokStart, msg: "unexpected character" ); |
| 200 | case ':': |
| 201 | return formToken(kind: Token::colon, tokStart); |
| 202 | case ',': |
| 203 | return formToken(kind: Token::comma, tokStart); |
| 204 | case '.': |
| 205 | return formToken(kind: Token::dot, tokStart); |
| 206 | case '=': |
| 207 | if (*curPtr == '>') { |
| 208 | ++curPtr; |
| 209 | return formToken(kind: Token::equal_arrow, tokStart); |
| 210 | } |
| 211 | return formToken(kind: Token::equal, tokStart); |
| 212 | case ';': |
| 213 | return formToken(kind: Token::semicolon, tokStart); |
| 214 | case '[': |
| 215 | if (*curPtr == '{') { |
| 216 | ++curPtr; |
| 217 | return lexString(tokStart, /*isStringBlock=*/true); |
| 218 | } |
| 219 | return formToken(kind: Token::l_square, tokStart); |
| 220 | case ']': |
| 221 | return formToken(kind: Token::r_square, tokStart); |
| 222 | |
| 223 | case '<': |
| 224 | return formToken(kind: Token::less, tokStart); |
| 225 | case '>': |
| 226 | return formToken(kind: Token::greater, tokStart); |
| 227 | case '{': |
| 228 | return formToken(kind: Token::l_brace, tokStart); |
| 229 | case '}': |
| 230 | return formToken(kind: Token::r_brace, tokStart); |
| 231 | case '(': |
| 232 | return formToken(kind: Token::l_paren, tokStart); |
| 233 | case ')': |
| 234 | return formToken(kind: Token::r_paren, tokStart); |
| 235 | case '/': |
| 236 | if (*curPtr == '/') { |
| 237 | lexComment(); |
| 238 | continue; |
| 239 | } |
| 240 | return emitError(loc: tokStart, msg: "unexpected character" ); |
| 241 | |
| 242 | // Ignore whitespace characters. |
| 243 | case 0: |
| 244 | case ' ': |
| 245 | case '\t': |
| 246 | case '\n': |
| 247 | return lexToken(); |
| 248 | |
| 249 | case '#': |
| 250 | return lexDirective(tokStart); |
| 251 | case '"': |
| 252 | return lexString(tokStart, /*isStringBlock=*/false); |
| 253 | |
| 254 | case '0': |
| 255 | case '1': |
| 256 | case '2': |
| 257 | case '3': |
| 258 | case '4': |
| 259 | case '5': |
| 260 | case '6': |
| 261 | case '7': |
| 262 | case '8': |
| 263 | case '9': |
| 264 | return lexNumber(tokStart); |
| 265 | } |
| 266 | } |
| 267 | } |
| 268 | |
| 269 | /// Skip a comment line, starting with a '//'. |
| 270 | void Lexer::() { |
| 271 | // Advance over the second '/' in a '//' comment. |
| 272 | assert(*curPtr == '/'); |
| 273 | ++curPtr; |
| 274 | |
| 275 | while (true) { |
| 276 | switch (*curPtr++) { |
| 277 | case '\n': |
| 278 | case '\r': |
| 279 | // Newline is end of comment. |
| 280 | return; |
| 281 | case 0: |
| 282 | // If this is the end of the buffer, end the comment. |
| 283 | if (curPtr - 1 == curBuffer.end()) { |
| 284 | --curPtr; |
| 285 | return; |
| 286 | } |
| 287 | [[fallthrough]]; |
| 288 | default: |
| 289 | // Skip over other characters. |
| 290 | break; |
| 291 | } |
| 292 | } |
| 293 | } |
| 294 | |
| 295 | Token Lexer::lexDirective(const char *tokStart) { |
| 296 | // Match the rest with an identifier regex: [0-9a-zA-Z_]* |
| 297 | while (isalnum(*curPtr) || *curPtr == '_') |
| 298 | ++curPtr; |
| 299 | |
| 300 | StringRef str(tokStart, curPtr - tokStart); |
| 301 | return Token(Token::directive, str); |
| 302 | } |
| 303 | |
| 304 | Token Lexer::lexIdentifier(const char *tokStart) { |
| 305 | // Match the rest of the identifier regex: [0-9a-zA-Z_]* |
| 306 | while (isalnum(*curPtr) || *curPtr == '_') |
| 307 | ++curPtr; |
| 308 | |
| 309 | // Check to see if this identifier is a keyword. |
| 310 | StringRef str(tokStart, curPtr - tokStart); |
| 311 | Token::Kind kind = StringSwitch<Token::Kind>(str) |
| 312 | .Case(S: "attr" , Value: Token::kw_attr) |
| 313 | .Case(S: "Attr" , Value: Token::kw_Attr) |
| 314 | .Case(S: "erase" , Value: Token::kw_erase) |
| 315 | .Case(S: "let" , Value: Token::kw_let) |
| 316 | .Case(S: "Constraint" , Value: Token::kw_Constraint) |
| 317 | .Case(S: "not" , Value: Token::kw_not) |
| 318 | .Case(S: "op" , Value: Token::kw_op) |
| 319 | .Case(S: "Op" , Value: Token::kw_Op) |
| 320 | .Case(S: "OpName" , Value: Token::kw_OpName) |
| 321 | .Case(S: "Pattern" , Value: Token::kw_Pattern) |
| 322 | .Case(S: "replace" , Value: Token::kw_replace) |
| 323 | .Case(S: "return" , Value: Token::kw_return) |
| 324 | .Case(S: "rewrite" , Value: Token::kw_rewrite) |
| 325 | .Case(S: "Rewrite" , Value: Token::kw_Rewrite) |
| 326 | .Case(S: "type" , Value: Token::kw_type) |
| 327 | .Case(S: "Type" , Value: Token::kw_Type) |
| 328 | .Case(S: "TypeRange" , Value: Token::kw_TypeRange) |
| 329 | .Case(S: "Value" , Value: Token::kw_Value) |
| 330 | .Case(S: "ValueRange" , Value: Token::kw_ValueRange) |
| 331 | .Case(S: "with" , Value: Token::kw_with) |
| 332 | .Case(S: "_" , Value: Token::underscore) |
| 333 | .Default(Value: Token::identifier); |
| 334 | return Token(kind, str); |
| 335 | } |
| 336 | |
| 337 | Token Lexer::lexNumber(const char *tokStart) { |
| 338 | assert(isdigit(curPtr[-1])); |
| 339 | |
| 340 | // Handle the normal decimal case. |
| 341 | while (isdigit(*curPtr)) |
| 342 | ++curPtr; |
| 343 | |
| 344 | return formToken(kind: Token::integer, tokStart); |
| 345 | } |
| 346 | |
| 347 | Token Lexer::lexString(const char *tokStart, bool isStringBlock) { |
| 348 | while (true) { |
| 349 | // Check to see if there is a code completion location within the string. In |
| 350 | // these cases we generate a completion location and place the currently |
| 351 | // lexed string within the token (without the quotes). This allows for the |
| 352 | // parser to use the partially lexed string when computing the completion |
| 353 | // results. |
| 354 | if (curPtr == codeCompletionLocation) { |
| 355 | return formToken(kind: Token::code_complete_string, |
| 356 | tokStart: tokStart + (isStringBlock ? 2 : 1)); |
| 357 | } |
| 358 | |
| 359 | switch (*curPtr++) { |
| 360 | case '"': |
| 361 | // If this is a string block, we only end the string when we encounter a |
| 362 | // `}]`. |
| 363 | if (!isStringBlock) |
| 364 | return formToken(kind: Token::string, tokStart); |
| 365 | continue; |
| 366 | case '}': |
| 367 | // If this is a string block, we only end the string when we encounter a |
| 368 | // `}]`. |
| 369 | if (!isStringBlock || *curPtr != ']') |
| 370 | continue; |
| 371 | ++curPtr; |
| 372 | return formToken(kind: Token::string_block, tokStart); |
| 373 | case 0: { |
| 374 | // If this is a random nul character in the middle of a string, just |
| 375 | // include it. If it is the end of file, then it is an error. |
| 376 | if (curPtr - 1 != curBuffer.end()) |
| 377 | continue; |
| 378 | --curPtr; |
| 379 | |
| 380 | StringRef expectedEndStr = isStringBlock ? "}]" : "\"" ; |
| 381 | return emitError(loc: curPtr - 1, |
| 382 | msg: "expected '" + expectedEndStr + "' in string literal" ); |
| 383 | } |
| 384 | |
| 385 | case '\n': |
| 386 | case '\v': |
| 387 | case '\f': |
| 388 | // String blocks allow multiple lines. |
| 389 | if (!isStringBlock) |
| 390 | return emitError(loc: curPtr - 1, msg: "expected '\"' in string literal" ); |
| 391 | continue; |
| 392 | |
| 393 | case '\\': |
| 394 | // Handle explicitly a few escapes. |
| 395 | if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || |
| 396 | *curPtr == 't') { |
| 397 | ++curPtr; |
| 398 | } else if (llvm::isHexDigit(C: *curPtr) && llvm::isHexDigit(C: curPtr[1])) { |
| 399 | // Support \xx for two hex digits. |
| 400 | curPtr += 2; |
| 401 | } else { |
| 402 | return emitError(loc: curPtr - 1, msg: "unknown escape in string literal" ); |
| 403 | } |
| 404 | continue; |
| 405 | |
| 406 | default: |
| 407 | continue; |
| 408 | } |
| 409 | } |
| 410 | } |
| 411 | |