1//===- Lexer.cpp ----------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "Lexer.h"
10#include "mlir/Tools/PDLL/AST/Diagnostic.h"
11#include "mlir/Tools/PDLL/Parser/CodeComplete.h"
12#include "llvm/ADT/StringExtras.h"
13#include "llvm/ADT/StringSwitch.h"
14#include "llvm/Support/SourceMgr.h"
15
16using namespace mlir;
17using namespace mlir::pdll;
18
19//===----------------------------------------------------------------------===//
20// Token
21//===----------------------------------------------------------------------===//
22
23std::string Token::getStringValue() const {
24 assert(getKind() == string || getKind() == string_block ||
25 getKind() == code_complete_string);
26
27 // Start by dropping the quotes.
28 StringRef bytes = getSpelling();
29 if (is(k: string))
30 bytes = bytes.drop_front().drop_back();
31 else if (is(k: string_block))
32 bytes = bytes.drop_front(N: 2).drop_back(N: 2);
33
34 std::string result;
35 result.reserve(res: bytes.size());
36 for (unsigned i = 0, e = bytes.size(); i != e;) {
37 auto c = bytes[i++];
38 if (c != '\\') {
39 result.push_back(c: c);
40 continue;
41 }
42
43 assert(i + 1 <= e && "invalid string should be caught by lexer");
44 auto c1 = bytes[i++];
45 switch (c1) {
46 case '"':
47 case '\\':
48 result.push_back(c: c1);
49 continue;
50 case 'n':
51 result.push_back(c: '\n');
52 continue;
53 case 't':
54 result.push_back(c: '\t');
55 continue;
56 default:
57 break;
58 }
59
60 assert(i + 1 <= e && "invalid string should be caught by lexer");
61 auto c2 = bytes[i++];
62
63 assert(llvm::isHexDigit(c1) && llvm::isHexDigit(c2) && "invalid escape");
64 result.push_back(c: (llvm::hexDigitValue(C: c1) << 4) | llvm::hexDigitValue(C: c2));
65 }
66
67 return result;
68}
69
70//===----------------------------------------------------------------------===//
71// Lexer
72//===----------------------------------------------------------------------===//
73
74Lexer::Lexer(llvm::SourceMgr &mgr, ast::DiagnosticEngine &diagEngine,
75 CodeCompleteContext *codeCompleteContext)
76 : srcMgr(mgr), diagEngine(diagEngine), addedHandlerToDiagEngine(false),
77 codeCompletionLocation(nullptr) {
78 curBufferID = mgr.getMainFileID();
79 curBuffer = srcMgr.getMemoryBuffer(i: curBufferID)->getBuffer();
80 curPtr = curBuffer.begin();
81
82 // Set the code completion location if necessary.
83 if (codeCompleteContext) {
84 codeCompletionLocation =
85 codeCompleteContext->getCodeCompleteLoc().getPointer();
86 }
87
88 // If the diag engine has no handler, add a default that emits to the
89 // SourceMgr.
90 if (!diagEngine.getHandlerFn()) {
91 diagEngine.setHandlerFn([&](const ast::Diagnostic &diag) {
92 srcMgr.PrintMessage(Loc: diag.getLocation().Start, Kind: diag.getSeverity(),
93 Msg: diag.getMessage());
94 for (const ast::Diagnostic &note : diag.getNotes())
95 srcMgr.PrintMessage(Loc: note.getLocation().Start, Kind: note.getSeverity(),
96 Msg: note.getMessage());
97 });
98 addedHandlerToDiagEngine = true;
99 }
100}
101
102Lexer::~Lexer() {
103 if (addedHandlerToDiagEngine)
104 diagEngine.setHandlerFn(nullptr);
105}
106
107LogicalResult Lexer::pushInclude(StringRef filename, SMRange includeLoc) {
108 std::string includedFile;
109 int bufferID =
110 srcMgr.AddIncludeFile(Filename: filename.str(), IncludeLoc: includeLoc.End, IncludedFile&: includedFile);
111 if (!bufferID)
112 return failure();
113
114 curBufferID = bufferID;
115 curBuffer = srcMgr.getMemoryBuffer(i: curBufferID)->getBuffer();
116 curPtr = curBuffer.begin();
117 return success();
118}
119
120Token Lexer::emitError(SMRange loc, const Twine &msg) {
121 diagEngine.emitError(loc, msg);
122 return formToken(kind: Token::error, tokStart: loc.Start.getPointer());
123}
124Token Lexer::emitErrorAndNote(SMRange loc, const Twine &msg, SMRange noteLoc,
125 const Twine &note) {
126 diagEngine.emitError(loc, msg)->attachNote(msg: note, noteLoc);
127 return formToken(kind: Token::error, tokStart: loc.Start.getPointer());
128}
129Token Lexer::emitError(const char *loc, const Twine &msg) {
130 return emitError(
131 loc: SMRange(SMLoc::getFromPointer(Ptr: loc), SMLoc::getFromPointer(Ptr: loc + 1)), msg);
132}
133
134int Lexer::getNextChar() {
135 char curChar = *curPtr++;
136 switch (curChar) {
137 default:
138 return static_cast<unsigned char>(curChar);
139 case 0: {
140 // A nul character in the stream is either the end of the current buffer
141 // or a random nul in the file. Disambiguate that here.
142 if (curPtr - 1 != curBuffer.end())
143 return 0;
144
145 // Otherwise, return end of file.
146 --curPtr;
147 return EOF;
148 }
149 case '\n':
150 case '\r':
151 // Handle the newline character by ignoring it and incrementing the line
152 // count. However, be careful about 'dos style' files with \n\r in them.
153 // Only treat a \n\r or \r\n as a single line.
154 if ((*curPtr == '\n' || (*curPtr == '\r')) && *curPtr != curChar)
155 ++curPtr;
156 return '\n';
157 }
158}
159
160Token Lexer::lexToken() {
161 while (true) {
162 const char *tokStart = curPtr;
163
164 // Check to see if this token is at the code completion location.
165 if (tokStart == codeCompletionLocation)
166 return formToken(kind: Token::code_complete, tokStart);
167
168 // This always consumes at least one character.
169 int curChar = getNextChar();
170 switch (curChar) {
171 default:
172 // Handle identifiers: [a-zA-Z_]
173 if (isalpha(curChar) || curChar == '_')
174 return lexIdentifier(tokStart);
175
176 // Unknown character, emit an error.
177 return emitError(loc: tokStart, msg: "unexpected character");
178 case EOF: {
179 // Return EOF denoting the end of lexing.
180 Token eof = formToken(kind: Token::eof, tokStart);
181
182 // Check to see if we are in an included file.
183 SMLoc parentIncludeLoc = srcMgr.getParentIncludeLoc(i: curBufferID);
184 if (parentIncludeLoc.isValid()) {
185 curBufferID = srcMgr.FindBufferContainingLoc(Loc: parentIncludeLoc);
186 curBuffer = srcMgr.getMemoryBuffer(i: curBufferID)->getBuffer();
187 curPtr = parentIncludeLoc.getPointer();
188 }
189
190 return eof;
191 }
192
193 // Lex punctuation.
194 case '-':
195 if (*curPtr == '>') {
196 ++curPtr;
197 return formToken(kind: Token::arrow, tokStart);
198 }
199 return emitError(loc: tokStart, msg: "unexpected character");
200 case ':':
201 return formToken(kind: Token::colon, tokStart);
202 case ',':
203 return formToken(kind: Token::comma, tokStart);
204 case '.':
205 return formToken(kind: Token::dot, tokStart);
206 case '=':
207 if (*curPtr == '>') {
208 ++curPtr;
209 return formToken(kind: Token::equal_arrow, tokStart);
210 }
211 return formToken(kind: Token::equal, tokStart);
212 case ';':
213 return formToken(kind: Token::semicolon, tokStart);
214 case '[':
215 if (*curPtr == '{') {
216 ++curPtr;
217 return lexString(tokStart, /*isStringBlock=*/true);
218 }
219 return formToken(kind: Token::l_square, tokStart);
220 case ']':
221 return formToken(kind: Token::r_square, tokStart);
222
223 case '<':
224 return formToken(kind: Token::less, tokStart);
225 case '>':
226 return formToken(kind: Token::greater, tokStart);
227 case '{':
228 return formToken(kind: Token::l_brace, tokStart);
229 case '}':
230 return formToken(kind: Token::r_brace, tokStart);
231 case '(':
232 return formToken(kind: Token::l_paren, tokStart);
233 case ')':
234 return formToken(kind: Token::r_paren, tokStart);
235 case '/':
236 if (*curPtr == '/') {
237 lexComment();
238 continue;
239 }
240 return emitError(loc: tokStart, msg: "unexpected character");
241
242 // Ignore whitespace characters.
243 case 0:
244 case ' ':
245 case '\t':
246 case '\n':
247 return lexToken();
248
249 case '#':
250 return lexDirective(tokStart);
251 case '"':
252 return lexString(tokStart, /*isStringBlock=*/false);
253
254 case '0':
255 case '1':
256 case '2':
257 case '3':
258 case '4':
259 case '5':
260 case '6':
261 case '7':
262 case '8':
263 case '9':
264 return lexNumber(tokStart);
265 }
266 }
267}
268
269/// Skip a comment line, starting with a '//'.
270void Lexer::lexComment() {
271 // Advance over the second '/' in a '//' comment.
272 assert(*curPtr == '/');
273 ++curPtr;
274
275 while (true) {
276 switch (*curPtr++) {
277 case '\n':
278 case '\r':
279 // Newline is end of comment.
280 return;
281 case 0:
282 // If this is the end of the buffer, end the comment.
283 if (curPtr - 1 == curBuffer.end()) {
284 --curPtr;
285 return;
286 }
287 [[fallthrough]];
288 default:
289 // Skip over other characters.
290 break;
291 }
292 }
293}
294
295Token Lexer::lexDirective(const char *tokStart) {
296 // Match the rest with an identifier regex: [0-9a-zA-Z_]*
297 while (isalnum(*curPtr) || *curPtr == '_')
298 ++curPtr;
299
300 StringRef str(tokStart, curPtr - tokStart);
301 return Token(Token::directive, str);
302}
303
304Token Lexer::lexIdentifier(const char *tokStart) {
305 // Match the rest of the identifier regex: [0-9a-zA-Z_]*
306 while (isalnum(*curPtr) || *curPtr == '_')
307 ++curPtr;
308
309 // Check to see if this identifier is a keyword.
310 StringRef str(tokStart, curPtr - tokStart);
311 Token::Kind kind = StringSwitch<Token::Kind>(str)
312 .Case(S: "attr", Value: Token::kw_attr)
313 .Case(S: "Attr", Value: Token::kw_Attr)
314 .Case(S: "erase", Value: Token::kw_erase)
315 .Case(S: "let", Value: Token::kw_let)
316 .Case(S: "Constraint", Value: Token::kw_Constraint)
317 .Case(S: "not", Value: Token::kw_not)
318 .Case(S: "op", Value: Token::kw_op)
319 .Case(S: "Op", Value: Token::kw_Op)
320 .Case(S: "OpName", Value: Token::kw_OpName)
321 .Case(S: "Pattern", Value: Token::kw_Pattern)
322 .Case(S: "replace", Value: Token::kw_replace)
323 .Case(S: "return", Value: Token::kw_return)
324 .Case(S: "rewrite", Value: Token::kw_rewrite)
325 .Case(S: "Rewrite", Value: Token::kw_Rewrite)
326 .Case(S: "type", Value: Token::kw_type)
327 .Case(S: "Type", Value: Token::kw_Type)
328 .Case(S: "TypeRange", Value: Token::kw_TypeRange)
329 .Case(S: "Value", Value: Token::kw_Value)
330 .Case(S: "ValueRange", Value: Token::kw_ValueRange)
331 .Case(S: "with", Value: Token::kw_with)
332 .Case(S: "_", Value: Token::underscore)
333 .Default(Value: Token::identifier);
334 return Token(kind, str);
335}
336
337Token Lexer::lexNumber(const char *tokStart) {
338 assert(isdigit(curPtr[-1]));
339
340 // Handle the normal decimal case.
341 while (isdigit(*curPtr))
342 ++curPtr;
343
344 return formToken(kind: Token::integer, tokStart);
345}
346
347Token Lexer::lexString(const char *tokStart, bool isStringBlock) {
348 while (true) {
349 // Check to see if there is a code completion location within the string. In
350 // these cases we generate a completion location and place the currently
351 // lexed string within the token (without the quotes). This allows for the
352 // parser to use the partially lexed string when computing the completion
353 // results.
354 if (curPtr == codeCompletionLocation) {
355 return formToken(kind: Token::code_complete_string,
356 tokStart: tokStart + (isStringBlock ? 2 : 1));
357 }
358
359 switch (*curPtr++) {
360 case '"':
361 // If this is a string block, we only end the string when we encounter a
362 // `}]`.
363 if (!isStringBlock)
364 return formToken(kind: Token::string, tokStart);
365 continue;
366 case '}':
367 // If this is a string block, we only end the string when we encounter a
368 // `}]`.
369 if (!isStringBlock || *curPtr != ']')
370 continue;
371 ++curPtr;
372 return formToken(kind: Token::string_block, tokStart);
373 case 0: {
374 // If this is a random nul character in the middle of a string, just
375 // include it. If it is the end of file, then it is an error.
376 if (curPtr - 1 != curBuffer.end())
377 continue;
378 --curPtr;
379
380 StringRef expectedEndStr = isStringBlock ? "}]" : "\"";
381 return emitError(loc: curPtr - 1,
382 msg: "expected '" + expectedEndStr + "' in string literal");
383 }
384
385 case '\n':
386 case '\v':
387 case '\f':
388 // String blocks allow multiple lines.
389 if (!isStringBlock)
390 return emitError(loc: curPtr - 1, msg: "expected '\"' in string literal");
391 continue;
392
393 case '\\':
394 // Handle explicitly a few escapes.
395 if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' ||
396 *curPtr == 't') {
397 ++curPtr;
398 } else if (llvm::isHexDigit(C: *curPtr) && llvm::isHexDigit(C: curPtr[1])) {
399 // Support \xx for two hex digits.
400 curPtr += 2;
401 } else {
402 return emitError(loc: curPtr - 1, msg: "unknown escape in string literal");
403 }
404 continue;
405
406 default:
407 continue;
408 }
409 }
410}
411

Provided by KDAB

Privacy Policy
Improve your Profiling and Debugging skills
Find out more

source code of mlir/lib/Tools/PDLL/Parser/Lexer.cpp