Lexer.cpp source code [mlir/lib/AsmParser/Lexer.cpp]

1	//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements the lexer for the MLIR textual form.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "Lexer.h"
14	#include "Token.h"
15	#include "mlir/AsmParser/CodeComplete.h"
16	#include "mlir/IR/Diagnostics.h"
17	#include "mlir/IR/Location.h"
18	#include "mlir/IR/MLIRContext.h"
19	#include "mlir/Support/LLVM.h"
20	#include "llvm/ADT/STLExtras.h"
21	#include "llvm/ADT/StringExtras.h"
22	#include "llvm/ADT/StringSwitch.h"
23	#include "llvm/Support/ErrorHandling.h"
24	#include "llvm/Support/SourceMgr.h"
25	#include <cassert>
26	#include <cctype>
27
28	using namespace mlir;
29
30	// Returns true if 'c' is an allowable punctuation character: [$._-]
31	// Returns false otherwise.
/// Returns true if `c` is one of the punctuation characters permitted inside
/// a prefixed identifier's suffix: `$`, `.`, `_` or `-`. Returns false for
/// every other character.
static bool isPunct(char c) {
  return c == '$' || c == '.' || c == '_' || c == '-';
}
35
36	Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
37	AsmParserCodeCompleteContext *codeCompleteContext)
38	: sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
39	auto bufferID = sourceMgr.getMainFileID();
40	curBuffer = sourceMgr.getMemoryBuffer(i: bufferID)->getBuffer();
41	curPtr = curBuffer.begin();
42
43	// Set the code completion location if it was provided.
44	if (codeCompleteContext)
45	codeCompleteLoc = codeCompleteContext->getCodeCompleteLoc().getPointer();
46	}
47
48	/// Encode the specified source location information into an attribute for
49	/// attachment to the IR.
50	Location Lexer::getEncodedSourceLocation(SMLoc loc) {
51	auto &sourceMgr = getSourceMgr();
52	unsigned mainFileID = sourceMgr.getMainFileID();
53
54	// TODO: Fix performance issues in SourceMgr::getLineAndColumn so that we can
55	// use it here.
56	auto &bufferInfo = sourceMgr.getBufferInfo(i: mainFileID);
57	unsigned lineNo = bufferInfo.getLineNumber(Ptr: loc.getPointer());
58	unsigned column =
59	(loc.getPointer() - bufferInfo.getPointerForLineNumber(LineNo: lineNo)) + `1`;
60	auto *buffer = sourceMgr.getMemoryBuffer(i: mainFileID);
61
62	return FileLineColLoc::get(context, fileName: buffer->getBufferIdentifier(), line: lineNo,
63	column);
64	}
65
66	/// emitError - Emit an error message and return an Token::error token.
67	Token Lexer::emitError(const char loc, const* Twine &message) {
68	mlir::emitError(loc: getEncodedSourceLocation(loc: SMLoc::getFromPointer(Ptr: loc)),
69	message);
70	return formToken(kind: Token::error, tokStart: loc);
71	}
72
73	Token Lexer::lexToken() {
74	while (true) {
75	const char *tokStart = curPtr;
76
77	// Check to see if the current token is at the code completion location.
78	if (tokStart == codeCompleteLoc)
79	return formToken(kind: Token::code_complete, tokStart);
80
81	// Lex the next token.
82	switch (*curPtr++) {
83	default:
84	// Handle bare identifiers.
85	if (isalpha(curPtr[-`1`]))
86	return lexBareIdentifierOrKeyword(tokStart);
87
88	// Unknown character, emit an error.
89	return emitError(loc: tokStart, message: "unexpected character");
90
91	case `' '`:
92	case `'\t'`:
93	case `'\n'`:
94	case `'\r'`:
95	// Handle whitespace.
96	continue;
97
98	case `'_'`:
99	// Handle bare identifiers.
100	return lexBareIdentifierOrKeyword(tokStart);
101
102	case `0`:
103	// This may either be a nul character in the source file or may be the EOF
104	// marker that llvm::MemoryBuffer guarantees will be there.
105	if (curPtr - `1` == curBuffer.end())
106	return formToken(kind: Token::eof, tokStart);
107	continue;
108
109	case `':'`:
110	return formToken(kind: Token::colon, tokStart);
111	case `','`:
112	return formToken(kind: Token::comma, tokStart);
113	case `'.'`:
114	return lexEllipsis(tokStart);
115	case `'('`:
116	return formToken(kind: Token::l_paren, tokStart);
117	case `')'`:
118	return formToken(kind: Token::r_paren, tokStart);
119	case `'{'`:
120	if (curPtr == `'-'` && (curPtr + `1`) == `'#'`) {
121	curPtr += `2`;
122	return formToken(kind: Token::file_metadata_begin, tokStart);
123	}
124	return formToken(kind: Token::l_brace, tokStart);
125	case `'}'`:
126	return formToken(kind: Token::r_brace, tokStart);
127	case `'['`:
128	return formToken(kind: Token::l_square, tokStart);
129	case `']'`:
130	return formToken(kind: Token::r_square, tokStart);
131	case `'<'`:
132	return formToken(kind: Token::less, tokStart);
133	case `'>'`:
134	return formToken(kind: Token::greater, tokStart);
135	case `'='`:
136	return formToken(kind: Token::equal, tokStart);
137
138	case `'+'`:
139	return formToken(kind: Token::plus, tokStart);
140	case `'*'`:
141	return formToken(kind: Token::star, tokStart);
142	case `'-'`:
143	if (*curPtr == `'>'`) {
144	++curPtr;
145	return formToken(kind: Token::arrow, tokStart);
146	}
147	return formToken(kind: Token::minus, tokStart);
148
149	case `'?'`:
150	return formToken(kind: Token::question, tokStart);
151
152	case `'\|'`:
153	return formToken(kind: Token::vertical_bar, tokStart);
154
155	case `'/'`:
156	if (*curPtr == `'/'`) {
157	skipComment();
158	continue;
159	}
160	return formToken(kind: Token::slash, tokStart);
161
162	case `'@'`:
163	return lexAtIdentifier(tokStart);
164
165	case `'#'`:
166	if (curPtr == `'-'` && (curPtr + `1`) == `'}'`) {
167	curPtr += `2`;
168	return formToken(kind: Token::file_metadata_end, tokStart);
169	}
170	[[fallthrough]];
171	case `'!'`:
172	case `'^'`:
173	case `'%'`:
174	return lexPrefixedIdentifier(tokStart);
175	case `'"'`:
176	return lexString(tokStart);
177
178	case `'0'`:
179	case `'1'`:
180	case `'2'`:
181	case `'3'`:
182	case `'4'`:
183	case `'5'`:
184	case `'6'`:
185	case `'7'`:
186	case `'8'`:
187	case `'9'`:
188	return lexNumber(tokStart);
189	}
190	}
191	}
192
193	/// Lex an '@foo' identifier.
194	///
195	/// symbol-ref-id ::= `@` (bare-id \| string-literal)
196	///
197	Token Lexer::lexAtIdentifier(const char *tokStart) {
198	char cur = *curPtr++;
199
200	// Try to parse a string literal, if present.
201	if (cur == `'"'`) {
202	Token stringIdentifier = lexString(tokStart: curPtr);
203	if (stringIdentifier.is(k: Token::error))
204	return stringIdentifier;
205	return formToken(kind: Token::at_identifier, tokStart);
206	}
207
208	// Otherwise, these always start with a letter or underscore.
209	if (!isalpha(cur) && cur != `'_'`)
210	return emitError(loc: curPtr - `1`,
211	message: "@ identifier expected to start with letter or '_'");
212
213	while (isalpha(curPtr) \|\| isdigit(curPtr) \|\| *curPtr == `'_'` \|\|
214	curPtr == `'$'` \|\| curPtr == `'.'`)
215	++curPtr;
216	return formToken(kind: Token::at_identifier, tokStart);
217	}
218
219	/// Lex a bare identifier or keyword that starts with a letter.
220	///
221	/// bare-id ::= (letter\|[_]) (letter\|digit\|[_$.])*
222	/// integer-type ::= `[su]?i[1-9][0-9]`*
223	///
224	Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
225	// Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
226	while (isalpha(curPtr) \|\| isdigit(curPtr) \|\| *curPtr == `'_'` \|\|
227	curPtr == `'$'` \|\| curPtr == `'.'`)
228	++curPtr;
229
230	// Check to see if this identifier is a keyword.
231	StringRef spelling(tokStart, curPtr - tokStart);
232
233	auto isAllDigit = [](StringRef str) {
234	return llvm::all_of(Range&: str, P: llvm::isDigit);
235	};
236
237	// Check for i123, si456, ui789.
238	if ((spelling.size() > `1` && tokStart[`0`] == `'i'` &&
239	isAllDigit (spelling.drop_front())) \|\|
240	((spelling.size() > `2` && tokStart[`1`] == `'i'` &&
241	(tokStart[`0`] == `'s'` \|\| tokStart[`0`] == `'u'`)) &&
242	isAllDigit (spelling.drop_front(N: `2`))))
243	return Token (Token::inttype, spelling);
244
245	Token::Kind kind = StringSwitch<Token::Kind>(spelling)
246	#define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
247	#include "TokenKinds.def"
248	.Default(Value: Token::bare_identifier);
249
250	return Token (kind, spelling);
251	}
252
253	/// Skip a comment line, starting with a '//'.
254	///
255	/// TODO: add a regex for comments here and to the spec.
256	///
257	void Lexer::skipComment() {
258	// Advance over the second '/' in a '//' comment.
259	assert(*curPtr == `'/'`);
260	++curPtr;
261
262	while (true) {
263	switch (*curPtr++) {
264	case `'\n'`:
265	case `'\r'`:
266	// Newline is end of comment.
267	return;
268	case `0`:
269	// If this is the end of the buffer, end the comment.
270	if (curPtr - `1` == curBuffer.end()) {
271	--curPtr;
272	return;
273	}
274	[[fallthrough]];
275	default:
276	// Skip over other characters.
277	break;
278	}
279	}
280	}
281
282	/// Lex an ellipsis.
283	///
284	/// ellipsis ::= '...'
285	///
286	Token Lexer::lexEllipsis(const char *tokStart) {
287	assert(curPtr[-`1`] == `'.'`);
288
289	if (curPtr == curBuffer.end() \|\| curPtr != `'.'` \|\| (curPtr + `1`) != `'.'`)
290	return emitError(loc: curPtr, message: "expected three consecutive dots for an ellipsis");
291
292	curPtr += `2`;
293	return formToken(kind: Token::ellipsis, tokStart);
294	}
295
296	/// Lex a number literal.
297	///
298	/// integer-literal ::= digit+ \| `0x` hex_digit+
299	/// float-literal ::= [-+]?[0-9]+[.][0-9]([eE][-+]?[0-9]+)?*
300	///
301	Token Lexer::lexNumber(const char *tokStart) {
302	assert(isdigit(curPtr[-`1`]));
303
304	// Handle the hexadecimal case.
305	if (curPtr[-`1`] == `'0'` && *curPtr == `'x'`) {
306	// If we see stuff like 0xi32, this is a literal `0` followed by an
307	// identifier `xi32`, stop after `0`.
308	if (!isxdigit(curPtr[`1`]))
309	return formToken(kind: Token::integer, tokStart);
310
311	curPtr += `2`;
312	while (isxdigit(*curPtr))
313	++curPtr;
314
315	return formToken(kind: Token::integer, tokStart);
316	}
317
318	// Handle the normal decimal case.
319	while (isdigit(*curPtr))
320	++curPtr;
321
322	if (*curPtr != `'.'`)
323	return formToken(kind: Token::integer, tokStart);
324	++curPtr;
325
326	// Skip over [0-9]([eE][-+]?[0-9]+)?*
327	while (isdigit(*curPtr))
328	++curPtr;
329
330	if (curPtr == `'e'` \|\| curPtr == `'E'`) {
331	if (isdigit(static_cast<unsigned char>(curPtr[`1`])) \|\|
332	((curPtr[`1`] == `'-'` \|\| curPtr[`1`] == `'+'`) &&
333	isdigit(static_cast<unsigned char>(curPtr[`2`])))) {
334	curPtr += `2`;
335	while (isdigit(*curPtr))
336	++curPtr;
337	}
338	}
339	return formToken(kind: Token::floatliteral, tokStart);
340	}
341
342	/// Lex an identifier that starts with a prefix followed by suffix-id.
343	///
344	/// attribute-id ::= `#` suffix-id
345	/// ssa-id ::= '%' suffix-id
346	/// block-id ::= '^' suffix-id
347	/// type-id ::= '!' suffix-id
348	/// suffix-id ::= digit+ \| (letter\|id-punct) (letter\|id-punct\|digit)*
349	/// id-punct ::= `$` \| `.` \| `_` \| `-`
350	///
351	Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
352	Token::Kind kind;
353	StringRef errorKind;
354	switch (*tokStart) {
355	case `'#'`:
356	kind = Token::hash_identifier;
357	errorKind = "invalid attribute name";
358	break;
359	case `'%'`:
360	kind = Token::percent_identifier;
361	errorKind = "invalid SSA name";
362	break;
363	case `'^'`:
364	kind = Token::caret_identifier;
365	errorKind = "invalid block name";
366	break;
367	case `'!'`:
368	kind = Token::exclamation_identifier;
369	errorKind = "invalid type identifier";
370	break;
371	default:
372	llvm_unreachable("invalid caller");
373	}
374
375	// Parse suffix-id.
376	if (isdigit(*curPtr)) {
377	// If suffix-id starts with a digit, the rest must be digits.
378	while (isdigit(*curPtr))
379	++curPtr;
380	} else if (isalpha(curPtr) \|\| isPunct(c: curPtr)) {
381	do {
382	++curPtr;
383	} while (isalpha(curPtr) \|\| isdigit(curPtr) \|\| isPunct(c: *curPtr));
384	} else if (curPtr == codeCompleteLoc) {
385	return formToken(kind: Token::code_complete, tokStart);
386	} else {
387	return emitError(loc: curPtr - `1`, message: errorKind);
388	}
389
390	// Check for a code completion within the identifier.
391	if (codeCompleteLoc && codeCompleteLoc >= tokStart &&
392	codeCompleteLoc <= curPtr) {
393	return Token (Token::code_complete,
394	StringRef(tokStart, codeCompleteLoc - tokStart));
395	}
396
397	return formToken(kind, tokStart);
398	}
399
400	/// Lex a string literal.
401	///
402	/// string-literal ::= '"' [^"\n\f\v\r] '"'*
403	///
404	/// TODO: define escaping rules.
405	Token Lexer::lexString(const char *tokStart) {
406	assert(curPtr[-`1`] == `'"'`);
407
408	while (true) {
409	// Check to see if there is a code completion location within the string. In
410	// these cases we generate a completion location and place the currently
411	// lexed string within the token. This allows for the parser to use the
412	// partially lexed string when computing the completion results.
413	if (curPtr == codeCompleteLoc)
414	return formToken(kind: Token::code_complete, tokStart);
415
416	switch (*curPtr++) {
417	case `'"'`:
418	return formToken(kind: Token::string, tokStart);
419	case `0`:
420	// If this is a random nul character in the middle of a string, just
421	// include it. If it is the end of file, then it is an error.
422	if (curPtr - `1` != curBuffer.end())
423	continue;
424	[[fallthrough]];
425	case `'\n'`:
426	case `'\v'`:
427	case `'\f'`:
428	return emitError(loc: curPtr - `1`, message: "expected '\"' in string literal");
429	case `'\\'`:
430	// Handle explicitly a few escapes.
431	if (curPtr == `'"'` \|\| curPtr == `'\\'` \|\| curPtr == `'n'` \|\| curPtr == `'t'`)
432	++curPtr;
433	else if (llvm::isHexDigit(C: *curPtr) && llvm::isHexDigit(C: curPtr[`1`]))
434	// Support \xx for two hex digits.
435	curPtr += `2`;
436	else
437	return emitError(loc: curPtr - `1`, message: "unknown escape in string literal");
438	continue;
439
440	default:
441	continue;
442	}
443	}
444	}
445

source code of mlir/lib/AsmParser/Lexer.cpp