Lexer.cpp source code [mlir/lib/Tools/PDLL/Parser/Lexer.cpp]

1	//===- Lexer.cpp ----------------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "Lexer.h"
10	#include "mlir/Tools/PDLL/AST/Diagnostic.h"
11	#include "mlir/Tools/PDLL/Parser/CodeComplete.h"
12	#include "llvm/ADT/StringExtras.h"
13	#include "llvm/ADT/StringSwitch.h"
14	#include "llvm/Support/SourceMgr.h"
15
16	using namespace mlir;
17	using namespace mlir::pdll;
18
19	//===----------------------------------------------------------------------===//
20	// Token
21	//===----------------------------------------------------------------------===//
22
23	std::string Token::getStringValue() const {
24	assert(getKind() == string \|\| getKind() == string_block \|\|
25	getKind() == code_complete_string);
26
27	// Start by dropping the quotes.
28	StringRef bytes = getSpelling();
29	if (is(k: string))
30	bytes = bytes.drop_front().drop_back();
31	else if (is(k: string_block))
32	bytes = bytes.drop_front(N: `2`).drop_back(N: `2`);
33
34	std::string result;
35	result.reserve(res: bytes.size());
36	for (unsigned i = `0`, e = bytes.size(); i != e;) {
37	auto c = bytes [i++];
38	if (c != `'\\'`) {
39	result.push_back(c: c);
40	continue;
41	}
42
43	assert(i + `1` <= e && "invalid string should be caught by lexer");
44	auto c1 = bytes [i++];
45	switch (c1) {
46	case `'"'`:
47	case `'\\'`:
48	result.push_back(c: c1);
49	continue;
50	case `'n'`:
51	result.push_back(c: `'\n'`);
52	continue;
53	case `'t'`:
54	result.push_back(c: `'\t'`);
55	continue;
56	default:
57	break;
58	}
59
60	assert(i + `1` <= e && "invalid string should be caught by lexer");
61	auto c2 = bytes [i++];
62
63	assert(llvm::isHexDigit(c1) && llvm::isHexDigit(c2) && "invalid escape");
64	result.push_back(c: (llvm::hexDigitValue(C: c1) << `4`) \| llvm::hexDigitValue(C: c2));
65	}
66
67	return result;
68	}
69
70	//===----------------------------------------------------------------------===//
71	// Lexer
72	//===----------------------------------------------------------------------===//
73
74	Lexer::Lexer(llvm::SourceMgr &mgr, ast::DiagnosticEngine &diagEngine,
75	CodeCompleteContext *codeCompleteContext)
76	: srcMgr(mgr), diagEngine(diagEngine), addedHandlerToDiagEngine(false),
77	codeCompletionLocation(nullptr) {
78	curBufferID = mgr.getMainFileID();
79	curBuffer = srcMgr.getMemoryBuffer(i: curBufferID)->getBuffer();
80	curPtr = curBuffer.begin();
81
82	// Set the code completion location if necessary.
83	if (codeCompleteContext) {
84	codeCompletionLocation =
85	codeCompleteContext->getCodeCompleteLoc().getPointer();
86	}
87
88	// If the diag engine has no handler, add a default that emits to the
89	// SourceMgr.
90	if (!diagEngine.getHandlerFn()) {
91	diagEngine.setHandlerFn([&](const ast::Diagnostic &diag) {
92	srcMgr.PrintMessage(Loc: diag.getLocation().Start, Kind: diag.getSeverity(),
93	Msg: diag.getMessage());
94	for (const ast::Diagnostic &note : diag.getNotes())
95	srcMgr.PrintMessage(Loc: note.getLocation().Start, Kind: note.getSeverity(),
96	Msg: note.getMessage());
97	});
98	addedHandlerToDiagEngine = true;
99	}
100	}
101
102	Lexer::~Lexer() {
103	if (addedHandlerToDiagEngine)
104	diagEngine.setHandlerFn(nullptr);
105	}
106
107	LogicalResult Lexer::pushInclude(StringRef filename, SMRange includeLoc) {
108	std::string includedFile;
109	int bufferID =
110	srcMgr.AddIncludeFile(Filename: filename.str(), IncludeLoc: includeLoc.End, IncludedFile&: includedFile);
111	if (!bufferID)
112	return failure();
113
114	curBufferID = bufferID;
115	curBuffer = srcMgr.getMemoryBuffer(i: curBufferID)->getBuffer();
116	curPtr = curBuffer.begin();
117	return success();
118	}
119
120	Token Lexer::emitError(SMRange loc, const Twine &msg) {
121	diagEngine.emitError(loc, msg);
122	return formToken(kind: Token::error, tokStart: loc.Start.getPointer());
123	}
124	Token Lexer::emitErrorAndNote(SMRange loc, const Twine &msg, SMRange noteLoc,
125	const Twine &note) {
126	diagEngine.emitError(loc, msg)->attachNote(msg: note, noteLoc);
127	return formToken(kind: Token::error, tokStart: loc.Start.getPointer());
128	}
129	Token Lexer::emitError(const char loc, const* Twine &msg) {
130	return emitError(
131	loc: SMRange(SMLoc::getFromPointer(Ptr: loc), SMLoc::getFromPointer(Ptr: loc + `1`)), msg);
132	}
133
134	int Lexer::getNextChar() {
135	char curChar = *curPtr++;
136	switch (curChar) {
137	default:
138	return static_cast<unsigned char>(curChar);
139	case `0`: {
140	// A nul character in the stream is either the end of the current buffer
141	// or a random nul in the file. Disambiguate that here.
142	if (curPtr - `1` != curBuffer.end())
143	return `0`;
144
145	// Otherwise, return end of file.
146	--curPtr;
147	return EOF;
148	}
149	case `'\n'`:
150	case `'\r'`:
151	// Handle the newline character by ignoring it and incrementing the line
152	// count. However, be careful about 'dos style' files with \n\r in them.
153	// Only treat a \n\r or \r\n as a single line.
154	if ((curPtr == `'\n'` \|\| (curPtr == `'\r'`)) && *curPtr != curChar)
155	++curPtr;
156	return `'\n'`;
157	}
158	}
159
160	Token Lexer::lexToken() {
161	while (true) {
162	const char *tokStart = curPtr;
163
164	// Check to see if this token is at the code completion location.
165	if (tokStart == codeCompletionLocation)
166	return formToken(kind: Token::code_complete, tokStart);
167
168	// This always consumes at least one character.
169	int curChar = getNextChar();
170	switch (curChar) {
171	default:
172	// Handle identifiers: [a-zA-Z_]
173	if (isalpha(curChar) \|\| curChar == `'_'`)
174	return lexIdentifier(tokStart);
175
176	// Unknown character, emit an error.
177	return emitError(loc: tokStart, msg: "unexpected character");
178	case EOF: {
179	// Return EOF denoting the end of lexing.
180	Token eof = formToken(kind: Token::eof, tokStart);
181
182	// Check to see if we are in an included file.
183	SMLoc parentIncludeLoc = srcMgr.getParentIncludeLoc(i: curBufferID);
184	if (parentIncludeLoc.isValid()) {
185	curBufferID = srcMgr.FindBufferContainingLoc(Loc: parentIncludeLoc);
186	curBuffer = srcMgr.getMemoryBuffer(i: curBufferID)->getBuffer();
187	curPtr = parentIncludeLoc.getPointer();
188	}
189
190	return eof;
191	}
192
193	// Lex punctuation.
194	case `'-'`:
195	if (*curPtr == `'>'`) {
196	++curPtr;
197	return formToken(kind: Token::arrow, tokStart);
198	}
199	return emitError(loc: tokStart, msg: "unexpected character");
200	case `':'`:
201	return formToken(kind: Token::colon, tokStart);
202	case `','`:
203	return formToken(kind: Token::comma, tokStart);
204	case `'.'`:
205	return formToken(kind: Token::dot, tokStart);
206	case `'='`:
207	if (*curPtr == `'>'`) {
208	++curPtr;
209	return formToken(kind: Token::equal_arrow, tokStart);
210	}
211	return formToken(kind: Token::equal, tokStart);
212	case `';'`:
213	return formToken(kind: Token::semicolon, tokStart);
214	case `'['`:
215	if (*curPtr == `'{'`) {
216	++curPtr;
217	return lexString(tokStart, /isStringBlock=/true);
218	}
219	return formToken(kind: Token::l_square, tokStart);
220	case `']'`:
221	return formToken(kind: Token::r_square, tokStart);
222
223	case `'<'`:
224	return formToken(kind: Token::less, tokStart);
225	case `'>'`:
226	return formToken(kind: Token::greater, tokStart);
227	case `'{'`:
228	return formToken(kind: Token::l_brace, tokStart);
229	case `'}'`:
230	return formToken(kind: Token::r_brace, tokStart);
231	case `'('`:
232	return formToken(kind: Token::l_paren, tokStart);
233	case `')'`:
234	return formToken(kind: Token::r_paren, tokStart);
235	case `'/'`:
236	if (*curPtr == `'/'`) {
237	lexComment();
238	continue;
239	}
240	return emitError(loc: tokStart, msg: "unexpected character");
241
242	// Ignore whitespace characters.
243	case `0`:
244	case `' '`:
245	case `'\t'`:
246	case `'\n'`:
247	return lexToken();
248
249	case `'#'`:
250	return lexDirective(tokStart);
251	case `'"'`:
252	return lexString(tokStart, /isStringBlock=/false);
253
254	case `'0'`:
255	case `'1'`:
256	case `'2'`:
257	case `'3'`:
258	case `'4'`:
259	case `'5'`:
260	case `'6'`:
261	case `'7'`:
262	case `'8'`:
263	case `'9'`:
264	return lexNumber(tokStart);
265	}
266	}
267	}
268
269	/// Skip a comment line, starting with a '//'.
270	void Lexer::lexComment() {
271	// Advance over the second '/' in a '//' comment.
272	assert(*curPtr == `'/'`);
273	++curPtr;
274
275	while (true) {
276	switch (*curPtr++) {
277	case `'\n'`:
278	case `'\r'`:
279	// Newline is end of comment.
280	return;
281	case `0`:
282	// If this is the end of the buffer, end the comment.
283	if (curPtr - `1` == curBuffer.end()) {
284	--curPtr;
285	return;
286	}
287	[[fallthrough]];
288	default:
289	// Skip over other characters.
290	break;
291	}
292	}
293	}
294
295	Token Lexer::lexDirective(const char *tokStart) {
296	// Match the rest with an identifier regex: [0-9a-zA-Z_]*
297	while (isalnum(curPtr) \|\| curPtr == `'_'`)
298	++curPtr;
299
300	StringRef str(tokStart, curPtr - tokStart);
301	return Token (Token::directive, str);
302	}
303
304	Token Lexer::lexIdentifier(const char *tokStart) {
305	// Match the rest of the identifier regex: [0-9a-zA-Z_]*
306	while (isalnum(curPtr) \|\| curPtr == `'_'`)
307	++curPtr;
308
309	// Check to see if this identifier is a keyword.
310	StringRef str(tokStart, curPtr - tokStart);
311	Token::Kind kind = StringSwitch<Token::Kind>(str)
312	.Case(S: "attr", Value: Token::kw_attr)
313	.Case(S: "Attr", Value: Token::kw_Attr)
314	.Case(S: "erase", Value: Token::kw_erase)
315	.Case(S: "let", Value: Token::kw_let)
316	.Case(S: "Constraint", Value: Token::kw_Constraint)
317	.Case(S: "not", Value: Token::kw_not)
318	.Case(S: "op", Value: Token::kw_op)
319	.Case(S: "Op", Value: Token::kw_Op)
320	.Case(S: "OpName", Value: Token::kw_OpName)
321	.Case(S: "Pattern", Value: Token::kw_Pattern)
322	.Case(S: "replace", Value: Token::kw_replace)
323	.Case(S: "return", Value: Token::kw_return)
324	.Case(S: "rewrite", Value: Token::kw_rewrite)
325	.Case(S: "Rewrite", Value: Token::kw_Rewrite)
326	.Case(S: "type", Value: Token::kw_type)
327	.Case(S: "Type", Value: Token::kw_Type)
328	.Case(S: "TypeRange", Value: Token::kw_TypeRange)
329	.Case(S: "Value", Value: Token::kw_Value)
330	.Case(S: "ValueRange", Value: Token::kw_ValueRange)
331	.Case(S: "with", Value: Token::kw_with)
332	.Case(S: "_", Value: Token::underscore)
333	.Default(Value: Token::identifier);
334	return Token (kind, str);
335	}
336
337	Token Lexer::lexNumber(const char *tokStart) {
338	assert(isdigit(curPtr[-`1`]));
339
340	// Handle the normal decimal case.
341	while (isdigit(*curPtr))
342	++curPtr;
343
344	return formToken(kind: Token::integer, tokStart);
345	}
346
347	Token Lexer::lexString(const char tokStart, bool* isStringBlock) {
348	while (true) {
349	// Check to see if there is a code completion location within the string. In
350	// these cases we generate a completion location and place the currently
351	// lexed string within the token (without the quotes). This allows for the
352	// parser to use the partially lexed string when computing the completion
353	// results.
354	if (curPtr == codeCompletionLocation) {
355	return formToken(kind: Token::code_complete_string,
356	tokStart: tokStart + (isStringBlock ? `2` : `1`));
357	}
358
359	switch (*curPtr++) {
360	case `'"'`:
361	// If this is a string block, we only end the string when we encounter a
362	// `}]`.
363	if (!isStringBlock)
364	return formToken(kind: Token::string, tokStart);
365	continue;
366	case `'}'`:
367	// If this is a string block, we only end the string when we encounter a
368	// `}]`.
369	if (!isStringBlock \|\| *curPtr != `']'`)
370	continue;
371	++curPtr;
372	return formToken(kind: Token::string_block, tokStart);
373	case `0`: {
374	// If this is a random nul character in the middle of a string, just
375	// include it. If it is the end of file, then it is an error.
376	if (curPtr - `1` != curBuffer.end())
377	continue;
378	--curPtr;
379
380	StringRef expectedEndStr = isStringBlock ? "}]" : "\"";
381	return emitError(loc: curPtr - `1`,
382	msg: "expected '" + expectedEndStr + "' in string literal");
383	}
384
385	case `'\n'`:
386	case `'\v'`:
387	case `'\f'`:
388	// String blocks allow multiple lines.
389	if (!isStringBlock)
390	return emitError(loc: curPtr - `1`, msg: "expected '\"' in string literal");
391	continue;
392
393	case `'\\'`:
394	// Handle explicitly a few escapes.
395	if (curPtr == `'"'` \|\| curPtr == `'\\'` \|\| *curPtr == `'n'` \|\|
396	*curPtr == `'t'`) {
397	++curPtr;
398	} else if (llvm::isHexDigit(C: *curPtr) && llvm::isHexDigit(C: curPtr[`1`])) {
399	// Support \xx for two hex digits.
400	curPtr += `2`;
401	} else {
402	return emitError(loc: curPtr - `1`, msg: "unknown escape in string literal");
403	}
404	continue;
405
406	default:
407	continue;
408	}
409	}
410	}
411

Provided by KDAB

Improve your Profiling and Debugging skills

Find out more

Definitions

source code of mlir/lib/Tools/PDLL/Parser/Lexer.cpp