1//===- Lexer.h - Lexer for the Toy language -------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a simple Lexer for the Toy language.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef TOY_LEXER_H
14#define TOY_LEXER_H
15
16#include "llvm/ADT/StringRef.h"
17
18#include <cstdlib>
19#include <memory>
20#include <string>
21
22namespace toy {
23
/// Structure describing a location in a file.
struct Location {
  std::shared_ptr<std::string> file; ///< Filename, held by shared pointer.
  int line = 0;                      ///< Line number.
  int col = 0;                       ///< Column number.
};
30
/// Tokens returned by the lexer. Punctuation tokens carry the value of their
/// own character; every other token kind is a distinct negative value.
enum Token : int {
  // End of input.
  tok_eof = -1,

  // Commands.
  tok_return = -2,
  tok_var = -3,
  tok_def = -4,
  tok_struct = -5,

  // Primary expressions.
  tok_identifier = -6,
  tok_number = -7,

  // Single-character punctuation.
  tok_semicolon = ';',
  tok_parenthese_open = '(',
  tok_parenthese_close = ')',
  tok_bracket_open = '{',
  tok_bracket_close = '}',
  tok_sbracket_open = '[',
  tok_sbracket_close = ']',
};
53
54/// The Lexer is an abstract base class providing all the facilities that the
55/// Parser expects. It goes through the stream one token at a time and keeps
56/// track of the location in the file for debugging purpose.
57/// It relies on a subclass to provide a `readNextLine()` method. The subclass
58/// can proceed by reading the next line from the standard input or from a
59/// memory mapped file.
60class Lexer {
61public:
62 /// Create a lexer for the given filename. The filename is kept only for
63 /// debugging purpose (attaching a location to a Token).
64 Lexer(std::string filename)
65 : lastLocation(
66 {.file: std::make_shared<std::string>(args: std::move(filename)), .line: 0, .col: 0}) {}
67 virtual ~Lexer() = default;
68
69 /// Look at the current token in the stream.
70 Token getCurToken() { return curTok; }
71
72 /// Move to the next token in the stream and return it.
73 Token getNextToken() { return curTok = getTok(); }
74
75 /// Move to the next token in the stream, asserting on the current token
76 /// matching the expectation.
77 void consume(Token tok) {
78 assert(tok == curTok && "consume Token mismatch expectation");
79 getNextToken();
80 }
81
82 /// Return the current identifier (prereq: getCurToken() == tok_identifier)
83 llvm::StringRef getId() {
84 assert(curTok == tok_identifier);
85 return identifierStr;
86 }
87
88 /// Return the current number (prereq: getCurToken() == tok_number)
89 double getValue() {
90 assert(curTok == tok_number);
91 return numVal;
92 }
93
94 /// Return the location for the beginning of the current token.
95 Location getLastLocation() { return lastLocation; }
96
97 // Return the current line in the file.
98 int getLine() { return curLineNum; }
99
100 // Return the current column in the file.
101 int getCol() { return curCol; }
102
103private:
104 /// Delegate to a derived class fetching the next line. Returns an empty
105 /// string to signal end of file (EOF). Lines are expected to always finish
106 /// with "\n"
107 virtual llvm::StringRef readNextLine() = 0;
108
109 /// Return the next character from the stream. This manages the buffer for the
110 /// current line and request the next line buffer to the derived class as
111 /// needed.
112 int getNextChar() {
113 // The current line buffer should not be empty unless it is the end of file.
114 if (curLineBuffer.empty())
115 return EOF;
116 ++curCol;
117 auto nextchar = curLineBuffer.front();
118 curLineBuffer = curLineBuffer.drop_front();
119 if (curLineBuffer.empty())
120 curLineBuffer = readNextLine();
121 if (nextchar == '\n') {
122 ++curLineNum;
123 curCol = 0;
124 }
125 return nextchar;
126 }
127
128 /// Return the next token from standard input.
129 Token getTok() {
130 // Skip any whitespace.
131 while (isspace(lastChar))
132 lastChar = Token(getNextChar());
133
134 // Save the current location before reading the token characters.
135 lastLocation.line = curLineNum;
136 lastLocation.col = curCol;
137
138 // Identifier: [a-zA-Z][a-zA-Z0-9_]*
139 if (isalpha(lastChar)) {
140 identifierStr = (char)lastChar;
141 while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_')
142 identifierStr += (char)lastChar;
143
144 if (identifierStr == "return")
145 return tok_return;
146 if (identifierStr == "def")
147 return tok_def;
148 if (identifierStr == "struct")
149 return tok_struct;
150 if (identifierStr == "var")
151 return tok_var;
152 return tok_identifier;
153 }
154
155 // Number: [0-9] ([0-9.])*
156 if (isdigit(lastChar)) {
157 std::string numStr;
158 do {
159 numStr += lastChar;
160 lastChar = Token(getNextChar());
161 } while (isdigit(lastChar) || lastChar == '.');
162
163 numVal = strtod(nptr: numStr.c_str(), endptr: nullptr);
164 return tok_number;
165 }
166
167 if (lastChar == '#') {
168 // Comment until end of line.
169 do {
170 lastChar = Token(getNextChar());
171 } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r');
172
173 if (lastChar != EOF)
174 return getTok();
175 }
176
177 // Check for end of file. Don't eat the EOF.
178 if (lastChar == EOF)
179 return tok_eof;
180
181 // Otherwise, just return the character as its ascii value.
182 Token thisChar = Token(lastChar);
183 lastChar = Token(getNextChar());
184 return thisChar;
185 }
186
187 /// The last token read from the input.
188 Token curTok = tok_eof;
189
190 /// Location for `curTok`.
191 Location lastLocation;
192
193 /// If the current Token is an identifier, this string contains the value.
194 std::string identifierStr;
195
196 /// If the current Token is a number, this contains the value.
197 double numVal = 0;
198
199 /// The last value returned by getNextChar(). We need to keep it around as we
200 /// always need to read ahead one character to decide when to end a token and
201 /// we can't put it back in the stream after reading from it.
202 Token lastChar = Token(' ');
203
204 /// Keep track of the current line number in the input stream
205 int curLineNum = 0;
206
207 /// Keep track of the current column number in the input stream
208 int curCol = 0;
209
210 /// Buffer supplied by the derived class on calls to `readNextLine()`
211 llvm::StringRef curLineBuffer = "\n";
212};
213
214/// A lexer implementation operating on a buffer in memory.
215class LexerBuffer final : public Lexer {
216public:
217 LexerBuffer(const char *begin, const char *end, std::string filename)
218 : Lexer(std::move(filename)), current(begin), end(end) {}
219
220private:
221 /// Provide one line at a time to the Lexer, return an empty string when
222 /// reaching the end of the buffer.
223 llvm::StringRef readNextLine() override {
224 auto *begin = current;
225 while (current <= end && *current && *current != '\n')
226 ++current;
227 if (current <= end && *current)
228 ++current;
229 llvm::StringRef result{begin, static_cast<size_t>(current - begin)};
230 return result;
231 }
232 const char *current, *end;
233};
234} // namespace toy
235
236#endif // TOY_LEXER_H
237

// Source: mlir/examples/toy/Ch7/include/toy/Lexer.h