//===- Lexer.h - Lexer for the Toy language -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple Lexer for the Toy language.
//
//===----------------------------------------------------------------------===//
12
13#ifndef TOY_LEXER_H
14#define TOY_LEXER_H
15
#include "llvm/ADT/StringRef.h"

#include <cassert>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
20
21namespace toy {
22
/// Structure describing a location in a file.
///
/// This is a plain aggregate: it can be brace-initialized as
/// `{file, line, col}`. A default-constructed Location has a null `file` and
/// zero `line` / `col` rather than indeterminate values.
struct Location {
  std::shared_ptr<std::string> file; ///< filename.
  int line = 0;                      ///< line number.
  int col = 0;                       ///< column number.
};
29
/// List of Tokens returned by the lexer.
///
/// Single-character punctuation tokens use the character's own code as their
/// value; the lexer also returns any other unrecognized character as a Token
/// holding that character's value. Tokens that do not correspond to a single
/// character (end of file, keywords, identifiers, numbers) use negative values.
enum Token : int {
  tok_semicolon = ';',
  tok_parenthese_open = '(',
  tok_parenthese_close = ')',
  tok_bracket_open = '{',
  tok_bracket_close = '}',
  tok_sbracket_open = '[',
  tok_sbracket_close = ']',

  tok_eof = -1,

  // commands
  tok_return = -2,
  tok_var = -3,
  tok_def = -4,
  tok_struct = -5,

  // primary
  tok_identifier = -6,
  tok_number = -7,
};
52
53/// The Lexer is an abstract base class providing all the facilities that the
54/// Parser expects. It goes through the stream one token at a time and keeps
55/// track of the location in the file for debugging purpose.
56/// It relies on a subclass to provide a `readNextLine()` method. The subclass
57/// can proceed by reading the next line from the standard input or from a
58/// memory mapped file.
59class Lexer {
60public:
61 /// Create a lexer for the given filename. The filename is kept only for
62 /// debugging purpose (attaching a location to a Token).
63 Lexer(std::string filename)
64 : lastLocation(
65 {.file: std::make_shared<std::string>(args: std::move(filename)), .line: 0, .col: 0}) {}
66 virtual ~Lexer() = default;
67
68 /// Look at the current token in the stream.
69 Token getCurToken() { return curTok; }
70
71 /// Move to the next token in the stream and return it.
72 Token getNextToken() { return curTok = getTok(); }
73
74 /// Move to the next token in the stream, asserting on the current token
75 /// matching the expectation.
76 void consume(Token tok) {
77 assert(tok == curTok && "consume Token mismatch expectation");
78 getNextToken();
79 }
80
81 /// Return the current identifier (prereq: getCurToken() == tok_identifier)
82 llvm::StringRef getId() {
83 assert(curTok == tok_identifier);
84 return identifierStr;
85 }
86
87 /// Return the current number (prereq: getCurToken() == tok_number)
88 double getValue() {
89 assert(curTok == tok_number);
90 return numVal;
91 }
92
93 /// Return the location for the beginning of the current token.
94 Location getLastLocation() { return lastLocation; }
95
96 // Return the current line in the file.
97 int getLine() { return curLineNum; }
98
99 // Return the current column in the file.
100 int getCol() { return curCol; }
101
102private:
103 /// Delegate to a derived class fetching the next line. Returns an empty
104 /// string to signal end of file (EOF). Lines are expected to always finish
105 /// with "\n"
106 virtual llvm::StringRef readNextLine() = 0;
107
108 /// Return the next character from the stream. This manages the buffer for the
109 /// current line and request the next line buffer to the derived class as
110 /// needed.
111 int getNextChar() {
112 // The current line buffer should not be empty unless it is the end of file.
113 if (curLineBuffer.empty())
114 return EOF;
115 ++curCol;
116 auto nextchar = curLineBuffer.front();
117 curLineBuffer = curLineBuffer.drop_front();
118 if (curLineBuffer.empty())
119 curLineBuffer = readNextLine();
120 if (nextchar == '\n') {
121 ++curLineNum;
122 curCol = 0;
123 }
124 return nextchar;
125 }
126
127 /// Return the next token from standard input.
128 Token getTok() {
129 // Skip any whitespace.
130 while (isspace(lastChar))
131 lastChar = Token(getNextChar());
132
133 // Save the current location before reading the token characters.
134 lastLocation.line = curLineNum;
135 lastLocation.col = curCol;
136
137 // Identifier: [a-zA-Z][a-zA-Z0-9_]*
138 if (isalpha(lastChar)) {
139 identifierStr = (char)lastChar;
140 while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_')
141 identifierStr += (char)lastChar;
142
143 if (identifierStr == "return")
144 return tok_return;
145 if (identifierStr == "def")
146 return tok_def;
147 if (identifierStr == "struct")
148 return tok_struct;
149 if (identifierStr == "var")
150 return tok_var;
151 return tok_identifier;
152 }
153
154 // Number: [0-9] ([0-9.])*
155 if (isdigit(lastChar)) {
156 std::string numStr;
157 do {
158 numStr += lastChar;
159 lastChar = Token(getNextChar());
160 } while (isdigit(lastChar) || lastChar == '.');
161
162 numVal = strtod(nptr: numStr.c_str(), endptr: nullptr);
163 return tok_number;
164 }
165
166 if (lastChar == '#') {
167 // Comment until end of line.
168 do {
169 lastChar = Token(getNextChar());
170 } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r');
171
172 if (lastChar != EOF)
173 return getTok();
174 }
175
176 // Check for end of file. Don't eat the EOF.
177 if (lastChar == EOF)
178 return tok_eof;
179
180 // Otherwise, just return the character as its ascii value.
181 Token thisChar = Token(lastChar);
182 lastChar = Token(getNextChar());
183 return thisChar;
184 }
185
186 /// The last token read from the input.
187 Token curTok = tok_eof;
188
189 /// Location for `curTok`.
190 Location lastLocation;
191
192 /// If the current Token is an identifier, this string contains the value.
193 std::string identifierStr;
194
195 /// If the current Token is a number, this contains the value.
196 double numVal = 0;
197
198 /// The last value returned by getNextChar(). We need to keep it around as we
199 /// always need to read ahead one character to decide when to end a token and
200 /// we can't put it back in the stream after reading from it.
201 Token lastChar = Token(' ');
202
203 /// Keep track of the current line number in the input stream
204 int curLineNum = 0;
205
206 /// Keep track of the current column number in the input stream
207 int curCol = 0;
208
209 /// Buffer supplied by the derived class on calls to `readNextLine()`
210 llvm::StringRef curLineBuffer = "\n";
211};
212
213/// A lexer implementation operating on a buffer in memory.
214class LexerBuffer final : public Lexer {
215public:
216 LexerBuffer(const char *begin, const char *end, std::string filename)
217 : Lexer(std::move(filename)), current(begin), end(end) {}
218
219private:
220 /// Provide one line at a time to the Lexer, return an empty string when
221 /// reaching the end of the buffer.
222 llvm::StringRef readNextLine() override {
223 auto *begin = current;
224 while (current <= end && *current && *current != '\n')
225 ++current;
226 if (current <= end && *current)
227 ++current;
228 llvm::StringRef result{begin, static_cast<size_t>(current - begin)};
229 return result;
230 }
231 const char *current, *end;
232};
233} // namespace toy
234
235#endif // TOY_LEXER_H
// Source: mlir/examples/toy/Ch7/include/toy/Lexer.h