1 | //===- Lexer.h - Lexer for the Toy language -------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements a simple Lexer for the Toy language. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef TOY_LEXER_H |
14 | #define TOY_LEXER_H |
15 | |
#include "llvm/ADT/StringRef.h"

#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
20 | |
21 | namespace toy { |
22 | |
/// Structure describing a location in a file.
///
/// This is a plain aggregate: it can still be brace-initialized as
/// `{file, line, col}`. The default member initializers only guarantee that a
/// default-constructed Location has a null `file` and zeroed `line`/`col`
/// instead of indeterminate values.
struct Location {
  std::shared_ptr<std::string> file; ///< filename.
  int line = 0;                      ///< line number.
  int col = 0;                       ///< column number.
};
29 | |
/// Kinds of token returned by the lexer.
///
/// Named token kinds use negative values. Punctuation tokens are encoded as
/// the value of the character itself, which lets the lexer return any other
/// single character directly as a Token.
enum Token : int {
  // End of input.
  tok_eof = -1,

  // Keywords.
  tok_return = -2,
  tok_var = -3,
  tok_def = -4,
  tok_struct = -5,

  // Primary expressions.
  tok_identifier = -6,
  tok_number = -7,

  // Punctuation, encoded as the character value.
  tok_semicolon = ';',
  tok_parenthese_open = '(',
  tok_parenthese_close = ')',
  tok_bracket_open = '{',
  tok_bracket_close = '}',
  tok_sbracket_open = '[',
  tok_sbracket_close = ']',
};
52 | |
53 | /// The Lexer is an abstract base class providing all the facilities that the |
54 | /// Parser expects. It goes through the stream one token at a time and keeps |
55 | /// track of the location in the file for debugging purpose. |
56 | /// It relies on a subclass to provide a `readNextLine()` method. The subclass |
57 | /// can proceed by reading the next line from the standard input or from a |
58 | /// memory mapped file. |
59 | class Lexer { |
60 | public: |
61 | /// Create a lexer for the given filename. The filename is kept only for |
62 | /// debugging purpose (attaching a location to a Token). |
63 | Lexer(std::string filename) |
64 | : lastLocation( |
65 | {.file: std::make_shared<std::string>(args: std::move(filename)), .line: 0, .col: 0}) {} |
66 | virtual ~Lexer() = default; |
67 | |
68 | /// Look at the current token in the stream. |
69 | Token getCurToken() { return curTok; } |
70 | |
71 | /// Move to the next token in the stream and return it. |
72 | Token getNextToken() { return curTok = getTok(); } |
73 | |
74 | /// Move to the next token in the stream, asserting on the current token |
75 | /// matching the expectation. |
76 | void consume(Token tok) { |
77 | assert(tok == curTok && "consume Token mismatch expectation" ); |
78 | getNextToken(); |
79 | } |
80 | |
81 | /// Return the current identifier (prereq: getCurToken() == tok_identifier) |
82 | llvm::StringRef getId() { |
83 | assert(curTok == tok_identifier); |
84 | return identifierStr; |
85 | } |
86 | |
87 | /// Return the current number (prereq: getCurToken() == tok_number) |
88 | double getValue() { |
89 | assert(curTok == tok_number); |
90 | return numVal; |
91 | } |
92 | |
93 | /// Return the location for the beginning of the current token. |
94 | Location getLastLocation() { return lastLocation; } |
95 | |
96 | // Return the current line in the file. |
97 | int getLine() { return curLineNum; } |
98 | |
99 | // Return the current column in the file. |
100 | int getCol() { return curCol; } |
101 | |
102 | private: |
103 | /// Delegate to a derived class fetching the next line. Returns an empty |
104 | /// string to signal end of file (EOF). Lines are expected to always finish |
105 | /// with "\n" |
106 | virtual llvm::StringRef readNextLine() = 0; |
107 | |
108 | /// Return the next character from the stream. This manages the buffer for the |
109 | /// current line and request the next line buffer to the derived class as |
110 | /// needed. |
111 | int getNextChar() { |
112 | // The current line buffer should not be empty unless it is the end of file. |
113 | if (curLineBuffer.empty()) |
114 | return EOF; |
115 | ++curCol; |
116 | auto nextchar = curLineBuffer.front(); |
117 | curLineBuffer = curLineBuffer.drop_front(); |
118 | if (curLineBuffer.empty()) |
119 | curLineBuffer = readNextLine(); |
120 | if (nextchar == '\n') { |
121 | ++curLineNum; |
122 | curCol = 0; |
123 | } |
124 | return nextchar; |
125 | } |
126 | |
127 | /// Return the next token from standard input. |
128 | Token getTok() { |
129 | // Skip any whitespace. |
130 | while (isspace(lastChar)) |
131 | lastChar = Token(getNextChar()); |
132 | |
133 | // Save the current location before reading the token characters. |
134 | lastLocation.line = curLineNum; |
135 | lastLocation.col = curCol; |
136 | |
137 | // Identifier: [a-zA-Z][a-zA-Z0-9_]* |
138 | if (isalpha(lastChar)) { |
139 | identifierStr = (char)lastChar; |
140 | while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') |
141 | identifierStr += (char)lastChar; |
142 | |
143 | if (identifierStr == "return" ) |
144 | return tok_return; |
145 | if (identifierStr == "def" ) |
146 | return tok_def; |
147 | if (identifierStr == "struct" ) |
148 | return tok_struct; |
149 | if (identifierStr == "var" ) |
150 | return tok_var; |
151 | return tok_identifier; |
152 | } |
153 | |
154 | // Number: [0-9] ([0-9.])* |
155 | if (isdigit(lastChar)) { |
156 | std::string numStr; |
157 | do { |
158 | numStr += lastChar; |
159 | lastChar = Token(getNextChar()); |
160 | } while (isdigit(lastChar) || lastChar == '.'); |
161 | |
162 | numVal = strtod(nptr: numStr.c_str(), endptr: nullptr); |
163 | return tok_number; |
164 | } |
165 | |
166 | if (lastChar == '#') { |
167 | // Comment until end of line. |
168 | do { |
169 | lastChar = Token(getNextChar()); |
170 | } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); |
171 | |
172 | if (lastChar != EOF) |
173 | return getTok(); |
174 | } |
175 | |
176 | // Check for end of file. Don't eat the EOF. |
177 | if (lastChar == EOF) |
178 | return tok_eof; |
179 | |
180 | // Otherwise, just return the character as its ascii value. |
181 | Token thisChar = Token(lastChar); |
182 | lastChar = Token(getNextChar()); |
183 | return thisChar; |
184 | } |
185 | |
186 | /// The last token read from the input. |
187 | Token curTok = tok_eof; |
188 | |
189 | /// Location for `curTok`. |
190 | Location lastLocation; |
191 | |
192 | /// If the current Token is an identifier, this string contains the value. |
193 | std::string identifierStr; |
194 | |
195 | /// If the current Token is a number, this contains the value. |
196 | double numVal = 0; |
197 | |
198 | /// The last value returned by getNextChar(). We need to keep it around as we |
199 | /// always need to read ahead one character to decide when to end a token and |
200 | /// we can't put it back in the stream after reading from it. |
201 | Token lastChar = Token(' '); |
202 | |
203 | /// Keep track of the current line number in the input stream |
204 | int curLineNum = 0; |
205 | |
206 | /// Keep track of the current column number in the input stream |
207 | int curCol = 0; |
208 | |
209 | /// Buffer supplied by the derived class on calls to `readNextLine()` |
210 | llvm::StringRef curLineBuffer = "\n" ; |
211 | }; |
212 | |
213 | /// A lexer implementation operating on a buffer in memory. |
214 | class LexerBuffer final : public Lexer { |
215 | public: |
216 | LexerBuffer(const char *begin, const char *end, std::string filename) |
217 | : Lexer(std::move(filename)), current(begin), end(end) {} |
218 | |
219 | private: |
220 | /// Provide one line at a time to the Lexer, return an empty string when |
221 | /// reaching the end of the buffer. |
222 | llvm::StringRef readNextLine() override { |
223 | auto *begin = current; |
224 | while (current <= end && *current && *current != '\n') |
225 | ++current; |
226 | if (current <= end && *current) |
227 | ++current; |
228 | llvm::StringRef result{begin, static_cast<size_t>(current - begin)}; |
229 | return result; |
230 | } |
231 | const char *current, *end; |
232 | }; |
233 | } // namespace toy |
234 | |
235 | #endif // TOY_LEXER_H |
236 | |