| 1 | //===- Lexer.h - Lexer for the Toy language -------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements a simple Lexer for the Toy language. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #ifndef TOY_LEXER_H |
| 14 | #define TOY_LEXER_H |
| 15 | |
#include "llvm/ADT/StringRef.h"

#include <cassert>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
| 21 | |
| 22 | namespace toy { |
| 23 | |
/// Describes a position inside a source file.
struct Location {
  /// Name of the file, held through a shared pointer.
  std::shared_ptr<std::string> file;
  /// Line number.
  int line;
  /// Column number.
  int col;
};
| 30 | |
/// Kinds of token handed out by the lexer. The named non-punctuation kinds use
/// negative values; each punctuation token is encoded as the character code of
/// the character it stands for.
enum Token : int {
  // End of input.
  tok_eof = -1,

  // Commands.
  tok_return = -2,
  tok_var = -3,
  tok_def = -4,
  tok_struct = -5,

  // Primary.
  tok_identifier = -6,
  tok_number = -7,

  // Punctuation, valued as the character itself.
  tok_semicolon = ';',
  tok_parenthese_open = '(',
  tok_parenthese_close = ')',
  tok_bracket_open = '{',
  tok_bracket_close = '}',
  tok_sbracket_open = '[',
  tok_sbracket_close = ']',
};
| 53 | |
| 54 | /// The Lexer is an abstract base class providing all the facilities that the |
| 55 | /// Parser expects. It goes through the stream one token at a time and keeps |
| 56 | /// track of the location in the file for debugging purpose. |
| 57 | /// It relies on a subclass to provide a `readNextLine()` method. The subclass |
| 58 | /// can proceed by reading the next line from the standard input or from a |
| 59 | /// memory mapped file. |
| 60 | class Lexer { |
| 61 | public: |
| 62 | /// Create a lexer for the given filename. The filename is kept only for |
| 63 | /// debugging purpose (attaching a location to a Token). |
| 64 | Lexer(std::string filename) |
| 65 | : lastLocation( |
| 66 | {.file: std::make_shared<std::string>(args: std::move(filename)), .line: 0, .col: 0}) {} |
| 67 | virtual ~Lexer() = default; |
| 68 | |
| 69 | /// Look at the current token in the stream. |
| 70 | Token getCurToken() { return curTok; } |
| 71 | |
| 72 | /// Move to the next token in the stream and return it. |
| 73 | Token getNextToken() { return curTok = getTok(); } |
| 74 | |
| 75 | /// Move to the next token in the stream, asserting on the current token |
| 76 | /// matching the expectation. |
| 77 | void consume(Token tok) { |
| 78 | assert(tok == curTok && "consume Token mismatch expectation" ); |
| 79 | getNextToken(); |
| 80 | } |
| 81 | |
| 82 | /// Return the current identifier (prereq: getCurToken() == tok_identifier) |
| 83 | llvm::StringRef getId() { |
| 84 | assert(curTok == tok_identifier); |
| 85 | return identifierStr; |
| 86 | } |
| 87 | |
| 88 | /// Return the current number (prereq: getCurToken() == tok_number) |
| 89 | double getValue() { |
| 90 | assert(curTok == tok_number); |
| 91 | return numVal; |
| 92 | } |
| 93 | |
| 94 | /// Return the location for the beginning of the current token. |
| 95 | Location getLastLocation() { return lastLocation; } |
| 96 | |
| 97 | // Return the current line in the file. |
| 98 | int getLine() { return curLineNum; } |
| 99 | |
| 100 | // Return the current column in the file. |
| 101 | int getCol() { return curCol; } |
| 102 | |
| 103 | private: |
| 104 | /// Delegate to a derived class fetching the next line. Returns an empty |
| 105 | /// string to signal end of file (EOF). Lines are expected to always finish |
| 106 | /// with "\n" |
| 107 | virtual llvm::StringRef readNextLine() = 0; |
| 108 | |
| 109 | /// Return the next character from the stream. This manages the buffer for the |
| 110 | /// current line and request the next line buffer to the derived class as |
| 111 | /// needed. |
| 112 | int getNextChar() { |
| 113 | // The current line buffer should not be empty unless it is the end of file. |
| 114 | if (curLineBuffer.empty()) |
| 115 | return EOF; |
| 116 | ++curCol; |
| 117 | auto nextchar = curLineBuffer.front(); |
| 118 | curLineBuffer = curLineBuffer.drop_front(); |
| 119 | if (curLineBuffer.empty()) |
| 120 | curLineBuffer = readNextLine(); |
| 121 | if (nextchar == '\n') { |
| 122 | ++curLineNum; |
| 123 | curCol = 0; |
| 124 | } |
| 125 | return nextchar; |
| 126 | } |
| 127 | |
| 128 | /// Return the next token from standard input. |
| 129 | Token getTok() { |
| 130 | // Skip any whitespace. |
| 131 | while (isspace(lastChar)) |
| 132 | lastChar = Token(getNextChar()); |
| 133 | |
| 134 | // Save the current location before reading the token characters. |
| 135 | lastLocation.line = curLineNum; |
| 136 | lastLocation.col = curCol; |
| 137 | |
| 138 | // Identifier: [a-zA-Z][a-zA-Z0-9_]* |
| 139 | if (isalpha(lastChar)) { |
| 140 | identifierStr = (char)lastChar; |
| 141 | while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') |
| 142 | identifierStr += (char)lastChar; |
| 143 | |
| 144 | if (identifierStr == "return" ) |
| 145 | return tok_return; |
| 146 | if (identifierStr == "def" ) |
| 147 | return tok_def; |
| 148 | if (identifierStr == "struct" ) |
| 149 | return tok_struct; |
| 150 | if (identifierStr == "var" ) |
| 151 | return tok_var; |
| 152 | return tok_identifier; |
| 153 | } |
| 154 | |
| 155 | // Number: [0-9] ([0-9.])* |
| 156 | if (isdigit(lastChar)) { |
| 157 | std::string numStr; |
| 158 | do { |
| 159 | numStr += lastChar; |
| 160 | lastChar = Token(getNextChar()); |
| 161 | } while (isdigit(lastChar) || lastChar == '.'); |
| 162 | |
| 163 | numVal = strtod(nptr: numStr.c_str(), endptr: nullptr); |
| 164 | return tok_number; |
| 165 | } |
| 166 | |
| 167 | if (lastChar == '#') { |
| 168 | // Comment until end of line. |
| 169 | do { |
| 170 | lastChar = Token(getNextChar()); |
| 171 | } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); |
| 172 | |
| 173 | if (lastChar != EOF) |
| 174 | return getTok(); |
| 175 | } |
| 176 | |
| 177 | // Check for end of file. Don't eat the EOF. |
| 178 | if (lastChar == EOF) |
| 179 | return tok_eof; |
| 180 | |
| 181 | // Otherwise, just return the character as its ascii value. |
| 182 | Token thisChar = Token(lastChar); |
| 183 | lastChar = Token(getNextChar()); |
| 184 | return thisChar; |
| 185 | } |
| 186 | |
| 187 | /// The last token read from the input. |
| 188 | Token curTok = tok_eof; |
| 189 | |
| 190 | /// Location for `curTok`. |
| 191 | Location lastLocation; |
| 192 | |
| 193 | /// If the current Token is an identifier, this string contains the value. |
| 194 | std::string identifierStr; |
| 195 | |
| 196 | /// If the current Token is a number, this contains the value. |
| 197 | double numVal = 0; |
| 198 | |
| 199 | /// The last value returned by getNextChar(). We need to keep it around as we |
| 200 | /// always need to read ahead one character to decide when to end a token and |
| 201 | /// we can't put it back in the stream after reading from it. |
| 202 | Token lastChar = Token(' '); |
| 203 | |
| 204 | /// Keep track of the current line number in the input stream |
| 205 | int curLineNum = 0; |
| 206 | |
| 207 | /// Keep track of the current column number in the input stream |
| 208 | int curCol = 0; |
| 209 | |
| 210 | /// Buffer supplied by the derived class on calls to `readNextLine()` |
| 211 | llvm::StringRef curLineBuffer = "\n" ; |
| 212 | }; |
| 213 | |
| 214 | /// A lexer implementation operating on a buffer in memory. |
| 215 | class LexerBuffer final : public Lexer { |
| 216 | public: |
| 217 | LexerBuffer(const char *begin, const char *end, std::string filename) |
| 218 | : Lexer(std::move(filename)), current(begin), end(end) {} |
| 219 | |
| 220 | private: |
| 221 | /// Provide one line at a time to the Lexer, return an empty string when |
| 222 | /// reaching the end of the buffer. |
| 223 | llvm::StringRef readNextLine() override { |
| 224 | auto *begin = current; |
| 225 | while (current <= end && *current && *current != '\n') |
| 226 | ++current; |
| 227 | if (current <= end && *current) |
| 228 | ++current; |
| 229 | llvm::StringRef result{begin, static_cast<size_t>(current - begin)}; |
| 230 | return result; |
| 231 | } |
| 232 | const char *current, *end; |
| 233 | }; |
| 234 | } // namespace toy |
| 235 | |
| 236 | #endif // TOY_LEXER_H |
| 237 | |