//===- ScriptLexer.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a lexer for the linker script.
//
// The linker script's grammar is not complex but ambiguous due to the
// lack of the formal specification of the language. What we are trying to
// do in this and other files in LLD is to make a "reasonable" linker
// script processor.
//
// Among simplicity, compatibility and efficiency, we put the most
// emphasis on simplicity when we wrote this lexer. Compatibility with the
// GNU linkers is important, but we did not try to clone every tiny corner
// case of their lexers, as even ld.bfd and ld.gold are subtly different
// in various corner cases. We do not care much about efficiency because
// the time spent in parsing linker scripts is usually negligible.
//
// Our grammar of the linker script is LL(2), meaning that it needs at
// most two-token lookahead to parse. The only place we need two-token
// lookahead is labels in version scripts, where we need to parse "local :"
// as if it were "local:".
//
// Overall, this lexer works fine for most linker scripts. There might
// be room for improving compatibility, but that's probably not at the
// top of our todo list.
//
//===----------------------------------------------------------------------===//
33 | |
34 | #include "ScriptLexer.h" |
35 | #include "lld/Common/ErrorHandler.h" |
36 | #include "llvm/ADT/Twine.h" |
37 | #include "llvm/Support/ErrorHandling.h" |
38 | #include <algorithm> |
39 | |
40 | using namespace llvm; |
41 | using namespace lld; |
42 | using namespace lld::elf; |
43 | |
44 | // Returns a whole line containing the current token. |
45 | StringRef ScriptLexer::getLine() { |
46 | StringRef s = getCurrentMB().getBuffer(); |
47 | StringRef tok = tokens[pos - 1]; |
48 | |
49 | size_t pos = s.rfind(C: '\n', From: tok.data() - s.data()); |
50 | if (pos != StringRef::npos) |
51 | s = s.substr(Start: pos + 1); |
52 | return s.substr(Start: 0, N: s.find_first_of(Chars: "\r\n" )); |
53 | } |
54 | |
55 | // Returns 1-based line number of the current token. |
56 | size_t ScriptLexer::getLineNumber() { |
57 | if (pos == 0) |
58 | return 1; |
59 | StringRef s = getCurrentMB().getBuffer(); |
60 | StringRef tok = tokens[pos - 1]; |
61 | const size_t tokOffset = tok.data() - s.data(); |
62 | |
63 | // For the first token, or when going backwards, start from the beginning of |
64 | // the buffer. If this token is after the previous token, start from the |
65 | // previous token. |
66 | size_t line = 1; |
67 | size_t start = 0; |
68 | if (lastLineNumberOffset > 0 && tokOffset >= lastLineNumberOffset) { |
69 | start = lastLineNumberOffset; |
70 | line = lastLineNumber; |
71 | } |
72 | |
73 | line += s.substr(Start: start, N: tokOffset - start).count(C: '\n'); |
74 | |
75 | // Store the line number of this token for reuse. |
76 | lastLineNumberOffset = tokOffset; |
77 | lastLineNumber = line; |
78 | |
79 | return line; |
80 | } |
81 | |
82 | // Returns 0-based column number of the current token. |
83 | size_t ScriptLexer::getColumnNumber() { |
84 | StringRef tok = tokens[pos - 1]; |
85 | return tok.data() - getLine().data(); |
86 | } |
87 | |
88 | std::string ScriptLexer::getCurrentLocation() { |
89 | std::string filename = std::string(getCurrentMB().getBufferIdentifier()); |
90 | return (filename + ":" + Twine(getLineNumber())).str(); |
91 | } |
92 | |
93 | ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); } |
94 | |
95 | // We don't want to record cascading errors. Keep only the first one. |
96 | void ScriptLexer::setError(const Twine &msg) { |
97 | if (errorCount()) |
98 | return; |
99 | |
100 | std::string s = (getCurrentLocation() + ": " + msg).str(); |
101 | if (pos) |
102 | s += "\n>>> " + getLine().str() + "\n>>> " + |
103 | std::string(getColumnNumber(), ' ') + "^" ; |
104 | error(msg: s); |
105 | } |
106 | |
107 | // Split S into linker script tokens. |
108 | void ScriptLexer::tokenize(MemoryBufferRef mb) { |
109 | std::vector<StringRef> vec; |
110 | mbs.push_back(x: mb); |
111 | StringRef s = mb.getBuffer(); |
112 | StringRef begin = s; |
113 | |
114 | for (;;) { |
115 | s = skipSpace(s); |
116 | if (s.empty()) |
117 | break; |
118 | |
119 | // Quoted token. Note that double-quote characters are parts of a token |
120 | // because, in a glob match context, only unquoted tokens are interpreted |
121 | // as glob patterns. Double-quoted tokens are literal patterns in that |
122 | // context. |
123 | if (s.starts_with(Prefix: "\"" )) { |
124 | size_t e = s.find(Str: "\"" , From: 1); |
125 | if (e == StringRef::npos) { |
126 | StringRef filename = mb.getBufferIdentifier(); |
127 | size_t lineno = begin.substr(Start: 0, N: s.data() - begin.data()).count(C: '\n'); |
128 | error(msg: filename + ":" + Twine(lineno + 1) + ": unclosed quote" ); |
129 | return; |
130 | } |
131 | |
132 | vec.push_back(x: s.take_front(N: e + 1)); |
133 | s = s.substr(Start: e + 1); |
134 | continue; |
135 | } |
136 | |
137 | // Some operators form separate tokens. |
138 | if (s.starts_with(Prefix: "<<=" ) || s.starts_with(Prefix: ">>=" )) { |
139 | vec.push_back(x: s.substr(Start: 0, N: 3)); |
140 | s = s.substr(Start: 3); |
141 | continue; |
142 | } |
143 | if (s.size() > 1 && ((s[1] == '=' && strchr(s: "*/+-<>&^|" , c: s[0])) || |
144 | (s[0] == s[1] && strchr(s: "<>&|" , c: s[0])))) { |
145 | vec.push_back(x: s.substr(Start: 0, N: 2)); |
146 | s = s.substr(Start: 2); |
147 | continue; |
148 | } |
149 | |
150 | // Unquoted token. This is more relaxed than tokens in C-like language, |
151 | // so that you can write "file-name.cpp" as one bare token, for example. |
152 | size_t pos = s.find_first_not_of( |
153 | Chars: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" |
154 | "0123456789_.$/\\~=+[]*?-!^:" ); |
155 | |
156 | // A character that cannot start a word (which is usually a |
157 | // punctuation) forms a single character token. |
158 | if (pos == 0) |
159 | pos = 1; |
160 | vec.push_back(x: s.substr(Start: 0, N: pos)); |
161 | s = s.substr(Start: pos); |
162 | } |
163 | |
164 | tokens.insert(position: tokens.begin() + pos, first: vec.begin(), last: vec.end()); |
165 | } |
166 | |
167 | // Skip leading whitespace characters or comments. |
168 | StringRef ScriptLexer::skipSpace(StringRef s) { |
169 | for (;;) { |
170 | if (s.starts_with(Prefix: "/*" )) { |
171 | size_t e = s.find(Str: "*/" , From: 2); |
172 | if (e == StringRef::npos) { |
173 | setError("unclosed comment in a linker script" ); |
174 | return "" ; |
175 | } |
176 | s = s.substr(Start: e + 2); |
177 | continue; |
178 | } |
179 | if (s.starts_with(Prefix: "#" )) { |
180 | size_t e = s.find(C: '\n', From: 1); |
181 | if (e == StringRef::npos) |
182 | e = s.size() - 1; |
183 | s = s.substr(Start: e + 1); |
184 | continue; |
185 | } |
186 | size_t size = s.size(); |
187 | s = s.ltrim(); |
188 | if (s.size() == size) |
189 | return s; |
190 | } |
191 | } |
192 | |
193 | // An erroneous token is handled as if it were the last token before EOF. |
194 | bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; } |
195 | |
196 | // Split a given string as an expression. |
197 | // This function returns "3", "*" and "5" for "3*5" for example. |
198 | static std::vector<StringRef> tokenizeExpr(StringRef s) { |
199 | StringRef ops = "!~*/+-<>?^:=" ; // List of operators |
200 | |
201 | // Quoted strings are literal strings, so we don't want to split it. |
202 | if (s.starts_with(Prefix: "\"" )) |
203 | return {s}; |
204 | |
205 | // Split S with operators as separators. |
206 | std::vector<StringRef> ret; |
207 | while (!s.empty()) { |
208 | size_t e = s.find_first_of(Chars: ops); |
209 | |
210 | // No need to split if there is no operator. |
211 | if (e == StringRef::npos) { |
212 | ret.push_back(x: s); |
213 | break; |
214 | } |
215 | |
216 | // Get a token before the operator. |
217 | if (e != 0) |
218 | ret.push_back(x: s.substr(Start: 0, N: e)); |
219 | |
220 | // Get the operator as a token. |
221 | // Keep !=, ==, >=, <=, << and >> operators as a single tokens. |
222 | if (s.substr(Start: e).starts_with(Prefix: "!=" ) || s.substr(Start: e).starts_with(Prefix: "==" ) || |
223 | s.substr(Start: e).starts_with(Prefix: ">=" ) || s.substr(Start: e).starts_with(Prefix: "<=" ) || |
224 | s.substr(Start: e).starts_with(Prefix: "<<" ) || s.substr(Start: e).starts_with(Prefix: ">>" )) { |
225 | ret.push_back(x: s.substr(Start: e, N: 2)); |
226 | s = s.substr(Start: e + 2); |
227 | } else { |
228 | ret.push_back(x: s.substr(Start: e, N: 1)); |
229 | s = s.substr(Start: e + 1); |
230 | } |
231 | } |
232 | return ret; |
233 | } |
234 | |
235 | // In contexts where expressions are expected, the lexer should apply |
236 | // different tokenization rules than the default one. By default, |
237 | // arithmetic operator characters are regular characters, but in the |
238 | // expression context, they should be independent tokens. |
239 | // |
240 | // For example, "foo*3" should be tokenized to "foo", "*" and "3" only |
241 | // in the expression context. |
242 | // |
243 | // This function may split the current token into multiple tokens. |
244 | void ScriptLexer::maybeSplitExpr() { |
245 | if (!inExpr || errorCount() || atEOF()) |
246 | return; |
247 | |
248 | std::vector<StringRef> v = tokenizeExpr(s: tokens[pos]); |
249 | if (v.size() == 1) |
250 | return; |
251 | tokens.erase(position: tokens.begin() + pos); |
252 | tokens.insert(position: tokens.begin() + pos, first: v.begin(), last: v.end()); |
253 | } |
254 | |
255 | StringRef ScriptLexer::next() { |
256 | maybeSplitExpr(); |
257 | |
258 | if (errorCount()) |
259 | return "" ; |
260 | if (atEOF()) { |
261 | setError("unexpected EOF" ); |
262 | return "" ; |
263 | } |
264 | return tokens[pos++]; |
265 | } |
266 | |
267 | StringRef ScriptLexer::peek() { |
268 | StringRef tok = next(); |
269 | if (errorCount()) |
270 | return "" ; |
271 | pos = pos - 1; |
272 | return tok; |
273 | } |
274 | |
275 | StringRef ScriptLexer::peek2() { |
276 | skip(); |
277 | StringRef tok = next(); |
278 | if (errorCount()) |
279 | return "" ; |
280 | pos = pos - 2; |
281 | return tok; |
282 | } |
283 | |
284 | bool ScriptLexer::consume(StringRef tok) { |
285 | if (peek() == tok) { |
286 | skip(); |
287 | return true; |
288 | } |
289 | return false; |
290 | } |
291 | |
292 | // Consumes Tok followed by ":". Space is allowed between Tok and ":". |
293 | bool ScriptLexer::consumeLabel(StringRef tok) { |
294 | if (consume(tok: (tok + ":" ).str())) |
295 | return true; |
296 | if (tokens.size() >= pos + 2 && tokens[pos] == tok && |
297 | tokens[pos + 1] == ":" ) { |
298 | pos += 2; |
299 | return true; |
300 | } |
301 | return false; |
302 | } |
303 | |
304 | void ScriptLexer::skip() { (void)next(); } |
305 | |
306 | void ScriptLexer::expect(StringRef expect) { |
307 | if (errorCount()) |
308 | return; |
309 | StringRef tok = next(); |
310 | if (tok != expect) |
311 | setError(expect + " expected, but got " + tok); |
312 | } |
313 | |
314 | // Returns true if S encloses T. |
315 | static bool encloses(StringRef s, StringRef t) { |
316 | return s.bytes_begin() <= t.bytes_begin() && t.bytes_end() <= s.bytes_end(); |
317 | } |
318 | |
319 | MemoryBufferRef ScriptLexer::getCurrentMB() { |
320 | // Find input buffer containing the current token. |
321 | assert(!mbs.empty()); |
322 | if (pos == 0) |
323 | return mbs.back(); |
324 | for (MemoryBufferRef mb : mbs) |
325 | if (encloses(s: mb.getBuffer(), t: tokens[pos - 1])) |
326 | return mb; |
327 | llvm_unreachable("getCurrentMB: failed to find a token" ); |
328 | } |
329 | |