ResourceScriptToken.cpp source code [llvm/tools/llvm-rc/ResourceScriptToken.cpp]

1	//===-- ResourceScriptToken.cpp ---------------------------------- C++--===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===---------------------------------------------------------------------===//
8	//
9	// This file implements an interface defined in ResourceScriptToken.h.
10	// In particular, it defines an .rc script tokenizer.
11	//
12	//===---------------------------------------------------------------------===//
13
14	#include "ResourceScriptToken.h"
15	#include "llvm/ADT/StringExtras.h"
16	#include "llvm/Support/raw_ostream.h"
17
18	#include <algorithm>
19	#include <cassert>
20	#include <cctype>
21	#include <cstdlib>
22	#include <utility>
23
24	using namespace llvm;
25
26	using Kind = RCToken::Kind;
27
28	// Checks if Representation is a correct description of an RC integer.
29	// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
30	// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
31	// character (that is the difference between our representation and
32	// StringRef's one). If Representation is correct, 'true' is returned and
33	// the return value is put back in Num.
34	static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
35	size_t Length = Representation.size();
36	if (Length == `0`)
37	return false;
38	// Strip the last 'L' if unnecessary.
39	if (std::toupper(c: Representation.back()) == `'L'`)
40	Representation = Representation.drop_back(N: `1`);
41
42	return !Representation.getAsInteger<uint32_t>(Radix: `0`, Result&: Num);
43	}
44
45	RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
46	: TokenKind(RCTokenKind), TokenValue (Value) {}
47
48	uint32_t RCToken::intValue() const {
49	assert(TokenKind == Kind::Int);
50	// We assume that the token already is a correct integer (checked by
51	// rcGetAsInteger).
52	uint32_t Result;
53	bool IsSuccess = rcGetAsInteger(Representation: TokenValue, Num&: Result);
54	assert(IsSuccess);
55	(void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
56	return Result;
57	}
58
59	bool RCToken::isLongInt() const {
60	return TokenKind == Kind::Int && std::toupper(c: TokenValue.back()) == `'L'`;
61	}
62
63	StringRef RCToken::value() const { return TokenValue; }
64
65	Kind RCToken::kind() const { return TokenKind; }
66
67	bool RCToken::isBinaryOp() const {
68	switch (TokenKind) {
69	case Kind::Plus:
70	case Kind::Minus:
71	case Kind::Pipe:
72	case Kind::Amp:
73	return true;
74	default:
75	return false;
76	}
77	}
78
79	static Error getStringError(const Twine &message) {
80	return make_error<StringError>(Args: "Error parsing file: " + message,
81	Args: inconvertibleErrorCode());
82	}
83
84	namespace {
85
86	class Tokenizer {
87	public:
88	Tokenizer(StringRef Input) : Data (Input), DataLength(Input.size()), Pos(`0`) {}
89
90	Expected<std::vector<RCToken>> run();
91
92	private:
93	// All 'advancing' methods return boolean values; if they're equal to false,
94	// the stream has ended or failed.
95	bool advance(size_t Amount = `1`);
96	bool skipWhitespaces();
97
98	// Consumes a token. If any problem occurred, a non-empty Error is returned.
99	Error consumeToken(const Kind TokenKind);
100
101	// Check if tokenizer is about to read FollowingChars.
102	bool willNowRead(StringRef FollowingChars) const;
103
104	// Check if tokenizer can start reading an identifier at current position.
105	// The original tool did non specify the rules to determine what is a correct
106	// identifier. We assume they should follow the C convention:
107	// [a-zA-Z_][a-zA-Z0-9_].*
108	bool canStartIdentifier() const;
109	// Check if tokenizer can continue reading an identifier.
110	bool canContinueIdentifier() const;
111
112	// Check if tokenizer can start reading an integer.
113	// A correct integer always starts with a 0-9 digit,
114	// can contain characters 0-9A-Fa-f (digits),
115	// Ll (marking the integer is 32-bit), Xx (marking the representation
116	// is hexadecimal). As some kind of separator should come after the
117	// integer, we can consume the integer until a non-alphanumeric
118	// character.
119	bool canStartInt() const;
120	bool canContinueInt() const;
121
122	bool canStartString() const;
123
124	// Check if tokenizer can start reading a single line comment (e.g. a comment
125	// that begins with '//')
126	bool canStartLineComment() const;
127
128	// Check if tokenizer can start or finish reading a block comment (e.g. a
129	// comment that begins with '/' and ends with '/')
130	bool canStartBlockComment() const;
131
132	// Throw away all remaining characters on the current line.
133	void skipCurrentLine();
134
135	bool streamEof() const;
136
137	// Classify the token that is about to be read from the current position.
138	Kind classifyCurrentToken() const;
139
140	// Process the Kind::Identifier token - check if it is
141	// an identifier describing a block start or end.
142	void processIdentifier(RCToken &token) const;
143
144	StringRef Data;
145	size_t DataLength, Pos;
146	};
147
148	void Tokenizer::skipCurrentLine() {
149	Pos = Data.find_first_of(Chars: "\r\n", From: Pos);
150	Pos = Data.find_first_not_of(Chars: "\r\n", From: Pos);
151
152	if (Pos == StringRef::npos)
153	Pos = DataLength;
154	}
155
156	Expected<std::vector<RCToken>> Tokenizer::run() {
157	Pos = `0`;
158	std::vector<RCToken> Result;
159
160	// Consume an optional UTF-8 Byte Order Mark.
161	if (willNowRead(FollowingChars: "\xef\xbb\xbf"))
162	advance(Amount: `3`);
163
164	while (!streamEof()) {
165	if (!skipWhitespaces())
166	break;
167
168	Kind TokenKind = classifyCurrentToken();
169	if (TokenKind == Kind::Invalid)
170	return getStringError(message: "Invalid token found at position " + Twine (Pos));
171
172	const size_t TokenStart = Pos;
173	if (Error TokenError = consumeToken(TokenKind))
174	return std::move(TokenError);
175
176	// Comments are just deleted, don't bother saving them.
177	if (TokenKind == Kind::LineComment \|\| TokenKind == Kind::StartComment)
178	continue;
179
180	RCToken Token(TokenKind, Data.take_front(N: Pos).drop_front(N: TokenStart));
181	if (TokenKind == Kind::Identifier) {
182	processIdentifier(token&: Token);
183	} else if (TokenKind == Kind::Int) {
184	uint32_t TokenInt;
185	if (!rcGetAsInteger(Representation: Token.value(), Num&: TokenInt)) {
186	// The integer has incorrect format or cannot be represented in
187	// a 32-bit integer.
188	return getStringError(message: "Integer invalid or too large: " +
189	Token.value().str());
190	}
191	}
192
193	Result.push_back(x: Token);
194	}
195
196	return Result;
197	}
198
199	bool Tokenizer::advance(size_t Amount) {
200	Pos += Amount;
201	return !streamEof();
202	}
203
204	bool Tokenizer::skipWhitespaces() {
205	while (!streamEof() && isSpace(C: Data [Pos]))
206	advance();
207	return !streamEof();
208	}
209
210	Error Tokenizer::consumeToken(const Kind TokenKind) {
211	switch (TokenKind) {
212	// One-character token consumption.
213	#define TOKEN(Name)
214	#define SHORT_TOKEN(Name, Ch) case Kind::Name:
215	#include "ResourceScriptTokenList.def"
216	advance();
217	return Error::success();
218
219	case Kind::LineComment:
220	advance(Amount: `2`);
221	skipCurrentLine();
222	return Error::success();
223
224	case Kind::StartComment: {
225	advance(Amount: `2`);
226	auto EndPos = Data.find(Str: "*/", From: Pos);
227	if (EndPos == StringRef::npos)
228	return getStringError(
229	message: "Unclosed multi-line comment beginning at position " + Twine (Pos));
230	advance(Amount: EndPos - Pos);
231	advance(Amount: `2`);
232	return Error::success();
233	}
234	case Kind::Identifier:
235	while (!streamEof() && canContinueIdentifier())
236	advance();
237	return Error::success();
238
239	case Kind::Int:
240	while (!streamEof() && canContinueInt())
241	advance();
242	return Error::success();
243
244	case Kind::String:
245	// Consume the preceding 'L', if there is any.
246	if (std::toupper(c: Data [Pos]) == `'L'`)
247	advance();
248	// Consume the double-quote.
249	advance();
250
251	// Consume the characters until the end of the file, line or string.
252	while (true) {
253	if (streamEof()) {
254	return getStringError(message: "Unterminated string literal.");
255	} else if (Data [Pos] == `'"'`) {
256	// Consume the ending double-quote.
257	advance();
258	// However, if another '"' follows this double-quote, the string didn't
259	// end and we just included '"' into the string.
260	if (!willNowRead(FollowingChars: "\""))
261	return Error::success();
262	} else if (Data [Pos] == `'\n'`) {
263	return getStringError(message: "String literal not terminated in the line.");
264	}
265
266	advance();
267	}
268
269	case Kind::Invalid:
270	assert(false && "Cannot consume an invalid token.");
271	}
272
273	llvm_unreachable("Unknown RCToken::Kind");
274	}
275
276	bool Tokenizer::willNowRead(StringRef FollowingChars) const {
277	return Data.drop_front(N: Pos).starts_with(Prefix: FollowingChars);
278	}
279
280	bool Tokenizer::canStartIdentifier() const {
281	assert(!streamEof());
282
283	const char CurChar = Data [Pos];
284	return std::isalpha(CurChar) \|\| CurChar == `'_'` \|\| CurChar == `'.'`;
285	}
286
287	bool Tokenizer::canContinueIdentifier() const {
288	assert(!streamEof());
289	const char CurChar = Data [Pos];
290	return std::isalnum(CurChar) \|\| CurChar == `'_'` \|\| CurChar == `'.'` \|\|
291	CurChar == `'/'` \|\| CurChar == `'\\'` \|\| CurChar == `'-'`;
292	}
293
294	bool Tokenizer::canStartInt() const {
295	assert(!streamEof());
296	return std::isdigit(Data [Pos]);
297	}
298
299	bool Tokenizer::canStartBlockComment() const {
300	assert(!streamEof());
301	return Data.drop_front(N: Pos).starts_with(Prefix: "/*");
302	}
303
304	bool Tokenizer::canStartLineComment() const {
305	assert(!streamEof());
306	return Data.drop_front(N: Pos).starts_with(Prefix: "//");
307	}
308
309	bool Tokenizer::canContinueInt() const {
310	assert(!streamEof());
311	return std::isalnum(Data [Pos]);
312	}
313
314	bool Tokenizer::canStartString() const {
315	return willNowRead(FollowingChars: "\"") \|\| willNowRead(FollowingChars: "L\"") \|\| willNowRead(FollowingChars: "l\"");
316	}
317
318	bool Tokenizer::streamEof() const { return Pos == DataLength; }
319
320	Kind Tokenizer::classifyCurrentToken() const {
321	if (canStartBlockComment())
322	return Kind::StartComment;
323	if (canStartLineComment())
324	return Kind::LineComment;
325
326	if (canStartInt())
327	return Kind::Int;
328	if (canStartString())
329	return Kind::String;
330	// BEGIN and END are at this point of lexing recognized as identifiers.
331	if (canStartIdentifier())
332	return Kind::Identifier;
333
334	const char CurChar = Data [Pos];
335
336	switch (CurChar) {
337	// One-character token classification.
338	#define TOKEN(Name)
339	#define SHORT_TOKEN(Name, Ch) \
340	case Ch: \
341	return Kind::Name;
342	#include "ResourceScriptTokenList.def"
343
344	default:
345	return Kind::Invalid;
346	}
347	}
348
349	void Tokenizer::processIdentifier(RCToken &Token) const {
350	assert(Token.kind() == Kind::Identifier);
351	StringRef Name = Token.value();
352
353	if (Name.equals_insensitive(RHS: "begin"))
354	Token = RCToken (Kind::BlockBegin, Name);
355	else if (Name.equals_insensitive(RHS: "end"))
356	Token = RCToken (Kind::BlockEnd, Name);
357	}
358
359	} // anonymous namespace
360
361	namespace llvm {
362
363	Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
364	return Tokenizer (Input).run();
365	}
366
367	} // namespace llvm
368

source code of llvm/tools/llvm-rc/ResourceScriptToken.cpp