Token.h source code [clang-tools-extra/clangd/support/Token.h]

1	//===--- Token.h - Tokens and token streams in the pseudoparser --- C++--===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// Tokens are the first level of abstraction above bytes used in pseudoparsing.
10	// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
11	// The tokens are wrapped into pseudo::Token, along with line/indent info.
12	//
13	// Unlike clang, we make multiple passes over the whole file, out-of-order.
14	// Therefore we retain the whole token sequence in memory. (This is feasible as
15	// we process one file at a time). pseudo::TokenStream holds such a stream.
16	// The initial stream holds the raw tokens read from the file, later passes
17	// operate on derived TokenStreams (e.g. with directives stripped).
18	//
19	// Similar facilities from clang that are *not* used:
20	// - SourceManager: designed around multiple files and precise macro expansion.
21	// - clang::Token: coupled to SourceManager, doesn't retain layout info.
22	// (pseudo::Token is similar, but without SourceLocations).
23	// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
24	// (pseudo::TokenStream is similar, but a flat token list).
25	//
26	//===----------------------------------------------------------------------===//
27
28	#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
29	#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
30
31	#include "clang/Basic/LLVM.h"
32	#include "clang/Basic/LangStandard.h"
33	#include "clang/Basic/TokenKinds.h"
34	#include "llvm/ADT/ArrayRef.h"
35	#include "llvm/ADT/STLForwardCompat.h"
36	#include "llvm/Support/raw_ostream.h"
37	#include <cstdint>
38	#include <limits>
39	#include <memory>
40	#include <vector>
41
42	namespace clang {
43	class LangOptions;
44	namespace clangd {
45
46	/// A single C++ or preprocessor token.
47	///
48	/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
49	/// SourceManager - we are not dealing with multiple files.
50	struct Token {
51	/// An Index identifies a token within a stream.
52	using Index = uint32_t;
53	/// A sentinel Index indicating no token.
54	constexpr static Index Invalid = std::numeric_limits<Index>::max();
55	struct Range;
56
57	/// The token text.
58	///
59	/// Typically from the original source file, but may have been synthesized.
60	StringRef text() const { return StringRef(Data, Length); }
61	const char Data = nullptr*;
62	uint32_t Length = `0`;
63
64	/// Zero-based line number for the start of the token.
65	/// This refers to the original source file as written.
66	uint32_t Line = `0`;
67	/// Width of whitespace before the first token on this line.
68	uint8_t Indent = `0`;
69	/// Flags have some meaning defined by the function that produced this stream.
70	uint8_t Flags = `0`;
71	/// Index into the original token stream (as raw-lexed from the source code).
72	Index OriginalIndex = Invalid;
73	// Helpers to get/set Flags based on `enum class`.
74	template <class T> bool flag(T Mask) const {
75	return Flags & uint8_t{llvm::to_underlying(Mask)};
76	}
77	template <class T> void setFlag(T Mask) {
78	Flags \|= uint8_t{llvm::to_underlying(Mask)};
79	}
80
81	/// Returns the next token in the stream. this may not be a sentinel.
82	const Token &next() const {
83	assert(Kind != tok::eof);
84	return (this* + `1`);
85	}
86	/// Returns the next token in the stream, skipping over comments.
87	const Token &nextNC() const {
88	const Token T = this*;
89	do
90	T = &T->next();
91	while (T->Kind == tok::comment);
92	return *T;
93	}
94	/// Returns the previous token in the stream. this may not be a sentinel.
95	const Token &prev() const {
96	assert(Kind != tok::eof);
97	return (this* - `1`);
98	}
99	/// Returns the bracket paired with this one, if any.
100	const Token pair() const* { return Pair == `0` ? nullptr : this + Pair; }
101
102	/// The type of token as determined by clang's lexer.
103	clang::tok::TokenKind Kind = clang::tok::unknown;
104	/// If this token is a paired bracket, the offset of the pair in the stream.
105	int32_t Pair = `0`;
106	};
107	static_assert(sizeof(Token) <= sizeof(char *) + `24`, "Careful with layout!");
108	llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
109
110	/// A half-open range of tokens within a stream.
111	struct Token::Range {
112	Index Begin = `0`;
113	Index End = `0`;
114
115	uint32_t size() const { return End - Begin; }
116	static Range emptyAt(Index Index) { return Range{.Begin: Index, .End: Index}; }
117	};
118	llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
119
120	/// A complete sequence of Tokens representing a source file.
121	///
122	/// This may match a raw file from disk, or be derived from a previous stream.
123	/// For example, stripping comments from a TokenStream results in a new stream.
124	///
125	/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
126	/// int main ( ) ;
127	/// eof kw_int ident l_paren r_paren semi eof
128	/// front() back()
129	/// 0 1 2 3 4 5
130	class TokenStream {
131	public:
132	/// Create an empty stream.
133	///
134	/// Initially, the stream is appendable and not finalized.
135	/// The token sequence may only be accessed after finalize() is called.
136	///
137	/// Payload is an opaque object which will be owned by the stream.
138	/// e.g. an allocator to hold backing storage for synthesized token text.
139	explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
140
141	/// Append a token to the stream, which must not be finalized.
142	void push(Token T) {
143	assert(!isFinalized());
144	Storage.push_back(x: std::move(T));
145	}
146
147	/// Finalize the token stream, allowing tokens to be accessed.
148	/// Tokens may no longer be appended.
149	void finalize();
150	bool isFinalized() const;
151
152	/// Returns the index of T within the stream.
153	///
154	/// T must be within the stream or the end sentinel (not the start sentinel).
155	Token::Index index(const Token &T) const {
156	assert(isFinalized());
157	assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
158	assert(&T != Storage.data() && "start sentinel");
159	return &T - Tokens.data();
160	}
161
162	ArrayRef<Token> tokens() const {
163	assert(isFinalized());
164	return Tokens;
165	}
166	ArrayRef<Token> tokens(Token::Range R) const {
167	return tokens().slice(N: R.Begin, M: R.End - R.Begin);
168	}
169
170	MutableArrayRef<Token> tokens() {
171	assert(isFinalized());
172	return Tokens;
173	}
174
175	/// May return the end sentinel if the stream is empty.
176	const Token &front() const {
177	assert(isFinalized());
178	return Storage [`1`];
179	}
180
181	/// Returns the shared payload.
182	std::shared_ptr<void> getPayload() const { return Payload; }
183	/// Adds the given payload to the stream.
184	void addPayload(std::shared_ptr<void> P) {
185	if (!Payload)
186	Payload = std::move(P);
187	else
188	Payload = std::make_shared<
189	std::pair<std::shared_ptr<void>, std::shared_ptr<void>>>(
190	args: std::move(P), args: std::move(Payload));
191	}
192
193	/// Print the tokens in this stream to the output stream.
194	///
195	/// The presence of newlines/spaces is preserved, but not the quantity.
196	void print(llvm::raw_ostream &) const;
197
198	private:
199	std::shared_ptr<void> Payload;
200
201	MutableArrayRef<Token> Tokens;
202	std::vector<Token> Storage; // eof + Tokens + eof
203	};
204	llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
205
206	/// Extracts a raw token stream from the source code.
207	///
208	/// All tokens will reference the data of the provided string.
209	/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
210	TokenStream lex(const std::string &, const clang::LangOptions &);
211	enum class LexFlags : uint8_t {
212	/// Marks the token at the start of a logical preprocessor line.
213	/// This is a position where a directive might start.
214	///
215	/// Here, the first # is StartsPPLine, but second is not (same logical line).
216	/// #define X(error) \
217	/// #error // not a directive!
218	///
219	/// Careful, the directive may not start exactly on the StartsPPLine token:
220	/// /comment/ #include <foo.h>
221	StartsPPLine = `1` << `0`,
222	/// Marks tokens containing trigraphs, escaped newlines, UCNs etc.
223	/// The text() of such tokens will contain the raw trigrah.
224	NeedsCleaning = `1` << `1`,
225	};
226	/// A generic lang options suitable for lexing/parsing a langage.
227	clang::LangOptions genericLangOpts(
228	clang::Language = clang::Language::CXX,
229	clang::LangStandard::Kind = clang::LangStandard::lang_unspecified);
230
231	/// Decoding raw tokens written in the source code, returning a derived stream.
232	///
233	/// - escaped newlines within tokens are removed
234	/// - trigraphs are replaced with the characters they encode
235	/// - UCNs within raw_identifiers are replaced by the characters they encode
236	/// (UCNs within strings, comments etc are not translated)
237	/// - raw_identifier tokens are assigned their correct keyword type
238	/// - the >> token is split into separate > > tokens
239	/// (we use a modified grammar where >> is a nonterminal, not a token)
240	///
241	/// The StartsPPLine flag is preserved.
242	///
243	/// Formally the identifier correctly happens before preprocessing, while we
244	/// should only cook raw_identifiers that survive preprocessing.
245	/// However, ignoring the Token::Kind of tokens in directives achieves the same.
246	/// (And having cooked token kinds in PP-disabled sections is useful for us).
247	TokenStream cook(const TokenStream &, const clang::LangOptions &);
248
249	/// Drops comment tokens.
250	TokenStream stripComments(const TokenStream &);
251
252	} // namespace clangd
253	} // namespace clang
254
255	#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
256

source code of clang-tools-extra/clangd/support/Token.h