1 | //===--- Token.h - Token interface ------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines the Token interface. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef LLVM_CLANG_LEX_TOKEN_H |
14 | #define LLVM_CLANG_LEX_TOKEN_H |
15 | |
16 | #include "clang/Basic/SourceLocation.h" |
17 | #include "clang/Basic/TokenKinds.h" |
18 | #include "llvm/ADT/ArrayRef.h" |
19 | #include "llvm/ADT/StringRef.h" |
20 | #include <cassert> |
21 | |
22 | namespace clang { |
23 | |
24 | class IdentifierInfo; |
25 | class LangOptions; |
26 | |
27 | /// Token - This structure provides full information about a lexed token. |
28 | /// It is not intended to be space efficient, it is intended to return as much |
29 | /// information as possible about each returned token. This is expected to be |
30 | /// compressed into a smaller form if memory footprint is important. |
31 | /// |
32 | /// The parser can create a special "annotation token" representing a stream of |
33 | /// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>" |
34 | /// can be represented by a single typename annotation token that carries |
35 | /// information about the SourceRange of the tokens and the type object. |
36 | class Token { |
37 | /// The location of the token. This is actually a SourceLocation. |
38 | SourceLocation::UIntTy Loc; |
39 | |
40 | // Conceptually these next two fields could be in a union. However, this |
41 | // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical |
42 | // routine. Keeping as separate members with casts until a more beautiful fix |
43 | // presents itself. |
44 | |
45 | /// UintData - This holds either the length of the token text, when |
46 | /// a normal token, or the end of the SourceRange when an annotation |
47 | /// token. |
48 | SourceLocation::UIntTy UintData; |
49 | |
50 | /// PtrData - This is a union of four different pointer types, which depends |
51 | /// on what type of token this is: |
52 | /// Identifiers, keywords, etc: |
53 | /// This is an IdentifierInfo*, which contains the uniqued identifier |
54 | /// spelling. |
55 | /// Literals: isLiteral() returns true. |
56 | /// This is a pointer to the start of the token in a text buffer, which |
57 | /// may be dirty (have trigraphs / escaped newlines). |
58 | /// Annotations (resolved type names, C++ scopes, etc): isAnnotation(). |
59 | /// This is a pointer to sema-specific data for the annotation token. |
60 | /// Eof: |
61 | /// This is a pointer to a Decl. |
62 | /// Other: |
63 | /// This is null. |
64 | void *PtrData; |
65 | |
66 | /// Kind - The actual flavor of token this is. |
67 | tok::TokenKind Kind; |
68 | |
69 | /// Flags - Bits we track about this token, members of the TokenFlags enum. |
70 | unsigned short Flags; |
71 | |
72 | public: |
73 | // Various flags set per token: |
74 | enum TokenFlags { |
75 | StartOfLine = 0x01, // At start of line or only after whitespace |
76 | // (considering the line after macro expansion). |
77 | LeadingSpace = 0x02, // Whitespace exists before this token (considering |
78 | // whitespace after macro expansion). |
79 | DisableExpand = 0x04, // This identifier may never be macro expanded. |
80 | NeedsCleaning = 0x08, // Contained an escaped newline or trigraph. |
81 | LeadingEmptyMacro = 0x10, // Empty macro exists before this token. |
82 | HasUDSuffix = 0x20, // This string or character literal has a ud-suffix. |
83 | HasUCN = 0x40, // This identifier contains a UCN. |
84 | IgnoredComma = 0x80, // This comma is not a macro argument separator (MS). |
85 | StringifiedInMacro = 0x100, // This string or character literal is formed by |
86 | // macro stringizing or charizing operator. |
87 | CommaAfterElided = 0x200, // The comma following this token was elided (MS). |
88 | IsEditorPlaceholder = 0x400, // This identifier is a placeholder. |
89 | IsReinjected = 0x800, // A phase 4 token that was produced before and |
90 | // re-added, e.g. via EnterTokenStream. Annotation |
91 | // tokens are *not* reinjected. |
92 | }; |
93 | |
94 | tok::TokenKind getKind() const { return Kind; } |
95 | void setKind(tok::TokenKind K) { Kind = K; } |
96 | |
97 | /// is/isNot - Predicates to check if this token is a specific kind, as in |
98 | /// "if (Tok.is(tok::l_brace)) {...}". |
99 | bool is(tok::TokenKind K) const { return Kind == K; } |
100 | bool isNot(tok::TokenKind K) const { return Kind != K; } |
101 | bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const { |
102 | return is(K: K1) || is(K: K2); |
103 | } |
104 | template <typename... Ts> bool isOneOf(tok::TokenKind K1, Ts... Ks) const { |
105 | return is(K: K1) || isOneOf(Ks...); |
106 | } |
107 | |
108 | /// Return true if this is a raw identifier (when lexing |
109 | /// in raw mode) or a non-keyword identifier (when lexing in non-raw mode). |
110 | bool isAnyIdentifier() const { |
111 | return tok::isAnyIdentifier(K: getKind()); |
112 | } |
113 | |
114 | /// Return true if this is a "literal", like a numeric |
115 | /// constant, string, etc. |
116 | bool isLiteral() const { |
117 | return tok::isLiteral(K: getKind()); |
118 | } |
119 | |
120 | /// Return true if this is any of tok::annot_* kind tokens. |
121 | bool isAnnotation() const { return tok::isAnnotation(K: getKind()); } |
122 | |
123 | /// Return true if the token is a keyword that is parsed in the same |
124 | /// position as a standard attribute, but that has semantic meaning |
125 | /// and so cannot be a true attribute. |
126 | bool isRegularKeywordAttribute() const { |
127 | return tok::isRegularKeywordAttribute(K: getKind()); |
128 | } |
129 | |
130 | /// Return a source location identifier for the specified |
131 | /// offset in the current file. |
132 | SourceLocation getLocation() const { |
133 | return SourceLocation::getFromRawEncoding(Encoding: Loc); |
134 | } |
135 | unsigned getLength() const { |
136 | assert(!isAnnotation() && "Annotation tokens have no length field" ); |
137 | return UintData; |
138 | } |
139 | |
140 | void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); } |
141 | void setLength(unsigned Len) { |
142 | assert(!isAnnotation() && "Annotation tokens have no length field" ); |
143 | UintData = Len; |
144 | } |
145 | |
146 | SourceLocation getAnnotationEndLoc() const { |
147 | assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token" ); |
148 | return SourceLocation::getFromRawEncoding(Encoding: UintData ? UintData : Loc); |
149 | } |
150 | void setAnnotationEndLoc(SourceLocation L) { |
151 | assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token" ); |
152 | UintData = L.getRawEncoding(); |
153 | } |
154 | |
155 | SourceLocation getLastLoc() const { |
156 | return isAnnotation() ? getAnnotationEndLoc() : getLocation(); |
157 | } |
158 | |
159 | SourceLocation getEndLoc() const { |
160 | return isAnnotation() ? getAnnotationEndLoc() |
161 | : getLocation().getLocWithOffset(Offset: getLength()); |
162 | } |
163 | |
164 | /// SourceRange of the group of tokens that this annotation token |
165 | /// represents. |
166 | SourceRange getAnnotationRange() const { |
167 | return SourceRange(getLocation(), getAnnotationEndLoc()); |
168 | } |
169 | void setAnnotationRange(SourceRange R) { |
170 | setLocation(R.getBegin()); |
171 | setAnnotationEndLoc(R.getEnd()); |
172 | } |
173 | |
174 | const char *getName() const { return tok::getTokenName(Kind); } |
175 | |
176 | /// Reset all flags to cleared. |
177 | void startToken() { |
178 | Kind = tok::unknown; |
179 | Flags = 0; |
180 | PtrData = nullptr; |
181 | UintData = 0; |
182 | Loc = SourceLocation().getRawEncoding(); |
183 | } |
184 | |
185 | bool hasPtrData() const { return PtrData != nullptr; } |
186 | |
187 | IdentifierInfo *getIdentifierInfo() const { |
188 | assert(isNot(tok::raw_identifier) && |
189 | "getIdentifierInfo() on a tok::raw_identifier token!" ); |
190 | assert(!isAnnotation() && |
191 | "getIdentifierInfo() on an annotation token!" ); |
192 | if (isLiteral()) return nullptr; |
193 | if (is(K: tok::eof)) return nullptr; |
194 | return (IdentifierInfo*) PtrData; |
195 | } |
196 | void setIdentifierInfo(IdentifierInfo *II) { |
197 | PtrData = (void*) II; |
198 | } |
199 | |
200 | const void *getEofData() const { |
201 | assert(is(tok::eof)); |
202 | return reinterpret_cast<const void *>(PtrData); |
203 | } |
204 | void setEofData(const void *D) { |
205 | assert(is(tok::eof)); |
206 | assert(!PtrData); |
207 | PtrData = const_cast<void *>(D); |
208 | } |
209 | |
210 | /// getRawIdentifier - For a raw identifier token (i.e., an identifier |
211 | /// lexed in raw mode), returns a reference to the text substring in the |
212 | /// buffer if known. |
213 | StringRef getRawIdentifier() const { |
214 | assert(is(tok::raw_identifier)); |
215 | return StringRef(reinterpret_cast<const char *>(PtrData), getLength()); |
216 | } |
217 | void setRawIdentifierData(const char *Ptr) { |
218 | assert(is(tok::raw_identifier)); |
219 | PtrData = const_cast<char*>(Ptr); |
220 | } |
221 | |
222 | /// getLiteralData - For a literal token (numeric constant, string, etc), this |
223 | /// returns a pointer to the start of it in the text buffer if known, null |
224 | /// otherwise. |
225 | const char *getLiteralData() const { |
226 | assert(isLiteral() && "Cannot get literal data of non-literal" ); |
227 | return reinterpret_cast<const char*>(PtrData); |
228 | } |
229 | void setLiteralData(const char *Ptr) { |
230 | assert(isLiteral() && "Cannot set literal data of non-literal" ); |
231 | PtrData = const_cast<char*>(Ptr); |
232 | } |
233 | |
234 | void *getAnnotationValue() const { |
235 | assert(isAnnotation() && "Used AnnotVal on non-annotation token" ); |
236 | return PtrData; |
237 | } |
238 | void setAnnotationValue(void *val) { |
239 | assert(isAnnotation() && "Used AnnotVal on non-annotation token" ); |
240 | PtrData = val; |
241 | } |
242 | |
243 | /// Set the specified flag. |
244 | void setFlag(TokenFlags Flag) { |
245 | Flags |= Flag; |
246 | } |
247 | |
248 | /// Get the specified flag. |
249 | bool getFlag(TokenFlags Flag) const { |
250 | return (Flags & Flag) != 0; |
251 | } |
252 | |
253 | /// Unset the specified flag. |
254 | void clearFlag(TokenFlags Flag) { |
255 | Flags &= ~Flag; |
256 | } |
257 | |
258 | /// Return the internal represtation of the flags. |
259 | /// |
260 | /// This is only intended for low-level operations such as writing tokens to |
261 | /// disk. |
262 | unsigned getFlags() const { |
263 | return Flags; |
264 | } |
265 | |
266 | /// Set a flag to either true or false. |
267 | void setFlagValue(TokenFlags Flag, bool Val) { |
268 | if (Val) |
269 | setFlag(Flag); |
270 | else |
271 | clearFlag(Flag); |
272 | } |
273 | |
274 | /// isAtStartOfLine - Return true if this token is at the start of a line. |
275 | /// |
276 | bool isAtStartOfLine() const { return getFlag(Flag: StartOfLine); } |
277 | |
278 | /// Return true if this token has whitespace before it. |
279 | /// |
280 | bool hasLeadingSpace() const { return getFlag(Flag: LeadingSpace); } |
281 | |
282 | /// Return true if this identifier token should never |
283 | /// be expanded in the future, due to C99 6.10.3.4p2. |
284 | bool isExpandDisabled() const { return getFlag(Flag: DisableExpand); } |
285 | |
286 | /// Return true if we have an ObjC keyword identifier. |
287 | bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const; |
288 | |
289 | /// Return the ObjC keyword kind. |
290 | tok::ObjCKeywordKind getObjCKeywordID() const; |
291 | |
292 | bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const; |
293 | |
294 | /// Return true if this token has trigraphs or escaped newlines in it. |
295 | bool needsCleaning() const { return getFlag(Flag: NeedsCleaning); } |
296 | |
297 | /// Return true if this token has an empty macro before it. |
298 | /// |
299 | bool hasLeadingEmptyMacro() const { return getFlag(Flag: LeadingEmptyMacro); } |
300 | |
301 | /// Return true if this token is a string or character literal which |
302 | /// has a ud-suffix. |
303 | bool hasUDSuffix() const { return getFlag(Flag: HasUDSuffix); } |
304 | |
305 | /// Returns true if this token contains a universal character name. |
306 | bool hasUCN() const { return getFlag(Flag: HasUCN); } |
307 | |
308 | /// Returns true if this token is formed by macro by stringizing or charizing |
309 | /// operator. |
310 | bool stringifiedInMacro() const { return getFlag(Flag: StringifiedInMacro); } |
311 | |
312 | /// Returns true if the comma after this token was elided. |
313 | bool commaAfterElided() const { return getFlag(Flag: CommaAfterElided); } |
314 | |
315 | /// Returns true if this token is an editor placeholder. |
316 | /// |
317 | /// Editor placeholders are produced by the code-completion engine and are |
318 | /// represented as characters between '<#' and '#>' in the source code. The |
319 | /// lexer uses identifier tokens to represent placeholders. |
320 | bool isEditorPlaceholder() const { return getFlag(Flag: IsEditorPlaceholder); } |
321 | }; |
322 | |
323 | /// Information about the conditional stack (\#if directives) |
324 | /// currently active. |
325 | struct PPConditionalInfo { |
326 | /// Location where the conditional started. |
327 | SourceLocation IfLoc; |
328 | |
329 | /// True if this was contained in a skipping directive, e.g., |
330 | /// in a "\#if 0" block. |
331 | bool WasSkipping; |
332 | |
333 | /// True if we have emitted tokens already, and now we're in |
334 | /// an \#else block or something. Only useful in Skipping blocks. |
335 | bool FoundNonSkip; |
336 | |
337 | /// True if we've seen a \#else in this block. If so, |
338 | /// \#elif/\#else directives are not allowed. |
339 | bool FoundElse; |
340 | }; |
341 | |
342 | // Extra information needed for annonation tokens. |
343 | struct PragmaLoopHintInfo { |
344 | Token PragmaName; |
345 | Token Option; |
346 | ArrayRef<Token> Toks; |
347 | }; |
348 | } // end namespace clang |
349 | |
350 | #endif // LLVM_CLANG_LEX_TOKEN_H |
351 | |