Lexer.cpp source code [clang/lib/Lex/Lexer.cpp]

1	//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements the Lexer and Token interfaces.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "clang/Lex/Lexer.h"
14	#include "UnicodeCharSets.h"
15	#include "clang/Basic/CharInfo.h"
16	#include "clang/Basic/Diagnostic.h"
17	#include "clang/Basic/IdentifierTable.h"
18	#include "clang/Basic/LLVM.h"
19	#include "clang/Basic/LangOptions.h"
20	#include "clang/Basic/SourceLocation.h"
21	#include "clang/Basic/SourceManager.h"
22	#include "clang/Basic/TokenKinds.h"
23	#include "clang/Lex/LexDiagnostic.h"
24	#include "clang/Lex/LiteralSupport.h"
25	#include "clang/Lex/MultipleIncludeOpt.h"
26	#include "clang/Lex/Preprocessor.h"
27	#include "clang/Lex/PreprocessorOptions.h"
28	#include "clang/Lex/Token.h"
29	#include "llvm/ADT/STLExtras.h"
30	#include "llvm/ADT/StringExtras.h"
31	#include "llvm/ADT/StringRef.h"
32	#include "llvm/ADT/StringSwitch.h"
33	#include "llvm/Support/Compiler.h"
34	#include "llvm/Support/ConvertUTF.h"
35	#include "llvm/Support/MemoryBufferRef.h"
36	#include "llvm/Support/NativeFormatting.h"
37	#include "llvm/Support/Unicode.h"
38	#include "llvm/Support/UnicodeCharRanges.h"
39	#include <algorithm>
40	#include <cassert>
41	#include <cstddef>
42	#include <cstdint>
43	#include <cstring>
44	#include <optional>
45	#include <string>
46	#include <tuple>
47	#include <utility>
48
49	#ifdef __SSE4_2__
50	#include <nmmintrin.h>
51	#endif
52
53	using namespace clang;
54
55	//===----------------------------------------------------------------------===//
56	// Token Class Implementation
57	//===----------------------------------------------------------------------===//
58
59	/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
60	bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
61	if (isAnnotation())
62	return false;
63	if (const IdentifierInfo *II = getIdentifierInfo())
64	return II->getObjCKeywordID() == objcKey;
65	return false;
66	}
67
68	/// getObjCKeywordID - Return the ObjC keyword kind.
69	tok::ObjCKeywordKind Token::getObjCKeywordID() const {
70	if (isAnnotation())
71	return tok::objc_not_keyword;
72	const IdentifierInfo *specId = getIdentifierInfo();
73	return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
74	}
75
76	/// Determine whether the token kind starts a simple-type-specifier.
77	bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {
78	switch (getKind()) {
79	case tok::annot_typename:
80	case tok::annot_decltype:
81	case tok::annot_pack_indexing_type:
82	return true;
83
84	case tok::kw_short:
85	case tok::kw_long:
86	case tok::kw___int64:
87	case tok::kw___int128:
88	case tok::kw_signed:
89	case tok::kw_unsigned:
90	case tok::kw_void:
91	case tok::kw_char:
92	case tok::kw_int:
93	case tok::kw_half:
94	case tok::kw_float:
95	case tok::kw_double:
96	case tok::kw___bf16:
97	case tok::kw__Float16:
98	case tok::kw___float128:
99	case tok::kw___ibm128:
100	case tok::kw_wchar_t:
101	case tok::kw_bool:
102	case tok::kw__Bool:
103	case tok::kw__Accum:
104	case tok::kw__Fract:
105	case tok::kw__Sat:
106	#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
107	#include "clang/Basic/TransformTypeTraits.def"
108	case tok::kw___auto_type:
109	case tok::kw_char16_t:
110	case tok::kw_char32_t:
111	case tok::kw_typeof:
112	case tok::kw_decltype:
113	case tok::kw_char8_t:
114	return getIdentifierInfo()->isKeyword(LangOpts);
115
116	default:
117	return false;
118	}
119	}
120
121	//===----------------------------------------------------------------------===//
122	// Lexer Class Implementation
123	//===----------------------------------------------------------------------===//
124
125	void Lexer::anchor() {}
126
127	void Lexer::InitLexer(const char BufStart, const* char *BufPtr,
128	const char *BufEnd) {
129	BufferStart = BufStart;
130	BufferPtr = BufPtr;
131	BufferEnd = BufEnd;
132
133	assert(BufEnd[`0`] == `0` &&
134	"We assume that the input buffer has a null character at the end"
135	" to simplify lexing!");
136
137	// Check whether we have a BOM in the beginning of the buffer. If yes - act
138	// accordingly. Right now we support only UTF-8 with and without BOM, so, just
139	// skip the UTF-8 BOM if it's present.
140	if (BufferStart == BufferPtr) {
141	// Determine the size of the BOM.
142	StringRef Buf(BufferStart, BufferEnd - BufferStart);
143	size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
144	.StartsWith(S: "\xEF\xBB\xBF", Value: `3`) // UTF-8 BOM
145	.Default(Value: `0`);
146
147	// Skip the BOM.
148	BufferPtr += BOMLength;
149	}
150
151	Is_PragmaLexer = false;
152	CurrentConflictMarkerState = CMK_None;
153
154	// Start of the file is a start of line.
155	IsAtStartOfLine = true;
156	IsAtPhysicalStartOfLine = true;
157
158	HasLeadingSpace = false;
159	HasLeadingEmptyMacro = false;
160
161	// We are not after parsing a #.
162	ParsingPreprocessorDirective = false;
163
164	// We are not after parsing #include.
165	ParsingFilename = false;
166
167	// We are not in raw mode. Raw mode disables diagnostics and interpretation
168	// of tokens (e.g. identifiers, thus disabling macro expansion). It is used
169	// to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
170	// or otherwise skipping over tokens.
171	LexingRawMode = false;
172
173	// Default to not keeping comments.
174	ExtendedTokenMode = `0`;
175
176	NewLinePtr = nullptr;
177	}
178
179	/// Lexer constructor - Create a new lexer object for the specified buffer
180	/// with the specified preprocessor managing the lexing process. This lexer
181	/// assumes that the associated file buffer and Preprocessor objects will
182	/// outlive it, so it doesn't take ownership of either of them.
183	Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
184	Preprocessor &PP, bool IsFirstIncludeOfFile)
185	: PreprocessorLexer (&PP, FID),
186	FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
187	LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
188	IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
189	InitLexer(BufStart: InputFile.getBufferStart(), BufPtr: InputFile.getBufferStart(),
190	BufEnd: InputFile.getBufferEnd());
191
192	resetExtendedTokenMode();
193	}
194
195	/// Lexer constructor - Create a new raw lexer object. This object is only
196	/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
197	/// range will outlive it, so it doesn't take ownership of it.
198	Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
199	const char BufStart, const* char BufPtr, const* char *BufEnd,
200	bool IsFirstIncludeOfFile)
201	: FileLoc (fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
202	IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
203	InitLexer(BufStart, BufPtr, BufEnd);
204
205	// We are* in raw mode.*
206	LexingRawMode = true;
207	}
208
209	/// Lexer constructor - Create a new raw lexer object. This object is only
210	/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
211	/// range will outlive it, so it doesn't take ownership of it.
212	Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
213	const SourceManager &SM, const LangOptions &langOpts,
214	bool IsFirstIncludeOfFile)
215	: Lexer (SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
216	FromFile.getBufferStart(), FromFile.getBufferEnd(),
217	IsFirstIncludeOfFile) {}
218
219	void Lexer::resetExtendedTokenMode() {
220	assert(PP && "Cannot reset token mode without a preprocessor");
221	if (LangOpts.TraditionalCPP)
222	SetKeepWhitespaceMode(true);
223	else
224	SetCommentRetentionState(PP->getCommentRetentionState());
225	}
226
227	/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
228	/// _Pragma expansion. This has a variety of magic semantics that this method
229	/// sets up. It returns a new'd Lexer that must be delete'd when done.
230	///
231	/// On entrance to this routine, TokStartLoc is a macro location which has a
232	/// spelling loc that indicates the bytes to be lexed for the token and an
233	/// expansion location that indicates where all lexed tokens should be
234	/// "expanded from".
235	///
236	/// TODO: It would really be nice to make _Pragma just be a wrapper around a
237	/// normal lexer that remaps tokens as they fly by. This would require making
238	/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
239	/// interface that could handle this stuff. This would pull GetMappedTokenLoc
240	/// out of the critical path of the lexer!
241	///
242	Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
243	SourceLocation ExpansionLocStart,
244	SourceLocation ExpansionLocEnd,
245	unsigned TokLen, Preprocessor &PP) {
246	SourceManager &SM = PP.getSourceManager();
247
248	// Create the lexer as if we were going to lex the file normally.
249	FileID SpellingFID = SM.getFileID(SpellingLoc);
250	llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(FID: SpellingFID);
251	Lexer L = new* Lexer (SpellingFID, InputFile, PP);
252
253	// Now that the lexer is created, change the start/end locations so that we
254	// just lex the subsection of the file that we want. This is lexing from a
255	// scratch buffer.
256	const char *StrData = SM.getCharacterData(SL: SpellingLoc);
257
258	L->BufferPtr = StrData;
259	L->BufferEnd = StrData+TokLen;
260	assert(L->BufferEnd[`0`] == `0` && "Buffer is not nul terminated!");
261
262	// Set the SourceLocation with the remapping information. This ensures that
263	// GetMappedTokenLoc will remap the tokens as they are lexed.
264	L->FileLoc = SM.createExpansionLoc(SpellingLoc: SM.getLocForStartOfFile(FID: SpellingFID),
265	ExpansionLocStart,
266	ExpansionLocEnd, Length: TokLen);
267
268	// Ensure that the lexer thinks it is inside a directive, so that end \n will
269	// return an EOD token.
270	L->ParsingPreprocessorDirective = true;
271
272	// This lexer really is for _Pragma.
273	L->Is_PragmaLexer = true;
274	return L;
275	}
276
277	void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
278	this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
279	this->IsAtStartOfLine = IsAtStartOfLine;
280	assert((BufferStart + Offset) <= BufferEnd);
281	BufferPtr = BufferStart + Offset;
282	}
283
/// Escape \p Str in place so it can be embedded in a quoted literal.
///
/// - Every backslash and every occurrence of \p Quote is prefixed with a
///   backslash.
/// - A lone '\n' or '\r' becomes the two characters '\\' 'n'.
/// - A mixed pair ("\r\n" or "\n\r") collapses to a single '\\' 'n'; two
///   identical newline characters in a row are escaped individually.
///
/// \tparam T a character container providing size(), begin(), operator[] and
///           insert(iterator, char).
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  // 'e' tracks the current size; it grows whenever a character is inserted.
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      Str.insert(Str.begin() + i, '\\');
      // Step over both the inserted backslash and the escaped character.
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Replace '\n' and '\r' to '\\' followed by 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}
308
309	std::string Lexer::Stringify(StringRef Str, bool Charify) {
310	std::string Result = std::string (Str);
311	char Quote = Charify ? `'\''` : `'"'`;
312	StringifyImpl(Str&: Result, Quote);
313	return Result;
314	}
315
316	void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, Quote: `'"'`); }
317
318	//===----------------------------------------------------------------------===//
319	// Token Spelling
320	//===----------------------------------------------------------------------===//
321
322	/// Slow case of getSpelling. Extract the characters comprising the
323	/// spelling of this token from the provided input buffer.
324	static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
325	const LangOptions &LangOpts, char *Spelling) {
326	assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
327
328	size_t Length = `0`;
329	const char *BufEnd = BufPtr + Tok.getLength();
330
331	if (tok::isStringLiteral(K: Tok.getKind())) {
332	// Munch the encoding-prefix and opening double-quote.
333	while (BufPtr < BufEnd) {
334	auto CharAndSize = Lexer::getCharAndSizeNoWarn(Ptr: BufPtr, LangOpts);
335	Spelling[Length++] = CharAndSize.Char;
336	BufPtr += CharAndSize.Size;
337
338	if (Spelling[Length - `1`] == `'"'`)
339	break;
340	}
341
342	// Raw string literals need special handling; trigraph expansion and line
343	// splicing do not occur within their d-char-sequence nor within their
344	// r-char-sequence.
345	if (Length >= `2` &&
346	Spelling[Length - `2`] == `'R'` && Spelling[Length - `1`] == `'"'`) {
347	// Search backwards from the end of the token to find the matching closing
348	// quote.
349	const char *RawEnd = BufEnd;
350	do --RawEnd; while (*RawEnd != `'"'`);
351	size_t RawLength = RawEnd - BufPtr + `1`;
352
353	// Everything between the quotes is included verbatim in the spelling.
354	memcpy(dest: Spelling + Length, src: BufPtr, n: RawLength);
355	Length += RawLength;
356	BufPtr += RawLength;
357
358	// The rest of the token is lexed normally.
359	}
360	}
361
362	while (BufPtr < BufEnd) {
363	auto CharAndSize = Lexer::getCharAndSizeNoWarn(Ptr: BufPtr, LangOpts);
364	Spelling[Length++] = CharAndSize.Char;
365	BufPtr += CharAndSize.Size;
366	}
367
368	assert(Length < Tok.getLength() &&
369	"NeedsCleaning flag set on token that didn't need cleaning!");
370	return Length;
371	}
372
373	/// getSpelling() - Return the 'spelling' of this token. The spelling of a
374	/// token are the characters used to represent the token in the source file
375	/// after trigraph expansion and escaped-newline folding. In particular, this
376	/// wants to get the true, uncanonicalized, spelling of things like digraphs
377	/// UCNs, etc.
378	StringRef Lexer::getSpelling(SourceLocation loc,
379	SmallVectorImpl<char> &buffer,
380	const SourceManager &SM,
381	const LangOptions &options,
382	bool *invalid) {
383	// Break down the source location.
384	std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(Loc: loc);
385
386	// Try to the load the file buffer.
387	bool invalidTemp = false;
388	StringRef file = SM.getBufferData(FID: locInfo.first, Invalid: &invalidTemp);
389	if (invalidTemp) {
390	if (invalid) invalid = true*;
391	return {};
392	}
393
394	const char *tokenBegin = file.data() + locInfo.second;
395
396	// Lex from the start of the given location.
397	Lexer lexer(SM.getLocForStartOfFile(FID: locInfo.first), options,
398	file.begin(), tokenBegin, file.end());
399	Token token;
400	lexer.LexFromRawLexer(Result&: token);
401
402	unsigned length = token.getLength();
403
404	// Common case: no need for cleaning.
405	if (!token.needsCleaning())
406	return StringRef(tokenBegin, length);
407
408	// Hard case, we need to relex the characters into the string.
409	buffer.resize(N: length);
410	buffer.resize(N: getSpellingSlow(Tok: token, BufPtr: tokenBegin, LangOpts: options, Spelling: buffer.data()));
411	return StringRef(buffer.data(), buffer.size());
412	}
413
414	/// getSpelling() - Return the 'spelling' of this token. The spelling of a
415	/// token are the characters used to represent the token in the source file
416	/// after trigraph expansion and escaped-newline folding. In particular, this
417	/// wants to get the true, uncanonicalized, spelling of things like digraphs
418	/// UCNs, etc.
419	std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
420	const LangOptions &LangOpts, bool *Invalid) {
421	assert((int)Tok.getLength() >= `0` && "Token character range is bogus!");
422
423	bool CharDataInvalid = false;
424	const char *TokStart = SourceMgr.getCharacterData(SL: Tok.getLocation(),
425	Invalid: &CharDataInvalid);
426	if (Invalid)
427	*Invalid = CharDataInvalid;
428	if (CharDataInvalid)
429	return {};
430
431	// If this token contains nothing interesting, return it directly.
432	if (!Tok.needsCleaning())
433	return std::string (TokStart, TokStart + Tok.getLength());
434
435	std::string Result;
436	Result.resize(n: Tok.getLength());
437	Result.resize(n: getSpellingSlow(Tok, BufPtr: TokStart, LangOpts, Spelling: &*Result.begin()));
438	return Result;
439	}
440
441	/// getSpelling - This method is used to get the spelling of a token into a
442	/// preallocated buffer, instead of as an std::string. The caller is required
443	/// to allocate enough space for the token, which is guaranteed to be at least
444	/// Tok.getLength() bytes long. The actual length of the token is returned.
445	///
446	/// Note that this method may do two possible things: it may either fill in
447	/// the buffer specified with characters, or it may change the input pointer
448	/// to point to a constant buffer with the data already in it (avoiding a
449	/// copy). The caller is not allowed to modify the returned buffer pointer
450	/// if an internal buffer is returned.
451	unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
452	const SourceManager &SourceMgr,
453	const LangOptions &LangOpts, bool *Invalid) {
454	assert((int)Tok.getLength() >= `0` && "Token character range is bogus!");
455
456	const char TokStart = nullptr*;
457	// NOTE: this has to be checked before* testing for an IdentifierInfo.*
458	if (Tok.is(K: tok::raw_identifier))
459	TokStart = Tok.getRawIdentifier().data();
460	else if (!Tok.hasUCN()) {
461	if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
462	// Just return the string from the identifier table, which is very quick.
463	Buffer = II->getNameStart();
464	return II->getLength();
465	}
466	}
467
468	// NOTE: this can be checked even after testing for an IdentifierInfo.
469	if (Tok.isLiteral())
470	TokStart = Tok.getLiteralData();
471
472	if (!TokStart) {
473	// Compute the start of the token in the input lexer buffer.
474	bool CharDataInvalid = false;
475	TokStart = SourceMgr.getCharacterData(SL: Tok.getLocation(), Invalid: &CharDataInvalid);
476	if (Invalid)
477	*Invalid = CharDataInvalid;
478	if (CharDataInvalid) {
479	Buffer = "";
480	return `0`;
481	}
482	}
483
484	// If this token contains nothing interesting, return it directly.
485	if (!Tok.needsCleaning()) {
486	Buffer = TokStart;
487	return Tok.getLength();
488	}
489
490	// Otherwise, hard case, relex the characters into the string.
491	return getSpellingSlow(Tok, BufPtr: TokStart, LangOpts, Spelling: const_cast<char*>(Buffer));
492	}
493
494	/// MeasureTokenLength - Relex the token at the specified location and return
495	/// its length in bytes in the input file. If the token needs cleaning (e.g.
496	/// includes a trigraph or an escaped newline) then this count includes bytes
497	/// that are part of that.
498	unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
499	const SourceManager &SM,
500	const LangOptions &LangOpts) {
501	Token TheTok;
502	if (getRawToken(Loc, Result&: TheTok, SM, LangOpts))
503	return `0`;
504	return TheTok.getLength();
505	}
506
507	/// Relex the token at the specified location.
508	/// \returns true if there was a failure, false on success.
509	bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
510	const SourceManager &SM,
511	const LangOptions &LangOpts,
512	bool IgnoreWhiteSpace) {
513	// TODO: this could be special cased for common tokens like identifiers, ')',
514	// etc to make this faster, if it mattered. Just look at StrData[0] to handle
515	// all obviously single-char tokens. This could use
516	// Lexer::isObviouslySimpleCharacter for example to handle identifiers or
517	// something.
518
519	// If this comes from a macro expansion, we really do want the macro name, not
520	// the token this macro expanded to.
521	Loc = SM.getExpansionLoc(Loc);
522	std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
523	bool Invalid = false;
524	StringRef Buffer = SM.getBufferData(FID: LocInfo.first, Invalid: &Invalid);
525	if (Invalid)
526	return true;
527
528	const char *StrData = Buffer.data()+LocInfo.second;
529
530	if (!IgnoreWhiteSpace && isWhitespace(c: SkipEscapedNewLines(P: StrData)[`0`]))
531	return true;
532
533	// Create a lexer starting at the beginning of this token.
534	Lexer TheLexer(SM.getLocForStartOfFile(FID: LocInfo.first), LangOpts,
535	Buffer.begin(), StrData, Buffer.end());
536	TheLexer.SetCommentRetentionState(true);
537	TheLexer.LexFromRawLexer(Result);
538	return false;
539	}
540
541	/// Returns the pointer that points to the beginning of line that contains
542	/// the given offset, or null if the offset if invalid.
543	static const char findBeginningOfLine(StringRef Buffer, unsigned* Offset) {
544	const char *BufStart = Buffer.data();
545	if (Offset >= Buffer.size())
546	return nullptr;
547
548	const char *LexStart = BufStart + Offset;
549	for (; LexStart != BufStart; --LexStart) {
550	if (isVerticalWhitespace(c: LexStart[`0`]) &&
551	!Lexer::isNewLineEscaped(BufferStart: BufStart, Str: LexStart)) {
552	// LexStart should point at first character of logical line.
553	++LexStart;
554	break;
555	}
556	}
557	return LexStart;
558	}
559
560	static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
561	const SourceManager &SM,
562	const LangOptions &LangOpts) {
563	assert(Loc.isFileID());
564	std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
565	if (LocInfo.first.isInvalid())
566	return Loc;
567
568	bool Invalid = false;
569	StringRef Buffer = SM.getBufferData(FID: LocInfo.first, Invalid: &Invalid);
570	if (Invalid)
571	return Loc;
572
573	// Back up from the current location until we hit the beginning of a line
574	// (or the buffer). We'll relex from that point.
575	const char *StrData = Buffer.data() + LocInfo.second;
576	const char *LexStart = findBeginningOfLine(Buffer, Offset: LocInfo.second);
577	if (!LexStart \|\| LexStart == StrData)
578	return Loc;
579
580	// Create a lexer starting at the beginning of this token.
581	SourceLocation LexerStartLoc = Loc.getLocWithOffset(Offset: -LocInfo.second);
582	Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
583	Buffer.end());
584	TheLexer.SetCommentRetentionState(true);
585
586	// Lex tokens until we find the token that contains the source location.
587	Token TheTok;
588	do {
589	TheLexer.LexFromRawLexer(Result&: TheTok);
590
591	if (TheLexer.getBufferLocation() > StrData) {
592	// Lexing this token has taken the lexer past the source location we're
593	// looking for. If the current token encompasses our source location,
594	// return the beginning of that token.
595	if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
596	return TheTok.getLocation();
597
598	// We ended up skipping over the source location entirely, which means
599	// that it points into whitespace. We're done here.
600	break;
601	}
602	} while (TheTok.getKind() != tok::eof);
603
604	// We've passed our source location; just return the original source location.
605	return Loc;
606	}
607
608	SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
609	const SourceManager &SM,
610	const LangOptions &LangOpts) {
611	if (Loc.isFileID())
612	return getBeginningOfFileToken(Loc, SM, LangOpts);
613
614	if (!SM.isMacroArgExpansion(Loc))
615	return Loc;
616
617	SourceLocation FileLoc = SM.getSpellingLoc(Loc);
618	SourceLocation BeginFileLoc = getBeginningOfFileToken(Loc: FileLoc, SM, LangOpts);
619	std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(Loc: FileLoc);
620	std::pair<FileID, unsigned> BeginFileLocInfo =
621	SM.getDecomposedLoc(Loc: BeginFileLoc);
622	assert(FileLocInfo.first == BeginFileLocInfo.first &&
623	FileLocInfo.second >= BeginFileLocInfo.second);
624	return Loc.getLocWithOffset(Offset: BeginFileLocInfo.second - FileLocInfo.second);
625	}
626
namespace {

/// How Lexer::ComputePreamble treats a preprocessor directive it encounters.
enum PreambleDirectiveKind {
  /// A recognised directive; scanning continues past it.
  PDK_Skipped,
  /// An unrecognised directive; scanning stops at its '#'.
  PDK_Unknown
};

} // namespace
635
636	PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
637	const LangOptions &LangOpts,
638	unsigned MaxLines) {
639	// Create a lexer starting at the beginning of the file. Note that we use a
640	// "fake" file source location at offset 1 so that the lexer will track our
641	// position within the file.
642	const SourceLocation::UIntTy StartOffset = `1`;
643	SourceLocation FileLoc = SourceLocation::getFromRawEncoding(Encoding: StartOffset);
644	Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
645	Buffer.end());
646	TheLexer.SetCommentRetentionState(true);
647
648	bool InPreprocessorDirective = false;
649	Token TheTok;
650	SourceLocation ActiveCommentLoc;
651
652	unsigned MaxLineOffset = `0`;
653	if (MaxLines) {
654	const char *CurPtr = Buffer.begin();
655	unsigned CurLine = `0`;
656	while (CurPtr != Buffer.end()) {
657	char ch = *CurPtr++;
658	if (ch == `'\n'`) {
659	++CurLine;
660	if (CurLine == MaxLines)
661	break;
662	}
663	}
664	if (CurPtr != Buffer.end())
665	MaxLineOffset = CurPtr - Buffer.begin();
666	}
667
668	do {
669	TheLexer.LexFromRawLexer(Result&: TheTok);
670
671	if (InPreprocessorDirective) {
672	// If we've hit the end of the file, we're done.
673	if (TheTok.getKind() == tok::eof) {
674	break;
675	}
676
677	// If we haven't hit the end of the preprocessor directive, skip this
678	// token.
679	if (!TheTok.isAtStartOfLine())
680	continue;
681
682	// We've passed the end of the preprocessor directive, and will look
683	// at this token again below.
684	InPreprocessorDirective = false;
685	}
686
687	// Keep track of the # of lines in the preamble.
688	if (TheTok.isAtStartOfLine()) {
689	unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
690
691	// If we were asked to limit the number of lines in the preamble,
692	// and we're about to exceed that limit, we're done.
693	if (MaxLineOffset && TokOffset >= MaxLineOffset)
694	break;
695	}
696
697	// Comments are okay; skip over them.
698	if (TheTok.getKind() == tok::comment) {
699	if (ActiveCommentLoc.isInvalid())
700	ActiveCommentLoc = TheTok.getLocation();
701	continue;
702	}
703
704	if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
705	// This is the start of a preprocessor directive.
706	Token HashTok = TheTok;
707	InPreprocessorDirective = true;
708	ActiveCommentLoc = SourceLocation ();
709
710	// Figure out which directive this is. Since we're lexing raw tokens,
711	// we don't have an identifier table available. Instead, just look at
712	// the raw identifier to recognize and categorize preprocessor directives.
713	TheLexer.LexFromRawLexer(Result&: TheTok);
714	if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
715	StringRef Keyword = TheTok.getRawIdentifier();
716	PreambleDirectiveKind PDK
717	= llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
718	.Case(S: "include", Value: PDK_Skipped)
719	.Case(S: "__include_macros", Value: PDK_Skipped)
720	.Case(S: "define", Value: PDK_Skipped)
721	.Case(S: "undef", Value: PDK_Skipped)
722	.Case(S: "line", Value: PDK_Skipped)
723	.Case(S: "error", Value: PDK_Skipped)
724	.Case(S: "pragma", Value: PDK_Skipped)
725	.Case(S: "import", Value: PDK_Skipped)
726	.Case(S: "include_next", Value: PDK_Skipped)
727	.Case(S: "warning", Value: PDK_Skipped)
728	.Case(S: "ident", Value: PDK_Skipped)
729	.Case(S: "sccs", Value: PDK_Skipped)
730	.Case(S: "assert", Value: PDK_Skipped)
731	.Case(S: "unassert", Value: PDK_Skipped)
732	.Case(S: "if", Value: PDK_Skipped)
733	.Case(S: "ifdef", Value: PDK_Skipped)
734	.Case(S: "ifndef", Value: PDK_Skipped)
735	.Case(S: "elif", Value: PDK_Skipped)
736	.Case(S: "elifdef", Value: PDK_Skipped)
737	.Case(S: "elifndef", Value: PDK_Skipped)
738	.Case(S: "else", Value: PDK_Skipped)
739	.Case(S: "endif", Value: PDK_Skipped)
740	.Default(Value: PDK_Unknown);
741
742	switch (PDK) {
743	case PDK_Skipped:
744	continue;
745
746	case PDK_Unknown:
747	// We don't know what this directive is; stop at the '#'.
748	break;
749	}
750	}
751
752	// We only end up here if we didn't recognize the preprocessor
753	// directive or it was one that can't occur in the preamble at this
754	// point. Roll back the current token to the location of the '#'.
755	TheTok = HashTok;
756	} else if (TheTok.isAtStartOfLine() &&
757	TheTok.getKind() == tok::raw_identifier &&
758	TheTok.getRawIdentifier() == "module" &&
759	LangOpts.CPlusPlusModules) {
760	// The initial global module fragment introducer "module;" is part of
761	// the preamble, which runs up to the module declaration "module foo;".
762	Token ModuleTok = TheTok;
763	do {
764	TheLexer.LexFromRawLexer(Result&: TheTok);
765	} while (TheTok.getKind() == tok::comment);
766	if (TheTok.getKind() != tok::semi) {
767	// Not global module fragment, roll back.
768	TheTok = ModuleTok;
769	break;
770	}
771	continue;
772	}
773
774	// We hit a token that we don't recognize as being in the
775	// "preprocessing only" part of the file, so we're no longer in
776	// the preamble.
777	break;
778	} while (true);
779
780	SourceLocation End;
781	if (ActiveCommentLoc.isValid())
782	End = ActiveCommentLoc; // don't truncate a decl comment.
783	else
784	End = TheTok.getLocation();
785
786	return PreambleBounds (End.getRawEncoding() - FileLoc.getRawEncoding(),
787	TheTok.isAtStartOfLine());
788	}
789
790	unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
791	const SourceManager &SM,
792	const LangOptions &LangOpts) {
793	// Figure out how many physical characters away the specified expansion
794	// character is. This needs to take into consideration newlines and
795	// trigraphs.
796	bool Invalid = false;
797	const char *TokPtr = SM.getCharacterData(SL: TokStart, Invalid: &Invalid);
798
799	// If they request the first char of the token, we're trivially done.
800	if (Invalid \|\| (CharNo == `0` && Lexer::isObviouslySimpleCharacter(C: *TokPtr)))
801	return `0`;
802
803	unsigned PhysOffset = `0`;
804
805	// The usual case is that tokens don't contain anything interesting. Skip
806	// over the uninteresting characters. If a token only consists of simple
807	// chars, this method is extremely fast.
808	while (Lexer::isObviouslySimpleCharacter(C: *TokPtr)) {
809	if (CharNo == `0`)
810	return PhysOffset;
811	++TokPtr;
812	--CharNo;
813	++PhysOffset;
814	}
815
816	// If we have a character that may be a trigraph or escaped newline, use a
817	// lexer to parse it correctly.
818	for (; CharNo; --CharNo) {
819	auto CharAndSize = Lexer::getCharAndSizeNoWarn(Ptr: TokPtr, LangOpts);
820	TokPtr += CharAndSize.Size;
821	PhysOffset += CharAndSize.Size;
822	}
823
824	// Final detail: if we end up on an escaped newline, we want to return the
825	// location of the actual byte of the token. For example foo\<newline>bar
826	// advanced by 3 should return the location of b, not of \\. One compounding
827	// detail of this is that the escape may be made by a trigraph.
828	if (!Lexer::isObviouslySimpleCharacter(C: *TokPtr))
829	PhysOffset += Lexer::SkipEscapedNewLines(P: TokPtr)-TokPtr;
830
831	return PhysOffset;
832	}
833
834	/// Computes the source location just past the end of the
835	/// token at this source location.
836	///
837	/// This routine can be used to produce a source location that
838	/// points just past the end of the token referenced by \p Loc, and
839	/// is generally used when a diagnostic needs to point just after a
840	/// token where it expected something different that it received. If
841	/// the returned source location would not be meaningful (e.g., if
842	/// it points into a macro), this routine returns an invalid
843	/// source location.
844	///
845	/// \param Offset an offset from the end of the token, where the source
846	/// location should refer to. The default offset (0) produces a source
847	/// location pointing just past the end of the token; an offset of 1 produces
848	/// a source location pointing to the last character in the token, etc.
849	SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
850	const SourceManager &SM,
851	const LangOptions &LangOpts) {
852	if (Loc.isInvalid())
853	return {};
854
855	if (Loc.isMacroID()) {
856	if (Offset > `0` \|\| !isAtEndOfMacroExpansion(loc: Loc, SM, LangOpts, MacroEnd: &Loc))
857	return {}; // Points inside the macro expansion.
858	}
859
860	unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
861	if (Len > Offset)
862	Len = Len - Offset;
863	else
864	return Loc;
865
866	return Loc.getLocWithOffset(Offset: Len);
867	}
868
869	/// Returns true if the given MacroID location points at the first
870	/// token of the macro expansion.
871	bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
872	const SourceManager &SM,
873	const LangOptions &LangOpts,
874	SourceLocation *MacroBegin) {
875	assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
876
877	SourceLocation expansionLoc;
878	if (!SM.isAtStartOfImmediateMacroExpansion(Loc: loc, MacroBegin: &expansionLoc))
879	return false;
880
881	if (expansionLoc.isFileID()) {
882	// No other macro expansions, this is the first.
883	if (MacroBegin)
884	*MacroBegin = expansionLoc;
885	return true;
886	}
887
888	return isAtStartOfMacroExpansion(loc: expansionLoc, SM, LangOpts, MacroBegin);
889	}
890
891	/// Returns true if the given MacroID location points at the last
892	/// token of the macro expansion.
893	bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
894	const SourceManager &SM,
895	const LangOptions &LangOpts,
896	SourceLocation *MacroEnd) {
897	assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
898
899	SourceLocation spellLoc = SM.getSpellingLoc(Loc: loc);
900	unsigned tokLen = MeasureTokenLength(Loc: spellLoc, SM, LangOpts);
901	if (tokLen == `0`)
902	return false;
903
904	SourceLocation afterLoc = loc.getLocWithOffset(Offset: tokLen);
905	SourceLocation expansionLoc;
906	if (!SM.isAtEndOfImmediateMacroExpansion(Loc: afterLoc, MacroEnd: &expansionLoc))
907	return false;
908
909	if (expansionLoc.isFileID()) {
910	// No other macro expansions.
911	if (MacroEnd)
912	*MacroEnd = expansionLoc;
913	return true;
914	}
915
916	return isAtEndOfMacroExpansion(loc: expansionLoc, SM, LangOpts, MacroEnd);
917	}
918
919	static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
920	const SourceManager &SM,
921	const LangOptions &LangOpts) {
922	SourceLocation Begin = Range.getBegin();
923	SourceLocation End = Range.getEnd();
924	assert(Begin.isFileID() && End.isFileID());
925	if (Range.isTokenRange()) {
926	End = Lexer::getLocForEndOfToken(Loc: End, Offset: `0`, SM,LangOpts);
927	if (End.isInvalid())
928	return {};
929	}
930
931	// Break down the source locations.
932	FileID FID;
933	unsigned BeginOffs;
934	std::tie(args&: FID, args&: BeginOffs) = SM.getDecomposedLoc(Loc: Begin);
935	if (FID.isInvalid())
936	return {};
937
938	unsigned EndOffs;
939	if (!SM.isInFileID(Loc: End, FID, RelativeOffset: &EndOffs) \|\|
940	BeginOffs > EndOffs)
941	return {};
942
943	return CharSourceRange::getCharRange(B: Begin, E: End);
944	}
945
946	// Assumes that `Loc` is in an expansion.
947	static bool isInExpansionTokenRange(const SourceLocation Loc,
948	const SourceManager &SM) {
949	return SM.getSLocEntry(FID: SM.getFileID(SpellingLoc: Loc))
950	.getExpansion()
951	.isExpansionTokenRange();
952	}
953
954	CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
955	const SourceManager &SM,
956	const LangOptions &LangOpts) {
957	SourceLocation Begin = Range.getBegin();
958	SourceLocation End = Range.getEnd();
959	if (Begin.isInvalid() \|\| End.isInvalid())
960	return {};
961
962	if (Begin.isFileID() && End.isFileID())
963	return makeRangeFromFileLocs(Range, SM, LangOpts);
964
965	if (Begin.isMacroID() && End.isFileID()) {
966	if (!isAtStartOfMacroExpansion(loc: Begin, SM, LangOpts, MacroBegin: &Begin))
967	return {};
968	Range.setBegin(Begin);
969	return makeRangeFromFileLocs(Range, SM, LangOpts);
970	}
971
972	if (Begin.isFileID() && End.isMacroID()) {
973	if (Range.isTokenRange()) {
974	if (!isAtEndOfMacroExpansion(loc: End, SM, LangOpts, MacroEnd: &End))
975	return {};
976	// Use the original* end, not the expanded one in `End`.*
977	Range.setTokenRange(isInExpansionTokenRange(Loc: Range.getEnd(), SM));
978	} else if (!isAtStartOfMacroExpansion(loc: End, SM, LangOpts, MacroBegin: &End))
979	return {};
980	Range.setEnd(End);
981	return makeRangeFromFileLocs(Range, SM, LangOpts);
982	}
983
984	assert(Begin.isMacroID() && End.isMacroID());
985	SourceLocation MacroBegin, MacroEnd;
986	if (isAtStartOfMacroExpansion(loc: Begin, SM, LangOpts, MacroBegin: &MacroBegin) &&
987	((Range.isTokenRange() && isAtEndOfMacroExpansion(loc: End, SM, LangOpts,
988	MacroEnd: &MacroEnd)) \|\|
989	(Range.isCharRange() && isAtStartOfMacroExpansion(loc: End, SM, LangOpts,
990	MacroBegin: &MacroEnd)))) {
991	Range.setBegin(MacroBegin);
992	Range.setEnd(MacroEnd);
993	// Use the original* `End`, not the expanded one in `MacroEnd`.*
994	if (Range.isTokenRange())
995	Range.setTokenRange(isInExpansionTokenRange(Loc: End, SM));
996	return makeRangeFromFileLocs(Range, SM, LangOpts);
997	}
998
999	bool Invalid = false;
1000	const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(FID: SM.getFileID(SpellingLoc: Begin),
1001	Invalid: &Invalid);
1002	if (Invalid)
1003	return {};
1004
1005	if (BeginEntry.getExpansion().isMacroArgExpansion()) {
1006	const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(FID: SM.getFileID(SpellingLoc: End),
1007	Invalid: &Invalid);
1008	if (Invalid)
1009	return {};
1010
1011	if (EndEntry.getExpansion().isMacroArgExpansion() &&
1012	BeginEntry.getExpansion().getExpansionLocStart() ==
1013	EndEntry.getExpansion().getExpansionLocStart()) {
1014	Range.setBegin(SM.getImmediateSpellingLoc(Loc: Begin));
1015	Range.setEnd(SM.getImmediateSpellingLoc(Loc: End));
1016	return makeFileCharRange(Range, SM, LangOpts);
1017	}
1018	}
1019
1020	return {};
1021	}
1022
1023	StringRef Lexer::getSourceText(CharSourceRange Range,
1024	const SourceManager &SM,
1025	const LangOptions &LangOpts,
1026	bool *Invalid) {
1027	Range = makeFileCharRange(Range, SM, LangOpts);
1028	if (Range.isInvalid()) {
1029	if (Invalid) Invalid = true*;
1030	return {};
1031	}
1032
1033	// Break down the source location.
1034	std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Loc: Range.getBegin());
1035	if (beginInfo.first.isInvalid()) {
1036	if (Invalid) Invalid = true*;
1037	return {};
1038	}
1039
1040	unsigned EndOffs;
1041	if (!SM.isInFileID(Loc: Range.getEnd(), FID: beginInfo.first, RelativeOffset: &EndOffs) \|\|
1042	beginInfo.second > EndOffs) {
1043	if (Invalid) Invalid = true*;
1044	return {};
1045	}
1046
1047	// Try to the load the file buffer.
1048	bool invalidTemp = false;
1049	StringRef file = SM.getBufferData(FID: beginInfo.first, Invalid: &invalidTemp);
1050	if (invalidTemp) {
1051	if (Invalid) Invalid = true*;
1052	return {};
1053	}
1054
1055	if (Invalid) Invalid = false*;
1056	return file.substr(Start: beginInfo.second, N: EndOffs - beginInfo.second);
1057	}
1058
1059	StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
1060	const SourceManager &SM,
1061	const LangOptions &LangOpts) {
1062	assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1063
1064	// Find the location of the immediate macro expansion.
1065	while (true) {
1066	FileID FID = SM.getFileID(SpellingLoc: Loc);
1067	const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1068	const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1069	Loc = Expansion.getExpansionLocStart();
1070	if (!Expansion.isMacroArgExpansion())
1071	break;
1072
1073	// For macro arguments we need to check that the argument did not come
1074	// from an inner macro, e.g: "MAC1( MAC2(foo) )"
1075
1076	// Loc points to the argument id of the macro definition, move to the
1077	// macro expansion.
1078	Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1079	SourceLocation SpellLoc = Expansion.getSpellingLoc();
1080	if (SpellLoc.isFileID())
1081	break; // No inner macro.
1082
1083	// If spelling location resides in the same FileID as macro expansion
1084	// location, it means there is no inner macro.
1085	FileID MacroFID = SM.getFileID(SpellingLoc: Loc);
1086	if (SM.isInFileID(Loc: SpellLoc, FID: MacroFID))
1087	break;
1088
1089	// Argument came from inner macro.
1090	Loc = SpellLoc;
1091	}
1092
1093	// Find the spelling location of the start of the non-argument expansion
1094	// range. This is where the macro name was spelled in order to begin
1095	// expanding this macro.
1096	Loc = SM.getSpellingLoc(Loc);
1097
1098	// Dig out the buffer where the macro name was spelled and the extents of the
1099	// name so that we can render it into the expansion note.
1100	std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1101	unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1102	StringRef ExpansionBuffer = SM.getBufferData(FID: ExpansionInfo.first);
1103	return ExpansionBuffer.substr(Start: ExpansionInfo.second, N: MacroTokenLength);
1104	}
1105
1106	StringRef Lexer::getImmediateMacroNameForDiagnostics(
1107	SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1108	assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1109	// Walk past macro argument expansions.
1110	while (SM.isMacroArgExpansion(Loc))
1111	Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1112
1113	// If the macro's spelling isn't FileID or from scratch space, then it's
1114	// actually a token paste or stringization (or similar) and not a macro at
1115	// all.
1116	SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1117	if (!SpellLoc.isFileID() \|\| SM.isWrittenInScratchSpace(Loc: SpellLoc))
1118	return {};
1119
1120	// Find the spelling location of the start of the non-argument expansion
1121	// range. This is where the macro name was spelled in order to begin
1122	// expanding this macro.
1123	Loc = SM.getSpellingLoc(Loc: SM.getImmediateExpansionRange(Loc).getBegin());
1124
1125	// Dig out the buffer where the macro name was spelled and the extents of the
1126	// name so that we can render it into the expansion note.
1127	std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1128	unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1129	StringRef ExpansionBuffer = SM.getBufferData(FID: ExpansionInfo.first);
1130	return ExpansionBuffer.substr(Start: ExpansionInfo.second, N: MacroTokenLength);
1131	}
1132
1133	bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
1134	return isAsciiIdentifierContinue(c, AllowDollar: LangOpts.DollarIdents);
1135	}
1136
1137	bool Lexer::isNewLineEscaped(const char BufferStart, const* char *Str) {
1138	assert(isVerticalWhitespace(Str[`0`]));
1139	if (Str - `1` < BufferStart)
1140	return false;
1141
1142	if ((Str[`0`] == `'\n'` && Str[-`1`] == `'\r'`) \|\|
1143	(Str[`0`] == `'\r'` && Str[-`1`] == `'\n'`)) {
1144	if (Str - `2` < BufferStart)
1145	return false;
1146	--Str;
1147	}
1148	--Str;
1149
1150	// Rewind to first non-space character:
1151	while (Str > BufferStart && isHorizontalWhitespace(c: *Str))
1152	--Str;
1153
1154	return *Str == `'\\'`;
1155	}
1156
1157	StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1158	const SourceManager &SM) {
1159	if (Loc.isInvalid() \|\| Loc.isMacroID())
1160	return {};
1161	std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1162	if (LocInfo.first.isInvalid())
1163	return {};
1164	bool Invalid = false;
1165	StringRef Buffer = SM.getBufferData(FID: LocInfo.first, Invalid: &Invalid);
1166	if (Invalid)
1167	return {};
1168	const char *Line = findBeginningOfLine(Buffer, Offset: LocInfo.second);
1169	if (!Line)
1170	return {};
1171	StringRef Rest = Buffer.substr(Start: Line - Buffer.data());
1172	size_t NumWhitespaceChars = Rest.find_first_not_of(Chars: " \t");
1173	return NumWhitespaceChars == StringRef::npos
1174	? ""
1175	: Rest.take_front(N: NumWhitespaceChars);
1176	}
1177
1178	//===----------------------------------------------------------------------===//
1179	// Diagnostics forwarding code.
1180	//===----------------------------------------------------------------------===//
1181
1182	/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1183	/// lexer buffer was all expanded at a single point, perform the mapping.
1184	/// This is currently only used for _Pragma implementation, so it is the slow
1185	/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1186	static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1187	Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1188	static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1189	SourceLocation FileLoc,
1190	unsigned CharNo, unsigned TokLen) {
1191	assert(FileLoc.isMacroID() && "Must be a macro expansion");
1192
1193	// Otherwise, we're lexing "mapped tokens". This is used for things like
1194	// _Pragma handling. Combine the expansion location of FileLoc with the
1195	// spelling location.
1196	SourceManager &SM = PP.getSourceManager();
1197
1198	// Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1199	// characters come from spelling(FileLoc)+Offset.
1200	SourceLocation SpellingLoc = SM.getSpellingLoc(Loc: FileLoc);
1201	SpellingLoc = SpellingLoc.getLocWithOffset(Offset: CharNo);
1202
1203	// Figure out the expansion loc range, which is the range covered by the
1204	// original _Pragma(...) sequence.
1205	CharSourceRange II = SM.getImmediateExpansionRange(Loc: FileLoc);
1206
1207	return SM.createExpansionLoc(SpellingLoc, ExpansionLocStart: II.getBegin(), ExpansionLocEnd: II.getEnd(), Length: TokLen);
1208	}
1209
1210	/// getSourceLocation - Return a source location identifier for the specified
1211	/// offset in the current file.
1212	SourceLocation Lexer::getSourceLocation(const char *Loc,
1213	unsigned TokLen) const {
1214	assert(Loc >= BufferStart && Loc <= BufferEnd &&
1215	"Location out of range for this buffer!");
1216
1217	// In the normal case, we're just lexing from a simple file buffer, return
1218	// the file id from FileLoc with the offset specified.
1219	unsigned CharNo = Loc-BufferStart;
1220	if (FileLoc.isFileID())
1221	return FileLoc.getLocWithOffset(Offset: CharNo);
1222
1223	// Otherwise, this is the _Pragma lexer case, which pretends that all of the
1224	// tokens are lexed from where the _Pragma was defined.
1225	assert(PP && "This doesn't work on raw lexers");
1226	return GetMappedTokenLoc(PP&: *PP, FileLoc, CharNo, TokLen);
1227	}
1228
1229	/// Diag - Forwarding function for diagnostics. This translate a source
1230	/// position in the current buffer into a SourceLocation object for rendering.
1231	DiagnosticBuilder Lexer::Diag(const char Loc, unsigned* DiagID) const {
1232	return PP->Diag(Loc: getSourceLocation(Loc), DiagID);
1233	}
1234
1235	//===----------------------------------------------------------------------===//
1236	// Trigraph and Escaped Newline Handling Code.
1237	//===----------------------------------------------------------------------===//
1238
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
///
/// The nine mappings are: ??= #, ??) ], ??( [, ??! |, ??' ^, ??> }, ??/ \,
/// ??< { and ??- ~.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}
1255
1256	/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1257	/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1258	/// return the result character. Finally, emit a warning about trigraph use
1259	/// whether trigraphs are enabled or not.
1260	static char DecodeTrigraphChar(const char CP, Lexer L, bool Trigraphs) {
1261	char Res = GetTrigraphCharForLetter(Letter: *CP);
1262	if (!Res)
1263	return Res;
1264
1265	if (!Trigraphs) {
1266	if (L && !L->isLexingRawMode())
1267	L->Diag(Loc: CP-`2`, diag::DiagID: trigraph_ignored);
1268	return `0`;
1269	}
1270
1271	if (L && !L->isLexingRawMode())
1272	L->Diag(Loc: CP-`2`, diag::DiagID: trigraph_converted) << StringRef(&Res, `1`);
1273	return Res;
1274	}
1275
1276	/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1277	/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1278	/// trigraph equivalent on entry to this function.
1279	unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1280	unsigned Size = `0`;
1281	while (isWhitespace(c: Ptr[Size])) {
1282	++Size;
1283
1284	if (Ptr[Size-`1`] != `'\n'` && Ptr[Size-`1`] != `'\r'`)
1285	continue;
1286
1287	// If this is a \r\n or \n\r, skip the other half.
1288	if ((Ptr[Size] == `'\r'` \|\| Ptr[Size] == `'\n'`) &&
1289	Ptr[Size-`1`] != Ptr[Size])
1290	++Size;
1291
1292	return Size;
1293	}
1294
1295	// Not an escaped newline, must be a \t or something else.
1296	return `0`;
1297	}
1298
1299	/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1300	/// them), skip over them and return the first non-escaped-newline found,
1301	/// otherwise return P.
1302	const char Lexer::SkipEscapedNewLines(const* char *P) {
1303	while (true) {
1304	const char *AfterEscape;
1305	if (*P == `'\\'`) {
1306	AfterEscape = P+`1`;
1307	} else if (*P == `'?'`) {
1308	// If not a trigraph for escape, bail out.
1309	if (P[`1`] != `'?'` \|\| P[`2`] != `'/'`)
1310	return P;
1311	// FIXME: Take LangOpts into account; the language might not
1312	// support trigraphs.
1313	AfterEscape = P+`3`;
1314	} else {
1315	return P;
1316	}
1317
1318	unsigned NewLineSize = Lexer::getEscapedNewLineSize(Ptr: AfterEscape);
1319	if (NewLineSize == `0`) return P;
1320	P = AfterEscape+NewLineSize;
1321	}
1322	}
1323
1324	std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
1325	const SourceManager &SM,
1326	const LangOptions &LangOpts,
1327	bool IncludeComments) {
1328	if (Loc.isMacroID()) {
1329	if (!Lexer::isAtEndOfMacroExpansion(loc: Loc, SM, LangOpts, MacroEnd: &Loc))
1330	return std::nullopt;
1331	}
1332	Loc = Lexer::getLocForEndOfToken(Loc, Offset: `0`, SM, LangOpts);
1333
1334	// Break down the source location.
1335	std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1336
1337	// Try to load the file buffer.
1338	bool InvalidTemp = false;
1339	StringRef File = SM.getBufferData(FID: LocInfo.first, Invalid: &InvalidTemp);
1340	if (InvalidTemp)
1341	return std::nullopt;
1342
1343	const char *TokenBegin = File.data() + LocInfo.second;
1344
1345	// Lex from the start of the given location.
1346	Lexer lexer(SM.getLocForStartOfFile(FID: LocInfo.first), LangOpts, File.begin(),
1347	TokenBegin, File.end());
1348	lexer.SetCommentRetentionState(IncludeComments);
1349	// Find the token.
1350	Token Tok;
1351	lexer.LexFromRawLexer(Result&: Tok);
1352	return Tok;
1353	}
1354
1355	std::optional<Token> Lexer::findPreviousToken(SourceLocation Loc,
1356	const SourceManager &SM,
1357	const LangOptions &LangOpts,
1358	bool IncludeComments) {
1359	const auto StartOfFile = SM.getLocForStartOfFile(FID: SM.getFileID(SpellingLoc: Loc));
1360	while (Loc != StartOfFile) {
1361	Loc = Loc.getLocWithOffset(Offset: -`1`);
1362	if (Loc.isInvalid())
1363	return std::nullopt;
1364
1365	Loc = GetBeginningOfToken(Loc, SM, LangOpts);
1366	Token Tok;
1367	if (getRawToken(Loc, Result&: Tok, SM, LangOpts))
1368	continue; // Not a token, go to prev location.
1369	if (!Tok.is(K: tok::comment) \|\| IncludeComments) {
1370	return Tok;
1371	}
1372	}
1373	return std::nullopt;
1374	}
1375
1376	/// Checks that the given token is the first token that occurs after the
1377	/// given location (this excludes comments and whitespace). Returns the location
1378	/// immediately after the specified token. If the token is not found or the
1379	/// location is inside a macro, the returned source location will be invalid.
1380	SourceLocation Lexer::findLocationAfterToken(
1381	SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1382	const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1383	std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1384	if (!Tok \|\| Tok ->isNot(K: TKind))
1385	return {};
1386	SourceLocation TokenLoc = Tok ->getLocation();
1387
1388	// Calculate how much whitespace needs to be skipped if any.
1389	unsigned NumWhitespaceChars = `0`;
1390	if (SkipTrailingWhitespaceAndNewLine) {
1391	const char *TokenEnd = SM.getCharacterData(SL: TokenLoc) + Tok ->getLength();
1392	unsigned char C = *TokenEnd;
1393	while (isHorizontalWhitespace(c: C)) {
1394	C = *(++TokenEnd);
1395	NumWhitespaceChars++;
1396	}
1397
1398	// Skip \r, \n, \r\n, or \n\r
1399	if (C == `'\n'` \|\| C == `'\r'`) {
1400	char PrevC = C;
1401	C = *(++TokenEnd);
1402	NumWhitespaceChars++;
1403	if ((C == `'\n'` \|\| C == `'\r'`) && C != PrevC)
1404	NumWhitespaceChars++;
1405	}
1406	}
1407
1408	return TokenLoc.getLocWithOffset(Offset: Tok ->getLength() + NumWhitespaceChars);
1409	}
1410
1411	/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1412	/// get its size, and return it. This is tricky in several cases:
1413	/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1414	/// then either return the trigraph (skipping 3 chars) or the '?',
1415	/// depending on whether trigraphs are enabled or not.
1416	/// 2. If this is an escaped newline (potentially with whitespace between
1417	/// the backslash and newline), implicitly skip the newline and return
1418	/// the char after it.
1419	///
1420	/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1421	/// know that we can accumulate into Size, and that we have already incremented
1422	/// Ptr by Size bytes.
1423	///
1424	/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1425	/// be updated to match.
1426	Lexer::SizedChar Lexer::getCharAndSizeSlow(const char Ptr, Token Tok) {
1427	unsigned Size = `0`;
1428	// If we have a slash, look for an escaped newline.
1429	if (Ptr[`0`] == `'\\'`) {
1430	++Size;
1431	++Ptr;
1432	Slash:
1433	// Common case, backslash-char where the char is not whitespace.
1434	if (!isWhitespace(c: Ptr[`0`]))
1435	return {.Char: `'\\'`, .Size: Size};
1436
1437	// See if we have optional whitespace characters between the slash and
1438	// newline.
1439	if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1440	// Remember that this token needs to be cleaned.
1441	if (Tok) Tok->setFlag(Token::NeedsCleaning);
1442
1443	// Warn if there was whitespace between the backslash and newline.
1444	if (Ptr[`0`] != `'\n'` && Ptr[`0`] != `'\r'` && Tok && !isLexingRawMode())
1445	Diag(Loc: Ptr, diag::DiagID: backslash_newline_space);
1446
1447	// Found backslash<whitespace><newline>. Parse the char after it.
1448	Size += EscapedNewLineSize;
1449	Ptr += EscapedNewLineSize;
1450
1451	// Use slow version to accumulate a correct size field.
1452	auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
1453	CharAndSize.Size += Size;
1454	return CharAndSize;
1455	}
1456
1457	// Otherwise, this is not an escaped newline, just return the slash.
1458	return {.Char: `'\\'`, .Size: Size};
1459	}
1460
1461	// If this is a trigraph, process it.
1462	if (Ptr[`0`] == `'?'` && Ptr[`1`] == `'?'`) {
1463	// If this is actually a legal trigraph (not something like "??x"), emit
1464	// a trigraph warning. If so, and if trigraphs are enabled, return it.
1465	if (char C = DecodeTrigraphChar(CP: Ptr + `2`, L: Tok ? this : nullptr,
1466	Trigraphs: LangOpts.Trigraphs)) {
1467	// Remember that this token needs to be cleaned.
1468	if (Tok) Tok->setFlag(Token::NeedsCleaning);
1469
1470	Ptr += `3`;
1471	Size += `3`;
1472	if (C == `'\\'`) goto Slash;
1473	return {.Char: C, .Size: Size};
1474	}
1475	}
1476
1477	// If this is neither, return a single character.
1478	return {.Char: *Ptr, .Size: Size + `1u`};
1479	}
1480
1481	/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1482	/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1483	/// and that we have already incremented Ptr by Size bytes.
1484	///
1485	/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1486	/// be updated to match.
1487	Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1488	const LangOptions &LangOpts) {
1489
1490	unsigned Size = `0`;
1491	// If we have a slash, look for an escaped newline.
1492	if (Ptr[`0`] == `'\\'`) {
1493	++Size;
1494	++Ptr;
1495	Slash:
1496	// Common case, backslash-char where the char is not whitespace.
1497	if (!isWhitespace(c: Ptr[`0`]))
1498	return {.Char: `'\\'`, .Size: Size};
1499
1500	// See if we have optional whitespace characters followed by a newline.
1501	if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1502	// Found backslash<whitespace><newline>. Parse the char after it.
1503	Size += EscapedNewLineSize;
1504	Ptr += EscapedNewLineSize;
1505
1506	// Use slow version to accumulate a correct size field.
1507	auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
1508	CharAndSize.Size += Size;
1509	return CharAndSize;
1510	}
1511
1512	// Otherwise, this is not an escaped newline, just return the slash.
1513	return {.Char: `'\\'`, .Size: Size};
1514	}
1515
1516	// If this is a trigraph, process it.
1517	if (LangOpts.Trigraphs && Ptr[`0`] == `'?'` && Ptr[`1`] == `'?'`) {
1518	// If this is actually a legal trigraph (not something like "??x"), return
1519	// it.
1520	if (char C = GetTrigraphCharForLetter(Letter: Ptr[`2`])) {
1521	Ptr += `3`;
1522	Size += `3`;
1523	if (C == `'\\'`) goto Slash;
1524	return {.Char: C, .Size: Size};
1525	}
1526	}
1527
1528	// If this is neither, return a single character.
1529	return {.Char: *Ptr, .Size: Size + `1u`};
1530	}
1531
1532	//===----------------------------------------------------------------------===//
1533	// Helper methods for lexing.
1534	//===----------------------------------------------------------------------===//
1535
1536	/// Routine that indiscriminately sets the offset into the source file.
1537	void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1538	BufferPtr = BufferStart + Offset;
1539	if (BufferPtr > BufferEnd)
1540	BufferPtr = BufferEnd;
1541	// FIXME: What exactly does the StartOfLine bit mean? There are two
1542	// possible meanings for the "start" of the line: the first token on the
1543	// unexpanded line, or the first token on the expanded line.
1544	IsAtStartOfLine = StartOfLine;
1545	IsAtPhysicalStartOfLine = StartOfLine;
1546	}
1547
1548	static bool isUnicodeWhitespace(uint32_t Codepoint) {
1549	static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1550	UnicodeWhitespaceCharRanges);
1551	return UnicodeWhitespaceChars.contains(C: Codepoint);
1552	}
1553
1554	static llvm::SmallString<`5`> codepointAsHexString(uint32_t C) {
1555	llvm::SmallString<`5`> CharBuf;
1556	llvm::raw_svector_ostream CharOS(CharBuf);
1557	llvm::write_hex(S&: CharOS, N: C, Style: llvm::HexPrintStyle::Upper, Width: `4`);
1558	return CharBuf;
1559	}
1560
1561	// To mitigate https://github.com/llvm/llvm-project/issues/54732,
1562	// we allow "Mathematical Notation Characters" in identifiers.
1563	// This is a proposed profile that extends the XID_Start/XID_continue
1564	// with mathematical symbols, superscipts and subscripts digits
1565	// found in some production software.
1566	// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1567	static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1568	bool IsStart, bool &IsExtension) {
1569	static const llvm::sys::UnicodeCharSet MathStartChars(
1570	MathematicalNotationProfileIDStartRanges);
1571	static const llvm::sys::UnicodeCharSet MathContinueChars(
1572	MathematicalNotationProfileIDContinueRanges);
1573	if (MathStartChars.contains(C) \|\|
1574	(!IsStart && MathContinueChars.contains(C))) {
1575	IsExtension = true;
1576	return true;
1577	}
1578	return false;
1579	}
1580
1581	static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1582	bool &IsExtension) {
1583	if (LangOpts.AsmPreprocessor) {
1584	return false;
1585	} else if (LangOpts.DollarIdents && `'$'` == C) {
1586	return true;
1587	} else if (LangOpts.CPlusPlus \|\| LangOpts.C23) {
1588	// A non-leading codepoint must have the XID_Continue property.
1589	// XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1590	// so we need to check both tables.
1591	// '_' doesn't have the XID_Continue property but is allowed in C and C++.
1592	static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1593	static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1594	if (C == `'_'` \|\| XIDStartChars.contains(C) \|\| XIDContinueChars.contains(C))
1595	return true;
1596	return isMathematicalExtensionID(C, LangOpts, /IsStart=/false,
1597	IsExtension);
1598	} else if (LangOpts.C11) {
1599	static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1600	C11AllowedIDCharRanges);
1601	return C11AllowedIDChars.contains(C);
1602	} else {
1603	static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1604	C99AllowedIDCharRanges);
1605	return C99AllowedIDChars.contains(C);
1606	}
1607	}
1608
1609	static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1610	bool &IsExtension) {
1611	assert(C > `0x7F` && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1612	IsExtension = false;
1613	if (LangOpts.AsmPreprocessor) {
1614	return false;
1615	}
1616	if (LangOpts.CPlusPlus \|\| LangOpts.C23) {
1617	static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1618	if (XIDStartChars.contains(C))
1619	return true;
1620	return isMathematicalExtensionID(C, LangOpts, /IsStart=/true,
1621	IsExtension);
1622	}
1623	if (!isAllowedIDChar(C, LangOpts, IsExtension))
1624	return false;
1625	if (LangOpts.C11) {
1626	static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1627	C11DisallowedInitialIDCharRanges);
1628	return !C11DisallowedInitialIDChars.contains(C);
1629	}
1630	static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1631	C99DisallowedInitialIDCharRanges);
1632	return !C99DisallowedInitialIDChars.contains(C);
1633	}
1634
1635	static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
1636	CharSourceRange Range) {
1637
1638	static const llvm::sys::UnicodeCharSet MathStartChars(
1639	MathematicalNotationProfileIDStartRanges);
1640	static const llvm::sys::UnicodeCharSet MathContinueChars(
1641	MathematicalNotationProfileIDContinueRanges);
1642
1643	(void)MathStartChars;
1644	(void)MathContinueChars;
1645	assert((MathStartChars.contains(C) \|\| MathContinueChars.contains(C)) &&
1646	"Unexpected mathematical notation codepoint");
1647	Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1648	<< codepointAsHexString(C) << Range;
1649	}
1650
1651	static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1652	const char *End) {
1653	return CharSourceRange::getCharRange(B: L.getSourceLocation(Loc: Begin),
1654	E: L.getSourceLocation(Loc: End));
1655	}
1656
1657	static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1658	CharSourceRange Range, bool IsFirst) {
1659	// Check C99 compatibility.
1660	if (!Diags.isIgnored(diag::DiagID: warn_c99_compat_unicode_id, Loc: Range.getBegin())) {
1661	enum {
1662	CannotAppearInIdentifier = `0`,
1663	CannotStartIdentifier
1664	};
1665
1666	static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1667	C99AllowedIDCharRanges);
1668	static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1669	C99DisallowedInitialIDCharRanges);
1670	if (!C99AllowedIDChars.contains(C)) {
1671	Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1672	<< Range
1673	<< CannotAppearInIdentifier;
1674	} else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1675	Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1676	<< Range
1677	<< CannotStartIdentifier;
1678	}
1679	}
1680	}
1681
1682	/// After encountering UTF-8 character C and interpreting it as an identifier
1683	/// character, check whether it's a homoglyph for a common non-identifier
1684	/// source character that is unlikely to be an intentional identifier
1685	/// character and warn if so.
1686	static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1687	CharSourceRange Range) {
1688	// FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1689	struct HomoglyphPair {
1690	uint32_t Character;
1691	char LooksLike;
1692	bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1693	};
1694	static constexpr HomoglyphPair SortedHomoglyphs[] = {
1695	{.Character: U`'\u00ad'`, .LooksLike: `0`}, // SOFT HYPHEN
1696	{.Character: U`'\u01c3'`, .LooksLike: `'!'`}, // LATIN LETTER RETROFLEX CLICK
1697	{.Character: U`'\u037e'`, .LooksLike: `';'`}, // GREEK QUESTION MARK
1698	{.Character: U`'\u200b'`, .LooksLike: `0`}, // ZERO WIDTH SPACE
1699	{.Character: U`'\u200c'`, .LooksLike: `0`}, // ZERO WIDTH NON-JOINER
1700	{.Character: U`'\u200d'`, .LooksLike: `0`}, // ZERO WIDTH JOINER
1701	{.Character: U`'\u2060'`, .LooksLike: `0`}, // WORD JOINER
1702	{.Character: U`'\u2061'`, .LooksLike: `0`}, // FUNCTION APPLICATION
1703	{.Character: U`'\u2062'`, .LooksLike: `0`}, // INVISIBLE TIMES
1704	{.Character: U`'\u2063'`, .LooksLike: `0`}, // INVISIBLE SEPARATOR
1705	{.Character: U`'\u2064'`, .LooksLike: `0`}, // INVISIBLE PLUS
1706	{.Character: U`'\u2212'`, .LooksLike: `'-'`}, // MINUS SIGN
1707	{.Character: U`'\u2215'`, .LooksLike: `'/'`}, // DIVISION SLASH
1708	{.Character: U`'\u2216'`, .LooksLike: `'\\'`}, // SET MINUS
1709	{.Character: U`'\u2217'`, .LooksLike: `''`}, // ASTERISK OPERATOR*
1710	{.Character: U`'\u2223'`, .LooksLike: `'\|'`}, // DIVIDES
1711	{.Character: U`'\u2227'`, .LooksLike: `'^'`}, // LOGICAL AND
1712	{.Character: U`'\u2236'`, .LooksLike: `':'`}, // RATIO
1713	{.Character: U`'\u223c'`, .LooksLike: `'~'`}, // TILDE OPERATOR
1714	{.Character: U`'\ua789'`, .LooksLike: `':'`}, // MODIFIER LETTER COLON
1715	{.Character: U`'\ufeff'`, .LooksLike: `0`}, // ZERO WIDTH NO-BREAK SPACE
1716	{.Character: U`'\uff01'`, .LooksLike: `'!'`}, // FULLWIDTH EXCLAMATION MARK
1717	{.Character: U`'\uff03'`, .LooksLike: `'#'`}, // FULLWIDTH NUMBER SIGN
1718	{.Character: U`'\uff04'`, .LooksLike: `'$'`}, // FULLWIDTH DOLLAR SIGN
1719	{.Character: U`'\uff05'`, .LooksLike: `'%'`}, // FULLWIDTH PERCENT SIGN
1720	{.Character: U`'\uff06'`, .LooksLike: `'&'`}, // FULLWIDTH AMPERSAND
1721	{.Character: U`'\uff08'`, .LooksLike: `'('`}, // FULLWIDTH LEFT PARENTHESIS
1722	{.Character: U`'\uff09'`, .LooksLike: `')'`}, // FULLWIDTH RIGHT PARENTHESIS
1723	{.Character: U`'\uff0a'`, .LooksLike: `''`}, // FULLWIDTH ASTERISK*
1724	{.Character: U`'\uff0b'`, .LooksLike: `'+'`}, // FULLWIDTH ASTERISK
1725	{.Character: U`'\uff0c'`, .LooksLike: `','`}, // FULLWIDTH COMMA
1726	{.Character: U`'\uff0d'`, .LooksLike: `'-'`}, // FULLWIDTH HYPHEN-MINUS
1727	{.Character: U`'\uff0e'`, .LooksLike: `'.'`}, // FULLWIDTH FULL STOP
1728	{.Character: U`'\uff0f'`, .LooksLike: `'/'`}, // FULLWIDTH SOLIDUS
1729	{.Character: U`'\uff1a'`, .LooksLike: `':'`}, // FULLWIDTH COLON
1730	{.Character: U`'\uff1b'`, .LooksLike: `';'`}, // FULLWIDTH SEMICOLON
1731	{.Character: U`'\uff1c'`, .LooksLike: `'<'`}, // FULLWIDTH LESS-THAN SIGN
1732	{.Character: U`'\uff1d'`, .LooksLike: `'='`}, // FULLWIDTH EQUALS SIGN
1733	{.Character: U`'\uff1e'`, .LooksLike: `'>'`}, // FULLWIDTH GREATER-THAN SIGN
1734	{.Character: U`'\uff1f'`, .LooksLike: `'?'`}, // FULLWIDTH QUESTION MARK
1735	{.Character: U`'\uff20'`, .LooksLike: `'@'`}, // FULLWIDTH COMMERCIAL AT
1736	{.Character: U`'\uff3b'`, .LooksLike: `'['`}, // FULLWIDTH LEFT SQUARE BRACKET
1737	{.Character: U`'\uff3c'`, .LooksLike: `'\\'`}, // FULLWIDTH REVERSE SOLIDUS
1738	{.Character: U`'\uff3d'`, .LooksLike: `']'`}, // FULLWIDTH RIGHT SQUARE BRACKET
1739	{.Character: U`'\uff3e'`, .LooksLike: `'^'`}, // FULLWIDTH CIRCUMFLEX ACCENT
1740	{.Character: U`'\uff5b'`, .LooksLike: `'{'`}, // FULLWIDTH LEFT CURLY BRACKET
1741	{.Character: U`'\uff5c'`, .LooksLike: `'\|'`}, // FULLWIDTH VERTICAL LINE
1742	{.Character: U`'\uff5d'`, .LooksLike: `'}'`}, // FULLWIDTH RIGHT CURLY BRACKET
1743	{.Character: U`'\uff5e'`, .LooksLike: `'~'`}, // FULLWIDTH TILDE
1744	{.Character: `0`, .LooksLike: `0`}
1745	};
1746	auto Homoglyph =
1747	std::lower_bound(first: std::begin(arr: SortedHomoglyphs),
1748	last: std::end(arr: SortedHomoglyphs) - `1`, val: HomoglyphPair{.Character: C, .LooksLike: `'\0'`});
1749	if (Homoglyph->Character == C) {
1750	if (Homoglyph->LooksLike) {
1751	const char LooksLikeStr[] = {Homoglyph->LooksLike, `0`};
1752	Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1753	<< Range << codepointAsHexString(C) << LooksLikeStr;
1754	} else {
1755	Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1756	<< Range << codepointAsHexString(C);
1757	}
1758	}
1759	}
1760
1761	static void diagnoseInvalidUnicodeCodepointInIdentifier(
1762	DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1763	CharSourceRange Range, bool IsFirst) {
1764	if (isASCII(c: CodePoint))
1765	return;
1766
1767	bool IsExtension;
1768	bool IsIDStart = isAllowedInitiallyIDChar(C: CodePoint, LangOpts, IsExtension);
1769	bool IsIDContinue =
1770	IsIDStart \|\| isAllowedIDChar(C: CodePoint, LangOpts, IsExtension);
1771
1772	if ((IsFirst && IsIDStart) \|\| (!IsFirst && IsIDContinue))
1773	return;
1774
1775	bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1776
1777	if (!IsFirst \|\| InvalidOnlyAtStart) {
1778	Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1779	<< Range << codepointAsHexString(C: CodePoint) << int(InvalidOnlyAtStart)
1780	<< FixItHint::CreateRemoval(RemoveRange: Range);
1781	} else {
1782	Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1783	<< Range << codepointAsHexString(C: CodePoint)
1784	<< FixItHint::CreateRemoval(RemoveRange: Range);
1785	}
1786	}
1787
1788	bool Lexer::tryConsumeIdentifierUCN(const char &CurPtr, unsigned* Size,
1789	Token &Result) {
1790	const char *UCNPtr = CurPtr + Size;
1791	uint32_t CodePoint = tryReadUCN(StartPtr&: UCNPtr, SlashLoc: CurPtr, /Token=/Result: nullptr);
1792	if (CodePoint == `0`) {
1793	return false;
1794	}
1795	bool IsExtension = false;
1796	if (!isAllowedIDChar(C: CodePoint, LangOpts, IsExtension)) {
1797	if (isASCII(c: CodePoint) \|\| isUnicodeWhitespace(Codepoint: CodePoint))
1798	return false;
1799	if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1800	!PP->isPreprocessedOutput())
1801	diagnoseInvalidUnicodeCodepointInIdentifier(
1802	Diags&: PP->getDiagnostics(), LangOpts, CodePoint,
1803	Range: makeCharRange(L&: *this, Begin: CurPtr, End: UCNPtr),
1804	/IsFirst=/false);
1805
1806	// We got a unicode codepoint that is neither a space nor a
1807	// a valid identifier part.
1808	// Carry on as if the codepoint was valid for recovery purposes.
1809	} else if (!isLexingRawMode()) {
1810	if (IsExtension)
1811	diagnoseExtensionInIdentifier(Diags&: PP->getDiagnostics(), C: CodePoint,
1812	Range: makeCharRange(L&: *this, Begin: CurPtr, End: UCNPtr));
1813
1814	maybeDiagnoseIDCharCompat(Diags&: PP->getDiagnostics(), C: CodePoint,
1815	Range: makeCharRange(L&: *this, Begin: CurPtr, End: UCNPtr),
1816	/IsFirst=/false);
1817	}
1818
1819	Result.setFlag(Token::HasUCN);
1820	if ((UCNPtr - CurPtr == `6` && CurPtr[`1`] == `'u'`) \|\|
1821	(UCNPtr - CurPtr == `10` && CurPtr[`1`] == `'U'`))
1822	CurPtr = UCNPtr;
1823	else
1824	while (CurPtr != UCNPtr)
1825	(void)getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
1826	return true;
1827	}
1828
1829	bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
1830	llvm::UTF32 CodePoint;
1831
1832	// If a UTF-8 codepoint appears immediately after an escaped new line,
1833	// CurPtr may point to the splicing \ on the preceding line,
1834	// so we need to skip it.
1835	unsigned FirstCodeUnitSize;
1836	getCharAndSize(Ptr: CurPtr, Size&: FirstCodeUnitSize);
1837	const char *CharStart = CurPtr + FirstCodeUnitSize - `1`;
1838	const char *UnicodePtr = CharStart;
1839
1840	llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
1841	source: (const llvm::UTF8 *)&UnicodePtr, sourceEnd: (const* llvm::UTF8 *)BufferEnd,
1842	target: &CodePoint, flags: llvm::strictConversion);
1843	if (ConvResult != llvm::conversionOK)
1844	return false;
1845
1846	bool IsExtension = false;
1847	if (!isAllowedIDChar(C: static_cast<uint32_t>(CodePoint), LangOpts,
1848	IsExtension)) {
1849	if (isASCII(c: CodePoint) \|\| isUnicodeWhitespace(Codepoint: CodePoint))
1850	return false;
1851
1852	if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1853	!PP->isPreprocessedOutput())
1854	diagnoseInvalidUnicodeCodepointInIdentifier(
1855	Diags&: PP->getDiagnostics(), LangOpts, CodePoint,
1856	Range: makeCharRange(L&: *this, Begin: CharStart, End: UnicodePtr), /IsFirst=/false);
1857	// We got a unicode codepoint that is neither a space nor a
1858	// a valid identifier part. Carry on as if the codepoint was
1859	// valid for recovery purposes.
1860	} else if (!isLexingRawMode()) {
1861	if (IsExtension)
1862	diagnoseExtensionInIdentifier(
1863	Diags&: PP->getDiagnostics(), C: CodePoint,
1864	Range: makeCharRange(L&: *this, Begin: CharStart, End: UnicodePtr));
1865	maybeDiagnoseIDCharCompat(Diags&: PP->getDiagnostics(), C: CodePoint,
1866	Range: makeCharRange(L&: *this, Begin: CharStart, End: UnicodePtr),
1867	/IsFirst=/false);
1868	maybeDiagnoseUTF8Homoglyph(Diags&: PP->getDiagnostics(), C: CodePoint,
1869	Range: makeCharRange(L&: *this, Begin: CharStart, End: UnicodePtr));
1870	}
1871
1872	// Once we sucessfully parsed some UTF-8,
1873	// calling ConsumeChar ensures the NeedsCleaning flag is set on the token
1874	// being lexed, and that warnings about trailing spaces are emitted.
1875	ConsumeChar(Ptr: CurPtr, Size: FirstCodeUnitSize, Tok&: Result);
1876	CurPtr = UnicodePtr;
1877	return true;
1878	}
1879
1880	bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1881	const char *CurPtr) {
1882	bool IsExtension = false;
1883	if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
1884	if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1885	!PP->isPreprocessedOutput()) {
1886	if (IsExtension)
1887	diagnoseExtensionInIdentifier(Diags&: PP->getDiagnostics(), C,
1888	Range: makeCharRange(L&: *this, Begin: BufferPtr, End: CurPtr));
1889	maybeDiagnoseIDCharCompat(Diags&: PP->getDiagnostics(), C,
1890	Range: makeCharRange(L&: *this, Begin: BufferPtr, End: CurPtr),
1891	/IsFirst=/true);
1892	maybeDiagnoseUTF8Homoglyph(Diags&: PP->getDiagnostics(), C,
1893	Range: makeCharRange(L&: *this, Begin: BufferPtr, End: CurPtr));
1894	}
1895
1896	MIOpt.ReadToken();
1897	return LexIdentifierContinue(Result, CurPtr);
1898	}
1899
1900	if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1901	!PP->isPreprocessedOutput() && !isASCII(c: *BufferPtr) &&
1902	!isUnicodeWhitespace(Codepoint: C)) {
1903	// Non-ASCII characters tend to creep into source code unintentionally.
1904	// Instead of letting the parser complain about the unknown token,
1905	// just drop the character.
1906	// Note that we can /only/ do this when the non-ASCII character is actually
1907	// spelled as Unicode, not written as a UCN. The standard requires that
1908	// we not throw away any possible preprocessor tokens, but there's a
1909	// loophole in the mapping of Unicode characters to basic character set
1910	// characters that allows us to map these particular characters to, say,
1911	// whitespace.
1912	diagnoseInvalidUnicodeCodepointInIdentifier(
1913	Diags&: PP->getDiagnostics(), LangOpts, CodePoint: C,
1914	Range: makeCharRange(L&: *this, Begin: BufferPtr, End: CurPtr), /IsStart/ IsFirst: true);
1915	BufferPtr = CurPtr;
1916	return false;
1917	}
1918
1919	// Otherwise, we have an explicit UCN or a character that's unlikely to show
1920	// up by accident.
1921	MIOpt.ReadToken();
1922	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
1923	return true;
1924	}
1925
1926	static const char *
1927	fastParseASCIIIdentifier(const char *CurPtr,
1928	[[maybe_unused]] const char *BufferEnd) {
1929	#ifdef __SSE4_2__
1930	alignas(`16`) static constexpr char AsciiIdentifierRange[`16`] = {
1931	`'_'`, `'_'`, `'A'`, `'Z'`, `'a'`, `'z'`, `'0'`, `'9'`,
1932	};
1933	constexpr ssize_t BytesPerRegister = `16`;
1934
1935	__m128i AsciiIdentifierRangeV =
1936	_mm_load_si128((const __m128i *)AsciiIdentifierRange);
1937
1938	while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
1939	__m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
1940
1941	int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
1942	_SIDD_LEAST_SIGNIFICANT \| _SIDD_CMP_RANGES \|
1943	_SIDD_UBYTE_OPS \| _SIDD_NEGATIVE_POLARITY);
1944	CurPtr += Consumed;
1945	if (Consumed == BytesPerRegister)
1946	continue;
1947	return CurPtr;
1948	}
1949	#endif
1950
1951	unsigned char C = *CurPtr;
1952	while (isAsciiIdentifierContinue(c: C))
1953	C = *++CurPtr;
1954	return CurPtr;
1955	}
1956
1957	bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1958	// Match [_A-Za-z0-9], we have already matched an identifier start.*
1959
1960	while (true) {
1961
1962	CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);
1963
1964	unsigned Size;
1965	// Slow path: handle trigraph, unicode codepoints, UCNs.
1966	unsigned char C = getCharAndSize(Ptr: CurPtr, Size);
1967	if (isAsciiIdentifierContinue(c: C)) {
1968	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
1969	continue;
1970	}
1971	if (C == `'$'`) {
1972	// If we hit a $ and they are not supported in identifiers, we are done.
1973	if (!LangOpts.DollarIdents)
1974	break;
1975	// Otherwise, emit a diagnostic and continue.
1976	if (!isLexingRawMode())
1977	Diag(Loc: CurPtr, diag::DiagID: ext_dollar_in_identifier);
1978	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
1979	continue;
1980	}
1981	if (C == `'\\'` && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1982	continue;
1983	if (!isASCII(c: C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1984	continue;
1985	// Neither an expected Unicode codepoint nor a UCN.
1986	break;
1987	}
1988
1989	const char *IdStart = BufferPtr;
1990	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::raw_identifier);
1991	Result.setRawIdentifierData(IdStart);
1992
1993	// If we are in raw mode, return this identifier raw. There is no need to
1994	// look up identifier information or attempt to macro expand it.
1995	if (LexingRawMode)
1996	return true;
1997
1998	// Fill in Result.IdentifierInfo and update the token kind,
1999	// looking up the identifier in the identifier table.
2000	const IdentifierInfo *II = PP->LookUpIdentifierInfo(Identifier&: Result);
2001	// Note that we have to call PP->LookUpIdentifierInfo() even for code
2002	// completion, it writes IdentifierInfo into Result, and callers rely on it.
2003
2004	// If the completion point is at the end of an identifier, we want to treat
2005	// the identifier as incomplete even if it resolves to a macro or a keyword.
2006	// This allows e.g. 'class^' to complete to 'classifier'.
2007	if (isCodeCompletionPoint(CurPtr)) {
2008	// Return the code-completion token.
2009	Result.setKind(tok::code_completion);
2010	// Skip the code-completion char and all immediate identifier characters.
2011	// This ensures we get consistent behavior when completing at any point in
2012	// an identifier (i.e. at the start, in the middle, at the end). Note that
2013	// only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
2014	// simpler.
2015	assert(*CurPtr == `0` && "Completion character must be 0");
2016	++CurPtr;
2017	// Note that code completion token is not added as a separate character
2018	// when the completion point is at the end of the buffer. Therefore, we need
2019	// to check if the buffer has ended.
2020	if (CurPtr < BufferEnd) {
2021	while (isAsciiIdentifierContinue(c: *CurPtr))
2022	++CurPtr;
2023	}
2024	BufferPtr = CurPtr;
2025	return true;
2026	}
2027
2028	// Finally, now that we know we have an identifier, pass this off to the
2029	// preprocessor, which may macro expand it or something.
2030	if (II->isHandleIdentifierCase())
2031	return PP->HandleIdentifier(Identifier&: Result);
2032
2033	return true;
2034	}
2035
2036	/// isHexaLiteral - Return true if Start points to a hex constant.
2037	/// in microsoft mode (where this is supposed to be several different tokens).
2038	bool Lexer::isHexaLiteral(const char Start, const* LangOptions &LangOpts) {
2039	auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Ptr: Start, LangOpts);
2040	char C1 = CharAndSize1.Char;
2041	if (C1 != `'0'`)
2042	return false;
2043
2044	auto CharAndSize2 =
2045	Lexer::getCharAndSizeNoWarn(Ptr: Start + CharAndSize1.Size, LangOpts);
2046	char C2 = CharAndSize2.Char;
2047	return (C2 == `'x'` \|\| C2 == `'X'`);
2048	}
2049
2050	/// LexNumericConstant - Lex the remainder of a integer or floating point
2051	/// constant. From[-1] is the first character lexed. Return the end of the
2052	/// constant.
2053	bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
2054	unsigned Size;
2055	char C = getCharAndSize(Ptr: CurPtr, Size);
2056	char PrevCh = `0`;
2057	while (isPreprocessingNumberBody(c: C)) {
2058	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
2059	PrevCh = C;
2060	if (LangOpts.HLSL && C == `'.'` && (CurPtr == `'x'` \|\| CurPtr == `'r'`)) {
2061	CurPtr -= Size;
2062	break;
2063	}
2064	C = getCharAndSize(Ptr: CurPtr, Size);
2065	}
2066
2067	// If we fell out, check for a sign, due to 1e+12. If we have one, continue.
2068	if ((C == `'-'` \|\| C == `'+'`) && (PrevCh == `'E'` \|\| PrevCh == `'e'`)) {
2069	// If we are in Microsoft mode, don't continue if the constant is hex.
2070	// For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
2071	if (!LangOpts.MicrosoftExt \|\| !isHexaLiteral(Start: BufferPtr, LangOpts))
2072	return LexNumericConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size, Tok&: Result));
2073	}
2074
2075	// If we have a hex FP constant, continue.
2076	if ((C == `'-'` \|\| C == `'+'`) && (PrevCh == `'P'` \|\| PrevCh == `'p'`)) {
2077	// Outside C99 and C++17, we accept hexadecimal floating point numbers as a
2078	// not-quite-conforming extension. Only do so if this looks like it's
2079	// actually meant to be a hexfloat, and not if it has a ud-suffix.
2080	bool IsHexFloat = true;
2081	if (!LangOpts.C99) {
2082	if (!isHexaLiteral(Start: BufferPtr, LangOpts))
2083	IsHexFloat = false;
2084	else if (!LangOpts.CPlusPlus17 &&
2085	std::find(first: BufferPtr, last: CurPtr, val: `'_'`) != CurPtr)
2086	IsHexFloat = false;
2087	}
2088	if (IsHexFloat)
2089	return LexNumericConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size, Tok&: Result));
2090	}
2091
2092	// If we have a digit separator, continue.
2093	if (C == `'\''` && (LangOpts.CPlusPlus14 \|\| LangOpts.C23)) {
2094	auto [Next, NextSize] = getCharAndSizeNoWarn(Ptr: CurPtr + Size, LangOpts);
2095	if (isAsciiIdentifierContinue(c: Next)) {
2096	if (!isLexingRawMode())
2097	Diag(Loc: CurPtr, DiagID: LangOpts.CPlusPlus
2098	? diag::warn_cxx11_compat_digit_separator
2099	: diag::warn_c23_compat_digit_separator);
2100	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
2101	CurPtr = ConsumeChar(Ptr: CurPtr, Size: NextSize, Tok&: Result);
2102	return LexNumericConstant(Result, CurPtr);
2103	}
2104	}
2105
2106	// If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
2107	if (C == `'\\'` && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2108	return LexNumericConstant(Result, CurPtr);
2109	if (!isASCII(c: C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2110	return LexNumericConstant(Result, CurPtr);
2111
2112	// Update the location of token as well as BufferPtr.
2113	const char *TokStart = BufferPtr;
2114	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::numeric_constant);
2115	Result.setLiteralData(TokStart);
2116	return true;
2117	}
2118
2119	/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
2120	/// in C++11, or warn on a ud-suffix in C++98.
2121	const char Lexer::LexUDSuffix(Token &Result, const* char *CurPtr,
2122	bool IsStringLiteral) {
2123	assert(LangOpts.CPlusPlus);
2124
2125	// Maximally munch an identifier.
2126	unsigned Size;
2127	char C = getCharAndSize(Ptr: CurPtr, Size);
2128	bool Consumed = false;
2129
2130	if (!isAsciiIdentifierStart(c: C)) {
2131	if (C == `'\\'` && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2132	Consumed = true;
2133	else if (!isASCII(c: C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2134	Consumed = true;
2135	else
2136	return CurPtr;
2137	}
2138
2139	if (!LangOpts.CPlusPlus11) {
2140	if (!isLexingRawMode())
2141	Diag(Loc: CurPtr,
2142	DiagID: C == `'_'` ? diag::warn_cxx11_compat_user_defined_literal
2143	: diag::warn_cxx11_compat_reserved_user_defined_literal)
2144	<< FixItHint::CreateInsertion(InsertionLoc: getSourceLocation(Loc: CurPtr), Code: " ");
2145	return CurPtr;
2146	}
2147
2148	// C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2149	// that does not start with an underscore is ill-formed. As a conforming
2150	// extension, we treat all such suffixes as if they had whitespace before
2151	// them. We assume a suffix beginning with a UCN or UTF-8 character is more
2152	// likely to be a ud-suffix than a macro, however, and accept that.
2153	if (!Consumed) {
2154	bool IsUDSuffix = false;
2155	if (C == `'_'`)
2156	IsUDSuffix = true;
2157	else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2158	// In C++1y, we need to look ahead a few characters to see if this is a
2159	// valid suffix for a string literal or a numeric literal (this could be
2160	// the 'operator""if' defining a numeric literal operator).
2161	const unsigned MaxStandardSuffixLength = `3`;
2162	char Buffer[MaxStandardSuffixLength] = { C };
2163	unsigned Consumed = Size;
2164	unsigned Chars = `1`;
2165	while (true) {
2166	auto [Next, NextSize] =
2167	getCharAndSizeNoWarn(Ptr: CurPtr + Consumed, LangOpts);
2168	if (!isAsciiIdentifierContinue(c: Next)) {
2169	// End of suffix. Check whether this is on the allowed list.
2170	const StringRef CompleteSuffix(Buffer, Chars);
2171	IsUDSuffix =
2172	StringLiteralParser::isValidUDSuffix(LangOpts, Suffix: CompleteSuffix);
2173	break;
2174	}
2175
2176	if (Chars == MaxStandardSuffixLength)
2177	// Too long: can't be a standard suffix.
2178	break;
2179
2180	Buffer[Chars++] = Next;
2181	Consumed += NextSize;
2182	}
2183	}
2184
2185	if (!IsUDSuffix) {
2186	if (!isLexingRawMode())
2187	Diag(Loc: CurPtr, DiagID: LangOpts.MSVCCompat
2188	? diag::ext_ms_reserved_user_defined_literal
2189	: diag::ext_reserved_user_defined_literal)
2190	<< FixItHint::CreateInsertion(InsertionLoc: getSourceLocation(Loc: CurPtr), Code: " ");
2191	return CurPtr;
2192	}
2193
2194	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
2195	}
2196
2197	Result.setFlag(Token::HasUDSuffix);
2198	while (true) {
2199	C = getCharAndSize(Ptr: CurPtr, Size);
2200	if (isAsciiIdentifierContinue(c: C)) {
2201	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
2202	} else if (C == `'\\'` && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2203	} else if (!isASCII(c: C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2204	} else
2205	break;
2206	}
2207
2208	return CurPtr;
2209	}
2210
2211	/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2212	/// either " or L" or u8" or u" or U".
2213	bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2214	tok::TokenKind Kind) {
2215	const char *AfterQuote = CurPtr;
2216	// Does this string contain the \0 character?
2217	const char NulCharacter = nullptr*;
2218
2219	if (!isLexingRawMode() &&
2220	(Kind == tok::utf8_string_literal \|\|
2221	Kind == tok::utf16_string_literal \|\|
2222	Kind == tok::utf32_string_literal))
2223	Diag(Loc: BufferPtr, DiagID: LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2224	: diag::warn_c99_compat_unicode_literal);
2225
2226	char C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2227	while (C != `'"'`) {
2228	// Skip escaped characters. Escaped newlines will already be processed by
2229	// getAndAdvanceChar.
2230	if (C == `'\\'`)
2231	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2232
2233	if (C == `'\n'` \|\| C == `'\r'` \|\| // Newline.
2234	(C == `0` && CurPtr-`1` == BufferEnd)) { // End of file.
2235	if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2236	Diag(Loc: BufferPtr, diag::DiagID: ext_unterminated_char_or_string) << `1`;
2237	FormTokenWithChars(Result, TokEnd: CurPtr-`1`, Kind: tok::unknown);
2238	return true;
2239	}
2240
2241	if (C == `0`) {
2242	if (isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
2243	if (ParsingFilename)
2244	codeCompleteIncludedFile(PathStart: AfterQuote, CompletionPoint: CurPtr - `1`, /IsAngled=/false);
2245	else
2246	PP->CodeCompleteNaturalLanguage();
2247	FormTokenWithChars(Result, TokEnd: CurPtr - `1`, Kind: tok::unknown);
2248	cutOffLexing();
2249	return true;
2250	}
2251
2252	NulCharacter = CurPtr-`1`;
2253	}
2254	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2255	}
2256
2257	// If we are in C++11, lex the optional ud-suffix.
2258	if (LangOpts.CPlusPlus)
2259	CurPtr = LexUDSuffix(Result, CurPtr, IsStringLiteral: true);
2260
2261	// If a nul character existed in the string, warn about it.
2262	if (NulCharacter && !isLexingRawMode())
2263	Diag(Loc: NulCharacter, diag::DiagID: null_in_char_or_string) << `1`;
2264
2265	// Update the location of the token as well as the BufferPtr instance var.
2266	const char *TokStart = BufferPtr;
2267	FormTokenWithChars(Result, TokEnd: CurPtr, Kind);
2268	Result.setLiteralData(TokStart);
2269	return true;
2270	}
2271
2272	/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2273	/// having lexed R", LR", u8R", uR", or UR".
2274	bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2275	tok::TokenKind Kind) {
2276	// This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2277	// Between the initial and final double quote characters of the raw string,
2278	// any transformations performed in phases 1 and 2 (trigraphs,
2279	// universal-character-names, and line splicing) are reverted.
2280
2281	if (!isLexingRawMode())
2282	Diag(Loc: BufferPtr, diag::DiagID: warn_cxx98_compat_raw_string_literal);
2283
2284	unsigned PrefixLen = `0`;
2285
2286	while (PrefixLen != `16` && isRawStringDelimBody(c: CurPtr[PrefixLen])) {
2287	if (!isLexingRawMode() &&
2288	llvm::is_contained(Set: {`'$'`, `'@'`, '`'}, Element: CurPtr[PrefixLen])) {
2289	const char *Pos = &CurPtr[PrefixLen];
2290	Diag(Loc: Pos, DiagID: LangOpts.CPlusPlus26
2291	? diag::warn_cxx26_compat_raw_string_literal_character_set
2292	: diag::ext_cxx26_raw_string_literal_character_set)
2293	<< StringRef(Pos, `1`);
2294	}
2295	++PrefixLen;
2296	}
2297
2298	// If the last character was not a '(', then we didn't lex a valid delimiter.
2299	if (CurPtr[PrefixLen] != `'('`) {
2300	if (!isLexingRawMode()) {
2301	const char *PrefixEnd = &CurPtr[PrefixLen];
2302	if (PrefixLen == `16`) {
2303	Diag(Loc: PrefixEnd, diag::DiagID: err_raw_delim_too_long);
2304	} else if (*PrefixEnd == `'\n'`) {
2305	Diag(Loc: PrefixEnd, diag::DiagID: err_invalid_newline_raw_delim);
2306	} else {
2307	Diag(Loc: PrefixEnd, diag::DiagID: err_invalid_char_raw_delim)
2308	<< StringRef(PrefixEnd, `1`);
2309	}
2310	}
2311
2312	// Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2313	// it's possible the '"' was intended to be part of the raw string, but
2314	// there's not much we can do about that.
2315	while (true) {
2316	char C = *CurPtr++;
2317
2318	if (C == `'"'`)
2319	break;
2320	if (C == `0` && CurPtr-`1` == BufferEnd) {
2321	--CurPtr;
2322	break;
2323	}
2324	}
2325
2326	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
2327	return true;
2328	}
2329
2330	// Save prefix and move CurPtr past it
2331	const char *Prefix = CurPtr;
2332	CurPtr += PrefixLen + `1`; // skip over prefix and '('
2333
2334	while (true) {
2335	char C = *CurPtr++;
2336
2337	if (C == `')'`) {
2338	// Check for prefix match and closing quote.
2339	if (strncmp(s1: CurPtr, s2: Prefix, n: PrefixLen) == `0` && CurPtr[PrefixLen] == `'"'`) {
2340	CurPtr += PrefixLen + `1`; // skip over prefix and '"'
2341	break;
2342	}
2343	} else if (C == `0` && CurPtr-`1` == BufferEnd) { // End of file.
2344	if (!isLexingRawMode())
2345	Diag(Loc: BufferPtr, diag::DiagID: err_unterminated_raw_string)
2346	<< StringRef(Prefix, PrefixLen);
2347	FormTokenWithChars(Result, TokEnd: CurPtr-`1`, Kind: tok::unknown);
2348	return true;
2349	}
2350	}
2351
2352	// If we are in C++11, lex the optional ud-suffix.
2353	if (LangOpts.CPlusPlus)
2354	CurPtr = LexUDSuffix(Result, CurPtr, IsStringLiteral: true);
2355
2356	// Update the location of token as well as BufferPtr.
2357	const char *TokStart = BufferPtr;
2358	FormTokenWithChars(Result, TokEnd: CurPtr, Kind);
2359	Result.setLiteralData(TokStart);
2360	return true;
2361	}
2362
2363	/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2364	/// after having lexed the '<' character. This is used for #include filenames.
2365	bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2366	// Does this string contain the \0 character?
2367	const char NulCharacter = nullptr*;
2368	const char *AfterLessPos = CurPtr;
2369	char C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2370	while (C != `'>'`) {
2371	// Skip escaped characters. Escaped newlines will already be processed by
2372	// getAndAdvanceChar.
2373	if (C == `'\\'`)
2374	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2375
2376	if (isVerticalWhitespace(c: C) \|\| // Newline.
2377	(C == `0` && (CurPtr - `1` == BufferEnd))) { // End of file.
2378	// If the filename is unterminated, then it must just be a lone <
2379	// character. Return this as such.
2380	FormTokenWithChars(Result, TokEnd: AfterLessPos, Kind: tok::less);
2381	return true;
2382	}
2383
2384	if (C == `0`) {
2385	if (isCodeCompletionPoint(CurPtr: CurPtr - `1`)) {
2386	codeCompleteIncludedFile(PathStart: AfterLessPos, CompletionPoint: CurPtr - `1`, /IsAngled=/true);
2387	cutOffLexing();
2388	FormTokenWithChars(Result, TokEnd: CurPtr - `1`, Kind: tok::unknown);
2389	return true;
2390	}
2391	NulCharacter = CurPtr-`1`;
2392	}
2393	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2394	}
2395
2396	// If a nul character existed in the string, warn about it.
2397	if (NulCharacter && !isLexingRawMode())
2398	Diag(Loc: NulCharacter, diag::DiagID: null_in_char_or_string) << `1`;
2399
2400	// Update the location of token as well as BufferPtr.
2401	const char *TokStart = BufferPtr;
2402	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::header_name);
2403	Result.setLiteralData(TokStart);
2404	return true;
2405	}
2406
2407	void Lexer::codeCompleteIncludedFile(const char *PathStart,
2408	const char *CompletionPoint,
2409	bool IsAngled) {
2410	// Completion only applies to the filename, after the last slash.
2411	StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2412	llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2413	auto Slash = PartialPath.find_last_of(Chars: SlashChars);
2414	StringRef Dir =
2415	(Slash == StringRef::npos) ? "" : PartialPath.take_front(N: Slash);
2416	const char *StartOfFilename =
2417	(Slash == StringRef::npos) ? PathStart : PathStart + Slash + `1`;
2418	// Code completion filter range is the filename only, up to completion point.
2419	PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
2420	Name: StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2421	// We should replace the characters up to the closing quote or closest slash,
2422	// if any.
2423	while (CompletionPoint < BufferEnd) {
2424	char Next = *(CompletionPoint + `1`);
2425	if (Next == `0` \|\| Next == `'\r'` \|\| Next == `'\n'`)
2426	break;
2427	++CompletionPoint;
2428	if (Next == (IsAngled ? `'>'` : `'"'`))
2429	break;
2430	if (SlashChars.contains(C: Next))
2431	break;
2432	}
2433
2434	PP->setCodeCompletionTokenRange(
2435	Start: FileLoc.getLocWithOffset(Offset: StartOfFilename - BufferStart),
2436	End: FileLoc.getLocWithOffset(Offset: CompletionPoint - BufferStart));
2437	PP->CodeCompleteIncludedFile(Dir, IsAngled);
2438	}
2439
2440	/// LexCharConstant - Lex the remainder of a character constant, after having
2441	/// lexed either ' or L' or u8' or u' or U'.
2442	bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2443	tok::TokenKind Kind) {
2444	// Does this character contain the \0 character?
2445	const char NulCharacter = nullptr*;
2446
2447	if (!isLexingRawMode()) {
2448	if (Kind == tok::utf16_char_constant \|\| Kind == tok::utf32_char_constant)
2449	Diag(Loc: BufferPtr, DiagID: LangOpts.CPlusPlus
2450	? diag::warn_cxx98_compat_unicode_literal
2451	: diag::warn_c99_compat_unicode_literal);
2452	else if (Kind == tok::utf8_char_constant)
2453	Diag(Loc: BufferPtr, DiagID: LangOpts.CPlusPlus
2454	? diag::warn_cxx14_compat_u8_character_literal
2455	: diag::warn_c17_compat_u8_character_literal);
2456	}
2457
2458	char C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2459	if (C == `'\''`) {
2460	if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2461	Diag(Loc: BufferPtr, diag::DiagID: ext_empty_character);
2462	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
2463	return true;
2464	}
2465
2466	while (C != `'\''`) {
2467	// Skip escaped characters.
2468	if (C == `'\\'`)
2469	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2470
2471	if (C == `'\n'` \|\| C == `'\r'` \|\| // Newline.
2472	(C == `0` && CurPtr-`1` == BufferEnd)) { // End of file.
2473	if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2474	Diag(Loc: BufferPtr, diag::DiagID: ext_unterminated_char_or_string) << `0`;
2475	FormTokenWithChars(Result, TokEnd: CurPtr-`1`, Kind: tok::unknown);
2476	return true;
2477	}
2478
2479	if (C == `0`) {
2480	if (isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
2481	PP->CodeCompleteNaturalLanguage();
2482	FormTokenWithChars(Result, TokEnd: CurPtr-`1`, Kind: tok::unknown);
2483	cutOffLexing();
2484	return true;
2485	}
2486
2487	NulCharacter = CurPtr-`1`;
2488	}
2489	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2490	}
2491
2492	// If we are in C++11, lex the optional ud-suffix.
2493	if (LangOpts.CPlusPlus)
2494	CurPtr = LexUDSuffix(Result, CurPtr, IsStringLiteral: false);
2495
2496	// If a nul character existed in the character, warn about it.
2497	if (NulCharacter && !isLexingRawMode())
2498	Diag(Loc: NulCharacter, diag::DiagID: null_in_char_or_string) << `0`;
2499
2500	// Update the location of token as well as BufferPtr.
2501	const char *TokStart = BufferPtr;
2502	FormTokenWithChars(Result, TokEnd: CurPtr, Kind);
2503	Result.setLiteralData(TokStart);
2504	return true;
2505	}
2506
2507	/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2508	/// Update BufferPtr to point to the next non-whitespace character and return.
2509	///
2510	/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2511	bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2512	bool &TokAtPhysicalStartOfLine) {
2513	// Whitespace - Skip it, then return the token after the whitespace.
2514	bool SawNewline = isVerticalWhitespace(c: CurPtr[-`1`]);
2515
2516	unsigned char Char = *CurPtr;
2517
2518	const char lastNewLine = nullptr*;
2519	auto setLastNewLine = [&](const char *Ptr) {
2520	lastNewLine = Ptr;
2521	if (!NewLinePtr)
2522	NewLinePtr = Ptr;
2523	};
2524	if (SawNewline)
2525	setLastNewLine (CurPtr - `1`);
2526
2527	// Skip consecutive spaces efficiently.
2528	while (true) {
2529	// Skip horizontal whitespace very aggressively.
2530	while (isHorizontalWhitespace(c: Char))
2531	Char = *++CurPtr;
2532
2533	// Otherwise if we have something other than whitespace, we're done.
2534	if (!isVerticalWhitespace(c: Char))
2535	break;
2536
2537	if (ParsingPreprocessorDirective) {
2538	// End of preprocessor directive line, let LexTokenInternal handle this.
2539	BufferPtr = CurPtr;
2540	return false;
2541	}
2542
2543	// OK, but handle newline.
2544	if (*CurPtr == `'\n'`)
2545	setLastNewLine (CurPtr);
2546	SawNewline = true;
2547	Char = *++CurPtr;
2548	}
2549
2550	// If the client wants us to return whitespace, return it now.
2551	if (isKeepWhitespaceMode()) {
2552	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
2553	if (SawNewline) {
2554	IsAtStartOfLine = true;
2555	IsAtPhysicalStartOfLine = true;
2556	}
2557	// FIXME: The next token will not have LeadingSpace set.
2558	return true;
2559	}
2560
2561	// If this isn't immediately after a newline, there is leading space.
2562	char PrevChar = CurPtr[-`1`];
2563	bool HasLeadingSpace = !isVerticalWhitespace(c: PrevChar);
2564
2565	Result.setFlagValue(Flag: Token::LeadingSpace, Val: HasLeadingSpace);
2566	if (SawNewline) {
2567	Result.setFlag(Token::StartOfLine);
2568	TokAtPhysicalStartOfLine = true;
2569
2570	if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2571	if (auto *Handler = PP->getEmptylineHandler())
2572	Handler->HandleEmptyline(Range: SourceRange (getSourceLocation(Loc: NewLinePtr + `1`),
2573	getSourceLocation(Loc: lastNewLine)));
2574	}
2575	}
2576
2577	BufferPtr = CurPtr;
2578	return false;
2579	}
2580
2581	/// We have just read the // characters from input. Skip until we find the
2582	/// newline character that terminates the comment. Then update BufferPtr and
2583	/// return.
2584	///
2585	/// If we're in KeepCommentMode or any CommentHandler has inserted
2586	/// some tokens, this will store the first token and return true.
2587	bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2588	bool &TokAtPhysicalStartOfLine) {
2589	// If Line comments aren't explicitly enabled for this language, emit an
2590	// extension warning.
2591	if (!LineComment) {
2592	if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2593	Diag(Loc: BufferPtr, diag::DiagID: ext_line_comment);
2594
2595	// Mark them enabled so we only emit one warning for this translation
2596	// unit.
2597	LineComment = true;
2598	}
2599
2600	// Scan over the body of the comment. The common case, when scanning, is that
2601	// the comment contains normal ascii characters with nothing interesting in
2602	// them. As such, optimize for this case with the inner loop.
2603	//
2604	// This loop terminates with CurPtr pointing at the newline (or end of buffer)
2605	// character that ends the line comment.
2606
2607	// C++23 [lex.phases] p1
2608	// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2609	// diagnostic only once per entire ill-formed subsequence to avoid
2610	// emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2611	bool UnicodeDecodingAlreadyDiagnosed = false;
2612
2613	char C;
2614	while (true) {
2615	C = *CurPtr;
2616	// Skip over characters in the fast loop.
2617	while (isASCII(c: C) && C != `0` && // Potentially EOF.
2618	C != `'\n'` && C != `'\r'`) { // Newline or DOS-style newline.
2619	C = *++CurPtr;
2620	UnicodeDecodingAlreadyDiagnosed = false;
2621	}
2622
2623	if (!isASCII(c: C)) {
2624	unsigned Length = llvm::getUTF8SequenceSize(
2625	source: (const llvm::UTF8 )CurPtr, sourceEnd: (const* llvm::UTF8 *)BufferEnd);
2626	if (Length == `0`) {
2627	if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2628	Diag(Loc: CurPtr, diag::DiagID: warn_invalid_utf8_in_comment);
2629	UnicodeDecodingAlreadyDiagnosed = true;
2630	++CurPtr;
2631	} else {
2632	UnicodeDecodingAlreadyDiagnosed = false;
2633	CurPtr += Length;
2634	}
2635	continue;
2636	}
2637
2638	const char *NextLine = CurPtr;
2639	if (C != `0`) {
2640	// We found a newline, see if it's escaped.
2641	const char *EscapePtr = CurPtr-`1`;
2642	bool HasSpace = false;
2643	while (isHorizontalWhitespace(c: EscapePtr)) { // Skip whitespace.*
2644	--EscapePtr;
2645	HasSpace = true;
2646	}
2647
2648	if (*EscapePtr == `'\\'`)
2649	// Escaped newline.
2650	CurPtr = EscapePtr;
2651	else if (EscapePtr[`0`] == `'/'` && EscapePtr[-`1`] == `'?'` &&
2652	EscapePtr[-`2`] == `'?'` && LangOpts.Trigraphs)
2653	// Trigraph-escaped newline.
2654	CurPtr = EscapePtr-`2`;
2655	else
2656	break; // This is a newline, we're done.
2657
2658	// If there was space between the backslash and newline, warn about it.
2659	if (HasSpace && !isLexingRawMode())
2660	Diag(Loc: EscapePtr, diag::DiagID: backslash_newline_space);
2661	}
2662
2663	// Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2664	// properly decode the character. Read it in raw mode to avoid emitting
2665	// diagnostics about things like trigraphs. If we see an escaped newline,
2666	// we'll handle it below.
2667	const char *OldPtr = CurPtr;
2668	bool OldRawMode = isLexingRawMode();
2669	LexingRawMode = true;
2670	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2671	LexingRawMode = OldRawMode;
2672
2673	// If we only read only one character, then no special handling is needed.
2674	// We're done and can skip forward to the newline.
2675	if (C != `0` && CurPtr == OldPtr+`1`) {
2676	CurPtr = NextLine;
2677	break;
2678	}
2679
2680	// If we read multiple characters, and one of those characters was a \r or
2681	// \n, then we had an escaped newline within the comment. Emit diagnostic
2682	// unless the next line is also a // comment.
2683	if (CurPtr != OldPtr + `1` && C != `'/'` &&
2684	(CurPtr == BufferEnd + `1` \|\| CurPtr[`0`] != `'/'`)) {
2685	for (; OldPtr != CurPtr; ++OldPtr)
2686	if (OldPtr[`0`] == `'\n'` \|\| OldPtr[`0`] == `'\r'`) {
2687	// Okay, we found a // comment that ends in a newline, if the next
2688	// line is also a // comment, but has spaces, don't emit a diagnostic.
2689	if (isWhitespace(c: C)) {
2690	const char *ForwardPtr = CurPtr;
2691	while (isWhitespace(c: ForwardPtr)) // Skip whitespace.*
2692	++ForwardPtr;
2693	if (ForwardPtr[`0`] == `'/'` && ForwardPtr[`1`] == `'/'`)
2694	break;
2695	}
2696
2697	if (!isLexingRawMode())
2698	Diag(Loc: OldPtr-`1`, diag::DiagID: ext_multi_line_line_comment);
2699	break;
2700	}
2701	}
2702
2703	if (C == `'\r'` \|\| C == `'\n'` \|\| CurPtr == BufferEnd + `1`) {
2704	--CurPtr;
2705	break;
2706	}
2707
2708	if (C == `'\0'` && isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
2709	PP->CodeCompleteNaturalLanguage();
2710	cutOffLexing();
2711	return false;
2712	}
2713	}
2714
2715	// Found but did not consume the newline. Notify comment handlers about the
2716	// comment unless we're in a #if 0 block.
2717	if (PP && !isLexingRawMode() &&
2718	PP->HandleComment(result&: Result, Comment: SourceRange (getSourceLocation(Loc: BufferPtr),
2719	getSourceLocation(Loc: CurPtr)))) {
2720	BufferPtr = CurPtr;
2721	return true; // A token has to be returned.
2722	}
2723
2724	// If we are returning comments as tokens, return this comment as a token.
2725	if (inKeepCommentMode())
2726	return SaveLineComment(Result, CurPtr);
2727
2728	// If we are inside a preprocessor directive and we see the end of line,
2729	// return immediately, so that the lexer can return this as an EOD token.
2730	if (ParsingPreprocessorDirective \|\| CurPtr == BufferEnd) {
2731	BufferPtr = CurPtr;
2732	return false;
2733	}
2734
2735	// Otherwise, eat the \n character. We don't care if this is a \n\r or
2736	// \r\n sequence. This is an efficiency hack (because we know the \n can't
2737	// contribute to another token), it isn't needed for correctness. Note that
2738	// this is ok even in KeepWhitespaceMode, because we would have returned the
2739	// comment above in that mode.
2740	NewLinePtr = CurPtr++;
2741
2742	// The next returned token is at the start of the line.
2743	Result.setFlag(Token::StartOfLine);
2744	TokAtPhysicalStartOfLine = true;
2745	// No leading whitespace seen so far.
2746	Result.clearFlag(Flag: Token::LeadingSpace);
2747	BufferPtr = CurPtr;
2748	return false;
2749	}
2750
2751	/// If in save-comment mode, package up this Line comment in an appropriate
2752	/// way and return it.
2753	bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2754	// If we're not in a preprocessor directive, just return the // comment
2755	// directly.
2756	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::comment);
2757
2758	if (!ParsingPreprocessorDirective \|\| LexingRawMode)
2759	return true;
2760
2761	// If this Line-style comment is in a macro definition, transmogrify it into
2762	// a C-style block comment.
2763	bool Invalid = false;
2764	std::string Spelling = PP->getSpelling(Tok: Result, Invalid: &Invalid);
2765	if (Invalid)
2766	return true;
2767
2768	assert(Spelling[`0`] == `'/'` && Spelling[`1`] == `'/'` && "Not line comment?");
2769	Spelling [`1`] = `''`; // Change prefix to "/".
2770	Spelling += "/"; // add suffix.*
2771
2772	Result.setKind(tok::comment);
2773	PP->CreateString(Str: Spelling, Tok&: Result,
2774	ExpansionLocStart: Result.getLocation(), ExpansionLocEnd: Result.getLocation());
2775	return true;
2776	}
2777
2778	/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2779	/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2780	/// a diagnostic if so. We know that the newline is inside of a block comment.
2781	static bool isEndOfBlockCommentWithEscapedNewLine(const char CurPtr, Lexer L,
2782	bool Trigraphs) {
2783	assert(CurPtr[`0`] == `'\n'` \|\| CurPtr[`0`] == `'\r'`);
2784
2785	// Position of the first trigraph in the ending sequence.
2786	const char TrigraphPos = nullptr*;
2787	// Position of the first whitespace after a '\' in the ending sequence.
2788	const char SpacePos = nullptr*;
2789
2790	while (true) {
2791	// Back up off the newline.
2792	--CurPtr;
2793
2794	// If this is a two-character newline sequence, skip the other character.
2795	if (CurPtr[`0`] == `'\n'` \|\| CurPtr[`0`] == `'\r'`) {
2796	// \n\n or \r\r -> not escaped newline.
2797	if (CurPtr[`0`] == CurPtr[`1`])
2798	return false;
2799	// \n\r or \r\n -> skip the newline.
2800	--CurPtr;
2801	}
2802
2803	// If we have horizontal whitespace, skip over it. We allow whitespace
2804	// between the slash and newline.
2805	while (isHorizontalWhitespace(c: CurPtr) \|\| CurPtr == `0`) {
2806	SpacePos = CurPtr;
2807	--CurPtr;
2808	}
2809
2810	// If we have a slash, this is an escaped newline.
2811	if (*CurPtr == `'\\'`) {
2812	--CurPtr;
2813	} else if (CurPtr[`0`] == `'/'` && CurPtr[-`1`] == `'?'` && CurPtr[-`2`] == `'?'`) {
2814	// This is a trigraph encoding of a slash.
2815	TrigraphPos = CurPtr - `2`;
2816	CurPtr -= `3`;
2817	} else {
2818	return false;
2819	}
2820
2821	// If the character preceding the escaped newline is a '', then after line*
2822	// splicing we have a '/' ending the comment.*
2823	if (CurPtr == `''`)
2824	break;
2825
2826	if (CurPtr != `'\n'` && CurPtr != `'\r'`)
2827	return false;
2828	}
2829
2830	if (TrigraphPos) {
2831	// If no trigraphs are enabled, warn that we ignored this trigraph and
2832	// ignore this character.*
2833	if (!Trigraphs) {
2834	if (!L->isLexingRawMode())
2835	L->Diag(Loc: TrigraphPos, diag::DiagID: trigraph_ignored_block_comment);
2836	return false;
2837	}
2838	if (!L->isLexingRawMode())
2839	L->Diag(Loc: TrigraphPos, diag::DiagID: trigraph_ends_block_comment);
2840	}
2841
2842	// Warn about having an escaped newline between the / characters.*
2843	if (!L->isLexingRawMode())
2844	L->Diag(Loc: CurPtr + `1`, diag::DiagID: escaped_newline_block_comment_end);
2845
2846	// If there was space between the backslash and newline, warn about it.
2847	if (SpacePos && !L->isLexingRawMode())
2848	L->Diag(Loc: SpacePos, diag::DiagID: backslash_newline_space);
2849
2850	return true;
2851	}
2852
2853	#ifdef __SSE2__
2854	#include <emmintrin.h>
2855	#elif __ALTIVEC__
2856	#include <altivec.h>
2857	#undef bool
2858	#endif
2859
2860	/// We have just read from input the / and characters that started a comment.*
2861	/// Read until we find the and / characters that terminate the comment.*
2862	/// Note that we don't bother decoding trigraphs or escaped newlines in block
2863	/// comments, because they cannot cause the comment to end. The only thing
2864	/// that can happen is the comment could end with an escaped newline between
2865	/// the terminating and /.*
2866	///
2867	/// If we're in KeepCommentMode or any CommentHandler has inserted
2868	/// some tokens, this will store the first token and return true.
2869	bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2870	bool &TokAtPhysicalStartOfLine) {
2871	// Scan one character past where we should, looking for a '/' character. Once
2872	// we find it, check to see if it was preceded by a . This common*
2873	// optimization helps people who like to put a lot of characters in their*
2874	// comments.
2875
2876	// The first character we get with newlines and trigraphs skipped to handle
2877	// the degenerate // case below correctly if the * has an escaped newline*
2878	// after it.
2879	unsigned CharSize;
2880	unsigned char C = getCharAndSize(Ptr: CurPtr, Size&: CharSize);
2881	CurPtr += CharSize;
2882	if (C == `0` && CurPtr == BufferEnd+`1`) {
2883	if (!isLexingRawMode())
2884	Diag(Loc: BufferPtr, diag::DiagID: err_unterminated_block_comment);
2885	--CurPtr;
2886
2887	// KeepWhitespaceMode should return this broken comment as a token. Since
2888	// it isn't a well formed comment, just return it as an 'unknown' token.
2889	if (isKeepWhitespaceMode()) {
2890	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
2891	return true;
2892	}
2893
2894	BufferPtr = CurPtr;
2895	return false;
2896	}
2897
2898	// Check to see if the first character after the '/' is another /. If so,*
2899	// then this slash does not end the block comment, it is part of it.
2900	if (C == `'/'`)
2901	C = *CurPtr++;
2902
2903	// C++23 [lex.phases] p1
2904	// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2905	// diagnostic only once per entire ill-formed subsequence to avoid
2906	// emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2907	bool UnicodeDecodingAlreadyDiagnosed = false;
2908
2909	while (true) {
2910	// Skip over all non-interesting characters until we find end of buffer or a
2911	// (probably ending) '/' character.
2912	if (CurPtr + `24` < BufferEnd &&
2913	// If there is a code-completion point avoid the fast scan because it
2914	// doesn't check for '\0'.
2915	!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2916	// While not aligned to a 16-byte boundary.
2917	while (C != `'/'` && (intptr_t)CurPtr % `16` != `0`) {
2918	if (!isASCII(c: C))
2919	goto MultiByteUTF8;
2920	C = *CurPtr++;
2921	}
2922	if (C == `'/'`) goto FoundSlash;
2923
2924	#ifdef __SSE2__
2925	__m128i Slashes = _mm_set1_epi8(b: `'/'`);
2926	while (CurPtr + `16` < BufferEnd) {
2927	int Mask = _mm_movemask_epi8(a: (const* __m128i *)CurPtr);
2928	if (LLVM_UNLIKELY(Mask != `0`)) {
2929	goto MultiByteUTF8;
2930	}
2931	// look for slashes
2932	int cmp = _mm_movemask_epi8(a: _mm_cmpeq_epi8(a: (const* __m128i*)CurPtr,
2933	b: Slashes));
2934	if (cmp != `0`) {
2935	// Adjust the pointer to point directly after the first slash. It's
2936	// not necessary to set C here, it will be overwritten at the end of
2937	// the outer loop.
2938	CurPtr += llvm::countr_zero<unsigned>(Val: cmp) + `1`;
2939	goto FoundSlash;
2940	}
2941	CurPtr += `16`;
2942	}
2943	#elif __ALTIVEC__
2944	__vector unsigned char LongUTF = {`0x80`, `0x80`, `0x80`, `0x80`, `0x80`, `0x80`,
2945	`0x80`, `0x80`, `0x80`, `0x80`, `0x80`, `0x80`,
2946	`0x80`, `0x80`, `0x80`, `0x80`};
2947	__vector unsigned char Slashes = {
2948	`'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`,
2949	`'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`
2950	};
2951	while (CurPtr + `16` < BufferEnd) {
2952	if (LLVM_UNLIKELY(
2953	vec_any_ge((const* __vector unsigned char *)CurPtr, LongUTF)))
2954	goto MultiByteUTF8;
2955	if (vec_any_eq((const* __vector unsigned char *)CurPtr, Slashes)) {
2956	break;
2957	}
2958	CurPtr += `16`;
2959	}
2960
2961	#else
2962	while (CurPtr + `16` < BufferEnd) {
2963	bool HasNonASCII = false;
2964	for (unsigned I = `0`; I < `16`; ++I)
2965	HasNonASCII \|= !isASCII(CurPtr[I]);
2966
2967	if (LLVM_UNLIKELY(HasNonASCII))
2968	goto MultiByteUTF8;
2969
2970	bool HasSlash = false;
2971	for (unsigned I = `0`; I < `16`; ++I)
2972	HasSlash \|= CurPtr[I] == `'/'`;
2973	if (HasSlash)
2974	break;
2975	CurPtr += `16`;
2976	}
2977	#endif
2978
2979	// It has to be one of the bytes scanned, increment to it and read one.
2980	C = *CurPtr++;
2981	}
2982
2983	// Loop to scan the remainder, warning on invalid UTF-8
2984	// if the corresponding warning is enabled, emitting a diagnostic only once
2985	// per sequence that cannot be decoded.
2986	while (C != `'/'` && C != `'\0'`) {
2987	if (isASCII(c: C)) {
2988	UnicodeDecodingAlreadyDiagnosed = false;
2989	C = *CurPtr++;
2990	continue;
2991	}
2992	MultiByteUTF8:
2993	// CurPtr is 1 code unit past C, so to decode
2994	// the codepoint, we need to read from the previous position.
2995	unsigned Length = llvm::getUTF8SequenceSize(
2996	source: (const llvm::UTF8 )CurPtr - `1`, sourceEnd: (const* llvm::UTF8 *)BufferEnd);
2997	if (Length == `0`) {
2998	if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2999	Diag(Loc: CurPtr - `1`, diag::DiagID: warn_invalid_utf8_in_comment);
3000	UnicodeDecodingAlreadyDiagnosed = true;
3001	} else {
3002	UnicodeDecodingAlreadyDiagnosed = false;
3003	CurPtr += Length - `1`;
3004	}
3005	C = *CurPtr++;
3006	}
3007
3008	if (C == `'/'`) {
3009	FoundSlash:
3010	if (CurPtr[-`2`] == `''`) // We found the final /. We're done!
3011	break;
3012
3013	if ((CurPtr[-`2`] == `'\n'` \|\| CurPtr[-`2`] == `'\r'`)) {
3014	if (isEndOfBlockCommentWithEscapedNewLine(CurPtr: CurPtr - `2`, L: this,
3015	Trigraphs: LangOpts.Trigraphs)) {
3016	// We found the final /, though it had an escaped newline between the*
3017	// and /. We're done!*
3018	break;
3019	}
3020	}
3021	if (CurPtr[`0`] == `'*'` && CurPtr[`1`] != `'/'`) {
3022	// If this is a / inside of the comment, emit a warning. Don't do this*
3023	// if this is a //, which will end the comment. This misses cases with*
3024	// embedded escaped newlines, but oh well.
3025	if (!isLexingRawMode())
3026	Diag(Loc: CurPtr-`1`, diag::DiagID: warn_nested_block_comment);
3027	}
3028	} else if (C == `0` && CurPtr == BufferEnd+`1`) {
3029	if (!isLexingRawMode())
3030	Diag(Loc: BufferPtr, diag::DiagID: err_unterminated_block_comment);
3031	// Note: the user probably forgot a /. We could continue immediately*
3032	// after the /, but this would involve lexing a lot of what really is the*
3033	// comment, which surely would confuse the parser.
3034	--CurPtr;
3035
3036	// KeepWhitespaceMode should return this broken comment as a token. Since
3037	// it isn't a well formed comment, just return it as an 'unknown' token.
3038	if (isKeepWhitespaceMode()) {
3039	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
3040	return true;
3041	}
3042
3043	BufferPtr = CurPtr;
3044	return false;
3045	} else if (C == `'\0'` && isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
3046	PP->CodeCompleteNaturalLanguage();
3047	cutOffLexing();
3048	return false;
3049	}
3050
3051	C = *CurPtr++;
3052	}
3053
3054	// Notify comment handlers about the comment unless we're in a #if 0 block.
3055	if (PP && !isLexingRawMode() &&
3056	PP->HandleComment(result&: Result, Comment: SourceRange (getSourceLocation(Loc: BufferPtr),
3057	getSourceLocation(Loc: CurPtr)))) {
3058	BufferPtr = CurPtr;
3059	return true; // A token has to be returned.
3060	}
3061
3062	// If we are returning comments as tokens, return this comment as a token.
3063	if (inKeepCommentMode()) {
3064	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::comment);
3065	return true;
3066	}
3067
3068	// It is common for the tokens immediately after a // comment to be
3069	// whitespace. Instead of going through the big switch, handle it
3070	// efficiently now. This is safe even in KeepWhitespaceMode because we would
3071	// have already returned above with the comment as a token.
3072	if (isHorizontalWhitespace(c: *CurPtr)) {
3073	SkipWhitespace(Result, CurPtr: CurPtr+`1`, TokAtPhysicalStartOfLine);
3074	return false;
3075	}
3076
3077	// Otherwise, just return so that the next character will be lexed as a token.
3078	BufferPtr = CurPtr;
3079	Result.setFlag(Token::LeadingSpace);
3080	return false;
3081	}
3082
3083	//===----------------------------------------------------------------------===//
3084	// Primary Lexing Entry Points
3085	//===----------------------------------------------------------------------===//
3086
3087	/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
3088	/// uninterpreted string. This switches the lexer out of directive mode.
3089	void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
3090	assert(ParsingPreprocessorDirective && ParsingFilename == false &&
3091	"Must be in a preprocessing directive!");
3092	Token Tmp;
3093	Tmp.startToken();
3094
3095	// CurPtr - Cache BufferPtr in an automatic variable.
3096	const char *CurPtr = BufferPtr;
3097	while (true) {
3098	char Char = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Tmp);
3099	switch (Char) {
3100	default:
3101	if (Result)
3102	Result->push_back(Elt: Char);
3103	break;
3104	case `0`: // Null.
3105	// Found end of file?
3106	if (CurPtr-`1` != BufferEnd) {
3107	if (isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
3108	PP->CodeCompleteNaturalLanguage();
3109	cutOffLexing();
3110	return;
3111	}
3112
3113	// Nope, normal character, continue.
3114	if (Result)
3115	Result->push_back(Elt: Char);
3116	break;
3117	}
3118	// FALL THROUGH.
3119	[[fallthrough]];
3120	case `'\r'`:
3121	case `'\n'`:
3122	// Okay, we found the end of the line. First, back up past the \0, \r, \n.
3123	assert(CurPtr[-`1`] == Char && "Trigraphs for newline?");
3124	BufferPtr = CurPtr-`1`;
3125
3126	// Next, lex the character, which should handle the EOD transition.
3127	Lex(Result&: Tmp);
3128	if (Tmp.is(K: tok::code_completion)) {
3129	if (PP)
3130	PP->CodeCompleteNaturalLanguage();
3131	Lex(Result&: Tmp);
3132	}
3133	assert(Tmp.is(tok::eod) && "Unexpected token!");
3134
3135	// Finally, we're done;
3136	return;
3137	}
3138	}
3139	}
3140
3141	/// LexEndOfFile - CurPtr points to the end of this file. Handle this
3142	/// condition, reporting diagnostics and handling other edge cases as required.
3143	/// This returns true if Result contains a token, false if PP.Lex should be
3144	/// called again.
3145	bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3146	// If we hit the end of the file while parsing a preprocessor directive,
3147	// end the preprocessor directive first. The next token returned will
3148	// then be the end of file.
3149	if (ParsingPreprocessorDirective) {
3150	// Done parsing the "line".
3151	ParsingPreprocessorDirective = false;
3152	// Update the location of token as well as BufferPtr.
3153	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::eod);
3154
3155	// Restore comment saving mode, in case it was disabled for directive.
3156	if (PP)
3157	resetExtendedTokenMode();
3158	return true; // Have a token.
3159	}
3160
3161	// If we are in raw mode, return this event as an EOF token. Let the caller
3162	// that put us in raw mode handle the event.
3163	if (isLexingRawMode()) {
3164	Result.startToken();
3165	BufferPtr = BufferEnd;
3166	FormTokenWithChars(Result, TokEnd: BufferEnd, Kind: tok::eof);
3167	return true;
3168	}
3169
3170	if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
3171	PP->setRecordedPreambleConditionalStack(ConditionalStack);
3172	// If the preamble cuts off the end of a header guard, consider it guarded.
3173	// The guard is valid for the preamble content itself, and for tools the
3174	// most useful answer is "yes, this file has a header guard".
3175	if (!ConditionalStack.empty())
3176	MIOpt.ExitTopLevelConditional();
3177	ConditionalStack.clear();
3178	}
3179
3180	// Issue diagnostics for unterminated #if and missing newline.
3181
3182	// If we are in a #if directive, emit an error.
3183	while (!ConditionalStack.empty()) {
3184	if (PP->getCodeCompletionFileLoc() != FileLoc)
3185	PP->Diag(ConditionalStack.back().IfLoc,
3186	diag::err_pp_unterminated_conditional);
3187	ConditionalStack.pop_back();
3188	}
3189
3190	// Before C++11 and C2y, a file not ending with a newline was UB. Both
3191	// standards changed this behavior (as a DR or equivalent), but we still have
3192	// an opt-in diagnostic to warn about it.
3193	if (CurPtr != BufferStart && (CurPtr[-`1`] != `'\n'` && CurPtr[-`1`] != `'\r'`))
3194	Diag(BufferEnd, diag::warn_no_newline_eof)
3195	<< FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n");
3196
3197	BufferPtr = CurPtr;
3198
3199	// Finally, let the preprocessor handle this.
3200	return PP->HandleEndOfFile(Result, isEndOfMacro: isPragmaLexer());
3201	}
3202
3203	/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3204	/// the specified lexer will return a tok::l_paren token, 0 if it is something
3205	/// else and 2 if there are no more tokens in the buffer controlled by the
3206	/// lexer.
3207	unsigned Lexer::isNextPPTokenLParen() {
3208	assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3209
3210	if (isDependencyDirectivesLexer()) {
3211	if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3212	return `2`;
3213	return DepDirectives.front().Tokens [NextDepDirectiveTokenIndex].is(
3214	K: tok::l_paren);
3215	}
3216
3217	// Switch to 'skipping' mode. This will ensure that we can lex a token
3218	// without emitting diagnostics, disables macro expansion, and will cause EOF
3219	// to return an EOF token instead of popping the include stack.
3220	LexingRawMode = true;
3221
3222	// Save state that can be changed while lexing so that we can restore it.
3223	const char *TmpBufferPtr = BufferPtr;
3224	bool inPPDirectiveMode = ParsingPreprocessorDirective;
3225	bool atStartOfLine = IsAtStartOfLine;
3226	bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3227	bool leadingSpace = HasLeadingSpace;
3228
3229	Token Tok;
3230	Lex(Result&: Tok);
3231
3232	// Restore state that may have changed.
3233	BufferPtr = TmpBufferPtr;
3234	ParsingPreprocessorDirective = inPPDirectiveMode;
3235	HasLeadingSpace = leadingSpace;
3236	IsAtStartOfLine = atStartOfLine;
3237	IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3238
3239	// Restore the lexer back to non-skipping mode.
3240	LexingRawMode = false;
3241
3242	if (Tok.is(K: tok::eof))
3243	return `2`;
3244	return Tok.is(K: tok::l_paren);
3245	}
3246
3247	/// Find the end of a version control conflict marker.
3248	static const char FindConflictEnd(const* char CurPtr, const* char *BufferEnd,
3249	ConflictMarkerKind CMK) {
3250	const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3251	size_t TermLen = CMK == CMK_Perforce ? `5` : `7`;
3252	auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(Start: TermLen);
3253	size_t Pos = RestOfBuffer.find(Str: Terminator);
3254	while (Pos != StringRef::npos) {
3255	// Must occur at start of line.
3256	if (Pos == `0` \|\|
3257	(RestOfBuffer [Pos - `1`] != `'\r'` && RestOfBuffer [Pos - `1`] != `'\n'`)) {
3258	RestOfBuffer = RestOfBuffer.substr(Start: Pos+TermLen);
3259	Pos = RestOfBuffer.find(Str: Terminator);
3260	continue;
3261	}
3262	return RestOfBuffer.data()+Pos;
3263	}
3264	return nullptr;
3265	}
3266
3267	/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3268	/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3269	/// and recover nicely. This returns true if it is a conflict marker and false
3270	/// if not.
3271	bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3272	// Only a conflict marker if it starts at the beginning of a line.
3273	if (CurPtr != BufferStart &&
3274	CurPtr[-`1`] != `'\n'` && CurPtr[-`1`] != `'\r'`)
3275	return false;
3276
3277	// Check to see if we have <<<<<<< or >>>>.
3278	if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with(Prefix: "<<<<<<<") &&
3279	!StringRef(CurPtr, BufferEnd - CurPtr).starts_with(Prefix: ">>>> "))
3280	return false;
3281
3282	// If we have a situation where we don't care about conflict markers, ignore
3283	// it.
3284	if (CurrentConflictMarkerState \|\| isLexingRawMode())
3285	return false;
3286
3287	ConflictMarkerKind Kind = *CurPtr == `'<'` ? CMK_Normal : CMK_Perforce;
3288
3289	// Check to see if there is an ending marker somewhere in the buffer at the
3290	// start of a line to terminate this conflict marker.
3291	if (FindConflictEnd(CurPtr, BufferEnd, CMK: Kind)) {
3292	// We found a match. We are really in a conflict marker.
3293	// Diagnose this, and ignore to the end of line.
3294	Diag(CurPtr, diag::err_conflict_marker);
3295	CurrentConflictMarkerState = Kind;
3296
3297	// Skip ahead to the end of line. We know this exists because the
3298	// end-of-conflict marker starts with \r or \n.
3299	while (CurPtr != `'\r'` && CurPtr != `'\n'`) {
3300	assert(CurPtr != BufferEnd && "Didn't find end of line");
3301	++CurPtr;
3302	}
3303	BufferPtr = CurPtr;
3304	return true;
3305	}
3306
3307	// No end of conflict marker found.
3308	return false;
3309	}
3310
3311	/// HandleEndOfConflictMarker - If this is a '====' or '\|\|\|\|' or '>>>>', or if
3312	/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3313	/// is the end of a conflict marker. Handle it by ignoring up until the end of
3314	/// the line. This returns true if it is a conflict marker and false if not.
3315	bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3316	// Only a conflict marker if it starts at the beginning of a line.
3317	if (CurPtr != BufferStart &&
3318	CurPtr[-`1`] != `'\n'` && CurPtr[-`1`] != `'\r'`)
3319	return false;
3320
3321	// If we have a situation where we don't care about conflict markers, ignore
3322	// it.
3323	if (!CurrentConflictMarkerState \|\| isLexingRawMode())
3324	return false;
3325
3326	// Check to see if we have the marker (4 characters in a row).
3327	for (unsigned i = `1`; i != `4`; ++i)
3328	if (CurPtr[i] != CurPtr[`0`])
3329	return false;
3330
3331	// If we do have it, search for the end of the conflict marker. This could
3332	// fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3333	// be the end of conflict marker.
3334	if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3335	CMK: CurrentConflictMarkerState)) {
3336	CurPtr = End;
3337
3338	// Skip ahead to the end of line.
3339	while (CurPtr != BufferEnd && CurPtr != `'\r'` && CurPtr != `'\n'`)
3340	++CurPtr;
3341
3342	BufferPtr = CurPtr;
3343
3344	// No longer in the conflict marker.
3345	CurrentConflictMarkerState = CMK_None;
3346	return true;
3347	}
3348
3349	return false;
3350	}
3351
/// Locate the "#>" sequence that closes an editor placeholder.
///
/// Scans [\p CurPtr, \p BufferEnd) for a '#' immediately followed by '>'.
///
/// \returns a pointer one past the '>' of the first such sequence, or nullptr
/// if the range is empty or contains no "#>".
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;

  // Stop at the second last character so that reading CurPtr[1] never goes
  // past the end of the range.
  for (const char *Last = BufferEnd - 1; CurPtr != Last; ++CurPtr)
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;

  return nullptr;
}
3363
3364	bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3365	assert(CurPtr[-`1`] == `'<'` && CurPtr[`0`] == `'#'` && "Not a placeholder!");
3366	if (!PP \|\| !PP->getPreprocessorOpts().LexEditorPlaceholders \|\| LexingRawMode)
3367	return false;
3368	const char *End = findPlaceholderEnd(CurPtr: CurPtr + `1`, BufferEnd);
3369	if (!End)
3370	return false;
3371	const char *Start = CurPtr - `1`;
3372	if (!LangOpts.AllowEditorPlaceholders)
3373	Diag(Start, diag::err_placeholder_in_source);
3374	Result.startToken();
3375	FormTokenWithChars(Result, TokEnd: End, Kind: tok::raw_identifier);
3376	Result.setRawIdentifierData(Start);
3377	PP->LookUpIdentifierInfo(Identifier&: Result);
3378	Result.setFlag(Token::IsEditorPlaceholder);
3379	BufferPtr = End;
3380	return true;
3381	}
3382
3383	bool Lexer::isCodeCompletionPoint(const char CurPtr) const* {
3384	if (PP && PP->isCodeCompletionEnabled()) {
3385	SourceLocation Loc = FileLoc.getLocWithOffset(Offset: CurPtr-BufferStart);
3386	return Loc == PP->getCodeCompletionLoc();
3387	}
3388
3389	return false;
3390	}
3391
3392	void Lexer::DiagnoseDelimitedOrNamedEscapeSequence(SourceLocation Loc,
3393	bool Named,
3394	const LangOptions &Opts,
3395	DiagnosticsEngine &Diags) {
3396	unsigned DiagId;
3397	if (Opts.CPlusPlus23)
3398	DiagId = diag::warn_cxx23_delimited_escape_sequence;
3399	else if (Opts.C2y && !Named)
3400	DiagId = diag::warn_c2y_delimited_escape_sequence;
3401	else
3402	DiagId = diag::ext_delimited_escape_sequence;
3403
3404	// The trailing arguments are only used by the extension warning; either this
3405	// is a C2y extension or a C++23 extension, unless it's a named escape
3406	// sequence in C, then it's a Clang extension.
3407	unsigned Ext;
3408	if (!Opts.CPlusPlus)
3409	Ext = Named ? `2` / Clang extension / : `1` / C2y extension /;
3410	else
3411	Ext = `0`; // C++23 extension
3412
3413	Diags.Report(Loc, DiagID: DiagId) << Named << Ext;
3414	}
3415
3416	std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3417	const char *SlashLoc,
3418	Token *Result) {
3419	unsigned CharSize;
3420	char Kind = getCharAndSize(Ptr: StartPtr, Size&: CharSize);
3421	assert((Kind == `'u'` \|\| Kind == `'U'`) && "expected a UCN");
3422
3423	unsigned NumHexDigits;
3424	if (Kind == `'u'`)
3425	NumHexDigits = `4`;
3426	else if (Kind == `'U'`)
3427	NumHexDigits = `8`;
3428
3429	bool Delimited = false;
3430	bool FoundEndDelimiter = false;
3431	unsigned Count = `0`;
3432	bool Diagnose = Result && !isLexingRawMode();
3433
3434	if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3435	if (Diagnose)
3436	Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3437	return std::nullopt;
3438	}
3439
3440	const char *CurPtr = StartPtr + CharSize;
3441	const char *KindLoc = &CurPtr[-`1`];
3442
3443	uint32_t CodePoint = `0`;
3444	while (Count != NumHexDigits \|\| Delimited) {
3445	char C = getCharAndSize(Ptr: CurPtr, Size&: CharSize);
3446	if (!Delimited && Count == `0` && C == `'{'`) {
3447	Delimited = true;
3448	CurPtr += CharSize;
3449	continue;
3450	}
3451
3452	if (Delimited && C == `'}'`) {
3453	CurPtr += CharSize;
3454	FoundEndDelimiter = true;
3455	break;
3456	}
3457
3458	unsigned Value = llvm::hexDigitValue(C);
3459	if (Value == -`1U`) {
3460	if (!Delimited)
3461	break;
3462	if (Diagnose)
3463	Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
3464	<< StringRef(KindLoc, `1`);
3465	return std::nullopt;
3466	}
3467
3468	if (CodePoint & `0xF000'0000`) {
3469	if (Diagnose)
3470	Diag(KindLoc, diag::err_escape_too_large) << `0`;
3471	return std::nullopt;
3472	}
3473
3474	CodePoint <<= `4`;
3475	CodePoint \|= Value;
3476	CurPtr += CharSize;
3477	Count++;
3478	}
3479
3480	if (Count == `0`) {
3481	if (Diagnose)
3482	Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3483	: diag::warn_ucn_escape_no_digits)
3484	<< StringRef(KindLoc, `1`);
3485	return std::nullopt;
3486	}
3487
3488	if (Delimited && Kind == `'U'`) {
3489	if (Diagnose)
3490	Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, `1`);
3491	return std::nullopt;
3492	}
3493
3494	if (!Delimited && Count != NumHexDigits) {
3495	if (Diagnose) {
3496	Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3497	// If the user wrote \U1234, suggest a fixit to \u.
3498	if (Count == `4` && NumHexDigits == `8`) {
3499	CharSourceRange URange = makeCharRange(L&: *this, Begin: KindLoc, End: KindLoc + `1`);
3500	Diag(KindLoc, diag::note_ucn_four_not_eight)
3501	<< FixItHint::CreateReplacement(URange, "u");
3502	}
3503	}
3504	return std::nullopt;
3505	}
3506
3507	if (Delimited && PP)
3508	DiagnoseDelimitedOrNamedEscapeSequence(Loc: getSourceLocation(Loc: SlashLoc), Named: false,
3509	Opts: PP->getLangOpts(),
3510	Diags&: PP->getDiagnostics());
3511
3512	if (Result) {
3513	Result->setFlag(Token::HasUCN);
3514	// If the UCN contains either a trigraph or a line splicing,
3515	// we need to call getAndAdvanceChar again to set the appropriate flags
3516	// on Result.
3517	if (CurPtr - StartPtr == (ptrdiff_t)(Count + `1` + (Delimited ? `2` : `0`)))
3518	StartPtr = CurPtr;
3519	else
3520	while (StartPtr != CurPtr)
3521	(void)getAndAdvanceChar(Ptr&: StartPtr, Tok&: *Result);
3522	} else {
3523	StartPtr = CurPtr;
3524	}
3525	return CodePoint;
3526	}
3527
3528	std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3529	const char *SlashLoc,
3530	Token *Result) {
3531	unsigned CharSize;
3532	bool Diagnose = Result && !isLexingRawMode();
3533
3534	char C = getCharAndSize(Ptr: StartPtr, Size&: CharSize);
3535	assert(C == `'N'` && "expected \\N{...}");
3536
3537	const char *CurPtr = StartPtr + CharSize;
3538	const char *KindLoc = &CurPtr[-`1`];
3539
3540	C = getCharAndSize(Ptr: CurPtr, Size&: CharSize);
3541	if (C != `'{'`) {
3542	if (Diagnose)
3543	Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3544	return std::nullopt;
3545	}
3546	CurPtr += CharSize;
3547	const char *StartName = CurPtr;
3548	bool FoundEndDelimiter = false;
3549	llvm::SmallVector<char, `30`> Buffer;
3550	while (C) {
3551	C = getCharAndSize(Ptr: CurPtr, Size&: CharSize);
3552	CurPtr += CharSize;
3553	if (C == `'}'`) {
3554	FoundEndDelimiter = true;
3555	break;
3556	}
3557
3558	if (isVerticalWhitespace(c: C))
3559	break;
3560	Buffer.push_back(Elt: C);
3561	}
3562
3563	if (!FoundEndDelimiter \|\| Buffer.empty()) {
3564	if (Diagnose)
3565	Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3566	: diag::warn_delimited_ucn_incomplete)
3567	<< StringRef(KindLoc, `1`);
3568	return std::nullopt;
3569	}
3570
3571	StringRef Name(Buffer.data(), Buffer.size());
3572	std::optional<char32_t> Match =
3573	llvm::sys::unicode::nameToCodepointStrict(Name);
3574	std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3575	if (!Match) {
3576	LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3577	if (Diagnose) {
3578	Diag(StartName, diag::err_invalid_ucn_name)
3579	<< StringRef(Buffer.data(), Buffer.size())
3580	<< makeCharRange(*this, StartName, CurPtr - CharSize);
3581	if (LooseMatch) {
3582	Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
3583	<< FixItHint::CreateReplacement(
3584	makeCharRange(*this, StartName, CurPtr - CharSize),
3585	LooseMatch->Name);
3586	}
3587	}
3588	// We do not offer misspelled character names suggestions here
3589	// as the set of what would be a valid suggestion depends on context,
3590	// and we should not make invalid suggestions.
3591	}
3592
3593	if (Diagnose && Match)
3594	DiagnoseDelimitedOrNamedEscapeSequence(Loc: getSourceLocation(Loc: SlashLoc), Named: true,
3595	Opts: PP->getLangOpts(),
3596	Diags&: PP->getDiagnostics());
3597
3598	// If no diagnostic has been emitted yet, likely because we are doing a
3599	// tentative lexing, we do not want to recover here to make sure the token
3600	// will not be incorrectly considered valid. This function will be called
3601	// again and a diagnostic emitted then.
3602	if (LooseMatch && Diagnose)
3603	Match = LooseMatch ->CodePoint;
3604
3605	if (Result) {
3606	Result->setFlag(Token::HasUCN);
3607	// If the UCN contains either a trigraph or a line splicing,
3608	// we need to call getAndAdvanceChar again to set the appropriate flags
3609	// on Result.
3610	if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + `3`))
3611	StartPtr = CurPtr;
3612	else
3613	while (StartPtr != CurPtr)
3614	(void)getAndAdvanceChar(Ptr&: StartPtr, Tok&: *Result);
3615	} else {
3616	StartPtr = CurPtr;
3617	}
3618	return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3619	}
3620
3621	uint32_t Lexer::tryReadUCN(const char &StartPtr, const* char *SlashLoc,
3622	Token *Result) {
3623
3624	unsigned CharSize;
3625	std::optional<uint32_t> CodePointOpt;
3626	char Kind = getCharAndSize(Ptr: StartPtr, Size&: CharSize);
3627	if (Kind == `'u'` \|\| Kind == `'U'`)
3628	CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3629	else if (Kind == `'N'`)
3630	CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3631
3632	if (!CodePointOpt)
3633	return `0`;
3634
3635	uint32_t CodePoint = *CodePointOpt;
3636
3637	// Don't apply C family restrictions to UCNs in assembly mode
3638	if (LangOpts.AsmPreprocessor)
3639	return CodePoint;
3640
3641	// C23 6.4.3p2: A universal character name shall not designate a code point
3642	// where the hexadecimal value is:
3643	// - in the range D800 through DFFF inclusive; or
3644	// - greater than 10FFFF.
3645	// A universal-character-name outside the c-char-sequence of a character
3646	// constant, or the s-char-sequence of a string-literal shall not designate
3647	// a control character or a character in the basic character set.
3648
3649	// C++11 [lex.charset]p2: If the hexadecimal value for a
3650	// universal-character-name corresponds to a surrogate code point (in the
3651	// range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3652	// if the hexadecimal value for a universal-character-name outside the
3653	// c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3654	// string literal corresponds to a control character (in either of the
3655	// ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3656	// basic source character set, the program is ill-formed.
3657	if (CodePoint < `0xA0`) {
3658	// We don't use isLexingRawMode() here because we need to warn about bad
3659	// UCNs even when skipping preprocessing tokens in a #if block.
3660	if (Result && PP) {
3661	if (CodePoint < `0x20` \|\| CodePoint >= `0x7F`)
3662	Diag(BufferPtr, diag::err_ucn_control_character);
3663	else {
3664	char C = static_cast<char>(CodePoint);
3665	Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, `1`);
3666	}
3667	}
3668
3669	return `0`;
3670	} else if (CodePoint >= `0xD800` && CodePoint <= `0xDFFF`) {
3671	// C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3672	// We don't use isLexingRawMode() here because we need to diagnose bad
3673	// UCNs even when skipping preprocessing tokens in a #if block.
3674	if (Result && PP) {
3675	if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3676	Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3677	else
3678	Diag(BufferPtr, diag::err_ucn_escape_invalid);
3679	}
3680	return `0`;
3681	}
3682
3683	return CodePoint;
3684	}
3685
3686	bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3687	const char *CurPtr) {
3688	if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3689	isUnicodeWhitespace(Codepoint: C)) {
3690	Diag(BufferPtr, diag::ext_unicode_whitespace)
3691	<< makeCharRange(*this, BufferPtr, CurPtr);
3692
3693	Result.setFlag(Token::LeadingSpace);
3694	return true;
3695	}
3696	return false;
3697	}
3698
3699	void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3700	IsAtStartOfLine = Result.isAtStartOfLine();
3701	HasLeadingSpace = Result.hasLeadingSpace();
3702	HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3703	// Note that this doesn't affect IsAtPhysicalStartOfLine.
3704	}
3705
3706	bool Lexer::Lex(Token &Result) {
3707	assert(!isDependencyDirectivesLexer());
3708
3709	// Start a new token.
3710	Result.startToken();
3711
3712	// Set up misc whitespace flags for LexTokenInternal.
3713	if (IsAtStartOfLine) {
3714	Result.setFlag(Token::StartOfLine);
3715	IsAtStartOfLine = false;
3716	}
3717
3718	if (HasLeadingSpace) {
3719	Result.setFlag(Token::LeadingSpace);
3720	HasLeadingSpace = false;
3721	}
3722
3723	if (HasLeadingEmptyMacro) {
3724	Result.setFlag(Token::LeadingEmptyMacro);
3725	HasLeadingEmptyMacro = false;
3726	}
3727
3728	bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3729	IsAtPhysicalStartOfLine = false;
3730	bool isRawLex = isLexingRawMode();
3731	(void) isRawLex;
3732	bool returnedToken = LexTokenInternal(Result, TokAtPhysicalStartOfLine: atPhysicalStartOfLine);
3733	// (After the LexTokenInternal call, the lexer might be destroyed.)
3734	assert((returnedToken \|\| !isRawLex) && "Raw lex must succeed");
3735	return returnedToken;
3736	}
3737
3738	/// LexTokenInternal - This implements a simple C family lexer. It is an
3739	/// extremely performance critical piece of code. This assumes that the buffer
3740	/// has a null character at the end of the file. This returns a preprocessing
3741	/// token, not a normal token, as such, it is an internal interface. It assumes
3742	/// that the Flags of result have been cleared before calling this.
3743	bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3744	LexStart:
3745	assert(!Result.needsCleaning() && "Result needs cleaning");
3746	assert(!Result.hasPtrData() && "Result has not been reset");
3747
3748	// CurPtr - Cache BufferPtr in an automatic variable.
3749	const char *CurPtr = BufferPtr;
3750
3751	// Small amounts of horizontal whitespace is very common between tokens.
3752	if (isHorizontalWhitespace(c: *CurPtr)) {
3753	do {
3754	++CurPtr;
3755	} while (isHorizontalWhitespace(c: *CurPtr));
3756
3757	// If we are keeping whitespace and other tokens, just return what we just
3758	// skipped. The next lexer invocation will return the token after the
3759	// whitespace.
3760	if (isKeepWhitespaceMode()) {
3761	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
3762	// FIXME: The next token will not have LeadingSpace set.
3763	return true;
3764	}
3765
3766	BufferPtr = CurPtr;
3767	Result.setFlag(Token::LeadingSpace);
3768	}
3769
3770	unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3771
3772	// Read a character, advancing over it.
3773	char Char = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
3774	tok::TokenKind Kind;
3775
3776	if (!isVerticalWhitespace(c: Char))
3777	NewLinePtr = nullptr;
3778
3779	switch (Char) {
3780	case `0`: // Null.
3781	// Found end of file?
3782	if (CurPtr-`1` == BufferEnd)
3783	return LexEndOfFile(Result, CurPtr: CurPtr-`1`);
3784
3785	// Check if we are performing code completion.
3786	if (isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
3787	// Return the code-completion token.
3788	Result.startToken();
3789	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::code_completion);
3790	return true;
3791	}
3792
3793	if (!isLexingRawMode())
3794	Diag(CurPtr-`1`, diag::null_in_file);
3795	Result.setFlag(Token::LeadingSpace);
3796	if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3797	return true; // KeepWhitespaceMode
3798
3799	// We know the lexer hasn't changed, so just try again with this lexer.
3800	// (We manually eliminate the tail call to avoid recursion.)
3801	goto LexNextToken;
3802
3803	case `26`: // DOS & CP/M EOF: "^Z".
3804	// If we're in Microsoft extensions mode, treat this as end of file.
3805	if (LangOpts.MicrosoftExt) {
3806	if (!isLexingRawMode())
3807	Diag(CurPtr-`1`, diag::ext_ctrl_z_eof_microsoft);
3808	return LexEndOfFile(Result, CurPtr: CurPtr-`1`);
3809	}
3810
3811	// If Microsoft extensions are disabled, this is just random garbage.
3812	Kind = tok::unknown;
3813	break;
3814
3815	case `'\r'`:
3816	if (CurPtr[`0`] == `'\n'`)
3817	(void)getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
3818	[[fallthrough]];
3819	case `'\n'`:
3820	// If we are inside a preprocessor directive and we see the end of line,
3821	// we know we are done with the directive, so return an EOD token.
3822	if (ParsingPreprocessorDirective) {
3823	// Done parsing the "line".
3824	ParsingPreprocessorDirective = false;
3825
3826	// Restore comment saving mode, in case it was disabled for directive.
3827	if (PP)
3828	resetExtendedTokenMode();
3829
3830	// Since we consumed a newline, we are back at the start of a line.
3831	IsAtStartOfLine = true;
3832	IsAtPhysicalStartOfLine = true;
3833	NewLinePtr = CurPtr - `1`;
3834
3835	Kind = tok::eod;
3836	break;
3837	}
3838
3839	// No leading whitespace seen so far.
3840	Result.clearFlag(Flag: Token::LeadingSpace);
3841
3842	if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3843	return true; // KeepWhitespaceMode
3844
3845	// We only saw whitespace, so just try again with this lexer.
3846	// (We manually eliminate the tail call to avoid recursion.)
3847	goto LexNextToken;
3848	case `' '`:
3849	case `'\t'`:
3850	case `'\f'`:
3851	case `'\v'`:
3852	SkipHorizontalWhitespace:
3853	Result.setFlag(Token::LeadingSpace);
3854	if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3855	return true; // KeepWhitespaceMode
3856
3857	SkipIgnoredUnits:
3858	CurPtr = BufferPtr;
3859
3860	// If the next token is obviously a // or / / comment, skip it efficiently
3861	// too (without going through the big switch stmt).
3862	if (CurPtr[`0`] == `'/'` && CurPtr[`1`] == `'/'` && !inKeepCommentMode() &&
3863	LineComment && (LangOpts.CPlusPlus \|\| !LangOpts.TraditionalCPP)) {
3864	if (SkipLineComment(Result, CurPtr: CurPtr+`2`, TokAtPhysicalStartOfLine))
3865	return true; // There is a token to return.
3866	goto SkipIgnoredUnits;
3867	} else if (CurPtr[`0`] == `'/'` && CurPtr[`1`] == `'*'` && !inKeepCommentMode()) {
3868	if (SkipBlockComment(Result, CurPtr: CurPtr+`2`, TokAtPhysicalStartOfLine))
3869	return true; // There is a token to return.
3870	goto SkipIgnoredUnits;
3871	} else if (isHorizontalWhitespace(c: *CurPtr)) {
3872	goto SkipHorizontalWhitespace;
3873	}
3874	// We only saw whitespace, so just try again with this lexer.
3875	// (We manually eliminate the tail call to avoid recursion.)
3876	goto LexNextToken;
3877
3878	// C99 6.4.4.1: Integer Constants.
3879	// C99 6.4.4.2: Floating Constants.
3880	case `'0'`: case `'1'`: case `'2'`: case `'3'`: case `'4'`:
3881	case `'5'`: case `'6'`: case `'7'`: case `'8'`: case `'9'`:
3882	// Notify MIOpt that we read a non-whitespace/non-comment token.
3883	MIOpt.ReadToken();
3884	return LexNumericConstant(Result, CurPtr);
3885
3886	// Identifier (e.g., uber), or
3887	// UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
3888	// UTF-8 or UTF-16 string literal (C11/C++11).
3889	case `'u'`:
3890	// Notify MIOpt that we read a non-whitespace/non-comment token.
3891	MIOpt.ReadToken();
3892
3893	if (LangOpts.CPlusPlus11 \|\| LangOpts.C11) {
3894	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
3895
3896	// UTF-16 string literal
3897	if (Char == `'"'`)
3898	return LexStringLiteral(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3899	Kind: tok::utf16_string_literal);
3900
3901	// UTF-16 character constant
3902	if (Char == `'\''`)
3903	return LexCharConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3904	Kind: tok::utf16_char_constant);
3905
3906	// UTF-16 raw string literal
3907	if (Char == `'R'` && LangOpts.RawStringLiterals &&
3908	getCharAndSize(Ptr: CurPtr + SizeTmp, Size&: SizeTmp2) == `'"'`)
3909	return LexRawStringLiteral(Result,
3910	CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3911	Size: SizeTmp2, Tok&: Result),
3912	Kind: tok::utf16_string_literal);
3913
3914	if (Char == `'8'`) {
3915	char Char2 = getCharAndSize(Ptr: CurPtr + SizeTmp, Size&: SizeTmp2);
3916
3917	// UTF-8 string literal
3918	if (Char2 == `'"'`)
3919	return LexStringLiteral(Result,
3920	CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3921	Size: SizeTmp2, Tok&: Result),
3922	Kind: tok::utf8_string_literal);
3923	if (Char2 == `'\''` && (LangOpts.CPlusPlus17 \|\| LangOpts.C23))
3924	return LexCharConstant(
3925	Result, CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3926	Size: SizeTmp2, Tok&: Result),
3927	Kind: tok::utf8_char_constant);
3928
3929	if (Char2 == `'R'` && LangOpts.RawStringLiterals) {
3930	unsigned SizeTmp3;
3931	char Char3 = getCharAndSize(Ptr: CurPtr + SizeTmp + SizeTmp2, Size&: SizeTmp3);
3932	// UTF-8 raw string literal
3933	if (Char3 == `'"'`) {
3934	return LexRawStringLiteral(Result,
3935	CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3936	Size: SizeTmp2, Tok&: Result),
3937	Size: SizeTmp3, Tok&: Result),
3938	Kind: tok::utf8_string_literal);
3939	}
3940	}
3941	}
3942	}
3943
3944	// treat u like the start of an identifier.
3945	return LexIdentifierContinue(Result, CurPtr);
3946
3947	case `'U'`: // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3948	// Notify MIOpt that we read a non-whitespace/non-comment token.
3949	MIOpt.ReadToken();
3950
3951	if (LangOpts.CPlusPlus11 \|\| LangOpts.C11) {
3952	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
3953
3954	// UTF-32 string literal
3955	if (Char == `'"'`)
3956	return LexStringLiteral(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3957	Kind: tok::utf32_string_literal);
3958
3959	// UTF-32 character constant
3960	if (Char == `'\''`)
3961	return LexCharConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3962	Kind: tok::utf32_char_constant);
3963
3964	// UTF-32 raw string literal
3965	if (Char == `'R'` && LangOpts.RawStringLiterals &&
3966	getCharAndSize(Ptr: CurPtr + SizeTmp, Size&: SizeTmp2) == `'"'`)
3967	return LexRawStringLiteral(Result,
3968	CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3969	Size: SizeTmp2, Tok&: Result),
3970	Kind: tok::utf32_string_literal);
3971	}
3972
3973	// treat U like the start of an identifier.
3974	return LexIdentifierContinue(Result, CurPtr);
3975
3976	case `'R'`: // Identifier or C++0x raw string literal
3977	// Notify MIOpt that we read a non-whitespace/non-comment token.
3978	MIOpt.ReadToken();
3979
3980	if (LangOpts.RawStringLiterals) {
3981	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
3982
3983	if (Char == `'"'`)
3984	return LexRawStringLiteral(Result,
3985	CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3986	Kind: tok::string_literal);
3987	}
3988
3989	// treat R like the start of an identifier.
3990	return LexIdentifierContinue(Result, CurPtr);
3991
3992	case `'L'`: // Identifier (Loony) or wide literal (L'x' or L"xyz").
3993	// Notify MIOpt that we read a non-whitespace/non-comment token.
3994	MIOpt.ReadToken();
3995	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
3996
3997	// Wide string literal.
3998	if (Char == `'"'`)
3999	return LexStringLiteral(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4000	Kind: tok::wide_string_literal);
4001
4002	// Wide raw string literal.
4003	if (LangOpts.RawStringLiterals && Char == `'R'` &&
4004	getCharAndSize(Ptr: CurPtr + SizeTmp, Size&: SizeTmp2) == `'"'`)
4005	return LexRawStringLiteral(Result,
4006	CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4007	Size: SizeTmp2, Tok&: Result),
4008	Kind: tok::wide_string_literal);
4009
4010	// Wide character constant.
4011	if (Char == `'\''`)
4012	return LexCharConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4013	Kind: tok::wide_char_constant);
4014	// FALL THROUGH, treating L like the start of an identifier.
4015	[[fallthrough]];
4016
4017	// C99 6.4.2: Identifiers.
4018	case `'A'`: case `'B'`: case `'C'`: case `'D'`: case `'E'`: case `'F'`: case `'G'`:
4019	case `'H'`: case `'I'`: case `'J'`: case `'K'`: /'L'/case `'M'`: case `'N'`:
4020	case `'O'`: case `'P'`: case `'Q'`: /'R'/case `'S'`: case `'T'`: /'U'/
4021	case `'V'`: case `'W'`: case `'X'`: case `'Y'`: case `'Z'`:
4022	case `'a'`: case `'b'`: case `'c'`: case `'d'`: case `'e'`: case `'f'`: case `'g'`:
4023	case `'h'`: case `'i'`: case `'j'`: case `'k'`: case `'l'`: case `'m'`: case `'n'`:
4024	case `'o'`: case `'p'`: case `'q'`: case `'r'`: case `'s'`: case `'t'`: /'u'/
4025	case `'v'`: case `'w'`: case `'x'`: case `'y'`: case `'z'`:
4026	case `'_'`:
4027	// Notify MIOpt that we read a non-whitespace/non-comment token.
4028	MIOpt.ReadToken();
4029	return LexIdentifierContinue(Result, CurPtr);
4030
4031	case `'$'`: // $ in identifiers.
4032	if (LangOpts.DollarIdents) {
4033	if (!isLexingRawMode())
4034	Diag(CurPtr-`1`, diag::ext_dollar_in_identifier);
4035	// Notify MIOpt that we read a non-whitespace/non-comment token.
4036	MIOpt.ReadToken();
4037	return LexIdentifierContinue(Result, CurPtr);
4038	}
4039
4040	Kind = tok::unknown;
4041	break;
4042
4043	// C99 6.4.4: Character Constants.
4044	case `'\''`:
4045	// Notify MIOpt that we read a non-whitespace/non-comment token.
4046	MIOpt.ReadToken();
4047	return LexCharConstant(Result, CurPtr, Kind: tok::char_constant);
4048
4049	// C99 6.4.5: String Literals.
4050	case `'"'`:
4051	// Notify MIOpt that we read a non-whitespace/non-comment token.
4052	MIOpt.ReadToken();
4053	return LexStringLiteral(Result, CurPtr,
4054	Kind: ParsingFilename ? tok::header_name
4055	: tok::string_literal);
4056
4057	// C99 6.4.6: Punctuators.
4058	case `'?'`:
4059	Kind = tok::question;
4060	break;
4061	case `'['`:
4062	Kind = tok::l_square;
4063	break;
4064	case `']'`:
4065	Kind = tok::r_square;
4066	break;
4067	case `'('`:
4068	Kind = tok::l_paren;
4069	break;
4070	case `')'`:
4071	Kind = tok::r_paren;
4072	break;
4073	case `'{'`:
4074	Kind = tok::l_brace;
4075	break;
4076	case `'}'`:
4077	Kind = tok::r_brace;
4078	break;
4079	case `'.'`:
4080	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4081	if (Char >= `'0'` && Char <= `'9'`) {
4082	// Notify MIOpt that we read a non-whitespace/non-comment token.
4083	MIOpt.ReadToken();
4084
4085	return LexNumericConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result));
4086	} else if (LangOpts.CPlusPlus && Char == `'*'`) {
4087	Kind = tok::periodstar;
4088	CurPtr += SizeTmp;
4089	} else if (Char == `'.'` &&
4090	getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2) == `'.'`) {
4091	Kind = tok::ellipsis;
4092	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4093	Size: SizeTmp2, Tok&: Result);
4094	} else {
4095	Kind = tok::period;
4096	}
4097	break;
4098	case `'&'`:
4099	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4100	if (Char == `'&'`) {
4101	Kind = tok::ampamp;
4102	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4103	} else if (Char == `'='`) {
4104	Kind = tok::ampequal;
4105	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4106	} else {
4107	Kind = tok::amp;
4108	}
4109	break;
4110	case `'*'`:
4111	if (getCharAndSize(Ptr: CurPtr, Size&: SizeTmp) == `'='`) {
4112	Kind = tok::starequal;
4113	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4114	} else {
4115	Kind = tok::star;
4116	}
4117	break;
4118	case `'+'`:
4119	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4120	if (Char == `'+'`) {
4121	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4122	Kind = tok::plusplus;
4123	} else if (Char == `'='`) {
4124	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4125	Kind = tok::plusequal;
4126	} else {
4127	Kind = tok::plus;
4128	}
4129	break;
4130	case `'-'`:
4131	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4132	if (Char == `'-'`) { // --
4133	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4134	Kind = tok::minusminus;
4135	} else if (Char == `'>'` && LangOpts.CPlusPlus &&
4136	getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2) == `''`) { // C++ ->
4137	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4138	Size: SizeTmp2, Tok&: Result);
4139	Kind = tok::arrowstar;
4140	} else if (Char == `'>'`) { // ->
4141	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4142	Kind = tok::arrow;
4143	} else if (Char == `'='`) { // -=
4144	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4145	Kind = tok::minusequal;
4146	} else {
4147	Kind = tok::minus;
4148	}
4149	break;
4150	case `'~'`:
4151	Kind = tok::tilde;
4152	break;
4153	case `'!'`:
4154	if (getCharAndSize(Ptr: CurPtr, Size&: SizeTmp) == `'='`) {
4155	Kind = tok::exclaimequal;
4156	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4157	} else {
4158	Kind = tok::exclaim;
4159	}
4160	break;
4161	case `'/'`:
4162	// 6.4.9: Comments
4163	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4164	if (Char == `'/'`) { // Line comment.
4165	// Even if Line comments are disabled (e.g. in C89 mode), we generally
4166	// want to lex this as a comment. There is one problem with this though,
4167	// that in one particular corner case, this can change the behavior of the
4168	// resultant program. For example, in "foo //**/ bar", C89 would lex
4169	// this as "foo / bar" and languages with Line comments would lex it as
4170	// "foo". Check to see if the character after the second slash is a '*'.
4171	// If so, we will lex that as a "/" instead of the start of a comment.
4172	// However, we never do this if we are just preprocessing.
4173	bool TreatAsComment =
4174	LineComment && (LangOpts.CPlusPlus \|\| !LangOpts.TraditionalCPP);
4175	if (!TreatAsComment)
4176	if (!(PP && PP->isPreprocessedOutput()))
4177	TreatAsComment = getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2) != `'*'`;
4178
4179	if (TreatAsComment) {
4180	if (SkipLineComment(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4181	TokAtPhysicalStartOfLine))
4182	return true; // There is a token to return.
4183
4184	// It is common for the tokens immediately after a // comment to be
4185	// whitespace (indentation for the next line). Instead of going through
4186	// the big switch, handle it efficiently now.
4187	goto SkipIgnoredUnits;
4188	}
4189	}
4190
4191	if (Char == `''`) { // /*/ comment.
4192	if (SkipBlockComment(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4193	TokAtPhysicalStartOfLine))
4194	return true; // There is a token to return.
4195
4196	// We only saw whitespace, so just try again with this lexer.
4197	// (We manually eliminate the tail call to avoid recursion.)
4198	goto LexNextToken;
4199	}
4200
4201	if (Char == `'='`) {
4202	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4203	Kind = tok::slashequal;
4204	} else {
4205	Kind = tok::slash;
4206	}
4207	break;
4208	case `'%'`:
4209	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4210	if (Char == `'='`) {
4211	Kind = tok::percentequal;
4212	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4213	} else if (LangOpts.Digraphs && Char == `'>'`) {
4214	Kind = tok::r_brace; // '%>' -> '}'
4215	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4216	} else if (LangOpts.Digraphs && Char == `':'`) {
4217	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4218	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4219	if (Char == `'%'` && getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2) == `':'`) {
4220	Kind = tok::hashhash; // '%:%:' -> '##'
4221	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4222	Size: SizeTmp2, Tok&: Result);
4223	} else if (Char == `'@'` && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4224	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4225	if (!isLexingRawMode())
4226	Diag(BufferPtr, diag::ext_charize_microsoft);
4227	Kind = tok::hashat;
4228	} else { // '%:' -> '#'
4229	// We parsed a # character. If this occurs at the start of the line,
4230	// it's actually the start of a preprocessing directive. Callback to
4231	// the preprocessor to handle it.
4232	// TODO: -fpreprocessed mode??
4233	if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4234	goto HandleDirective;
4235
4236	Kind = tok::hash;
4237	}
4238	} else {
4239	Kind = tok::percent;
4240	}
4241	break;
4242	case `'<'`:
4243	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4244	if (ParsingFilename) {
4245	return LexAngledStringLiteral(Result, CurPtr);
4246	} else if (Char == `'<'`) {
4247	char After = getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2);
4248	if (After == `'='`) {
4249	Kind = tok::lesslessequal;
4250	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4251	Size: SizeTmp2, Tok&: Result);
4252	} else if (After == `'<'` && IsStartOfConflictMarker(CurPtr: CurPtr-`1`)) {
4253	// If this is actually a '<<<<<<<' version control conflict marker,
4254	// recognize it as such and recover nicely.
4255	goto LexNextToken;
4256	} else if (After == `'<'` && HandleEndOfConflictMarker(CurPtr: CurPtr-`1`)) {
4257	// If this is '<<<<' and we're in a Perforce-style conflict marker,
4258	// ignore it.
4259	goto LexNextToken;
4260	} else if (LangOpts.CUDA && After == `'<'`) {
4261	Kind = tok::lesslessless;
4262	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4263	Size: SizeTmp2, Tok&: Result);
4264	} else {
4265	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4266	Kind = tok::lessless;
4267	}
4268	} else if (Char == `'='`) {
4269	char After = getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2);
4270	if (After == `'>'`) {
4271	if (LangOpts.CPlusPlus20) {
4272	if (!isLexingRawMode())
4273	Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4274	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4275	Size: SizeTmp2, Tok&: Result);
4276	Kind = tok::spaceship;
4277	break;
4278	}
4279	// Suggest adding a space between the '<=' and the '>' to avoid a
4280	// change in semantics if this turns up in C++ <=17 mode.
4281	if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4282	Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4283	<< FixItHint::CreateInsertion(
4284	getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4285	}
4286	}
4287	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4288	Kind = tok::lessequal;
4289	} else if (LangOpts.Digraphs && Char == `':'`) { // '<:' -> '['
4290	if (LangOpts.CPlusPlus11 &&
4291	getCharAndSize(Ptr: CurPtr + SizeTmp, Size&: SizeTmp2) == `':'`) {
4292	// C++0x [lex.pptoken]p3:
4293	// Otherwise, if the next three characters are <:: and the subsequent
4294	// character is neither : nor >, the < is treated as a preprocessor
4295	// token by itself and not as the first character of the alternative
4296	// token <:.
4297	unsigned SizeTmp3;
4298	char After = getCharAndSize(Ptr: CurPtr + SizeTmp + SizeTmp2, Size&: SizeTmp3);
4299	if (After != `':'` && After != `'>'`) {
4300	Kind = tok::less;
4301	if (!isLexingRawMode())
4302	Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4303	break;
4304	}
4305	}
4306
4307	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4308	Kind = tok::l_square;
4309	} else if (LangOpts.Digraphs && Char == `'%'`) { // '<%' -> '{'
4310	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4311	Kind = tok::l_brace;
4312	} else if (Char == `'#'` && /Not a trigraph/ SizeTmp == `1` &&
4313	lexEditorPlaceholder(Result, CurPtr)) {
4314	return true;
4315	} else {
4316	Kind = tok::less;
4317	}
4318	break;
4319	case `'>'`:
4320	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4321	if (Char == `'='`) {
4322	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4323	Kind = tok::greaterequal;
4324	} else if (Char == `'>'`) {
4325	char After = getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2);
4326	if (After == `'='`) {
4327	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4328	Size: SizeTmp2, Tok&: Result);
4329	Kind = tok::greatergreaterequal;
4330	} else if (After == `'>'` && IsStartOfConflictMarker(CurPtr: CurPtr-`1`)) {
4331	// If this is actually a '>>>>' conflict marker, recognize it as such
4332	// and recover nicely.
4333	goto LexNextToken;
4334	} else if (After == `'>'` && HandleEndOfConflictMarker(CurPtr: CurPtr-`1`)) {
4335	// If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4336	goto LexNextToken;
4337	} else if (LangOpts.CUDA && After == `'>'`) {
4338	Kind = tok::greatergreatergreater;
4339	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4340	Size: SizeTmp2, Tok&: Result);
4341	} else {
4342	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4343	Kind = tok::greatergreater;
4344	}
4345	} else {
4346	Kind = tok::greater;
4347	}
4348	break;
4349	case `'^'`:
4350	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4351	if (Char == `'='`) {
4352	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4353	Kind = tok::caretequal;
4354	} else {
4355	if (LangOpts.OpenCL && Char == `'^'`)
4356	Diag(CurPtr, diag::err_opencl_logical_exclusive_or);
4357	Kind = tok::caret;
4358	}
4359	break;
4360	case `'\|'`:
4361	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4362	if (Char == `'='`) {
4363	Kind = tok::pipeequal;
4364	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4365	} else if (Char == `'\|'`) {
4366	// If this is '\|\|\|\|\|\|\|' and we're in a conflict marker, ignore it.
4367	if (CurPtr[`1`] == `'\|'` && HandleEndOfConflictMarker(CurPtr: CurPtr-`1`))
4368	goto LexNextToken;
4369	Kind = tok::pipepipe;
4370	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4371	} else {
4372	Kind = tok::pipe;
4373	}
4374	break;
4375	case `':'`:
4376	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4377	if (LangOpts.Digraphs && Char == `'>'`) {
4378	Kind = tok::r_square; // ':>' -> ']'
4379	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4380	} else if (Char == `':'`) {
4381	Kind = tok::coloncolon;
4382	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4383	} else {
4384	Kind = tok::colon;
4385	}
4386	break;
4387	case `';'`:
4388	Kind = tok::semi;
4389	break;
4390	case `'='`:
4391	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4392	if (Char == `'='`) {
4393	// If this is '====' and we're in a conflict marker, ignore it.
4394	if (CurPtr[`1`] == `'='` && HandleEndOfConflictMarker(CurPtr: CurPtr-`1`))
4395	goto LexNextToken;
4396
4397	Kind = tok::equalequal;
4398	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4399	} else {
4400	Kind = tok::equal;
4401	}
4402	break;
4403	case `','`:
4404	Kind = tok::comma;
4405	break;
4406	case `'#'`:
4407	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4408	if (Char == `'#'`) {
4409	Kind = tok::hashhash;
4410	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4411	} else if (Char == `'@'` && LangOpts.MicrosoftExt) { // #@ -> Charize
4412	Kind = tok::hashat;
4413	if (!isLexingRawMode())
4414	Diag(BufferPtr, diag::ext_charize_microsoft);
4415	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4416	} else {
4417	// We parsed a # character. If this occurs at the start of the line,
4418	// it's actually the start of a preprocessing directive. Callback to
4419	// the preprocessor to handle it.
4420	// TODO: -fpreprocessed mode??
4421	if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4422	goto HandleDirective;
4423
4424	Kind = tok::hash;
4425	}
4426	break;
4427
4428	case `'@'`:
4429	// Objective C support.
4430	if (CurPtr[-`1`] == `'@'` && LangOpts.ObjC)
4431	Kind = tok::at;
4432	else
4433	Kind = tok::unknown;
4434	break;
4435
4436	// UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4437	case `'\\'`:
4438	if (!LangOpts.AsmPreprocessor) {
4439	if (uint32_t CodePoint = tryReadUCN(StartPtr&: CurPtr, SlashLoc: BufferPtr, Result: &Result)) {
4440	if (CheckUnicodeWhitespace(Result, C: CodePoint, CurPtr)) {
4441	if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4442	return true; // KeepWhitespaceMode
4443
4444	// We only saw whitespace, so just try again with this lexer.
4445	// (We manually eliminate the tail call to avoid recursion.)
4446	goto LexNextToken;
4447	}
4448
4449	return LexUnicodeIdentifierStart(Result, C: CodePoint, CurPtr);
4450	}
4451	}
4452
4453	Kind = tok::unknown;
4454	break;
4455
4456	default: {
4457	if (isASCII(c: Char)) {
4458	Kind = tok::unknown;
4459	break;
4460	}
4461
4462	llvm::UTF32 CodePoint;
4463
4464	// We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4465	// an escaped newline.
4466	--CurPtr;
4467	llvm::ConversionResult Status =
4468	llvm::convertUTF8Sequence(source: (const llvm::UTF8 **)&CurPtr,
4469	sourceEnd: (const llvm::UTF8 *)BufferEnd,
4470	target: &CodePoint,
4471	flags: llvm::strictConversion);
4472	if (Status == llvm::conversionOK) {
4473	if (CheckUnicodeWhitespace(Result, C: CodePoint, CurPtr)) {
4474	if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4475	return true; // KeepWhitespaceMode
4476
4477	// We only saw whitespace, so just try again with this lexer.
4478	// (We manually eliminate the tail call to avoid recursion.)
4479	goto LexNextToken;
4480	}
4481	return LexUnicodeIdentifierStart(Result, C: CodePoint, CurPtr);
4482	}
4483
4484	if (isLexingRawMode() \|\| ParsingPreprocessorDirective \|\|
4485	PP->isPreprocessedOutput()) {
4486	++CurPtr;
4487	Kind = tok::unknown;
4488	break;
4489	}
4490
4491	// Non-ASCII characters tend to creep into source code unintentionally.
4492	// Instead of letting the parser complain about the unknown token,
4493	// just diagnose the invalid UTF-8, then drop the character.
4494	Diag(CurPtr, diag::err_invalid_utf8);
4495
4496	BufferPtr = CurPtr+`1`;
4497	// We're pretending the character didn't exist, so just try again with
4498	// this lexer.
4499	// (We manually eliminate the tail call to avoid recursion.)
4500	goto LexNextToken;
4501	}
4502	}
4503
4504	// Notify MIOpt that we read a non-whitespace/non-comment token.
4505	MIOpt.ReadToken();
4506
4507	// Update the location of token as well as BufferPtr.
4508	FormTokenWithChars(Result, TokEnd: CurPtr, Kind);
4509	return true;
4510
4511	HandleDirective:
4512	// We parsed a # character and it's the start of a preprocessing directive.
4513
4514	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::hash);
4515	PP->HandleDirective(Result);
4516
4517	if (PP->hadModuleLoaderFatalFailure())
4518	// With a fatal failure in the module loader, we abort parsing.
4519	return true;
4520
4521	// We parsed the directive; lex a token with the new state.
4522	return false;
4523
4524	LexNextToken:
4525	Result.clearFlag(Flag: Token::NeedsCleaning);
4526	goto LexStart;
4527	}
4528
4529	const char *Lexer::convertDependencyDirectiveToken(
4530	const dependency_directives_scan::Token &DDTok, Token &Result) {
4531	const char *TokPtr = BufferStart + DDTok.Offset;
4532	Result.startToken();
4533	Result.setLocation(getSourceLocation(Loc: TokPtr));
4534	Result.setKind(DDTok.Kind);
4535	Result.setFlag((Token::TokenFlags)DDTok.Flags);
4536	Result.setLength(DDTok.Length);
4537	BufferPtr = TokPtr + DDTok.Length;
4538	return TokPtr;
4539	}
4540
4541	bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4542	assert(isDependencyDirectivesLexer());
4543
4544	using namespace dependency_directives_scan;
4545
4546	if (BufferPtr == BufferEnd)
4547	return LexEndOfFile(Result, CurPtr: BufferPtr);
4548
4549	while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4550	if (DepDirectives.front().Kind == pp_eof)
4551	return LexEndOfFile(Result, CurPtr: BufferEnd);
4552	if (DepDirectives.front().Kind == tokens_present_before_eof)
4553	MIOpt.ReadToken();
4554	NextDepDirectiveTokenIndex = `0`;
4555	DepDirectives = DepDirectives.drop_front();
4556	}
4557
4558	const dependency_directives_scan::Token &DDTok =
4559	DepDirectives.front().Tokens [NextDepDirectiveTokenIndex++];
4560	if (NextDepDirectiveTokenIndex > `1` \|\| DDTok.Kind != tok::hash) {
4561	// Read something other than a preprocessor directive hash.
4562	MIOpt.ReadToken();
4563	}
4564
4565	if (ParsingFilename && DDTok.is(K: tok::less)) {
4566	BufferPtr = BufferStart + DDTok.Offset;
4567	LexAngledStringLiteral(Result, CurPtr: BufferPtr + `1`);
4568	if (Result.isNot(K: tok::header_name))
4569	return true;
4570	// Advance the index of lexed tokens.
4571	while (true) {
4572	const dependency_directives_scan::Token &NextTok =
4573	DepDirectives.front().Tokens [NextDepDirectiveTokenIndex];
4574	if (BufferStart + NextTok.Offset >= BufferPtr)
4575	break;
4576	++NextDepDirectiveTokenIndex;
4577	}
4578	return true;
4579	}
4580
4581	const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4582
4583	if (Result.is(K: tok::hash) && Result.isAtStartOfLine()) {
4584	PP->HandleDirective(Result);
4585	return false;
4586	}
4587	if (Result.is(K: tok::raw_identifier)) {
4588	Result.setRawIdentifierData(TokPtr);
4589	if (!isLexingRawMode()) {
4590	const IdentifierInfo *II = PP->LookUpIdentifierInfo(Identifier&: Result);
4591	if (II->isHandleIdentifierCase())
4592	return PP->HandleIdentifier(Identifier&: Result);
4593	}
4594	return true;
4595	}
4596	if (Result.isLiteral()) {
4597	Result.setLiteralData(TokPtr);
4598	return true;
4599	}
4600	if (Result.is(K: tok::colon)) {
4601	// Convert consecutive colons to 'tok::coloncolon'.
4602	if (*BufferPtr == `':'`) {
4603	assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4604	tok::colon));
4605	++NextDepDirectiveTokenIndex;
4606	Result.setKind(tok::coloncolon);
4607	}
4608	return true;
4609	}
4610	if (Result.is(K: tok::eod))
4611	ParsingPreprocessorDirective = false;
4612
4613	return true;
4614	}
4615
4616	bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4617	assert(isDependencyDirectivesLexer());
4618
4619	using namespace dependency_directives_scan;
4620
4621	bool Stop = false;
4622	unsigned NestedIfs = `0`;
4623	do {
4624	DepDirectives = DepDirectives.drop_front();
4625	switch (DepDirectives.front().Kind) {
4626	case pp_none:
4627	llvm_unreachable("unexpected 'pp_none'");
4628	case pp_include:
4629	case pp___include_macros:
4630	case pp_define:
4631	case pp_undef:
4632	case pp_import:
4633	case pp_pragma_import:
4634	case pp_pragma_once:
4635	case pp_pragma_push_macro:
4636	case pp_pragma_pop_macro:
4637	case pp_pragma_include_alias:
4638	case pp_pragma_system_header:
4639	case pp_include_next:
4640	case decl_at_import:
4641	case cxx_module_decl:
4642	case cxx_import_decl:
4643	case cxx_export_module_decl:
4644	case cxx_export_import_decl:
4645	case tokens_present_before_eof:
4646	break;
4647	case pp_if:
4648	case pp_ifdef:
4649	case pp_ifndef:
4650	++NestedIfs;
4651	break;
4652	case pp_elif:
4653	case pp_elifdef:
4654	case pp_elifndef:
4655	case pp_else:
4656	if (!NestedIfs) {
4657	Stop = true;
4658	}
4659	break;
4660	case pp_endif:
4661	if (!NestedIfs) {
4662	Stop = true;
4663	} else {
4664	--NestedIfs;
4665	}
4666	break;
4667	case pp_eof:
4668	NextDepDirectiveTokenIndex = `0`;
4669	return LexEndOfFile(Result, CurPtr: BufferEnd);
4670	}
4671	} while (!Stop);
4672
4673	const dependency_directives_scan::Token &DDTok =
4674	DepDirectives.front().Tokens.front();
4675	assert(DDTok.is(tok::hash));
4676	NextDepDirectiveTokenIndex = `1`;
4677
4678	convertDependencyDirectiveToken(DDTok, Result);
4679	return false;
4680	}
4681

Provided by KDAB

Definitions

isObjCAtKeyword
getObjCKeywordID
isSimpleTypeSpecifier
anchor
InitLexer
Lexer
Lexer
Lexer
resetExtendedTokenMode
Create_PragmaLexer
seek
StringifyImpl
Stringify
Stringify
getSpellingSlow
getSpelling
getSpelling
getSpelling
MeasureTokenLength
getRawToken
findBeginningOfLine
getBeginningOfFileToken
GetBeginningOfToken
PreambleDirectiveKind
ComputePreamble
getTokenPrefixLength
getLocForEndOfToken
isAtStartOfMacroExpansion
isAtEndOfMacroExpansion
makeRangeFromFileLocs
isInExpansionTokenRange
makeFileCharRange
getSourceText
getImmediateMacroName
getImmediateMacroNameForDiagnostics
isAsciiIdentifierContinueChar
isNewLineEscaped
getIndentationForLine
GetMappedTokenLoc
getSourceLocation
Diag
GetTrigraphCharForLetter
DecodeTrigraphChar
getEscapedNewLineSize
SkipEscapedNewLines
findNextToken
findPreviousToken
findLocationAfterToken
getCharAndSizeSlow
getCharAndSizeSlowNoWarn
SetByteOffset
isUnicodeWhitespace
codepointAsHexString
isMathematicalExtensionID
isAllowedIDChar
isAllowedInitiallyIDChar
diagnoseExtensionInIdentifier
makeCharRange
maybeDiagnoseIDCharCompat
maybeDiagnoseUTF8Homoglyph
diagnoseInvalidUnicodeCodepointInIdentifier
tryConsumeIdentifierUCN
tryConsumeIdentifierUTF8Char
LexUnicodeIdentifierStart
fastParseASCIIIdentifier
LexIdentifierContinue
isHexaLiteral
LexNumericConstant
LexUDSuffix
LexStringLiteral
LexRawStringLiteral
LexAngledStringLiteral
codeCompleteIncludedFile
LexCharConstant
SkipWhitespace
SkipLineComment
SaveLineComment
isEndOfBlockCommentWithEscapedNewLine
SkipBlockComment
ReadToEndOfLine
LexEndOfFile
isNextPPTokenLParen
FindConflictEnd
IsStartOfConflictMarker
HandleEndOfConflictMarker
findPlaceholderEnd
lexEditorPlaceholder
isCodeCompletionPoint
DiagnoseDelimitedOrNamedEscapeSequence
tryReadNumericUCN
tryReadNamedUCN
tryReadUCN
CheckUnicodeWhitespace
PropagateLineStartLeadingSpaceInfo
Lex
LexTokenInternal
convertDependencyDirectiveToken
LexDependencyDirectiveToken

Update your C++ knowledge – Modern C++11/14/17 Training

Find out more

Definitions

source code of clang/lib/Lex/Lexer.cpp