Lex.cpp source code [clang-tools-extra/pseudo/lib/Lex.cpp]

1	//===--- Lex.cpp - extract token stream from source code ---------- C++--===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "clang-pseudo/Token.h"
10	#include "clang/Basic/IdentifierTable.h"
11	#include "clang/Basic/SourceLocation.h"
12	#include "clang/Basic/TokenKinds.h"
13	#include "clang/Lex/Lexer.h"
14	#include "clang/Lex/LiteralSupport.h"
15
16	namespace clang {
17	namespace pseudo {
18
19	TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
20	clang::SourceLocation Start;
21	// Tokenize using clang's lexer in raw mode.
22	// std::string guarantees null-termination, which the lexer needs.
23	clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
24	Code.data() + Code.size());
25	Lexer.SetCommentRetentionState(true);
26
27	TokenStream Result;
28	clang::Token CT;
29	// Index into the token stream of original source code.
30	Token::Index TokenIndex = `0`;
31	unsigned LastOffset = `0`;
32	unsigned Line = `0`;
33	unsigned Indent = `0`;
34	for (Lexer.LexFromRawLexer(Result&: CT); CT.getKind() != clang::tok::eof;
35	Lexer.LexFromRawLexer(Result&: CT)) {
36	unsigned Offset =
37	CT.getLocation().getRawEncoding() - Start.getRawEncoding();
38
39	Token Tok;
40	Tok.Data = &Code [Offset];
41	Tok.Length = CT.getLength();
42	Tok.Kind = CT.getKind();
43
44	// Update current line number and indentation from raw source code.
45	unsigned NewLineStart = `0`;
46	for (unsigned I = LastOffset; I < Offset; ++I) {
47	if (Code [I] == `'\n'`) {
48	NewLineStart = I + `1`;
49	++Line;
50	}
51	}
52	if (NewLineStart \|\| !LastOffset) {
53	Indent = `0`;
54	for (char C : StringRef(Code).slice(Start: NewLineStart, End: Offset)) {
55	if (C == `' '`)
56	++Indent;
57	else if (C == `'\t'`)
58	Indent += `8`;
59	else
60	break;
61	}
62	}
63	Tok.Indent = Indent;
64	Tok.Line = Line;
65
66	if (CT.isAtStartOfLine())
67	Tok.setFlag(LexFlags::StartsPPLine);
68	if (CT.needsCleaning() \|\| CT.hasUCN())
69	Tok.setFlag(LexFlags::NeedsCleaning);
70
71	Tok.OriginalIndex = TokenIndex++;
72	Result.push(T: Tok);
73	LastOffset = Offset;
74	}
75	Result.finalize();
76	return Result;
77	}
78
79	TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
80	auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
81	clang::IdentifierTable Identifiers(LangOpts);
82	TokenStream Result(CleanedStorage);
83	Result.addPayload(P: Code.getPayload());
84	for (auto Tok : Code.tokens()) {
85	if (Tok.flag(Mask: LexFlags::NeedsCleaning)) {
86	// Remove escaped newlines and trigraphs.
87	llvm::SmallString<`64`> CleanBuffer;
88	const char *Pos = Tok.text().begin();
89	while (Pos < Tok.text().end()) {
90	auto [Char, CharSize] =
91	clang::Lexer::getCharAndSizeNoWarn(Ptr: Pos, LangOpts);
92	CleanBuffer.push_back(Elt: Char);
93	assert(CharSize != `0` && "no progress!");
94	Pos += CharSize;
95	}
96	llvm::StringRef Text = CleanBuffer;
97	llvm::SmallString<`64`> UCNBuffer;
98	// A surface reading of the standard suggests UCNs might appear anywhere.
99	// But we need only decode them in raw_identifiers.
100	// - they cannot appear in punctuation/keyword tokens, because UCNs
101	// cannot encode basic characters outside of literals [lex.charset]
102	// - they can appear in literals, but we need not unescape them now.
103	// We treat them as escape sequences when evaluating the literal.
104	// - comments are handled similarly to literals
105	// This is good fortune, because expandUCNs requires its input to be a
106	// reasonably valid identifier (e.g. without stray backslashes).
107	if (Tok.Kind == tok::raw_identifier) {
108	clang::expandUCNs(Buf&: UCNBuffer, Input: CleanBuffer);
109	Text = UCNBuffer;
110	}
111
112	Tok.Data = Text.copy(A&: *CleanedStorage).data();
113	Tok.Length = Text.size();
114	Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
115	}
116
117	if (Tok.Kind == tok::raw_identifier) {
118	// Cook raw_identifiers into identifier, keyword, etc.
119	Tok.Kind = Identifiers.get(Name: Tok.text()).getTokenID();
120	} else if (Tok.Kind == tok::greatergreater) {
121	// Split the greatergreater token.
122	// FIXME: split lessless token to support Cuda triple angle brackets <<<.
123	assert(Tok.text() == ">>");
124	Tok.Kind = tok::greater;
125	Tok.Length = `1`;
126	Result.push(T: Tok);
127	// Line is wrong if the first greater is followed by an escaped newline!
128	Tok.Data = Tok.text().data() + `1`;
129	}
130
131	Result.push(T: std::move(Tok));
132	}
133
134	Result.finalize();
135	return Result;
136	}
137
138	} // namespace pseudo
139	} // namespace clang
140

source code of clang-tools-extra/pseudo/lib/Lex.cpp