1//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "clang-pseudo/Token.h"
10#include "clang/Basic/IdentifierTable.h"
11#include "clang/Basic/SourceLocation.h"
12#include "clang/Basic/TokenKinds.h"
13#include "clang/Lex/Lexer.h"
14#include "clang/Lex/LiteralSupport.h"
15
16namespace clang {
17namespace pseudo {
18
19TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
20 clang::SourceLocation Start;
21 // Tokenize using clang's lexer in raw mode.
22 // std::string guarantees null-termination, which the lexer needs.
23 clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
24 Code.data() + Code.size());
25 Lexer.SetCommentRetentionState(true);
26
27 TokenStream Result;
28 clang::Token CT;
29 // Index into the token stream of original source code.
30 Token::Index TokenIndex = 0;
31 unsigned LastOffset = 0;
32 unsigned Line = 0;
33 unsigned Indent = 0;
34 for (Lexer.LexFromRawLexer(Result&: CT); CT.getKind() != clang::tok::eof;
35 Lexer.LexFromRawLexer(Result&: CT)) {
36 unsigned Offset =
37 CT.getLocation().getRawEncoding() - Start.getRawEncoding();
38
39 Token Tok;
40 Tok.Data = &Code[Offset];
41 Tok.Length = CT.getLength();
42 Tok.Kind = CT.getKind();
43
44 // Update current line number and indentation from raw source code.
45 unsigned NewLineStart = 0;
46 for (unsigned I = LastOffset; I < Offset; ++I) {
47 if (Code[I] == '\n') {
48 NewLineStart = I + 1;
49 ++Line;
50 }
51 }
52 if (NewLineStart || !LastOffset) {
53 Indent = 0;
54 for (char C : StringRef(Code).slice(Start: NewLineStart, End: Offset)) {
55 if (C == ' ')
56 ++Indent;
57 else if (C == '\t')
58 Indent += 8;
59 else
60 break;
61 }
62 }
63 Tok.Indent = Indent;
64 Tok.Line = Line;
65
66 if (CT.isAtStartOfLine())
67 Tok.setFlag(LexFlags::StartsPPLine);
68 if (CT.needsCleaning() || CT.hasUCN())
69 Tok.setFlag(LexFlags::NeedsCleaning);
70
71 Tok.OriginalIndex = TokenIndex++;
72 Result.push(T: Tok);
73 LastOffset = Offset;
74 }
75 Result.finalize();
76 return Result;
77}
78
79TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
80 auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
81 clang::IdentifierTable Identifiers(LangOpts);
82 TokenStream Result(CleanedStorage);
83 Result.addPayload(P: Code.getPayload());
84 for (auto Tok : Code.tokens()) {
85 if (Tok.flag(Mask: LexFlags::NeedsCleaning)) {
86 // Remove escaped newlines and trigraphs.
87 llvm::SmallString<64> CleanBuffer;
88 const char *Pos = Tok.text().begin();
89 while (Pos < Tok.text().end()) {
90 auto [Char, CharSize] =
91 clang::Lexer::getCharAndSizeNoWarn(Ptr: Pos, LangOpts);
92 CleanBuffer.push_back(Elt: Char);
93 assert(CharSize != 0 && "no progress!");
94 Pos += CharSize;
95 }
96 llvm::StringRef Text = CleanBuffer;
97 llvm::SmallString<64> UCNBuffer;
98 // A surface reading of the standard suggests UCNs might appear anywhere.
99 // But we need only decode them in raw_identifiers.
100 // - they cannot appear in punctuation/keyword tokens, because UCNs
101 // cannot encode basic characters outside of literals [lex.charset]
102 // - they can appear in literals, but we need not unescape them now.
103 // We treat them as escape sequences when evaluating the literal.
104 // - comments are handled similarly to literals
105 // This is good fortune, because expandUCNs requires its input to be a
106 // reasonably valid identifier (e.g. without stray backslashes).
107 if (Tok.Kind == tok::raw_identifier) {
108 clang::expandUCNs(Buf&: UCNBuffer, Input: CleanBuffer);
109 Text = UCNBuffer;
110 }
111
112 Tok.Data = Text.copy(A&: *CleanedStorage).data();
113 Tok.Length = Text.size();
114 Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
115 }
116
117 if (Tok.Kind == tok::raw_identifier) {
118 // Cook raw_identifiers into identifier, keyword, etc.
119 Tok.Kind = Identifiers.get(Name: Tok.text()).getTokenID();
120 } else if (Tok.Kind == tok::greatergreater) {
121 // Split the greatergreater token.
122 // FIXME: split lessless token to support Cuda triple angle brackets <<<.
123 assert(Tok.text() == ">>");
124 Tok.Kind = tok::greater;
125 Tok.Length = 1;
126 Result.push(T: Tok);
127 // Line is wrong if the first greater is followed by an escaped newline!
128 Tok.Data = Tok.text().data() + 1;
129 }
130
131 Result.push(T: std::move(Tok));
132 }
133
134 Result.finalize();
135 return Result;
136}
137
138} // namespace pseudo
139} // namespace clang
140

// Source: clang-tools-extra/pseudo/lib/Lex.cpp