//===--- TokenTest.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/Token.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/TokenKinds.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace clang {
namespace pseudo {
namespace {

using testing::AllOf;
using testing::ElementsAre;
using testing::ElementsAreArray;
using testing::Not;

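// Matches a token with the given spelled text and token kind.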
MATCHER_P2(token, Text, Kind, "") {
  return arg.Kind == Kind && arg.text() == Text;
}

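// Matches a token that has the given LexFlag set.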
MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); }

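// Matches a token's 0-based line number and the indentation of that line.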
MATCHER_P2(lineIndent, Line, Indent, "") {
  return arg.Line == (unsigned)Line && arg.Indent == (unsigned)Indent;
}

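// Matches a token's index in the stream it was derived from.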
MATCHER_P(originalIndex, index, "") {
  return arg.OriginalIndex == (Token::Index)index;
}

TEST(TokenTest, Lex) {
  LangOptions Opts;
  std::string Code = R"cpp(
#include <stdio.h>
int main() {
  return 42; // the answer
}
)cpp";
  TokenStream Raw = lex(Code, Opts);
  ASSERT_TRUE(Raw.isFinalized());
  EXPECT_THAT(Raw.tokens(),
              ElementsAreArray({
                  // Lexing of directives is weird, especially <angled> strings.
                  token("#", tok::hash),
                  token("include", tok::raw_identifier),
                  token("<", tok::less),
                  token("stdio", tok::raw_identifier),
                  token(".", tok::period),
                  token("h", tok::raw_identifier),
                  token(">", tok::greater),

                  token("int", tok::raw_identifier),
                  token("main", tok::raw_identifier),
                  token("(", tok::l_paren),
                  token(")", tok::r_paren),
                  token("{", tok::l_brace),
                  token("return", tok::raw_identifier),
                  token("42", tok::numeric_constant),
                  token(";", tok::semi),
                  token("// the answer", tok::comment),
                  token("}", tok::r_brace),
              }));

  TokenStream Cooked = cook(Raw, Opts);
  ASSERT_TRUE(Cooked.isFinalized());
  EXPECT_THAT(Cooked.tokens(),
              ElementsAreArray({
                  // Cooked identifier types in directives are not meaningful.
                  token("#", tok::hash),
                  token("include", tok::identifier),
                  token("<", tok::less),
                  token("stdio", tok::identifier),
                  token(".", tok::period),
                  token("h", tok::identifier),
                  token(">", tok::greater),

                  token("int", tok::kw_int),
                  token("main", tok::identifier),
                  token("(", tok::l_paren),
                  token(")", tok::r_paren),
                  token("{", tok::l_brace),
                  token("return", tok::kw_return),
                  token("42", tok::numeric_constant),
                  token(";", tok::semi),
                  token("// the answer", tok::comment),
                  token("}", tok::r_brace),
              }));
  // Check raw tokens point back into original source code.
  EXPECT_EQ(Raw.tokens().front().text().begin(), &Code[Code.find('#')]);
}

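// An escaped newline may fall between tokens or inside one; a token whose text
// contains a splice is flagged NeedsCleaning, and cook() removes the splice.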
TEST(TokenTest, LineContinuation) {
  LangOptions Opts;
  std::string Code = R"cpp(
one_\
token
two \
tokens
)cpp";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(
      Raw.tokens(),
      ElementsAre(AllOf(token("one_\\\ntoken", tok::raw_identifier),
                        hasFlag(LexFlags::StartsPPLine),
                        hasFlag(LexFlags::NeedsCleaning), lineIndent(1, 0),
                        originalIndex(0)),
                  AllOf(token("two", tok::raw_identifier),
                        hasFlag(LexFlags::StartsPPLine),
                        Not(hasFlag(LexFlags::NeedsCleaning)),
                        originalIndex(1)),
                  AllOf(token("\\\ntokens", tok::raw_identifier),
                        Not(hasFlag(LexFlags::StartsPPLine)),
                        hasFlag(LexFlags::NeedsCleaning), originalIndex(2))));

  TokenStream Cooked = cook(Raw, Opts);
  EXPECT_THAT(
      Cooked.tokens(),
      ElementsAre(AllOf(token("one_token", tok::identifier), lineIndent(1, 0),
                        originalIndex(0)),
                  AllOf(token("two", tok::identifier), originalIndex(1)),
                  AllOf(token("tokens", tok::identifier), originalIndex(2))));
}

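// Raw lexing leaves trigraphs, UCNs, and alternative operator spellings in the
// token text; cook() substitutes or reinterprets them.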
TEST(TokenTest, EncodedCharacters) {
  LangOptions Opts;
  Opts.Trigraphs = true;
  Opts.Digraphs = true;
  Opts.C99 = true; // UCNs
  Opts.CXXOperatorNames = true;
  std::string Code = R"(and <: ??! '??=' \u00E9)";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(
      Raw.tokens(),
      ElementsAre( // and is not recognized as && until cook().
          AllOf(token("and", tok::raw_identifier),
                Not(hasFlag(LexFlags::NeedsCleaning))),
          // Digraphs are just different spellings of tokens.
          AllOf(token("<:", tok::l_square),
                Not(hasFlag(LexFlags::NeedsCleaning))),
          // Trigraphs are interpreted, but still need text cleaning.
          AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)),
          // Trigraphs must be substituted inside constants too.
          AllOf(token(R"('??=')", tok::char_constant),
                hasFlag(LexFlags::NeedsCleaning)),
          // UCNs need substitution.
          AllOf(token(R"(\u00E9)", tok::raw_identifier),
                hasFlag(LexFlags::NeedsCleaning))));

  TokenStream Cooked = cook(Raw, Opts);
  EXPECT_THAT(
      Cooked.tokens(),
      ElementsAre(token("and", tok::ampamp), // alternate spelling recognized
                  token("<:", tok::l_square),
                  token("|", tok::pipe),            // trigraph substituted
                  token("'#'", tok::char_constant), // trigraph substituted
                  token("é", tok::identifier)));    // UCN substituted
}

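// Tokens record the line they start on and that line's indentation; a
// backslash-continued line still counts as a new line.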
TEST(TokenTest, Indentation) {
  LangOptions Opts;
  std::string Code = R"cpp(   hello world
no_indent \
  line_was_continued
)cpp";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(Raw.tokens(), ElementsAreArray({
                                lineIndent(0, 3), // hello
                                lineIndent(0, 3), // world
                                lineIndent(1, 0), // no_indent
                                lineIndent(2, 2), // line_was_continued
                            }));
}

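// In the cooked stream, ">>" is split into two ">" tokens (">>=" is not);
// both halves report the index of the original ">>" token.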
TEST(TokenTest, SplitGreaterGreater) {
  LangOptions Opts;
  std::string Code = R"cpp(
>> // split
// >> with an escaped newline in the middle, split
>\
>
>>= // not split
)cpp";
  TokenStream Cook = cook(lex(Code, Opts), Opts);
  TokenStream Split = stripComments(Cook);
  EXPECT_THAT(Split.tokens(),
              ElementsAre(AllOf(token(">", tok::greater), originalIndex(0)),
                          AllOf(token(">", tok::greater), originalIndex(0)),
                          // Tokens 1 and 2 are comments.
                          AllOf(token(">", tok::greater), originalIndex(3)),
                          AllOf(token(">", tok::greater), originalIndex(3)),
                          AllOf(token(">>=", tok::greatergreaterequal),
                                originalIndex(4))));
}

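// stripComments() removes comment tokens; the remaining tokens keep the
// indices they had in the input stream.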
TEST(TokenTest, DropComments) {
  LangOptions Opts;
  std::string Code = R"cpp(
// comment
int /*abc*/;
)cpp";
  TokenStream Raw = cook(lex(Code, Opts), Opts);
  TokenStream Stripped = stripComments(Raw);
  EXPECT_THAT(
      Raw.tokens(),
      ElementsAre(AllOf(token("// comment", tok::comment), originalIndex(0)),
                  AllOf(token("int", tok::kw_int), originalIndex(1)),
                  AllOf(token("/*abc*/", tok::comment), originalIndex(2)),
                  AllOf(token(";", tok::semi), originalIndex(3))));

  EXPECT_THAT(Stripped.tokens(),
              ElementsAre(AllOf(token("int", tok::kw_int), originalIndex(1)),
                          AllOf(token(";", tok::semi), originalIndex(3))));
}

} // namespace
} // namespace pseudo
} // namespace clang