1//===-- Unittests for StringConverter class -------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "hdr/errno_macros.h"
10#include "hdr/types/char32_t.h"
11#include "hdr/types/char8_t.h"
12#include "src/__support/error_or.h"
13#include "src/__support/macros/properties/os.h"
14#include "src/__support/wchar/mbstate.h"
15#include "src/__support/wchar/string_converter.h"
16#include "test/UnitTest/Test.h"
17
18// TODO: add support for 16-bit widechars to StringConverter to remove this
19// macro
20#ifdef LIBC_TARGET_OS_IS_WINDOWS
21TEST(LlvmLibcStringConverterTest, Windows) {
22 // pass on windows for now
23}
24
25#else
26
27TEST(LlvmLibcStringConverterTest, UTF8To32) {
28 // first 4 bytes are clown emoji (🤡)
29 // next 3 bytes are sigma symbol (∑)
30 // next 2 bytes are y with diaeresis (ÿ)
31 // last byte is the letter A
32 const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91\xC3\xBF\x41";
33 LIBC_NAMESPACE::internal::mbstate state;
34 LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
35 reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX);
36
37 auto res = sc.popUTF32();
38 ASSERT_TRUE(res.has_value());
39 ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
40 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
41
42 res = sc.popUTF32();
43 ASSERT_TRUE(res.has_value());
44 ASSERT_EQ(static_cast<int>(res.value()), 0x2211);
45 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 7);
46
47 res = sc.popUTF32();
48 ASSERT_TRUE(res.has_value());
49 ASSERT_EQ(static_cast<int>(res.value()), 0xff);
50 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 9);
51
52 res = sc.popUTF32();
53 ASSERT_TRUE(res.has_value());
54 ASSERT_EQ(static_cast<int>(res.value()), 0x41);
55 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 10);
56
57 res = sc.popUTF32();
58 ASSERT_TRUE(res.has_value());
59 ASSERT_EQ(static_cast<int>(res.value()), 0);
60 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 11);
61
62 res = sc.popUTF32();
63 ASSERT_FALSE(res.has_value());
64 ASSERT_EQ(res.error(), -1);
65 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 11);
66}
67
68TEST(LlvmLibcStringConverterTest, UTF32To8) {
69 // clown emoji, sigma symbol, y with diaeresis, letter A
70 const wchar_t src[] = {static_cast<wchar_t>(0x1f921),
71 static_cast<wchar_t>(0x2211),
72 static_cast<wchar_t>(0xff), static_cast<wchar_t>(0x41),
73 static_cast<wchar_t>(0x0)};
74 LIBC_NAMESPACE::internal::mbstate state;
75 LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
76 reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);
77
78 auto res = sc.popUTF8();
79 ASSERT_TRUE(res.has_value());
80 ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
81 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
82
83 res = sc.popUTF8();
84 ASSERT_TRUE(res.has_value());
85 ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
86 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
87
88 res = sc.popUTF8();
89 ASSERT_TRUE(res.has_value());
90 ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
91 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
92
93 res = sc.popUTF8();
94 ASSERT_TRUE(res.has_value());
95 ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
96 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
97
98 // end of clown emoji, sigma symbol begins
99 res = sc.popUTF8();
100 ASSERT_TRUE(res.has_value());
101 ASSERT_EQ(static_cast<int>(res.value()), 0xE2);
102 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
103
104 res = sc.popUTF8();
105 ASSERT_TRUE(res.has_value());
106 ASSERT_EQ(static_cast<int>(res.value()), 0x88);
107 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
108
109 res = sc.popUTF8();
110 ASSERT_TRUE(res.has_value());
111 ASSERT_EQ(static_cast<int>(res.value()), 0x91);
112 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
113
114 // end of sigma symbol, y with diaeresis begins
115 res = sc.popUTF8();
116 ASSERT_TRUE(res.has_value());
117 ASSERT_EQ(static_cast<int>(res.value()), 0xC3);
118 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);
119
120 res = sc.popUTF8();
121 ASSERT_TRUE(res.has_value());
122 ASSERT_EQ(static_cast<int>(res.value()), 0xBF);
123 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);
124
125 // end of y with diaeresis, letter A begins
126 res = sc.popUTF8();
127 ASSERT_TRUE(res.has_value());
128 ASSERT_EQ(static_cast<int>(res.value()), 0x41);
129 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
130
131 // null byte
132 res = sc.popUTF8();
133 ASSERT_TRUE(res.has_value());
134 ASSERT_EQ(static_cast<int>(res.value()), 0);
135 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 5);
136
137 res = sc.popUTF8();
138 ASSERT_FALSE(res.has_value());
139 ASSERT_EQ(res.error(), -1);
140 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 5);
141}
142
143TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
144 const wchar_t src[] = {
145 static_cast<wchar_t>(0x1f921), static_cast<wchar_t>(0x2211),
146 static_cast<wchar_t>(0x0)}; // clown emoji, sigma symbol
147 LIBC_NAMESPACE::internal::mbstate state;
148 LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
149 reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);
150
151 auto res = sc.popUTF8();
152 ASSERT_TRUE(res.has_value());
153 ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
154 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
155
156 res = sc.popUTF8();
157 ASSERT_TRUE(res.has_value());
158 ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
159 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
160
161 res = sc.popUTF8();
162 ASSERT_TRUE(res.has_value());
163 ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
164 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
165
166 res = sc.popUTF8();
167 ASSERT_TRUE(res.has_value());
168 ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
169 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
170
171 // can only read 1 character from source string, so error on next pop
172 res = sc.popUTF8();
173 ASSERT_FALSE(res.has_value());
174 ASSERT_EQ(res.error(), -1);
175}
176
177TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
178 // first 4 bytes are clown emoji, then next 3 are sigma symbol
179 const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
180 LIBC_NAMESPACE::internal::mbstate state;
181 LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
182 reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX, 5);
183
184 auto res = sc.popUTF32();
185 ASSERT_TRUE(res.has_value());
186 ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
187 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
188
189 res = sc.popUTF32();
190 ASSERT_FALSE(res.has_value());
191 ASSERT_EQ(static_cast<int>(res.error()), -1);
192 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 5);
193}
194
195TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
196 const wchar_t src[] = {
197 static_cast<wchar_t>(0x1f921), static_cast<wchar_t>(0xffffff),
198 static_cast<wchar_t>(0x0)}; // clown emoji, invalid utf32
199 LIBC_NAMESPACE::internal::mbstate state;
200 LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
201 reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);
202
203 auto res = sc.popUTF8();
204 ASSERT_TRUE(res.has_value());
205 ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
206 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
207
208 res = sc.popUTF8();
209 ASSERT_TRUE(res.has_value());
210 ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
211 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
212
213 res = sc.popUTF8();
214 ASSERT_TRUE(res.has_value());
215 ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
216 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
217
218 res = sc.popUTF8();
219 ASSERT_TRUE(res.has_value());
220 ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
221 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
222
223 res = sc.popUTF8();
224 ASSERT_FALSE(res.has_value());
225 ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
226 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
227}
228
229TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
230 // first 4 bytes are clown emoji (🤡)
231 // next 3 form an invalid character
232 const char *src = "\xF0\x9F\xA4\xA1\x90\x88\x30";
233 LIBC_NAMESPACE::internal::mbstate state;
234 LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
235 reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX);
236
237 auto res = sc.popUTF32();
238 ASSERT_TRUE(res.has_value());
239 ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
240 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
241
242 res = sc.popUTF32();
243 ASSERT_FALSE(res.has_value());
244 ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
245 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
246}
247
248TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
249 /*
250 We do NOT test partially popping a character and expecting the next
251 StringConverter to continue where we left off. This is not expected to work
252 and considered invalid.
253 */
254 const wchar_t src[] = {
255 static_cast<wchar_t>(0x1f921), static_cast<wchar_t>(0xff),
256 static_cast<wchar_t>(0x0)}; // clown emoji, y with diaeresis (ÿ)
257 LIBC_NAMESPACE::internal::mbstate state;
258 LIBC_NAMESPACE::internal::StringConverter<char32_t> sc1(
259 reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);
260
261 auto res = sc1.popUTF8();
262 ASSERT_TRUE(res.has_value());
263 ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
264 ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
265
266 res = sc1.popUTF8();
267 ASSERT_TRUE(res.has_value());
268 ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
269 ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
270
271 res = sc1.popUTF8();
272 ASSERT_TRUE(res.has_value());
273 ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
274 ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
275
276 res = sc1.popUTF8();
277 ASSERT_TRUE(res.has_value());
278 ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
279 ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
280
281 // sc2 should pick up where sc1 left off and continue the conversion
282 LIBC_NAMESPACE::internal::StringConverter<char32_t> sc2(
283 reinterpret_cast<const char32_t *>(src) + sc1.getSourceIndex(), &state,
284 SIZE_MAX, 1);
285
286 res = sc2.popUTF8();
287 ASSERT_TRUE(res.has_value());
288 ASSERT_EQ(static_cast<int>(res.value()), 0xC3);
289 ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
290
291 res = sc2.popUTF8();
292 ASSERT_TRUE(res.has_value());
293 ASSERT_EQ(static_cast<int>(res.value()), 0xBF);
294 ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
295}
296
297TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) {
298 const char *src = "\xF0\x9F\xA4\xA1"; // clown emoji
299 LIBC_NAMESPACE::internal::mbstate state;
300 LIBC_NAMESPACE::internal::StringConverter<char8_t> sc1(
301 reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX, 2);
302
303 auto res = sc1.popUTF32();
304 ASSERT_FALSE(res.has_value());
305 ASSERT_EQ(static_cast<int>(res.error()), -1);
306 ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 2);
307
308 // sc2 should pick up where sc1 left off and continue the conversion
309 LIBC_NAMESPACE::internal::StringConverter<char8_t> sc2(
310 reinterpret_cast<const char8_t *>(src) + sc1.getSourceIndex(), &state,
311 SIZE_MAX, 3);
312
313 res = sc2.popUTF32();
314 ASSERT_TRUE(res.has_value());
315 ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
316 ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 2);
317
318 res = sc2.popUTF32();
319 ASSERT_TRUE(res.has_value());
320 ASSERT_EQ(static_cast<int>(res.value()), 0);
321 ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 3);
322}
323
324TEST(LlvmLibcStringConverterTest, DestLimitUTF8To32) {
325 const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis
326 LIBC_NAMESPACE::internal::mbstate state;
327 LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
328 reinterpret_cast<const char8_t *>(src), &state, 1);
329
330 auto res = sc.popUTF32();
331 ASSERT_TRUE(res.has_value());
332 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
333
334 res = sc.popUTF32(); // no space to pop this into
335 ASSERT_FALSE(res.has_value());
336}
337
338TEST(LlvmLibcStringConverterTest, DestLimitUTF32To8) {
339 const wchar_t src[] = {static_cast<wchar_t>(0x1f921),
340 static_cast<wchar_t>(0x1f921)}; // 2 clown emojis
341 LIBC_NAMESPACE::internal::mbstate state;
342 LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
343 reinterpret_cast<const char32_t *>(src), &state, 5);
344
345 auto res = sc.popUTF8();
346 ASSERT_TRUE(res.has_value());
347 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
348
349 res = sc.popUTF8();
350 ASSERT_TRUE(res.has_value());
351 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
352
353 res = sc.popUTF8();
354 ASSERT_TRUE(res.has_value());
355 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
356
357 res = sc.popUTF8();
358 ASSERT_TRUE(res.has_value());
359 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
360
361 res = sc.popUTF8();
362 ASSERT_FALSE(res.has_value());
363 ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
364}
365
366#endif
367

// Source: libc/test/src/__support/wchar/string_converter_test.cpp