1 | //===----------------------------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | // UNSUPPORTED: no-localization |
10 | // UNSUPPORTED: c++03, c++11, c++14 |
11 | // UNSUPPORTED: availability-filesystem-missing |
12 | // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS |
13 | |
14 | // <filesystem> |
15 | |
16 | // class path |
17 | |
18 | // Test constructors, accessors and modifiers that convert from/to various |
19 | // character encodings. Constructors and modifiers (append, concat, |
20 | // operator/=, operator+=) accept inputs with various character encodings, |
21 | // and accessors (*string(), string<>(), u8string()) export the string with |
22 | // various encodings. |
23 | // |
24 | // Some encodings are standardized; char16_t, char32_t and the u8string |
25 | // accessor and u8path constructor (and normal functions taking char8_t in |
26 | // C++20) convert from/to UTF-16, UTF-32 and UTF-8. wchar_t can be either |
27 | // UTF-16 or UTF-32 depending on the size of the wchar_t type, or can be |
28 | // left unimplemented. |
29 | // |
// Plain char is implicitly UTF-8 on POSIX systems. On Windows, plain char
// is supposed to be in the same encoding that the platform's native file
// system APIs consume in the functions that take narrow strings as path
// names.
34 | |
35 | #include <filesystem> |
36 | #include <type_traits> |
37 | #include <cassert> |
38 | |
39 | #include "test_macros.h" |
40 | |
41 | #ifdef _WIN32 |
42 | # include <windows.h> // SetFileApisToANSI & friends |
43 | #endif |
44 | namespace fs = std::filesystem; |
45 | |
// Exercise conversions for text that fits in the Latin-1 charset: every
// character occupies a single UTF-16 code unit and is also representable in
// certain one-byte code pages.
static void test_latin_unicode()
{
  // U+00E5, U+00E4 and U+00F6 spelled out in each encoding under test.
  const char16_t utf16[] = { 0xe5, 0xe4, 0xf6, 0x00 };
  const char32_t utf32[] = { 0xe5, 0xe4, 0xf6, 0x00 };
  // UTF-8 bytes held in a plain char array.
  const char narrow_utf8[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 };
#if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
  const char8_t utf8[] = { 0xc3, 0xa5, 0xc3, 0xa4, 0xc3, 0xb6, 0x00 };
#else
  const char utf8[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 };
#endif
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
  const wchar_t wide[] = { 0xe5, 0xe4, 0xf6, 0x00 };
#endif

  // The UTF-8, UTF-16 and UTF-32 exports are well defined, so they must
  // agree no matter which of those encodings the path was built from.
  const auto expect_utf_exports = [&](const fs::path& from) {
    assert(from.u8string() == utf8);
    assert(from.u16string() == utf16);
    assert(from.u32string() == utf32);
    assert(from.string<char16_t>() == utf16);
    assert(from.string<char32_t>() == utf32);
  };
  expect_utf_exports(fs::path(utf16));
  expect_utf_exports(fs::path(utf32));
  expect_utf_exports(fs::u8path(narrow_utf8));

#if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
  {
    // A char8_t source is unambiguously UTF-8 for the C++20 constructor,
    // even where the plain char constructor would read the bytes as
    // something else.
    const fs::path from_char8(utf8);
    assert(from_char8.u8string() == utf8);
    assert(from_char8.u16string() == utf16);
    assert(from_char8.u32string() == utf32);
    assert(from_char8.string<char8_t>() == utf8);
    assert(from_char8.string<char16_t>() == utf16);
    assert(from_char8.string<char32_t>() == utf32);
  }
  // string<char8_t>() has to yield the same UTF-8 for every input encoding.
  {
    const fs::path from_utf16(utf16);
    assert(from_utf16.string<char8_t>() == utf8);
  }
  {
    const fs::path from_utf32(utf32);
    assert(from_utf32.string<char8_t>() == utf8);
  }
  {
    const fs::path from_u8path = fs::u8path(narrow_utf8);
    assert(from_u8path.string<char8_t>() == utf8);
  }
#endif

#ifndef TEST_HAS_NO_WIDE_CHARACTERS
  // Round trips through wchar_t.
  const auto expect_wide_exports = [&](const fs::path& from) {
    assert(from.wstring() == wide);
    assert(from.string<wchar_t>() == wide);
  };
  expect_wide_exports(fs::path(utf16));
  expect_wide_exports(fs::u8path(narrow_utf8));
  {
    const fs::path from_wide(wide);
    assert(from_wide.wstring() == wide);
    assert(from_wide.u8string() == utf8);
    assert(from_wide.u16string() == utf16);
    assert(from_wide.u32string() == utf32);
    assert(from_wide.string<wchar_t>() == wide);
  }
#endif // TEST_HAS_NO_WIDE_CHARACTERS

#ifndef _WIN32
  // Plain char strings: on POSIX these are implied to be UTF-8 in both
  // directions.
  {
    const fs::path from_narrow(narrow_utf8);
    assert(from_narrow.string() == narrow_utf8);
    assert(from_narrow.u16string() == utf16);
    assert(from_narrow.string<char>() == narrow_utf8);
  }
  {
    const fs::path from_utf16(utf16);
    assert(from_utf16.string() == narrow_utf8);
    assert(from_utf16.string<char>() == narrow_utf8);
  }
#else
  // On Windows, narrow char input/output is supposed to use the charset of
  // the narrow file IO APIs. That is either the current active code page
  // (ACP) or the OEM code page; AreFileApisANSI() exposes which one, and
  // SetFileApisToANSI()/SetFileApisToOEM() switch between them. The active
  // code page itself can't be set from within the process, so the checks
  // below only run for a few specific, known code pages.
  SetFileApisToANSI();
  if (GetACP() == 1252) {
    const char ansi_bytes[] = { char(0xe5), char(0xe4), char(0xf6), 0x00 };
    {
      const fs::path from_wide(wide);
      assert(from_wide.string() == ansi_bytes);
      assert(from_wide.string<char>() == ansi_bytes);
    }
    {
      const fs::path from_ansi(ansi_bytes);
      assert(from_ansi.string() == ansi_bytes);
      assert(from_ansi.wstring() == wide);
      assert(from_ansi.u8string() == utf8);
      assert(from_ansi.u16string() == utf16);
      assert(from_ansi.string<char>() == ansi_bytes);
      assert(from_ansi.string<wchar_t>() == wide);
    }
  }
  SetFileApisToOEM();
  const auto oem_code_page = GetOEMCP();
  if (oem_code_page == 850 || oem_code_page == 437) {
    // These characters have identical byte values in CP 850 and CP 437.
    const char oem_bytes[] = { char(0x86), char(0x84), char(0x94), 0x00 };
    {
      const fs::path from_wide(wide);
      assert(from_wide.string() == oem_bytes);
      assert(from_wide.string<char>() == oem_bytes);
    }
    {
      const fs::path from_oem(oem_bytes);
      assert(from_oem.string() == oem_bytes);
      assert(from_oem.wstring() == wide);
      assert(from_oem.u8string() == utf8);
      assert(from_oem.u16string() == utf16);
      assert(from_oem.string<char>() == oem_bytes);
      assert(from_oem.string<wchar_t>() == wide);
    }
  }
#endif
}
196 | |
197 | // Test conversion with strings that don't fit within one UTF-16 code point. |
198 | // Here, wchar_t can be either UTF-16 or UTF-32 depending on the size on the |
199 | // particular platform. |
200 | static void test_wide_unicode() |
201 | { |
202 | const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; |
203 | const char32_t u32str[] = { 0x10437, 0x00 }; |
204 | #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) |
205 | const char8_t u8str[] = { 0xf0, 0x90, 0x90, 0xb7, 0x00 }; |
206 | #else |
207 | const char u8str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; |
208 | #endif |
209 | const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; |
210 | { |
211 | const fs::path p = fs::u8path(source: str); |
212 | assert(p.u8string() == u8str); |
213 | assert(p.u16string() == u16str); |
214 | assert(p.u32string() == u32str); |
215 | } |
216 | { |
217 | const fs::path p(u16str); |
218 | assert(p.u8string() == u8str); |
219 | assert(p.u16string() == u16str); |
220 | assert(p.u32string() == u32str); |
221 | } |
222 | { |
223 | const fs::path p(u32str); |
224 | assert(p.u8string() == u8str); |
225 | assert(p.u16string() == u16str); |
226 | assert(p.u32string() == u32str); |
227 | } |
228 | #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
229 | # if __SIZEOF_WCHAR_T__ == 2 |
230 | const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; |
231 | # else |
232 | const wchar_t wstr[] = { 0x10437, 0x00 }; |
233 | # endif |
234 | // Test conversion to/from wchar_t. |
235 | { |
236 | const fs::path p = fs::u8path(source: str); |
237 | assert(p.wstring() == wstr); |
238 | } |
239 | { |
240 | const fs::path p(u16str); |
241 | assert(p.wstring() == wstr); |
242 | } |
243 | { |
244 | const fs::path p(u32str); |
245 | assert(p.wstring() == wstr); |
246 | } |
247 | { |
248 | const fs::path p(wstr); |
249 | assert(p.u8string() == u8str); |
250 | assert(p.u16string() == u16str); |
251 | assert(p.u32string() == u32str); |
252 | assert(p.wstring() == wstr); |
253 | } |
254 | #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
255 | } |
256 | |
257 | // Test appending paths in different encodings. |
258 | static void test_append() |
259 | { |
260 | const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; |
261 | const char32_t u32str[] = { 0x10437, 0x00 }; |
262 | const char32_t u32ref[] = { 0x10437, fs::path::preferred_separator, 0x10437, fs::path::preferred_separator, 0x10437, 0x00 }; |
263 | const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; |
264 | { |
265 | fs::path p = fs::u8path(source: str) / u16str / u32str; |
266 | assert(p.u32string() == u32ref); |
267 | p = fs::u8path(source: str).append(source: u16str).append(source: u32str); |
268 | assert(p.u32string() == u32ref); |
269 | p = fs::u8path(source: str); |
270 | p /= u16str; |
271 | p /= u32str; |
272 | assert(p.u32string() == u32ref); |
273 | } |
274 | #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
275 | # if __SIZEOF_WCHAR_T__ == 2 |
276 | const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; |
277 | # else |
278 | const wchar_t wstr[] = { 0x10437, 0x00 }; |
279 | # endif |
280 | // Test conversion from wchar_t. |
281 | { |
282 | fs::path p = fs::path(u16str) / wstr / u32str; |
283 | assert(p.u32string() == u32ref); |
284 | p = fs::path(u16str).append(source: wstr).append(source: u32str); |
285 | assert(p.u32string() == u32ref); |
286 | p = fs::path(u16str); |
287 | p /= wstr; |
288 | p /= u32str; |
289 | assert(p.u32string() == u32ref); |
290 | } |
291 | #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
292 | } |
293 | |
294 | static void test_concat() |
295 | { |
296 | const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; |
297 | const char32_t u32str[] = { 0x10437, 0x00 }; |
298 | const char32_t u32ref[] = { 0x10437, 0x10437, 0x10437, 0x00 }; |
299 | const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; |
300 | { |
301 | fs::path p = fs::u8path(source: str); |
302 | p += u16str; |
303 | p += u32str; |
304 | assert(p.u32string() == u32ref); |
305 | p = fs::u8path(source: str).concat(x: u16str).concat(x: u32str); |
306 | assert(p.u32string() == u32ref); |
307 | } |
308 | #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
309 | # if __SIZEOF_WCHAR_T__ == 2 |
310 | const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; |
311 | # else |
312 | const wchar_t wstr[] = { 0x10437, 0x00 }; |
313 | # endif |
314 | // Test conversion from wchar_t. |
315 | { |
316 | fs::path p = fs::path(u16str); |
317 | p += wstr; |
318 | p += u32str; |
319 | assert(p.u32string() == u32ref); |
320 | p = fs::path(u16str).concat(x: wstr).concat(x: u32str); |
321 | assert(p.u32string() == u32ref); |
322 | } |
323 | #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
324 | } |
325 | |
// Test appending and concatenating narrow (char8_t and plain char) strings
// onto a path that was built from UTF-16. How plain char is interpreted
// depends on the platform, so that part is split per platform below.
static void test_append_concat_narrow()
{
  // U+00E5 as UTF-16, and the expected UTF-32 results of appending (with a
  // separator) and concatenating (without one) a second U+00E5.
  const char16_t u16str[] = { 0xe5, 0x00 };
  const char32_t u32ref_append[] = { 0xe5, fs::path::preferred_separator, 0xe5, 0x00 };
  const char32_t u32ref_concat[] = { 0xe5, 0xe5, 0x00 };

#if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
  {
    const char8_t u8str[] = { 0xc3, 0xa5, 0x00 };
    // In C++20, an appended char8_t string is unambiguously treated as
    // UTF-8.
    fs::path p = fs::path(u16str) / u8str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(u8str);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= u8str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(u8str);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += u8str;
    assert(p.u32string() == u32ref_concat);
  }
#endif
#ifndef _WIN32
  // Test appending a regular char-based string. On POSIX, this
  // is implied to convert to/from UTF-8.
  {
    const char str[] = { char(0xc3), char(0xa5), 0x00 }; // UTF8, in a regular char string
    fs::path p = fs::path(u16str) / str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(str);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(str);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += str;
    assert(p.u32string() == u32ref_concat);
  }
#else
  // On Windows, only check the narrow encodings for specific known code
  // pages: first with the file APIs switched to ANSI, then to OEM.
  SetFileApisToANSI();
  if (GetACP() == 1252) {
    const char latin1[] = { char(0xe5), 0x00 };
    fs::path p = fs::path(u16str) / latin1;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(latin1);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= latin1;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(latin1);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += latin1;
    assert(p.u32string() == u32ref_concat);
  }
  SetFileApisToOEM();
  if (GetOEMCP() == 850 || GetOEMCP() == 437) {
    // This char is identical in both CP 850 and 437
    const char cp850[] = { char(0x86), 0x00 };
    fs::path p = fs::path(u16str) / cp850;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(cp850);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= cp850;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(cp850);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += cp850;
    assert(p.u32string() == u32ref_concat);
  }
#endif
}
405 | |
406 | int main(int, char**) |
407 | { |
408 | test_latin_unicode(); |
409 | test_wide_unicode(); |
410 | test_append(); |
411 | test_concat(); |
412 | test_append_concat_narrow(); |
413 | |
414 | return 0; |
415 | } |
416 | |