1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9// UNSUPPORTED: no-localization
10// UNSUPPORTED: c++03, c++11, c++14
11// UNSUPPORTED: availability-filesystem-missing
12// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
13
14// <filesystem>
15
16// class path
17
18// Test constructors, accessors and modifiers that convert from/to various
19// character encodings. Constructors and modifiers (append, concat,
20// operator/=, operator+=) accept inputs with various character encodings,
21// and accessors (*string(), string<>(), u8string()) export the string with
22// various encodings.
23//
24// Some encodings are standardized; char16_t, char32_t and the u8string
25// accessor and u8path constructor (and normal functions taking char8_t in
26// C++20) convert from/to UTF-16, UTF-32 and UTF-8. wchar_t can be either
27// UTF-16 or UTF-32 depending on the size of the wchar_t type, or can be
28// left unimplemented.
29//
// Plain char is implicitly UTF-8 on POSIX systems. On Windows, plain char
// is supposed to be in the same encoding that the platform's native file
// system APIs consume in the functions that take narrow strings as path
// names.
34
35#include <filesystem>
36#include <type_traits>
37#include <cassert>
38
39#include "test_macros.h"
40
41#ifdef _WIN32
42# include <windows.h> // SetFileApisToANSI & friends
43#endif
44namespace fs = std::filesystem;
45
46// Test conversion with strings that fit within the latin1 charset, that fit
47// within one code point in UTF-16, and that can be expressible in certain
48// one-byte code pages.
49static void test_latin_unicode()
50{
51 const char16_t u16str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
52 const char32_t u32str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
53 const char str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; // UTF8, in a regular char string
54#if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
55 const char8_t u8str[] = { 0xc3, 0xa5, 0xc3, 0xa4, 0xc3, 0xb6, 0x00 };
56#else
57 const char u8str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 };
58#endif
59#ifndef TEST_HAS_NO_WIDE_CHARACTERS
60 const wchar_t wstr[] = { 0xe5, 0xe4, 0xf6, 0x00 };
61#endif
62
63 // Test well-defined conversion between UTF-8, UTF-16 and UTF-32
64 {
65 const fs::path p(u16str);
66 assert(p.u8string() == u8str);
67 assert(p.u16string() == u16str);
68 assert(p.u32string() == u32str);
69 assert(p.string<char16_t>() == u16str);
70 assert(p.string<char32_t>() == u32str);
71 }
72 {
73 const fs::path p(u32str);
74 assert(p.u8string() == u8str);
75 assert(p.u16string() == u16str);
76 assert(p.u32string() == u32str);
77 assert(p.string<char16_t>() == u16str);
78 assert(p.string<char32_t>() == u32str);
79 }
80 {
81 const fs::path p = fs::u8path(str);
82 assert(p.u8string() == u8str);
83 assert(p.u16string() == u16str);
84 assert(p.u32string() == u32str);
85 assert(p.string<char16_t>() == u16str);
86 assert(p.string<char32_t>() == u32str);
87 }
88#if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
89 {
90 // In C++20, the path constructor can unambiguously handle UTF-8 input,
91 // even if the plain char constructor would treat it as something else.
92 const fs::path p(u8str);
93 assert(p.u8string() == u8str);
94 assert(p.u16string() == u16str);
95 assert(p.u32string() == u32str);
96 assert(p.string<char8_t>() == u8str);
97 assert(p.string<char16_t>() == u16str);
98 assert(p.string<char32_t>() == u32str);
99 }
100 // Check reading various inputs with string<char8_t>()
101 {
102 const fs::path p(u16str);
103 assert(p.string<char8_t>() == u8str);
104 }
105 {
106 const fs::path p(u32str);
107 assert(p.string<char8_t>() == u8str);
108 }
109 {
110 const fs::path p = fs::u8path(str);
111 assert(p.string<char8_t>() == u8str);
112 }
113#endif
114#ifndef TEST_HAS_NO_WIDE_CHARACTERS
115 // Test conversion to/from wchar_t.
116 {
117 const fs::path p(u16str);
118 assert(p.wstring() == wstr);
119 assert(p.string<wchar_t>() == wstr);
120 }
121 {
122 const fs::path p = fs::u8path(str);
123 assert(p.wstring() == wstr);
124 assert(p.string<wchar_t>() == wstr);
125 }
126 {
127 const fs::path p(wstr);
128 assert(p.wstring() == wstr);
129 assert(p.u8string() == u8str);
130 assert(p.u16string() == u16str);
131 assert(p.u32string() == u32str);
132 assert(p.string<wchar_t>() == wstr);
133 }
134#endif // TEST_HAS_NO_WIDE_CHARACTERS
135#ifndef _WIN32
136 // Test conversion to/from regular char-based string. On POSIX, this
137 // is implied to convert to/from UTF-8.
138 {
139 const fs::path p(str);
140 assert(p.string() == str);
141 assert(p.u16string() == u16str);
142 assert(p.string<char>() == str);
143 }
144 {
145 const fs::path p(u16str);
146 assert(p.string() == str);
147 assert(p.string<char>() == str);
148 }
149#else
150 // On windows, the narrow char-based input/output is supposed to be
151 // in the charset that narrow file IO APIs use. This can either be the
152 // current active code page (ACP) or the OEM code page, exposed by
153 // the AreFileApisANSI() function, and settable with SetFileApisToANSI() and
154 // SetFileApisToOEM(). We can't set which codepage is active within
155 // the process, but for some specific known ones, we can check if they
156 // behave as expected.
157 SetFileApisToANSI();
158 if (GetACP() == 1252) {
159 const char latin1[] = { char(0xe5), char(0xe4), char(0xf6), 0x00 };
160 {
161 const fs::path p(wstr);
162 assert(p.string() == latin1);
163 assert(p.string<char>() == latin1);
164 }
165 {
166 const fs::path p(latin1);
167 assert(p.string() == latin1);
168 assert(p.wstring() == wstr);
169 assert(p.u8string() == u8str);
170 assert(p.u16string() == u16str);
171 assert(p.string<char>() == latin1);
172 assert(p.string<wchar_t>() == wstr);
173 }
174 }
175 SetFileApisToOEM();
176 if (GetOEMCP() == 850 || GetOEMCP() == 437) {
177 // These chars are identical in both CP 850 and 437
178 const char cp850[] = { char(0x86), char(0x84), char(0x94), 0x00 };
179 {
180 const fs::path p(wstr);
181 assert(p.string() == cp850);
182 assert(p.string<char>() == cp850);
183 }
184 {
185 const fs::path p(cp850);
186 assert(p.string() == cp850);
187 assert(p.wstring() == wstr);
188 assert(p.u8string() == u8str);
189 assert(p.u16string() == u16str);
190 assert(p.string<char>() == cp850);
191 assert(p.string<wchar_t>() == wstr);
192 }
193 }
194#endif
195}
196
197// Test conversion with strings that don't fit within one UTF-16 code point.
198// Here, wchar_t can be either UTF-16 or UTF-32 depending on the size on the
199// particular platform.
200static void test_wide_unicode()
201{
202 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
203 const char32_t u32str[] = { 0x10437, 0x00 };
204#if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
205 const char8_t u8str[] = { 0xf0, 0x90, 0x90, 0xb7, 0x00 };
206#else
207 const char u8str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
208#endif
209 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
210 {
211 const fs::path p = fs::u8path(source: str);
212 assert(p.u8string() == u8str);
213 assert(p.u16string() == u16str);
214 assert(p.u32string() == u32str);
215 }
216 {
217 const fs::path p(u16str);
218 assert(p.u8string() == u8str);
219 assert(p.u16string() == u16str);
220 assert(p.u32string() == u32str);
221 }
222 {
223 const fs::path p(u32str);
224 assert(p.u8string() == u8str);
225 assert(p.u16string() == u16str);
226 assert(p.u32string() == u32str);
227 }
228#if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
229# if __SIZEOF_WCHAR_T__ == 2
230 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
231# else
232 const wchar_t wstr[] = { 0x10437, 0x00 };
233# endif
234 // Test conversion to/from wchar_t.
235 {
236 const fs::path p = fs::u8path(source: str);
237 assert(p.wstring() == wstr);
238 }
239 {
240 const fs::path p(u16str);
241 assert(p.wstring() == wstr);
242 }
243 {
244 const fs::path p(u32str);
245 assert(p.wstring() == wstr);
246 }
247 {
248 const fs::path p(wstr);
249 assert(p.u8string() == u8str);
250 assert(p.u16string() == u16str);
251 assert(p.u32string() == u32str);
252 assert(p.wstring() == wstr);
253 }
254#endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
255}
256
257// Test appending paths in different encodings.
258static void test_append()
259{
260 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
261 const char32_t u32str[] = { 0x10437, 0x00 };
262 const char32_t u32ref[] = { 0x10437, fs::path::preferred_separator, 0x10437, fs::path::preferred_separator, 0x10437, 0x00 };
263 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
264 {
265 fs::path p = fs::u8path(source: str) / u16str / u32str;
266 assert(p.u32string() == u32ref);
267 p = fs::u8path(source: str).append(source: u16str).append(source: u32str);
268 assert(p.u32string() == u32ref);
269 p = fs::u8path(source: str);
270 p /= u16str;
271 p /= u32str;
272 assert(p.u32string() == u32ref);
273 }
274#if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
275# if __SIZEOF_WCHAR_T__ == 2
276 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
277# else
278 const wchar_t wstr[] = { 0x10437, 0x00 };
279# endif
280 // Test conversion from wchar_t.
281 {
282 fs::path p = fs::path(u16str) / wstr / u32str;
283 assert(p.u32string() == u32ref);
284 p = fs::path(u16str).append(source: wstr).append(source: u32str);
285 assert(p.u32string() == u32ref);
286 p = fs::path(u16str);
287 p /= wstr;
288 p /= u32str;
289 assert(p.u32string() == u32ref);
290 }
291#endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
292}
293
294static void test_concat()
295{
296 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
297 const char32_t u32str[] = { 0x10437, 0x00 };
298 const char32_t u32ref[] = { 0x10437, 0x10437, 0x10437, 0x00 };
299 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
300 {
301 fs::path p = fs::u8path(source: str);
302 p += u16str;
303 p += u32str;
304 assert(p.u32string() == u32ref);
305 p = fs::u8path(source: str).concat(x: u16str).concat(x: u32str);
306 assert(p.u32string() == u32ref);
307 }
308#if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
309# if __SIZEOF_WCHAR_T__ == 2
310 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
311# else
312 const wchar_t wstr[] = { 0x10437, 0x00 };
313# endif
314 // Test conversion from wchar_t.
315 {
316 fs::path p = fs::path(u16str);
317 p += wstr;
318 p += u32str;
319 assert(p.u32string() == u32ref);
320 p = fs::path(u16str).concat(x: wstr).concat(x: u32str);
321 assert(p.u32string() == u32ref);
322 }
323#endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
324}
325
// Test appending and concatenating narrow (char8_t / plain char) strings
// onto a path built from UTF-16. The single character U+00E5 is used
// throughout, and results are compared as UTF-32.
static void test_append_concat_narrow()
{
  const char16_t u16str[] = { 0xe5, 0x00 };
  const char32_t u32ref_append[] = { 0xe5, fs::path::preferred_separator, 0xe5, 0x00 };
  const char32_t u32ref_concat[] = { 0xe5, 0xe5, 0x00 };

#if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
  {
    const char8_t u8str[] = { 0xc3, 0xa5, 0x00 };
    // In C++20, appends of a char8_t string is unambiguously treated as
    // UTF-8.
    fs::path p = fs::path(u16str) / u8str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(u8str);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= u8str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(u8str);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += u8str;
    assert(p.u32string() == u32ref_concat);
  }
#endif
#ifndef _WIN32
  // Test appending a regular char-based string. On POSIX, this
  // is implied to convert to/from UTF-8.
  {
    const char str[] = { char(0xc3), char(0xa5), 0x00 }; // UTF8, in a regular char string
    fs::path p = fs::path(u16str) / str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(str);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(str);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += str;
    assert(p.u32string() == u32ref_concat);
  }
#else
  // On Windows the narrow input is checked once per file-API mode, and only
  // when the corresponding code page is one whose byte values are known.

  // Remember the mode the process was in so that it can be put back once
  // both modes have been exercised.
  const bool apis_were_ansi = AreFileApisANSI() != 0;

  SetFileApisToANSI();
  if (GetACP() == 1252) {
    const char latin1[] = { char(0xe5), 0x00 };
    fs::path p = fs::path(u16str) / latin1;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(latin1);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= latin1;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(latin1);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += latin1;
    assert(p.u32string() == u32ref_concat);
  }
  SetFileApisToOEM();
  if (GetOEMCP() == 850 || GetOEMCP() == 437) {
    // This char is identical in both CP 850 and 437
    const char cp850[] = { char(0x86), 0x00 };
    fs::path p = fs::path(u16str) / cp850;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(cp850);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= cp850;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(cp850);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += cp850;
    assert(p.u32string() == u32ref_concat);
  }
  // The process is in OEM mode at this point; switch back only if it was
  // in ANSI mode on entry.
  if (apis_were_ansi)
    SetFileApisToANSI();
#endif
}
405
406int main(int, char**)
407{
408 test_latin_unicode();
409 test_wide_unicode();
410 test_append();
411 test_concat();
412 test_append_concat_narrow();
413
414 return 0;
415}
416

// Source: libcxx/test/std/input.output/filesystems/class.path/path.member/path.charconv.pass.cpp