//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT
// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0|12.0|13.0}}
11
12#include <algorithm>
13#include <cassert>
14#include <codecvt>
15#include <locale>
16
17#include "test_macros.h"
18
// One test case for a conversion that is expected to succeed completely.
// The conversion is offered in_size input code units and must consume all of
// them while producing exactly out_size output code units.
struct test_offsets_ok {
  size_t in_size;  // number of input code units handed to the conversion
  size_t out_size; // number of output code units expected to be produced
};
// One test case for a conversion that is expected to stop early with
// codecvt_base::partial. The conversion is offered in_size input code units
// and out_size units of output space; expected_in_next / expected_out_next are
// the offsets (from the start of the respective buffer) at which the "next"
// pointers must be left.
struct test_offsets_partial {
  size_t in_size;           // number of input code units handed to the conversion
  size_t out_size;          // number of output code units of space offered
  size_t expected_in_next;  // expected offset of in_next after the call
  size_t expected_out_next; // expected offset of out_next after the call
};
29
// One test case for a conversion that is expected to fail with
// codecvt_base::error. Before the conversion runs, the input code unit at
// index replace_pos is overwritten with replace_char to make the input
// ill-formed; expected_in_next / expected_out_next are the offsets at which
// the "next" pointers must be left when the error is reported.
//
// CharT is the type used to spell the replacement code unit in the table.
template <class CharT>
struct test_offsets_error {
  size_t in_size;           // number of input code units handed to the conversion
  size_t out_size;          // number of output code units of space offered
  size_t expected_in_next;  // expected offset of in_next after the call
  size_t expected_out_next; // expected offset of out_next after the call
  CharT replace_char;       // code unit written into the input to corrupt it
  size_t replace_pos;       // index in the input that gets overwritten
};
39
// Number of elements in the built-in array x, usable in constant expressions
// (array bounds, static_assert). Only meaningful for real arrays: a pointer
// argument would silently yield sizeof(pointer) / sizeof(element).
#define array_size(x) (sizeof(x) / sizeof((x)[0]))
41
// The test bodies below refer to these standard-library names unqualified.
using std::begin;
using std::char_traits;
using std::codecvt_base;
using std::copy;
using std::end;
47
48template <class InternT, class ExternT>
49void utf8_to_utf32_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
50 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
51 const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
52 const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
53 static_assert(array_size(input) == 11, "");
54 static_assert(array_size(expected) == 5, "");
55
56 ExternT in[array_size(input)];
57 InternT exp[array_size(expected)];
58 copy(begin(input), end(input), begin(in));
59 copy(begin(expected), end(expected), begin(exp));
60 assert(char_traits<ExternT>::length(in) == 10);
61 assert(char_traits<InternT>::length(exp) == 4);
62 test_offsets_ok offsets[] = {{.in_size: 0, .out_size: 0}, {.in_size: 1, .out_size: 1}, {.in_size: 3, .out_size: 2}, {.in_size: 6, .out_size: 3}, {.in_size: 10, .out_size: 4}};
63 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
64 test_offsets_ok t = *it;
65 InternT out[array_size(exp) - 1] = {};
66 assert(t.in_size <= array_size(in));
67 assert(t.out_size <= array_size(out));
68 mbstate_t state = {};
69 const ExternT* in_next = nullptr;
70 InternT* out_next = nullptr;
71 codecvt_base::result res = codecvt_base::ok;
72
73 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
74 assert(res == cvt.ok);
75 assert(in_next == in + t.in_size);
76 assert(out_next == out + t.out_size);
77 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
78 if (t.out_size < array_size(out))
79 assert(out[t.out_size] == 0);
80
81 state = mbstate_t();
82 int len = cvt.length(state, in, in + t.in_size, t.out_size);
83 assert(len >= 0);
84 assert(static_cast<size_t>(len) == t.in_size);
85 }
86
87 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
88 test_offsets_ok t = *it;
89 InternT out[array_size(exp)] = {};
90 assert(t.in_size <= array_size(in));
91 assert(t.out_size <= array_size(out));
92 mbstate_t state = {};
93 const ExternT* in_next = nullptr;
94 InternT* out_next = nullptr;
95 codecvt_base::result res = codecvt_base::ok;
96
97 res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
98 assert(res == cvt.ok);
99 assert(in_next == in + t.in_size);
100 assert(out_next == out + t.out_size);
101 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
102 if (t.out_size < array_size(out))
103 assert(out[t.out_size] == 0);
104
105 state = mbstate_t();
106 int len = cvt.length(state, in, in + t.in_size, array_size(out));
107 assert(len >= 0);
108 assert(static_cast<size_t>(len) == t.in_size);
109 }
110}
111
112template <class InternT, class ExternT>
113void utf8_to_utf32_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
114 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
115 const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
116 const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
117 static_assert(array_size(input) == 11, "");
118 static_assert(array_size(expected) == 5, "");
119
120 ExternT in[array_size(input)];
121 InternT exp[array_size(expected)];
122 copy(begin(arr: input), end(arr: input), begin(in));
123 copy(begin(arr: expected), end(arr: expected), begin(exp));
124 assert(char_traits<ExternT>::length(in) == 10);
125 assert(char_traits<InternT>::length(exp) == 4);
126
127 test_offsets_partial offsets[] = {
128 {.in_size: 1, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
129
130 {.in_size: 3, .out_size: 1, .expected_in_next: 1, .expected_out_next: 1}, // no space for second CP
131 {.in_size: 2, .out_size: 2, .expected_in_next: 1, .expected_out_next: 1}, // incomplete second CP
132 {.in_size: 2, .out_size: 1, .expected_in_next: 1, .expected_out_next: 1}, // incomplete second CP, and no space for it
133
134 {.in_size: 6, .out_size: 2, .expected_in_next: 3, .expected_out_next: 2}, // no space for third CP
135 {.in_size: 4, .out_size: 3, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP
136 {.in_size: 5, .out_size: 3, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP
137 {.in_size: 4, .out_size: 2, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP, and no space for it
138 {.in_size: 5, .out_size: 2, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP, and no space for it
139
140 {.in_size: 10, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // no space for fourth CP
141 {.in_size: 7, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP
142 {.in_size: 8, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP
143 {.in_size: 9, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP
144 {.in_size: 7, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
145 {.in_size: 8, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
146 {.in_size: 9, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
147 };
148
149 for (test_offsets_partial* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
150 test_offsets_partial t = *it;
151 InternT out[array_size(exp) - 1] = {};
152 assert(t.in_size <= array_size(in));
153 assert(t.out_size <= array_size(out));
154 assert(t.expected_in_next <= t.in_size);
155 assert(t.expected_out_next <= t.out_size);
156 mbstate_t state = {};
157 const ExternT* in_next = nullptr;
158 InternT* out_next = nullptr;
159 codecvt_base::result res = codecvt_base::ok;
160
161 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
162 assert(res == cvt.partial);
163 assert(in_next == in + t.expected_in_next);
164 assert(out_next == out + t.expected_out_next);
165 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
166 if (t.expected_out_next < array_size(out))
167 assert(out[t.expected_out_next] == 0);
168
169 state = mbstate_t();
170 int len = cvt.length(state, in, in + t.in_size, t.out_size);
171 assert(len >= 0);
172 assert(static_cast<size_t>(len) == t.expected_in_next);
173 }
174}
175
176template <class InternT, class ExternT>
177void utf8_to_utf32_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
178 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
179 const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
180 const char32_t expected[] = {'b', 0x0448, 0xD700, 0x10AAAA, 0};
181 static_assert(array_size(input) == 11, "");
182 static_assert(array_size(expected) == 5, "");
183
184 ExternT in[array_size(input)];
185 InternT exp[array_size(expected)];
186 copy(begin(arr: input), end(arr: input), begin(in));
187 copy(begin(arr: expected), end(arr: expected), begin(exp));
188 assert(char_traits<ExternT>::length(in) == 10);
189 assert(char_traits<InternT>::length(exp) == 4);
190
191 // There are 5 classes of errors in UTF-8 decoding
192 // 1. Missing leading byte
193 // 2. Missing trailing byte
194 // 3. Surrogate CP
195 // 4. Overlong sequence
196 // 5. CP out of Unicode range
197 test_offsets_error<unsigned char> offsets[] = {
198
199 // 1. Missing leading byte. We will replace the leading byte with
200 // non-leading byte, such as a byte that is always invalid or a trailing
201 // byte.
202
203 // replace leading byte with invalid byte
204 {.in_size: 1, .out_size: 4, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0xFF, .replace_pos: 0},
205 {.in_size: 3, .out_size: 4, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0xFF, .replace_pos: 1},
206 {.in_size: 6, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 3},
207 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 6},
208
209 // replace leading byte with trailing byte
210 {.in_size: 1, .out_size: 4, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0b10101010, .replace_pos: 0},
211 {.in_size: 3, .out_size: 4, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0b10101010, .replace_pos: 1},
212 {.in_size: 6, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10101010, .replace_pos: 3},
213 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b10101010, .replace_pos: 6},
214
215 // 2. Missing trailing byte. We will replace the trailing byte with
216 // non-trailing byte, such as a byte that is always invalid or a leading
217 // byte (simple ASCII byte in our case).
218
219 // replace first trailing byte with ASCII byte
220 {.in_size: 3, .out_size: 4, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 'z', .replace_pos: 2},
221 {.in_size: 6, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 'z', .replace_pos: 4},
222 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 7},
223
224 // replace first trailing byte with invalid byte
225 {.in_size: 3, .out_size: 4, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0xFF, .replace_pos: 2},
226 {.in_size: 6, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 4},
227 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 7},
228
229 // replace second trailing byte with ASCII byte
230 {.in_size: 6, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 'z', .replace_pos: 5},
231 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 8},
232
233 // replace second trailing byte with invalid byte
234 {.in_size: 6, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 5},
235 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 8},
236
237 // replace third trailing byte
238 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 9},
239 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 9},
240
241 // 2.1 The following test-cases raise doubt whether error or partial should
242 // be returned. For example, we have 4-byte sequence with valid leading
243 // byte. If we hide the last byte we need to return partial. But, if the
244 // second or third byte, which are visible to the call to codecvt, are
245 // malformed then error should be returned.
246
247 // replace first trailing byte with ASCII byte, also incomplete at end
248 {.in_size: 5, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 'z', .replace_pos: 4},
249 {.in_size: 8, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 7},
250 {.in_size: 9, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 7},
251
252 // replace first trailing byte with invalid byte, also incomplete at end
253 {.in_size: 5, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 4},
254 {.in_size: 8, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 7},
255 {.in_size: 9, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 7},
256
257 // replace second trailing byte with ASCII byte, also incomplete at end
258 {.in_size: 9, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 8},
259
260 // replace second trailing byte with invalid byte, also incomplete at end
261 {.in_size: 9, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 8},
262
263 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
264 // CP U+D700
265 {.in_size: 6, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10100000, .replace_pos: 4}, // turn U+D700 into U+D800
266 {.in_size: 6, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10101100, .replace_pos: 4}, // turn U+D700 into U+DB00
267 {.in_size: 6, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10110000, .replace_pos: 4}, // turn U+D700 into U+DC00
268 {.in_size: 6, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10111100, .replace_pos: 4}, // turn U+D700 into U+DF00
269
270 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
271 // just the leading byte is enough to make them overlong, i.e. for the
272 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
273 // zeroes.
274 {.in_size: 3, .out_size: 4, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0b11000000, .replace_pos: 1}, // make the 2-byte CP overlong
275 {.in_size: 3, .out_size: 4, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0b11000001, .replace_pos: 1}, // make the 2-byte CP overlong
276 {.in_size: 6, .out_size: 4, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b11100000, .replace_pos: 3}, // make the 3-byte CP overlong
277 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b11110000, .replace_pos: 6}, // make the 4-byte CP overlong
278
279 // 5. CP above range
280 // turn U+10AAAA into U+14AAAA by changing its leading byte
281 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b11110101, .replace_pos: 6},
282 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
283 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b10011010, .replace_pos: 7},
284 };
285 for (test_offsets_error<unsigned char>* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
286 test_offsets_error<unsigned char> t = *it;
287 InternT out[array_size(exp) - 1] = {};
288 assert(t.in_size <= array_size(in));
289 assert(t.out_size <= array_size(out));
290 assert(t.expected_in_next <= t.in_size);
291 assert(t.expected_out_next <= t.out_size);
292 ExternT old_char = in[t.replace_pos];
293 in[t.replace_pos] = t.replace_char;
294
295 mbstate_t state = {};
296 const ExternT* in_next = nullptr;
297 InternT* out_next = nullptr;
298 codecvt_base::result res = codecvt_base::ok;
299
300 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
301 assert(res == cvt.error);
302 assert(in_next == in + t.expected_in_next);
303 assert(out_next == out + t.expected_out_next);
304 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
305 if (t.expected_out_next < array_size(out))
306 assert(out[t.expected_out_next] == 0);
307
308 state = mbstate_t();
309 int len = cvt.length(state, in, in + t.in_size, t.out_size);
310 assert(len >= 0);
311 assert(static_cast<size_t>(len) == t.expected_in_next);
312
313 in[t.replace_pos] = old_char;
314 }
315}
316
// Runs all UTF-8 -> UTF-32 decoding checks (codecvt::in / codecvt::length) on
// the given facet: fully successful conversions, conversions that stop with
// partial, and conversions of ill-formed input that stop with error.
template <class InternT, class ExternT>
void utf8_to_utf32_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf32_in_ok(cvt);
  utf8_to_utf32_in_partial(cvt);
  utf8_to_utf32_in_error(cvt);
}
323
324template <class InternT, class ExternT>
325void utf32_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
326 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
327 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
328 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
329 static_assert(array_size(input) == 5, "");
330 static_assert(array_size(expected) == 11, "");
331
332 InternT in[array_size(input)];
333 ExternT exp[array_size(expected)];
334 copy(begin(arr: input), end(arr: input), begin(in));
335 copy(begin(arr: expected), end(arr: expected), begin(exp));
336 assert(char_traits<InternT>::length(in) == 4);
337 assert(char_traits<ExternT>::length(exp) == 10);
338
339 test_offsets_ok offsets[] = {{.in_size: 0, .out_size: 0}, {.in_size: 1, .out_size: 1}, {.in_size: 2, .out_size: 3}, {.in_size: 3, .out_size: 6}, {.in_size: 4, .out_size: 10}};
340 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
341 test_offsets_ok t = *it;
342 ExternT out[array_size(exp) - 1] = {};
343 assert(t.in_size <= array_size(in));
344 assert(t.out_size <= array_size(out));
345 mbstate_t state = {};
346 const InternT* in_next = nullptr;
347 ExternT* out_next = nullptr;
348 codecvt_base::result res = codecvt_base::ok;
349
350 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
351 assert(res == cvt.ok);
352 assert(in_next == in + t.in_size);
353 assert(out_next == out + t.out_size);
354 assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
355 if (t.out_size < array_size(out))
356 assert(out[t.out_size] == 0);
357 }
358}
359
360template <class InternT, class ExternT>
361void utf32_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
362 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
363 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
364 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
365 static_assert(array_size(input) == 5, "");
366 static_assert(array_size(expected) == 11, "");
367
368 InternT in[array_size(input)];
369 ExternT exp[array_size(expected)];
370 copy(begin(arr: input), end(arr: input), begin(in));
371 copy(begin(arr: expected), end(arr: expected), begin(exp));
372 assert(char_traits<InternT>::length(in) == 4);
373 assert(char_traits<ExternT>::length(exp) == 10);
374
375 test_offsets_partial offsets[] = {
376 {.in_size: 1, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
377
378 {.in_size: 2, .out_size: 1, .expected_in_next: 1, .expected_out_next: 1}, // no space for second CP
379 {.in_size: 2, .out_size: 2, .expected_in_next: 1, .expected_out_next: 1}, // no space for second CP
380
381 {.in_size: 3, .out_size: 3, .expected_in_next: 2, .expected_out_next: 3}, // no space for third CP
382 {.in_size: 3, .out_size: 4, .expected_in_next: 2, .expected_out_next: 3}, // no space for third CP
383 {.in_size: 3, .out_size: 5, .expected_in_next: 2, .expected_out_next: 3}, // no space for third CP
384
385 {.in_size: 4, .out_size: 6, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
386 {.in_size: 4, .out_size: 7, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
387 {.in_size: 4, .out_size: 8, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
388 {.in_size: 4, .out_size: 9, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
389 };
390 for (test_offsets_partial* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
391 test_offsets_partial t = *it;
392 ExternT out[array_size(exp) - 1] = {};
393 assert(t.in_size <= array_size(in));
394 assert(t.out_size <= array_size(out));
395 assert(t.expected_in_next <= t.in_size);
396 assert(t.expected_out_next <= t.out_size);
397 mbstate_t state = {};
398 const InternT* in_next = nullptr;
399 ExternT* out_next = nullptr;
400 codecvt_base::result res = codecvt_base::ok;
401
402 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
403 assert(res == cvt.partial);
404 assert(in_next == in + t.expected_in_next);
405 assert(out_next == out + t.expected_out_next);
406 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
407 if (t.expected_out_next < array_size(out))
408 assert(out[t.expected_out_next] == 0);
409 }
410}
411
412template <class InternT, class ExternT>
413void utf32_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
414 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
415 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
416 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
417 static_assert(array_size(input) == 5, "");
418 static_assert(array_size(expected) == 11, "");
419
420 InternT in[array_size(input)];
421 ExternT exp[array_size(expected)];
422 copy(begin(arr: input), end(arr: input), begin(in));
423 copy(begin(arr: expected), end(arr: expected), begin(exp));
424 assert(char_traits<InternT>::length(in) == 4);
425 assert(char_traits<ExternT>::length(exp) == 10);
426
427 test_offsets_error<InternT> offsets[] = {
428
429 // Surrogate CP
430 {4, 10, 0, 0, 0xD800, 0},
431 {4, 10, 1, 1, 0xDBFF, 1},
432 {4, 10, 2, 3, 0xDC00, 2},
433 {4, 10, 3, 6, 0xDFFF, 3},
434
435 // CP out of range
436 {4, 10, 0, 0, 0x00110000, 0},
437 {4, 10, 1, 1, 0x00110000, 1},
438 {4, 10, 2, 3, 0x00110000, 2},
439 {4, 10, 3, 6, 0x00110000, 3}};
440
441 for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
442 test_offsets_error<InternT> t = *it;
443 ExternT out[array_size(exp) - 1] = {};
444 assert(t.in_size <= array_size(in));
445 assert(t.out_size <= array_size(out));
446 assert(t.expected_in_next <= t.in_size);
447 assert(t.expected_out_next <= t.out_size);
448 InternT old_char = in[t.replace_pos];
449 in[t.replace_pos] = t.replace_char;
450
451 mbstate_t state = {};
452 const InternT* in_next = nullptr;
453 ExternT* out_next = nullptr;
454 codecvt_base::result res = codecvt_base::ok;
455
456 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
457 assert(res == cvt.error);
458 assert(in_next == in + t.expected_in_next);
459 assert(out_next == out + t.expected_out_next);
460 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
461 if (t.expected_out_next < array_size(out))
462 assert(out[t.expected_out_next] == 0);
463
464 in[t.replace_pos] = old_char;
465 }
466}
467
// Runs all UTF-32 -> UTF-8 encoding checks (codecvt::out) on the given facet:
// fully successful conversions, conversions that stop with partial, and
// conversions of invalid input that stop with error.
template <class InternT, class ExternT>
void utf32_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf32_to_utf8_out_ok(cvt);
  utf32_to_utf8_out_partial(cvt);
  utf32_to_utf8_out_error(cvt);
}
474
// Exercises a UTF-8 <-> UTF-32 codecvt facet in both directions: decoding
// (in/length) first, then encoding (out).
template <class InternT, class ExternT>
void test_utf8_utf32_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf32_in(cvt);
  utf32_to_utf8_out(cvt);
}
480
481template <class InternT, class ExternT>
482void utf8_to_utf16_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
483 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
484 const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
485 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
486 static_assert(array_size(input) == 11, "");
487 static_assert(array_size(expected) == 6, "");
488
489 ExternT in[array_size(input)];
490 InternT exp[array_size(expected)];
491 copy(begin(arr: input), end(arr: input), begin(in));
492 copy(begin(arr: expected), end(arr: expected), begin(exp));
493 assert(char_traits<ExternT>::length(in) == 10);
494 assert(char_traits<InternT>::length(exp) == 5);
495
496 test_offsets_ok offsets[] = {{.in_size: 0, .out_size: 0}, {.in_size: 1, .out_size: 1}, {.in_size: 3, .out_size: 2}, {.in_size: 6, .out_size: 3}, {.in_size: 10, .out_size: 5}};
497 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
498 test_offsets_ok t = *it;
499 InternT out[array_size(exp) - 1] = {};
500 assert(t.in_size <= array_size(in));
501 assert(t.out_size <= array_size(out));
502 mbstate_t state = {};
503 const ExternT* in_next = nullptr;
504 InternT* out_next = nullptr;
505 codecvt_base::result res = codecvt_base::ok;
506
507 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
508 assert(res == cvt.ok);
509 assert(in_next == in + t.in_size);
510 assert(out_next == out + t.out_size);
511 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
512 if (t.out_size < array_size(out))
513 assert(out[t.out_size] == 0);
514
515 state = mbstate_t();
516 int len = cvt.length(state, in, in + t.in_size, t.out_size);
517 assert(len >= 0);
518 assert(static_cast<size_t>(len) == t.in_size);
519 }
520
521 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
522 test_offsets_ok t = *it;
523 InternT out[array_size(exp)] = {};
524 assert(t.in_size <= array_size(in));
525 assert(t.out_size <= array_size(out));
526 mbstate_t state = {};
527 const ExternT* in_next = nullptr;
528 InternT* out_next = nullptr;
529 codecvt_base::result res = codecvt_base::ok;
530
531 res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
532 assert(res == cvt.ok);
533 assert(in_next == in + t.in_size);
534 assert(out_next == out + t.out_size);
535 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
536 if (t.out_size < array_size(out))
537 assert(out[t.out_size] == 0);
538
539 state = mbstate_t();
540 int len = cvt.length(state, in, in + t.in_size, array_size(out));
541 assert(len >= 0);
542 assert(static_cast<size_t>(len) == t.in_size);
543 }
544}
545
546template <class InternT, class ExternT>
547void utf8_to_utf16_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
548 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
549 const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
550 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
551 static_assert(array_size(input) == 11, "");
552 static_assert(array_size(expected) == 6, "");
553
554 ExternT in[array_size(input)];
555 InternT exp[array_size(expected)];
556 copy(begin(arr: input), end(arr: input), begin(in));
557 copy(begin(arr: expected), end(arr: expected), begin(exp));
558 assert(char_traits<ExternT>::length(in) == 10);
559 assert(char_traits<InternT>::length(exp) == 5);
560
561 test_offsets_partial offsets[] = {
562 {.in_size: 1, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
563
564 {.in_size: 3, .out_size: 1, .expected_in_next: 1, .expected_out_next: 1}, // no space for second CP
565 {.in_size: 2, .out_size: 2, .expected_in_next: 1, .expected_out_next: 1}, // incomplete second CP
566 {.in_size: 2, .out_size: 1, .expected_in_next: 1, .expected_out_next: 1}, // incomplete second CP, and no space for it
567
568 {.in_size: 6, .out_size: 2, .expected_in_next: 3, .expected_out_next: 2}, // no space for third CP
569 {.in_size: 4, .out_size: 3, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP
570 {.in_size: 5, .out_size: 3, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP
571 {.in_size: 4, .out_size: 2, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP, and no space for it
572 {.in_size: 5, .out_size: 2, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP, and no space for it
573
574 {.in_size: 10, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // no space for fourth CP
575 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3}, // no space for fourth CP
576 {.in_size: 7, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP
577 {.in_size: 8, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP
578 {.in_size: 9, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP
579 {.in_size: 7, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
580 {.in_size: 8, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
581 {.in_size: 9, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
582 {.in_size: 7, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
583 {.in_size: 8, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
584 {.in_size: 9, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
585
586 };
587
588 for (test_offsets_partial* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
589 test_offsets_partial t = *it;
590 InternT out[array_size(exp) - 1] = {};
591 assert(t.in_size <= array_size(in));
592 assert(t.out_size <= array_size(out));
593 assert(t.expected_in_next <= t.in_size);
594 assert(t.expected_out_next <= t.out_size);
595 mbstate_t state = {};
596 const ExternT* in_next = nullptr;
597 InternT* out_next = nullptr;
598 codecvt_base::result res = codecvt_base::ok;
599
600 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
601 assert(res == cvt.partial);
602 assert(in_next == in + t.expected_in_next);
603 assert(out_next == out + t.expected_out_next);
604 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
605 if (t.expected_out_next < array_size(out))
606 assert(out[t.expected_out_next] == 0);
607
608 state = mbstate_t();
609 int len = cvt.length(state, in, in + t.in_size, t.out_size);
610 assert(len >= 0);
611 assert(static_cast<size_t>(len) == t.expected_in_next);
612 }
613}
614
615template <class InternT, class ExternT>
616void utf8_to_utf16_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
617 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
618 const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
619 const char16_t expected[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
620 static_assert(array_size(input) == 11, "");
621 static_assert(array_size(expected) == 6, "");
622
623 ExternT in[array_size(input)];
624 InternT exp[array_size(expected)];
625 copy(begin(arr: input), end(arr: input), begin(in));
626 copy(begin(arr: expected), end(arr: expected), begin(exp));
627 assert(char_traits<ExternT>::length(in) == 10);
628 assert(char_traits<InternT>::length(exp) == 5);
629
630 // There are 5 classes of errors in UTF-8 decoding
631 // 1. Missing leading byte
632 // 2. Missing trailing byte
633 // 3. Surrogate CP
634 // 4. Overlong sequence
635 // 5. CP out of Unicode range
636 test_offsets_error<unsigned char> offsets[] = {
637
638 // 1. Missing leading byte. We will replace the leading byte with
639 // non-leading byte, such as a byte that is always invalid or a trailing
640 // byte.
641
642 // replace leading byte with invalid byte
643 {.in_size: 1, .out_size: 5, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0xFF, .replace_pos: 0},
644 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0xFF, .replace_pos: 1},
645 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 3},
646 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 6},
647
648 // replace leading byte with trailing byte
649 {.in_size: 1, .out_size: 5, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0b10101010, .replace_pos: 0},
650 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0b10101010, .replace_pos: 1},
651 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10101010, .replace_pos: 3},
652 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b10101010, .replace_pos: 6},
653
654 // 2. Missing trailing byte. We will replace the trailing byte with
655 // non-trailing byte, such as a byte that is always invalid or a leading
656 // byte (simple ASCII byte in our case).
657
658 // replace first trailing byte with ASCII byte
659 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 'z', .replace_pos: 2},
660 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 'z', .replace_pos: 4},
661 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 7},
662
663 // replace first trailing byte with invalid byte
664 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0xFF, .replace_pos: 2},
665 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 4},
666 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 7},
667
668 // replace second trailing byte with ASCII byte
669 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 'z', .replace_pos: 5},
670 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 8},
671
672 // replace second trailing byte with invalid byte
673 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 5},
674 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 8},
675
676 // replace third trailing byte
677 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 9},
678 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 9},
679
680 // 2.1 The following test-cases raise doubt whether error or partial should
681 // be returned. For example, we have 4-byte sequence with valid leading
682 // byte. If we hide the last byte we need to return partial. But, if the
683 // second or third byte, which are visible to the call to codecvt, are
684 // malformed then error should be returned.
685
686 // replace first trailing byte with ASCII byte, also incomplete at end
687 {.in_size: 5, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 'z', .replace_pos: 4},
688 {.in_size: 8, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 7},
689 {.in_size: 9, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 7},
690
691 // replace first trailing byte with invalid byte, also incomplete at end
692 {.in_size: 5, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 4},
693 {.in_size: 8, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 7},
694 {.in_size: 9, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 7},
695
696 // replace second trailing byte with ASCII byte, also incomplete at end
697 {.in_size: 9, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 8},
698
699 // replace second trailing byte with invalid byte, also incomplete at end
700 {.in_size: 9, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 8},
701
702 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
703 // CP U+D700
704 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10100000, .replace_pos: 4}, // turn U+D700 into U+D800
705 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10101100, .replace_pos: 4}, // turn U+D700 into U+DB00
706 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10110000, .replace_pos: 4}, // turn U+D700 into U+DC00
707 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10111100, .replace_pos: 4}, // turn U+D700 into U+DF00
708
709 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
710 // just the leading byte is enough to make them overlong, i.e. for the
711 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
712 // zeroes.
713 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0b11000000, .replace_pos: 1}, // make the 2-byte CP overlong
714 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0b11000001, .replace_pos: 1}, // make the 2-byte CP overlong
715 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b11100000, .replace_pos: 3}, // make the 3-byte CP overlong
716 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b11110000, .replace_pos: 6}, // make the 4-byte CP overlong
717
718 // 5. CP above range
719 // turn U+10AAAA into U+14AAAA by changing its leading byte
720 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b11110101, .replace_pos: 6},
721 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
722 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b10011010, .replace_pos: 7},
723 };
724 for (test_offsets_error<unsigned char>* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
725 test_offsets_error<unsigned char> t = *it;
726 InternT out[array_size(exp) - 1] = {};
727 assert(t.in_size <= array_size(in));
728 assert(t.out_size <= array_size(out));
729 assert(t.expected_in_next <= t.in_size);
730 assert(t.expected_out_next <= t.out_size);
731 ExternT old_char = in[t.replace_pos];
732 in[t.replace_pos] = t.replace_char;
733
734 mbstate_t state = {};
735 const ExternT* in_next = nullptr;
736 InternT* out_next = nullptr;
737 codecvt_base::result res = codecvt_base::ok;
738
739 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
740 assert(res == cvt.error);
741 assert(in_next == in + t.expected_in_next);
742 assert(out_next == out + t.expected_out_next);
743 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
744 if (t.expected_out_next < array_size(out))
745 assert(out[t.expected_out_next] == 0);
746
747 state = mbstate_t();
748 int len = cvt.length(state, in, in + t.in_size, t.out_size);
749 assert(len >= 0);
750 assert(static_cast<size_t>(len) == t.expected_in_next);
751
752 in[t.replace_pos] = old_char;
753 }
754}
755
// Runs every UTF-8 -> UTF-16 decoding check (well-formed, partial and
// ill-formed input, in that order) against the supplied facet.
template <typename InternT, typename ExternT>
void utf8_to_utf16_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf16_in_ok(cvt);      // complete, valid input
  utf8_to_utf16_in_partial(cvt); // truncated input / too small output
  utf8_to_utf16_in_error(cvt);   // corrupted input
}
762
763template <class InternT, class ExternT>
764void utf16_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
765 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
766 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
767 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
768 static_assert(array_size(input) == 6, "");
769 static_assert(array_size(expected) == 11, "");
770
771 InternT in[array_size(input)];
772 ExternT exp[array_size(expected)];
773 copy(begin(arr: input), end(arr: input), begin(in));
774 copy(begin(arr: expected), end(arr: expected), begin(exp));
775 assert(char_traits<InternT>::length(in) == 5);
776 assert(char_traits<ExternT>::length(exp) == 10);
777
778 test_offsets_ok offsets[] = {{.in_size: 0, .out_size: 0}, {.in_size: 1, .out_size: 1}, {.in_size: 2, .out_size: 3}, {.in_size: 3, .out_size: 6}, {.in_size: 5, .out_size: 10}};
779 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
780 test_offsets_ok t = *it;
781 ExternT out[array_size(exp) - 1] = {};
782 assert(t.in_size <= array_size(in));
783 assert(t.out_size <= array_size(out));
784 mbstate_t state = {};
785 const InternT* in_next = nullptr;
786 ExternT* out_next = nullptr;
787 codecvt_base::result res = codecvt_base::ok;
788
789 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
790 assert(res == cvt.ok);
791 assert(in_next == in + t.in_size);
792 assert(out_next == out + t.out_size);
793 assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
794 if (t.out_size < array_size(out))
795 assert(out[t.out_size] == 0);
796 }
797}
798
799template <class InternT, class ExternT>
800void utf16_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
801 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
802 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
803 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
804 static_assert(array_size(input) == 6, "");
805 static_assert(array_size(expected) == 11, "");
806
807 InternT in[array_size(input)];
808 ExternT exp[array_size(expected)];
809 copy(begin(arr: input), end(arr: input), begin(in));
810 copy(begin(arr: expected), end(arr: expected), begin(exp));
811 assert(char_traits<InternT>::length(in) == 5);
812 assert(char_traits<ExternT>::length(exp) == 10);
813
814 test_offsets_partial offsets[] = {
815 {.in_size: 1, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
816
817 {.in_size: 2, .out_size: 1, .expected_in_next: 1, .expected_out_next: 1}, // no space for second CP
818 {.in_size: 2, .out_size: 2, .expected_in_next: 1, .expected_out_next: 1}, // no space for second CP
819
820 {.in_size: 3, .out_size: 3, .expected_in_next: 2, .expected_out_next: 3}, // no space for third CP
821 {.in_size: 3, .out_size: 4, .expected_in_next: 2, .expected_out_next: 3}, // no space for third CP
822 {.in_size: 3, .out_size: 5, .expected_in_next: 2, .expected_out_next: 3}, // no space for third CP
823
824 {.in_size: 5, .out_size: 6, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
825 {.in_size: 5, .out_size: 7, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
826 {.in_size: 5, .out_size: 8, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
827 {.in_size: 5, .out_size: 9, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
828
829 {.in_size: 4, .out_size: 10, .expected_in_next: 3, .expected_out_next: 6}, // incomplete fourth CP
830
831 {.in_size: 4, .out_size: 6, .expected_in_next: 3, .expected_out_next: 6}, // incomplete fourth CP, and no space for it
832 {.in_size: 4, .out_size: 7, .expected_in_next: 3, .expected_out_next: 6}, // incomplete fourth CP, and no space for it
833 {.in_size: 4, .out_size: 8, .expected_in_next: 3, .expected_out_next: 6}, // incomplete fourth CP, and no space for it
834 {.in_size: 4, .out_size: 9, .expected_in_next: 3, .expected_out_next: 6}, // incomplete fourth CP, and no space for it
835 };
836 for (test_offsets_partial* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
837 test_offsets_partial t = *it;
838 ExternT out[array_size(exp) - 1] = {};
839 assert(t.in_size <= array_size(in));
840 assert(t.out_size <= array_size(out));
841 assert(t.expected_in_next <= t.in_size);
842 assert(t.expected_out_next <= t.out_size);
843 mbstate_t state = {};
844 const InternT* in_next = nullptr;
845 ExternT* out_next = nullptr;
846 codecvt_base::result res = codecvt_base::ok;
847
848 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
849 assert(res == cvt.partial);
850 assert(in_next == in + t.expected_in_next);
851 assert(out_next == out + t.expected_out_next);
852 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
853 if (t.expected_out_next < array_size(out))
854 assert(out[t.expected_out_next] == 0);
855 }
856}
857
858template <class InternT, class ExternT>
859void utf16_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
860 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
861 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
862 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
863 static_assert(array_size(input) == 6, "");
864 static_assert(array_size(expected) == 11, "");
865
866 InternT in[array_size(input)];
867 ExternT exp[array_size(expected)];
868 copy(begin(arr: input), end(arr: input), begin(in));
869 copy(begin(arr: expected), end(arr: expected), begin(exp));
870 assert(char_traits<InternT>::length(in) == 5);
871 assert(char_traits<ExternT>::length(exp) == 10);
872
873 // The only possible error in UTF-16 is unpaired surrogate code units.
874 // So we replace valid code points (scalar values) with lone surrogate CU.
875 test_offsets_error<InternT> offsets[] = {
876 {5, 10, 0, 0, 0xD800, 0},
877 {5, 10, 0, 0, 0xDBFF, 0},
878 {5, 10, 0, 0, 0xDC00, 0},
879 {5, 10, 0, 0, 0xDFFF, 0},
880
881 {5, 10, 1, 1, 0xD800, 1},
882 {5, 10, 1, 1, 0xDBFF, 1},
883 {5, 10, 1, 1, 0xDC00, 1},
884 {5, 10, 1, 1, 0xDFFF, 1},
885
886 {5, 10, 2, 3, 0xD800, 2},
887 {5, 10, 2, 3, 0xDBFF, 2},
888 {5, 10, 2, 3, 0xDC00, 2},
889 {5, 10, 2, 3, 0xDFFF, 2},
890
891 // make the leading surrogate a trailing one
892 {5, 10, 3, 6, 0xDC00, 3},
893 {5, 10, 3, 6, 0xDFFF, 3},
894
895 // make the trailing surrogate a leading one
896 {5, 10, 3, 6, 0xD800, 4},
897 {5, 10, 3, 6, 0xDBFF, 4},
898
899 // make the trailing surrogate a BMP char
900 {5, 10, 3, 6, 'z', 4},
901 };
902
903 for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
904 test_offsets_error<InternT> t = *it;
905 ExternT out[array_size(exp) - 1] = {};
906 assert(t.in_size <= array_size(in));
907 assert(t.out_size <= array_size(out));
908 assert(t.expected_in_next <= t.in_size);
909 assert(t.expected_out_next <= t.out_size);
910 InternT old_char = in[t.replace_pos];
911 in[t.replace_pos] = t.replace_char;
912
913 mbstate_t state = {};
914 const InternT* in_next = nullptr;
915 ExternT* out_next = nullptr;
916 codecvt_base::result res = codecvt_base::ok;
917
918 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
919 assert(res == cvt.error);
920 assert(in_next == in + t.expected_in_next);
921 assert(out_next == out + t.expected_out_next);
922 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
923 if (t.expected_out_next < array_size(out))
924 assert(out[t.expected_out_next] == 0);
925
926 in[t.replace_pos] = old_char;
927 }
928}
929
// Runs every UTF-16 -> UTF-8 encoding check (well-formed, partial and
// ill-formed input, in that order) against the supplied facet.
template <typename InternT, typename ExternT>
void utf16_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf16_to_utf8_out_ok(cvt);      // complete, valid input
  utf16_to_utf8_out_partial(cvt); // truncated input / too small output
  utf16_to_utf8_out_error(cvt);   // unpaired surrogates
}
936
// Entry point for the UTF-8 <-> UTF-16 checks: exercises the decoding
// direction (in) first and the encoding direction (out) second.
template <typename InternT, typename ExternT>
void test_utf8_utf16_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf16_in(cvt);  // external (UTF-8) -> internal (UTF-16)
  utf16_to_utf8_out(cvt); // internal (UTF-16) -> external (UTF-8)
}
942
943template <class InternT, class ExternT>
944void utf8_to_ucs2_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
945 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
946 const unsigned char input[] = "b\u0448\uAAAA";
947 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
948 static_assert(array_size(input) == 7, "");
949 static_assert(array_size(expected) == 4, "");
950
951 ExternT in[array_size(input)];
952 InternT exp[array_size(expected)];
953 copy(begin(arr: input), end(arr: input), begin(in));
954 copy(begin(arr: expected), end(arr: expected), begin(exp));
955 assert(char_traits<ExternT>::length(in) == 6);
956 assert(char_traits<InternT>::length(exp) == 3);
957
958 test_offsets_ok offsets[] = {{.in_size: 0, .out_size: 0}, {.in_size: 1, .out_size: 1}, {.in_size: 3, .out_size: 2}, {.in_size: 6, .out_size: 3}};
959 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
960 test_offsets_ok t = *it;
961 InternT out[array_size(exp) - 1] = {};
962 assert(t.in_size <= array_size(in));
963 assert(t.out_size <= array_size(out));
964 mbstate_t state = {};
965 const ExternT* in_next = nullptr;
966 InternT* out_next = nullptr;
967 codecvt_base::result res = codecvt_base::ok;
968
969 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
970 assert(res == cvt.ok);
971 assert(in_next == in + t.in_size);
972 assert(out_next == out + t.out_size);
973 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
974 if (t.out_size < array_size(out))
975 assert(out[t.out_size] == 0);
976
977 state = mbstate_t();
978 int len = cvt.length(state, in, in + t.in_size, t.out_size);
979 assert(len >= 0);
980 assert(static_cast<size_t>(len) == t.in_size);
981 }
982
983 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
984 test_offsets_ok t = *it;
985 InternT out[array_size(exp)] = {};
986 assert(t.in_size <= array_size(in));
987 assert(t.out_size <= array_size(out));
988 mbstate_t state = {};
989 const ExternT* in_next = nullptr;
990 InternT* out_next = nullptr;
991 codecvt_base::result res = codecvt_base::ok;
992
993 res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
994 assert(res == cvt.ok);
995 assert(in_next == in + t.in_size);
996 assert(out_next == out + t.out_size);
997 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
998 if (t.out_size < array_size(out))
999 assert(out[t.out_size] == 0);
1000
1001 state = mbstate_t();
1002 int len = cvt.length(state, in, in + t.in_size, array_size(out));
1003 assert(len >= 0);
1004 assert(static_cast<size_t>(len) == t.in_size);
1005 }
1006}
1007
1008template <class InternT, class ExternT>
1009void utf8_to_ucs2_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1010 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1011 const unsigned char input[] = "b\u0448\uAAAA";
1012 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1013 static_assert(array_size(input) == 7, "");
1014 static_assert(array_size(expected) == 4, "");
1015
1016 ExternT in[array_size(input)];
1017 InternT exp[array_size(expected)];
1018 copy(begin(arr: input), end(arr: input), begin(in));
1019 copy(begin(arr: expected), end(arr: expected), begin(exp));
1020 assert(char_traits<ExternT>::length(in) == 6);
1021 assert(char_traits<InternT>::length(exp) == 3);
1022
1023 test_offsets_partial offsets[] = {
1024 {.in_size: 1, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
1025
1026 {.in_size: 3, .out_size: 1, .expected_in_next: 1, .expected_out_next: 1}, // no space for second CP
1027 {.in_size: 2, .out_size: 2, .expected_in_next: 1, .expected_out_next: 1}, // incomplete second CP
1028 {.in_size: 2, .out_size: 1, .expected_in_next: 1, .expected_out_next: 1}, // incomplete second CP, and no space for it
1029
1030 {.in_size: 6, .out_size: 2, .expected_in_next: 3, .expected_out_next: 2}, // no space for third CP
1031 {.in_size: 4, .out_size: 3, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP
1032 {.in_size: 5, .out_size: 3, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP
1033 {.in_size: 4, .out_size: 2, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP, and no space for it
1034 {.in_size: 5, .out_size: 2, .expected_in_next: 3, .expected_out_next: 2}, // incomplete third CP, and no space for it
1035 };
1036
1037 for (test_offsets_partial* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1038 test_offsets_partial t = *it;
1039 InternT out[array_size(exp) - 1] = {};
1040 assert(t.in_size <= array_size(in));
1041 assert(t.out_size <= array_size(out));
1042 assert(t.expected_in_next <= t.in_size);
1043 assert(t.expected_out_next <= t.out_size);
1044 mbstate_t state = {};
1045 const ExternT* in_next = nullptr;
1046 InternT* out_next = nullptr;
1047 codecvt_base::result res = codecvt_base::ok;
1048
1049 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1050 assert(res == cvt.partial);
1051 assert(in_next == in + t.expected_in_next);
1052 assert(out_next == out + t.expected_out_next);
1053 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1054 if (t.expected_out_next < array_size(out))
1055 assert(out[t.expected_out_next] == 0);
1056
1057 state = mbstate_t();
1058 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1059 assert(len >= 0);
1060 assert(static_cast<size_t>(len) == t.expected_in_next);
1061 }
1062}
1063
1064template <class InternT, class ExternT>
1065void utf8_to_ucs2_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1066 const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
1067 const char16_t expected[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
1068 static_assert(array_size(input) == 11, "");
1069 static_assert(array_size(expected) == 6, "");
1070
1071 ExternT in[array_size(input)];
1072 InternT exp[array_size(expected)];
1073 copy(begin(arr: input), end(arr: input), begin(in));
1074 copy(begin(arr: expected), end(arr: expected), begin(exp));
1075 assert(char_traits<ExternT>::length(in) == 10);
1076 assert(char_traits<InternT>::length(exp) == 5);
1077
1078 // There are 5 classes of errors in UTF-8 decoding
1079 // 1. Missing leading byte
1080 // 2. Missing trailing byte
1081 // 3. Surrogate CP
1082 // 4. Overlong sequence
1083 // 5. CP out of Unicode range
1084 test_offsets_error<unsigned char> offsets[] = {
1085
1086 // 1. Missing leading byte. We will replace the leading byte with
1087 // non-leading byte, such as a byte that is always invalid or a trailing
1088 // byte.
1089
1090 // replace leading byte with invalid byte
1091 {.in_size: 1, .out_size: 5, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0xFF, .replace_pos: 0},
1092 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0xFF, .replace_pos: 1},
1093 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 3},
1094 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 6},
1095
1096 // replace leading byte with trailing byte
1097 {.in_size: 1, .out_size: 5, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0b10101010, .replace_pos: 0},
1098 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0b10101010, .replace_pos: 1},
1099 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10101010, .replace_pos: 3},
1100 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b10101010, .replace_pos: 6},
1101
1102 // 2. Missing trailing byte. We will replace the trailing byte with
1103 // non-trailing byte, such as a byte that is always invalid or a leading
1104 // byte (simple ASCII byte in our case).
1105
1106 // replace first trailing byte with ASCII byte
1107 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 'z', .replace_pos: 2},
1108 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 'z', .replace_pos: 4},
1109 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 7},
1110
1111 // replace first trailing byte with invalid byte
1112 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0xFF, .replace_pos: 2},
1113 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 4},
1114 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 7},
1115
1116 // replace second trailing byte with ASCII byte
1117 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 'z', .replace_pos: 5},
1118 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 8},
1119
1120 // replace second trailing byte with invalid byte
1121 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 5},
1122 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 8},
1123
1124 // replace third trailing byte
1125 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 9},
1126 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 9},
1127
1128 // 2.1 The following test-cases raise doubt whether error or partial should
1129 // be returned. For example, we have 4-byte sequence with valid leading
1130 // byte. If we hide the last byte we need to return partial. But, if the
1131 // second or third byte, which are visible to the call to codecvt, are
1132 // malformed then error should be returned.
1133
1134 // replace first trailing byte with ASCII byte, also incomplete at end
1135 {.in_size: 5, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 'z', .replace_pos: 4},
1136 {.in_size: 8, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 7},
1137 {.in_size: 9, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 7},
1138
1139 // replace first trailing byte with invalid byte, also incomplete at end
1140 {.in_size: 5, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0xFF, .replace_pos: 4},
1141 {.in_size: 8, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 7},
1142 {.in_size: 9, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 7},
1143
1144 // replace second trailing byte with ASCII byte, also incomplete at end
1145 {.in_size: 9, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 8},
1146
1147 // replace second trailing byte with invalid byte, also incomplete at end
1148 {.in_size: 9, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xFF, .replace_pos: 8},
1149
1150 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
1151 // CP U+D700
1152 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10100000, .replace_pos: 4}, // turn U+D700 into U+D800
1153 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10101100, .replace_pos: 4}, // turn U+D700 into U+DB00
1154 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10110000, .replace_pos: 4}, // turn U+D700 into U+DC00
1155 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b10111100, .replace_pos: 4}, // turn U+D700 into U+DF00
1156
1157 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
1158 // just the leading byte is enough to make them overlong, i.e. for the
1159 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
1160 // zeroes.
1161 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0b11000000, .replace_pos: 1}, // make the 2-byte CP overlong
1162 {.in_size: 3, .out_size: 5, .expected_in_next: 1, .expected_out_next: 1, .replace_char: 0b11000001, .replace_pos: 1}, // make the 2-byte CP overlong
1163 {.in_size: 6, .out_size: 5, .expected_in_next: 3, .expected_out_next: 2, .replace_char: 0b11100000, .replace_pos: 3}, // make the 3-byte CP overlong
1164 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b11110000, .replace_pos: 6}, // make the 4-byte CP overlong
1165
1166 // 5. CP above range
1167 // turn U+10AAAA into U+14AAAA by changing its leading byte
1168 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b11110101, .replace_pos: 6},
1169 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
1170 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0b10011010, .replace_pos: 7},
1171 // Don't replace anything, show full 4-byte CP U+10AAAA
1172 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0},
1173 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0},
1174 // Don't replace anything, show incomplete 4-byte CP at the end. It's still
1175 // out of UCS2 range just by seeing the first byte.
1176 {.in_size: 7, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0}, // incomplete fourth CP
1177 {.in_size: 8, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0}, // incomplete fourth CP
1178 {.in_size: 9, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0}, // incomplete fourth CP
1179 {.in_size: 7, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0}, // incomplete fourth CP
1180 {.in_size: 8, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0}, // incomplete fourth CP
1181 {.in_size: 9, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0}, // incomplete fourth CP
1182 };
1183 for (test_offsets_error<unsigned char>* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1184 test_offsets_error<unsigned char> t = *it;
1185 InternT out[array_size(exp) - 1] = {};
1186 assert(t.in_size <= array_size(in));
1187 assert(t.out_size <= array_size(out));
1188 assert(t.expected_in_next <= t.in_size);
1189 assert(t.expected_out_next <= t.out_size);
1190 ExternT old_char = in[t.replace_pos];
1191 in[t.replace_pos] = t.replace_char;
1192
1193 mbstate_t state = {};
1194 const ExternT* in_next = nullptr;
1195 InternT* out_next = nullptr;
1196 codecvt_base::result res = codecvt_base::ok;
1197
1198 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1199 assert(res == cvt.error);
1200 assert(in_next == in + t.expected_in_next);
1201 assert(out_next == out + t.expected_out_next);
1202 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1203 if (t.expected_out_next < array_size(out))
1204 assert(out[t.expected_out_next] == 0);
1205
1206 state = mbstate_t();
1207 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1208 assert(len >= 0);
1209 assert(static_cast<size_t>(len) == t.expected_in_next);
1210
1211 in[t.replace_pos] = old_char;
1212 }
1213}
1214
// Runs every UTF-8 -> UCS-2 decoding check (well-formed, partial and
// erroneous input, in that order) against the supplied facet.
template <typename InternT, typename ExternT>
void utf8_to_ucs2_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in_ok(cvt);      // complete, valid input
  utf8_to_ucs2_in_partial(cvt); // truncated input / too small output
  utf8_to_ucs2_in_error(cvt);   // corrupted or out-of-range input
}
1221
1222template <class InternT, class ExternT>
1223void ucs2_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1224 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1225 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1226 const unsigned char expected[] = "b\u0448\uAAAA";
1227 static_assert(array_size(input) == 4, "");
1228 static_assert(array_size(expected) == 7, "");
1229
1230 InternT in[array_size(input)];
1231 ExternT exp[array_size(expected)];
1232 copy(begin(arr: input), end(arr: input), begin(in));
1233 copy(begin(arr: expected), end(arr: expected), begin(exp));
1234 assert(char_traits<InternT>::length(in) == 3);
1235 assert(char_traits<ExternT>::length(exp) == 6);
1236
1237 test_offsets_ok offsets[] = {{.in_size: 0, .out_size: 0}, {.in_size: 1, .out_size: 1}, {.in_size: 2, .out_size: 3}, {.in_size: 3, .out_size: 6}};
1238 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1239 test_offsets_ok t = *it;
1240 ExternT out[array_size(exp) - 1] = {};
1241 assert(t.in_size <= array_size(in));
1242 assert(t.out_size <= array_size(out));
1243 mbstate_t state = {};
1244 const InternT* in_next = nullptr;
1245 ExternT* out_next = nullptr;
1246 codecvt_base::result res = codecvt_base::ok;
1247
1248 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1249 assert(res == cvt.ok);
1250 assert(in_next == in + t.in_size);
1251 assert(out_next == out + t.out_size);
1252 assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
1253 if (t.out_size < array_size(out))
1254 assert(out[t.out_size] == 0);
1255 }
1256}
1257
1258template <class InternT, class ExternT>
1259void ucs2_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1260 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1261 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1262 const unsigned char expected[] = "b\u0448\uAAAA";
1263 static_assert(array_size(input) == 4, "");
1264 static_assert(array_size(expected) == 7, "");
1265
1266 InternT in[array_size(input)];
1267 ExternT exp[array_size(expected)];
1268 copy(begin(arr: input), end(arr: input), begin(in));
1269 copy(begin(arr: expected), end(arr: expected), begin(exp));
1270 assert(char_traits<InternT>::length(in) == 3);
1271 assert(char_traits<ExternT>::length(exp) == 6);
1272
1273 test_offsets_partial offsets[] = {
1274 {.in_size: 1, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
1275
1276 {.in_size: 2, .out_size: 1, .expected_in_next: 1, .expected_out_next: 1}, // no space for second CP
1277 {.in_size: 2, .out_size: 2, .expected_in_next: 1, .expected_out_next: 1}, // no space for second CP
1278
1279 {.in_size: 3, .out_size: 3, .expected_in_next: 2, .expected_out_next: 3}, // no space for third CP
1280 {.in_size: 3, .out_size: 4, .expected_in_next: 2, .expected_out_next: 3}, // no space for third CP
1281 {.in_size: 3, .out_size: 5, .expected_in_next: 2, .expected_out_next: 3}, // no space for third CP
1282 };
1283 for (test_offsets_partial* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1284 test_offsets_partial t = *it;
1285 ExternT out[array_size(exp) - 1] = {};
1286 assert(t.in_size <= array_size(in));
1287 assert(t.out_size <= array_size(out));
1288 assert(t.expected_in_next <= t.in_size);
1289 assert(t.expected_out_next <= t.out_size);
1290 mbstate_t state = {};
1291 const InternT* in_next = nullptr;
1292 ExternT* out_next = nullptr;
1293 codecvt_base::result res = codecvt_base::ok;
1294
1295 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1296 assert(res == cvt.partial);
1297 assert(in_next == in + t.expected_in_next);
1298 assert(out_next == out + t.expected_out_next);
1299 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
1300 if (t.expected_out_next < array_size(out))
1301 assert(out[t.expected_out_next] == 0);
1302 }
1303}
1304
1305template <class InternT, class ExternT>
1306void ucs2_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1307 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1308 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
1309 static_assert(array_size(input) == 6, "");
1310 static_assert(array_size(expected) == 11, "");
1311
1312 InternT in[array_size(input)];
1313 ExternT exp[array_size(expected)];
1314 copy(begin(arr: input), end(arr: input), begin(in));
1315 copy(begin(arr: expected), end(arr: expected), begin(exp));
1316 assert(char_traits<InternT>::length(in) == 5);
1317 assert(char_traits<ExternT>::length(exp) == 10);
1318
1319 test_offsets_error<InternT> offsets[] = {
1320 {3, 6, 0, 0, 0xD800, 0},
1321 {3, 6, 0, 0, 0xDBFF, 0},
1322 {3, 6, 0, 0, 0xDC00, 0},
1323 {3, 6, 0, 0, 0xDFFF, 0},
1324
1325 {3, 6, 1, 1, 0xD800, 1},
1326 {3, 6, 1, 1, 0xDBFF, 1},
1327 {3, 6, 1, 1, 0xDC00, 1},
1328 {3, 6, 1, 1, 0xDFFF, 1},
1329
1330 {3, 6, 2, 3, 0xD800, 2},
1331 {3, 6, 2, 3, 0xDBFF, 2},
1332 {3, 6, 2, 3, 0xDC00, 2},
1333 {3, 6, 2, 3, 0xDFFF, 2},
1334
1335 // make the leading surrogate a trailing one
1336 {5, 10, 3, 6, 0xDC00, 3},
1337 {5, 10, 3, 6, 0xDFFF, 3},
1338
1339 // make the trailing surrogate a leading one
1340 {5, 10, 3, 6, 0xD800, 4},
1341 {5, 10, 3, 6, 0xDBFF, 4},
1342
1343 // make the trailing surrogate a BMP char
1344 {5, 10, 3, 6, 'z', 4},
1345
1346 // don't replace anything in the test cases bellow, just show the surrogate
1347 // pair (fourth CP) fully or partially
1348 {5, 10, 3, 6, 'b', 0},
1349 {5, 7, 3, 6, 'b', 0}, // no space for fourth CP
1350 {5, 8, 3, 6, 'b', 0}, // no space for fourth CP
1351 {5, 9, 3, 6, 'b', 0}, // no space for fourth CP
1352
1353 {4, 10, 3, 6, 'b', 0}, // incomplete fourth CP
1354 {4, 7, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1355 {4, 8, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1356 {4, 9, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1357 };
1358
1359 for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
1360 test_offsets_error<InternT> t = *it;
1361 ExternT out[array_size(exp) - 1] = {};
1362 assert(t.in_size <= array_size(in));
1363 assert(t.out_size <= array_size(out));
1364 assert(t.expected_in_next <= t.in_size);
1365 assert(t.expected_out_next <= t.out_size);
1366 InternT old_char = in[t.replace_pos];
1367 in[t.replace_pos] = t.replace_char;
1368
1369 mbstate_t state = {};
1370 const InternT* in_next = nullptr;
1371 ExternT* out_next = nullptr;
1372 codecvt_base::result res = codecvt_base::ok;
1373
1374 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1375 assert(res == cvt.error);
1376 assert(in_next == in + t.expected_in_next);
1377 assert(out_next == out + t.expected_out_next);
1378 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
1379 if (t.expected_out_next < array_size(out))
1380 assert(out[t.expected_out_next] == 0);
1381
1382 in[t.replace_pos] = old_char;
1383 }
1384}
1385
// Driver for the UCS-2 -> UTF-8 direction. Invokes, in order, the "ok",
// "partial" and "error" test groups on the facet under test.
template <class InternT, class ExternT>
void ucs2_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  ucs2_to_utf8_out_ok(cvt);
  ucs2_to_utf8_out_partial(cvt);
  ucs2_to_utf8_out_error(cvt);
}
1392
// Runs the complete UTF-8 <-> UCS-2 suite on one facet: first the in()
// direction (UTF-8 to UCS-2), then the out() direction (UCS-2 to UTF-8).
template <class InternT, class ExternT>
void test_utf8_ucs2_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in(cvt);
  ucs2_to_utf8_out(cvt);
}
1398
// Byte order used when serializing UTF-16 code units into a byte sequence.
enum utf16_endianess { utf16_big_endian, utf16_little_endian };

// Serializes the code units in [f, l) as two bytes each, written through o.
// With utf16_big_endian the high byte of every unit is written first; for any
// other value of e the low byte is written first. Returns the output iterator
// advanced past the last byte written.
template <class Iter1, class Iter2>
Iter2 utf16_to_bytes(Iter1 f, Iter1 l, Iter2 o, utf16_endianess e) {
  const bool high_byte_first = (e == utf16_big_endian);
  while (f != l) {
    const auto high = (*f >> 8) & 0xFF;
    const auto low = *f & 0xFF;
    *o++ = high_byte_first ? high : low;
    *o++ = high_byte_first ? low : high;
    ++f;
  }
  return o;
}
1415
1416template <class InternT>
1417void utf16_to_utf32_in_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1418 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1419 const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1420 static_assert(array_size(input) == 6, "");
1421 static_assert(array_size(expected) == 5, "");
1422
1423 char in[array_size(input) * 2];
1424 InternT exp[array_size(expected)];
1425 utf16_to_bytes(f: begin(arr: input), l: end(arr: input), o: begin(arr&: in), e: endianess);
1426 copy(begin(arr: expected), end(arr: expected), begin(exp));
1427
1428 test_offsets_ok offsets[] = {{.in_size: 0, .out_size: 0}, {.in_size: 2, .out_size: 1}, {.in_size: 4, .out_size: 2}, {.in_size: 6, .out_size: 3}, {.in_size: 10, .out_size: 4}};
1429 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1430 test_offsets_ok t = *it;
1431 InternT out[array_size(exp) - 1] = {};
1432 assert(t.in_size <= array_size(in));
1433 assert(t.out_size <= array_size(out));
1434 mbstate_t state = {};
1435 const char* in_next = nullptr;
1436 InternT* out_next = nullptr;
1437 codecvt_base::result res = codecvt_base::ok;
1438
1439 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1440 assert(res == cvt.ok);
1441 assert(in_next == in + t.in_size);
1442 assert(out_next == out + t.out_size);
1443 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1444 if (t.out_size < array_size(out))
1445 assert(out[t.out_size] == 0);
1446
1447 state = mbstate_t();
1448 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1449 assert(len >= 0);
1450 assert(static_cast<size_t>(len) == t.in_size);
1451 }
1452
1453 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1454 test_offsets_ok t = *it;
1455 InternT out[array_size(exp)] = {};
1456 assert(t.in_size <= array_size(in));
1457 assert(t.out_size <= array_size(out));
1458 mbstate_t state = {};
1459 const char* in_next = nullptr;
1460 InternT* out_next = nullptr;
1461 codecvt_base::result res = codecvt_base::ok;
1462
1463 res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
1464 assert(res == cvt.ok);
1465 assert(in_next == in + t.in_size);
1466 assert(out_next == out + t.out_size);
1467 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1468 if (t.out_size < array_size(out))
1469 assert(out[t.out_size] == 0);
1470
1471 state = mbstate_t();
1472 int len = cvt.length(state, in, in + t.in_size, array_size(out));
1473 assert(len >= 0);
1474 assert(static_cast<size_t>(len) == t.in_size);
1475 }
1476}
1477
1478template <class InternT>
1479void utf16_to_utf32_in_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1480 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1481 const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1482 static_assert(array_size(input) == 6, "");
1483 static_assert(array_size(expected) == 5, "");
1484
1485 char in[array_size(input) * 2];
1486 InternT exp[array_size(expected)];
1487 utf16_to_bytes(f: begin(arr: input), l: end(arr: input), o: begin(arr&: in), e: endianess);
1488 copy(begin(arr: expected), end(arr: expected), begin(exp));
1489
1490 test_offsets_partial offsets[] = {
1491 {.in_size: 2, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
1492 {.in_size: 1, .out_size: 1, .expected_in_next: 0, .expected_out_next: 0}, // incomplete first CP
1493 {.in_size: 1, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // incomplete first CP, and no space for it
1494
1495 {.in_size: 4, .out_size: 1, .expected_in_next: 2, .expected_out_next: 1}, // no space for second CP
1496 {.in_size: 3, .out_size: 2, .expected_in_next: 2, .expected_out_next: 1}, // incomplete second CP
1497 {.in_size: 3, .out_size: 1, .expected_in_next: 2, .expected_out_next: 1}, // incomplete second CP, and no space for it
1498
1499 {.in_size: 6, .out_size: 2, .expected_in_next: 4, .expected_out_next: 2}, // no space for third CP
1500 {.in_size: 5, .out_size: 3, .expected_in_next: 4, .expected_out_next: 2}, // incomplete third CP
1501 {.in_size: 5, .out_size: 2, .expected_in_next: 4, .expected_out_next: 2}, // incomplete third CP, and no space for it
1502
1503 {.in_size: 10, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // no space for fourth CP
1504 {.in_size: 7, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP
1505 {.in_size: 8, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP
1506 {.in_size: 9, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP
1507 {.in_size: 7, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
1508 {.in_size: 8, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
1509 {.in_size: 9, .out_size: 3, .expected_in_next: 6, .expected_out_next: 3}, // incomplete fourth CP, and no space for it
1510 };
1511
1512 for (test_offsets_partial* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1513 test_offsets_partial t = *it;
1514 InternT out[array_size(exp) - 1] = {};
1515 assert(t.in_size <= array_size(in));
1516 assert(t.out_size <= array_size(out));
1517 assert(t.expected_in_next <= t.in_size);
1518 assert(t.expected_out_next <= t.out_size);
1519 mbstate_t state = {};
1520 const char* in_next = nullptr;
1521 InternT* out_next = nullptr;
1522 codecvt_base::result res = codecvt_base::ok;
1523
1524 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1525 assert(res == cvt.partial);
1526 assert(in_next == in + t.expected_in_next);
1527 assert(out_next == out + t.expected_out_next);
1528 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1529 if (t.expected_out_next < array_size(out))
1530 assert(out[t.expected_out_next] == 0);
1531
1532 state = mbstate_t();
1533 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1534 assert(len >= 0);
1535 assert(static_cast<size_t>(len) == t.expected_in_next);
1536 }
1537}
1538
1539template <class InternT>
1540void utf16_to_utf32_in_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1541 char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1542 const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1543 static_assert(array_size(input) == 6, "");
1544 static_assert(array_size(expected) == 5, "");
1545
1546 InternT exp[array_size(expected)];
1547 copy(begin(arr: expected), end(arr: expected), begin(exp));
1548
1549 // The only possible error in UTF-16 is unpaired surrogate code units.
1550 // So we replace valid code points (scalar values) with lone surrogate CU.
1551 test_offsets_error<char16_t> offsets[] = {
1552 {.in_size: 10, .out_size: 4, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0xD800, .replace_pos: 0},
1553 {.in_size: 10, .out_size: 4, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0xDBFF, .replace_pos: 0},
1554 {.in_size: 10, .out_size: 4, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0xDC00, .replace_pos: 0},
1555 {.in_size: 10, .out_size: 4, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0xDFFF, .replace_pos: 0},
1556
1557 {.in_size: 10, .out_size: 4, .expected_in_next: 2, .expected_out_next: 1, .replace_char: 0xD800, .replace_pos: 1},
1558 {.in_size: 10, .out_size: 4, .expected_in_next: 2, .expected_out_next: 1, .replace_char: 0xDBFF, .replace_pos: 1},
1559 {.in_size: 10, .out_size: 4, .expected_in_next: 2, .expected_out_next: 1, .replace_char: 0xDC00, .replace_pos: 1},
1560 {.in_size: 10, .out_size: 4, .expected_in_next: 2, .expected_out_next: 1, .replace_char: 0xDFFF, .replace_pos: 1},
1561
1562 {.in_size: 10, .out_size: 4, .expected_in_next: 4, .expected_out_next: 2, .replace_char: 0xD800, .replace_pos: 2},
1563 {.in_size: 10, .out_size: 4, .expected_in_next: 4, .expected_out_next: 2, .replace_char: 0xDBFF, .replace_pos: 2},
1564 {.in_size: 10, .out_size: 4, .expected_in_next: 4, .expected_out_next: 2, .replace_char: 0xDC00, .replace_pos: 2},
1565 {.in_size: 10, .out_size: 4, .expected_in_next: 4, .expected_out_next: 2, .replace_char: 0xDFFF, .replace_pos: 2},
1566
1567 // make the leading surrogate a trailing one
1568 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xDC00, .replace_pos: 3},
1569 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xDFFF, .replace_pos: 3},
1570
1571 // make the trailing surrogate a leading one
1572 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xD800, .replace_pos: 4},
1573 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xDBFF, .replace_pos: 4},
1574
1575 // make the trailing surrogate a BMP char
1576 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 4},
1577 };
1578
1579 for (test_offsets_error<char16_t>* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1580 test_offsets_error<char16_t> t = *it;
1581 char in[array_size(input) * 2];
1582 InternT out[array_size(exp) - 1] = {};
1583 assert(t.in_size <= array_size(in));
1584 assert(t.out_size <= array_size(out));
1585 assert(t.expected_in_next <= t.in_size);
1586 assert(t.expected_out_next <= t.out_size);
1587 char16_t old_char = input[t.replace_pos];
1588 input[t.replace_pos] = t.replace_char; // replace in input, not in in
1589 utf16_to_bytes(f: begin(arr&: input), l: end(arr&: input), o: begin(arr&: in), e: endianess);
1590
1591 mbstate_t state = {};
1592 const char* in_next = nullptr;
1593 InternT* out_next = nullptr;
1594 codecvt_base::result res = codecvt_base::ok;
1595
1596 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1597 assert(res == cvt.error);
1598 assert(in_next == in + t.expected_in_next);
1599 assert(out_next == out + t.expected_out_next);
1600 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1601 if (t.expected_out_next < array_size(out))
1602 assert(out[t.expected_out_next] == 0);
1603
1604 state = mbstate_t();
1605 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1606 assert(len >= 0);
1607 assert(static_cast<size_t>(len) == t.expected_in_next);
1608
1609 input[t.replace_pos] = old_char;
1610 }
1611}
1612
1613template <class InternT>
1614void utf32_to_utf16_out_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1615 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1616 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1617 static_assert(array_size(input) == 5, "");
1618 static_assert(array_size(expected) == 6, "");
1619
1620 InternT in[array_size(input)];
1621 char exp[array_size(expected) * 2];
1622 copy(begin(arr: input), end(arr: input), begin(in));
1623 utf16_to_bytes(f: begin(arr: expected), l: end(arr: expected), o: begin(arr&: exp), e: endianess);
1624
1625 test_offsets_ok offsets[] = {{.in_size: 0, .out_size: 0}, {.in_size: 1, .out_size: 2}, {.in_size: 2, .out_size: 4}, {.in_size: 3, .out_size: 6}, {.in_size: 4, .out_size: 10}};
1626 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1627 test_offsets_ok t = *it;
1628 char out[array_size(exp) - 2] = {};
1629 assert(t.in_size <= array_size(in));
1630 assert(t.out_size <= array_size(out));
1631 mbstate_t state = {};
1632 const InternT* in_next = nullptr;
1633 char* out_next = nullptr;
1634 codecvt_base::result res = codecvt_base::ok;
1635
1636 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1637 assert(res == cvt.ok);
1638 assert(in_next == in + t.in_size);
1639 assert(out_next == out + t.out_size);
1640 assert(char_traits<char>::compare(out, exp, t.out_size) == 0);
1641 if (t.out_size < array_size(out))
1642 assert(out[t.out_size] == 0);
1643 }
1644}
1645
1646template <class InternT>
1647void utf32_to_utf16_out_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1648 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1649 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1650 static_assert(array_size(input) == 5, "");
1651 static_assert(array_size(expected) == 6, "");
1652
1653 InternT in[array_size(input)];
1654 char exp[array_size(expected) * 2];
1655 copy(begin(arr: input), end(arr: input), begin(in));
1656 utf16_to_bytes(f: begin(arr: expected), l: end(arr: expected), o: begin(arr&: exp), e: endianess);
1657
1658 test_offsets_partial offsets[] = {
1659 {.in_size: 1, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
1660 {.in_size: 1, .out_size: 1, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
1661
1662 {.in_size: 2, .out_size: 2, .expected_in_next: 1, .expected_out_next: 2}, // no space for second CP
1663 {.in_size: 2, .out_size: 3, .expected_in_next: 1, .expected_out_next: 2}, // no space for second CP
1664
1665 {.in_size: 3, .out_size: 4, .expected_in_next: 2, .expected_out_next: 4}, // no space for third CP
1666 {.in_size: 3, .out_size: 5, .expected_in_next: 2, .expected_out_next: 4}, // no space for third CP
1667
1668 {.in_size: 4, .out_size: 6, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
1669 {.in_size: 4, .out_size: 7, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
1670 {.in_size: 4, .out_size: 8, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
1671 {.in_size: 4, .out_size: 9, .expected_in_next: 3, .expected_out_next: 6}, // no space for fourth CP
1672 };
1673 for (test_offsets_partial* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1674 test_offsets_partial t = *it;
1675 char out[array_size(exp) - 2] = {};
1676 assert(t.in_size <= array_size(in));
1677 assert(t.out_size <= array_size(out));
1678 assert(t.expected_in_next <= t.in_size);
1679 assert(t.expected_out_next <= t.out_size);
1680 mbstate_t state = {};
1681 const InternT* in_next = nullptr;
1682 char* out_next = nullptr;
1683 codecvt_base::result res = codecvt_base::ok;
1684
1685 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1686 assert(res == cvt.partial);
1687 assert(in_next == in + t.expected_in_next);
1688 assert(out_next == out + t.expected_out_next);
1689 assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
1690 if (t.expected_out_next < array_size(out))
1691 assert(out[t.expected_out_next] == 0);
1692 }
1693}
1694
1695template <class InternT>
1696void utf32_to_utf16_out_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1697 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1698 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1699 static_assert(array_size(input) == 5, "");
1700 static_assert(array_size(expected) == 6, "");
1701
1702 InternT in[array_size(input)];
1703 char exp[array_size(expected) * 2];
1704 copy(begin(arr: input), end(arr: input), begin(in));
1705 utf16_to_bytes(f: begin(arr: expected), l: end(arr: expected), o: begin(arr&: exp), e: endianess);
1706
1707 test_offsets_error<InternT> offsets[] = {
1708
1709 // Surrogate CP
1710 {4, 10, 0, 0, 0xD800, 0},
1711 {4, 10, 1, 2, 0xDBFF, 1},
1712 {4, 10, 2, 4, 0xDC00, 2},
1713 {4, 10, 3, 6, 0xDFFF, 3},
1714
1715 // CP out of range
1716 {4, 10, 0, 0, 0x00110000, 0},
1717 {4, 10, 1, 2, 0x00110000, 1},
1718 {4, 10, 2, 4, 0x00110000, 2},
1719 {4, 10, 3, 6, 0x00110000, 3}};
1720
1721 for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
1722 test_offsets_error<InternT> t = *it;
1723 char out[array_size(exp) - 2] = {};
1724 assert(t.in_size <= array_size(in));
1725 assert(t.out_size <= array_size(out));
1726 assert(t.expected_in_next <= t.in_size);
1727 assert(t.expected_out_next <= t.out_size);
1728 InternT old_char = in[t.replace_pos];
1729 in[t.replace_pos] = t.replace_char;
1730
1731 mbstate_t state = {};
1732 const InternT* in_next = nullptr;
1733 char* out_next = nullptr;
1734 codecvt_base::result res = codecvt_base::ok;
1735
1736 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1737 assert(res == cvt.error);
1738 assert(in_next == in + t.expected_in_next);
1739 assert(out_next == out + t.expected_out_next);
1740 assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
1741 if (t.expected_out_next < array_size(out))
1742 assert(out[t.expected_out_next] == 0);
1743
1744 in[t.replace_pos] = old_char;
1745 }
1746}
1747
1748template <class InternT>
1749void test_utf16_utf32_cvt(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1750 utf16_to_utf32_in_ok(cvt, endianess);
1751 utf16_to_utf32_in_partial(cvt, endianess);
1752 utf16_to_utf32_in_error(cvt, endianess);
1753 utf32_to_utf16_out_ok(cvt, endianess);
1754 utf32_to_utf16_out_partial(cvt, endianess);
1755 utf32_to_utf16_out_error(cvt, endianess);
1756}
1757
1758template <class InternT>
1759void utf16_to_ucs2_in_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1760 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1761 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1762 static_assert(array_size(input) == 4, "");
1763 static_assert(array_size(expected) == 4, "");
1764
1765 char in[array_size(input) * 2];
1766 InternT exp[array_size(expected)];
1767 utf16_to_bytes(f: begin(arr: input), l: end(arr: input), o: begin(arr&: in), e: endianess);
1768 copy(begin(arr: expected), end(arr: expected), begin(exp));
1769
1770 test_offsets_ok offsets[] = {{.in_size: 0, .out_size: 0}, {.in_size: 2, .out_size: 1}, {.in_size: 4, .out_size: 2}, {.in_size: 6, .out_size: 3}};
1771 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1772 test_offsets_ok t = *it;
1773 InternT out[array_size(exp) - 1] = {};
1774 assert(t.in_size <= array_size(in));
1775 assert(t.out_size <= array_size(out));
1776 mbstate_t state = {};
1777 const char* in_next = nullptr;
1778 InternT* out_next = nullptr;
1779 codecvt_base::result res = codecvt_base::ok;
1780
1781 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1782 assert(res == cvt.ok);
1783 assert(in_next == in + t.in_size);
1784 assert(out_next == out + t.out_size);
1785 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1786 if (t.out_size < array_size(out))
1787 assert(out[t.out_size] == 0);
1788
1789 state = mbstate_t();
1790 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1791 assert(len >= 0);
1792 assert(static_cast<size_t>(len) == t.in_size);
1793 }
1794
1795 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1796 test_offsets_ok t = *it;
1797 InternT out[array_size(exp)] = {};
1798 assert(t.in_size <= array_size(in));
1799 assert(t.out_size <= array_size(out));
1800 mbstate_t state = {};
1801 const char* in_next = nullptr;
1802 InternT* out_next = nullptr;
1803 codecvt_base::result res = codecvt_base::ok;
1804
1805 res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
1806 assert(res == cvt.ok);
1807 assert(in_next == in + t.in_size);
1808 assert(out_next == out + t.out_size);
1809 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1810 if (t.out_size < array_size(out))
1811 assert(out[t.out_size] == 0);
1812
1813 state = mbstate_t();
1814 int len = cvt.length(state, in, in + t.in_size, array_size(out));
1815 assert(len >= 0);
1816 assert(static_cast<size_t>(len) == t.in_size);
1817 }
1818}
1819
1820template <class InternT>
1821void utf16_to_ucs2_in_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1822 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1823 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1824 static_assert(array_size(input) == 4, "");
1825 static_assert(array_size(expected) == 4, "");
1826
1827 char in[array_size(input) * 2];
1828 InternT exp[array_size(expected)];
1829 utf16_to_bytes(f: begin(arr: input), l: end(arr: input), o: begin(arr&: in), e: endianess);
1830 copy(begin(arr: expected), end(arr: expected), begin(exp));
1831
1832 test_offsets_partial offsets[] = {
1833 {.in_size: 2, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
1834 {.in_size: 1, .out_size: 1, .expected_in_next: 0, .expected_out_next: 0}, // incomplete first CP
1835 {.in_size: 1, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // incomplete first CP, and no space for it
1836
1837 {.in_size: 4, .out_size: 1, .expected_in_next: 2, .expected_out_next: 1}, // no space for second CP
1838 {.in_size: 3, .out_size: 2, .expected_in_next: 2, .expected_out_next: 1}, // incomplete second CP
1839 {.in_size: 3, .out_size: 1, .expected_in_next: 2, .expected_out_next: 1}, // incomplete second CP, and no space for it
1840
1841 {.in_size: 6, .out_size: 2, .expected_in_next: 4, .expected_out_next: 2}, // no space for third CP
1842 {.in_size: 5, .out_size: 3, .expected_in_next: 4, .expected_out_next: 2}, // incomplete third CP
1843 {.in_size: 5, .out_size: 2, .expected_in_next: 4, .expected_out_next: 2}, // incomplete third CP, and no space for it
1844 };
1845
1846 for (test_offsets_partial* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1847 test_offsets_partial t = *it;
1848 InternT out[array_size(exp) - 1] = {};
1849 assert(t.in_size <= array_size(in));
1850 assert(t.out_size <= array_size(out));
1851 assert(t.expected_in_next <= t.in_size);
1852 assert(t.expected_out_next <= t.out_size);
1853 mbstate_t state = {};
1854 const char* in_next = nullptr;
1855 InternT* out_next = nullptr;
1856 codecvt_base::result res = codecvt_base::ok;
1857
1858 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1859 assert(res == cvt.partial);
1860 assert(in_next == in + t.expected_in_next);
1861 assert(out_next == out + t.expected_out_next);
1862 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1863 if (t.expected_out_next < array_size(out))
1864 assert(out[t.expected_out_next] == 0);
1865
1866 state = mbstate_t();
1867 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1868 assert(len >= 0);
1869 assert(static_cast<size_t>(len) == t.expected_in_next);
1870 }
1871}
1872
1873template <class InternT>
1874void utf16_to_ucs2_in_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1875 char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1876 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1877 static_assert(array_size(input) == 6, "");
1878 static_assert(array_size(expected) == 6, "");
1879
1880 InternT exp[array_size(expected)];
1881 copy(begin(arr: expected), end(arr: expected), begin(exp));
1882
1883 // The only possible error in UTF-16 is unpaired surrogate code units.
1884 // Additionally, because the target encoding is UCS-2, a proper pair of
1885 // surrogates is also error. Simply, any surrogate CU is error.
1886 test_offsets_error<char16_t> offsets[] = {
1887 {.in_size: 6, .out_size: 3, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0xD800, .replace_pos: 0},
1888 {.in_size: 6, .out_size: 3, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0xDBFF, .replace_pos: 0},
1889 {.in_size: 6, .out_size: 3, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0xDC00, .replace_pos: 0},
1890 {.in_size: 6, .out_size: 3, .expected_in_next: 0, .expected_out_next: 0, .replace_char: 0xDFFF, .replace_pos: 0},
1891
1892 {.in_size: 6, .out_size: 3, .expected_in_next: 2, .expected_out_next: 1, .replace_char: 0xD800, .replace_pos: 1},
1893 {.in_size: 6, .out_size: 3, .expected_in_next: 2, .expected_out_next: 1, .replace_char: 0xDBFF, .replace_pos: 1},
1894 {.in_size: 6, .out_size: 3, .expected_in_next: 2, .expected_out_next: 1, .replace_char: 0xDC00, .replace_pos: 1},
1895 {.in_size: 6, .out_size: 3, .expected_in_next: 2, .expected_out_next: 1, .replace_char: 0xDFFF, .replace_pos: 1},
1896
1897 {.in_size: 6, .out_size: 3, .expected_in_next: 4, .expected_out_next: 2, .replace_char: 0xD800, .replace_pos: 2},
1898 {.in_size: 6, .out_size: 3, .expected_in_next: 4, .expected_out_next: 2, .replace_char: 0xDBFF, .replace_pos: 2},
1899 {.in_size: 6, .out_size: 3, .expected_in_next: 4, .expected_out_next: 2, .replace_char: 0xDC00, .replace_pos: 2},
1900 {.in_size: 6, .out_size: 3, .expected_in_next: 4, .expected_out_next: 2, .replace_char: 0xDFFF, .replace_pos: 2},
1901
1902 // make the leading surrogate a trailing one
1903 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xDC00, .replace_pos: 3},
1904 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xDFFF, .replace_pos: 3},
1905
1906 // make the trailing surrogate a leading one
1907 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xD800, .replace_pos: 4},
1908 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 0xDBFF, .replace_pos: 4},
1909
1910 // make the trailing surrogate a BMP char
1911 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'z', .replace_pos: 4},
1912
      // don't replace anything in the test cases below, just show the surrogate
1914 // pair (fourth CP) fully or partially (just the first surrogate)
1915 {.in_size: 10, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0},
1916 {.in_size: 8, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0},
1917 {.in_size: 9, .out_size: 5, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0},
1918
1919 {.in_size: 10, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0},
1920 {.in_size: 8, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0},
1921 {.in_size: 9, .out_size: 4, .expected_in_next: 6, .expected_out_next: 3, .replace_char: 'b', .replace_pos: 0},
1922 };
1923
1924 for (test_offsets_error<char16_t>* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1925 test_offsets_error<char16_t> t = *it;
1926 char in[array_size(input) * 2];
1927 InternT out[array_size(exp) - 1] = {};
1928 assert(t.in_size <= array_size(in));
1929 assert(t.out_size <= array_size(out));
1930 assert(t.expected_in_next <= t.in_size);
1931 assert(t.expected_out_next <= t.out_size);
1932 char16_t old_char = input[t.replace_pos];
1933 input[t.replace_pos] = t.replace_char; // replace in input, not in in
1934 utf16_to_bytes(f: begin(arr&: input), l: end(arr&: input), o: begin(arr&: in), e: endianess);
1935
1936 mbstate_t state = {};
1937 const char* in_next = nullptr;
1938 InternT* out_next = nullptr;
1939 codecvt_base::result res = codecvt_base::ok;
1940
1941 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1942 assert(res == cvt.error);
1943 assert(in_next == in + t.expected_in_next);
1944 assert(out_next == out + t.expected_out_next);
1945 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1946 if (t.expected_out_next < array_size(out))
1947 assert(out[t.expected_out_next] == 0);
1948
1949 state = mbstate_t();
1950 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1951 assert(len >= 0);
1952 assert(static_cast<size_t>(len) == t.expected_in_next);
1953
1954 input[t.replace_pos] = old_char;
1955 }
1956}
1957
1958template <class InternT>
1959void ucs2_to_utf16_out_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1960 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1961 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1962 static_assert(array_size(input) == 4, "");
1963 static_assert(array_size(expected) == 4, "");
1964
1965 InternT in[array_size(input)];
1966 char exp[array_size(expected) * 2];
1967 copy(begin(arr: input), end(arr: input), begin(in));
1968 utf16_to_bytes(f: begin(arr: expected), l: end(arr: expected), o: begin(arr&: exp), e: endianess);
1969
1970 test_offsets_ok offsets[] = {{.in_size: 0, .out_size: 0}, {.in_size: 1, .out_size: 2}, {.in_size: 2, .out_size: 4}, {.in_size: 3, .out_size: 6}};
1971 for (test_offsets_ok* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
1972 test_offsets_ok t = *it;
1973 char out[array_size(exp) - 2] = {};
1974 assert(t.in_size <= array_size(in));
1975 assert(t.out_size <= array_size(out));
1976 mbstate_t state = {};
1977 const InternT* in_next = nullptr;
1978 char* out_next = nullptr;
1979 codecvt_base::result res = codecvt_base::ok;
1980
1981 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1982 assert(res == cvt.ok);
1983 assert(in_next == in + t.in_size);
1984 assert(out_next == out + t.out_size);
1985 assert(char_traits<char>::compare(out, exp, t.out_size) == 0);
1986 if (t.out_size < array_size(out))
1987 assert(out[t.out_size] == 0);
1988 }
1989}
1990
1991template <class InternT>
1992void ucs2_to_utf16_out_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1993 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1994 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1995 static_assert(array_size(input) == 4, "");
1996 static_assert(array_size(expected) == 4, "");
1997
1998 InternT in[array_size(input)];
1999 char exp[array_size(expected) * 2];
2000 copy(begin(arr: input), end(arr: input), begin(in));
2001 utf16_to_bytes(f: begin(arr: expected), l: end(arr: expected), o: begin(arr&: exp), e: endianess);
2002
2003 test_offsets_partial offsets[] = {
2004 {.in_size: 1, .out_size: 0, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
2005 {.in_size: 1, .out_size: 1, .expected_in_next: 0, .expected_out_next: 0}, // no space for first CP
2006
2007 {.in_size: 2, .out_size: 2, .expected_in_next: 1, .expected_out_next: 2}, // no space for second CP
2008 {.in_size: 2, .out_size: 3, .expected_in_next: 1, .expected_out_next: 2}, // no space for second CP
2009
2010 {.in_size: 3, .out_size: 4, .expected_in_next: 2, .expected_out_next: 4}, // no space for third CP
2011 {.in_size: 3, .out_size: 5, .expected_in_next: 2, .expected_out_next: 4}, // no space for third CP
2012 };
2013 for (test_offsets_partial* it = begin(arr&: offsets); it != end(arr&: offsets); ++it) {
2014 test_offsets_partial t = *it;
2015 char out[array_size(exp) - 2] = {};
2016 assert(t.in_size <= array_size(in));
2017 assert(t.out_size <= array_size(out));
2018 assert(t.expected_in_next <= t.in_size);
2019 assert(t.expected_out_next <= t.out_size);
2020 mbstate_t state = {};
2021 const InternT* in_next = nullptr;
2022 char* out_next = nullptr;
2023 codecvt_base::result res = codecvt_base::ok;
2024
2025 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
2026 assert(res == cvt.partial);
2027 assert(in_next == in + t.expected_in_next);
2028 assert(out_next == out + t.expected_out_next);
2029 assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
2030 if (t.expected_out_next < array_size(out))
2031 assert(out[t.expected_out_next] == 0);
2032 }
2033}
2034
2035template <class InternT>
2036void ucs2_to_utf16_out_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
2037 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2038 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2039 static_assert(array_size(input) == 6, "");
2040 static_assert(array_size(expected) == 6, "");
2041
2042 InternT in[array_size(input)];
2043 char exp[array_size(expected) * 2];
2044 copy(begin(arr: input), end(arr: input), begin(in));
2045 utf16_to_bytes(f: begin(arr: expected), l: end(arr: expected), o: begin(arr&: exp), e: endianess);
2046
2047 test_offsets_error<InternT> offsets[] = {
2048 {3, 6, 0, 0, 0xD800, 0},
2049 {3, 6, 0, 0, 0xDBFF, 0},
2050 {3, 6, 0, 0, 0xDC00, 0},
2051 {3, 6, 0, 0, 0xDFFF, 0},
2052
2053 {3, 6, 1, 2, 0xD800, 1},
2054 {3, 6, 1, 2, 0xDBFF, 1},
2055 {3, 6, 1, 2, 0xDC00, 1},
2056 {3, 6, 1, 2, 0xDFFF, 1},
2057
2058 {3, 6, 2, 4, 0xD800, 2},
2059 {3, 6, 2, 4, 0xDBFF, 2},
2060 {3, 6, 2, 4, 0xDC00, 2},
2061 {3, 6, 2, 4, 0xDFFF, 2},
2062
2063 // make the leading surrogate a trailing one
2064 {5, 10, 3, 6, 0xDC00, 3},
2065 {5, 10, 3, 6, 0xDFFF, 3},
2066
2067 // make the trailing surrogate a leading one
2068 {5, 10, 3, 6, 0xD800, 4},
2069 {5, 10, 3, 6, 0xDBFF, 4},
2070
2071 // make the trailing surrogate a BMP char
2072 {5, 10, 3, 6, 'z', 4},
2073
2074 // don't replace anything in the test cases bellow, just show the surrogate
2075 // pair (fourth CP) fully or partially (just the first surrogate)
2076 {5, 10, 3, 6, 'b', 0},
2077 {5, 8, 3, 6, 'b', 0},
2078 {5, 9, 3, 6, 'b', 0},
2079
2080 {4, 10, 3, 6, 'b', 0},
2081 {4, 8, 3, 6, 'b', 0},
2082 {4, 9, 3, 6, 'b', 0},
2083 };
2084
2085 for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
2086 test_offsets_error<InternT> t = *it;
2087 char out[array_size(exp) - 2] = {};
2088 assert(t.in_size <= array_size(in));
2089 assert(t.out_size <= array_size(out));
2090 assert(t.expected_in_next <= t.in_size);
2091 assert(t.expected_out_next <= t.out_size);
2092 InternT old_char = in[t.replace_pos];
2093 in[t.replace_pos] = t.replace_char;
2094
2095 mbstate_t state = {};
2096 const InternT* in_next = nullptr;
2097 char* out_next = nullptr;
2098 codecvt_base::result res = codecvt_base::ok;
2099
2100 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
2101 assert(res == cvt.error);
2102 assert(in_next == in + t.expected_in_next);
2103 assert(out_next == out + t.expected_out_next);
2104 assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
2105 if (t.expected_out_next < array_size(out))
2106 assert(out[t.expected_out_next] == 0);
2107
2108 in[t.replace_pos] = old_char;
2109 }
2110}
2111
2112template <class InternT>
2113void test_utf16_ucs2_cvt(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
2114 utf16_to_ucs2_in_ok(cvt, endianess);
2115 utf16_to_ucs2_in_partial(cvt, endianess);
2116 utf16_to_ucs2_in_error(cvt, endianess);
2117 ucs2_to_utf16_out_ok(cvt, endianess);
2118 ucs2_to_utf16_out_partial(cvt, endianess);
2119 ucs2_to_utf16_out_error(cvt, endianess);
2120}
2121
2122using std::codecvt;
2123using std::codecvt_utf16;
2124using std::codecvt_utf8;
2125using std::codecvt_utf8_utf16;
2126using std::has_facet;
2127using std::locale;
2128using std::use_facet;
2129
2130void test_utf8_utf32_codecvts() {
2131 typedef codecvt<char32_t, char, mbstate_t> codecvt_c32;
2132 const locale& loc_c = locale::classic();
2133 assert(has_facet<codecvt_c32>(loc_c));
2134
2135 const codecvt_c32& cvt = use_facet<codecvt_c32>(loc: loc_c);
2136 test_utf8_utf32_cvt(cvt);
2137
2138 codecvt_utf8<char32_t> cvt2;
2139 test_utf8_utf32_cvt(cvt: cvt2);
2140
2141#if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2142 codecvt_utf8<wchar_t> cvt3;
2143 test_utf8_utf32_cvt(cvt: cvt3);
2144#endif
2145
2146#ifndef TEST_HAS_NO_CHAR8_T
2147 typedef codecvt<char32_t, char8_t, mbstate_t> codecvt_c32_c8;
2148 assert(has_facet<codecvt_c32_c8>(loc_c));
2149 const codecvt_c32_c8& cvt4 = use_facet<codecvt_c32_c8>(loc: loc_c);
2150 test_utf8_utf32_cvt(cvt4);
2151#endif
2152}
2153
2154void test_utf8_utf16_codecvts() {
2155 typedef codecvt<char16_t, char, mbstate_t> codecvt_c16;
2156 const locale& loc_c = locale::classic();
2157 assert(has_facet<codecvt_c16>(loc_c));
2158
2159 const codecvt_c16& cvt = use_facet<codecvt_c16>(loc: loc_c);
2160 test_utf8_utf16_cvt(cvt);
2161
2162 codecvt_utf8_utf16<char16_t> cvt2;
2163 test_utf8_utf16_cvt(cvt: cvt2);
2164
2165 codecvt_utf8_utf16<char32_t> cvt3;
2166 test_utf8_utf16_cvt(cvt: cvt3);
2167
2168#ifndef TEST_HAS_NO_WIDE_CHARACTERS
2169 codecvt_utf8_utf16<wchar_t> cvt4;
2170 test_utf8_utf16_cvt(cvt: cvt4);
2171#endif
2172
2173#ifndef TEST_HAS_NO_CHAR8_T
2174 typedef codecvt<char16_t, char8_t, mbstate_t> codecvt_c16_c8;
2175 assert(has_facet<codecvt_c16_c8>(loc_c));
2176 const codecvt_c16_c8& cvt5 = use_facet<codecvt_c16_c8>(loc: loc_c);
2177 test_utf8_utf16_cvt(cvt5);
2178#endif
2179}
2180
2181void test_utf8_ucs2_codecvts() {
2182 codecvt_utf8<char16_t> cvt;
2183 test_utf8_ucs2_cvt(cvt);
2184
2185#if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2186 codecvt_utf8<wchar_t> cvt2;
2187 test_utf8_ucs2_cvt(cvt2);
2188#endif
2189}
2190
2191void test_utf16_utf32_codecvts() {
2192 codecvt_utf16<char32_t> cvt;
2193 test_utf16_utf32_cvt(cvt, endianess: utf16_big_endian);
2194
2195 codecvt_utf16<char32_t, 0x10FFFF, std::little_endian> cvt2;
2196 test_utf16_utf32_cvt(cvt: cvt2, endianess: utf16_little_endian);
2197
2198#if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2199 codecvt_utf16<wchar_t> cvt3;
2200 test_utf16_utf32_cvt(cvt: cvt3, endianess: utf16_big_endian);
2201
2202 codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> cvt4;
2203 test_utf16_utf32_cvt(cvt: cvt4, endianess: utf16_little_endian);
2204#endif
2205}
2206
2207void test_utf16_ucs2_codecvts() {
2208 codecvt_utf16<char16_t> cvt;
2209 test_utf16_ucs2_cvt(cvt, endianess: utf16_big_endian);
2210
2211 codecvt_utf16<char16_t, 0x10FFFF, std::little_endian> cvt2;
2212 test_utf16_ucs2_cvt(cvt: cvt2, endianess: utf16_little_endian);
2213
2214#if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2215 codecvt_utf16<wchar_t> cvt3;
2216 test_utf16_ucs2_cvt(cvt3, utf16_big_endian);
2217
2218 codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> cvt4;
2219 test_utf16_ucs2_cvt(cvt4, utf16_little_endian);
2220#endif
2221}
2222
2223int main() {
2224 test_utf8_utf32_codecvts();
2225 test_utf8_utf16_codecvts();
2226 test_utf8_ucs2_codecvts();
2227 test_utf16_utf32_codecvts();
2228 test_utf16_ucs2_codecvts();
2229}
2230

// source code of libcxx/test/std/localization/codecvt_unicode.pass.cpp