1//
2// Copyright (c) 2015 Artyom Beilis (Tonkikh)
3// Copyright (c) 2021-2023 Alexander Grund
4//
5// Distributed under the Boost Software License, Version 1.0.
6// https://www.boost.org/LICENSE_1_0.txt
7
8#include <boost/locale/utf8_codecvt.hpp>
9#include <boost/locale/util.hpp>
10#include <algorithm>
11#include <cstring>
12#include <iomanip>
13#include <iostream>
14#include <locale>
15#include <memory.h>
16#include <wchar.h>
17#define BOOST_LOCALE_ERROR_LIMIT -1
18#include "boostLocale/test/tools.hpp"
19#include "boostLocale/test/unit_test.hpp"
20
21#if defined(BOOST_MSVC) && BOOST_MSVC < 1700
22# pragma warning(disable : 4428) // universal-character-name encountered in source
23#endif
24
// Reference file name encoded as UTF-8 (28 bytes): U+1D49E (4 bytes), '-', six Cyrillic
// letters (2 bytes each), '-', two Hiragana letters (3 bytes each) and ".txt".
// The pointer itself is const: the tests only ever read through it.
static const char* const utf8_name =
  "\xf0\x9d\x92\x9e-\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82-\xE3\x82\x84\xE3\x81\x82.txt";
// The same text as a wide string. U+1D49E lies outside the BMP, so it occupies two code units
// when wchar_t is 16 bits wide and one code unit otherwise.
static const wchar_t* const wide_name = L"\U0001D49E-\u043F\u0440\u0438\u0432\u0435\u0442-\u3084\u3042.txt";
28
// Facet type exercised by these tests: wchar_t is the internal and char the external character type.
using cvt_type = std::codecvt<wchar_t, char, std::mbstate_t>;
30
31void test_codecvt_in_n_m(const cvt_type& cvt, int n, int m)
32{
33 const wchar_t* wptr = wide_name;
34 const size_t wlen = wcslen(s: wide_name);
35 const size_t u8len = strlen(s: utf8_name);
36 const char* from = utf8_name;
37 const char* end = from;
38 const char* real_end = utf8_name + u8len;
39 const char* from_next = from;
40 std::mbstate_t mb{};
41 while(from_next < real_end) {
42 if(from == end) {
43 end = from + n;
44 if(end > real_end)
45 end = real_end;
46 }
47
48 wchar_t buf[128];
49 wchar_t* to = buf;
50 wchar_t* to_end = to + m;
51 wchar_t* to_next = to;
52
53 std::mbstate_t mb2 = mb;
54 std::codecvt_base::result r = cvt.in(state&: mb, from: from, from_end: end, from_next&: from_next, to: to, to_end: to_end, to_next&: to_next);
55
56 int count = cvt.length(state&: mb2, from: from, end: end, max: to_end - to);
57 TEST_EQ(memcmp(&mb, &mb2, sizeof(mb)), 0);
58 if(count != from_next - from)
59 std::cout << count << " " << from_next - from << std::endl; // LCOV_EXCL_LINE
60 TEST_EQ(count, from_next - from);
61
62 if(r == cvt_type::partial) {
63 end += n;
64 if(end > real_end)
65 end = real_end;
66 } else
67 TEST_EQ(r, cvt_type::ok);
68 while(to != to_next) {
69 TEST_EQ(*wptr, *to);
70 wptr++;
71 to++;
72 }
73 to = to_next;
74 from = from_next;
75 }
76 TEST(wptr == wide_name + wlen);
77 TEST(from == real_end);
78}
79
80void test_codecvt_out_n_m(const cvt_type& cvt, int n, int m)
81{
82 const char* nptr = utf8_name;
83 const size_t wlen = wcslen(s: wide_name);
84 const size_t u8len = strlen(s: utf8_name);
85
86 std::mbstate_t mb{};
87
88 const wchar_t* from_next = wide_name;
89 const wchar_t* real_from_end = wide_name + wlen;
90
91 char buf[256];
92 char* to = buf;
93 char* to_next = to;
94 char* to_end = to + n;
95 char* real_to_end = buf + sizeof(buf);
96
97 // Unshift on initial state does nothing
98 TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type::ok);
99 TEST(to_next == buf);
100
101 while(from_next < real_from_end) {
102 const wchar_t* from = from_next;
103 const wchar_t* from_end = from + m;
104 if(from_end > real_from_end)
105 from_end = real_from_end;
106 if(to_end == to)
107 to_end = to + n;
108
109 std::codecvt_base::result r = cvt.out(state&: mb, from: from, from_end: from_end, from_next&: from_next, to: to, to_end: to_end, to_next&: to_next);
110 if(r == cvt_type::partial) {
111 // If those are equal, then "partial" probably means: Need more input
112 // Otherwise "Need more output"
113 if(from_next != from_end) {
114 TEST_LT(to_end - to_next, cvt.max_length());
115 to_end = std::min(a: to_end + n, b: real_to_end);
116 }
117 } else
118 TEST_EQ(r, cvt_type::ok);
119
120 while(to != to_next) {
121 TEST_EQ(*nptr, *to);
122 nptr++;
123 to++;
124 }
125 from = from_next;
126 }
127 TEST(nptr == utf8_name + u8len);
128 TEST(from_next == real_from_end);
129 TEST_EQ(cvt.unshift(mb, to, to + n, to_next), cvt_type::ok);
130 TEST(to_next == to);
131
132 // Convert into a to small buffer
133 from_next = wide_name;
134 TEST_EQ(cvt.out(mb, wide_name, real_from_end, from_next, buf, buf + 1, to_next), cvt_type::partial);
135 if(from_next == wide_name) {
136 // Nothing consumed so nothing to do
137 TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type::ok);
138 TEST(to_next == buf);
139 } else {
140 TEST(from_next == wide_name + 1);
141 TEST(to_next == buf);
142 // Unshift on non-default state is not possible
143 TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type::error);
144 }
145}
146
147void test_codecvt_conv()
148{
149 std::cout << "Conversions " << std::endl;
150 std::locale l(std::locale::classic(), new boost::locale::utf8_codecvt<wchar_t>());
151
152 const cvt_type& cvt = std::use_facet<cvt_type>(loc: l);
153
154 TEST_EQ(cvt.encoding(), 0); // Characters have a variable width
155 TEST_EQ(cvt.max_length(), 4); // At most 4 UTF-8 code units are one internal char (one or two UTF-16 code units)
156 TEST(!cvt.always_noconv()); // Always convert
157
158 for(int i = 1; i <= (int)strlen(s: utf8_name) + 1; i++) {
159 for(int j = 1; j <= (int)wcslen(s: wide_name) + 1; j++) {
160 try {
161 test_codecvt_in_n_m(cvt, n: i, m: j);
162 test_codecvt_out_n_m(cvt, n: i, m: j);
163 } catch(...) { // LCOV_EXCL_LINE
164 std::cerr << "Wlen=" << j << " Nlen=" << i << std::endl; // LCOV_EXCL_LINE
165 throw; // LCOV_EXCL_LINE
166 }
167 }
168 }
169}
170
171void test_codecvt_err()
172{
173 std::cout << "Errors " << std::endl;
174 std::locale l(std::locale::classic(), new boost::locale::utf8_codecvt<wchar_t>());
175
176 const cvt_type& cvt = std::use_facet<cvt_type>(loc: l);
177
178 std::cout << "- UTF-8" << std::endl;
179 {
180 wchar_t buf[2];
181 wchar_t* to = buf;
182 wchar_t* to_end = buf + 2;
183 wchar_t* to_next = to;
184 const char* err_utf = "1\xFF\xFF";
185 {
186 std::mbstate_t mb{};
187 const char* from = err_utf;
188 const char* from_end = from + strlen(s: from);
189 const char* from_next = from;
190 to_next = to;
191 TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error);
192 TEST(from_next == from + 1);
193 TEST(to_next == to + 1);
194 TEST_EQ(*to, '1');
195 }
196 err_utf++;
197 {
198 std::mbstate_t mb{};
199 const char* from = err_utf;
200 const char* from_end = from + strlen(s: from);
201 const char* from_next = from;
202 TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error);
203 TEST(from_next == from);
204 TEST(to_next == to);
205 }
206 }
207 std::cout << "- Trailing UTF-16 surrogate" << std::endl;
208 {
209 char buf[4] = {};
210 char* const to = buf;
211 char* const to_end = buf + 4;
212 char* to_next = to;
213 const wchar_t* err_utf = L"\xD800"; // Trailing UTF-16 surrogate
214 std::mbstate_t mb{};
215 const wchar_t* from = err_utf;
216 const wchar_t* from_end = from + 1;
217 const wchar_t* from_next = from;
218 cvt_type::result res = cvt.out(state&: mb, from: from, from_end: from_end, from_next&: from_next, to: to, to_end: to_end, to_next&: to_next);
219 BOOST_LOCALE_START_CONST_CONDITION
220 if(sizeof(wchar_t) == 2) {
221 BOOST_LOCALE_END_CONST_CONDITION
222 TEST(res == cvt_type::partial);
223 TEST(from_next == from_end);
224 TEST(to_next == to);
225 TEST(buf[0] == 0);
226 } else {
227 // surrogate is invalid
228 TEST(res == cvt_type::error);
229 TEST(from_next == from);
230 TEST(to_next == to);
231 }
232 }
233
234 std::cout << "- UTF-16/32" << std::endl;
235 {
236 char buf[32];
237 char* to = buf;
238 char* to_end = buf + 32;
239 char* to_next = to;
240 wchar_t err_buf[3] = {'1', 0xDC9E, 0}; // second value is invalid for UTF-16 and 32
241 const wchar_t* err_utf = err_buf;
242 {
243 std::mbstate_t mb{};
244 const wchar_t* from = err_utf;
245 const wchar_t* from_end = from + wcslen(s: from);
246 const wchar_t* from_next = from;
247 TEST_EQ(cvt.out(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error);
248 TEST(from_next == from + 1);
249 TEST(to_next == to + 1);
250 TEST_EQ(*to, '1');
251 }
252 err_utf++;
253 {
254 std::mbstate_t mb{};
255 const wchar_t* from = err_utf;
256 const wchar_t* from_end = from + wcslen(s: from);
257 const wchar_t* from_next = from;
258 to_next = to;
259 TEST_EQ(cvt.out(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error);
260 TEST(from_next == from);
261 TEST(to_next == to);
262 }
263 }
264}
265
266void test_char_char()
267{
268 std::cout << "Char-char specialization" << std::endl;
269 std::locale l(std::locale::classic(), new boost::locale::utf8_codecvt<char>());
270 const std::codecvt<char, char, std::mbstate_t>& cvt = std::use_facet<std::codecvt<char, char, std::mbstate_t>>(loc: l);
271 std::mbstate_t mb{};
272 const char* from = "a";
273 const char* from_end = from + 1;
274 const char* from_next = from;
275 char buf[2];
276 char* to = buf;
277 char* to_end = buf + 1;
278 char* to_next = to;
279 TEST(cvt.always_noconv());
280 TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::noconv);
281 TEST(from_next == from);
282 TEST(to_next == to);
283 TEST_EQ(cvt.out(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::noconv);
284 TEST(from_next == from);
285 TEST(to_next == to);
286 TEST_EQ(cvt.encoding(), 1);
287 TEST_EQ(cvt.max_length(), 1);
288}
289
290void test_codecvt_fallback()
291{
292 std::locale l =
293 boost::locale::util::create_codecvt(in: std::locale::classic(), cvt: nullptr, type: boost::locale::char_facet_t::wchar_f);
294 const cvt_type& cvt = std::use_facet<cvt_type>(loc: l);
295
296 std::mbstate_t mb{};
297 // Fallback converter can convert ASCII
298 const char from[] = "abyzAZ!?019";
299 const char* from_end = std::end(arr: from);
300 const char* from_next = from;
301 wchar_t buf[sizeof(from)]{};
302 wchar_t* to = buf;
303 wchar_t* const to_end = std::end(arr&: buf);
304 wchar_t* to_next = to;
305
306 TEST(!cvt.always_noconv());
307 TEST_EQ(cvt.encoding(), 0);
308 TEST_EQ(cvt.max_length(), 1);
309
310 TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::ok);
311 TEST(from_next == from_end);
312 TEST(to_next == to_end);
313 TEST_EQ(buf, ascii_to<wchar_t>(from));
314
315 char buf2[sizeof(from)]{};
316 char* to2 = buf2;
317 char* const to_end2 = std::end(arr&: buf2);
318 char* to_next2 = to2;
319 const wchar_t* to_next_wide = to;
320
321 TEST_EQ(cvt.out(mb, to, to_end, to_next_wide, to2, to_end2, to_next2), cvt_type::ok);
322 TEST(to_next_wide == to_end);
323 TEST(to_next2 == to_end2);
324 TEST_EQ(buf2, ascii_to<char>(from));
325
326 // Non-ASCII is an error
327 *to = L'\x81';
328 to_next_wide = to;
329 to_next2 = to2;
330 TEST_EQ(cvt.out(mb, to, to_end, to_next_wide, to2, to_end2, to_next2), cvt_type::error);
331 TEST(to_next_wide == to);
332 TEST(to_next2 == to2);
333
334 const char from_invalid[] = "\x80";
335 from_end = std::end(arr: from_invalid);
336 from_next = from_invalid;
337 to = buf;
338 to_next = to;
339 TEST_EQ(cvt.in(mb, from_invalid, from_end, from_next, to, to_end, to_next), cvt_type::error);
340 TEST(from_next == from_invalid);
341 TEST(to_next == to);
342}
343
344void test_main(int /*argc*/, char** /*argv*/)
345{
346 test_codecvt_conv();
347 test_codecvt_err();
348 test_char_char();
349 test_codecvt_fallback();
350}
351

// Source: boost/libs/locale/test/test_codecvt.cpp