1 | // |
2 | // Copyright (c) 2015 Artyom Beilis (Tonkikh) |
3 | // Copyright (c) 2021-2023 Alexander Grund |
4 | // |
5 | // Distributed under the Boost Software License, Version 1.0. |
6 | // https://www.boost.org/LICENSE_1_0.txt |
7 | |
8 | #include <boost/locale/utf8_codecvt.hpp> |
9 | #include <boost/locale/util.hpp> |
10 | #include <algorithm> |
11 | #include <cstring> |
12 | #include <iomanip> |
13 | #include <iostream> |
14 | #include <locale> |
15 | #include <memory.h> |
16 | #include <wchar.h> |
17 | #define BOOST_LOCALE_ERROR_LIMIT -1 |
18 | #include "boostLocale/test/tools.hpp" |
19 | #include "boostLocale/test/unit_test.hpp" |
20 | |
#if defined(BOOST_MSVC) && BOOST_MSVC < 1700
// The wide literal below is written with \u / \U escapes, which this compiler range warns about
#    pragma warning(disable : 4428) // universal-character-name encountered in source
#endif

// One sample file name spelled twice: as raw UTF-8 bytes and as a wide string literal.
// It mixes a 4-byte, six 2-byte and two 3-byte UTF-8 sequences with plain ASCII.
static const char* utf8_name = "\xf0\x9d\x92\x9e"                                 // U+1D49E
                               "-"                                                //
                               "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82" // U+043F U+0440 U+0438 U+0432 U+0435 U+0442
                               "-"                                                //
                               "\xE3\x82\x84\xE3\x81\x82"                         // U+3084 U+3042
                               ".txt";
static const wchar_t* wide_name = L"\U0001D49E-\u043F\u0440\u0438\u0432\u0435\u0442-\u3084\u3042.txt";

// Facet interface all wchar_t <-> char tests in this file go through
using cvt_type = std::codecvt<wchar_t, char, std::mbstate_t>;
30 | |
31 | void test_codecvt_in_n_m(const cvt_type& cvt, int n, int m) |
32 | { |
33 | const wchar_t* wptr = wide_name; |
34 | const size_t wlen = wcslen(s: wide_name); |
35 | const size_t u8len = strlen(s: utf8_name); |
36 | const char* from = utf8_name; |
37 | const char* end = from; |
38 | const char* real_end = utf8_name + u8len; |
39 | const char* from_next = from; |
40 | std::mbstate_t mb{}; |
41 | while(from_next < real_end) { |
42 | if(from == end) { |
43 | end = from + n; |
44 | if(end > real_end) |
45 | end = real_end; |
46 | } |
47 | |
48 | wchar_t buf[128]; |
49 | wchar_t* to = buf; |
50 | wchar_t* to_end = to + m; |
51 | wchar_t* to_next = to; |
52 | |
53 | std::mbstate_t mb2 = mb; |
54 | std::codecvt_base::result r = cvt.in(state&: mb, from: from, from_end: end, from_next&: from_next, to: to, to_end: to_end, to_next&: to_next); |
55 | |
56 | int count = cvt.length(state&: mb2, from: from, end: end, max: to_end - to); |
57 | TEST_EQ(memcmp(&mb, &mb2, sizeof(mb)), 0); |
58 | if(count != from_next - from) |
59 | std::cout << count << " " << from_next - from << std::endl; // LCOV_EXCL_LINE |
60 | TEST_EQ(count, from_next - from); |
61 | |
62 | if(r == cvt_type::partial) { |
63 | end += n; |
64 | if(end > real_end) |
65 | end = real_end; |
66 | } else |
67 | TEST_EQ(r, cvt_type::ok); |
68 | while(to != to_next) { |
69 | TEST_EQ(*wptr, *to); |
70 | wptr++; |
71 | to++; |
72 | } |
73 | to = to_next; |
74 | from = from_next; |
75 | } |
76 | TEST(wptr == wide_name + wlen); |
77 | TEST(from == real_end); |
78 | } |
79 | |
80 | void test_codecvt_out_n_m(const cvt_type& cvt, int n, int m) |
81 | { |
82 | const char* nptr = utf8_name; |
83 | const size_t wlen = wcslen(s: wide_name); |
84 | const size_t u8len = strlen(s: utf8_name); |
85 | |
86 | std::mbstate_t mb{}; |
87 | |
88 | const wchar_t* from_next = wide_name; |
89 | const wchar_t* real_from_end = wide_name + wlen; |
90 | |
91 | char buf[256]; |
92 | char* to = buf; |
93 | char* to_next = to; |
94 | char* to_end = to + n; |
95 | char* real_to_end = buf + sizeof(buf); |
96 | |
97 | // Unshift on initial state does nothing |
98 | TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type::ok); |
99 | TEST(to_next == buf); |
100 | |
101 | while(from_next < real_from_end) { |
102 | const wchar_t* from = from_next; |
103 | const wchar_t* from_end = from + m; |
104 | if(from_end > real_from_end) |
105 | from_end = real_from_end; |
106 | if(to_end == to) |
107 | to_end = to + n; |
108 | |
109 | std::codecvt_base::result r = cvt.out(state&: mb, from: from, from_end: from_end, from_next&: from_next, to: to, to_end: to_end, to_next&: to_next); |
110 | if(r == cvt_type::partial) { |
111 | // If those are equal, then "partial" probably means: Need more input |
112 | // Otherwise "Need more output" |
113 | if(from_next != from_end) { |
114 | TEST_LT(to_end - to_next, cvt.max_length()); |
115 | to_end = std::min(a: to_end + n, b: real_to_end); |
116 | } |
117 | } else |
118 | TEST_EQ(r, cvt_type::ok); |
119 | |
120 | while(to != to_next) { |
121 | TEST_EQ(*nptr, *to); |
122 | nptr++; |
123 | to++; |
124 | } |
125 | from = from_next; |
126 | } |
127 | TEST(nptr == utf8_name + u8len); |
128 | TEST(from_next == real_from_end); |
129 | TEST_EQ(cvt.unshift(mb, to, to + n, to_next), cvt_type::ok); |
130 | TEST(to_next == to); |
131 | |
132 | // Convert into a to small buffer |
133 | from_next = wide_name; |
134 | TEST_EQ(cvt.out(mb, wide_name, real_from_end, from_next, buf, buf + 1, to_next), cvt_type::partial); |
135 | if(from_next == wide_name) { |
136 | // Nothing consumed so nothing to do |
137 | TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type::ok); |
138 | TEST(to_next == buf); |
139 | } else { |
140 | TEST(from_next == wide_name + 1); |
141 | TEST(to_next == buf); |
142 | // Unshift on non-default state is not possible |
143 | TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type::error); |
144 | } |
145 | } |
146 | |
147 | void test_codecvt_conv() |
148 | { |
149 | std::cout << "Conversions " << std::endl; |
150 | std::locale l(std::locale::classic(), new boost::locale::utf8_codecvt<wchar_t>()); |
151 | |
152 | const cvt_type& cvt = std::use_facet<cvt_type>(loc: l); |
153 | |
154 | TEST_EQ(cvt.encoding(), 0); // Characters have a variable width |
155 | TEST_EQ(cvt.max_length(), 4); // At most 4 UTF-8 code units are one internal char (one or two UTF-16 code units) |
156 | TEST(!cvt.always_noconv()); // Always convert |
157 | |
158 | for(int i = 1; i <= (int)strlen(s: utf8_name) + 1; i++) { |
159 | for(int j = 1; j <= (int)wcslen(s: wide_name) + 1; j++) { |
160 | try { |
161 | test_codecvt_in_n_m(cvt, n: i, m: j); |
162 | test_codecvt_out_n_m(cvt, n: i, m: j); |
163 | } catch(...) { // LCOV_EXCL_LINE |
164 | std::cerr << "Wlen=" << j << " Nlen=" << i << std::endl; // LCOV_EXCL_LINE |
165 | throw; // LCOV_EXCL_LINE |
166 | } |
167 | } |
168 | } |
169 | } |
170 | |
171 | void test_codecvt_err() |
172 | { |
173 | std::cout << "Errors " << std::endl; |
174 | std::locale l(std::locale::classic(), new boost::locale::utf8_codecvt<wchar_t>()); |
175 | |
176 | const cvt_type& cvt = std::use_facet<cvt_type>(loc: l); |
177 | |
178 | std::cout << "- UTF-8" << std::endl; |
179 | { |
180 | wchar_t buf[2]; |
181 | wchar_t* to = buf; |
182 | wchar_t* to_end = buf + 2; |
183 | wchar_t* to_next = to; |
184 | const char* err_utf = "1\xFF\xFF" ; |
185 | { |
186 | std::mbstate_t mb{}; |
187 | const char* from = err_utf; |
188 | const char* from_end = from + strlen(s: from); |
189 | const char* from_next = from; |
190 | to_next = to; |
191 | TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error); |
192 | TEST(from_next == from + 1); |
193 | TEST(to_next == to + 1); |
194 | TEST_EQ(*to, '1'); |
195 | } |
196 | err_utf++; |
197 | { |
198 | std::mbstate_t mb{}; |
199 | const char* from = err_utf; |
200 | const char* from_end = from + strlen(s: from); |
201 | const char* from_next = from; |
202 | TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error); |
203 | TEST(from_next == from); |
204 | TEST(to_next == to); |
205 | } |
206 | } |
207 | std::cout << "- Trailing UTF-16 surrogate" << std::endl; |
208 | { |
209 | char buf[4] = {}; |
210 | char* const to = buf; |
211 | char* const to_end = buf + 4; |
212 | char* to_next = to; |
213 | const wchar_t* err_utf = L"\xD800" ; // Trailing UTF-16 surrogate |
214 | std::mbstate_t mb{}; |
215 | const wchar_t* from = err_utf; |
216 | const wchar_t* from_end = from + 1; |
217 | const wchar_t* from_next = from; |
218 | cvt_type::result res = cvt.out(state&: mb, from: from, from_end: from_end, from_next&: from_next, to: to, to_end: to_end, to_next&: to_next); |
219 | BOOST_LOCALE_START_CONST_CONDITION |
220 | if(sizeof(wchar_t) == 2) { |
221 | BOOST_LOCALE_END_CONST_CONDITION |
222 | TEST(res == cvt_type::partial); |
223 | TEST(from_next == from_end); |
224 | TEST(to_next == to); |
225 | TEST(buf[0] == 0); |
226 | } else { |
227 | // surrogate is invalid |
228 | TEST(res == cvt_type::error); |
229 | TEST(from_next == from); |
230 | TEST(to_next == to); |
231 | } |
232 | } |
233 | |
234 | std::cout << "- UTF-16/32" << std::endl; |
235 | { |
236 | char buf[32]; |
237 | char* to = buf; |
238 | char* to_end = buf + 32; |
239 | char* to_next = to; |
240 | wchar_t err_buf[3] = {'1', 0xDC9E, 0}; // second value is invalid for UTF-16 and 32 |
241 | const wchar_t* err_utf = err_buf; |
242 | { |
243 | std::mbstate_t mb{}; |
244 | const wchar_t* from = err_utf; |
245 | const wchar_t* from_end = from + wcslen(s: from); |
246 | const wchar_t* from_next = from; |
247 | TEST_EQ(cvt.out(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error); |
248 | TEST(from_next == from + 1); |
249 | TEST(to_next == to + 1); |
250 | TEST_EQ(*to, '1'); |
251 | } |
252 | err_utf++; |
253 | { |
254 | std::mbstate_t mb{}; |
255 | const wchar_t* from = err_utf; |
256 | const wchar_t* from_end = from + wcslen(s: from); |
257 | const wchar_t* from_next = from; |
258 | to_next = to; |
259 | TEST_EQ(cvt.out(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error); |
260 | TEST(from_next == from); |
261 | TEST(to_next == to); |
262 | } |
263 | } |
264 | } |
265 | |
266 | void test_char_char() |
267 | { |
268 | std::cout << "Char-char specialization" << std::endl; |
269 | std::locale l(std::locale::classic(), new boost::locale::utf8_codecvt<char>()); |
270 | const std::codecvt<char, char, std::mbstate_t>& cvt = std::use_facet<std::codecvt<char, char, std::mbstate_t>>(loc: l); |
271 | std::mbstate_t mb{}; |
272 | const char* from = "a" ; |
273 | const char* from_end = from + 1; |
274 | const char* from_next = from; |
275 | char buf[2]; |
276 | char* to = buf; |
277 | char* to_end = buf + 1; |
278 | char* to_next = to; |
279 | TEST(cvt.always_noconv()); |
280 | TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::noconv); |
281 | TEST(from_next == from); |
282 | TEST(to_next == to); |
283 | TEST_EQ(cvt.out(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::noconv); |
284 | TEST(from_next == from); |
285 | TEST(to_next == to); |
286 | TEST_EQ(cvt.encoding(), 1); |
287 | TEST_EQ(cvt.max_length(), 1); |
288 | } |
289 | |
290 | void test_codecvt_fallback() |
291 | { |
292 | std::locale l = |
293 | boost::locale::util::create_codecvt(in: std::locale::classic(), cvt: nullptr, type: boost::locale::char_facet_t::wchar_f); |
294 | const cvt_type& cvt = std::use_facet<cvt_type>(loc: l); |
295 | |
296 | std::mbstate_t mb{}; |
297 | // Fallback converter can convert ASCII |
298 | const char from[] = "abyzAZ!?019" ; |
299 | const char* from_end = std::end(arr: from); |
300 | const char* from_next = from; |
301 | wchar_t buf[sizeof(from)]{}; |
302 | wchar_t* to = buf; |
303 | wchar_t* const to_end = std::end(arr&: buf); |
304 | wchar_t* to_next = to; |
305 | |
306 | TEST(!cvt.always_noconv()); |
307 | TEST_EQ(cvt.encoding(), 0); |
308 | TEST_EQ(cvt.max_length(), 1); |
309 | |
310 | TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::ok); |
311 | TEST(from_next == from_end); |
312 | TEST(to_next == to_end); |
313 | TEST_EQ(buf, ascii_to<wchar_t>(from)); |
314 | |
315 | char buf2[sizeof(from)]{}; |
316 | char* to2 = buf2; |
317 | char* const to_end2 = std::end(arr&: buf2); |
318 | char* to_next2 = to2; |
319 | const wchar_t* to_next_wide = to; |
320 | |
321 | TEST_EQ(cvt.out(mb, to, to_end, to_next_wide, to2, to_end2, to_next2), cvt_type::ok); |
322 | TEST(to_next_wide == to_end); |
323 | TEST(to_next2 == to_end2); |
324 | TEST_EQ(buf2, ascii_to<char>(from)); |
325 | |
326 | // Non-ASCII is an error |
327 | *to = L'\x81'; |
328 | to_next_wide = to; |
329 | to_next2 = to2; |
330 | TEST_EQ(cvt.out(mb, to, to_end, to_next_wide, to2, to_end2, to_next2), cvt_type::error); |
331 | TEST(to_next_wide == to); |
332 | TEST(to_next2 == to2); |
333 | |
334 | const char from_invalid[] = "\x80" ; |
335 | from_end = std::end(arr: from_invalid); |
336 | from_next = from_invalid; |
337 | to = buf; |
338 | to_next = to; |
339 | TEST_EQ(cvt.in(mb, from_invalid, from_end, from_next, to, to_end, to_next), cvt_type::error); |
340 | TEST(from_next == from_invalid); |
341 | TEST(to_next == to); |
342 | } |
343 | |
344 | void test_main(int /*argc*/, char** /*argv*/) |
345 | { |
346 | test_codecvt_conv(); |
347 | test_codecvt_err(); |
348 | test_char_char(); |
349 | test_codecvt_fallback(); |
350 | } |
351 | |