test_codecvt.cpp source code [boost/libs/locale/test/test_codecvt.cpp]

1	//
2	// Copyright (c) 2015 Artyom Beilis (Tonkikh)
3	// Copyright (c) 2021-2023 Alexander Grund
4	//
5	// Distributed under the Boost Software License, Version 1.0.
6	// https://www.boost.org/LICENSE_1_0.txt
7
8	#include <boost/locale/utf8_codecvt.hpp>
9	#include <boost/locale/util.hpp>
10	#include <algorithm>
11	#include <cstring>
12	#include <iomanip>
13	#include <iostream>
14	#include <locale>
15	#include <memory.h>
16	#include <wchar.h>
17	#define BOOST_LOCALE_ERROR_LIMIT -1
18	#include "boostLocale/test/tools.hpp"
19	#include "boostLocale/test/unit_test.hpp"
20
21	#if defined(BOOST_MSVC) && BOOST_MSVC < 1700
22	# pragma warning(disable : 4428) // universal-character-name encountered in source
23	#endif
24
// Test file name as UTF-8 bytes: one 4-byte sequence (U+1D49E), six 2-byte
// sequences, two 3-byte sequences and some ASCII characters.
static const char* utf8_name =
  "\xf0\x9d\x92\x9e-\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82-\xE3\x82\x84\xE3\x81\x82.txt";
// The same name spelled with universal character names as a wide string.
static const wchar_t* wide_name = L"\U0001D49E-\u043F\u0440\u0438\u0432\u0435\u0442-\u3084\u3042.txt";

// Facet interface (wide <-> narrow) exercised by the tests in this file.
using cvt_type = std::codecvt<wchar_t, char, std::mbstate_t>;
30
31	void test_codecvt_in_n_m(const cvt_type& cvt, int n, int m)
32	{
33	const wchar_t* wptr = wide_name;
34	const size_t wlen = wcslen(s: wide_name);
35	const size_t u8len = strlen(s: utf8_name);
36	const char* from = utf8_name;
37	const char* end = from;
38	const char* real_end = utf8_name + u8len;
39	const char* from_next = from;
40	std::mbstate_t mb{};
41	while(from_next < real_end) {
42	if(from == end) {
43	end = from + n;
44	if(end > real_end)
45	end = real_end;
46	}
47
48	wchar_t buf[`128`];
49	wchar_t* to = buf;
50	wchar_t* to_end = to + m;
51	wchar_t* to_next = to;
52
53	std::mbstate_t mb2 = mb;
54	std::codecvt_base::result r = cvt.in(state&: mb, from: from, from_end: end, from_next&: from_next, to: to, to_end: to_end, to_next&: to_next);
55
56	int count = cvt.length(state&: mb2, from: from, end: end, max: to_end - to);
57	TEST_EQ(memcmp(&mb, &mb2, sizeof(mb)), `0`);
58	if(count != from_next - from)
59	std::cout << count << " " << from_next - from << std::endl; // LCOV_EXCL_LINE
60	TEST_EQ(count, from_next - from);
61
62	if(r == cvt_type::partial) {
63	end += n;
64	if(end > real_end)
65	end = real_end;
66	} else
67	TEST_EQ(r, cvt_type::ok);
68	while(to != to_next) {
69	TEST_EQ(wptr, to);
70	wptr++;
71	to++;
72	}
73	to = to_next;
74	from = from_next;
75	}
76	TEST(wptr == wide_name + wlen);
77	TEST(from == real_end);
78	}
79
80	void test_codecvt_out_n_m(const cvt_type& cvt, int n, int m)
81	{
82	const char* nptr = utf8_name;
83	const size_t wlen = wcslen(s: wide_name);
84	const size_t u8len = strlen(s: utf8_name);
85
86	std::mbstate_t mb{};
87
88	const wchar_t* from_next = wide_name;
89	const wchar_t* real_from_end = wide_name + wlen;
90
91	char buf[`256`];
92	char* to = buf;
93	char* to_next = to;
94	char* to_end = to + n;
95	char* real_to_end = buf + sizeof(buf);
96
97	// Unshift on initial state does nothing
98	TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type::ok);
99	TEST(to_next == buf);
100
101	while(from_next < real_from_end) {
102	const wchar_t* from = from_next;
103	const wchar_t* from_end = from + m;
104	if(from_end > real_from_end)
105	from_end = real_from_end;
106	if(to_end == to)
107	to_end = to + n;
108
109	std::codecvt_base::result r = cvt.out(state&: mb, from: from, from_end: from_end, from_next&: from_next, to: to, to_end: to_end, to_next&: to_next);
110	if(r == cvt_type::partial) {
111	// If those are equal, then "partial" probably means: Need more input
112	// Otherwise "Need more output"
113	if(from_next != from_end) {
114	TEST_LT(to_end - to_next, cvt.max_length());
115	to_end = std::min(a: to_end + n, b: real_to_end);
116	}
117	} else
118	TEST_EQ(r, cvt_type::ok);
119
120	while(to != to_next) {
121	TEST_EQ(nptr, to);
122	nptr++;
123	to++;
124	}
125	from = from_next;
126	}
127	TEST(nptr == utf8_name + u8len);
128	TEST(from_next == real_from_end);
129	TEST_EQ(cvt.unshift(mb, to, to + n, to_next), cvt_type::ok);
130	TEST(to_next == to);
131
132	// Convert into a to small buffer
133	from_next = wide_name;
134	TEST_EQ(cvt.out(mb, wide_name, real_from_end, from_next, buf, buf + `1`, to_next), cvt_type::partial);
135	if(from_next == wide_name) {
136	// Nothing consumed so nothing to do
137	TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type::ok);
138	TEST(to_next == buf);
139	} else {
140	TEST(from_next == wide_name + `1`);
141	TEST(to_next == buf);
142	// Unshift on non-default state is not possible
143	TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type::error);
144	}
145	}
146
147	void test_codecvt_conv()
148	{
149	std::cout << "Conversions " << std::endl;
150	std::locale l(std::locale::classic(), new boost::locale::utf8_codecvt<wchar_t>());
151
152	const cvt_type& cvt = std::use_facet<cvt_type>(loc: l);
153
154	TEST_EQ(cvt.encoding(), `0`); // Characters have a variable width
155	TEST_EQ(cvt.max_length(), `4`); // At most 4 UTF-8 code units are one internal char (one or two UTF-16 code units)
156	TEST(!cvt.always_noconv()); // Always convert
157
158	for(int i = `1`; i <= (int)strlen(s: utf8_name) + `1`; i++) {
159	for(int j = `1`; j <= (int)wcslen(s: wide_name) + `1`; j++) {
160	try {
161	test_codecvt_in_n_m(cvt, n: i, m: j);
162	test_codecvt_out_n_m(cvt, n: i, m: j);
163	} catch(...) { // LCOV_EXCL_LINE
164	std::cerr << "Wlen=" << j << " Nlen=" << i << std::endl; // LCOV_EXCL_LINE
165	throw; // LCOV_EXCL_LINE
166	}
167	}
168	}
169	}
170
171	void test_codecvt_err()
172	{
173	std::cout << "Errors " << std::endl;
174	std::locale l(std::locale::classic(), new boost::locale::utf8_codecvt<wchar_t>());
175
176	const cvt_type& cvt = std::use_facet<cvt_type>(loc: l);
177
178	std::cout << "- UTF-8" << std::endl;
179	{
180	wchar_t buf[`2`];
181	wchar_t* to = buf;
182	wchar_t* to_end = buf + `2`;
183	wchar_t* to_next = to;
184	const char* err_utf = "1\xFF\xFF";
185	{
186	std::mbstate_t mb{};
187	const char* from = err_utf;
188	const char* from_end = from + strlen(s: from);
189	const char* from_next = from;
190	to_next = to;
191	TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error);
192	TEST(from_next == from + `1`);
193	TEST(to_next == to + `1`);
194	TEST_EQ(*to, `'1'`);
195	}
196	err_utf++;
197	{
198	std::mbstate_t mb{};
199	const char* from = err_utf;
200	const char* from_end = from + strlen(s: from);
201	const char* from_next = from;
202	TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error);
203	TEST(from_next == from);
204	TEST(to_next == to);
205	}
206	}
207	std::cout << "- Trailing UTF-16 surrogate" << std::endl;
208	{
209	char buf[`4`] = {};
210	char* const to = buf;
211	char* const to_end = buf + `4`;
212	char* to_next = to;
213	const wchar_t* err_utf = L"\xD800"; // Trailing UTF-16 surrogate
214	std::mbstate_t mb{};
215	const wchar_t* from = err_utf;
216	const wchar_t* from_end = from + `1`;
217	const wchar_t* from_next = from;
218	cvt_type::result res = cvt.out(state&: mb, from: from, from_end: from_end, from_next&: from_next, to: to, to_end: to_end, to_next&: to_next);
219	BOOST_LOCALE_START_CONST_CONDITION
220	if(sizeof(wchar_t) == `2`) {
221	BOOST_LOCALE_END_CONST_CONDITION
222	TEST(res == cvt_type::partial);
223	TEST(from_next == from_end);
224	TEST(to_next == to);
225	TEST(buf[`0`] == `0`);
226	} else {
227	// surrogate is invalid
228	TEST(res == cvt_type::error);
229	TEST(from_next == from);
230	TEST(to_next == to);
231	}
232	}
233
234	std::cout << "- UTF-16/32" << std::endl;
235	{
236	char buf[`32`];
237	char* to = buf;
238	char* to_end = buf + `32`;
239	char* to_next = to;
240	wchar_t err_buf[`3`] = {`'1'`, `0xDC9E`, `0`}; // second value is invalid for UTF-16 and 32
241	const wchar_t* err_utf = err_buf;
242	{
243	std::mbstate_t mb{};
244	const wchar_t* from = err_utf;
245	const wchar_t* from_end = from + wcslen(s: from);
246	const wchar_t* from_next = from;
247	TEST_EQ(cvt.out(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error);
248	TEST(from_next == from + `1`);
249	TEST(to_next == to + `1`);
250	TEST_EQ(*to, `'1'`);
251	}
252	err_utf++;
253	{
254	std::mbstate_t mb{};
255	const wchar_t* from = err_utf;
256	const wchar_t* from_end = from + wcslen(s: from);
257	const wchar_t* from_next = from;
258	to_next = to;
259	TEST_EQ(cvt.out(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::error);
260	TEST(from_next == from);
261	TEST(to_next == to);
262	}
263	}
264	}
265
266	void test_char_char()
267	{
268	std::cout << "Char-char specialization" << std::endl;
269	std::locale l(std::locale::classic(), new boost::locale::utf8_codecvt<char>());
270	const std::codecvt<char, char, std::mbstate_t>& cvt = std::use_facet<std::codecvt<char, char, std::mbstate_t>>(loc: l);
271	std::mbstate_t mb{};
272	const char* from = "a";
273	const char* from_end = from + `1`;
274	const char* from_next = from;
275	char buf[`2`];
276	char* to = buf;
277	char* to_end = buf + `1`;
278	char* to_next = to;
279	TEST(cvt.always_noconv());
280	TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::noconv);
281	TEST(from_next == from);
282	TEST(to_next == to);
283	TEST_EQ(cvt.out(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::noconv);
284	TEST(from_next == from);
285	TEST(to_next == to);
286	TEST_EQ(cvt.encoding(), `1`);
287	TEST_EQ(cvt.max_length(), `1`);
288	}
289
290	void test_codecvt_fallback()
291	{
292	std::locale l =
293	boost::locale::util::create_codecvt(in: std::locale::classic(), cvt: nullptr, type: boost::locale::char_facet_t::wchar_f);
294	const cvt_type& cvt = std::use_facet<cvt_type>(loc: l);
295
296	std::mbstate_t mb{};
297	// Fallback converter can convert ASCII
298	const char from[] = "abyzAZ!?019";
299	const char* from_end = std::end(arr: from);
300	const char* from_next = from;
301	wchar_t buf[sizeof(from)]{};
302	wchar_t* to = buf;
303	wchar_t* const to_end = std::end(arr&: buf);
304	wchar_t* to_next = to;
305
306	TEST(!cvt.always_noconv());
307	TEST_EQ(cvt.encoding(), `0`);
308	TEST_EQ(cvt.max_length(), `1`);
309
310	TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::ok);
311	TEST(from_next == from_end);
312	TEST(to_next == to_end);
313	TEST_EQ(buf, ascii_to<wchar_t>(from));
314
315	char buf2[sizeof(from)]{};
316	char* to2 = buf2;
317	char* const to_end2 = std::end(arr&: buf2);
318	char* to_next2 = to2;
319	const wchar_t* to_next_wide = to;
320
321	TEST_EQ(cvt.out(mb, to, to_end, to_next_wide, to2, to_end2, to_next2), cvt_type::ok);
322	TEST(to_next_wide == to_end);
323	TEST(to_next2 == to_end2);
324	TEST_EQ(buf2, ascii_to<char>(from));
325
326	// Non-ASCII is an error
327	*to = L`'\x81'`;
328	to_next_wide = to;
329	to_next2 = to2;
330	TEST_EQ(cvt.out(mb, to, to_end, to_next_wide, to2, to_end2, to_next2), cvt_type::error);
331	TEST(to_next_wide == to);
332	TEST(to_next2 == to2);
333
334	const char from_invalid[] = "\x80";
335	from_end = std::end(arr: from_invalid);
336	from_next = from_invalid;
337	to = buf;
338	to_next = to;
339	TEST_EQ(cvt.in(mb, from_invalid, from_end, from_next, to, to_end, to_next), cvt_type::error);
340	TEST(from_next == from_invalid);
341	TEST(to_next == to);
342	}
343
344	void test_main(int /argc/, char** /argv/)
345	{
346	test_codecvt_conv();
347	test_codecvt_err();
348	test_char_char();
349	test_codecvt_fallback();
350	}
351

source code of boost/libs/locale/test/test_codecvt.cpp