bug-iconv12.c source code [glibc/iconvdata/bug-iconv12.c]

/* bug 19727: Testing UTF conversions with UTF16 surrogates as input.
   Copyright (C) 2016-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19	#include <stdio.h>
20	#include <stdlib.h>
21	#include <errno.h>
22	#include <string.h>
23	#include <inttypes.h>
24	#include <iconv.h>
25	#include <byteswap.h>
26
/* Convert INBUFLEN bytes starting at INBUF from charset FROM to charset
   TO with a single iconv call and compare the outcome with EXP_ERRNO.

   If EXP_ERRNO is 0 the call must not return (size_t) -1 and must leave
   errno at 0.  Otherwise the call must return (size_t) -1 and set errno
   to EXP_ERRNO.  LINE is only used in diagnostics to identify the
   caller.

   Returns the number of failed checks (0 means everything matched), or
   1 if the conversion descriptor could not be opened.  Diagnostics are
   written to stdout.  */
static int
run_conversion (const char *from, const char *to, char *inbuf, size_t inbuflen,
		int exp_errno, int line)
{
  char outbuf[16];
  iconv_t cd;
  char *inptr;
  size_t inlen;
  char *outptr;
  size_t outlen;
  size_t n;
  int e;
  int fails = 0;

  cd = iconv_open (to, from);
  if (cd == (iconv_t) -1)
    {
      printf ("line %d: cannot convert from %s to %s: %m\n", line, from, to);
      return 1;
    }

  inptr = inbuf;
  inlen = inbuflen;
  outptr = outbuf;
  outlen = sizeof (outbuf);

  /* Clear errno first so that a stale value is not mistaken for a
     result of the conversion, and save it right after the call before
     any other library function can overwrite it.  */
  errno = 0;
  n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
  e = errno;

  if (exp_errno == 0)
    {
      if (n == (size_t) -1)
	{
	  puts ("n should be >= 0, but n == -1");
	  fails++;
	}

      if (e != 0)
	{
	  printf ("errno should be 0: 'Success', but errno == %d: '%s'\n",
		  e, strerror (e));
	  fails++;
	}
    }
  else
    {
      if (n != (size_t) -1)
	{
	  /* N is a size_t, so it has to be printed with %zu.  */
	  printf ("n should be -1, but n == %zu\n", n);
	  fails++;
	}

      if (e != exp_errno)
	{
	  printf ("errno should be %d: '%s', but errno == %d: '%s'\n",
		  exp_errno, strerror (exp_errno), e, strerror (e));
	  fails++;
	}
    }

  iconv_close (cd);

  if (fails > 0)
    {
      printf ("Errors in line %d while converting %s to %s.\n\n",
	      line, from, to);
    }

  return fails;
}
98
/* Run run_conversion on code points at and around the UTF-16 surrogate
   range, supplied as UCS4/UTF-32, UTF-16 and UTF-8 input, and count the
   failed checks.  Returns EXIT_SUCCESS if no check failed and
   EXIT_FAILURE otherwise.  */
static int
do_test (void)
{
  int fails = 0;
  char buf[4];

  /* This test runs iconv() with UTF character in range of an UTF16 surrogate.
     UTF-16 high surrogate is in range 0xD800..0xDBFF and
     UTF-16 low surrogate is in range 0xDC00..0xDFFF.
     Converting from or to UTF-xx has to report errors in those cases.
     In UTF-16, surrogate pairs with a high surrogate in front of a low
     surrogate is valid.  */

  /* Use RUN_UCS4_UTF32_INPUT to test conversion ...

     ... from INTERNAL to UTF-xx[LE|BE]:
     Converting from UCS4 to UTF-xx[LE|BE] first converts UCS4 to INTERNAL
     without checking for UTF-16 surrogate values
     and then converts from INTERNAL to UTF-xx[LE|BE].
     The latter conversion has to report an error in those cases.

     ... from UTF-32[LE|BE] to INTERNAL:
     Converting directly from UTF-32LE to UTF-8|16 is needed,
     because e.g. s390x has iconv-modules which converts directly.

     B0..B3 are the bytes in big-endian order; the macro stores them
     reversed for the UTF-32LE runs.  The body is wrapped in
     do { } while (0) so that an invocation is a single statement.  */
#define RUN_UCS4_UTF32_INPUT(b0, b1, b2, b3, err, line) \
  do \
    { \
      buf[0] = (char) (b0); \
      buf[1] = (char) (b1); \
      buf[2] = (char) (b2); \
      buf[3] = (char) (b3); \
      fails += run_conversion ("UCS4", "UTF-8", buf, 4, (err), (line)); \
      fails += run_conversion ("UCS4", "UTF-16LE", buf, 4, (err), (line)); \
      fails += run_conversion ("UCS4", "UTF-16BE", buf, 4, (err), (line)); \
      fails += run_conversion ("UCS4", "UTF-32LE", buf, 4, (err), (line)); \
      fails += run_conversion ("UCS4", "UTF-32BE", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32BE", "WCHAR_T", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32BE", "UTF-8", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32BE", "UTF-16LE", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32BE", "UTF-16BE", buf, 4, (err), (line)); \
      buf[0] = (char) (b3); \
      buf[1] = (char) (b2); \
      buf[2] = (char) (b1); \
      buf[3] = (char) (b0); \
      fails += run_conversion ("UTF-32LE", "WCHAR_T", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32LE", "UTF-8", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32LE", "UTF-16LE", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32LE", "UTF-16BE", buf, 4, (err), (line)); \
    } \
  while (0)

  /* Use UCS4/UTF32 input of 0xD7FF.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xD7, 0xFF, 0, __LINE__);

  /* Use UCS4/UTF32 input of 0xD800.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xD8, 0x00, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xDBFF.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDB, 0xFF, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xDC00.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDC, 0x00, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xDFFF.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDF, 0xFF, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xE000.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xE0, 0x00, 0, __LINE__);


  /* Use RUN_UTF16_INPUT to test conversion from UTF16[LE|BE] to INTERNAL.
     Converting directly from UTF-16 to UTF-8|32 is needed,
     because e.g. s390x has iconv-modules which converts directly.
     Use len == 2 or 4 to specify one or two UTF-16 characters.

     B0..B3 are the bytes in big-endian order; the macro swaps the bytes
     of each 16-bit unit for the UTF-16LE runs.  */
#define RUN_UTF16_INPUT(b0, b1, b2, b3, len, err, line) \
  do \
    { \
      buf[0] = (char) (b0); \
      buf[1] = (char) (b1); \
      buf[2] = (char) (b2); \
      buf[3] = (char) (b3); \
      fails += run_conversion ("UTF-16BE", "WCHAR_T", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16BE", "UTF-8", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16BE", "UTF-32LE", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16BE", "UTF-32BE", buf, (len), (err), (line)); \
      buf[0] = (char) (b1); \
      buf[1] = (char) (b0); \
      buf[2] = (char) (b3); \
      buf[3] = (char) (b2); \
      fails += run_conversion ("UTF-16LE", "WCHAR_T", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16LE", "UTF-8", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16LE", "UTF-32LE", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16LE", "UTF-32BE", buf, (len), (err), (line)); \
    } \
  while (0)

  /* Use UTF16 input of 0xD7FF.  */
  RUN_UTF16_INPUT (0xD7, 0xFF, 0xD7, 0xFF, 4, 0, __LINE__);

  /* Use [single] UTF16 high surrogate 0xD800 [with a valid character behind].
     And check an UTF16 surrogate pair [without valid low surrogate].  */
  RUN_UTF16_INPUT (0xD8, 0x0, 0x0, 0x0, 2, EINVAL, __LINE__);
  RUN_UTF16_INPUT (0xD8, 0x0, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xD8, 0x0, 0xD8, 0x0, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xD8, 0x0, 0xE0, 0x0, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xD8, 0x0, 0xDC, 0x0, 4, 0, __LINE__);

  /* Use [single] UTF16 high surrogate 0xDBFF [with a valid character behind].
     And check an UTF16 surrogate pair [without valid low surrogate].  */
  RUN_UTF16_INPUT (0xDB, 0xFF, 0x0, 0x0, 2, EINVAL, __LINE__);
  RUN_UTF16_INPUT (0xDB, 0xFF, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDB, 0xFF, 0xDB, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDB, 0xFF, 0xE0, 0x0, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDB, 0xFF, 0xDF, 0xFF, 4, 0, __LINE__);

  /* Use single UTF16 low surrogate 0xDC00 [with a valid character behind].
     And check an UTF16 surrogate pair [without valid high surrogate].  */
  RUN_UTF16_INPUT (0xDC, 0x0, 0x0, 0x0, 2, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDC, 0x0, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xD8, 0x0, 0xDC, 0x0, 4, 0, __LINE__);
  RUN_UTF16_INPUT (0xD7, 0xFF, 0xDC, 0x0, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDC, 0x0, 0xDC, 0x0, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xE0, 0x0, 0xDC, 0x0, 4, EILSEQ, __LINE__);

  /* Use single UTF16 low surrogate 0xDFFF [with a valid character behind].
     And check an UTF16 surrogate pair [without valid high surrogate].  */
  RUN_UTF16_INPUT (0xDF, 0xFF, 0x0, 0x0, 2, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDF, 0xFF, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDB, 0xFF, 0xDF, 0xFF, 4, 0, __LINE__);
  RUN_UTF16_INPUT (0xD7, 0xFF, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDF, 0xFF, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xE0, 0x0, 0xDF, 0xFF, 4, EILSEQ, __LINE__);

  /* Use UTF16 input of 0xE000.  */
  RUN_UTF16_INPUT (0xE0, 0x0, 0xE0, 0x0, 4, 0, __LINE__);


  /* Use RUN_UTF8_3BYTE_INPUT to test conversion from UTF-8 to INTERNAL.
     Converting directly from UTF-8 to UTF-16|32 is needed,
     because e.g. s390x has iconv-modules which converts directly.  */
#define RUN_UTF8_3BYTE_INPUT(b0, b1, b2, err, line) \
  do \
    { \
      buf[0] = (char) (b0); \
      buf[1] = (char) (b1); \
      buf[2] = (char) (b2); \
      fails += run_conversion ("UTF-8", "WCHAR_T", buf, 3, (err), (line)); \
      fails += run_conversion ("UTF-8", "UTF-16LE", buf, 3, (err), (line)); \
      fails += run_conversion ("UTF-8", "UTF-16BE", buf, 3, (err), (line)); \
      fails += run_conversion ("UTF-8", "UTF-32LE", buf, 3, (err), (line)); \
      fails += run_conversion ("UTF-8", "UTF-32BE", buf, 3, (err), (line)); \
    } \
  while (0)

  /* Use UTF-8 input of 0xD7FF.  */
  RUN_UTF8_3BYTE_INPUT (0xED, 0x9F, 0xBF, 0, __LINE__);

  /* Use UTF-8 input of 0xD800.  */
  RUN_UTF8_3BYTE_INPUT (0xED, 0xA0, 0x80, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xDBFF.  */
  RUN_UTF8_3BYTE_INPUT (0xED, 0xAF, 0xBF, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xDC00.  */
  RUN_UTF8_3BYTE_INPUT (0xED, 0xB0, 0x80, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xDFFF.  */
  RUN_UTF8_3BYTE_INPUT (0xED, 0xBF, 0xBF, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xF000.  */
  RUN_UTF8_3BYTE_INPUT (0xEF, 0x80, 0x80, 0, __LINE__);

  return fails > 0 ? EXIT_FAILURE : EXIT_SUCCESS;
}
261
262	#define TEST_FUNCTION do_test ()
263	#include "../test-skeleton.c"
264

source code of glibc/iconvdata/bug-iconv12.c