unicode-encoding.c source code [gtk/subprojects/glib/tests/unicode-encoding.c]

1	#undef G_DISABLE_ASSERT
2	#undef G_LOG_DOMAIN
3
4	#include <stdarg.h>
5	#include <stdio.h>
6	#include <stdlib.h>
7	#include <string.h>
8	#include <glib.h>
9
10	static gint exit_status = `0`;
11
12	G_GNUC_PRINTF (`1`, `2`)
13	static void
14	croak (char *format, ...)
15	{
16	va_list va;
17
18	va_start (va, format);
19	vfprintf (stderr, format: format, arg: va);
20	va_end (va);
21
22	exit (status: `1`);
23	}
24
25	G_GNUC_PRINTF (`1`, `2`)
26	static void
27	fail (char *format, ...)
28	{
29	va_list va;
30
31	va_start (va, format);
32	vfprintf (stderr, format: format, arg: va);
33	va_end (va);
34
35	exit_status \|= `1`;
36	}
37
/* Expected classification of one UTF-8 sample read from the data file.
 * The value decides which checks process() runs on the sample.
 */
typedef enum
{
  VALID,       /* must pass g_utf8_validate(); all conversions are round-tripped */
  INCOMPLETE,  /* must fail validation; conversion must report partial input */
  NOTUNICODE,  /* must fail validation; conversion to UCS-4 is still compared */
  OVERLONG,    /* must fail validation; no conversion checks, no UCS-4 data */
  MALFORMED    /* must fail validation; no conversion checks, no UCS-4 data */
} Status;
46
47	static gboolean
48	ucs4_equal (gunichar a, gunichar b)
49	{
50	while (a && b && (a == b))
51	{
52	a++;
53	b++;
54	}
55
56	return (a == b);
57	}
58
59	static gboolean
60	utf16_equal (gunichar2 a, gunichar2 b)
61	{
62	while (a && b && (a == b))
63	{
64	a++;
65	b++;
66	}
67
68	return (a == b);
69	}
70
71	static gint
72	utf16_count (gunichar2 *a)
73	{
74	gint result = `0`;
75
76	while (a[result])
77	result++;
78
79	return result;
80	}
81
82	static void
83	print_ucs4 (const gchar prefix, gunichar ucs4, gint ucs4_len)
84	{
85	gint i;
86	g_print (format: "%s ", prefix);
87	for (i = `0`; i < ucs4_len; i++)
88	g_print (format: "%x ", ucs4[i]);
89	g_print (format: "\n");
90	}
91
92	static void
93	process (gint line,
94	gchar *utf8,
95	Status status,
96	gunichar *ucs4,
97	gint ucs4_len)
98	{
99	const gchar *end;
100	gboolean is_valid = g_utf8_validate (str: utf8, max_len: -`1`, end: &end);
101	GError *error = NULL;
102	glong items_read, items_written;
103
104	switch (status)
105	{
106	case VALID:
107	if (!is_valid)
108	{
109	fail (format: "line %d: valid but g_utf8_validate returned FALSE\n", line);
110	return;
111	}
112	break;
113	case NOTUNICODE:
114	case INCOMPLETE:
115	case OVERLONG:
116	case MALFORMED:
117	if (is_valid)
118	{
119	fail (format: "line %d: invalid but g_utf8_validate returned TRUE\n", line);
120	return;
121	}
122	break;
123	}
124
125	if (status == INCOMPLETE)
126	{
127	gunichar *ucs4_result;
128
129	ucs4_result = g_utf8_to_ucs4 (str: utf8, len: -`1`, NULL, NULL, error: &error);
130
131	if (!error \|\| !g_error_matches (error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_PARTIAL_INPUT))
132	{
133	fail (format: "line %d: incomplete input not properly detected\n", line);
134	return;
135	}
136	g_clear_error (err: &error);
137
138	ucs4_result = g_utf8_to_ucs4 (str: utf8, len: -`1`, items_read: &items_read, NULL, error: &error);
139
140	if (!ucs4_result \|\| items_read == strlen (s: utf8))
141	{
142	fail (format: "line %d: incomplete input not properly detected\n", line);
143	return;
144	}
145
146	g_free (mem: ucs4_result);
147	}
148
149	if (status == VALID \|\| status == NOTUNICODE)
150	{
151	gunichar *ucs4_result;
152
153	ucs4_result = g_utf8_to_ucs4 (str: utf8, len: -`1`, items_read: &items_read, items_written: &items_written, error: &error);
154	if (!ucs4_result)
155	{
156	fail (format: "line %d: conversion with status %d to ucs4 failed: %s\n", line, status, error->message);
157	return;
158	}
159
160	if (!ucs4_equal (a: ucs4_result, b: ucs4) \|\|
161	items_read != strlen (s: utf8) \|\|
162	items_written != ucs4_len)
163	{
164	fail (format: "line %d: results of conversion with status %d to ucs4 do not match expected.\n", line, status);
165	print_ucs4 (prefix: "expected: ", ucs4, ucs4_len);
166	print_ucs4 (prefix: "received: ", ucs4: ucs4_result, ucs4_len: items_written);
167	return;
168	}
169
170	g_free (mem: ucs4_result);
171	}
172
173	if (status == VALID)
174	{
175	gunichar *ucs4_result;
176	gchar *utf8_result;
177
178	ucs4_result = g_utf8_to_ucs4_fast (str: utf8, len: -`1`, items_written: &items_written);
179
180	if (!ucs4_equal (a: ucs4_result, b: ucs4) \|\|
181	items_written != ucs4_len)
182	{
183	fail (format: "line %d: results of fast conversion with status %d to ucs4 do not match expected.\n", line, status);
184	print_ucs4 (prefix: "expected: ", ucs4, ucs4_len);
185	print_ucs4 (prefix: "received: ", ucs4: ucs4_result, ucs4_len: items_written);
186	return;
187	}
188
189	utf8_result = g_ucs4_to_utf8 (str: ucs4_result, len: -`1`, items_read: &items_read, items_written: &items_written, error: &error);
190	if (!utf8_result)
191	{
192	fail (format: "line %d: conversion back to utf8 failed: %s", line, error->message);
193	return;
194	}
195
196	if (strcmp (s1: utf8_result, s2: utf8) != `0` \|\|
197	items_read != ucs4_len \|\|
198	items_written != strlen (s: utf8))
199	{
200	fail (format: "line %d: conversion back to utf8 did not match original\n", line);
201	return;
202	}
203
204	g_free (mem: utf8_result);
205	g_free (mem: ucs4_result);
206	}
207
208	if (status == VALID)
209	{
210	gunichar2 *utf16_expected_tmp;
211	gunichar2 *utf16_expected;
212	gunichar2 *utf16_from_utf8;
213	gunichar2 *utf16_from_ucs4;
214	gunichar *ucs4_result;
215	gsize bytes_written;
216	gint n_chars;
217	gchar *utf8_result;
218
219	#if G_BYTE_ORDER == G_LITTLE_ENDIAN
220	#define TARGET "UTF-16LE"
221	#else
222	#define TARGET "UTF-16"
223	#endif
224
225	if (!(utf16_expected_tmp = (gunichar2 *)g_convert (str: utf8, len: -`1`, TARGET, from_codeset: "UTF-8",
226	NULL, bytes_written: &bytes_written, NULL)))
227	{
228	fail (format: "line %d: could not convert to UTF-16 via g_convert\n", line);
229	return;
230	}
231
232	/ zero-terminate and remove BOM*
233	*/
234	n_chars = bytes_written / `2`;
235	if (utf16_expected_tmp[`0`] == `0xfeff`) / BOM /
236	{
237	n_chars--;
238	utf16_expected = g_new (gunichar2, n_chars + `1`);
239	memcpy (dest: utf16_expected, src: utf16_expected_tmp + `1`, n: sizeof(gunichar2) * n_chars);
240	}
241	else if (utf16_expected_tmp[`0`] == `0xfffe`) / ANTI-BOM /
242	{
243	fail (format: "line %d: conversion via iconv to \"UTF-16\" is not native-endian\n", line);
244	return;
245	}
246	else
247	{
248	utf16_expected = g_new (gunichar2, n_chars + `1`);
249	memcpy (dest: utf16_expected, src: utf16_expected_tmp, n: sizeof(gunichar2) * n_chars);
250	}
251
252	utf16_expected[n_chars] = `'\0'`;
253
254	if (!(utf16_from_utf8 = g_utf8_to_utf16 (str: utf8, len: -`1`, items_read: &items_read, items_written: &items_written, error: &error)))
255	{
256	fail (format: "line %d: conversion to ucs16 failed: %s\n", line, error->message);
257	return;
258	}
259
260	if (items_read != strlen (s: utf8) \|\|
261	utf16_count (a: utf16_from_utf8) != items_written)
262	{
263	fail (format: "line %d: length error in conversion to ucs16\n", line);
264	return;
265	}
266
267	if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (str: ucs4, len: -`1`, items_read: &items_read, items_written: &items_written, error: &error)))
268	{
269	fail (format: "line %d: conversion to ucs16 failed: %s\n", line, error->message);
270	return;
271	}
272
273	if (items_read != ucs4_len \|\|
274	utf16_count (a: utf16_from_ucs4) != items_written)
275	{
276	fail (format: "line %d: length error in conversion to ucs16\n", line);
277	return;
278	}
279
280	if (!utf16_equal (a: utf16_from_utf8, b: utf16_expected) \|\|
281	!utf16_equal (a: utf16_from_ucs4, b: utf16_expected))
282	{
283	fail (format: "line %d: results of conversion to ucs16 do not match\n", line);
284	return;
285	}
286
287	if (!(utf8_result = g_utf16_to_utf8 (str: utf16_from_utf8, len: -`1`, items_read: &items_read, items_written: &items_written, error: &error)))
288	{
289	fail (format: "line %d: conversion back to utf8 failed: %s\n", line, error->message);
290	return;
291	}
292
293	if (items_read != utf16_count (a: utf16_from_utf8) \|\|
294	items_written != strlen (s: utf8))
295	{
296	fail (format: "line %d: length error in conversion from ucs16 to utf8\n", line);
297	return;
298	}
299
300	if (!(ucs4_result = g_utf16_to_ucs4 (str: utf16_from_ucs4, len: -`1`, items_read: &items_read, items_written: &items_written, error: &error)))
301	{
302	fail (format: "line %d: conversion back to utf8/ucs4 failed\n", line);
303	return;
304	}
305
306	if (items_read != utf16_count (a: utf16_from_utf8) \|\|
307	items_written != ucs4_len)
308	{
309	fail (format: "line %d: length error in conversion from ucs16 to ucs4\n", line);
310	return;
311	}
312
313	if (strcmp (s1: utf8, s2: utf8_result) != `0` \|\|
314	!ucs4_equal (a: ucs4, b: ucs4_result))
315	{
316	fail (format: "line %d: conversion back to utf8/ucs4 did not match original\n", line);
317	return;
318	}
319
320	g_free (mem: utf16_expected_tmp);
321	g_free (mem: utf16_expected);
322	g_free (mem: utf16_from_utf8);
323	g_free (mem: utf16_from_ucs4);
324	g_free (mem: utf8_result);
325	g_free (mem: ucs4_result);
326	}
327	}
328
329	int
330	main (int argc, char **argv)
331	{
332	gchar *testfile;
333	gchar *contents;
334	GError *error = NULL;
335	gchar p, end;
336	char *tmp;
337	gint state = `0`;
338	gint line = `1`;
339	gint start_line = `0`; / Quiet GCC /
340	gchar utf8 = NULL; /* Quiet GCC /
341	GArray *ucs4;
342	Status status = VALID; / Quiet GCC /
343
344	g_test_init (argc: &argc, argv: &argv, NULL);
345
346	testfile = g_test_build_filename (file_type: G_TEST_DIST, first_path: "utf8.txt", NULL);
347
348	g_file_get_contents (filename: testfile, contents: &contents, NULL, error: &error);
349	if (error)
350	croak (format: "Cannot open utf8.txt: %s", error->message);
351
352	ucs4 = g_array_new (TRUE, FALSE, element_size: sizeof(gunichar));
353
354	p = contents;
355
356	/ Loop over lines /
357	while (*p)
358	{
359	while (p && (p == `' '` \|\| *p == `'\t'`))
360	p++;
361
362	end = p;
363	while (end && (end != `'\r'` && *end != `'\n'`))
364	end++;
365
366	if (!p \|\| p == `'#'` \|\| p == `'\r'` \|\| p == `'\n'`)
367	goto next_line;
368
369	tmp = g_strstrip (g_strndup (p, end - p));
370
371	switch (state)
372	{
373	case `0`:
374	/ UTF-8 string /
375	start_line = line;
376	utf8 = tmp;
377	tmp = NULL;
378	break;
379
380	case `1`:
381	/ Status /
382	if (!strcmp (s1: tmp, s2: "VALID"))
383	status = VALID;
384	else if (!strcmp (s1: tmp, s2: "INCOMPLETE"))
385	status = INCOMPLETE;
386	else if (!strcmp (s1: tmp, s2: "NOTUNICODE"))
387	status = NOTUNICODE;
388	else if (!strcmp (s1: tmp, s2: "OVERLONG"))
389	status = OVERLONG;
390	else if (!strcmp (s1: tmp, s2: "MALFORMED"))
391	status = MALFORMED;
392	else
393	croak (format: "Invalid status on line %d\n", line);
394
395	if (status != VALID && status != NOTUNICODE)
396	state++; / No UCS-4 data /
397
398	break;
399
400	case `2`:
401	/ UCS-4 version /
402
403	p = strtok (s: tmp, delim: " \t");
404	while (p)
405	{
406	gchar *endptr;
407
408	gunichar ch = strtoul (nptr: p, endptr: &endptr, base: `16`);
409	if (*endptr != `'\0'`)
410	croak (format: "Invalid UCS-4 character on line %d\n", line);
411
412	g_array_append_val (ucs4, ch);
413
414	p = strtok (NULL, delim: " \t");
415	}
416
417	break;
418	}
419
420	g_free (mem: tmp);
421	state = (state + `1`) % `3`;
422
423	if (state == `0`)
424	{
425	process (line: start_line, utf8, status, ucs4: (gunichar *)ucs4->data, ucs4_len: ucs4->len);
426	g_array_set_size (array: ucs4, length: `0`);
427	g_free (mem: utf8);
428	}
429
430	next_line:
431	p = end;
432	if (p && p == `'\r'`)
433	p++;
434	if (p && p == `'\n'`)
435	p++;
436
437	line++;
438	}
439
440	g_free (mem: testfile);
441	g_array_free (array: ucs4, TRUE);
442	g_free (mem: contents);
443	return exit_status;
444	}
445

source code of gtk/subprojects/glib/tests/unicode-encoding.c