1#undef G_DISABLE_ASSERT
2#undef G_LOG_DOMAIN
3
4#include <stdarg.h>
5#include <stdio.h>
6#include <stdlib.h>
7#include <string.h>
8#include <glib.h>
9
10static gint exit_status = 0;
11
12G_GNUC_PRINTF (1, 2)
13static void
14croak (char *format, ...)
15{
16 va_list va;
17
18 va_start (va, format);
19 vfprintf (stderr, format: format, arg: va);
20 va_end (va);
21
22 exit (status: 1);
23}
24
25G_GNUC_PRINTF (1, 2)
26static void
27fail (char *format, ...)
28{
29 va_list va;
30
31 va_start (va, format);
32 vfprintf (stderr, format: format, arg: va);
33 va_end (va);
34
35 exit_status |= 1;
36}
37
/*
 * Status:
 *
 * Expected classification of one UTF-8 sample, as given by the status
 * keyword of its record in the data file.  The numeric values are printed
 * with %d in some failure messages, so they are spelled out explicitly.
 */
typedef enum
{
  VALID      = 0, /* must validate; every conversion round trip is checked */
  INCOMPLETE = 1, /* must not validate; conversion must report partial input */
  NOTUNICODE = 2, /* must not validate, but UTF-8 -> UCS-4 is still compared */
  OVERLONG   = 3, /* must not validate; no conversion checks */
  MALFORMED  = 4  /* must not validate; no conversion checks */
} Status;
46
47static gboolean
48ucs4_equal (gunichar *a, gunichar *b)
49{
50 while (*a && *b && (*a == *b))
51 {
52 a++;
53 b++;
54 }
55
56 return (*a == *b);
57}
58
59static gboolean
60utf16_equal (gunichar2 *a, gunichar2 *b)
61{
62 while (*a && *b && (*a == *b))
63 {
64 a++;
65 b++;
66 }
67
68 return (*a == *b);
69}
70
71static gint
72utf16_count (gunichar2 *a)
73{
74 gint result = 0;
75
76 while (a[result])
77 result++;
78
79 return result;
80}
81
82static void
83print_ucs4 (const gchar *prefix, gunichar *ucs4, gint ucs4_len)
84{
85 gint i;
86 g_print (format: "%s ", prefix);
87 for (i = 0; i < ucs4_len; i++)
88 g_print (format: "%x ", ucs4[i]);
89 g_print (format: "\n");
90}
91
92static void
93process (gint line,
94 gchar *utf8,
95 Status status,
96 gunichar *ucs4,
97 gint ucs4_len)
98{
99 const gchar *end;
100 gboolean is_valid = g_utf8_validate (str: utf8, max_len: -1, end: &end);
101 GError *error = NULL;
102 glong items_read, items_written;
103
104 switch (status)
105 {
106 case VALID:
107 if (!is_valid)
108 {
109 fail (format: "line %d: valid but g_utf8_validate returned FALSE\n", line);
110 return;
111 }
112 break;
113 case NOTUNICODE:
114 case INCOMPLETE:
115 case OVERLONG:
116 case MALFORMED:
117 if (is_valid)
118 {
119 fail (format: "line %d: invalid but g_utf8_validate returned TRUE\n", line);
120 return;
121 }
122 break;
123 }
124
125 if (status == INCOMPLETE)
126 {
127 gunichar *ucs4_result;
128
129 ucs4_result = g_utf8_to_ucs4 (str: utf8, len: -1, NULL, NULL, error: &error);
130
131 if (!error || !g_error_matches (error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_PARTIAL_INPUT))
132 {
133 fail (format: "line %d: incomplete input not properly detected\n", line);
134 return;
135 }
136 g_clear_error (err: &error);
137
138 ucs4_result = g_utf8_to_ucs4 (str: utf8, len: -1, items_read: &items_read, NULL, error: &error);
139
140 if (!ucs4_result || items_read == strlen (s: utf8))
141 {
142 fail (format: "line %d: incomplete input not properly detected\n", line);
143 return;
144 }
145
146 g_free (mem: ucs4_result);
147 }
148
149 if (status == VALID || status == NOTUNICODE)
150 {
151 gunichar *ucs4_result;
152
153 ucs4_result = g_utf8_to_ucs4 (str: utf8, len: -1, items_read: &items_read, items_written: &items_written, error: &error);
154 if (!ucs4_result)
155 {
156 fail (format: "line %d: conversion with status %d to ucs4 failed: %s\n", line, status, error->message);
157 return;
158 }
159
160 if (!ucs4_equal (a: ucs4_result, b: ucs4) ||
161 items_read != strlen (s: utf8) ||
162 items_written != ucs4_len)
163 {
164 fail (format: "line %d: results of conversion with status %d to ucs4 do not match expected.\n", line, status);
165 print_ucs4 (prefix: "expected: ", ucs4, ucs4_len);
166 print_ucs4 (prefix: "received: ", ucs4: ucs4_result, ucs4_len: items_written);
167 return;
168 }
169
170 g_free (mem: ucs4_result);
171 }
172
173 if (status == VALID)
174 {
175 gunichar *ucs4_result;
176 gchar *utf8_result;
177
178 ucs4_result = g_utf8_to_ucs4_fast (str: utf8, len: -1, items_written: &items_written);
179
180 if (!ucs4_equal (a: ucs4_result, b: ucs4) ||
181 items_written != ucs4_len)
182 {
183 fail (format: "line %d: results of fast conversion with status %d to ucs4 do not match expected.\n", line, status);
184 print_ucs4 (prefix: "expected: ", ucs4, ucs4_len);
185 print_ucs4 (prefix: "received: ", ucs4: ucs4_result, ucs4_len: items_written);
186 return;
187 }
188
189 utf8_result = g_ucs4_to_utf8 (str: ucs4_result, len: -1, items_read: &items_read, items_written: &items_written, error: &error);
190 if (!utf8_result)
191 {
192 fail (format: "line %d: conversion back to utf8 failed: %s", line, error->message);
193 return;
194 }
195
196 if (strcmp (s1: utf8_result, s2: utf8) != 0 ||
197 items_read != ucs4_len ||
198 items_written != strlen (s: utf8))
199 {
200 fail (format: "line %d: conversion back to utf8 did not match original\n", line);
201 return;
202 }
203
204 g_free (mem: utf8_result);
205 g_free (mem: ucs4_result);
206 }
207
208 if (status == VALID)
209 {
210 gunichar2 *utf16_expected_tmp;
211 gunichar2 *utf16_expected;
212 gunichar2 *utf16_from_utf8;
213 gunichar2 *utf16_from_ucs4;
214 gunichar *ucs4_result;
215 gsize bytes_written;
216 gint n_chars;
217 gchar *utf8_result;
218
219#if G_BYTE_ORDER == G_LITTLE_ENDIAN
220#define TARGET "UTF-16LE"
221#else
222#define TARGET "UTF-16"
223#endif
224
225 if (!(utf16_expected_tmp = (gunichar2 *)g_convert (str: utf8, len: -1, TARGET, from_codeset: "UTF-8",
226 NULL, bytes_written: &bytes_written, NULL)))
227 {
228 fail (format: "line %d: could not convert to UTF-16 via g_convert\n", line);
229 return;
230 }
231
232 /* zero-terminate and remove BOM
233 */
234 n_chars = bytes_written / 2;
235 if (utf16_expected_tmp[0] == 0xfeff) /* BOM */
236 {
237 n_chars--;
238 utf16_expected = g_new (gunichar2, n_chars + 1);
239 memcpy (dest: utf16_expected, src: utf16_expected_tmp + 1, n: sizeof(gunichar2) * n_chars);
240 }
241 else if (utf16_expected_tmp[0] == 0xfffe) /* ANTI-BOM */
242 {
243 fail (format: "line %d: conversion via iconv to \"UTF-16\" is not native-endian\n", line);
244 return;
245 }
246 else
247 {
248 utf16_expected = g_new (gunichar2, n_chars + 1);
249 memcpy (dest: utf16_expected, src: utf16_expected_tmp, n: sizeof(gunichar2) * n_chars);
250 }
251
252 utf16_expected[n_chars] = '\0';
253
254 if (!(utf16_from_utf8 = g_utf8_to_utf16 (str: utf8, len: -1, items_read: &items_read, items_written: &items_written, error: &error)))
255 {
256 fail (format: "line %d: conversion to ucs16 failed: %s\n", line, error->message);
257 return;
258 }
259
260 if (items_read != strlen (s: utf8) ||
261 utf16_count (a: utf16_from_utf8) != items_written)
262 {
263 fail (format: "line %d: length error in conversion to ucs16\n", line);
264 return;
265 }
266
267 if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (str: ucs4, len: -1, items_read: &items_read, items_written: &items_written, error: &error)))
268 {
269 fail (format: "line %d: conversion to ucs16 failed: %s\n", line, error->message);
270 return;
271 }
272
273 if (items_read != ucs4_len ||
274 utf16_count (a: utf16_from_ucs4) != items_written)
275 {
276 fail (format: "line %d: length error in conversion to ucs16\n", line);
277 return;
278 }
279
280 if (!utf16_equal (a: utf16_from_utf8, b: utf16_expected) ||
281 !utf16_equal (a: utf16_from_ucs4, b: utf16_expected))
282 {
283 fail (format: "line %d: results of conversion to ucs16 do not match\n", line);
284 return;
285 }
286
287 if (!(utf8_result = g_utf16_to_utf8 (str: utf16_from_utf8, len: -1, items_read: &items_read, items_written: &items_written, error: &error)))
288 {
289 fail (format: "line %d: conversion back to utf8 failed: %s\n", line, error->message);
290 return;
291 }
292
293 if (items_read != utf16_count (a: utf16_from_utf8) ||
294 items_written != strlen (s: utf8))
295 {
296 fail (format: "line %d: length error in conversion from ucs16 to utf8\n", line);
297 return;
298 }
299
300 if (!(ucs4_result = g_utf16_to_ucs4 (str: utf16_from_ucs4, len: -1, items_read: &items_read, items_written: &items_written, error: &error)))
301 {
302 fail (format: "line %d: conversion back to utf8/ucs4 failed\n", line);
303 return;
304 }
305
306 if (items_read != utf16_count (a: utf16_from_utf8) ||
307 items_written != ucs4_len)
308 {
309 fail (format: "line %d: length error in conversion from ucs16 to ucs4\n", line);
310 return;
311 }
312
313 if (strcmp (s1: utf8, s2: utf8_result) != 0 ||
314 !ucs4_equal (a: ucs4, b: ucs4_result))
315 {
316 fail (format: "line %d: conversion back to utf8/ucs4 did not match original\n", line);
317 return;
318 }
319
320 g_free (mem: utf16_expected_tmp);
321 g_free (mem: utf16_expected);
322 g_free (mem: utf16_from_utf8);
323 g_free (mem: utf16_from_ucs4);
324 g_free (mem: utf8_result);
325 g_free (mem: ucs4_result);
326 }
327}
328
329int
330main (int argc, char **argv)
331{
332 gchar *testfile;
333 gchar *contents;
334 GError *error = NULL;
335 gchar *p, *end;
336 char *tmp;
337 gint state = 0;
338 gint line = 1;
339 gint start_line = 0; /* Quiet GCC */
340 gchar *utf8 = NULL; /* Quiet GCC */
341 GArray *ucs4;
342 Status status = VALID; /* Quiet GCC */
343
344 g_test_init (argc: &argc, argv: &argv, NULL);
345
346 testfile = g_test_build_filename (file_type: G_TEST_DIST, first_path: "utf8.txt", NULL);
347
348 g_file_get_contents (filename: testfile, contents: &contents, NULL, error: &error);
349 if (error)
350 croak (format: "Cannot open utf8.txt: %s", error->message);
351
352 ucs4 = g_array_new (TRUE, FALSE, element_size: sizeof(gunichar));
353
354 p = contents;
355
356 /* Loop over lines */
357 while (*p)
358 {
359 while (*p && (*p == ' ' || *p == '\t'))
360 p++;
361
362 end = p;
363 while (*end && (*end != '\r' && *end != '\n'))
364 end++;
365
366 if (!*p || *p == '#' || *p == '\r' || *p == '\n')
367 goto next_line;
368
369 tmp = g_strstrip (g_strndup (p, end - p));
370
371 switch (state)
372 {
373 case 0:
374 /* UTF-8 string */
375 start_line = line;
376 utf8 = tmp;
377 tmp = NULL;
378 break;
379
380 case 1:
381 /* Status */
382 if (!strcmp (s1: tmp, s2: "VALID"))
383 status = VALID;
384 else if (!strcmp (s1: tmp, s2: "INCOMPLETE"))
385 status = INCOMPLETE;
386 else if (!strcmp (s1: tmp, s2: "NOTUNICODE"))
387 status = NOTUNICODE;
388 else if (!strcmp (s1: tmp, s2: "OVERLONG"))
389 status = OVERLONG;
390 else if (!strcmp (s1: tmp, s2: "MALFORMED"))
391 status = MALFORMED;
392 else
393 croak (format: "Invalid status on line %d\n", line);
394
395 if (status != VALID && status != NOTUNICODE)
396 state++; /* No UCS-4 data */
397
398 break;
399
400 case 2:
401 /* UCS-4 version */
402
403 p = strtok (s: tmp, delim: " \t");
404 while (p)
405 {
406 gchar *endptr;
407
408 gunichar ch = strtoul (nptr: p, endptr: &endptr, base: 16);
409 if (*endptr != '\0')
410 croak (format: "Invalid UCS-4 character on line %d\n", line);
411
412 g_array_append_val (ucs4, ch);
413
414 p = strtok (NULL, delim: " \t");
415 }
416
417 break;
418 }
419
420 g_free (mem: tmp);
421 state = (state + 1) % 3;
422
423 if (state == 0)
424 {
425 process (line: start_line, utf8, status, ucs4: (gunichar *)ucs4->data, ucs4_len: ucs4->len);
426 g_array_set_size (array: ucs4, length: 0);
427 g_free (mem: utf8);
428 }
429
430 next_line:
431 p = end;
432 if (*p && *p == '\r')
433 p++;
434 if (*p && *p == '\n')
435 p++;
436
437 line++;
438 }
439
440 g_free (mem: testfile);
441 g_array_free (array: ucs4, TRUE);
442 g_free (mem: contents);
443 return exit_status;
444}
445

/* Source: gtk/subprojects/glib/tests/unicode-encoding.c */