1 | #undef G_DISABLE_ASSERT |
2 | #undef G_LOG_DOMAIN |
3 | |
4 | #include <stdarg.h> |
5 | #include <stdio.h> |
6 | #include <stdlib.h> |
7 | #include <string.h> |
8 | #include <glib.h> |
9 | |
10 | static gint exit_status = 0; |
11 | |
12 | G_GNUC_PRINTF (1, 2) |
13 | static void |
14 | croak (char *format, ...) |
15 | { |
16 | va_list va; |
17 | |
18 | va_start (va, format); |
19 | vfprintf (stderr, format: format, arg: va); |
20 | va_end (va); |
21 | |
22 | exit (status: 1); |
23 | } |
24 | |
25 | G_GNUC_PRINTF (1, 2) |
26 | static void |
27 | fail (char *format, ...) |
28 | { |
29 | va_list va; |
30 | |
31 | va_start (va, format); |
32 | vfprintf (stderr, format: format, arg: va); |
33 | va_end (va); |
34 | |
35 | exit_status |= 1; |
36 | } |
37 | |
/*
 * Expected classification of a test string, as parsed by main() from
 * the status line of each record.
 */
typedef enum
{
  VALID      = 0, /* must validate; all conversions are round-tripped */
  INCOMPLETE = 1, /* must not validate; conversion must report partial input */
  NOTUNICODE = 2, /* must not validate; UTF-8 -> UCS-4 is still compared */
  OVERLONG   = 3, /* must not validate */
  MALFORMED  = 4  /* must not validate */
} Status;
46 | |
47 | static gboolean |
48 | ucs4_equal (gunichar *a, gunichar *b) |
49 | { |
50 | while (*a && *b && (*a == *b)) |
51 | { |
52 | a++; |
53 | b++; |
54 | } |
55 | |
56 | return (*a == *b); |
57 | } |
58 | |
59 | static gboolean |
60 | utf16_equal (gunichar2 *a, gunichar2 *b) |
61 | { |
62 | while (*a && *b && (*a == *b)) |
63 | { |
64 | a++; |
65 | b++; |
66 | } |
67 | |
68 | return (*a == *b); |
69 | } |
70 | |
71 | static gint |
72 | utf16_count (gunichar2 *a) |
73 | { |
74 | gint result = 0; |
75 | |
76 | while (a[result]) |
77 | result++; |
78 | |
79 | return result; |
80 | } |
81 | |
82 | static void |
83 | print_ucs4 (const gchar *prefix, gunichar *ucs4, gint ucs4_len) |
84 | { |
85 | gint i; |
86 | g_print (format: "%s " , prefix); |
87 | for (i = 0; i < ucs4_len; i++) |
88 | g_print (format: "%x " , ucs4[i]); |
89 | g_print (format: "\n" ); |
90 | } |
91 | |
92 | static void |
93 | process (gint line, |
94 | gchar *utf8, |
95 | Status status, |
96 | gunichar *ucs4, |
97 | gint ucs4_len) |
98 | { |
99 | const gchar *end; |
100 | gboolean is_valid = g_utf8_validate (str: utf8, max_len: -1, end: &end); |
101 | GError *error = NULL; |
102 | glong items_read, items_written; |
103 | |
104 | switch (status) |
105 | { |
106 | case VALID: |
107 | if (!is_valid) |
108 | { |
109 | fail (format: "line %d: valid but g_utf8_validate returned FALSE\n" , line); |
110 | return; |
111 | } |
112 | break; |
113 | case NOTUNICODE: |
114 | case INCOMPLETE: |
115 | case OVERLONG: |
116 | case MALFORMED: |
117 | if (is_valid) |
118 | { |
119 | fail (format: "line %d: invalid but g_utf8_validate returned TRUE\n" , line); |
120 | return; |
121 | } |
122 | break; |
123 | } |
124 | |
125 | if (status == INCOMPLETE) |
126 | { |
127 | gunichar *ucs4_result; |
128 | |
129 | ucs4_result = g_utf8_to_ucs4 (str: utf8, len: -1, NULL, NULL, error: &error); |
130 | |
131 | if (!error || !g_error_matches (error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_PARTIAL_INPUT)) |
132 | { |
133 | fail (format: "line %d: incomplete input not properly detected\n" , line); |
134 | return; |
135 | } |
136 | g_clear_error (err: &error); |
137 | |
138 | ucs4_result = g_utf8_to_ucs4 (str: utf8, len: -1, items_read: &items_read, NULL, error: &error); |
139 | |
140 | if (!ucs4_result || items_read == strlen (s: utf8)) |
141 | { |
142 | fail (format: "line %d: incomplete input not properly detected\n" , line); |
143 | return; |
144 | } |
145 | |
146 | g_free (mem: ucs4_result); |
147 | } |
148 | |
149 | if (status == VALID || status == NOTUNICODE) |
150 | { |
151 | gunichar *ucs4_result; |
152 | |
153 | ucs4_result = g_utf8_to_ucs4 (str: utf8, len: -1, items_read: &items_read, items_written: &items_written, error: &error); |
154 | if (!ucs4_result) |
155 | { |
156 | fail (format: "line %d: conversion with status %d to ucs4 failed: %s\n" , line, status, error->message); |
157 | return; |
158 | } |
159 | |
160 | if (!ucs4_equal (a: ucs4_result, b: ucs4) || |
161 | items_read != strlen (s: utf8) || |
162 | items_written != ucs4_len) |
163 | { |
164 | fail (format: "line %d: results of conversion with status %d to ucs4 do not match expected.\n" , line, status); |
165 | print_ucs4 (prefix: "expected: " , ucs4, ucs4_len); |
166 | print_ucs4 (prefix: "received: " , ucs4: ucs4_result, ucs4_len: items_written); |
167 | return; |
168 | } |
169 | |
170 | g_free (mem: ucs4_result); |
171 | } |
172 | |
173 | if (status == VALID) |
174 | { |
175 | gunichar *ucs4_result; |
176 | gchar *utf8_result; |
177 | |
178 | ucs4_result = g_utf8_to_ucs4_fast (str: utf8, len: -1, items_written: &items_written); |
179 | |
180 | if (!ucs4_equal (a: ucs4_result, b: ucs4) || |
181 | items_written != ucs4_len) |
182 | { |
183 | fail (format: "line %d: results of fast conversion with status %d to ucs4 do not match expected.\n" , line, status); |
184 | print_ucs4 (prefix: "expected: " , ucs4, ucs4_len); |
185 | print_ucs4 (prefix: "received: " , ucs4: ucs4_result, ucs4_len: items_written); |
186 | return; |
187 | } |
188 | |
189 | utf8_result = g_ucs4_to_utf8 (str: ucs4_result, len: -1, items_read: &items_read, items_written: &items_written, error: &error); |
190 | if (!utf8_result) |
191 | { |
192 | fail (format: "line %d: conversion back to utf8 failed: %s" , line, error->message); |
193 | return; |
194 | } |
195 | |
196 | if (strcmp (s1: utf8_result, s2: utf8) != 0 || |
197 | items_read != ucs4_len || |
198 | items_written != strlen (s: utf8)) |
199 | { |
200 | fail (format: "line %d: conversion back to utf8 did not match original\n" , line); |
201 | return; |
202 | } |
203 | |
204 | g_free (mem: utf8_result); |
205 | g_free (mem: ucs4_result); |
206 | } |
207 | |
208 | if (status == VALID) |
209 | { |
210 | gunichar2 *utf16_expected_tmp; |
211 | gunichar2 *utf16_expected; |
212 | gunichar2 *utf16_from_utf8; |
213 | gunichar2 *utf16_from_ucs4; |
214 | gunichar *ucs4_result; |
215 | gsize bytes_written; |
216 | gint n_chars; |
217 | gchar *utf8_result; |
218 | |
219 | #if G_BYTE_ORDER == G_LITTLE_ENDIAN |
220 | #define TARGET "UTF-16LE" |
221 | #else |
222 | #define TARGET "UTF-16" |
223 | #endif |
224 | |
225 | if (!(utf16_expected_tmp = (gunichar2 *)g_convert (str: utf8, len: -1, TARGET, from_codeset: "UTF-8" , |
226 | NULL, bytes_written: &bytes_written, NULL))) |
227 | { |
228 | fail (format: "line %d: could not convert to UTF-16 via g_convert\n" , line); |
229 | return; |
230 | } |
231 | |
232 | /* zero-terminate and remove BOM |
233 | */ |
234 | n_chars = bytes_written / 2; |
235 | if (utf16_expected_tmp[0] == 0xfeff) /* BOM */ |
236 | { |
237 | n_chars--; |
238 | utf16_expected = g_new (gunichar2, n_chars + 1); |
239 | memcpy (dest: utf16_expected, src: utf16_expected_tmp + 1, n: sizeof(gunichar2) * n_chars); |
240 | } |
241 | else if (utf16_expected_tmp[0] == 0xfffe) /* ANTI-BOM */ |
242 | { |
243 | fail (format: "line %d: conversion via iconv to \"UTF-16\" is not native-endian\n" , line); |
244 | return; |
245 | } |
246 | else |
247 | { |
248 | utf16_expected = g_new (gunichar2, n_chars + 1); |
249 | memcpy (dest: utf16_expected, src: utf16_expected_tmp, n: sizeof(gunichar2) * n_chars); |
250 | } |
251 | |
252 | utf16_expected[n_chars] = '\0'; |
253 | |
254 | if (!(utf16_from_utf8 = g_utf8_to_utf16 (str: utf8, len: -1, items_read: &items_read, items_written: &items_written, error: &error))) |
255 | { |
256 | fail (format: "line %d: conversion to ucs16 failed: %s\n" , line, error->message); |
257 | return; |
258 | } |
259 | |
260 | if (items_read != strlen (s: utf8) || |
261 | utf16_count (a: utf16_from_utf8) != items_written) |
262 | { |
263 | fail (format: "line %d: length error in conversion to ucs16\n" , line); |
264 | return; |
265 | } |
266 | |
267 | if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (str: ucs4, len: -1, items_read: &items_read, items_written: &items_written, error: &error))) |
268 | { |
269 | fail (format: "line %d: conversion to ucs16 failed: %s\n" , line, error->message); |
270 | return; |
271 | } |
272 | |
273 | if (items_read != ucs4_len || |
274 | utf16_count (a: utf16_from_ucs4) != items_written) |
275 | { |
276 | fail (format: "line %d: length error in conversion to ucs16\n" , line); |
277 | return; |
278 | } |
279 | |
280 | if (!utf16_equal (a: utf16_from_utf8, b: utf16_expected) || |
281 | !utf16_equal (a: utf16_from_ucs4, b: utf16_expected)) |
282 | { |
283 | fail (format: "line %d: results of conversion to ucs16 do not match\n" , line); |
284 | return; |
285 | } |
286 | |
287 | if (!(utf8_result = g_utf16_to_utf8 (str: utf16_from_utf8, len: -1, items_read: &items_read, items_written: &items_written, error: &error))) |
288 | { |
289 | fail (format: "line %d: conversion back to utf8 failed: %s\n" , line, error->message); |
290 | return; |
291 | } |
292 | |
293 | if (items_read != utf16_count (a: utf16_from_utf8) || |
294 | items_written != strlen (s: utf8)) |
295 | { |
296 | fail (format: "line %d: length error in conversion from ucs16 to utf8\n" , line); |
297 | return; |
298 | } |
299 | |
300 | if (!(ucs4_result = g_utf16_to_ucs4 (str: utf16_from_ucs4, len: -1, items_read: &items_read, items_written: &items_written, error: &error))) |
301 | { |
302 | fail (format: "line %d: conversion back to utf8/ucs4 failed\n" , line); |
303 | return; |
304 | } |
305 | |
306 | if (items_read != utf16_count (a: utf16_from_utf8) || |
307 | items_written != ucs4_len) |
308 | { |
309 | fail (format: "line %d: length error in conversion from ucs16 to ucs4\n" , line); |
310 | return; |
311 | } |
312 | |
313 | if (strcmp (s1: utf8, s2: utf8_result) != 0 || |
314 | !ucs4_equal (a: ucs4, b: ucs4_result)) |
315 | { |
316 | fail (format: "line %d: conversion back to utf8/ucs4 did not match original\n" , line); |
317 | return; |
318 | } |
319 | |
320 | g_free (mem: utf16_expected_tmp); |
321 | g_free (mem: utf16_expected); |
322 | g_free (mem: utf16_from_utf8); |
323 | g_free (mem: utf16_from_ucs4); |
324 | g_free (mem: utf8_result); |
325 | g_free (mem: ucs4_result); |
326 | } |
327 | } |
328 | |
329 | int |
330 | main (int argc, char **argv) |
331 | { |
332 | gchar *testfile; |
333 | gchar *contents; |
334 | GError *error = NULL; |
335 | gchar *p, *end; |
336 | char *tmp; |
337 | gint state = 0; |
338 | gint line = 1; |
339 | gint start_line = 0; /* Quiet GCC */ |
340 | gchar *utf8 = NULL; /* Quiet GCC */ |
341 | GArray *ucs4; |
342 | Status status = VALID; /* Quiet GCC */ |
343 | |
344 | g_test_init (argc: &argc, argv: &argv, NULL); |
345 | |
346 | testfile = g_test_build_filename (file_type: G_TEST_DIST, first_path: "utf8.txt" , NULL); |
347 | |
348 | g_file_get_contents (filename: testfile, contents: &contents, NULL, error: &error); |
349 | if (error) |
350 | croak (format: "Cannot open utf8.txt: %s" , error->message); |
351 | |
352 | ucs4 = g_array_new (TRUE, FALSE, element_size: sizeof(gunichar)); |
353 | |
354 | p = contents; |
355 | |
356 | /* Loop over lines */ |
357 | while (*p) |
358 | { |
359 | while (*p && (*p == ' ' || *p == '\t')) |
360 | p++; |
361 | |
362 | end = p; |
363 | while (*end && (*end != '\r' && *end != '\n')) |
364 | end++; |
365 | |
366 | if (!*p || *p == '#' || *p == '\r' || *p == '\n') |
367 | goto next_line; |
368 | |
369 | tmp = g_strstrip (g_strndup (p, end - p)); |
370 | |
371 | switch (state) |
372 | { |
373 | case 0: |
374 | /* UTF-8 string */ |
375 | start_line = line; |
376 | utf8 = tmp; |
377 | tmp = NULL; |
378 | break; |
379 | |
380 | case 1: |
381 | /* Status */ |
382 | if (!strcmp (s1: tmp, s2: "VALID" )) |
383 | status = VALID; |
384 | else if (!strcmp (s1: tmp, s2: "INCOMPLETE" )) |
385 | status = INCOMPLETE; |
386 | else if (!strcmp (s1: tmp, s2: "NOTUNICODE" )) |
387 | status = NOTUNICODE; |
388 | else if (!strcmp (s1: tmp, s2: "OVERLONG" )) |
389 | status = OVERLONG; |
390 | else if (!strcmp (s1: tmp, s2: "MALFORMED" )) |
391 | status = MALFORMED; |
392 | else |
393 | croak (format: "Invalid status on line %d\n" , line); |
394 | |
395 | if (status != VALID && status != NOTUNICODE) |
396 | state++; /* No UCS-4 data */ |
397 | |
398 | break; |
399 | |
400 | case 2: |
401 | /* UCS-4 version */ |
402 | |
403 | p = strtok (s: tmp, delim: " \t" ); |
404 | while (p) |
405 | { |
406 | gchar *endptr; |
407 | |
408 | gunichar ch = strtoul (nptr: p, endptr: &endptr, base: 16); |
409 | if (*endptr != '\0') |
410 | croak (format: "Invalid UCS-4 character on line %d\n" , line); |
411 | |
412 | g_array_append_val (ucs4, ch); |
413 | |
414 | p = strtok (NULL, delim: " \t" ); |
415 | } |
416 | |
417 | break; |
418 | } |
419 | |
420 | g_free (mem: tmp); |
421 | state = (state + 1) % 3; |
422 | |
423 | if (state == 0) |
424 | { |
425 | process (line: start_line, utf8, status, ucs4: (gunichar *)ucs4->data, ucs4_len: ucs4->len); |
426 | g_array_set_size (array: ucs4, length: 0); |
427 | g_free (mem: utf8); |
428 | } |
429 | |
430 | next_line: |
431 | p = end; |
432 | if (*p && *p == '\r') |
433 | p++; |
434 | if (*p && *p == '\n') |
435 | p++; |
436 | |
437 | line++; |
438 | } |
439 | |
440 | g_free (mem: testfile); |
441 | g_array_free (array: ucs4, TRUE); |
442 | g_free (mem: contents); |
443 | return exit_status; |
444 | } |
445 | |