1 | /* GLIB - Library of useful routines for C programming |
2 | * Copyright (C) 2001 Matthias Clasen <matthiasc@poet.de> |
3 | * |
4 | * This library is free software; you can redistribute it and/or |
5 | * modify it under the terms of the GNU Lesser General Public |
6 | * License as published by the Free Software Foundation; either |
7 | * version 2.1 of the License, or (at your option) any later version. |
8 | * |
9 | * This library is distributed in the hope that it will be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | * Lesser General Public License for more details. |
13 | * |
14 | * You should have received a copy of the GNU Lesser General Public |
15 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
16 | */ |
17 | |
18 | #include "glib.h" |
19 | #include <string.h> |
20 | |
/*
 * UNICODE_VALID:
 * @Char: an integer code point value
 *
 * Evaluates to non-zero unless @Char falls into one of these rejected groups:
 *   - values of 0x110000 and above,
 *   - the surrogate block 0xD800..0xDFFF,
 *   - the range 0xFDD0..0xFDEF,
 *   - any value whose low 16 bits are 0xFFFE or 0xFFFF.
 *
 * Note that this is stricter than what the test table in this file expects
 * of g_utf8_validate(): the table lists "\xef\xbf\xbf" (U+FFFF) as valid.
 *
 * The argument is expanded several times, so it must not have side effects.
 */
#define UNICODE_VALID(Char)                         \
    (!((Char) >= 0x110000 ||                        \
       (((Char) & 0xFFFFF800) == 0xD800) ||         \
       ((Char) >= 0xFDD0 && (Char) <= 0xFDEF) ||    \
       (((Char) & 0xFFFE) == 0xFFFE)))
26 | |
27 | |
/* One row of the validation test table; consumed by do_test(). */
typedef struct {
  const gchar *text;  /* bytes handed to the validator; NULL marks the end of the table */
  gint max_len;       /* max_len argument for the call; negative rows are also re-run with strlen (text) */
  gint offset;        /* expected value of (end - text) after the call */
  gboolean valid;     /* expected return value of the validator */
} Test;
34 | |
/*
 * Validation test vectors.
 *
 * Each row is { text, max_len, offset, valid }:
 *   text    - the bytes to validate
 *   max_len - the length limit passed to the validator (-1: none given)
 *   offset  - expected distance from text to the reported end pointer
 *   valid   - expected boolean result
 *
 * The table is terminated by a row whose text is NULL; main() stops
 * registering tests when it reaches that row.
 */
Test test[] = {
  /* some tests to check max_len handling */
  /* 1-byte sequences */
  { "abcde", -1, 5, TRUE },
  { "abcde", 3, 3, TRUE },
  { "abcde", 5, 5, TRUE },
  { "abcde", 7, 5, FALSE },
  /* 2-byte sequences */
  { "\xc2\xa9\xc2\xa9\xc2\xa9", -1, 6, TRUE },
  { "\xc2\xa9\xc2\xa9\xc2\xa9", 1, 0, FALSE },
  { "\xc2\xa9\xc2\xa9\xc2\xa9", 2, 2, TRUE },
  { "\xc2\xa9\xc2\xa9\xc2\xa9", 3, 2, FALSE },
  { "\xc2\xa9\xc2\xa9\xc2\xa9", 4, 4, TRUE },
  { "\xc2\xa9\xc2\xa9\xc2\xa9", 5, 4, FALSE },
  { "\xc2\xa9\xc2\xa9\xc2\xa9", 6, 6, TRUE },
  { "\xc2\xa9\xc2\xa9\xc2\xa9", 7, 6, FALSE },
  /* 3-byte sequences */
  { "\xe2\x89\xa0\xe2\x89\xa0", -1, 6, TRUE },
  { "\xe2\x89\xa0\xe2\x89\xa0", 1, 0, FALSE },
  { "\xe2\x89\xa0\xe2\x89\xa0", 2, 0, FALSE },
  { "\xe2\x89\xa0\xe2\x89\xa0", 3, 3, TRUE },
  { "\xe2\x89\xa0\xe2\x89\xa0", 4, 3, FALSE },
  { "\xe2\x89\xa0\xe2\x89\xa0", 5, 3, FALSE },
  { "\xe2\x89\xa0\xe2\x89\xa0", 6, 6, TRUE },
  { "\xe2\x89\xa0\xe2\x89\xa0", 7, 6, FALSE },

  /* examples from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */
  /* greek 'kosme' */
  { "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", -1, 11, TRUE },
  /* first sequence of each length */
  /* (the text pointer of the next row is non-NULL, so it does not end the table) */
  { "\x00", -1, 0, TRUE },
  { "\xc2\x80", -1, 2, TRUE },
  { "\xe0\xa0\x80", -1, 3, TRUE },
  { "\xf0\x90\x80\x80", -1, 4, TRUE },
  { "\xf8\x88\x80\x80\x80", -1, 0, FALSE },
  { "\xfc\x84\x80\x80\x80\x80", -1, 0, FALSE },
  /* last sequence of each length */
  { "\x7f", -1, 1, TRUE },
  { "\xdf\xbf", -1, 2, TRUE },
  { "\xef\xbf\xbf", -1, 3, TRUE },
  { "\xf7\xbf\xbf\xbf", -1, 0, FALSE },
  { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE },
  { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE },
  /* other boundary conditions */
  { "\xed\x9f\xbf", -1, 3, TRUE },
  { "\xee\x80\x80", -1, 3, TRUE },
  { "\xef\xbf\xbd", -1, 3, TRUE },
  { "\xf4\x8f\xbf\xbf", -1, 4, TRUE },
  { "\xf4\x90\x80\x80", -1, 0, FALSE },
  /* malformed sequences */
  /* unexpected continuation bytes */
  { "\x80", -1, 0, FALSE },
  { "\xbf", -1, 0, FALSE },
  { "\xbf\x80", -1, 0, FALSE },
  { "\x80\xbf", -1, 0, FALSE },
  { "\x80\xbf\x80", -1, 0, FALSE },
  { "\x80\xbf\x80\xbf", -1, 0, FALSE },
  { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
  { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE },
  { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE },

  /* all possible continuation bytes (0x80..0xbf), each on its own */
  { "\x80", -1, 0, FALSE },
  { "\x81", -1, 0, FALSE },
  { "\x82", -1, 0, FALSE },
  { "\x83", -1, 0, FALSE },
  { "\x84", -1, 0, FALSE },
  { "\x85", -1, 0, FALSE },
  { "\x86", -1, 0, FALSE },
  { "\x87", -1, 0, FALSE },
  { "\x88", -1, 0, FALSE },
  { "\x89", -1, 0, FALSE },
  { "\x8a", -1, 0, FALSE },
  { "\x8b", -1, 0, FALSE },
  { "\x8c", -1, 0, FALSE },
  { "\x8d", -1, 0, FALSE },
  { "\x8e", -1, 0, FALSE },
  { "\x8f", -1, 0, FALSE },
  { "\x90", -1, 0, FALSE },
  { "\x91", -1, 0, FALSE },
  { "\x92", -1, 0, FALSE },
  { "\x93", -1, 0, FALSE },
  { "\x94", -1, 0, FALSE },
  { "\x95", -1, 0, FALSE },
  { "\x96", -1, 0, FALSE },
  { "\x97", -1, 0, FALSE },
  { "\x98", -1, 0, FALSE },
  { "\x99", -1, 0, FALSE },
  { "\x9a", -1, 0, FALSE },
  { "\x9b", -1, 0, FALSE },
  { "\x9c", -1, 0, FALSE },
  { "\x9d", -1, 0, FALSE },
  { "\x9e", -1, 0, FALSE },
  { "\x9f", -1, 0, FALSE },
  { "\xa0", -1, 0, FALSE },
  { "\xa1", -1, 0, FALSE },
  { "\xa2", -1, 0, FALSE },
  { "\xa3", -1, 0, FALSE },
  { "\xa4", -1, 0, FALSE },
  { "\xa5", -1, 0, FALSE },
  { "\xa6", -1, 0, FALSE },
  { "\xa7", -1, 0, FALSE },
  { "\xa8", -1, 0, FALSE },
  { "\xa9", -1, 0, FALSE },
  { "\xaa", -1, 0, FALSE },
  { "\xab", -1, 0, FALSE },
  { "\xac", -1, 0, FALSE },
  { "\xad", -1, 0, FALSE },
  { "\xae", -1, 0, FALSE },
  { "\xaf", -1, 0, FALSE },
  { "\xb0", -1, 0, FALSE },
  { "\xb1", -1, 0, FALSE },
  { "\xb2", -1, 0, FALSE },
  { "\xb3", -1, 0, FALSE },
  { "\xb4", -1, 0, FALSE },
  { "\xb5", -1, 0, FALSE },
  { "\xb6", -1, 0, FALSE },
  { "\xb7", -1, 0, FALSE },
  { "\xb8", -1, 0, FALSE },
  { "\xb9", -1, 0, FALSE },
  { "\xba", -1, 0, FALSE },
  { "\xbb", -1, 0, FALSE },
  { "\xbc", -1, 0, FALSE },
  { "\xbd", -1, 0, FALSE },
  { "\xbe", -1, 0, FALSE },
  { "\xbf", -1, 0, FALSE },
  /* lone start characters (0xc0..0xfd), each followed by a space */
  { "\xc0\x20", -1, 0, FALSE },
  { "\xc1\x20", -1, 0, FALSE },
  { "\xc2\x20", -1, 0, FALSE },
  { "\xc3\x20", -1, 0, FALSE },
  { "\xc4\x20", -1, 0, FALSE },
  { "\xc5\x20", -1, 0, FALSE },
  { "\xc6\x20", -1, 0, FALSE },
  { "\xc7\x20", -1, 0, FALSE },
  { "\xc8\x20", -1, 0, FALSE },
  { "\xc9\x20", -1, 0, FALSE },
  { "\xca\x20", -1, 0, FALSE },
  { "\xcb\x20", -1, 0, FALSE },
  { "\xcc\x20", -1, 0, FALSE },
  { "\xcd\x20", -1, 0, FALSE },
  { "\xce\x20", -1, 0, FALSE },
  { "\xcf\x20", -1, 0, FALSE },
  { "\xd0\x20", -1, 0, FALSE },
  { "\xd1\x20", -1, 0, FALSE },
  { "\xd2\x20", -1, 0, FALSE },
  { "\xd3\x20", -1, 0, FALSE },
  { "\xd4\x20", -1, 0, FALSE },
  { "\xd5\x20", -1, 0, FALSE },
  { "\xd6\x20", -1, 0, FALSE },
  { "\xd7\x20", -1, 0, FALSE },
  { "\xd8\x20", -1, 0, FALSE },
  { "\xd9\x20", -1, 0, FALSE },
  { "\xda\x20", -1, 0, FALSE },
  { "\xdb\x20", -1, 0, FALSE },
  { "\xdc\x20", -1, 0, FALSE },
  { "\xdd\x20", -1, 0, FALSE },
  { "\xde\x20", -1, 0, FALSE },
  { "\xdf\x20", -1, 0, FALSE },
  { "\xe0\x20", -1, 0, FALSE },
  { "\xe1\x20", -1, 0, FALSE },
  { "\xe2\x20", -1, 0, FALSE },
  { "\xe3\x20", -1, 0, FALSE },
  { "\xe4\x20", -1, 0, FALSE },
  { "\xe5\x20", -1, 0, FALSE },
  { "\xe6\x20", -1, 0, FALSE },
  { "\xe7\x20", -1, 0, FALSE },
  { "\xe8\x20", -1, 0, FALSE },
  { "\xe9\x20", -1, 0, FALSE },
  { "\xea\x20", -1, 0, FALSE },
  { "\xeb\x20", -1, 0, FALSE },
  { "\xec\x20", -1, 0, FALSE },
  { "\xed\x20", -1, 0, FALSE },
  { "\xee\x20", -1, 0, FALSE },
  { "\xef\x20", -1, 0, FALSE },
  { "\xf0\x20", -1, 0, FALSE },
  { "\xf1\x20", -1, 0, FALSE },
  { "\xf2\x20", -1, 0, FALSE },
  { "\xf3\x20", -1, 0, FALSE },
  { "\xf4\x20", -1, 0, FALSE },
  { "\xf5\x20", -1, 0, FALSE },
  { "\xf6\x20", -1, 0, FALSE },
  { "\xf7\x20", -1, 0, FALSE },
  { "\xf8\x20", -1, 0, FALSE },
  { "\xf9\x20", -1, 0, FALSE },
  { "\xfa\x20", -1, 0, FALSE },
  { "\xfb\x20", -1, 0, FALSE },
  { "\xfc\x20", -1, 0, FALSE },
  { "\xfd\x20", -1, 0, FALSE },
  /* missing continuation bytes (a leading space is expected to be accepted first) */
  { "\x20\xc0", -1, 1, FALSE },
  { "\x20\xe0\x80", -1, 1, FALSE },
  { "\x20\xf0\x80\x80", -1, 1, FALSE },
  { "\x20\xf8\x80\x80\x80", -1, 1, FALSE },
  { "\x20\xfc\x80\x80\x80\x80", -1, 1, FALSE },
  { "\x20\xdf", -1, 1, FALSE },
  { "\x20\xef\xbf", -1, 1, FALSE },
  { "\x20\xf7\xbf\xbf", -1, 1, FALSE },
  { "\x20\xfb\xbf\xbf\xbf", -1, 1, FALSE },
  { "\x20\xfd\xbf\xbf\xbf\xbf", -1, 1, FALSE },
  /* impossible bytes */
  { "\x20\xfe\x20", -1, 1, FALSE },
  { "\x20\xff\x20", -1, 1, FALSE },
  /* overlong sequences */
  { "\x20\xc0\xaf\x20", -1, 1, FALSE },
  { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE },
  { "\x20\xf0\x80\x80\xaf\x20", -1, 1, FALSE },
  { "\x20\xf8\x80\x80\x80\xaf\x20", -1, 1, FALSE },
  { "\x20\xfc\x80\x80\x80\x80\xaf\x20", -1, 1, FALSE },
  { "\x20\xc1\xbf\x20", -1, 1, FALSE },
  { "\x20\xe0\x9f\xbf\x20", -1, 1, FALSE },
  { "\x20\xf0\x8f\xbf\xbf\x20", -1, 1, FALSE },
  { "\x20\xf8\x87\xbf\xbf\xbf\x20", -1, 1, FALSE },
  { "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", -1, 1, FALSE },
  { "\x20\xc0\x80\x20", -1, 1, FALSE },
  { "\x20\xe0\x80\x80\x20", -1, 1, FALSE },
  { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE },
  { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE },
  { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE },
  /* illegal code positions */
  { "\x20\xed\xa0\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xad\xbf\x20", -1, 1, FALSE },
  { "\x20\xed\xae\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xaf\xbf\x20", -1, 1, FALSE },
  { "\x20\xed\xb0\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xbe\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xbf\xbf\x20", -1, 1, FALSE },
  { "\x20\xed\xa0\x80\xed\xb0\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xa0\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
  { "\x20\xed\xad\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xad\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
  { "\x20\xed\xae\x80\xed\xb0\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
  { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },

  /* end-of-table sentinel: text == NULL */
  { NULL, }
};
273 | |
274 | static void |
275 | do_test (gconstpointer d) |
276 | { |
277 | const Test *test = d; |
278 | const gchar *end; |
279 | gboolean result; |
280 | |
281 | result = g_utf8_validate (str: test->text, max_len: test->max_len, end: &end); |
282 | |
283 | g_assert_true (result == test->valid); |
284 | g_assert_cmpint (end - test->text, ==, test->offset); |
285 | |
286 | if (test->max_len < 0) |
287 | { |
288 | result = g_utf8_validate (str: test->text, max_len: strlen (s: test->text), end: &end); |
289 | |
290 | g_assert_true (result == test->valid); |
291 | g_assert_cmpint (end - test->text, ==, test->offset); |
292 | } |
293 | else |
294 | { |
295 | result = g_utf8_validate_len (str: test->text, max_len: test->max_len, end: &end); |
296 | |
297 | g_assert_true (result == test->valid); |
298 | g_assert_cmpint (end - test->text, ==, test->offset); |
299 | } |
300 | } |
301 | |
302 | /* Test the behaviour of g_utf8_get_char_validated() with various inputs and |
303 | * length restrictions. */ |
304 | static void |
305 | test_utf8_get_char_validated (void) |
306 | { |
307 | const struct { |
308 | const gchar *buf; |
309 | gssize max_len; |
310 | gunichar expected_result; |
311 | } test_vectors[] = { |
312 | /* Bug #780095: */ |
313 | { "\xC0\x00_45678" , 8, (gunichar) -2 }, |
314 | { "\xC0\x00_45678" , -1, (gunichar) -2 }, |
315 | /* It seems odd that the return value differs with the length input, but |
316 | * that’s how it’s documented: */ |
317 | { "" , 0, (gunichar) -2 }, |
318 | { "" , -1, (gunichar) 0 }, |
319 | { "\0" , 1, (gunichar) -2 }, |
320 | { "AB\0" , 3, 'A' }, |
321 | { "A\0B" , 3, 'A' }, |
322 | { "\0AB" , 3, (gunichar) -2 }, |
323 | { "\xD8\0" , 2, (gunichar) -2 }, |
324 | /* Normal inputs: */ |
325 | { "hello" , 5, (gunichar) 'h' }, |
326 | { "hello" , -1, (gunichar) 'h' }, |
327 | { "\xD8\x9F" , 2, 0x061F }, |
328 | { "\xD8\x9F" , -1, 0x061F }, |
329 | { "\xD8\x9Fmore" , 6, 0x061F }, |
330 | { "\xD8\x9Fmore" , -1, 0x061F }, |
331 | { "\xD8\x9F\0" , 3, 0x061F }, |
332 | { "\xE2\x96\xB3" , 3, 0x25B3 }, |
333 | { "\xE2\x96\xB3" , -1, 0x25B3 }, |
334 | { "\xE2\x96\xB3more" , 7, 0x25B3 }, |
335 | { "\xE2\x96\xB3more" , -1, 0x25B3 }, |
336 | { "\xF0\x9F\x92\xA9" , 4, 0x1F4A9 }, |
337 | { "\xF0\x9F\x92\xA9" , -1, 0x1F4A9 }, |
338 | { "\xF0\x9F\x92\xA9more" , 8, 0x1F4A9 }, |
339 | { "\xF0\x9F\x92\xA9more" , -1, 0x1F4A9 }, |
340 | /* Partial unichars: */ |
341 | { "\xD8" , -1, (gunichar) -2 }, |
342 | { "\xD8\x9F" , 1, (gunichar) -2 }, |
343 | { "\xCE" , -1, (gunichar) -2 }, |
344 | { "\xCE" , 1, (gunichar) -2 }, |
345 | }; |
346 | gsize i; |
347 | |
348 | for (i = 0; i < G_N_ELEMENTS (test_vectors); i++) |
349 | { |
350 | gunichar actual_result; |
351 | |
352 | g_test_message (format: "Vector %" G_GSIZE_FORMAT, i); |
353 | actual_result = g_utf8_get_char_validated (p: test_vectors[i].buf, |
354 | max_len: test_vectors[i].max_len); |
355 | g_assert_cmpint (actual_result, ==, test_vectors[i].expected_result); |
356 | } |
357 | } |
358 | |
359 | int |
360 | main (int argc, char *argv[]) |
361 | { |
362 | gint i; |
363 | gchar *path; |
364 | |
365 | g_test_init (argc: &argc, argv: &argv, NULL); |
366 | |
367 | for (i = 0; test[i].text; i++) |
368 | { |
369 | path = g_strdup_printf (format: "/utf8/validate/%d" , i); |
370 | g_test_add_data_func (testpath: path, test_data: &test[i], test_func: do_test); |
371 | g_free (mem: path); |
372 | } |
373 | |
374 | g_test_add_func (testpath: "/utf8/get-char-validated" , test_func: test_utf8_get_char_validated); |
375 | |
376 | return g_test_run (); |
377 | } |
378 | |