utf8-validate.c source code [gtk/subprojects/glib/glib/tests/utf8-validate.c]

/* GLIB - Library of useful routines for C programming
 * Copyright (C) 2001 Matthias Clasen <matthiasc@poet.de>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
17
18	#include "glib.h"
19	#include <string.h>
20
/* Evaluates to non-zero when Char passes all of the following checks:
 *   - it is below 0x110000;
 *   - it is not in the 0xD800..0xDFFF block (masking with 0xFFFFF800
 *     must not yield 0xD800);
 *   - it is outside the range 0xFDD0..0xFDEF;
 *   - its low 16 bits are neither 0xFFFE nor 0xFFFF.
 *
 * The argument is evaluated several times, so it must not have side
 * effects.
 */
#define UNICODE_VALID(Char)                   \
    ((Char) < 0x110000 &&                     \
     (((Char) & 0xFFFFF800) != 0xD800) &&     \
     ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
     ((Char) & 0xFFFE) != 0xFFFE)
26
27
28	typedef struct {
29	const gchar *text;
30	gint max_len;
31	gint offset;
32	gboolean valid;
33	} Test;
34
35	Test test[] = {
36	/ some tests to check max_len handling /
37	/ length 1 /
38	{ "abcde", -`1`, `5`, TRUE },
39	{ "abcde", `3`, `3`, TRUE },
40	{ "abcde", `5`, `5`, TRUE },
41	{ "abcde", `7`, `5`, FALSE },
42	/ length 2 /
43	{ "\xc2\xa9\xc2\xa9\xc2\xa9", -`1`, `6`, TRUE },
44	{ "\xc2\xa9\xc2\xa9\xc2\xa9", `1`, `0`, FALSE },
45	{ "\xc2\xa9\xc2\xa9\xc2\xa9", `2`, `2`, TRUE },
46	{ "\xc2\xa9\xc2\xa9\xc2\xa9", `3`, `2`, FALSE },
47	{ "\xc2\xa9\xc2\xa9\xc2\xa9", `4`, `4`, TRUE },
48	{ "\xc2\xa9\xc2\xa9\xc2\xa9", `5`, `4`, FALSE },
49	{ "\xc2\xa9\xc2\xa9\xc2\xa9", `6`, `6`, TRUE },
50	{ "\xc2\xa9\xc2\xa9\xc2\xa9", `7`, `6`, FALSE },
51	/ length 3 /
52	{ "\xe2\x89\xa0\xe2\x89\xa0", -`1`, `6`, TRUE },
53	{ "\xe2\x89\xa0\xe2\x89\xa0", `1`, `0`, FALSE },
54	{ "\xe2\x89\xa0\xe2\x89\xa0", `2`, `0`, FALSE },
55	{ "\xe2\x89\xa0\xe2\x89\xa0", `3`, `3`, TRUE },
56	{ "\xe2\x89\xa0\xe2\x89\xa0", `4`, `3`, FALSE },
57	{ "\xe2\x89\xa0\xe2\x89\xa0", `5`, `3`, FALSE },
58	{ "\xe2\x89\xa0\xe2\x89\xa0", `6`, `6`, TRUE },
59	{ "\xe2\x89\xa0\xe2\x89\xa0", `7`, `6`, FALSE },
60
61	/ examples from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt /
62	/ greek 'kosme' /
63	{ "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", -`1`, `11`, TRUE },
64	/ first sequence of each length /
65	{ "\x00", -`1`, `0`, TRUE },
66	{ "\xc2\x80", -`1`, `2`, TRUE },
67	{ "\xe0\xa0\x80", -`1`, `3`, TRUE },
68	{ "\xf0\x90\x80\x80", -`1`, `4`, TRUE },
69	{ "\xf8\x88\x80\x80\x80", -`1`, `0`, FALSE },
70	{ "\xfc\x84\x80\x80\x80\x80", -`1`, `0`, FALSE },
71	/ last sequence of each length /
72	{ "\x7f", -`1`, `1`, TRUE },
73	{ "\xdf\xbf", -`1`, `2`, TRUE },
74	{ "\xef\xbf\xbf", -`1`, `3`, TRUE },
75	{ "\xf7\xbf\xbf\xbf", -`1`, `0`, FALSE },
76	{ "\xfb\xbf\xbf\xbf\xbf", -`1`, `0`, FALSE },
77	{ "\xfd\xbf\xbf\xbf\xbf\xbf", -`1`, `0`, FALSE },
78	/ other boundary conditions /
79	{ "\xed\x9f\xbf", -`1`, `3`, TRUE },
80	{ "\xee\x80\x80", -`1`, `3`, TRUE },
81	{ "\xef\xbf\xbd", -`1`, `3`, TRUE },
82	{ "\xf4\x8f\xbf\xbf", -`1`, `4`, TRUE },
83	{ "\xf4\x90\x80\x80", -`1`, `0`, FALSE },
84	/ malformed sequences /
85	/ continuation bytes /
86	{ "\x80", -`1`, `0`, FALSE },
87	{ "\xbf", -`1`, `0`, FALSE },
88	{ "\xbf\x80", -`1`, `0`, FALSE },
89	{ "\x80\xbf", -`1`, `0`, FALSE },
90	{ "\x80\xbf\x80", -`1`, `0`, FALSE },
91	{ "\x80\xbf\x80\xbf", -`1`, `0`, FALSE },
92	{ "\x80\xbf\x80\xbf\x80", -`1`, `0`, FALSE },
93	{ "\x80\xbf\x80\xbf\x80\xbf", -`1`, `0`, FALSE },
94	{ "\x80\xbf\x80\xbf\x80\xbf\x80", -`1`, `0`, FALSE },
95
96	/ all possible continuation byte /
97	{ "\x80", -`1`, `0`, FALSE },
98	{ "\x81", -`1`, `0`, FALSE },
99	{ "\x82", -`1`, `0`, FALSE },
100	{ "\x83", -`1`, `0`, FALSE },
101	{ "\x84", -`1`, `0`, FALSE },
102	{ "\x85", -`1`, `0`, FALSE },
103	{ "\x86", -`1`, `0`, FALSE },
104	{ "\x87", -`1`, `0`, FALSE },
105	{ "\x88", -`1`, `0`, FALSE },
106	{ "\x89", -`1`, `0`, FALSE },
107	{ "\x8a", -`1`, `0`, FALSE },
108	{ "\x8b", -`1`, `0`, FALSE },
109	{ "\x8c", -`1`, `0`, FALSE },
110	{ "\x8d", -`1`, `0`, FALSE },
111	{ "\x8e", -`1`, `0`, FALSE },
112	{ "\x8f", -`1`, `0`, FALSE },
113	{ "\x90", -`1`, `0`, FALSE },
114	{ "\x91", -`1`, `0`, FALSE },
115	{ "\x92", -`1`, `0`, FALSE },
116	{ "\x93", -`1`, `0`, FALSE },
117	{ "\x94", -`1`, `0`, FALSE },
118	{ "\x95", -`1`, `0`, FALSE },
119	{ "\x96", -`1`, `0`, FALSE },
120	{ "\x97", -`1`, `0`, FALSE },
121	{ "\x98", -`1`, `0`, FALSE },
122	{ "\x99", -`1`, `0`, FALSE },
123	{ "\x9a", -`1`, `0`, FALSE },
124	{ "\x9b", -`1`, `0`, FALSE },
125	{ "\x9c", -`1`, `0`, FALSE },
126	{ "\x9d", -`1`, `0`, FALSE },
127	{ "\x9e", -`1`, `0`, FALSE },
128	{ "\x9f", -`1`, `0`, FALSE },
129	{ "\xa0", -`1`, `0`, FALSE },
130	{ "\xa1", -`1`, `0`, FALSE },
131	{ "\xa2", -`1`, `0`, FALSE },
132	{ "\xa3", -`1`, `0`, FALSE },
133	{ "\xa4", -`1`, `0`, FALSE },
134	{ "\xa5", -`1`, `0`, FALSE },
135	{ "\xa6", -`1`, `0`, FALSE },
136	{ "\xa7", -`1`, `0`, FALSE },
137	{ "\xa8", -`1`, `0`, FALSE },
138	{ "\xa9", -`1`, `0`, FALSE },
139	{ "\xaa", -`1`, `0`, FALSE },
140	{ "\xab", -`1`, `0`, FALSE },
141	{ "\xac", -`1`, `0`, FALSE },
142	{ "\xad", -`1`, `0`, FALSE },
143	{ "\xae", -`1`, `0`, FALSE },
144	{ "\xaf", -`1`, `0`, FALSE },
145	{ "\xb0", -`1`, `0`, FALSE },
146	{ "\xb1", -`1`, `0`, FALSE },
147	{ "\xb2", -`1`, `0`, FALSE },
148	{ "\xb3", -`1`, `0`, FALSE },
149	{ "\xb4", -`1`, `0`, FALSE },
150	{ "\xb5", -`1`, `0`, FALSE },
151	{ "\xb6", -`1`, `0`, FALSE },
152	{ "\xb7", -`1`, `0`, FALSE },
153	{ "\xb8", -`1`, `0`, FALSE },
154	{ "\xb9", -`1`, `0`, FALSE },
155	{ "\xba", -`1`, `0`, FALSE },
156	{ "\xbb", -`1`, `0`, FALSE },
157	{ "\xbc", -`1`, `0`, FALSE },
158	{ "\xbd", -`1`, `0`, FALSE },
159	{ "\xbe", -`1`, `0`, FALSE },
160	{ "\xbf", -`1`, `0`, FALSE },
161	/ lone start characters /
162	{ "\xc0\x20", -`1`, `0`, FALSE },
163	{ "\xc1\x20", -`1`, `0`, FALSE },
164	{ "\xc2\x20", -`1`, `0`, FALSE },
165	{ "\xc3\x20", -`1`, `0`, FALSE },
166	{ "\xc4\x20", -`1`, `0`, FALSE },
167	{ "\xc5\x20", -`1`, `0`, FALSE },
168	{ "\xc6\x20", -`1`, `0`, FALSE },
169	{ "\xc7\x20", -`1`, `0`, FALSE },
170	{ "\xc8\x20", -`1`, `0`, FALSE },
171	{ "\xc9\x20", -`1`, `0`, FALSE },
172	{ "\xca\x20", -`1`, `0`, FALSE },
173	{ "\xcb\x20", -`1`, `0`, FALSE },
174	{ "\xcc\x20", -`1`, `0`, FALSE },
175	{ "\xcd\x20", -`1`, `0`, FALSE },
176	{ "\xce\x20", -`1`, `0`, FALSE },
177	{ "\xcf\x20", -`1`, `0`, FALSE },
178	{ "\xd0\x20", -`1`, `0`, FALSE },
179	{ "\xd1\x20", -`1`, `0`, FALSE },
180	{ "\xd2\x20", -`1`, `0`, FALSE },
181	{ "\xd3\x20", -`1`, `0`, FALSE },
182	{ "\xd4\x20", -`1`, `0`, FALSE },
183	{ "\xd5\x20", -`1`, `0`, FALSE },
184	{ "\xd6\x20", -`1`, `0`, FALSE },
185	{ "\xd7\x20", -`1`, `0`, FALSE },
186	{ "\xd8\x20", -`1`, `0`, FALSE },
187	{ "\xd9\x20", -`1`, `0`, FALSE },
188	{ "\xda\x20", -`1`, `0`, FALSE },
189	{ "\xdb\x20", -`1`, `0`, FALSE },
190	{ "\xdc\x20", -`1`, `0`, FALSE },
191	{ "\xdd\x20", -`1`, `0`, FALSE },
192	{ "\xde\x20", -`1`, `0`, FALSE },
193	{ "\xdf\x20", -`1`, `0`, FALSE },
194	{ "\xe0\x20", -`1`, `0`, FALSE },
195	{ "\xe1\x20", -`1`, `0`, FALSE },
196	{ "\xe2\x20", -`1`, `0`, FALSE },
197	{ "\xe3\x20", -`1`, `0`, FALSE },
198	{ "\xe4\x20", -`1`, `0`, FALSE },
199	{ "\xe5\x20", -`1`, `0`, FALSE },
200	{ "\xe6\x20", -`1`, `0`, FALSE },
201	{ "\xe7\x20", -`1`, `0`, FALSE },
202	{ "\xe8\x20", -`1`, `0`, FALSE },
203	{ "\xe9\x20", -`1`, `0`, FALSE },
204	{ "\xea\x20", -`1`, `0`, FALSE },
205	{ "\xeb\x20", -`1`, `0`, FALSE },
206	{ "\xec\x20", -`1`, `0`, FALSE },
207	{ "\xed\x20", -`1`, `0`, FALSE },
208	{ "\xee\x20", -`1`, `0`, FALSE },
209	{ "\xef\x20", -`1`, `0`, FALSE },
210	{ "\xf0\x20", -`1`, `0`, FALSE },
211	{ "\xf1\x20", -`1`, `0`, FALSE },
212	{ "\xf2\x20", -`1`, `0`, FALSE },
213	{ "\xf3\x20", -`1`, `0`, FALSE },
214	{ "\xf4\x20", -`1`, `0`, FALSE },
215	{ "\xf5\x20", -`1`, `0`, FALSE },
216	{ "\xf6\x20", -`1`, `0`, FALSE },
217	{ "\xf7\x20", -`1`, `0`, FALSE },
218	{ "\xf8\x20", -`1`, `0`, FALSE },
219	{ "\xf9\x20", -`1`, `0`, FALSE },
220	{ "\xfa\x20", -`1`, `0`, FALSE },
221	{ "\xfb\x20", -`1`, `0`, FALSE },
222	{ "\xfc\x20", -`1`, `0`, FALSE },
223	{ "\xfd\x20", -`1`, `0`, FALSE },
224	/ missing continuation bytes /
225	{ "\x20\xc0", -`1`, `1`, FALSE },
226	{ "\x20\xe0\x80", -`1`, `1`, FALSE },
227	{ "\x20\xf0\x80\x80", -`1`, `1`, FALSE },
228	{ "\x20\xf8\x80\x80\x80", -`1`, `1`, FALSE },
229	{ "\x20\xfc\x80\x80\x80\x80", -`1`, `1`, FALSE },
230	{ "\x20\xdf", -`1`, `1`, FALSE },
231	{ "\x20\xef\xbf", -`1`, `1`, FALSE },
232	{ "\x20\xf7\xbf\xbf", -`1`, `1`, FALSE },
233	{ "\x20\xfb\xbf\xbf\xbf", -`1`, `1`, FALSE },
234	{ "\x20\xfd\xbf\xbf\xbf\xbf", -`1`, `1`, FALSE },
235	/ impossible bytes /
236	{ "\x20\xfe\x20", -`1`, `1`, FALSE },
237	{ "\x20\xff\x20", -`1`, `1`, FALSE },
238	/ overlong sequences /
239	{ "\x20\xc0\xaf\x20", -`1`, `1`, FALSE },
240	{ "\x20\xe0\x80\xaf\x20", -`1`, `1`, FALSE },
241	{ "\x20\xf0\x80\x80\xaf\x20", -`1`, `1`, FALSE },
242	{ "\x20\xf8\x80\x80\x80\xaf\x20", -`1`, `1`, FALSE },
243	{ "\x20\xfc\x80\x80\x80\x80\xaf\x20", -`1`, `1`, FALSE },
244	{ "\x20\xc1\xbf\x20", -`1`, `1`, FALSE },
245	{ "\x20\xe0\x9f\xbf\x20", -`1`, `1`, FALSE },
246	{ "\x20\xf0\x8f\xbf\xbf\x20", -`1`, `1`, FALSE },
247	{ "\x20\xf8\x87\xbf\xbf\xbf\x20", -`1`, `1`, FALSE },
248	{ "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", -`1`, `1`, FALSE },
249	{ "\x20\xc0\x80\x20", -`1`, `1`, FALSE },
250	{ "\x20\xe0\x80\x80\x20", -`1`, `1`, FALSE },
251	{ "\x20\xf0\x80\x80\x80\x20", -`1`, `1`, FALSE },
252	{ "\x20\xf8\x80\x80\x80\x80\x20", -`1`, `1`, FALSE },
253	{ "\x20\xfc\x80\x80\x80\x80\x80\x20", -`1`, `1`, FALSE },
254	/ illegal code positions /
255	{ "\x20\xed\xa0\x80\x20", -`1`, `1`, FALSE },
256	{ "\x20\xed\xad\xbf\x20", -`1`, `1`, FALSE },
257	{ "\x20\xed\xae\x80\x20", -`1`, `1`, FALSE },
258	{ "\x20\xed\xaf\xbf\x20", -`1`, `1`, FALSE },
259	{ "\x20\xed\xb0\x80\x20", -`1`, `1`, FALSE },
260	{ "\x20\xed\xbe\x80\x20", -`1`, `1`, FALSE },
261	{ "\x20\xed\xbf\xbf\x20", -`1`, `1`, FALSE },
262	{ "\x20\xed\xa0\x80\xed\xb0\x80\x20", -`1`, `1`, FALSE },
263	{ "\x20\xed\xa0\x80\xed\xbf\xbf\x20", -`1`, `1`, FALSE },
264	{ "\x20\xed\xad\xbf\xed\xb0\x80\x20", -`1`, `1`, FALSE },
265	{ "\x20\xed\xad\xbf\xed\xbf\xbf\x20", -`1`, `1`, FALSE },
266	{ "\x20\xed\xae\x80\xed\xb0\x80\x20", -`1`, `1`, FALSE },
267	{ "\x20\xed\xae\x80\xed\xbf\xbf\x20", -`1`, `1`, FALSE },
268	{ "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -`1`, `1`, FALSE },
269	{ "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -`1`, `1`, FALSE },
270
271	{ NULL, }
272	};
273
274	static void
275	do_test (gconstpointer d)
276	{
277	const Test *test = d;
278	const gchar *end;
279	gboolean result;
280
281	result = g_utf8_validate (str: test->text, max_len: test->max_len, end: &end);
282
283	g_assert_true (result == test->valid);
284	g_assert_cmpint (end - test->text, ==, test->offset);
285
286	if (test->max_len < `0`)
287	{
288	result = g_utf8_validate (str: test->text, max_len: strlen (s: test->text), end: &end);
289
290	g_assert_true (result == test->valid);
291	g_assert_cmpint (end - test->text, ==, test->offset);
292	}
293	else
294	{
295	result = g_utf8_validate_len (str: test->text, max_len: test->max_len, end: &end);
296
297	g_assert_true (result == test->valid);
298	g_assert_cmpint (end - test->text, ==, test->offset);
299	}
300	}
301
302	/ Test the behaviour of g_utf8_get_char_validated() with various inputs and*
303	* length restrictions. */
304	static void
305	test_utf8_get_char_validated (void)
306	{
307	const struct {
308	const gchar *buf;
309	gssize max_len;
310	gunichar expected_result;
311	} test_vectors[] = {
312	/ Bug #780095: /
313	{ "\xC0\x00_45678", `8`, (gunichar) -`2` },
314	{ "\xC0\x00_45678", -`1`, (gunichar) -`2` },
315	/ It seems odd that the return value differs with the length input, but*
316	* that’s how it’s documented: */
317	{ "", `0`, (gunichar) -`2` },
318	{ "", -`1`, (gunichar) `0` },
319	{ "\0", `1`, (gunichar) -`2` },
320	{ "AB\0", `3`, `'A'` },
321	{ "A\0B", `3`, `'A'` },
322	{ "\0AB", `3`, (gunichar) -`2` },
323	{ "\xD8\0", `2`, (gunichar) -`2` },
324	/ Normal inputs: /
325	{ "hello", `5`, (gunichar) `'h'` },
326	{ "hello", -`1`, (gunichar) `'h'` },
327	{ "\xD8\x9F", `2`, `0x061F` },
328	{ "\xD8\x9F", -`1`, `0x061F` },
329	{ "\xD8\x9Fmore", `6`, `0x061F` },
330	{ "\xD8\x9Fmore", -`1`, `0x061F` },
331	{ "\xD8\x9F\0", `3`, `0x061F` },
332	{ "\xE2\x96\xB3", `3`, `0x25B3` },
333	{ "\xE2\x96\xB3", -`1`, `0x25B3` },
334	{ "\xE2\x96\xB3more", `7`, `0x25B3` },
335	{ "\xE2\x96\xB3more", -`1`, `0x25B3` },
336	{ "\xF0\x9F\x92\xA9", `4`, `0x1F4A9` },
337	{ "\xF0\x9F\x92\xA9", -`1`, `0x1F4A9` },
338	{ "\xF0\x9F\x92\xA9more", `8`, `0x1F4A9` },
339	{ "\xF0\x9F\x92\xA9more", -`1`, `0x1F4A9` },
340	/ Partial unichars: /
341	{ "\xD8", -`1`, (gunichar) -`2` },
342	{ "\xD8\x9F", `1`, (gunichar) -`2` },
343	{ "\xCE", -`1`, (gunichar) -`2` },
344	{ "\xCE", `1`, (gunichar) -`2` },
345	};
346	gsize i;
347
348	for (i = `0`; i < G_N_ELEMENTS (test_vectors); i++)
349	{
350	gunichar actual_result;
351
352	g_test_message (format: "Vector %" G_GSIZE_FORMAT, i);
353	actual_result = g_utf8_get_char_validated (p: test_vectors[i].buf,
354	max_len: test_vectors[i].max_len);
355	g_assert_cmpint (actual_result, ==, test_vectors[i].expected_result);
356	}
357	}
358
359	int
360	main (int argc, char *argv[])
361	{
362	gint i;
363	gchar *path;
364
365	g_test_init (argc: &argc, argv: &argv, NULL);
366
367	for (i = `0`; test[i].text; i++)
368	{
369	path = g_strdup_printf (format: "/utf8/validate/%d", i);
370	g_test_add_data_func (testpath: path, test_data: &test[i], test_func: do_test);
371	g_free (mem: path);
372	}
373
374	g_test_add_func (testpath: "/utf8/get-char-validated", test_func: test_utf8_get_char_validated);
375
376	return g_test_run ();
377	}
378

source code of gtk/subprojects/glib/glib/tests/utf8-validate.c