// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel module for testing utf-8 support.
 *
 * Copyright 2017 Collabora Ltd.
 */
7 | |
/* Prepend "<module name>: " to every format string passed through pr_fmt(). */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 | |
10 | #include <linux/module.h> |
11 | #include <linux/printk.h> |
12 | #include <linux/unicode.h> |
13 | #include <linux/dcache.h> |
14 | |
15 | #include "utf8n.h" |
16 | |
/*
 * Result counters for this file's checks: total_tests counts every evaluated
 * condition, failed_tests counts the ones that did not hold.  They are only
 * referenced from this file, so give them internal linkage instead of
 * exporting two generically named symbols.
 */
static unsigned int failed_tests;
static unsigned int total_tests;
19 | |
/* Tests will be based on this version. */
#define UTF8_LATEST UNICODE_AGE(12, 1, 0)

/*
 * _test() - evaluate one condition and account for the result.
 * @cond: expression that must be true for the check to pass
 * @func: name of the calling function, printed on failure
 * @line: source line of the check, printed on failure
 * @fmt:  string literal with a printf-style message printed on failure
 * @...:  arguments for @fmt
 *
 * total_tests is incremented for every evaluation and failed_tests for every
 * condition that is false.  @cond is parenthesized before it is negated so
 * that an argument such as "a == b" is negated as a whole rather than being
 * parsed as "!a == b".  The stringified condition is part of the failure
 * report.
 *
 * test_f() and test() fill in the caller's function name and line; test()
 * supplies an empty message.
 */
#define _test(cond, func, line, fmt, ...) do { \
		total_tests++; \
		if (!(cond)) { \
			failed_tests++; \
			pr_err("test %s:%d Failed: %s%s", \
			       func, line, #cond, (fmt?":":".")); \
			if (fmt) \
				pr_err(fmt, ##__VA_ARGS__); \
		} \
	} while (0)
#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define test(cond) _test(cond, __func__, __LINE__, "")
35 | |
/*
 * Decomposition vectors: normalizing @str with UTF8_NFDI is expected to
 * produce exactly the bytes in @dec.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[10];
	unsigned char dec[10];
} nfdi_test_data[] = {
	/* Trivial sequence */
	{
		/* "aBba" decomposes to itself */
		.str = "aBba",
		.dec = "aBba",
	},
	/* Simple equivalent sequences */
	{
		/*
		 * 'VULGAR FRACTION ONE QUARTER' cannot decompose to
		 * 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on
		 * canonical decomposition
		 */
		.str = {0xc2, 0xbc, 0x00},
		.dec = {0xc2, 0xbc, 0x00},
	},
	{
		/*
		 * 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
		 * 'LETTER A' + 'COMBINING DIAERESIS'
		 */
		.str = {0xc3, 0xa4, 0x00},
		.dec = {0x61, 0xcc, 0x88, 0x00},
	},
	{
		/*
		 * 'LATIN SMALL LETTER LJ' can't decompose to
		 * 'LETTER L' + 'LETTER J' on canonical decomposition
		 */
		.str = {0xc7, 0x89, 0x00},
		.dec = {0xc7, 0x89, 0x00},
	},
	{
		/* GREEK ANO TELEIA decomposes to MIDDLE DOT */
		.str = {0xce, 0x87, 0x00},
		.dec = {0xc2, 0xb7, 0x00},
	},
	/* Canonical ordering */
	{
		/*
		 * A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK'
		 * decomposes to
		 * A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT'
		 */
		.str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x00},
		.dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x00},
	},
	{
		/*
		 * 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
		 * decomposes to
		 * 'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS'
		 */
		.str = {0xc3, 0xa4, 0xcc, 0xa8, 0x00},
		.dec = {0x61, 0xcc, 0xa8, 0xcc, 0x88, 0x00},
	},
};
89 | |
/*
 * Decomposition + case-folding vectors: normalizing @str with UTF8_NFDICF is
 * expected to produce exactly the bytes in @ncf.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[30];
	unsigned char ncf[30];
} nfdicf_test_data[] = {
	/* Trivial sequences */
	{
		/* "ABba" folds to lowercase */
		.str = "ABba",
		.ncf = "abba",
	},
	{
		/* All ASCII folds to lower-case */
		.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
		.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
	},
	{
		/*
		 * LATIN SMALL LETTER SHARP S folds to
		 * LATIN SMALL LETTER S + LATIN SMALL LETTER S
		 */
		.str = {0xc3, 0x9f, 0x00},
		.ncf = {0x73, 0x73, 0x00},
	},
	{
		/*
		 * LATIN CAPITAL LETTER A WITH RING ABOVE folds to
		 * LATIN SMALL LETTER A + COMBINING RING ABOVE
		 */
		.str = {0xc3, 0x85, 0x00},
		.ncf = {0x61, 0xcc, 0x8a, 0x00},
	},
	/* Introduced by Unicode 8.0.0. */
	/*
	 * Cherokee letters are interesting test-cases because they fold
	 * to upper-case.  Before 8.0.0, Cherokee lowercase were
	 * undefined, thus, the folding from LC is not stable between
	 * 7.0.0 -> 8.0.0, but it is from UC.
	 */
	{
		/* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */
		.str = {0xea, 0xad, 0xb0, 0x00},
		.ncf = {0xe1, 0x8e, 0xa0, 0x00},
	},
	{
		/* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */
		.str = {0xe1, 0x8f, 0xb8, 0x00},
		.ncf = {0xe1, 0x8f, 0xb0, 0x00},
	},
	{
		/*
		 * OLD HUNGARIAN CAPITAL LETTER AMB folds to
		 * OLD HUNGARIAN SMALL LETTER AMB
		 */
		.str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
		.ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
	},
	/* Introduced by Unicode 9.0.0. */
	{
		/*
		 * OSAGE CAPITAL LETTER CHA folds to
		 * OSAGE SMALL LETTER CHA
		 */
		.str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
		.ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
	},
	{
		/*
		 * LATIN CAPITAL LETTER SMALL CAPITAL I folds to
		 * LATIN LETTER SMALL CAPITAL I
		 */
		.str = {0xea, 0x9e, 0xae, 0x00},
		.ncf = {0xc9, 0xaa, 0x00},
	},
	/* Introduced by Unicode 11.0.0. */
	{
		/*
		 * GEORGIAN MTAVRULI CAPITAL LETTER AN (U+1C90) folds to
		 * GEORGIAN LETTER AN (U+10D0)
		 */
		.str = {0xe1, 0xb2, 0x90, 0x00},
		.ncf = {0xe1, 0x83, 0x90, 0x00},
	},
};
160 | |
161 | static ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n, |
162 | const char *s) |
163 | { |
164 | return utf8nlen(um, n, s, len: (size_t)-1); |
165 | } |
166 | |
167 | static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um, |
168 | enum utf8_normalization n, const char *s) |
169 | { |
170 | return utf8ncursor(u8c, um, n, s, len: (unsigned int)-1); |
171 | } |
172 | |
173 | static void check_utf8_nfdi(struct unicode_map *um) |
174 | { |
175 | int i; |
176 | struct utf8cursor u8c; |
177 | |
178 | for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { |
179 | int len = strlen(nfdi_test_data[i].str); |
180 | int nlen = strlen(nfdi_test_data[i].dec); |
181 | int j = 0; |
182 | unsigned char c; |
183 | |
184 | test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen)); |
185 | test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) == |
186 | nlen)); |
187 | |
188 | if (utf8cursor(u8c: &u8c, um, n: UTF8_NFDI, s: nfdi_test_data[i].str) < 0) |
189 | pr_err("can't create cursor\n" ); |
190 | |
191 | while ((c = utf8byte(u8c: &u8c)) > 0) { |
192 | test_f((c == nfdi_test_data[i].dec[j]), |
193 | "Unexpected byte 0x%x should be 0x%x\n" , |
194 | c, nfdi_test_data[i].dec[j]); |
195 | j++; |
196 | } |
197 | |
198 | test((j == nlen)); |
199 | } |
200 | } |
201 | |
202 | static void check_utf8_nfdicf(struct unicode_map *um) |
203 | { |
204 | int i; |
205 | struct utf8cursor u8c; |
206 | |
207 | for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { |
208 | int len = strlen(nfdicf_test_data[i].str); |
209 | int nlen = strlen(nfdicf_test_data[i].ncf); |
210 | int j = 0; |
211 | unsigned char c; |
212 | |
213 | test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) == |
214 | nlen)); |
215 | test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) == |
216 | nlen)); |
217 | |
218 | if (utf8cursor(u8c: &u8c, um, n: UTF8_NFDICF, |
219 | s: nfdicf_test_data[i].str) < 0) |
220 | pr_err("can't create cursor\n" ); |
221 | |
222 | while ((c = utf8byte(u8c: &u8c)) > 0) { |
223 | test_f((c == nfdicf_test_data[i].ncf[j]), |
224 | "Unexpected byte 0x%x should be 0x%x\n" , |
225 | c, nfdicf_test_data[i].ncf[j]); |
226 | j++; |
227 | } |
228 | |
229 | test((j == nlen)); |
230 | } |
231 | } |
232 | |
233 | static void check_utf8_comparisons(struct unicode_map *table) |
234 | { |
235 | int i; |
236 | |
237 | for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { |
238 | const struct qstr s1 = {.name = nfdi_test_data[i].str, |
239 | .len = sizeof(nfdi_test_data[i].str)}; |
240 | const struct qstr s2 = {.name = nfdi_test_data[i].dec, |
241 | .len = sizeof(nfdi_test_data[i].dec)}; |
242 | |
243 | test_f(!utf8_strncmp(table, &s1, &s2), |
244 | "%s %s comparison mismatch\n" , s1.name, s2.name); |
245 | } |
246 | |
247 | for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { |
248 | const struct qstr s1 = {.name = nfdicf_test_data[i].str, |
249 | .len = sizeof(nfdicf_test_data[i].str)}; |
250 | const struct qstr s2 = {.name = nfdicf_test_data[i].ncf, |
251 | .len = sizeof(nfdicf_test_data[i].ncf)}; |
252 | |
253 | test_f(!utf8_strncasecmp(table, &s1, &s2), |
254 | "%s %s comparison mismatch\n" , s1.name, s2.name); |
255 | } |
256 | } |
257 | |
258 | static void check_supported_versions(struct unicode_map *um) |
259 | { |
260 | /* Unicode 7.0.0 should be supported. */ |
261 | test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); |
262 | |
263 | /* Unicode 9.0.0 should be supported. */ |
264 | test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); |
265 | |
266 | /* Unicode 1x.0.0 (the latest version) should be supported. */ |
267 | test(utf8version_is_supported(um, UTF8_LATEST)); |
268 | |
269 | /* Next versions don't exist. */ |
270 | test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); |
271 | test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); |
272 | test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); |
273 | } |
274 | |
275 | static int __init init_test_ucd(void) |
276 | { |
277 | struct unicode_map *um; |
278 | |
279 | failed_tests = 0; |
280 | total_tests = 0; |
281 | |
282 | um = utf8_load(UTF8_LATEST); |
283 | if (IS_ERR(ptr: um)) { |
284 | pr_err("%s: Unable to load utf8 table.\n" , __func__); |
285 | return PTR_ERR(ptr: um); |
286 | } |
287 | |
288 | check_supported_versions(um); |
289 | check_utf8_nfdi(um); |
290 | check_utf8_nfdicf(um); |
291 | check_utf8_comparisons(table: um); |
292 | |
293 | if (!failed_tests) |
294 | pr_info("All %u tests passed\n" , total_tests); |
295 | else |
296 | pr_err("%u out of %u tests failed\n" , failed_tests, |
297 | total_tests); |
298 | utf8_unload(um); |
299 | return 0; |
300 | } |
301 | |
302 | static void __exit exit_test_ucd(void) |
303 | { |
304 | } |
305 | |
306 | module_init(init_test_ucd); |
307 | module_exit(exit_test_ucd); |
308 | |
309 | MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>" ); |
310 | MODULE_LICENSE("GPL" ); |
311 | |