1 | /* |
2 | * Copyright (c) 2014 SGI. |
3 | * All rights reserved. |
4 | * |
5 | * This program is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU General Public License as |
7 | * published by the Free Software Foundation. |
8 | * |
9 | * This program is distributed in the hope that it would be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. |
13 | * |
14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program; if not, write the Free Software Foundation, |
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ |
18 | |
19 | /* Generator for a compact trie for unicode normalization */ |
20 | |
21 | #include <sys/types.h> |
22 | #include <stddef.h> |
23 | #include <stdlib.h> |
24 | #include <stdio.h> |
25 | #include <assert.h> |
26 | #include <string.h> |
27 | #include <unistd.h> |
28 | #include <errno.h> |
29 | |
30 | /* Default names of the in- and output files. */ |
31 | |
32 | #define AGE_NAME "DerivedAge.txt" |
33 | #define CCC_NAME "DerivedCombiningClass.txt" |
34 | #define PROP_NAME "DerivedCoreProperties.txt" |
35 | #define DATA_NAME "UnicodeData.txt" |
36 | #define FOLD_NAME "CaseFolding.txt" |
37 | #define NORM_NAME "NormalizationCorrections.txt" |
38 | #define TEST_NAME "NormalizationTest.txt" |
39 | #define UTF8_NAME "utf8data.h" |
40 | |
41 | const char *age_name = AGE_NAME; |
42 | const char *ccc_name = CCC_NAME; |
43 | const char *prop_name = PROP_NAME; |
44 | const char *data_name = DATA_NAME; |
45 | const char *fold_name = FOLD_NAME; |
46 | const char *norm_name = NORM_NAME; |
47 | const char *test_name = TEST_NAME; |
48 | const char *utf8_name = UTF8_NAME; |
49 | |
50 | int verbose = 0; |
51 | |
52 | /* An arbitrary line size limit on input lines. */ |
53 | |
54 | #define LINESIZE 1024 |
55 | char line[LINESIZE]; |
56 | char buf0[LINESIZE]; |
57 | char buf1[LINESIZE]; |
58 | char buf2[LINESIZE]; |
59 | char buf3[LINESIZE]; |
60 | |
61 | const char *argv0; |
62 | |
63 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) |
64 | |
65 | /* ------------------------------------------------------------------ */ |
66 | |
67 | /* |
68 | * Unicode version numbers consist of three parts: major, minor, and a |
69 | * revision. These numbers are packed into an unsigned int to obtain |
70 | * a single version number. |
71 | * |
72 | * To save space in the generated trie, the unicode version is not |
73 | * stored directly, instead we calculate a generation number from the |
74 | * unicode versions seen in the DerivedAge file, and use that as an |
75 | * index into a table of unicode versions. |
76 | */ |
77 | #define UNICODE_MAJ_SHIFT (16) |
78 | #define UNICODE_MIN_SHIFT (8) |
79 | |
80 | #define UNICODE_MAJ_MAX ((unsigned short)-1) |
81 | #define UNICODE_MIN_MAX ((unsigned char)-1) |
82 | #define UNICODE_REV_MAX ((unsigned char)-1) |
83 | |
84 | #define UNICODE_AGE(MAJ,MIN,REV) \ |
85 | (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ |
86 | ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ |
87 | ((unsigned int)(REV))) |
88 | |
89 | unsigned int *ages; |
90 | int ages_count; |
91 | |
92 | unsigned int unicode_maxage; |
93 | |
94 | static int age_valid(unsigned int major, unsigned int minor, |
95 | unsigned int revision) |
96 | { |
97 | if (major > UNICODE_MAJ_MAX) |
98 | return 0; |
99 | if (minor > UNICODE_MIN_MAX) |
100 | return 0; |
101 | if (revision > UNICODE_REV_MAX) |
102 | return 0; |
103 | return 1; |
104 | } |
105 | |
106 | /* ------------------------------------------------------------------ */ |
107 | |
108 | /* |
109 | * utf8trie_t |
110 | * |
111 | * A compact binary tree, used to decode UTF-8 characters. |
112 | * |
113 | * Internal nodes are one byte for the node itself, and up to three |
114 | * bytes for an offset into the tree. The first byte contains the |
115 | * following information: |
116 | * NEXTBYTE - flag - advance to next byte if set |
117 | * BITNUM - 3 bit field - the bit number to tested |
118 | * OFFLEN - 2 bit field - number of bytes in the offset |
119 | * if offlen == 0 (non-branching node) |
120 | * RIGHTPATH - 1 bit field - set if the following node is for the |
121 | * right-hand path (tested bit is set) |
122 | * TRIENODE - 1 bit field - set if the following node is an internal |
123 | * node, otherwise it is a leaf node |
124 | * if offlen != 0 (branching node) |
125 | * LEFTNODE - 1 bit field - set if the left-hand node is internal |
126 | * RIGHTNODE - 1 bit field - set if the right-hand node is internal |
127 | * |
128 | * Due to the way utf8 works, there cannot be branching nodes with |
129 | * NEXTBYTE set, and moreover those nodes always have a righthand |
130 | * descendant. |
131 | */ |
132 | typedef unsigned char utf8trie_t; |
133 | #define BITNUM 0x07 |
134 | #define NEXTBYTE 0x08 |
135 | #define OFFLEN 0x30 |
136 | #define OFFLEN_SHIFT 4 |
137 | #define RIGHTPATH 0x40 |
138 | #define TRIENODE 0x80 |
139 | #define RIGHTNODE 0x40 |
140 | #define LEFTNODE 0x80 |
141 | |
142 | /* |
143 | * utf8leaf_t |
144 | * |
145 | * The leaves of the trie are embedded in the trie, and so the same |
146 | * underlying datatype, unsigned char. |
147 | * |
148 | * leaf[0]: The unicode version, stored as a generation number that is |
149 | * an index into utf8agetab[]. With this we can filter code |
150 | * points based on the unicode version in which they were |
151 | * defined. The CCC of a non-defined code point is 0. |
152 | * leaf[1]: Canonical Combining Class. During normalization, we need |
153 | * to do a stable sort into ascending order of all characters |
154 | * with a non-zero CCC that occur between two characters with |
155 | * a CCC of 0, or at the begin or end of a string. |
156 | * The unicode standard guarantees that all CCC values are |
157 | * between 0 and 254 inclusive, which leaves 255 available as |
158 | * a special value. |
159 | * Code points with CCC 0 are known as stoppers. |
160 | * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the |
161 | * start of a NUL-terminated string that is the decomposition |
162 | * of the character. |
163 | * The CCC of a decomposable character is the same as the CCC |
164 | * of the first character of its decomposition. |
165 | * Some characters decompose as the empty string: these are |
166 | * characters with the Default_Ignorable_Code_Point property. |
167 | * These do affect normalization, as they all have CCC 0. |
168 | * |
169 | * The decompositions in the trie have been fully expanded. |
170 | * |
171 | * Casefolding, if applicable, is also done using decompositions. |
172 | */ |
173 | typedef unsigned char utf8leaf_t; |
174 | |
175 | #define LEAF_GEN(LEAF) ((LEAF)[0]) |
176 | #define LEAF_CCC(LEAF) ((LEAF)[1]) |
177 | #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2)) |
178 | |
179 | #define MAXGEN (255) |
180 | |
181 | #define MINCCC (0) |
182 | #define MAXCCC (254) |
183 | #define STOPPER (0) |
184 | #define DECOMPOSE (255) |
185 | #define HANGUL ((char)(255)) |
186 | |
187 | #define UTF8HANGULLEAF (12) |
188 | |
189 | struct tree; |
190 | static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *, |
191 | const char *, size_t); |
192 | static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *); |
193 | |
194 | unsigned char *utf8data; |
195 | size_t utf8data_size; |
196 | |
197 | utf8trie_t *nfdi; |
198 | utf8trie_t *nfdicf; |
199 | |
200 | /* ------------------------------------------------------------------ */ |
201 | |
202 | /* |
203 | * UTF8 valid ranges. |
204 | * |
205 | * The UTF-8 encoding spreads the bits of a 32bit word over several |
206 | * bytes. This table gives the ranges that can be held and how they'd |
207 | * be represented. |
208 | * |
209 | * 0x00000000 0x0000007F: 0xxxxxxx |
210 | * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx |
211 | * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx |
212 | * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
213 | * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
214 | * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
215 | * |
216 | * There is an additional requirement on UTF-8, in that only the |
217 | * shortest representation of a 32bit value is to be used. A decoder |
218 | * must not decode sequences that do not satisfy this requirement. |
219 | * Thus the allowed ranges have a lower bound. |
220 | * |
221 | * 0x00000000 0x0000007F: 0xxxxxxx |
222 | * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx |
223 | * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx |
224 | * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
225 | * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
226 | * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
227 | * |
228 | * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, |
229 | * 17 planes of 65536 values. This limits the sequences actually seen |
230 | * even more, to just the following. |
231 | * |
232 | * 0 - 0x7f: 0 0x7f |
233 | * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf |
234 | * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf |
235 | * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf |
236 | * |
237 | * Even within those ranges not all values are allowed: the surrogates |
238 | * 0xd800 - 0xdfff should never be seen. |
239 | * |
240 | * Note that the longest sequence seen with valid usage is 4 bytes, |
241 | * the same a single UTF-32 character. This makes the UTF-8 |
242 | * representation of Unicode strictly smaller than UTF-32. |
243 | * |
244 | * The shortest sequence requirement was introduced by: |
245 | * Corrigendum #1: UTF-8 Shortest Form |
246 | * It can be found here: |
247 | * http://www.unicode.org/versions/corrigendum1.html |
248 | * |
249 | */ |
250 | |
251 | #define UTF8_2_BITS 0xC0 |
252 | #define UTF8_3_BITS 0xE0 |
253 | #define UTF8_4_BITS 0xF0 |
254 | #define UTF8_N_BITS 0x80 |
255 | #define UTF8_2_MASK 0xE0 |
256 | #define UTF8_3_MASK 0xF0 |
257 | #define UTF8_4_MASK 0xF8 |
258 | #define UTF8_N_MASK 0xC0 |
259 | #define UTF8_V_MASK 0x3F |
260 | #define UTF8_V_SHIFT 6 |
261 | |
262 | static int utf8encode(char *str, unsigned int val) |
263 | { |
264 | int len; |
265 | |
266 | if (val < 0x80) { |
267 | str[0] = val; |
268 | len = 1; |
269 | } else if (val < 0x800) { |
270 | str[1] = val & UTF8_V_MASK; |
271 | str[1] |= UTF8_N_BITS; |
272 | val >>= UTF8_V_SHIFT; |
273 | str[0] = val; |
274 | str[0] |= UTF8_2_BITS; |
275 | len = 2; |
276 | } else if (val < 0x10000) { |
277 | str[2] = val & UTF8_V_MASK; |
278 | str[2] |= UTF8_N_BITS; |
279 | val >>= UTF8_V_SHIFT; |
280 | str[1] = val & UTF8_V_MASK; |
281 | str[1] |= UTF8_N_BITS; |
282 | val >>= UTF8_V_SHIFT; |
283 | str[0] = val; |
284 | str[0] |= UTF8_3_BITS; |
285 | len = 3; |
286 | } else if (val < 0x110000) { |
287 | str[3] = val & UTF8_V_MASK; |
288 | str[3] |= UTF8_N_BITS; |
289 | val >>= UTF8_V_SHIFT; |
290 | str[2] = val & UTF8_V_MASK; |
291 | str[2] |= UTF8_N_BITS; |
292 | val >>= UTF8_V_SHIFT; |
293 | str[1] = val & UTF8_V_MASK; |
294 | str[1] |= UTF8_N_BITS; |
295 | val >>= UTF8_V_SHIFT; |
296 | str[0] = val; |
297 | str[0] |= UTF8_4_BITS; |
298 | len = 4; |
299 | } else { |
300 | printf("%#x: illegal val\n" , val); |
301 | len = 0; |
302 | } |
303 | return len; |
304 | } |
305 | |
306 | static unsigned int utf8decode(const char *str) |
307 | { |
308 | const unsigned char *s = (const unsigned char*)str; |
309 | unsigned int unichar = 0; |
310 | |
311 | if (*s < 0x80) { |
312 | unichar = *s; |
313 | } else if (*s < UTF8_3_BITS) { |
314 | unichar = *s++ & 0x1F; |
315 | unichar <<= UTF8_V_SHIFT; |
316 | unichar |= *s & 0x3F; |
317 | } else if (*s < UTF8_4_BITS) { |
318 | unichar = *s++ & 0x0F; |
319 | unichar <<= UTF8_V_SHIFT; |
320 | unichar |= *s++ & 0x3F; |
321 | unichar <<= UTF8_V_SHIFT; |
322 | unichar |= *s & 0x3F; |
323 | } else { |
324 | unichar = *s++ & 0x0F; |
325 | unichar <<= UTF8_V_SHIFT; |
326 | unichar |= *s++ & 0x3F; |
327 | unichar <<= UTF8_V_SHIFT; |
328 | unichar |= *s++ & 0x3F; |
329 | unichar <<= UTF8_V_SHIFT; |
330 | unichar |= *s & 0x3F; |
331 | } |
332 | return unichar; |
333 | } |
334 | |
335 | static int utf32valid(unsigned int unichar) |
336 | { |
337 | return unichar < 0x110000; |
338 | } |
339 | |
340 | #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3) |
341 | |
342 | #define NODE 1 |
343 | #define LEAF 0 |
344 | |
345 | struct tree { |
346 | void *root; |
347 | int childnode; |
348 | const char *type; |
349 | unsigned int maxage; |
350 | struct tree *next; |
351 | int (*leaf_equal)(void *, void *); |
352 | void (*leaf_print)(void *, int); |
353 | int (*leaf_mark)(void *); |
354 | int (*leaf_size)(void *); |
355 | int *(*leaf_index)(struct tree *, void *); |
356 | unsigned char *(*leaf_emit)(void *, unsigned char *); |
357 | int leafindex[0x110000]; |
358 | int index; |
359 | }; |
360 | |
361 | struct node { |
362 | int index; |
363 | int offset; |
364 | int mark; |
365 | int size; |
366 | struct node *parent; |
367 | void *left; |
368 | void *right; |
369 | unsigned char bitnum; |
370 | unsigned char nextbyte; |
371 | unsigned char leftnode; |
372 | unsigned char rightnode; |
373 | unsigned int keybits; |
374 | unsigned int keymask; |
375 | }; |
376 | |
377 | /* |
378 | * Example lookup function for a tree. |
379 | */ |
380 | static void *lookup(struct tree *tree, const char *key) |
381 | { |
382 | struct node *node; |
383 | void *leaf = NULL; |
384 | |
385 | node = tree->root; |
386 | while (!leaf && node) { |
387 | if (node->nextbyte) |
388 | key++; |
389 | if (*key & (1 << (node->bitnum & 7))) { |
390 | /* Right leg */ |
391 | if (node->rightnode == NODE) { |
392 | node = node->right; |
393 | } else if (node->rightnode == LEAF) { |
394 | leaf = node->right; |
395 | } else { |
396 | node = NULL; |
397 | } |
398 | } else { |
399 | /* Left leg */ |
400 | if (node->leftnode == NODE) { |
401 | node = node->left; |
402 | } else if (node->leftnode == LEAF) { |
403 | leaf = node->left; |
404 | } else { |
405 | node = NULL; |
406 | } |
407 | } |
408 | } |
409 | |
410 | return leaf; |
411 | } |
412 | |
413 | /* |
414 | * A simple non-recursive tree walker: keep track of visits to the |
415 | * left and right branches in the leftmask and rightmask. |
416 | */ |
417 | static void tree_walk(struct tree *tree) |
418 | { |
419 | struct node *node; |
420 | unsigned int leftmask; |
421 | unsigned int rightmask; |
422 | unsigned int bitmask; |
423 | int indent = 1; |
424 | int nodes, singletons, leaves; |
425 | |
426 | nodes = singletons = leaves = 0; |
427 | |
428 | printf("%s_%x root %p\n" , tree->type, tree->maxage, tree->root); |
429 | if (tree->childnode == LEAF) { |
430 | assert(tree->root); |
431 | tree->leaf_print(tree->root, indent); |
432 | leaves = 1; |
433 | } else { |
434 | assert(tree->childnode == NODE); |
435 | node = tree->root; |
436 | leftmask = rightmask = 0; |
437 | while (node) { |
438 | printf("%*snode @ %p bitnum %d nextbyte %d" |
439 | " left %p right %p mask %x bits %x\n" , |
440 | indent, "" , node, |
441 | node->bitnum, node->nextbyte, |
442 | node->left, node->right, |
443 | node->keymask, node->keybits); |
444 | nodes += 1; |
445 | if (!(node->left && node->right)) |
446 | singletons += 1; |
447 | |
448 | while (node) { |
449 | bitmask = 1 << node->bitnum; |
450 | if ((leftmask & bitmask) == 0) { |
451 | leftmask |= bitmask; |
452 | if (node->leftnode == LEAF) { |
453 | assert(node->left); |
454 | tree->leaf_print(node->left, |
455 | indent+1); |
456 | leaves += 1; |
457 | } else if (node->left) { |
458 | assert(node->leftnode == NODE); |
459 | indent += 1; |
460 | node = node->left; |
461 | break; |
462 | } |
463 | } |
464 | if ((rightmask & bitmask) == 0) { |
465 | rightmask |= bitmask; |
466 | if (node->rightnode == LEAF) { |
467 | assert(node->right); |
468 | tree->leaf_print(node->right, |
469 | indent+1); |
470 | leaves += 1; |
471 | } else if (node->right) { |
472 | assert(node->rightnode == NODE); |
473 | indent += 1; |
474 | node = node->right; |
475 | break; |
476 | } |
477 | } |
478 | leftmask &= ~bitmask; |
479 | rightmask &= ~bitmask; |
480 | node = node->parent; |
481 | indent -= 1; |
482 | } |
483 | } |
484 | } |
485 | printf("nodes %d leaves %d singletons %d\n" , |
486 | nodes, leaves, singletons); |
487 | } |
488 | |
489 | /* |
490 | * Allocate an initialize a new internal node. |
491 | */ |
492 | static struct node *alloc_node(struct node *parent) |
493 | { |
494 | struct node *node; |
495 | int bitnum; |
496 | |
497 | node = malloc(sizeof(*node)); |
498 | node->left = node->right = NULL; |
499 | node->parent = parent; |
500 | node->leftnode = NODE; |
501 | node->rightnode = NODE; |
502 | node->keybits = 0; |
503 | node->keymask = 0; |
504 | node->mark = 0; |
505 | node->index = 0; |
506 | node->offset = -1; |
507 | node->size = 4; |
508 | |
509 | if (node->parent) { |
510 | bitnum = parent->bitnum; |
511 | if ((bitnum & 7) == 0) { |
512 | node->bitnum = bitnum + 7 + 8; |
513 | node->nextbyte = 1; |
514 | } else { |
515 | node->bitnum = bitnum - 1; |
516 | node->nextbyte = 0; |
517 | } |
518 | } else { |
519 | node->bitnum = 7; |
520 | node->nextbyte = 0; |
521 | } |
522 | |
523 | return node; |
524 | } |
525 | |
526 | /* |
527 | * Insert a new leaf into the tree, and collapse any subtrees that are |
528 | * fully populated and end in identical leaves. A nextbyte tagged |
529 | * internal node will not be removed to preserve the tree's integrity. |
530 | * Note that due to the structure of utf8, no nextbyte tagged node |
531 | * will be a candidate for removal. |
532 | */ |
533 | static int insert(struct tree *tree, char *key, int keylen, void *leaf) |
534 | { |
535 | struct node *node; |
536 | struct node *parent; |
537 | void **cursor; |
538 | int keybits; |
539 | |
540 | assert(keylen >= 1 && keylen <= 4); |
541 | |
542 | node = NULL; |
543 | cursor = &tree->root; |
544 | keybits = 8 * keylen; |
545 | |
546 | /* Insert, creating path along the way. */ |
547 | while (keybits) { |
548 | if (!*cursor) |
549 | *cursor = alloc_node(parent: node); |
550 | node = *cursor; |
551 | if (node->nextbyte) |
552 | key++; |
553 | if (*key & (1 << (node->bitnum & 7))) |
554 | cursor = &node->right; |
555 | else |
556 | cursor = &node->left; |
557 | keybits--; |
558 | } |
559 | *cursor = leaf; |
560 | |
561 | /* Merge subtrees if possible. */ |
562 | while (node) { |
563 | if (*key & (1 << (node->bitnum & 7))) |
564 | node->rightnode = LEAF; |
565 | else |
566 | node->leftnode = LEAF; |
567 | if (node->nextbyte) |
568 | break; |
569 | if (node->leftnode == NODE || node->rightnode == NODE) |
570 | break; |
571 | assert(node->left); |
572 | assert(node->right); |
573 | /* Compare */ |
574 | if (! tree->leaf_equal(node->left, node->right)) |
575 | break; |
576 | /* Keep left, drop right leaf. */ |
577 | leaf = node->left; |
578 | /* Check in parent */ |
579 | parent = node->parent; |
580 | if (!parent) { |
581 | /* root of tree! */ |
582 | tree->root = leaf; |
583 | tree->childnode = LEAF; |
584 | } else if (parent->left == node) { |
585 | parent->left = leaf; |
586 | parent->leftnode = LEAF; |
587 | if (parent->right) { |
588 | parent->keymask = 0; |
589 | parent->keybits = 0; |
590 | } else { |
591 | parent->keymask |= (1 << node->bitnum); |
592 | } |
593 | } else if (parent->right == node) { |
594 | parent->right = leaf; |
595 | parent->rightnode = LEAF; |
596 | if (parent->left) { |
597 | parent->keymask = 0; |
598 | parent->keybits = 0; |
599 | } else { |
600 | parent->keymask |= (1 << node->bitnum); |
601 | parent->keybits |= (1 << node->bitnum); |
602 | } |
603 | } else { |
604 | /* internal tree error */ |
605 | assert(0); |
606 | } |
607 | free(node); |
608 | node = parent; |
609 | } |
610 | |
611 | /* Propagate keymasks up along singleton chains. */ |
612 | while (node) { |
613 | parent = node->parent; |
614 | if (!parent) |
615 | break; |
616 | /* Nix the mask for parents with two children. */ |
617 | if (node->keymask == 0) { |
618 | parent->keymask = 0; |
619 | parent->keybits = 0; |
620 | } else if (parent->left && parent->right) { |
621 | parent->keymask = 0; |
622 | parent->keybits = 0; |
623 | } else { |
624 | assert((parent->keymask & node->keymask) == 0); |
625 | parent->keymask |= node->keymask; |
626 | parent->keymask |= (1 << parent->bitnum); |
627 | parent->keybits |= node->keybits; |
628 | if (parent->right) |
629 | parent->keybits |= (1 << parent->bitnum); |
630 | } |
631 | node = parent; |
632 | } |
633 | |
634 | return 0; |
635 | } |
636 | |
637 | /* |
638 | * Prune internal nodes. |
639 | * |
640 | * Fully populated subtrees that end at the same leaf have already |
641 | * been collapsed. There are still internal nodes that have for both |
642 | * their left and right branches a sequence of singletons that make |
643 | * identical choices and end in identical leaves. The keymask and |
644 | * keybits collected in the nodes describe the choices made in these |
645 | * singleton chains. When they are identical for the left and right |
646 | * branch of a node, and the two leaves comare identical, the node in |
647 | * question can be removed. |
648 | * |
649 | * Note that nodes with the nextbyte tag set will not be removed by |
650 | * this to ensure tree integrity. Note as well that the structure of |
651 | * utf8 ensures that these nodes would not have been candidates for |
652 | * removal in any case. |
653 | */ |
654 | static void prune(struct tree *tree) |
655 | { |
656 | struct node *node; |
657 | struct node *left; |
658 | struct node *right; |
659 | struct node *parent; |
660 | void *leftleaf; |
661 | void *rightleaf; |
662 | unsigned int leftmask; |
663 | unsigned int rightmask; |
664 | unsigned int bitmask; |
665 | int count; |
666 | |
667 | if (verbose > 0) |
668 | printf("Pruning %s_%x\n" , tree->type, tree->maxage); |
669 | |
670 | count = 0; |
671 | if (tree->childnode == LEAF) |
672 | return; |
673 | if (!tree->root) |
674 | return; |
675 | |
676 | leftmask = rightmask = 0; |
677 | node = tree->root; |
678 | while (node) { |
679 | if (node->nextbyte) |
680 | goto advance; |
681 | if (node->leftnode == LEAF) |
682 | goto advance; |
683 | if (node->rightnode == LEAF) |
684 | goto advance; |
685 | if (!node->left) |
686 | goto advance; |
687 | if (!node->right) |
688 | goto advance; |
689 | left = node->left; |
690 | right = node->right; |
691 | if (left->keymask == 0) |
692 | goto advance; |
693 | if (right->keymask == 0) |
694 | goto advance; |
695 | if (left->keymask != right->keymask) |
696 | goto advance; |
697 | if (left->keybits != right->keybits) |
698 | goto advance; |
699 | leftleaf = NULL; |
700 | while (!leftleaf) { |
701 | assert(left->left || left->right); |
702 | if (left->leftnode == LEAF) |
703 | leftleaf = left->left; |
704 | else if (left->rightnode == LEAF) |
705 | leftleaf = left->right; |
706 | else if (left->left) |
707 | left = left->left; |
708 | else if (left->right) |
709 | left = left->right; |
710 | else |
711 | assert(0); |
712 | } |
713 | rightleaf = NULL; |
714 | while (!rightleaf) { |
715 | assert(right->left || right->right); |
716 | if (right->leftnode == LEAF) |
717 | rightleaf = right->left; |
718 | else if (right->rightnode == LEAF) |
719 | rightleaf = right->right; |
720 | else if (right->left) |
721 | right = right->left; |
722 | else if (right->right) |
723 | right = right->right; |
724 | else |
725 | assert(0); |
726 | } |
727 | if (! tree->leaf_equal(leftleaf, rightleaf)) |
728 | goto advance; |
729 | /* |
730 | * This node has identical singleton-only subtrees. |
731 | * Remove it. |
732 | */ |
733 | parent = node->parent; |
734 | left = node->left; |
735 | right = node->right; |
736 | if (parent->left == node) |
737 | parent->left = left; |
738 | else if (parent->right == node) |
739 | parent->right = left; |
740 | else |
741 | assert(0); |
742 | left->parent = parent; |
743 | left->keymask |= (1 << node->bitnum); |
744 | node->left = NULL; |
745 | while (node) { |
746 | bitmask = 1 << node->bitnum; |
747 | leftmask &= ~bitmask; |
748 | rightmask &= ~bitmask; |
749 | if (node->leftnode == NODE && node->left) { |
750 | left = node->left; |
751 | free(node); |
752 | count++; |
753 | node = left; |
754 | } else if (node->rightnode == NODE && node->right) { |
755 | right = node->right; |
756 | free(node); |
757 | count++; |
758 | node = right; |
759 | } else { |
760 | node = NULL; |
761 | } |
762 | } |
763 | /* Propagate keymasks up along singleton chains. */ |
764 | node = parent; |
765 | /* Force re-check */ |
766 | bitmask = 1 << node->bitnum; |
767 | leftmask &= ~bitmask; |
768 | rightmask &= ~bitmask; |
769 | for (;;) { |
770 | if (node->left && node->right) |
771 | break; |
772 | if (node->left) { |
773 | left = node->left; |
774 | node->keymask |= left->keymask; |
775 | node->keybits |= left->keybits; |
776 | } |
777 | if (node->right) { |
778 | right = node->right; |
779 | node->keymask |= right->keymask; |
780 | node->keybits |= right->keybits; |
781 | } |
782 | node->keymask |= (1 << node->bitnum); |
783 | node = node->parent; |
784 | /* Force re-check */ |
785 | bitmask = 1 << node->bitnum; |
786 | leftmask &= ~bitmask; |
787 | rightmask &= ~bitmask; |
788 | } |
789 | advance: |
790 | bitmask = 1 << node->bitnum; |
791 | if ((leftmask & bitmask) == 0 && |
792 | node->leftnode == NODE && |
793 | node->left) { |
794 | leftmask |= bitmask; |
795 | node = node->left; |
796 | } else if ((rightmask & bitmask) == 0 && |
797 | node->rightnode == NODE && |
798 | node->right) { |
799 | rightmask |= bitmask; |
800 | node = node->right; |
801 | } else { |
802 | leftmask &= ~bitmask; |
803 | rightmask &= ~bitmask; |
804 | node = node->parent; |
805 | } |
806 | } |
807 | if (verbose > 0) |
808 | printf("Pruned %d nodes\n" , count); |
809 | } |
810 | |
811 | /* |
812 | * Mark the nodes in the tree that lead to leaves that must be |
813 | * emitted. |
814 | */ |
815 | static void mark_nodes(struct tree *tree) |
816 | { |
817 | struct node *node; |
818 | struct node *n; |
819 | unsigned int leftmask; |
820 | unsigned int rightmask; |
821 | unsigned int bitmask; |
822 | int marked; |
823 | |
824 | marked = 0; |
825 | if (verbose > 0) |
826 | printf("Marking %s_%x\n" , tree->type, tree->maxage); |
827 | if (tree->childnode == LEAF) |
828 | goto done; |
829 | |
830 | assert(tree->childnode == NODE); |
831 | node = tree->root; |
832 | leftmask = rightmask = 0; |
833 | while (node) { |
834 | bitmask = 1 << node->bitnum; |
835 | if ((leftmask & bitmask) == 0) { |
836 | leftmask |= bitmask; |
837 | if (node->leftnode == LEAF) { |
838 | assert(node->left); |
839 | if (tree->leaf_mark(node->left)) { |
840 | n = node; |
841 | while (n && !n->mark) { |
842 | marked++; |
843 | n->mark = 1; |
844 | n = n->parent; |
845 | } |
846 | } |
847 | } else if (node->left) { |
848 | assert(node->leftnode == NODE); |
849 | node = node->left; |
850 | continue; |
851 | } |
852 | } |
853 | if ((rightmask & bitmask) == 0) { |
854 | rightmask |= bitmask; |
855 | if (node->rightnode == LEAF) { |
856 | assert(node->right); |
857 | if (tree->leaf_mark(node->right)) { |
858 | n = node; |
859 | while (n && !n->mark) { |
860 | marked++; |
861 | n->mark = 1; |
862 | n = n->parent; |
863 | } |
864 | } |
865 | } else if (node->right) { |
866 | assert(node->rightnode == NODE); |
867 | node = node->right; |
868 | continue; |
869 | } |
870 | } |
871 | leftmask &= ~bitmask; |
872 | rightmask &= ~bitmask; |
873 | node = node->parent; |
874 | } |
875 | |
876 | /* second pass: left siblings and singletons */ |
877 | |
878 | assert(tree->childnode == NODE); |
879 | node = tree->root; |
880 | leftmask = rightmask = 0; |
881 | while (node) { |
882 | bitmask = 1 << node->bitnum; |
883 | if ((leftmask & bitmask) == 0) { |
884 | leftmask |= bitmask; |
885 | if (node->leftnode == LEAF) { |
886 | assert(node->left); |
887 | if (tree->leaf_mark(node->left)) { |
888 | n = node; |
889 | while (n && !n->mark) { |
890 | marked++; |
891 | n->mark = 1; |
892 | n = n->parent; |
893 | } |
894 | } |
895 | } else if (node->left) { |
896 | assert(node->leftnode == NODE); |
897 | node = node->left; |
898 | if (!node->mark && node->parent->mark) { |
899 | marked++; |
900 | node->mark = 1; |
901 | } |
902 | continue; |
903 | } |
904 | } |
905 | if ((rightmask & bitmask) == 0) { |
906 | rightmask |= bitmask; |
907 | if (node->rightnode == LEAF) { |
908 | assert(node->right); |
909 | if (tree->leaf_mark(node->right)) { |
910 | n = node; |
911 | while (n && !n->mark) { |
912 | marked++; |
913 | n->mark = 1; |
914 | n = n->parent; |
915 | } |
916 | } |
917 | } else if (node->right) { |
918 | assert(node->rightnode == NODE); |
919 | node = node->right; |
920 | if (!node->mark && node->parent->mark && |
921 | !node->parent->left) { |
922 | marked++; |
923 | node->mark = 1; |
924 | } |
925 | continue; |
926 | } |
927 | } |
928 | leftmask &= ~bitmask; |
929 | rightmask &= ~bitmask; |
930 | node = node->parent; |
931 | } |
932 | done: |
933 | if (verbose > 0) |
934 | printf("Marked %d nodes\n" , marked); |
935 | } |
936 | |
937 | /* |
938 | * Compute the index of each node and leaf, which is the offset in the |
939 | * emitted trie. These values must be pre-computed because relative |
940 | * offsets between nodes are used to navigate the tree. |
941 | */ |
942 | static int index_nodes(struct tree *tree, int index) |
943 | { |
944 | struct node *node; |
945 | unsigned int leftmask; |
946 | unsigned int rightmask; |
947 | unsigned int bitmask; |
948 | int count; |
949 | int indent; |
950 | |
951 | /* Align to a cache line (or half a cache line?). */ |
952 | while (index % 64) |
953 | index++; |
954 | tree->index = index; |
955 | indent = 1; |
956 | count = 0; |
957 | |
958 | if (verbose > 0) |
959 | printf("Indexing %s_%x: %d\n" , tree->type, tree->maxage, index); |
960 | if (tree->childnode == LEAF) { |
961 | index += tree->leaf_size(tree->root); |
962 | goto done; |
963 | } |
964 | |
965 | assert(tree->childnode == NODE); |
966 | node = tree->root; |
967 | leftmask = rightmask = 0; |
968 | while (node) { |
969 | if (!node->mark) |
970 | goto skip; |
971 | count++; |
972 | if (node->index != index) |
973 | node->index = index; |
974 | index += node->size; |
975 | skip: |
976 | while (node) { |
977 | bitmask = 1 << node->bitnum; |
978 | if (node->mark && (leftmask & bitmask) == 0) { |
979 | leftmask |= bitmask; |
980 | if (node->leftnode == LEAF) { |
981 | assert(node->left); |
982 | *tree->leaf_index(tree, node->left) = |
983 | index; |
984 | index += tree->leaf_size(node->left); |
985 | count++; |
986 | } else if (node->left) { |
987 | assert(node->leftnode == NODE); |
988 | indent += 1; |
989 | node = node->left; |
990 | break; |
991 | } |
992 | } |
993 | if (node->mark && (rightmask & bitmask) == 0) { |
994 | rightmask |= bitmask; |
995 | if (node->rightnode == LEAF) { |
996 | assert(node->right); |
997 | *tree->leaf_index(tree, node->right) = index; |
998 | index += tree->leaf_size(node->right); |
999 | count++; |
1000 | } else if (node->right) { |
1001 | assert(node->rightnode == NODE); |
1002 | indent += 1; |
1003 | node = node->right; |
1004 | break; |
1005 | } |
1006 | } |
1007 | leftmask &= ~bitmask; |
1008 | rightmask &= ~bitmask; |
1009 | node = node->parent; |
1010 | indent -= 1; |
1011 | } |
1012 | } |
1013 | done: |
1014 | /* Round up to a multiple of 16 */ |
1015 | while (index % 16) |
1016 | index++; |
1017 | if (verbose > 0) |
1018 | printf("Final index %d\n" , index); |
1019 | return index; |
1020 | } |
1021 | |
1022 | /* |
1023 | * Mark the nodes in a subtree, helper for size_nodes(). |
1024 | */ |
1025 | static int mark_subtree(struct node *node) |
1026 | { |
1027 | int changed; |
1028 | |
1029 | if (!node || node->mark) |
1030 | return 0; |
1031 | node->mark = 1; |
1032 | node->index = node->parent->index; |
1033 | changed = 1; |
1034 | if (node->leftnode == NODE) |
1035 | changed += mark_subtree(node: node->left); |
1036 | if (node->rightnode == NODE) |
1037 | changed += mark_subtree(node: node->right); |
1038 | return changed; |
1039 | } |
1040 | |
1041 | /* |
1042 | * Compute the size of nodes and leaves. We start by assuming that |
1043 | * each node needs to store a three-byte offset. The indexes of the |
1044 | * nodes are calculated based on that, and then this function is |
1045 | * called to see if the sizes of some nodes can be reduced. This is |
1046 | * repeated until no more changes are seen. |
1047 | */ |
1048 | static int size_nodes(struct tree *tree) |
1049 | { |
1050 | struct tree *next; |
1051 | struct node *node; |
1052 | struct node *right; |
1053 | struct node *n; |
1054 | unsigned int leftmask; |
1055 | unsigned int rightmask; |
1056 | unsigned int bitmask; |
1057 | unsigned int pathbits; |
1058 | unsigned int pathmask; |
1059 | unsigned int nbit; |
1060 | int changed; |
1061 | int offset; |
1062 | int size; |
1063 | int indent; |
1064 | |
1065 | indent = 1; |
1066 | changed = 0; |
1067 | size = 0; |
1068 | |
1069 | if (verbose > 0) |
1070 | printf("Sizing %s_%x\n" , tree->type, tree->maxage); |
1071 | if (tree->childnode == LEAF) |
1072 | goto done; |
1073 | |
1074 | assert(tree->childnode == NODE); |
1075 | pathbits = 0; |
1076 | pathmask = 0; |
1077 | node = tree->root; |
1078 | leftmask = rightmask = 0; |
1079 | while (node) { |
1080 | if (!node->mark) |
1081 | goto skip; |
1082 | offset = 0; |
1083 | if (!node->left || !node->right) { |
1084 | size = 1; |
1085 | } else { |
1086 | if (node->rightnode == NODE) { |
1087 | /* |
1088 | * If the right node is not marked, |
1089 | * look for a corresponding node in |
1090 | * the next tree. Such a node need |
1091 | * not exist. |
1092 | */ |
1093 | right = node->right; |
1094 | next = tree->next; |
1095 | while (!right->mark) { |
1096 | assert(next); |
1097 | n = next->root; |
1098 | while (n->bitnum != node->bitnum) { |
1099 | nbit = 1 << n->bitnum; |
1100 | if (!(pathmask & nbit)) |
1101 | break; |
1102 | if (pathbits & nbit) { |
1103 | if (n->rightnode == LEAF) |
1104 | break; |
1105 | n = n->right; |
1106 | } else { |
1107 | if (n->leftnode == LEAF) |
1108 | break; |
1109 | n = n->left; |
1110 | } |
1111 | } |
1112 | if (n->bitnum != node->bitnum) |
1113 | break; |
1114 | n = n->right; |
1115 | right = n; |
1116 | next = next->next; |
1117 | } |
1118 | /* Make sure the right node is marked. */ |
1119 | if (!right->mark) |
1120 | changed += mark_subtree(node: right); |
1121 | offset = right->index - node->index; |
1122 | } else { |
1123 | offset = *tree->leaf_index(tree, node->right); |
1124 | offset -= node->index; |
1125 | } |
1126 | assert(offset >= 0); |
1127 | assert(offset <= 0xffffff); |
1128 | if (offset <= 0xff) { |
1129 | size = 2; |
1130 | } else if (offset <= 0xffff) { |
1131 | size = 3; |
1132 | } else { /* offset <= 0xffffff */ |
1133 | size = 4; |
1134 | } |
1135 | } |
1136 | if (node->size != size || node->offset != offset) { |
1137 | node->size = size; |
1138 | node->offset = offset; |
1139 | changed++; |
1140 | } |
1141 | skip: |
1142 | while (node) { |
1143 | bitmask = 1 << node->bitnum; |
1144 | pathmask |= bitmask; |
1145 | if (node->mark && (leftmask & bitmask) == 0) { |
1146 | leftmask |= bitmask; |
1147 | if (node->leftnode == LEAF) { |
1148 | assert(node->left); |
1149 | } else if (node->left) { |
1150 | assert(node->leftnode == NODE); |
1151 | indent += 1; |
1152 | node = node->left; |
1153 | break; |
1154 | } |
1155 | } |
1156 | if (node->mark && (rightmask & bitmask) == 0) { |
1157 | rightmask |= bitmask; |
1158 | pathbits |= bitmask; |
1159 | if (node->rightnode == LEAF) { |
1160 | assert(node->right); |
1161 | } else if (node->right) { |
1162 | assert(node->rightnode == NODE); |
1163 | indent += 1; |
1164 | node = node->right; |
1165 | break; |
1166 | } |
1167 | } |
1168 | leftmask &= ~bitmask; |
1169 | rightmask &= ~bitmask; |
1170 | pathmask &= ~bitmask; |
1171 | pathbits &= ~bitmask; |
1172 | node = node->parent; |
1173 | indent -= 1; |
1174 | } |
1175 | } |
1176 | done: |
1177 | if (verbose > 0) |
1178 | printf("Found %d changes\n" , changed); |
1179 | return changed; |
1180 | } |
1181 | |
1182 | /* |
1183 | * Emit a trie for the given tree into the data array. |
1184 | */ |
1185 | static void emit(struct tree *tree, unsigned char *data) |
1186 | { |
1187 | struct node *node; |
1188 | unsigned int leftmask; |
1189 | unsigned int rightmask; |
1190 | unsigned int bitmask; |
1191 | int offlen; |
1192 | int offset; |
1193 | int index; |
1194 | int indent; |
1195 | int size; |
1196 | int bytes; |
1197 | int leaves; |
1198 | int nodes[4]; |
1199 | unsigned char byte; |
1200 | |
1201 | nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0; |
1202 | leaves = 0; |
1203 | bytes = 0; |
1204 | index = tree->index; |
1205 | data += index; |
1206 | indent = 1; |
1207 | if (verbose > 0) |
1208 | printf("Emitting %s_%x\n" , tree->type, tree->maxage); |
1209 | if (tree->childnode == LEAF) { |
1210 | assert(tree->root); |
1211 | tree->leaf_emit(tree->root, data); |
1212 | size = tree->leaf_size(tree->root); |
1213 | index += size; |
1214 | leaves++; |
1215 | goto done; |
1216 | } |
1217 | |
1218 | assert(tree->childnode == NODE); |
1219 | node = tree->root; |
1220 | leftmask = rightmask = 0; |
1221 | while (node) { |
1222 | if (!node->mark) |
1223 | goto skip; |
1224 | assert(node->offset != -1); |
1225 | assert(node->index == index); |
1226 | |
1227 | byte = 0; |
1228 | if (node->nextbyte) |
1229 | byte |= NEXTBYTE; |
1230 | byte |= (node->bitnum & BITNUM); |
1231 | if (node->left && node->right) { |
1232 | if (node->leftnode == NODE) |
1233 | byte |= LEFTNODE; |
1234 | if (node->rightnode == NODE) |
1235 | byte |= RIGHTNODE; |
1236 | if (node->offset <= 0xff) |
1237 | offlen = 1; |
1238 | else if (node->offset <= 0xffff) |
1239 | offlen = 2; |
1240 | else |
1241 | offlen = 3; |
1242 | nodes[offlen]++; |
1243 | offset = node->offset; |
1244 | byte |= offlen << OFFLEN_SHIFT; |
1245 | *data++ = byte; |
1246 | index++; |
1247 | while (offlen--) { |
1248 | *data++ = offset & 0xff; |
1249 | index++; |
1250 | offset >>= 8; |
1251 | } |
1252 | } else if (node->left) { |
1253 | if (node->leftnode == NODE) |
1254 | byte |= TRIENODE; |
1255 | nodes[0]++; |
1256 | *data++ = byte; |
1257 | index++; |
1258 | } else if (node->right) { |
1259 | byte |= RIGHTNODE; |
1260 | if (node->rightnode == NODE) |
1261 | byte |= TRIENODE; |
1262 | nodes[0]++; |
1263 | *data++ = byte; |
1264 | index++; |
1265 | } else { |
1266 | assert(0); |
1267 | } |
1268 | skip: |
1269 | while (node) { |
1270 | bitmask = 1 << node->bitnum; |
1271 | if (node->mark && (leftmask & bitmask) == 0) { |
1272 | leftmask |= bitmask; |
1273 | if (node->leftnode == LEAF) { |
1274 | assert(node->left); |
1275 | data = tree->leaf_emit(node->left, |
1276 | data); |
1277 | size = tree->leaf_size(node->left); |
1278 | index += size; |
1279 | bytes += size; |
1280 | leaves++; |
1281 | } else if (node->left) { |
1282 | assert(node->leftnode == NODE); |
1283 | indent += 1; |
1284 | node = node->left; |
1285 | break; |
1286 | } |
1287 | } |
1288 | if (node->mark && (rightmask & bitmask) == 0) { |
1289 | rightmask |= bitmask; |
1290 | if (node->rightnode == LEAF) { |
1291 | assert(node->right); |
1292 | data = tree->leaf_emit(node->right, |
1293 | data); |
1294 | size = tree->leaf_size(node->right); |
1295 | index += size; |
1296 | bytes += size; |
1297 | leaves++; |
1298 | } else if (node->right) { |
1299 | assert(node->rightnode == NODE); |
1300 | indent += 1; |
1301 | node = node->right; |
1302 | break; |
1303 | } |
1304 | } |
1305 | leftmask &= ~bitmask; |
1306 | rightmask &= ~bitmask; |
1307 | node = node->parent; |
1308 | indent -= 1; |
1309 | } |
1310 | } |
1311 | done: |
1312 | if (verbose > 0) { |
1313 | printf("Emitted %d (%d) leaves" , |
1314 | leaves, bytes); |
1315 | printf(" %d (%d+%d+%d+%d) nodes" , |
1316 | nodes[0] + nodes[1] + nodes[2] + nodes[3], |
1317 | nodes[0], nodes[1], nodes[2], nodes[3]); |
1318 | printf(" %d total\n" , index - tree->index); |
1319 | } |
1320 | } |
1321 | |
1322 | /* ------------------------------------------------------------------ */ |
1323 | |
1324 | /* |
1325 | * Unicode data. |
1326 | * |
1327 | * We need to keep track of the Canonical Combining Class, the Age, |
1328 | * and decompositions for a code point. |
1329 | * |
1330 | * For the Age, we store the index into the ages table. Effectively |
1331 | * this is a generation number that the table maps to a unicode |
1332 | * version. |
1333 | * |
1334 | * The correction field is used to indicate that this entry is in the |
1335 | * corrections array, which contains decompositions that were |
1336 | * corrected in later revisions. The value of the correction field is |
1337 | * the Unicode version in which the mapping was corrected. |
1338 | */ |
1339 | struct unicode_data { |
1340 | unsigned int code; |
1341 | int ccc; |
1342 | int gen; |
1343 | int correction; |
1344 | unsigned int *utf32nfdi; |
1345 | unsigned int *utf32nfdicf; |
1346 | char *utf8nfdi; |
1347 | char *utf8nfdicf; |
1348 | }; |
1349 | |
1350 | struct unicode_data unicode_data[0x110000]; |
1351 | struct unicode_data *corrections; |
1352 | int corrections_count; |
1353 | |
1354 | struct tree *nfdi_tree; |
1355 | struct tree *nfdicf_tree; |
1356 | |
1357 | struct tree *trees; |
1358 | int trees_count; |
1359 | |
1360 | /* |
1361 | * Check the corrections array to see if this entry was corrected at |
1362 | * some point. |
1363 | */ |
1364 | static struct unicode_data *corrections_lookup(struct unicode_data *u) |
1365 | { |
1366 | int i; |
1367 | |
1368 | for (i = 0; i != corrections_count; i++) |
1369 | if (u->code == corrections[i].code) |
1370 | return &corrections[i]; |
1371 | return u; |
1372 | } |
1373 | |
1374 | static int nfdi_equal(void *l, void *r) |
1375 | { |
1376 | struct unicode_data *left = l; |
1377 | struct unicode_data *right = r; |
1378 | |
1379 | if (left->gen != right->gen) |
1380 | return 0; |
1381 | if (left->ccc != right->ccc) |
1382 | return 0; |
1383 | if (left->utf8nfdi && right->utf8nfdi && |
1384 | strcmp(left->utf8nfdi, right->utf8nfdi) == 0) |
1385 | return 1; |
1386 | if (left->utf8nfdi || right->utf8nfdi) |
1387 | return 0; |
1388 | return 1; |
1389 | } |
1390 | |
1391 | static int nfdicf_equal(void *l, void *r) |
1392 | { |
1393 | struct unicode_data *left = l; |
1394 | struct unicode_data *right = r; |
1395 | |
1396 | if (left->gen != right->gen) |
1397 | return 0; |
1398 | if (left->ccc != right->ccc) |
1399 | return 0; |
1400 | if (left->utf8nfdicf && right->utf8nfdicf && |
1401 | strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0) |
1402 | return 1; |
1403 | if (left->utf8nfdicf && right->utf8nfdicf) |
1404 | return 0; |
1405 | if (left->utf8nfdicf || right->utf8nfdicf) |
1406 | return 0; |
1407 | if (left->utf8nfdi && right->utf8nfdi && |
1408 | strcmp(left->utf8nfdi, right->utf8nfdi) == 0) |
1409 | return 1; |
1410 | if (left->utf8nfdi || right->utf8nfdi) |
1411 | return 0; |
1412 | return 1; |
1413 | } |
1414 | |
1415 | static void nfdi_print(void *l, int indent) |
1416 | { |
1417 | struct unicode_data *leaf = l; |
1418 | |
1419 | printf("%*sleaf @ %p code %X ccc %d gen %d" , indent, "" , leaf, |
1420 | leaf->code, leaf->ccc, leaf->gen); |
1421 | |
1422 | if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) |
1423 | printf(" nfdi \"%s\"" , "HANGUL SYLLABLE" ); |
1424 | else if (leaf->utf8nfdi) |
1425 | printf(" nfdi \"%s\"" , (const char*)leaf->utf8nfdi); |
1426 | |
1427 | printf("\n" ); |
1428 | } |
1429 | |
1430 | static void nfdicf_print(void *l, int indent) |
1431 | { |
1432 | struct unicode_data *leaf = l; |
1433 | |
1434 | printf("%*sleaf @ %p code %X ccc %d gen %d" , indent, "" , leaf, |
1435 | leaf->code, leaf->ccc, leaf->gen); |
1436 | |
1437 | if (leaf->utf8nfdicf) |
1438 | printf(" nfdicf \"%s\"" , (const char*)leaf->utf8nfdicf); |
1439 | else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) |
1440 | printf(" nfdi \"%s\"" , "HANGUL SYLLABLE" ); |
1441 | else if (leaf->utf8nfdi) |
1442 | printf(" nfdi \"%s\"" , (const char*)leaf->utf8nfdi); |
1443 | printf("\n" ); |
1444 | } |
1445 | |
1446 | static int nfdi_mark(void *l) |
1447 | { |
1448 | return 1; |
1449 | } |
1450 | |
1451 | static int nfdicf_mark(void *l) |
1452 | { |
1453 | struct unicode_data *leaf = l; |
1454 | |
1455 | if (leaf->utf8nfdicf) |
1456 | return 1; |
1457 | return 0; |
1458 | } |
1459 | |
1460 | static int correction_mark(void *l) |
1461 | { |
1462 | struct unicode_data *leaf = l; |
1463 | |
1464 | return leaf->correction; |
1465 | } |
1466 | |
1467 | static int nfdi_size(void *l) |
1468 | { |
1469 | struct unicode_data *leaf = l; |
1470 | int size = 2; |
1471 | |
1472 | if (HANGUL_SYLLABLE(leaf->code)) |
1473 | size += 1; |
1474 | else if (leaf->utf8nfdi) |
1475 | size += strlen(leaf->utf8nfdi) + 1; |
1476 | return size; |
1477 | } |
1478 | |
1479 | static int nfdicf_size(void *l) |
1480 | { |
1481 | struct unicode_data *leaf = l; |
1482 | int size = 2; |
1483 | |
1484 | if (HANGUL_SYLLABLE(leaf->code)) |
1485 | size += 1; |
1486 | else if (leaf->utf8nfdicf) |
1487 | size += strlen(leaf->utf8nfdicf) + 1; |
1488 | else if (leaf->utf8nfdi) |
1489 | size += strlen(leaf->utf8nfdi) + 1; |
1490 | return size; |
1491 | } |
1492 | |
1493 | static int *nfdi_index(struct tree *tree, void *l) |
1494 | { |
1495 | struct unicode_data *leaf = l; |
1496 | |
1497 | return &tree->leafindex[leaf->code]; |
1498 | } |
1499 | |
1500 | static int *nfdicf_index(struct tree *tree, void *l) |
1501 | { |
1502 | struct unicode_data *leaf = l; |
1503 | |
1504 | return &tree->leafindex[leaf->code]; |
1505 | } |
1506 | |
1507 | static unsigned char *nfdi_emit(void *l, unsigned char *data) |
1508 | { |
1509 | struct unicode_data *leaf = l; |
1510 | unsigned char *s; |
1511 | |
1512 | *data++ = leaf->gen; |
1513 | |
1514 | if (HANGUL_SYLLABLE(leaf->code)) { |
1515 | *data++ = DECOMPOSE; |
1516 | *data++ = HANGUL; |
1517 | } else if (leaf->utf8nfdi) { |
1518 | *data++ = DECOMPOSE; |
1519 | s = (unsigned char*)leaf->utf8nfdi; |
1520 | while ((*data++ = *s++) != 0) |
1521 | ; |
1522 | } else { |
1523 | *data++ = leaf->ccc; |
1524 | } |
1525 | return data; |
1526 | } |
1527 | |
1528 | static unsigned char *nfdicf_emit(void *l, unsigned char *data) |
1529 | { |
1530 | struct unicode_data *leaf = l; |
1531 | unsigned char *s; |
1532 | |
1533 | *data++ = leaf->gen; |
1534 | |
1535 | if (HANGUL_SYLLABLE(leaf->code)) { |
1536 | *data++ = DECOMPOSE; |
1537 | *data++ = HANGUL; |
1538 | } else if (leaf->utf8nfdicf) { |
1539 | *data++ = DECOMPOSE; |
1540 | s = (unsigned char*)leaf->utf8nfdicf; |
1541 | while ((*data++ = *s++) != 0) |
1542 | ; |
1543 | } else if (leaf->utf8nfdi) { |
1544 | *data++ = DECOMPOSE; |
1545 | s = (unsigned char*)leaf->utf8nfdi; |
1546 | while ((*data++ = *s++) != 0) |
1547 | ; |
1548 | } else { |
1549 | *data++ = leaf->ccc; |
1550 | } |
1551 | return data; |
1552 | } |
1553 | |
1554 | static void utf8_create(struct unicode_data *data) |
1555 | { |
1556 | char utf[18*4+1]; |
1557 | char *u; |
1558 | unsigned int *um; |
1559 | int i; |
1560 | |
1561 | if (data->utf8nfdi) { |
1562 | assert(data->utf8nfdi[0] == HANGUL); |
1563 | return; |
1564 | } |
1565 | |
1566 | u = utf; |
1567 | um = data->utf32nfdi; |
1568 | if (um) { |
1569 | for (i = 0; um[i]; i++) |
1570 | u += utf8encode(str: u, val: um[i]); |
1571 | *u = '\0'; |
1572 | data->utf8nfdi = strdup(utf); |
1573 | } |
1574 | u = utf; |
1575 | um = data->utf32nfdicf; |
1576 | if (um) { |
1577 | for (i = 0; um[i]; i++) |
1578 | u += utf8encode(str: u, val: um[i]); |
1579 | *u = '\0'; |
1580 | if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf)) |
1581 | data->utf8nfdicf = strdup(utf); |
1582 | } |
1583 | } |
1584 | |
1585 | static void utf8_init(void) |
1586 | { |
1587 | unsigned int unichar; |
1588 | int i; |
1589 | |
1590 | for (unichar = 0; unichar != 0x110000; unichar++) |
1591 | utf8_create(data: &unicode_data[unichar]); |
1592 | |
1593 | for (i = 0; i != corrections_count; i++) |
1594 | utf8_create(data: &corrections[i]); |
1595 | } |
1596 | |
1597 | static void trees_init(void) |
1598 | { |
1599 | struct unicode_data *data; |
1600 | unsigned int maxage; |
1601 | unsigned int nextage; |
1602 | int count; |
1603 | int i; |
1604 | int j; |
1605 | |
1606 | /* Count the number of different ages. */ |
1607 | count = 0; |
1608 | nextage = (unsigned int)-1; |
1609 | do { |
1610 | maxage = nextage; |
1611 | nextage = 0; |
1612 | for (i = 0; i <= corrections_count; i++) { |
1613 | data = &corrections[i]; |
1614 | if (nextage < data->correction && |
1615 | data->correction < maxage) |
1616 | nextage = data->correction; |
1617 | } |
1618 | count++; |
1619 | } while (nextage); |
1620 | |
1621 | /* Two trees per age: nfdi and nfdicf */ |
1622 | trees_count = count * 2; |
1623 | trees = calloc(trees_count, sizeof(struct tree)); |
1624 | |
1625 | /* Assign ages to the trees. */ |
1626 | count = trees_count; |
1627 | nextage = (unsigned int)-1; |
1628 | do { |
1629 | maxage = nextage; |
1630 | trees[--count].maxage = maxage; |
1631 | trees[--count].maxage = maxage; |
1632 | nextage = 0; |
1633 | for (i = 0; i <= corrections_count; i++) { |
1634 | data = &corrections[i]; |
1635 | if (nextage < data->correction && |
1636 | data->correction < maxage) |
1637 | nextage = data->correction; |
1638 | } |
1639 | } while (nextage); |
1640 | |
1641 | /* The ages assigned above are off by one. */ |
1642 | for (i = 0; i != trees_count; i++) { |
1643 | j = 0; |
1644 | while (ages[j] < trees[i].maxage) |
1645 | j++; |
1646 | trees[i].maxage = ages[j-1]; |
1647 | } |
1648 | |
1649 | /* Set up the forwarding between trees. */ |
1650 | trees[trees_count-2].next = &trees[trees_count-1]; |
1651 | trees[trees_count-1].leaf_mark = nfdi_mark; |
1652 | trees[trees_count-2].leaf_mark = nfdicf_mark; |
1653 | for (i = 0; i != trees_count-2; i += 2) { |
1654 | trees[i].next = &trees[trees_count-2]; |
1655 | trees[i].leaf_mark = correction_mark; |
1656 | trees[i+1].next = &trees[trees_count-1]; |
1657 | trees[i+1].leaf_mark = correction_mark; |
1658 | } |
1659 | |
1660 | /* Assign the callouts. */ |
1661 | for (i = 0; i != trees_count; i += 2) { |
1662 | trees[i].type = "nfdicf" ; |
1663 | trees[i].leaf_equal = nfdicf_equal; |
1664 | trees[i].leaf_print = nfdicf_print; |
1665 | trees[i].leaf_size = nfdicf_size; |
1666 | trees[i].leaf_index = nfdicf_index; |
1667 | trees[i].leaf_emit = nfdicf_emit; |
1668 | |
1669 | trees[i+1].type = "nfdi" ; |
1670 | trees[i+1].leaf_equal = nfdi_equal; |
1671 | trees[i+1].leaf_print = nfdi_print; |
1672 | trees[i+1].leaf_size = nfdi_size; |
1673 | trees[i+1].leaf_index = nfdi_index; |
1674 | trees[i+1].leaf_emit = nfdi_emit; |
1675 | } |
1676 | |
1677 | /* Finish init. */ |
1678 | for (i = 0; i != trees_count; i++) |
1679 | trees[i].childnode = NODE; |
1680 | } |
1681 | |
1682 | static void trees_populate(void) |
1683 | { |
1684 | struct unicode_data *data; |
1685 | unsigned int unichar; |
1686 | char keyval[4]; |
1687 | int keylen; |
1688 | int i; |
1689 | |
1690 | for (i = 0; i != trees_count; i++) { |
1691 | if (verbose > 0) { |
1692 | printf("Populating %s_%x\n" , |
1693 | trees[i].type, trees[i].maxage); |
1694 | } |
1695 | for (unichar = 0; unichar != 0x110000; unichar++) { |
1696 | if (unicode_data[unichar].gen < 0) |
1697 | continue; |
1698 | keylen = utf8encode(str: keyval, val: unichar); |
1699 | data = corrections_lookup(u: &unicode_data[unichar]); |
1700 | if (data->correction <= trees[i].maxage) |
1701 | data = &unicode_data[unichar]; |
1702 | insert(tree: &trees[i], key: keyval, keylen, leaf: data); |
1703 | } |
1704 | } |
1705 | } |
1706 | |
1707 | static void trees_reduce(void) |
1708 | { |
1709 | int i; |
1710 | int size; |
1711 | int changed; |
1712 | |
1713 | for (i = 0; i != trees_count; i++) |
1714 | prune(tree: &trees[i]); |
1715 | for (i = 0; i != trees_count; i++) |
1716 | mark_nodes(tree: &trees[i]); |
1717 | do { |
1718 | size = 0; |
1719 | for (i = 0; i != trees_count; i++) |
1720 | size = index_nodes(tree: &trees[i], index: size); |
1721 | changed = 0; |
1722 | for (i = 0; i != trees_count; i++) |
1723 | changed += size_nodes(tree: &trees[i]); |
1724 | } while (changed); |
1725 | |
1726 | utf8data = calloc(size, 1); |
1727 | utf8data_size = size; |
1728 | for (i = 0; i != trees_count; i++) |
1729 | emit(tree: &trees[i], data: utf8data); |
1730 | |
1731 | if (verbose > 0) { |
1732 | for (i = 0; i != trees_count; i++) { |
1733 | printf("%s_%x idx %d\n" , |
1734 | trees[i].type, trees[i].maxage, trees[i].index); |
1735 | } |
1736 | } |
1737 | |
1738 | nfdi = utf8data + trees[trees_count-1].index; |
1739 | nfdicf = utf8data + trees[trees_count-2].index; |
1740 | |
1741 | nfdi_tree = &trees[trees_count-1]; |
1742 | nfdicf_tree = &trees[trees_count-2]; |
1743 | } |
1744 | |
1745 | static void verify(struct tree *tree) |
1746 | { |
1747 | struct unicode_data *data; |
1748 | utf8leaf_t *leaf; |
1749 | unsigned int unichar; |
1750 | char key[4]; |
1751 | unsigned char hangul[UTF8HANGULLEAF]; |
1752 | int report; |
1753 | int nocf; |
1754 | |
1755 | if (verbose > 0) |
1756 | printf("Verifying %s_%x\n" , tree->type, tree->maxage); |
1757 | nocf = strcmp(tree->type, "nfdicf" ); |
1758 | |
1759 | for (unichar = 0; unichar != 0x110000; unichar++) { |
1760 | report = 0; |
1761 | data = corrections_lookup(u: &unicode_data[unichar]); |
1762 | if (data->correction <= tree->maxage) |
1763 | data = &unicode_data[unichar]; |
1764 | utf8encode(str: key,val: unichar); |
1765 | leaf = utf8lookup(tree, hangul, key); |
1766 | |
1767 | if (!leaf) { |
1768 | if (data->gen != -1) |
1769 | report++; |
1770 | if (unichar < 0xd800 || unichar > 0xdfff) |
1771 | report++; |
1772 | } else { |
1773 | if (unichar >= 0xd800 && unichar <= 0xdfff) |
1774 | report++; |
1775 | if (data->gen == -1) |
1776 | report++; |
1777 | if (data->gen != LEAF_GEN(leaf)) |
1778 | report++; |
1779 | if (LEAF_CCC(leaf) == DECOMPOSE) { |
1780 | if (HANGUL_SYLLABLE(data->code)) { |
1781 | if (data->utf8nfdi[0] != HANGUL) |
1782 | report++; |
1783 | } else if (nocf) { |
1784 | if (!data->utf8nfdi) { |
1785 | report++; |
1786 | } else if (strcmp(data->utf8nfdi, |
1787 | LEAF_STR(leaf))) { |
1788 | report++; |
1789 | } |
1790 | } else { |
1791 | if (!data->utf8nfdicf && |
1792 | !data->utf8nfdi) { |
1793 | report++; |
1794 | } else if (data->utf8nfdicf) { |
1795 | if (strcmp(data->utf8nfdicf, |
1796 | LEAF_STR(leaf))) |
1797 | report++; |
1798 | } else if (strcmp(data->utf8nfdi, |
1799 | LEAF_STR(leaf))) { |
1800 | report++; |
1801 | } |
1802 | } |
1803 | } else if (data->ccc != LEAF_CCC(leaf)) { |
1804 | report++; |
1805 | } |
1806 | } |
1807 | if (report) { |
1808 | printf("%X code %X gen %d ccc %d" |
1809 | " nfdi -> \"%s\"" , |
1810 | unichar, data->code, data->gen, |
1811 | data->ccc, |
1812 | data->utf8nfdi); |
1813 | if (leaf) { |
1814 | printf(" gen %d ccc %d" |
1815 | " nfdi -> \"%s\"" , |
1816 | LEAF_GEN(leaf), |
1817 | LEAF_CCC(leaf), |
1818 | LEAF_CCC(leaf) == DECOMPOSE ? |
1819 | LEAF_STR(leaf) : "" ); |
1820 | } |
1821 | printf("\n" ); |
1822 | } |
1823 | } |
1824 | } |
1825 | |
1826 | static void trees_verify(void) |
1827 | { |
1828 | int i; |
1829 | |
1830 | for (i = 0; i != trees_count; i++) |
1831 | verify(tree: &trees[i]); |
1832 | } |
1833 | |
1834 | /* ------------------------------------------------------------------ */ |
1835 | |
1836 | static void help(void) |
1837 | { |
1838 | printf("Usage: %s [options]\n" , argv0); |
1839 | printf("\n" ); |
1840 | printf("This program creates an a data trie used for parsing and\n" ); |
1841 | printf("normalization of UTF-8 strings. The trie is derived from\n" ); |
1842 | printf("a set of input files from the Unicode character database\n" ); |
1843 | printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n" ); |
1844 | printf("\n" ); |
1845 | printf("The generated tree supports two normalization forms:\n" ); |
1846 | printf("\n" ); |
1847 | printf("\tnfdi:\n" ); |
1848 | printf("\t- Apply unicode normalization form NFD.\n" ); |
1849 | printf("\t- Remove any Default_Ignorable_Code_Point.\n" ); |
1850 | printf("\n" ); |
1851 | printf("\tnfdicf:\n" ); |
1852 | printf("\t- Apply unicode normalization form NFD.\n" ); |
1853 | printf("\t- Remove any Default_Ignorable_Code_Point.\n" ); |
1854 | printf("\t- Apply a full casefold (C + F).\n" ); |
1855 | printf("\n" ); |
1856 | printf("These forms were chosen as being most useful when dealing\n" ); |
1857 | printf("with file names: NFD catches most cases where characters\n" ); |
1858 | printf("should be considered equivalent. The ignorables are mostly\n" ); |
1859 | printf("invisible, making names hard to type.\n" ); |
1860 | printf("\n" ); |
1861 | printf("The options to specify the files to be used are listed\n" ); |
1862 | printf("below with their default values, which are the names used\n" ); |
1863 | printf("by version 11.0.0 of the Unicode Character Database.\n" ); |
1864 | printf("\n" ); |
1865 | printf("The input files:\n" ); |
1866 | printf("\t-a %s\n" , AGE_NAME); |
1867 | printf("\t-c %s\n" , CCC_NAME); |
1868 | printf("\t-p %s\n" , PROP_NAME); |
1869 | printf("\t-d %s\n" , DATA_NAME); |
1870 | printf("\t-f %s\n" , FOLD_NAME); |
1871 | printf("\t-n %s\n" , NORM_NAME); |
1872 | printf("\n" ); |
1873 | printf("Additionally, the generated tables are tested using:\n" ); |
1874 | printf("\t-t %s\n" , TEST_NAME); |
1875 | printf("\n" ); |
1876 | printf("Finally, the output file:\n" ); |
1877 | printf("\t-o %s\n" , UTF8_NAME); |
1878 | printf("\n" ); |
1879 | } |
1880 | |
1881 | static void usage(void) |
1882 | { |
1883 | help(); |
1884 | exit(1); |
1885 | } |
1886 | |
1887 | static void open_fail(const char *name, int error) |
1888 | { |
1889 | printf("Error %d opening %s: %s\n" , error, name, strerror(error)); |
1890 | exit(1); |
1891 | } |
1892 | |
1893 | static void file_fail(const char *filename) |
1894 | { |
1895 | printf("Error parsing %s\n" , filename); |
1896 | exit(1); |
1897 | } |
1898 | |
1899 | static void line_fail(const char *filename, const char *line) |
1900 | { |
1901 | printf("Error parsing %s:%s\n" , filename, line); |
1902 | exit(1); |
1903 | } |
1904 | |
1905 | /* ------------------------------------------------------------------ */ |
1906 | |
1907 | static void print_utf32(unsigned int *utf32str) |
1908 | { |
1909 | int i; |
1910 | |
1911 | for (i = 0; utf32str[i]; i++) |
1912 | printf(" %X" , utf32str[i]); |
1913 | } |
1914 | |
1915 | static void print_utf32nfdi(unsigned int unichar) |
1916 | { |
1917 | printf(" %X ->" , unichar); |
1918 | print_utf32(utf32str: unicode_data[unichar].utf32nfdi); |
1919 | printf("\n" ); |
1920 | } |
1921 | |
1922 | static void print_utf32nfdicf(unsigned int unichar) |
1923 | { |
1924 | printf(" %X ->" , unichar); |
1925 | print_utf32(utf32str: unicode_data[unichar].utf32nfdicf); |
1926 | printf("\n" ); |
1927 | } |
1928 | |
1929 | /* ------------------------------------------------------------------ */ |
1930 | |
1931 | static void age_init(void) |
1932 | { |
1933 | FILE *file; |
1934 | unsigned int first; |
1935 | unsigned int last; |
1936 | unsigned int unichar; |
1937 | unsigned int major; |
1938 | unsigned int minor; |
1939 | unsigned int revision; |
1940 | int gen; |
1941 | int count; |
1942 | int ret; |
1943 | |
1944 | if (verbose > 0) |
1945 | printf("Parsing %s\n" , age_name); |
1946 | |
1947 | file = fopen(age_name, "r" ); |
1948 | if (!file) |
1949 | open_fail(name: age_name, error: errno); |
1950 | count = 0; |
1951 | |
1952 | gen = 0; |
1953 | while (fgets(line, LINESIZE, file)) { |
1954 | ret = sscanf(line, "# Age=V%d_%d_%d" , |
1955 | &major, &minor, &revision); |
1956 | if (ret == 3) { |
1957 | ages_count++; |
1958 | if (verbose > 1) |
1959 | printf(" Age V%d_%d_%d\n" , |
1960 | major, minor, revision); |
1961 | if (!age_valid(major, minor, revision)) |
1962 | line_fail(filename: age_name, line); |
1963 | continue; |
1964 | } |
1965 | ret = sscanf(line, "# Age=V%d_%d" , &major, &minor); |
1966 | if (ret == 2) { |
1967 | ages_count++; |
1968 | if (verbose > 1) |
1969 | printf(" Age V%d_%d\n" , major, minor); |
1970 | if (!age_valid(major, minor, revision: 0)) |
1971 | line_fail(filename: age_name, line); |
1972 | continue; |
1973 | } |
1974 | } |
1975 | |
1976 | /* We must have found something above. */ |
1977 | if (verbose > 1) |
1978 | printf("%d age entries\n" , ages_count); |
1979 | if (ages_count == 0 || ages_count > MAXGEN) |
1980 | file_fail(filename: age_name); |
1981 | |
1982 | /* There is a 0 entry. */ |
1983 | ages_count++; |
1984 | ages = calloc(ages_count + 1, sizeof(*ages)); |
1985 | /* And a guard entry. */ |
1986 | ages[ages_count] = (unsigned int)-1; |
1987 | |
1988 | rewind(file); |
1989 | count = 0; |
1990 | gen = 0; |
1991 | while (fgets(line, LINESIZE, file)) { |
1992 | ret = sscanf(line, "# Age=V%d_%d_%d" , |
1993 | &major, &minor, &revision); |
1994 | if (ret == 3) { |
1995 | ages[++gen] = |
1996 | UNICODE_AGE(major, minor, revision); |
1997 | if (verbose > 1) |
1998 | printf(" Age V%d_%d_%d = gen %d\n" , |
1999 | major, minor, revision, gen); |
2000 | if (!age_valid(major, minor, revision)) |
2001 | line_fail(filename: age_name, line); |
2002 | continue; |
2003 | } |
2004 | ret = sscanf(line, "# Age=V%d_%d" , &major, &minor); |
2005 | if (ret == 2) { |
2006 | ages[++gen] = UNICODE_AGE(major, minor, 0); |
2007 | if (verbose > 1) |
2008 | printf(" Age V%d_%d = %d\n" , |
2009 | major, minor, gen); |
2010 | if (!age_valid(major, minor, revision: 0)) |
2011 | line_fail(filename: age_name, line); |
2012 | continue; |
2013 | } |
2014 | ret = sscanf(line, "%X..%X ; %d.%d #" , |
2015 | &first, &last, &major, &minor); |
2016 | if (ret == 4) { |
2017 | for (unichar = first; unichar <= last; unichar++) |
2018 | unicode_data[unichar].gen = gen; |
2019 | count += 1 + last - first; |
2020 | if (verbose > 1) |
2021 | printf(" %X..%X gen %d\n" , first, last, gen); |
2022 | if (!utf32valid(unichar: first) || !utf32valid(unichar: last)) |
2023 | line_fail(filename: age_name, line); |
2024 | continue; |
2025 | } |
2026 | ret = sscanf(line, "%X ; %d.%d #" , &unichar, &major, &minor); |
2027 | if (ret == 3) { |
2028 | unicode_data[unichar].gen = gen; |
2029 | count++; |
2030 | if (verbose > 1) |
2031 | printf(" %X gen %d\n" , unichar, gen); |
2032 | if (!utf32valid(unichar)) |
2033 | line_fail(filename: age_name, line); |
2034 | continue; |
2035 | } |
2036 | } |
2037 | unicode_maxage = ages[gen]; |
2038 | fclose(file); |
2039 | |
2040 | /* Nix surrogate block */ |
2041 | if (verbose > 1) |
2042 | printf(" Removing surrogate block D800..DFFF\n" ); |
2043 | for (unichar = 0xd800; unichar <= 0xdfff; unichar++) |
2044 | unicode_data[unichar].gen = -1; |
2045 | |
2046 | if (verbose > 0) |
2047 | printf("Found %d entries\n" , count); |
2048 | if (count == 0) |
2049 | file_fail(filename: age_name); |
2050 | } |
2051 | |
2052 | static void ccc_init(void) |
2053 | { |
2054 | FILE *file; |
2055 | unsigned int first; |
2056 | unsigned int last; |
2057 | unsigned int unichar; |
2058 | unsigned int value; |
2059 | int count; |
2060 | int ret; |
2061 | |
2062 | if (verbose > 0) |
2063 | printf("Parsing %s\n" , ccc_name); |
2064 | |
2065 | file = fopen(ccc_name, "r" ); |
2066 | if (!file) |
2067 | open_fail(name: ccc_name, error: errno); |
2068 | |
2069 | count = 0; |
2070 | while (fgets(line, LINESIZE, file)) { |
2071 | ret = sscanf(line, "%X..%X ; %d #" , &first, &last, &value); |
2072 | if (ret == 3) { |
2073 | for (unichar = first; unichar <= last; unichar++) { |
2074 | unicode_data[unichar].ccc = value; |
2075 | count++; |
2076 | } |
2077 | if (verbose > 1) |
2078 | printf(" %X..%X ccc %d\n" , first, last, value); |
2079 | if (!utf32valid(unichar: first) || !utf32valid(unichar: last)) |
2080 | line_fail(filename: ccc_name, line); |
2081 | continue; |
2082 | } |
2083 | ret = sscanf(line, "%X ; %d #" , &unichar, &value); |
2084 | if (ret == 2) { |
2085 | unicode_data[unichar].ccc = value; |
2086 | count++; |
2087 | if (verbose > 1) |
2088 | printf(" %X ccc %d\n" , unichar, value); |
2089 | if (!utf32valid(unichar)) |
2090 | line_fail(filename: ccc_name, line); |
2091 | continue; |
2092 | } |
2093 | } |
2094 | fclose(file); |
2095 | |
2096 | if (verbose > 0) |
2097 | printf("Found %d entries\n" , count); |
2098 | if (count == 0) |
2099 | file_fail(filename: ccc_name); |
2100 | } |
2101 | |
2102 | static int ignore_compatibility_form(char *type) |
2103 | { |
2104 | int i; |
2105 | char *ignored_types[] = {"font" , "noBreak" , "initial" , "medial" , |
2106 | "final" , "isolated" , "circle" , "super" , |
2107 | "sub" , "vertical" , "wide" , "narrow" , |
2108 | "small" , "square" , "fraction" , "compat" }; |
2109 | |
2110 | for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++) |
2111 | if (strcmp(type, ignored_types[i]) == 0) |
2112 | return 1; |
2113 | return 0; |
2114 | } |
2115 | |
2116 | static void nfdi_init(void) |
2117 | { |
2118 | FILE *file; |
2119 | unsigned int unichar; |
2120 | unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ |
2121 | char *s; |
2122 | char *type; |
2123 | unsigned int *um; |
2124 | int count; |
2125 | int i; |
2126 | int ret; |
2127 | |
2128 | if (verbose > 0) |
2129 | printf("Parsing %s\n" , data_name); |
2130 | file = fopen(data_name, "r" ); |
2131 | if (!file) |
2132 | open_fail(name: data_name, error: errno); |
2133 | |
2134 | count = 0; |
2135 | while (fgets(line, LINESIZE, file)) { |
2136 | ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];" , |
2137 | &unichar, buf0); |
2138 | if (ret != 2) |
2139 | continue; |
2140 | if (!utf32valid(unichar)) |
2141 | line_fail(filename: data_name, line); |
2142 | |
2143 | s = buf0; |
2144 | /* skip over <tag> */ |
2145 | if (*s == '<') { |
2146 | type = ++s; |
2147 | while (*++s != '>'); |
2148 | *s++ = '\0'; |
2149 | if(ignore_compatibility_form(type)) |
2150 | continue; |
2151 | } |
2152 | /* decode the decomposition into UTF-32 */ |
2153 | i = 0; |
2154 | while (*s) { |
2155 | mapping[i] = strtoul(s, &s, 16); |
2156 | if (!utf32valid(unichar: mapping[i])) |
2157 | line_fail(filename: data_name, line); |
2158 | i++; |
2159 | } |
2160 | mapping[i++] = 0; |
2161 | |
2162 | um = malloc(i * sizeof(unsigned int)); |
2163 | memcpy(um, mapping, i * sizeof(unsigned int)); |
2164 | unicode_data[unichar].utf32nfdi = um; |
2165 | |
2166 | if (verbose > 1) |
2167 | print_utf32nfdi(unichar); |
2168 | count++; |
2169 | } |
2170 | fclose(file); |
2171 | if (verbose > 0) |
2172 | printf("Found %d entries\n" , count); |
2173 | if (count == 0) |
2174 | file_fail(filename: data_name); |
2175 | } |
2176 | |
2177 | static void nfdicf_init(void) |
2178 | { |
2179 | FILE *file; |
2180 | unsigned int unichar; |
2181 | unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ |
2182 | char status; |
2183 | char *s; |
2184 | unsigned int *um; |
2185 | int i; |
2186 | int count; |
2187 | int ret; |
2188 | |
2189 | if (verbose > 0) |
2190 | printf("Parsing %s\n" , fold_name); |
2191 | file = fopen(fold_name, "r" ); |
2192 | if (!file) |
2193 | open_fail(name: fold_name, error: errno); |
2194 | |
2195 | count = 0; |
2196 | while (fgets(line, LINESIZE, file)) { |
2197 | ret = sscanf(line, "%X; %c; %[^;];" , &unichar, &status, buf0); |
2198 | if (ret != 3) |
2199 | continue; |
2200 | if (!utf32valid(unichar)) |
2201 | line_fail(filename: fold_name, line); |
2202 | /* Use the C+F casefold. */ |
2203 | if (status != 'C' && status != 'F') |
2204 | continue; |
2205 | s = buf0; |
2206 | if (*s == '<') |
2207 | while (*s++ != ' ') |
2208 | ; |
2209 | i = 0; |
2210 | while (*s) { |
2211 | mapping[i] = strtoul(s, &s, 16); |
2212 | if (!utf32valid(unichar: mapping[i])) |
2213 | line_fail(filename: fold_name, line); |
2214 | i++; |
2215 | } |
2216 | mapping[i++] = 0; |
2217 | |
2218 | um = malloc(i * sizeof(unsigned int)); |
2219 | memcpy(um, mapping, i * sizeof(unsigned int)); |
2220 | unicode_data[unichar].utf32nfdicf = um; |
2221 | |
2222 | if (verbose > 1) |
2223 | print_utf32nfdicf(unichar); |
2224 | count++; |
2225 | } |
2226 | fclose(file); |
2227 | if (verbose > 0) |
2228 | printf("Found %d entries\n" , count); |
2229 | if (count == 0) |
2230 | file_fail(filename: fold_name); |
2231 | } |
2232 | |
2233 | static void ignore_init(void) |
2234 | { |
2235 | FILE *file; |
2236 | unsigned int unichar; |
2237 | unsigned int first; |
2238 | unsigned int last; |
2239 | unsigned int *um; |
2240 | int count; |
2241 | int ret; |
2242 | |
2243 | if (verbose > 0) |
2244 | printf("Parsing %s\n" , prop_name); |
2245 | file = fopen(prop_name, "r" ); |
2246 | if (!file) |
2247 | open_fail(name: prop_name, error: errno); |
2248 | assert(file); |
2249 | count = 0; |
2250 | while (fgets(line, LINESIZE, file)) { |
2251 | ret = sscanf(line, "%X..%X ; %s # " , &first, &last, buf0); |
2252 | if (ret == 3) { |
2253 | if (strcmp(buf0, "Default_Ignorable_Code_Point" )) |
2254 | continue; |
2255 | if (!utf32valid(unichar: first) || !utf32valid(unichar: last)) |
2256 | line_fail(filename: prop_name, line); |
2257 | for (unichar = first; unichar <= last; unichar++) { |
2258 | free(unicode_data[unichar].utf32nfdi); |
2259 | um = malloc(sizeof(unsigned int)); |
2260 | *um = 0; |
2261 | unicode_data[unichar].utf32nfdi = um; |
2262 | free(unicode_data[unichar].utf32nfdicf); |
2263 | um = malloc(sizeof(unsigned int)); |
2264 | *um = 0; |
2265 | unicode_data[unichar].utf32nfdicf = um; |
2266 | count++; |
2267 | } |
2268 | if (verbose > 1) |
2269 | printf(" %X..%X Default_Ignorable_Code_Point\n" , |
2270 | first, last); |
2271 | continue; |
2272 | } |
2273 | ret = sscanf(line, "%X ; %s # " , &unichar, buf0); |
2274 | if (ret == 2) { |
2275 | if (strcmp(buf0, "Default_Ignorable_Code_Point" )) |
2276 | continue; |
2277 | if (!utf32valid(unichar)) |
2278 | line_fail(filename: prop_name, line); |
2279 | free(unicode_data[unichar].utf32nfdi); |
2280 | um = malloc(sizeof(unsigned int)); |
2281 | *um = 0; |
2282 | unicode_data[unichar].utf32nfdi = um; |
2283 | free(unicode_data[unichar].utf32nfdicf); |
2284 | um = malloc(sizeof(unsigned int)); |
2285 | *um = 0; |
2286 | unicode_data[unichar].utf32nfdicf = um; |
2287 | if (verbose > 1) |
2288 | printf(" %X Default_Ignorable_Code_Point\n" , |
2289 | unichar); |
2290 | count++; |
2291 | continue; |
2292 | } |
2293 | } |
2294 | fclose(file); |
2295 | |
2296 | if (verbose > 0) |
2297 | printf("Found %d entries\n" , count); |
2298 | if (count == 0) |
2299 | file_fail(filename: prop_name); |
2300 | } |
2301 | |
2302 | static void corrections_init(void) |
2303 | { |
2304 | FILE *file; |
2305 | unsigned int unichar; |
2306 | unsigned int major; |
2307 | unsigned int minor; |
2308 | unsigned int revision; |
2309 | unsigned int age; |
2310 | unsigned int *um; |
2311 | unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ |
2312 | char *s; |
2313 | int i; |
2314 | int count; |
2315 | int ret; |
2316 | |
2317 | if (verbose > 0) |
2318 | printf("Parsing %s\n" , norm_name); |
2319 | file = fopen(norm_name, "r" ); |
2320 | if (!file) |
2321 | open_fail(norm_name, errno); |
2322 | |
2323 | count = 0; |
2324 | while (fgets(line, LINESIZE, file)) { |
2325 | ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #" , |
2326 | &unichar, buf0, buf1, |
2327 | &major, &minor, &revision); |
2328 | if (ret != 6) |
2329 | continue; |
2330 | if (!utf32valid(unichar) || !age_valid(major, minor, revision)) |
2331 | line_fail(filename: norm_name, line); |
2332 | count++; |
2333 | } |
2334 | corrections = calloc(count, sizeof(struct unicode_data)); |
2335 | corrections_count = count; |
2336 | rewind(file); |
2337 | |
2338 | count = 0; |
2339 | while (fgets(line, LINESIZE, file)) { |
2340 | ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #" , |
2341 | &unichar, buf0, buf1, |
2342 | &major, &minor, &revision); |
2343 | if (ret != 6) |
2344 | continue; |
2345 | if (!utf32valid(unichar) || !age_valid(major, minor, revision)) |
2346 | line_fail(filename: norm_name, line); |
2347 | corrections[count] = unicode_data[unichar]; |
2348 | assert(corrections[count].code == unichar); |
2349 | age = UNICODE_AGE(major, minor, revision); |
2350 | corrections[count].correction = age; |
2351 | |
2352 | i = 0; |
2353 | s = buf0; |
2354 | while (*s) { |
2355 | mapping[i] = strtoul(s, &s, 16); |
2356 | if (!utf32valid(unichar: mapping[i])) |
2357 | line_fail(filename: norm_name, line); |
2358 | i++; |
2359 | } |
2360 | mapping[i++] = 0; |
2361 | |
2362 | um = malloc(i * sizeof(unsigned int)); |
2363 | memcpy(um, mapping, i * sizeof(unsigned int)); |
2364 | corrections[count].utf32nfdi = um; |
2365 | |
2366 | if (verbose > 1) |
2367 | printf(" %X -> %s -> %s V%d_%d_%d\n" , |
2368 | unichar, buf0, buf1, major, minor, revision); |
2369 | count++; |
2370 | } |
2371 | fclose(file); |
2372 | |
2373 | if (verbose > 0) |
2374 | printf("Found %d entries\n" , count); |
2375 | if (count == 0) |
2376 | file_fail(filename: norm_name); |
2377 | } |
2378 | |
2379 | /* ------------------------------------------------------------------ */ |
2380 | |
2381 | /* |
2382 | * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) |
2383 | * |
2384 | * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; |
2385 | * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; |
2386 | * |
2387 | * SBase = 0xAC00 |
2388 | * LBase = 0x1100 |
2389 | * VBase = 0x1161 |
2390 | * TBase = 0x11A7 |
2391 | * LCount = 19 |
2392 | * VCount = 21 |
2393 | * TCount = 28 |
2394 | * NCount = 588 (VCount * TCount) |
2395 | * SCount = 11172 (LCount * NCount) |
2396 | * |
2397 | * Decomposition: |
2398 | * SIndex = s - SBase |
2399 | * |
2400 | * LV (Canonical/Full) |
2401 | * LIndex = SIndex / NCount |
2402 | * VIndex = (Sindex % NCount) / TCount |
2403 | * LPart = LBase + LIndex |
2404 | * VPart = VBase + VIndex |
2405 | * |
2406 | * LVT (Canonical) |
2407 | * LVIndex = (SIndex / TCount) * TCount |
2408 | * TIndex = (Sindex % TCount) |
2409 | * LVPart = SBase + LVIndex |
2410 | * TPart = TBase + TIndex |
2411 | * |
2412 | * LVT (Full) |
2413 | * LIndex = SIndex / NCount |
2414 | * VIndex = (Sindex % NCount) / TCount |
2415 | * TIndex = (Sindex % TCount) |
2416 | * LPart = LBase + LIndex |
2417 | * VPart = VBase + VIndex |
2418 | * if (TIndex == 0) { |
2419 | * d = <LPart, VPart> |
2420 | * } else { |
2421 | * TPart = TBase + TIndex |
2422 | * d = <LPart, VPart, TPart> |
2423 | * } |
2424 | * |
2425 | */ |
2426 | |
2427 | static void hangul_decompose(void) |
2428 | { |
2429 | unsigned int sb = 0xAC00; |
2430 | unsigned int lb = 0x1100; |
2431 | unsigned int vb = 0x1161; |
2432 | unsigned int tb = 0x11a7; |
2433 | /* unsigned int lc = 19; */ |
2434 | unsigned int vc = 21; |
2435 | unsigned int tc = 28; |
2436 | unsigned int nc = (vc * tc); |
2437 | /* unsigned int sc = (lc * nc); */ |
2438 | unsigned int unichar; |
2439 | unsigned int mapping[4]; |
2440 | unsigned int *um; |
2441 | int count; |
2442 | int i; |
2443 | |
2444 | if (verbose > 0) |
2445 | printf("Decomposing hangul\n" ); |
2446 | /* Hangul */ |
2447 | count = 0; |
2448 | for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) { |
2449 | unsigned int si = unichar - sb; |
2450 | unsigned int li = si / nc; |
2451 | unsigned int vi = (si % nc) / tc; |
2452 | unsigned int ti = si % tc; |
2453 | |
2454 | i = 0; |
2455 | mapping[i++] = lb + li; |
2456 | mapping[i++] = vb + vi; |
2457 | if (ti) |
2458 | mapping[i++] = tb + ti; |
2459 | mapping[i++] = 0; |
2460 | |
2461 | assert(!unicode_data[unichar].utf32nfdi); |
2462 | um = malloc(i * sizeof(unsigned int)); |
2463 | memcpy(um, mapping, i * sizeof(unsigned int)); |
2464 | unicode_data[unichar].utf32nfdi = um; |
2465 | |
2466 | assert(!unicode_data[unichar].utf32nfdicf); |
2467 | um = malloc(i * sizeof(unsigned int)); |
2468 | memcpy(um, mapping, i * sizeof(unsigned int)); |
2469 | unicode_data[unichar].utf32nfdicf = um; |
2470 | |
2471 | /* |
2472 | * Add a cookie as a reminder that the hangul syllable |
2473 | * decompositions must not be stored in the generated |
2474 | * trie. |
2475 | */ |
2476 | unicode_data[unichar].utf8nfdi = malloc(2); |
2477 | unicode_data[unichar].utf8nfdi[0] = HANGUL; |
2478 | unicode_data[unichar].utf8nfdi[1] = '\0'; |
2479 | |
2480 | if (verbose > 1) |
2481 | print_utf32nfdi(unichar); |
2482 | |
2483 | count++; |
2484 | } |
2485 | if (verbose > 0) |
2486 | printf("Created %d entries\n" , count); |
2487 | } |
2488 | |
2489 | static void nfdi_decompose(void) |
2490 | { |
2491 | unsigned int unichar; |
2492 | unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ |
2493 | unsigned int *um; |
2494 | unsigned int *dc; |
2495 | int count; |
2496 | int i; |
2497 | int j; |
2498 | int ret; |
2499 | |
2500 | if (verbose > 0) |
2501 | printf("Decomposing nfdi\n" ); |
2502 | |
2503 | count = 0; |
2504 | for (unichar = 0; unichar != 0x110000; unichar++) { |
2505 | if (!unicode_data[unichar].utf32nfdi) |
2506 | continue; |
2507 | for (;;) { |
2508 | ret = 1; |
2509 | i = 0; |
2510 | um = unicode_data[unichar].utf32nfdi; |
2511 | while (*um) { |
2512 | dc = unicode_data[*um].utf32nfdi; |
2513 | if (dc) { |
2514 | for (j = 0; dc[j]; j++) |
2515 | mapping[i++] = dc[j]; |
2516 | ret = 0; |
2517 | } else { |
2518 | mapping[i++] = *um; |
2519 | } |
2520 | um++; |
2521 | } |
2522 | mapping[i++] = 0; |
2523 | if (ret) |
2524 | break; |
2525 | free(unicode_data[unichar].utf32nfdi); |
2526 | um = malloc(i * sizeof(unsigned int)); |
2527 | memcpy(um, mapping, i * sizeof(unsigned int)); |
2528 | unicode_data[unichar].utf32nfdi = um; |
2529 | } |
2530 | /* Add this decomposition to nfdicf if there is no entry. */ |
2531 | if (!unicode_data[unichar].utf32nfdicf) { |
2532 | um = malloc(i * sizeof(unsigned int)); |
2533 | memcpy(um, mapping, i * sizeof(unsigned int)); |
2534 | unicode_data[unichar].utf32nfdicf = um; |
2535 | } |
2536 | if (verbose > 1) |
2537 | print_utf32nfdi(unichar); |
2538 | count++; |
2539 | } |
2540 | if (verbose > 0) |
2541 | printf("Processed %d entries\n" , count); |
2542 | } |
2543 | |
2544 | static void nfdicf_decompose(void) |
2545 | { |
2546 | unsigned int unichar; |
2547 | unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ |
2548 | unsigned int *um; |
2549 | unsigned int *dc; |
2550 | int count; |
2551 | int i; |
2552 | int j; |
2553 | int ret; |
2554 | |
2555 | if (verbose > 0) |
2556 | printf("Decomposing nfdicf\n" ); |
2557 | count = 0; |
2558 | for (unichar = 0; unichar != 0x110000; unichar++) { |
2559 | if (!unicode_data[unichar].utf32nfdicf) |
2560 | continue; |
2561 | for (;;) { |
2562 | ret = 1; |
2563 | i = 0; |
2564 | um = unicode_data[unichar].utf32nfdicf; |
2565 | while (*um) { |
2566 | dc = unicode_data[*um].utf32nfdicf; |
2567 | if (dc) { |
2568 | for (j = 0; dc[j]; j++) |
2569 | mapping[i++] = dc[j]; |
2570 | ret = 0; |
2571 | } else { |
2572 | mapping[i++] = *um; |
2573 | } |
2574 | um++; |
2575 | } |
2576 | mapping[i++] = 0; |
2577 | if (ret) |
2578 | break; |
2579 | free(unicode_data[unichar].utf32nfdicf); |
2580 | um = malloc(i * sizeof(unsigned int)); |
2581 | memcpy(um, mapping, i * sizeof(unsigned int)); |
2582 | unicode_data[unichar].utf32nfdicf = um; |
2583 | } |
2584 | if (verbose > 1) |
2585 | print_utf32nfdicf(unichar); |
2586 | count++; |
2587 | } |
2588 | if (verbose > 0) |
2589 | printf("Processed %d entries\n" , count); |
2590 | } |
2591 | |
2592 | /* ------------------------------------------------------------------ */ |
2593 | |
2594 | int utf8agemax(struct tree *, const char *); |
2595 | int utf8nagemax(struct tree *, const char *, size_t); |
2596 | int utf8agemin(struct tree *, const char *); |
2597 | int utf8nagemin(struct tree *, const char *, size_t); |
2598 | ssize_t utf8len(struct tree *, const char *); |
2599 | ssize_t utf8nlen(struct tree *, const char *, size_t); |
2600 | struct utf8cursor; |
2601 | int utf8cursor(struct utf8cursor *, struct tree *, const char *); |
2602 | int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t); |
2603 | int utf8byte(struct utf8cursor *); |
2604 | |
2605 | /* |
2606 | * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) |
2607 | * |
2608 | * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; |
2609 | * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; |
2610 | * |
2611 | * SBase = 0xAC00 |
2612 | * LBase = 0x1100 |
2613 | * VBase = 0x1161 |
2614 | * TBase = 0x11A7 |
2615 | * LCount = 19 |
2616 | * VCount = 21 |
2617 | * TCount = 28 |
2618 | * NCount = 588 (VCount * TCount) |
2619 | * SCount = 11172 (LCount * NCount) |
2620 | * |
2621 | * Decomposition: |
2622 | * SIndex = s - SBase |
2623 | * |
2624 | * LV (Canonical/Full) |
2625 | * LIndex = SIndex / NCount |
2626 | * VIndex = (Sindex % NCount) / TCount |
2627 | * LPart = LBase + LIndex |
2628 | * VPart = VBase + VIndex |
2629 | * |
2630 | * LVT (Canonical) |
2631 | * LVIndex = (SIndex / TCount) * TCount |
2632 | * TIndex = (Sindex % TCount) |
2633 | * LVPart = SBase + LVIndex |
2634 | * TPart = TBase + TIndex |
2635 | * |
2636 | * LVT (Full) |
2637 | * LIndex = SIndex / NCount |
2638 | * VIndex = (Sindex % NCount) / TCount |
2639 | * TIndex = (Sindex % TCount) |
2640 | * LPart = LBase + LIndex |
2641 | * VPart = VBase + VIndex |
2642 | * if (TIndex == 0) { |
2643 | * d = <LPart, VPart> |
2644 | * } else { |
2645 | * TPart = TBase + TIndex |
2646 | * d = <LPart, VPart, TPart> |
2647 | * } |
2648 | */ |
2649 | |
2650 | /* Constants */ |
2651 | #define SB (0xAC00) |
2652 | #define LB (0x1100) |
2653 | #define VB (0x1161) |
2654 | #define TB (0x11A7) |
2655 | #define LC (19) |
2656 | #define VC (21) |
2657 | #define TC (28) |
2658 | #define NC (VC * TC) |
2659 | #define SC (LC * NC) |
2660 | |
2661 | /* Algorithmic decomposition of hangul syllable. */ |
2662 | static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul) |
2663 | { |
2664 | unsigned int si; |
2665 | unsigned int li; |
2666 | unsigned int vi; |
2667 | unsigned int ti; |
2668 | unsigned char *h; |
2669 | |
2670 | /* Calculate the SI, LI, VI, and TI values. */ |
2671 | si = utf8decode(str) - SB; |
2672 | li = si / NC; |
2673 | vi = (si % NC) / TC; |
2674 | ti = si % TC; |
2675 | |
2676 | /* Fill in base of leaf. */ |
2677 | h = hangul; |
2678 | LEAF_GEN(h) = 2; |
2679 | LEAF_CCC(h) = DECOMPOSE; |
2680 | h += 2; |
2681 | |
2682 | /* Add LPart, a 3-byte UTF-8 sequence. */ |
2683 | h += utf8encode(str: (char *)h, val: li + LB); |
2684 | |
2685 | /* Add VPart, a 3-byte UTF-8 sequence. */ |
2686 | h += utf8encode(str: (char *)h, val: vi + VB); |
2687 | |
2688 | /* Add TPart if required, also a 3-byte UTF-8 sequence. */ |
2689 | if (ti) |
2690 | h += utf8encode(str: (char *)h, val: ti + TB); |
2691 | |
2692 | /* Terminate string. */ |
2693 | h[0] = '\0'; |
2694 | |
2695 | return hangul; |
2696 | } |
2697 | |
2698 | /* |
2699 | * Use trie to scan s, touching at most len bytes. |
2700 | * Returns the leaf if one exists, NULL otherwise. |
2701 | * |
2702 | * A non-NULL return guarantees that the UTF-8 sequence starting at s |
2703 | * is well-formed and corresponds to a known unicode code point. The |
2704 | * shorthand for this will be "is valid UTF-8 unicode". |
2705 | */ |
2706 | static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul, |
2707 | const char *s, size_t len) |
2708 | { |
2709 | utf8trie_t *trie; |
2710 | int offlen; |
2711 | int offset; |
2712 | int mask; |
2713 | int node; |
2714 | |
2715 | if (!tree) |
2716 | return NULL; |
2717 | if (len == 0) |
2718 | return NULL; |
2719 | node = 1; |
2720 | trie = utf8data + tree->index; |
2721 | while (node) { |
2722 | offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; |
2723 | if (*trie & NEXTBYTE) { |
2724 | if (--len == 0) |
2725 | return NULL; |
2726 | s++; |
2727 | } |
2728 | mask = 1 << (*trie & BITNUM); |
2729 | if (*s & mask) { |
2730 | /* Right leg */ |
2731 | if (offlen) { |
2732 | /* Right node at offset of trie */ |
2733 | node = (*trie & RIGHTNODE); |
2734 | offset = trie[offlen]; |
2735 | while (--offlen) { |
2736 | offset <<= 8; |
2737 | offset |= trie[offlen]; |
2738 | } |
2739 | trie += offset; |
2740 | } else if (*trie & RIGHTPATH) { |
2741 | /* Right node after this node */ |
2742 | node = (*trie & TRIENODE); |
2743 | trie++; |
2744 | } else { |
2745 | /* No right node. */ |
2746 | return NULL; |
2747 | } |
2748 | } else { |
2749 | /* Left leg */ |
2750 | if (offlen) { |
2751 | /* Left node after this node. */ |
2752 | node = (*trie & LEFTNODE); |
2753 | trie += offlen + 1; |
2754 | } else if (*trie & RIGHTPATH) { |
2755 | /* No left node. */ |
2756 | return NULL; |
2757 | } else { |
2758 | /* Left node after this node */ |
2759 | node = (*trie & TRIENODE); |
2760 | trie++; |
2761 | } |
2762 | } |
2763 | } |
2764 | /* |
2765 | * Hangul decomposition is done algorithmically. These are the |
2766 | * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is |
2767 | * always 3 bytes long, so s has been advanced twice, and the |
2768 | * start of the sequence is at s-2. |
2769 | */ |
2770 | if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) |
2771 | trie = utf8hangul(str: s - 2, hangul); |
2772 | return trie; |
2773 | } |
2774 | |
2775 | /* |
2776 | * Use trie to scan s. |
2777 | * Returns the leaf if one exists, NULL otherwise. |
2778 | * |
2779 | * Forwards to trie_nlookup(). |
2780 | */ |
2781 | static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul, |
2782 | const char *s) |
2783 | { |
2784 | return utf8nlookup(tree, hangul, s, (size_t)-1); |
2785 | } |
2786 | |
2787 | /* |
2788 | * Return the number of bytes used by the current UTF-8 sequence. |
2789 | * Assumes the input points to the first byte of a valid UTF-8 |
2790 | * sequence. |
2791 | */ |
2792 | static inline int utf8clen(const char *s) |
2793 | { |
2794 | unsigned char c = *s; |
2795 | return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); |
2796 | } |
2797 | |
2798 | /* |
2799 | * Maximum age of any character in s. |
2800 | * Return -1 if s is not valid UTF-8 unicode. |
2801 | * Return 0 if only non-assigned code points are used. |
2802 | */ |
2803 | int utf8agemax(struct tree *tree, const char *s) |
2804 | { |
2805 | utf8leaf_t *leaf; |
2806 | int age = 0; |
2807 | int leaf_age; |
2808 | unsigned char hangul[UTF8HANGULLEAF]; |
2809 | |
2810 | if (!tree) |
2811 | return -1; |
2812 | |
2813 | while (*s) { |
2814 | leaf = utf8lookup(tree, hangul, s); |
2815 | if (!leaf) |
2816 | return -1; |
2817 | leaf_age = ages[LEAF_GEN(leaf)]; |
2818 | if (leaf_age <= tree->maxage && leaf_age > age) |
2819 | age = leaf_age; |
2820 | s += utf8clen(s); |
2821 | } |
2822 | return age; |
2823 | } |
2824 | |
2825 | /* |
2826 | * Minimum age of any character in s. |
2827 | * Return -1 if s is not valid UTF-8 unicode. |
2828 | * Return 0 if non-assigned code points are used. |
2829 | */ |
2830 | int utf8agemin(struct tree *tree, const char *s) |
2831 | { |
2832 | utf8leaf_t *leaf; |
2833 | int age; |
2834 | int leaf_age; |
2835 | unsigned char hangul[UTF8HANGULLEAF]; |
2836 | |
2837 | if (!tree) |
2838 | return -1; |
2839 | age = tree->maxage; |
2840 | while (*s) { |
2841 | leaf = utf8lookup(tree, hangul, s); |
2842 | if (!leaf) |
2843 | return -1; |
2844 | leaf_age = ages[LEAF_GEN(leaf)]; |
2845 | if (leaf_age <= tree->maxage && leaf_age < age) |
2846 | age = leaf_age; |
2847 | s += utf8clen(s); |
2848 | } |
2849 | return age; |
2850 | } |
2851 | |
2852 | /* |
2853 | * Maximum age of any character in s, touch at most len bytes. |
2854 | * Return -1 if s is not valid UTF-8 unicode. |
2855 | */ |
2856 | int utf8nagemax(struct tree *tree, const char *s, size_t len) |
2857 | { |
2858 | utf8leaf_t *leaf; |
2859 | int age = 0; |
2860 | int leaf_age; |
2861 | unsigned char hangul[UTF8HANGULLEAF]; |
2862 | |
2863 | if (!tree) |
2864 | return -1; |
2865 | |
2866 | while (len && *s) { |
2867 | leaf = utf8nlookup(tree, hangul, s, size_t: len); |
2868 | if (!leaf) |
2869 | return -1; |
2870 | leaf_age = ages[LEAF_GEN(leaf)]; |
2871 | if (leaf_age <= tree->maxage && leaf_age > age) |
2872 | age = leaf_age; |
2873 | len -= utf8clen(s); |
2874 | s += utf8clen(s); |
2875 | } |
2876 | return age; |
2877 | } |
2878 | |
2879 | /* |
2880 | * Maximum age of any character in s, touch at most len bytes. |
2881 | * Return -1 if s is not valid UTF-8 unicode. |
2882 | */ |
2883 | int utf8nagemin(struct tree *tree, const char *s, size_t len) |
2884 | { |
2885 | utf8leaf_t *leaf; |
2886 | int leaf_age; |
2887 | int age; |
2888 | unsigned char hangul[UTF8HANGULLEAF]; |
2889 | |
2890 | if (!tree) |
2891 | return -1; |
2892 | age = tree->maxage; |
2893 | while (len && *s) { |
2894 | leaf = utf8nlookup(tree, hangul, s, size_t: len); |
2895 | if (!leaf) |
2896 | return -1; |
2897 | leaf_age = ages[LEAF_GEN(leaf)]; |
2898 | if (leaf_age <= tree->maxage && leaf_age < age) |
2899 | age = leaf_age; |
2900 | len -= utf8clen(s); |
2901 | s += utf8clen(s); |
2902 | } |
2903 | return age; |
2904 | } |
2905 | |
2906 | /* |
2907 | * Length of the normalization of s. |
2908 | * Return -1 if s is not valid UTF-8 unicode. |
2909 | * |
2910 | * A string of Default_Ignorable_Code_Point has length 0. |
2911 | */ |
2912 | ssize_t utf8len(struct tree *tree, const char *s) |
2913 | { |
2914 | utf8leaf_t *leaf; |
2915 | size_t ret = 0; |
2916 | unsigned char hangul[UTF8HANGULLEAF]; |
2917 | |
2918 | if (!tree) |
2919 | return -1; |
2920 | while (*s) { |
2921 | leaf = utf8lookup(tree, hangul, s); |
2922 | if (!leaf) |
2923 | return -1; |
2924 | if (ages[LEAF_GEN(leaf)] > tree->maxage) |
2925 | ret += utf8clen(s); |
2926 | else if (LEAF_CCC(leaf) == DECOMPOSE) |
2927 | ret += strlen(LEAF_STR(leaf)); |
2928 | else |
2929 | ret += utf8clen(s); |
2930 | s += utf8clen(s); |
2931 | } |
2932 | return ret; |
2933 | } |
2934 | |
2935 | /* |
2936 | * Length of the normalization of s, touch at most len bytes. |
2937 | * Return -1 if s is not valid UTF-8 unicode. |
2938 | */ |
2939 | ssize_t utf8nlen(struct tree *tree, const char *s, size_t len) |
2940 | { |
2941 | utf8leaf_t *leaf; |
2942 | size_t ret = 0; |
2943 | unsigned char hangul[UTF8HANGULLEAF]; |
2944 | |
2945 | if (!tree) |
2946 | return -1; |
2947 | while (len && *s) { |
2948 | leaf = utf8nlookup(tree, hangul, s, size_t: len); |
2949 | if (!leaf) |
2950 | return -1; |
2951 | if (ages[LEAF_GEN(leaf)] > tree->maxage) |
2952 | ret += utf8clen(s); |
2953 | else if (LEAF_CCC(leaf) == DECOMPOSE) |
2954 | ret += strlen(LEAF_STR(leaf)); |
2955 | else |
2956 | ret += utf8clen(s); |
2957 | len -= utf8clen(s); |
2958 | s += utf8clen(s); |
2959 | } |
2960 | return ret; |
2961 | } |
2962 | |
2963 | /* |
2964 | * Cursor structure used by the normalizer. |
2965 | */ |
2966 | struct utf8cursor { |
2967 | struct tree *tree; |
2968 | const char *s; |
2969 | const char *p; |
2970 | const char *ss; |
2971 | const char *sp; |
2972 | unsigned int len; |
2973 | unsigned int slen; |
2974 | short int ccc; |
2975 | short int nccc; |
2976 | unsigned int unichar; |
2977 | unsigned char hangul[UTF8HANGULLEAF]; |
2978 | }; |
2979 | |
2980 | /* |
2981 | * Set up an utf8cursor for use by utf8byte(). |
2982 | * |
2983 | * s : string. |
2984 | * len : length of s. |
2985 | * u8c : pointer to cursor. |
2986 | * trie : utf8trie_t to use for normalization. |
2987 | * |
2988 | * Returns -1 on error, 0 on success. |
2989 | */ |
2990 | int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s, |
2991 | size_t len) |
2992 | { |
2993 | if (!tree) |
2994 | return -1; |
2995 | if (!s) |
2996 | return -1; |
2997 | u8c->tree = tree; |
2998 | u8c->s = s; |
2999 | u8c->p = NULL; |
3000 | u8c->ss = NULL; |
3001 | u8c->sp = NULL; |
3002 | u8c->len = len; |
3003 | u8c->slen = 0; |
3004 | u8c->ccc = STOPPER; |
3005 | u8c->nccc = STOPPER; |
3006 | u8c->unichar = 0; |
3007 | /* Check we didn't clobber the maximum length. */ |
3008 | if (u8c->len != len) |
3009 | return -1; |
3010 | /* The first byte of s may not be an utf8 continuation. */ |
3011 | if (len > 0 && (*s & 0xC0) == 0x80) |
3012 | return -1; |
3013 | return 0; |
3014 | } |
3015 | |
3016 | /* |
3017 | * Set up an utf8cursor for use by utf8byte(). |
3018 | * |
3019 | * s : NUL-terminated string. |
3020 | * u8c : pointer to cursor. |
3021 | * trie : utf8trie_t to use for normalization. |
3022 | * |
3023 | * Returns -1 on error, 0 on success. |
3024 | */ |
3025 | int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s) |
3026 | { |
3027 | return utf8ncursor(u8c, tree, s, size_t: (unsigned int)-1); |
3028 | } |
3029 | |
3030 | /* |
3031 | * Get one byte from the normalized form of the string described by u8c. |
3032 | * |
3033 | * Returns the byte cast to an unsigned char on succes, and -1 on failure. |
3034 | * |
3035 | * The cursor keeps track of the location in the string in u8c->s. |
3036 | * When a character is decomposed, the current location is stored in |
3037 | * u8c->p, and u8c->s is set to the start of the decomposition. Note |
3038 | * that bytes from a decomposition do not count against u8c->len. |
3039 | * |
3040 | * Characters are emitted if they match the current CCC in u8c->ccc. |
3041 | * Hitting end-of-string while u8c->ccc == STOPPER means we're done, |
3042 | * and the function returns 0 in that case. |
3043 | * |
3044 | * Sorting by CCC is done by repeatedly scanning the string. The |
3045 | * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at |
3046 | * the start of the scan. The first pass finds the lowest CCC to be |
3047 | * emitted and stores it in u8c->nccc, the second pass emits the |
3048 | * characters with this CCC and finds the next lowest CCC. This limits |
3049 | * the number of passes to 1 + the number of different CCCs in the |
3050 | * sequence being scanned. |
3051 | * |
3052 | * Therefore: |
3053 | * u8c->p != NULL -> a decomposition is being scanned. |
3054 | * u8c->ss != NULL -> this is a repeating scan. |
3055 | * u8c->ccc == -1 -> this is the first scan of a repeating scan. |
3056 | */ |
3057 | int utf8byte(struct utf8cursor *u8c) |
3058 | { |
3059 | utf8leaf_t *leaf; |
3060 | int ccc; |
3061 | |
3062 | for (;;) { |
3063 | /* Check for the end of a decomposed character. */ |
3064 | if (u8c->p && *u8c->s == '\0') { |
3065 | u8c->s = u8c->p; |
3066 | u8c->p = NULL; |
3067 | } |
3068 | |
3069 | /* Check for end-of-string. */ |
3070 | if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { |
3071 | /* There is no next byte. */ |
3072 | if (u8c->ccc == STOPPER) |
3073 | return 0; |
3074 | /* End-of-string during a scan counts as a stopper. */ |
3075 | ccc = STOPPER; |
3076 | goto ccc_mismatch; |
3077 | } else if ((*u8c->s & 0xC0) == 0x80) { |
3078 | /* This is a continuation of the current character. */ |
3079 | if (!u8c->p) |
3080 | u8c->len--; |
3081 | return (unsigned char)*u8c->s++; |
3082 | } |
3083 | |
3084 | /* Look up the data for the current character. */ |
3085 | if (u8c->p) { |
3086 | leaf = utf8lookup(tree: u8c->tree, hangul: u8c->hangul, s: u8c->s); |
3087 | } else { |
3088 | leaf = utf8nlookup(u8c->tree, u8c->hangul, |
3089 | u8c->s, size_t: u8c->len); |
3090 | } |
3091 | |
3092 | /* No leaf found implies that the input is a binary blob. */ |
3093 | if (!leaf) |
3094 | return -1; |
3095 | |
3096 | /* Characters that are too new have CCC 0. */ |
3097 | if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) { |
3098 | ccc = STOPPER; |
3099 | } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) { |
3100 | u8c->len -= utf8clen(s: u8c->s); |
3101 | u8c->p = u8c->s + utf8clen(s: u8c->s); |
3102 | u8c->s = LEAF_STR(leaf); |
3103 | /* Empty decomposition implies CCC 0. */ |
3104 | if (*u8c->s == '\0') { |
3105 | if (u8c->ccc == STOPPER) |
3106 | continue; |
3107 | ccc = STOPPER; |
3108 | goto ccc_mismatch; |
3109 | } |
3110 | leaf = utf8lookup(tree: u8c->tree, hangul: u8c->hangul, s: u8c->s); |
3111 | ccc = LEAF_CCC(leaf); |
3112 | } |
3113 | u8c->unichar = utf8decode(str: u8c->s); |
3114 | |
3115 | /* |
3116 | * If this is not a stopper, then see if it updates |
3117 | * the next canonical class to be emitted. |
3118 | */ |
3119 | if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) |
3120 | u8c->nccc = ccc; |
3121 | |
3122 | /* |
3123 | * Return the current byte if this is the current |
3124 | * combining class. |
3125 | */ |
3126 | if (ccc == u8c->ccc) { |
3127 | if (!u8c->p) |
3128 | u8c->len--; |
3129 | return (unsigned char)*u8c->s++; |
3130 | } |
3131 | |
3132 | /* Current combining class mismatch. */ |
3133 | ccc_mismatch: |
3134 | if (u8c->nccc == STOPPER) { |
3135 | /* |
3136 | * Scan forward for the first canonical class |
3137 | * to be emitted. Save the position from |
3138 | * which to restart. |
3139 | */ |
3140 | assert(u8c->ccc == STOPPER); |
3141 | u8c->ccc = MINCCC - 1; |
3142 | u8c->nccc = ccc; |
3143 | u8c->sp = u8c->p; |
3144 | u8c->ss = u8c->s; |
3145 | u8c->slen = u8c->len; |
3146 | if (!u8c->p) |
3147 | u8c->len -= utf8clen(s: u8c->s); |
3148 | u8c->s += utf8clen(s: u8c->s); |
3149 | } else if (ccc != STOPPER) { |
3150 | /* Not a stopper, and not the ccc we're emitting. */ |
3151 | if (!u8c->p) |
3152 | u8c->len -= utf8clen(s: u8c->s); |
3153 | u8c->s += utf8clen(s: u8c->s); |
3154 | } else if (u8c->nccc != MAXCCC + 1) { |
3155 | /* At a stopper, restart for next ccc. */ |
3156 | u8c->ccc = u8c->nccc; |
3157 | u8c->nccc = MAXCCC + 1; |
3158 | u8c->s = u8c->ss; |
3159 | u8c->p = u8c->sp; |
3160 | u8c->len = u8c->slen; |
3161 | } else { |
3162 | /* All done, proceed from here. */ |
3163 | u8c->ccc = STOPPER; |
3164 | u8c->nccc = STOPPER; |
3165 | u8c->sp = NULL; |
3166 | u8c->ss = NULL; |
3167 | u8c->slen = 0; |
3168 | } |
3169 | } |
3170 | } |
3171 | |
3172 | /* ------------------------------------------------------------------ */ |
3173 | |
3174 | static int normalize_line(struct tree *tree) |
3175 | { |
3176 | char *s; |
3177 | char *t; |
3178 | int c; |
3179 | struct utf8cursor u8c; |
3180 | |
3181 | /* First test: null-terminated string. */ |
3182 | s = buf2; |
3183 | t = buf3; |
3184 | if (utf8cursor(u8c: &u8c, tree, s)) |
3185 | return -1; |
3186 | while ((c = utf8byte(u8c: &u8c)) > 0) |
3187 | if (c != (unsigned char)*t++) |
3188 | return -1; |
3189 | if (c < 0) |
3190 | return -1; |
3191 | if (*t != 0) |
3192 | return -1; |
3193 | |
3194 | /* Second test: length-limited string. */ |
3195 | s = buf2; |
3196 | /* Replace NUL with a value that will cause an error if seen. */ |
3197 | s[strlen(s) + 1] = -1; |
3198 | t = buf3; |
3199 | if (utf8cursor(u8c: &u8c, tree, s)) |
3200 | return -1; |
3201 | while ((c = utf8byte(u8c: &u8c)) > 0) |
3202 | if (c != (unsigned char)*t++) |
3203 | return -1; |
3204 | if (c < 0) |
3205 | return -1; |
3206 | if (*t != 0) |
3207 | return -1; |
3208 | |
3209 | return 0; |
3210 | } |
3211 | |
3212 | static void normalization_test(void) |
3213 | { |
3214 | FILE *file; |
3215 | unsigned int unichar; |
3216 | struct unicode_data *data; |
3217 | char *s; |
3218 | char *t; |
3219 | int ret; |
3220 | int ignorables; |
3221 | int tests = 0; |
3222 | int failures = 0; |
3223 | |
3224 | if (verbose > 0) |
3225 | printf("Parsing %s\n" , test_name); |
3226 | /* Step one, read data from file. */ |
3227 | file = fopen(test_name, "r" ); |
3228 | if (!file) |
3229 | open_fail(test_name, errno); |
3230 | |
3231 | while (fgets(line, LINESIZE, file)) { |
3232 | ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];" , |
3233 | buf0, buf1); |
3234 | if (ret != 2 || *line == '#') |
3235 | continue; |
3236 | s = buf0; |
3237 | t = buf2; |
3238 | while (*s) { |
3239 | unichar = strtoul(s, &s, 16); |
3240 | t += utf8encode(str: t, val: unichar); |
3241 | } |
3242 | *t = '\0'; |
3243 | |
3244 | ignorables = 0; |
3245 | s = buf1; |
3246 | t = buf3; |
3247 | while (*s) { |
3248 | unichar = strtoul(s, &s, 16); |
3249 | data = &unicode_data[unichar]; |
3250 | if (data->utf8nfdi && !*data->utf8nfdi) |
3251 | ignorables = 1; |
3252 | else |
3253 | t += utf8encode(str: t, val: unichar); |
3254 | } |
3255 | *t = '\0'; |
3256 | |
3257 | tests++; |
3258 | if (normalize_line(tree: nfdi_tree) < 0) { |
3259 | printf("Line %s -> %s" , buf0, buf1); |
3260 | if (ignorables) |
3261 | printf(" (ignorables removed)" ); |
3262 | printf(" failure\n" ); |
3263 | failures++; |
3264 | } |
3265 | } |
3266 | fclose(file); |
3267 | if (verbose > 0) |
3268 | printf("Ran %d tests with %d failures\n" , tests, failures); |
3269 | if (failures) |
3270 | file_fail(filename: test_name); |
3271 | } |
3272 | |
3273 | /* ------------------------------------------------------------------ */ |
3274 | |
3275 | static void write_file(void) |
3276 | { |
3277 | FILE *file; |
3278 | int i; |
3279 | int j; |
3280 | int t; |
3281 | int gen; |
3282 | |
3283 | if (verbose > 0) |
3284 | printf("Writing %s\n" , utf8_name); |
3285 | file = fopen(utf8_name, "w" ); |
3286 | if (!file) |
3287 | open_fail(utf8_name, errno); |
3288 | |
3289 | fprintf(file, "/* This file is generated code, do not edit. */\n" ); |
3290 | fprintf(file, "\n" ); |
3291 | fprintf(file, "#include <linux/module.h>\n" ); |
3292 | fprintf(file, "#include <linux/kernel.h>\n" ); |
3293 | fprintf(file, "#include \"utf8n.h\"\n" ); |
3294 | fprintf(file, "\n" ); |
3295 | fprintf(file, "static const unsigned int utf8agetab[] = {\n" ); |
3296 | for (i = 0; i != ages_count; i++) |
3297 | fprintf(file, "\t%#x%s\n" , ages[i], |
3298 | ages[i] == unicode_maxage ? "" : "," ); |
3299 | fprintf(file, "};\n" ); |
3300 | fprintf(file, "\n" ); |
3301 | fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n" ); |
3302 | t = 0; |
3303 | for (gen = 0; gen < ages_count; gen++) { |
3304 | fprintf(file, "\t{ %#x, %d }%s\n" , |
3305 | ages[gen], trees[t].index, |
3306 | ages[gen] == unicode_maxage ? "" : "," ); |
3307 | if (trees[t].maxage == ages[gen]) |
3308 | t += 2; |
3309 | } |
3310 | fprintf(file, "};\n" ); |
3311 | fprintf(file, "\n" ); |
3312 | fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n" ); |
3313 | t = 1; |
3314 | for (gen = 0; gen < ages_count; gen++) { |
3315 | fprintf(file, "\t{ %#x, %d }%s\n" , |
3316 | ages[gen], trees[t].index, |
3317 | ages[gen] == unicode_maxage ? "" : "," ); |
3318 | if (trees[t].maxage == ages[gen]) |
3319 | t += 2; |
3320 | } |
3321 | fprintf(file, "};\n" ); |
3322 | fprintf(file, "\n" ); |
3323 | fprintf(file, "static const unsigned char utf8data[%zd] = {\n" , |
3324 | utf8data_size); |
3325 | t = 0; |
3326 | for (i = 0; i != utf8data_size; i += 16) { |
3327 | if (i == trees[t].index) { |
3328 | fprintf(file, "\t/* %s_%x */\n" , |
3329 | trees[t].type, trees[t].maxage); |
3330 | if (t < trees_count-1) |
3331 | t++; |
3332 | } |
3333 | fprintf(file, "\t" ); |
3334 | for (j = i; j != i + 16; j++) |
3335 | fprintf(file, "0x%.2x%s" , utf8data[j], |
3336 | (j < utf8data_size -1 ? "," : "" )); |
3337 | fprintf(file, "\n" ); |
3338 | } |
3339 | fprintf(file, "};\n" ); |
3340 | fprintf(file, "\n" ); |
3341 | fprintf(file, "struct utf8data_table utf8_data_table = {\n" ); |
3342 | fprintf(file, "\t.utf8agetab = utf8agetab,\n" ); |
3343 | fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n" ); |
3344 | fprintf(file, "\n" ); |
3345 | fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n" ); |
3346 | fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n" ); |
3347 | fprintf(file, "\n" ); |
3348 | fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n" ); |
3349 | fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n" ); |
3350 | fprintf(file, "\n" ); |
3351 | fprintf(file, "\t.utf8data = utf8data,\n" ); |
3352 | fprintf(file, "};\n" ); |
3353 | fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);" ); |
3354 | fprintf(file, "\n" ); |
3355 | fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n" ); |
3356 | fclose(file); |
3357 | } |
3358 | |
3359 | /* ------------------------------------------------------------------ */ |
3360 | |
3361 | int main(int argc, char *argv[]) |
3362 | { |
3363 | unsigned int unichar; |
3364 | int opt; |
3365 | |
3366 | argv0 = argv[0]; |
3367 | |
3368 | while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v" )) != -1) { |
3369 | switch (opt) { |
3370 | case 'a': |
3371 | age_name = optarg; |
3372 | break; |
3373 | case 'c': |
3374 | ccc_name = optarg; |
3375 | break; |
3376 | case 'd': |
3377 | data_name = optarg; |
3378 | break; |
3379 | case 'f': |
3380 | fold_name = optarg; |
3381 | break; |
3382 | case 'n': |
3383 | norm_name = optarg; |
3384 | break; |
3385 | case 'o': |
3386 | utf8_name = optarg; |
3387 | break; |
3388 | case 'p': |
3389 | prop_name = optarg; |
3390 | break; |
3391 | case 't': |
3392 | test_name = optarg; |
3393 | break; |
3394 | case 'v': |
3395 | verbose++; |
3396 | break; |
3397 | case 'h': |
3398 | help(); |
3399 | exit(0); |
3400 | default: |
3401 | usage(); |
3402 | } |
3403 | } |
3404 | |
3405 | if (verbose > 1) |
3406 | help(); |
3407 | for (unichar = 0; unichar != 0x110000; unichar++) |
3408 | unicode_data[unichar].code = unichar; |
3409 | age_init(); |
3410 | ccc_init(); |
3411 | nfdi_init(); |
3412 | nfdicf_init(); |
3413 | ignore_init(); |
3414 | corrections_init(); |
3415 | hangul_decompose(); |
3416 | nfdi_decompose(); |
3417 | nfdicf_decompose(); |
3418 | utf8_init(); |
3419 | trees_init(); |
3420 | trees_populate(); |
3421 | trees_reduce(); |
3422 | trees_verify(); |
3423 | /* Prevent "unused function" warning. */ |
3424 | (void)lookup(tree: nfdi_tree, key: " " ); |
3425 | if (verbose > 2) |
3426 | tree_walk(tree: nfdi_tree); |
3427 | if (verbose > 2) |
3428 | tree_walk(tree: nfdicf_tree); |
3429 | normalization_test(); |
3430 | write_file(); |
3431 | |
3432 | return 0; |
3433 | } |
3434 | |