mkutf8data.c source code [linux/fs/unicode/mkutf8data.c]

1	/*
2	* Copyright (c) 2014 SGI.
3	* All rights reserved.
4	*
5	* This program is free software; you can redistribute it and/or
6	* modify it under the terms of the GNU General Public License as
7	* published by the Free Software Foundation.
8	*
9	* This program is distributed in the hope that it would be useful,
10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	* GNU General Public License for more details.
13	*
14	* You should have received a copy of the GNU General Public License
15	* along with this program; if not, write the Free Software Foundation,
16	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17	*/
18
19	/ Generator for a compact trie for unicode normalization /
20
21	#include <sys/types.h>
22	#include <stddef.h>
23	#include <stdlib.h>
24	#include <stdio.h>
25	#include <assert.h>
26	#include <string.h>
27	#include <unistd.h>
28	#include <errno.h>
29
30	/ Default names of the in- and output files. /
31
32	#define AGE_NAME "DerivedAge.txt"
33	#define CCC_NAME "DerivedCombiningClass.txt"
34	#define PROP_NAME "DerivedCoreProperties.txt"
35	#define DATA_NAME "UnicodeData.txt"
36	#define FOLD_NAME "CaseFolding.txt"
37	#define NORM_NAME "NormalizationCorrections.txt"
38	#define TEST_NAME "NormalizationTest.txt"
39	#define UTF8_NAME "utf8data.h"
40
41	const char *age_name = AGE_NAME;
42	const char *ccc_name = CCC_NAME;
43	const char *prop_name = PROP_NAME;
44	const char *data_name = DATA_NAME;
45	const char *fold_name = FOLD_NAME;
46	const char *norm_name = NORM_NAME;
47	const char *test_name = TEST_NAME;
48	const char *utf8_name = UTF8_NAME;
49
50	int verbose = `0`;
51
52	/ An arbitrary line size limit on input lines. /
53
54	#define LINESIZE 1024
55	char line[LINESIZE];
56	char buf0[LINESIZE];
57	char buf1[LINESIZE];
58	char buf2[LINESIZE];
59	char buf3[LINESIZE];
60
61	const char *argv0;
62
63	#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
64
65	/ ------------------------------------------------------------------ /
66
67	/*
68	* Unicode version numbers consist of three parts: major, minor, and a
69	* revision. These numbers are packed into an unsigned int to obtain
70	* a single version number.
71	*
72	* To save space in the generated trie, the unicode version is not
73	* stored directly, instead we calculate a generation number from the
74	* unicode versions seen in the DerivedAge file, and use that as an
75	* index into a table of unicode versions.
76	*/
77	#define UNICODE_MAJ_SHIFT (16)
78	#define UNICODE_MIN_SHIFT (8)
79
80	#define UNICODE_MAJ_MAX ((unsigned short)-1)
81	#define UNICODE_MIN_MAX ((unsigned char)-1)
82	#define UNICODE_REV_MAX ((unsigned char)-1)
83
84	#define UNICODE_AGE(MAJ,MIN,REV) \
85	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) \| \
86	((unsigned int)(MIN) << UNICODE_MIN_SHIFT) \| \
87	((unsigned int)(REV)))
88
89	unsigned int *ages;
90	int ages_count;
91
92	unsigned int unicode_maxage;
93
94	static int age_valid(unsigned int major, unsigned int minor,
95	unsigned int revision)
96	{
97	if (major > UNICODE_MAJ_MAX)
98	return `0`;
99	if (minor > UNICODE_MIN_MAX)
100	return `0`;
101	if (revision > UNICODE_REV_MAX)
102	return `0`;
103	return `1`;
104	}
105
106	/ ------------------------------------------------------------------ /
107
108	/*
109	* utf8trie_t
110	*
111	* A compact binary tree, used to decode UTF-8 characters.
112	*
113	* Internal nodes are one byte for the node itself, and up to three
114	* bytes for an offset into the tree. The first byte contains the
115	* following information:
116	* NEXTBYTE - flag - advance to next byte if set
117	* BITNUM - 3 bit field - the bit number to tested
118	* OFFLEN - 2 bit field - number of bytes in the offset
119	* if offlen == 0 (non-branching node)
120	* RIGHTPATH - 1 bit field - set if the following node is for the
121	* right-hand path (tested bit is set)
122	* TRIENODE - 1 bit field - set if the following node is an internal
123	* node, otherwise it is a leaf node
124	* if offlen != 0 (branching node)
125	* LEFTNODE - 1 bit field - set if the left-hand node is internal
126	* RIGHTNODE - 1 bit field - set if the right-hand node is internal
127	*
128	* Due to the way utf8 works, there cannot be branching nodes with
129	* NEXTBYTE set, and moreover those nodes always have a righthand
130	* descendant.
131	*/
132	typedef unsigned char utf8trie_t;
133	#define BITNUM 0x07
134	#define NEXTBYTE 0x08
135	#define OFFLEN 0x30
136	#define OFFLEN_SHIFT 4
137	#define RIGHTPATH 0x40
138	#define TRIENODE 0x80
139	#define RIGHTNODE 0x40
140	#define LEFTNODE 0x80
141
142	/*
143	* utf8leaf_t
144	*
145	* The leaves of the trie are embedded in the trie, and so the same
146	* underlying datatype, unsigned char.
147	*
148	* leaf[0]: The unicode version, stored as a generation number that is
149	* an index into utf8agetab[]. With this we can filter code
150	* points based on the unicode version in which they were
151	* defined. The CCC of a non-defined code point is 0.
152	* leaf[1]: Canonical Combining Class. During normalization, we need
153	* to do a stable sort into ascending order of all characters
154	* with a non-zero CCC that occur between two characters with
155	* a CCC of 0, or at the begin or end of a string.
156	* The unicode standard guarantees that all CCC values are
157	* between 0 and 254 inclusive, which leaves 255 available as
158	* a special value.
159	* Code points with CCC 0 are known as stoppers.
160	* leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
161	* start of a NUL-terminated string that is the decomposition
162	* of the character.
163	* The CCC of a decomposable character is the same as the CCC
164	* of the first character of its decomposition.
165	* Some characters decompose as the empty string: these are
166	* characters with the Default_Ignorable_Code_Point property.
167	* These do affect normalization, as they all have CCC 0.
168	*
169	* The decompositions in the trie have been fully expanded.
170	*
171	* Casefolding, if applicable, is also done using decompositions.
172	*/
173	typedef unsigned char utf8leaf_t;
174
175	#define LEAF_GEN(LEAF) ((LEAF)[0])
176	#define LEAF_CCC(LEAF) ((LEAF)[1])
177	#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
178
179	#define MAXGEN (255)
180
181	#define MINCCC (0)
182	#define MAXCCC (254)
183	#define STOPPER (0)
184	#define DECOMPOSE (255)
185	#define HANGUL ((char)(255))
186
187	#define UTF8HANGULLEAF (12)
188
189	struct tree;
190	static utf8leaf_t utf8nlookup(struct* tree , unsigned* char *,
191	const char *, size_t);
192	static utf8leaf_t utf8lookup(struct* tree , unsigned* char , const* char *);
193
194	unsigned char *utf8data;
195	size_t utf8data_size;
196
197	utf8trie_t *nfdi;
198	utf8trie_t *nfdicf;
199
200	/ ------------------------------------------------------------------ /
201
202	/*
203	* UTF8 valid ranges.
204	*
205	* The UTF-8 encoding spreads the bits of a 32bit word over several
206	* bytes. This table gives the ranges that can be held and how they'd
207	* be represented.
208	*
209	* 0x00000000 0x0000007F: 0xxxxxxx
210	* 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
211	* 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
212	* 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
213	* 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
214	* 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
215	*
216	* There is an additional requirement on UTF-8, in that only the
217	* shortest representation of a 32bit value is to be used. A decoder
218	* must not decode sequences that do not satisfy this requirement.
219	* Thus the allowed ranges have a lower bound.
220	*
221	* 0x00000000 0x0000007F: 0xxxxxxx
222	* 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
223	* 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
224	* 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
225	* 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
226	* 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
227	*
228	* Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
229	* 17 planes of 65536 values. This limits the sequences actually seen
230	* even more, to just the following.
231	*
232	* 0 - 0x7f: 0 0x7f
233	* 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
234	* 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
235	* 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
236	*
237	* Even within those ranges not all values are allowed: the surrogates
238	* 0xd800 - 0xdfff should never be seen.
239	*
240	* Note that the longest sequence seen with valid usage is 4 bytes,
241	* the same a single UTF-32 character. This makes the UTF-8
242	* representation of Unicode strictly smaller than UTF-32.
243	*
244	* The shortest sequence requirement was introduced by:
245	* Corrigendum #1: UTF-8 Shortest Form
246	* It can be found here:
247	* http://www.unicode.org/versions/corrigendum1.html
248	*
249	*/
250
251	#define UTF8_2_BITS 0xC0
252	#define UTF8_3_BITS 0xE0
253	#define UTF8_4_BITS 0xF0
254	#define UTF8_N_BITS 0x80
255	#define UTF8_2_MASK 0xE0
256	#define UTF8_3_MASK 0xF0
257	#define UTF8_4_MASK 0xF8
258	#define UTF8_N_MASK 0xC0
259	#define UTF8_V_MASK 0x3F
260	#define UTF8_V_SHIFT 6
261
262	static int utf8encode(char str, unsigned* int val)
263	{
264	int len;
265
266	if (val < `0x80`) {
267	str[`0`] = val;
268	len = `1`;
269	} else if (val < `0x800`) {
270	str[`1`] = val & UTF8_V_MASK;
271	str[`1`] \|= UTF8_N_BITS;
272	val >>= UTF8_V_SHIFT;
273	str[`0`] = val;
274	str[`0`] \|= UTF8_2_BITS;
275	len = `2`;
276	} else if (val < `0x10000`) {
277	str[`2`] = val & UTF8_V_MASK;
278	str[`2`] \|= UTF8_N_BITS;
279	val >>= UTF8_V_SHIFT;
280	str[`1`] = val & UTF8_V_MASK;
281	str[`1`] \|= UTF8_N_BITS;
282	val >>= UTF8_V_SHIFT;
283	str[`0`] = val;
284	str[`0`] \|= UTF8_3_BITS;
285	len = `3`;
286	} else if (val < `0x110000`) {
287	str[`3`] = val & UTF8_V_MASK;
288	str[`3`] \|= UTF8_N_BITS;
289	val >>= UTF8_V_SHIFT;
290	str[`2`] = val & UTF8_V_MASK;
291	str[`2`] \|= UTF8_N_BITS;
292	val >>= UTF8_V_SHIFT;
293	str[`1`] = val & UTF8_V_MASK;
294	str[`1`] \|= UTF8_N_BITS;
295	val >>= UTF8_V_SHIFT;
296	str[`0`] = val;
297	str[`0`] \|= UTF8_4_BITS;
298	len = `4`;
299	} else {
300	printf("%#x: illegal val\n", val);
301	len = `0`;
302	}
303	return len;
304	}
305
306	static unsigned int utf8decode(const char *str)
307	{
308	const unsigned char s = (const* unsigned char*)str;
309	unsigned int unichar = `0`;
310
311	if (*s < `0x80`) {
312	unichar = *s;
313	} else if (*s < UTF8_3_BITS) {
314	unichar = *s++ & `0x1F`;
315	unichar <<= UTF8_V_SHIFT;
316	unichar \|= *s & `0x3F`;
317	} else if (*s < UTF8_4_BITS) {
318	unichar = *s++ & `0x0F`;
319	unichar <<= UTF8_V_SHIFT;
320	unichar \|= *s++ & `0x3F`;
321	unichar <<= UTF8_V_SHIFT;
322	unichar \|= *s & `0x3F`;
323	} else {
324	unichar = *s++ & `0x0F`;
325	unichar <<= UTF8_V_SHIFT;
326	unichar \|= *s++ & `0x3F`;
327	unichar <<= UTF8_V_SHIFT;
328	unichar \|= *s++ & `0x3F`;
329	unichar <<= UTF8_V_SHIFT;
330	unichar \|= *s & `0x3F`;
331	}
332	return unichar;
333	}
334
335	static int utf32valid(unsigned int unichar)
336	{
337	return unichar < `0x110000`;
338	}
339
340	#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
341
342	#define NODE 1
343	#define LEAF 0
344
345	struct tree {
346	void *root;
347	int childnode;
348	const char *type;
349	unsigned int maxage;
350	struct tree *next;
351	int (leaf_equal)(void* , void* *);
352	void (leaf_print)(void* , int*);
353	int (leaf_mark)(void* *);
354	int (leaf_size)(void* *);
355	int (leaf_index)(struct tree , void* *);
356	unsigned char (leaf_emit)(void , unsigned* char *);
357	int leafindex[`0x110000`];
358	int index;
359	};
360
361	struct node {
362	int index;
363	int offset;
364	int mark;
365	int size;
366	struct node *parent;
367	void *left;
368	void *right;
369	unsigned char bitnum;
370	unsigned char nextbyte;
371	unsigned char leftnode;
372	unsigned char rightnode;
373	unsigned int keybits;
374	unsigned int keymask;
375	};
376
377	/*
378	* Example lookup function for a tree.
379	*/
380	static void lookup(struct* tree tree, const* char *key)
381	{
382	struct node *node;
383	void *leaf = NULL;
384
385	node = tree->root;
386	while (!leaf && node) {
387	if (node->nextbyte)
388	key++;
389	if (*key & (`1` << (node->bitnum & `7`))) {
390	/ Right leg /
391	if (node->rightnode == NODE) {
392	node = node->right;
393	} else if (node->rightnode == LEAF) {
394	leaf = node->right;
395	} else {
396	node = NULL;
397	}
398	} else {
399	/ Left leg /
400	if (node->leftnode == NODE) {
401	node = node->left;
402	} else if (node->leftnode == LEAF) {
403	leaf = node->left;
404	} else {
405	node = NULL;
406	}
407	}
408	}
409
410	return leaf;
411	}
412
413	/*
414	* A simple non-recursive tree walker: keep track of visits to the
415	* left and right branches in the leftmask and rightmask.
416	*/
417	static void tree_walk(struct tree *tree)
418	{
419	struct node *node;
420	unsigned int leftmask;
421	unsigned int rightmask;
422	unsigned int bitmask;
423	int indent = `1`;
424	int nodes, singletons, leaves;
425
426	nodes = singletons = leaves = `0`;
427
428	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
429	if (tree->childnode == LEAF) {
430	assert(tree->root);
431	tree->leaf_print(tree->root, indent);
432	leaves = `1`;
433	} else {
434	assert(tree->childnode == NODE);
435	node = tree->root;
436	leftmask = rightmask = `0`;
437	while (node) {
438	printf("%*snode @ %p bitnum %d nextbyte %d"
439	" left %p right %p mask %x bits %x\n",
440	indent, "", node,
441	node->bitnum, node->nextbyte,
442	node->left, node->right,
443	node->keymask, node->keybits);
444	nodes += `1`;
445	if (!(node->left && node->right))
446	singletons += `1`;
447
448	while (node) {
449	bitmask = `1` << node->bitnum;
450	if ((leftmask & bitmask) == `0`) {
451	leftmask \|= bitmask;
452	if (node->leftnode == LEAF) {
453	assert(node->left);
454	tree->leaf_print(node->left,
455	indent+`1`);
456	leaves += `1`;
457	} else if (node->left) {
458	assert(node->leftnode == NODE);
459	indent += `1`;
460	node = node->left;
461	break;
462	}
463	}
464	if ((rightmask & bitmask) == `0`) {
465	rightmask \|= bitmask;
466	if (node->rightnode == LEAF) {
467	assert(node->right);
468	tree->leaf_print(node->right,
469	indent+`1`);
470	leaves += `1`;
471	} else if (node->right) {
472	assert(node->rightnode == NODE);
473	indent += `1`;
474	node = node->right;
475	break;
476	}
477	}
478	leftmask &= ~bitmask;
479	rightmask &= ~bitmask;
480	node = node->parent;
481	indent -= `1`;
482	}
483	}
484	}
485	printf("nodes %d leaves %d singletons %d\n",
486	nodes, leaves, singletons);
487	}
488
489	/*
490	* Allocate an initialize a new internal node.
491	*/
492	static struct node alloc_node(struct* node *parent)
493	{
494	struct node *node;
495	int bitnum;
496
497	node = malloc(sizeof(*node));
498	node->left = node->right = NULL;
499	node->parent = parent;
500	node->leftnode = NODE;
501	node->rightnode = NODE;
502	node->keybits = `0`;
503	node->keymask = `0`;
504	node->mark = `0`;
505	node->index = `0`;
506	node->offset = -`1`;
507	node->size = `4`;
508
509	if (node->parent) {
510	bitnum = parent->bitnum;
511	if ((bitnum & `7`) == `0`) {
512	node->bitnum = bitnum + `7` + `8`;
513	node->nextbyte = `1`;
514	} else {
515	node->bitnum = bitnum - `1`;
516	node->nextbyte = `0`;
517	}
518	} else {
519	node->bitnum = `7`;
520	node->nextbyte = `0`;
521	}
522
523	return node;
524	}
525
526	/*
527	* Insert a new leaf into the tree, and collapse any subtrees that are
528	* fully populated and end in identical leaves. A nextbyte tagged
529	* internal node will not be removed to preserve the tree's integrity.
530	* Note that due to the structure of utf8, no nextbyte tagged node
531	* will be a candidate for removal.
532	*/
533	static int insert(struct tree tree, char* key, int* keylen, void *leaf)
534	{
535	struct node *node;
536	struct node *parent;
537	void **cursor;
538	int keybits;
539
540	assert(keylen >= `1` && keylen <= `4`);
541
542	node = NULL;
543	cursor = &tree->root;
544	keybits = `8` * keylen;
545
546	/ Insert, creating path along the way. /
547	while (keybits) {
548	if (!*cursor)
549	*cursor = alloc_node(parent: node);
550	node = *cursor;
551	if (node->nextbyte)
552	key++;
553	if (*key & (`1` << (node->bitnum & `7`)))
554	cursor = &node->right;
555	else
556	cursor = &node->left;
557	keybits--;
558	}
559	*cursor = leaf;
560
561	/ Merge subtrees if possible. /
562	while (node) {
563	if (*key & (`1` << (node->bitnum & `7`)))
564	node->rightnode = LEAF;
565	else
566	node->leftnode = LEAF;
567	if (node->nextbyte)
568	break;
569	if (node->leftnode == NODE \|\| node->rightnode == NODE)
570	break;
571	assert(node->left);
572	assert(node->right);
573	/ Compare /
574	if (! tree->leaf_equal(node->left, node->right))
575	break;
576	/ Keep left, drop right leaf. /
577	leaf = node->left;
578	/ Check in parent /
579	parent = node->parent;
580	if (!parent) {
581	/ root of tree! /
582	tree->root = leaf;
583	tree->childnode = LEAF;
584	} else if (parent->left == node) {
585	parent->left = leaf;
586	parent->leftnode = LEAF;
587	if (parent->right) {
588	parent->keymask = `0`;
589	parent->keybits = `0`;
590	} else {
591	parent->keymask \|= (`1` << node->bitnum);
592	}
593	} else if (parent->right == node) {
594	parent->right = leaf;
595	parent->rightnode = LEAF;
596	if (parent->left) {
597	parent->keymask = `0`;
598	parent->keybits = `0`;
599	} else {
600	parent->keymask \|= (`1` << node->bitnum);
601	parent->keybits \|= (`1` << node->bitnum);
602	}
603	} else {
604	/ internal tree error /
605	assert(`0`);
606	}
607	free(node);
608	node = parent;
609	}
610
611	/ Propagate keymasks up along singleton chains. /
612	while (node) {
613	parent = node->parent;
614	if (!parent)
615	break;
616	/ Nix the mask for parents with two children. /
617	if (node->keymask == `0`) {
618	parent->keymask = `0`;
619	parent->keybits = `0`;
620	} else if (parent->left && parent->right) {
621	parent->keymask = `0`;
622	parent->keybits = `0`;
623	} else {
624	assert((parent->keymask & node->keymask) == `0`);
625	parent->keymask \|= node->keymask;
626	parent->keymask \|= (`1` << parent->bitnum);
627	parent->keybits \|= node->keybits;
628	if (parent->right)
629	parent->keybits \|= (`1` << parent->bitnum);
630	}
631	node = parent;
632	}
633
634	return `0`;
635	}
636
637	/*
638	* Prune internal nodes.
639	*
640	* Fully populated subtrees that end at the same leaf have already
641	* been collapsed. There are still internal nodes that have for both
642	* their left and right branches a sequence of singletons that make
643	* identical choices and end in identical leaves. The keymask and
644	* keybits collected in the nodes describe the choices made in these
645	* singleton chains. When they are identical for the left and right
646	* branch of a node, and the two leaves comare identical, the node in
647	* question can be removed.
648	*
649	* Note that nodes with the nextbyte tag set will not be removed by
650	* this to ensure tree integrity. Note as well that the structure of
651	* utf8 ensures that these nodes would not have been candidates for
652	* removal in any case.
653	*/
654	static void prune(struct tree *tree)
655	{
656	struct node *node;
657	struct node *left;
658	struct node *right;
659	struct node *parent;
660	void *leftleaf;
661	void *rightleaf;
662	unsigned int leftmask;
663	unsigned int rightmask;
664	unsigned int bitmask;
665	int count;
666
667	if (verbose > `0`)
668	printf("Pruning %s_%x\n", tree->type, tree->maxage);
669
670	count = `0`;
671	if (tree->childnode == LEAF)
672	return;
673	if (!tree->root)
674	return;
675
676	leftmask = rightmask = `0`;
677	node = tree->root;
678	while (node) {
679	if (node->nextbyte)
680	goto advance;
681	if (node->leftnode == LEAF)
682	goto advance;
683	if (node->rightnode == LEAF)
684	goto advance;
685	if (!node->left)
686	goto advance;
687	if (!node->right)
688	goto advance;
689	left = node->left;
690	right = node->right;
691	if (left->keymask == `0`)
692	goto advance;
693	if (right->keymask == `0`)
694	goto advance;
695	if (left->keymask != right->keymask)
696	goto advance;
697	if (left->keybits != right->keybits)
698	goto advance;
699	leftleaf = NULL;
700	while (!leftleaf) {
701	assert(left->left \|\| left->right);
702	if (left->leftnode == LEAF)
703	leftleaf = left->left;
704	else if (left->rightnode == LEAF)
705	leftleaf = left->right;
706	else if (left->left)
707	left = left->left;
708	else if (left->right)
709	left = left->right;
710	else
711	assert(`0`);
712	}
713	rightleaf = NULL;
714	while (!rightleaf) {
715	assert(right->left \|\| right->right);
716	if (right->leftnode == LEAF)
717	rightleaf = right->left;
718	else if (right->rightnode == LEAF)
719	rightleaf = right->right;
720	else if (right->left)
721	right = right->left;
722	else if (right->right)
723	right = right->right;
724	else
725	assert(`0`);
726	}
727	if (! tree->leaf_equal(leftleaf, rightleaf))
728	goto advance;
729	/*
730	* This node has identical singleton-only subtrees.
731	* Remove it.
732	*/
733	parent = node->parent;
734	left = node->left;
735	right = node->right;
736	if (parent->left == node)
737	parent->left = left;
738	else if (parent->right == node)
739	parent->right = left;
740	else
741	assert(`0`);
742	left->parent = parent;
743	left->keymask \|= (`1` << node->bitnum);
744	node->left = NULL;
745	while (node) {
746	bitmask = `1` << node->bitnum;
747	leftmask &= ~bitmask;
748	rightmask &= ~bitmask;
749	if (node->leftnode == NODE && node->left) {
750	left = node->left;
751	free(node);
752	count++;
753	node = left;
754	} else if (node->rightnode == NODE && node->right) {
755	right = node->right;
756	free(node);
757	count++;
758	node = right;
759	} else {
760	node = NULL;
761	}
762	}
763	/ Propagate keymasks up along singleton chains. /
764	node = parent;
765	/ Force re-check /
766	bitmask = `1` << node->bitnum;
767	leftmask &= ~bitmask;
768	rightmask &= ~bitmask;
769	for (;;) {
770	if (node->left && node->right)
771	break;
772	if (node->left) {
773	left = node->left;
774	node->keymask \|= left->keymask;
775	node->keybits \|= left->keybits;
776	}
777	if (node->right) {
778	right = node->right;
779	node->keymask \|= right->keymask;
780	node->keybits \|= right->keybits;
781	}
782	node->keymask \|= (`1` << node->bitnum);
783	node = node->parent;
784	/ Force re-check /
785	bitmask = `1` << node->bitnum;
786	leftmask &= ~bitmask;
787	rightmask &= ~bitmask;
788	}
789	advance:
790	bitmask = `1` << node->bitnum;
791	if ((leftmask & bitmask) == `0` &&
792	node->leftnode == NODE &&
793	node->left) {
794	leftmask \|= bitmask;
795	node = node->left;
796	} else if ((rightmask & bitmask) == `0` &&
797	node->rightnode == NODE &&
798	node->right) {
799	rightmask \|= bitmask;
800	node = node->right;
801	} else {
802	leftmask &= ~bitmask;
803	rightmask &= ~bitmask;
804	node = node->parent;
805	}
806	}
807	if (verbose > `0`)
808	printf("Pruned %d nodes\n", count);
809	}
810
811	/*
812	* Mark the nodes in the tree that lead to leaves that must be
813	* emitted.
814	*/
815	static void mark_nodes(struct tree *tree)
816	{
817	struct node *node;
818	struct node *n;
819	unsigned int leftmask;
820	unsigned int rightmask;
821	unsigned int bitmask;
822	int marked;
823
824	marked = `0`;
825	if (verbose > `0`)
826	printf("Marking %s_%x\n", tree->type, tree->maxage);
827	if (tree->childnode == LEAF)
828	goto done;
829
830	assert(tree->childnode == NODE);
831	node = tree->root;
832	leftmask = rightmask = `0`;
833	while (node) {
834	bitmask = `1` << node->bitnum;
835	if ((leftmask & bitmask) == `0`) {
836	leftmask \|= bitmask;
837	if (node->leftnode == LEAF) {
838	assert(node->left);
839	if (tree->leaf_mark(node->left)) {
840	n = node;
841	while (n && !n->mark) {
842	marked++;
843	n->mark = `1`;
844	n = n->parent;
845	}
846	}
847	} else if (node->left) {
848	assert(node->leftnode == NODE);
849	node = node->left;
850	continue;
851	}
852	}
853	if ((rightmask & bitmask) == `0`) {
854	rightmask \|= bitmask;
855	if (node->rightnode == LEAF) {
856	assert(node->right);
857	if (tree->leaf_mark(node->right)) {
858	n = node;
859	while (n && !n->mark) {
860	marked++;
861	n->mark = `1`;
862	n = n->parent;
863	}
864	}
865	} else if (node->right) {
866	assert(node->rightnode == NODE);
867	node = node->right;
868	continue;
869	}
870	}
871	leftmask &= ~bitmask;
872	rightmask &= ~bitmask;
873	node = node->parent;
874	}
875
876	/ second pass: left siblings and singletons /
877
878	assert(tree->childnode == NODE);
879	node = tree->root;
880	leftmask = rightmask = `0`;
881	while (node) {
882	bitmask = `1` << node->bitnum;
883	if ((leftmask & bitmask) == `0`) {
884	leftmask \|= bitmask;
885	if (node->leftnode == LEAF) {
886	assert(node->left);
887	if (tree->leaf_mark(node->left)) {
888	n = node;
889	while (n && !n->mark) {
890	marked++;
891	n->mark = `1`;
892	n = n->parent;
893	}
894	}
895	} else if (node->left) {
896	assert(node->leftnode == NODE);
897	node = node->left;
898	if (!node->mark && node->parent->mark) {
899	marked++;
900	node->mark = `1`;
901	}
902	continue;
903	}
904	}
905	if ((rightmask & bitmask) == `0`) {
906	rightmask \|= bitmask;
907	if (node->rightnode == LEAF) {
908	assert(node->right);
909	if (tree->leaf_mark(node->right)) {
910	n = node;
911	while (n && !n->mark) {
912	marked++;
913	n->mark = `1`;
914	n = n->parent;
915	}
916	}
917	} else if (node->right) {
918	assert(node->rightnode == NODE);
919	node = node->right;
920	if (!node->mark && node->parent->mark &&
921	!node->parent->left) {
922	marked++;
923	node->mark = `1`;
924	}
925	continue;
926	}
927	}
928	leftmask &= ~bitmask;
929	rightmask &= ~bitmask;
930	node = node->parent;
931	}
932	done:
933	if (verbose > `0`)
934	printf("Marked %d nodes\n", marked);
935	}
936
937	/*
938	* Compute the index of each node and leaf, which is the offset in the
939	* emitted trie. These values must be pre-computed because relative
940	* offsets between nodes are used to navigate the tree.
941	*/
942	static int index_nodes(struct tree tree, int* index)
943	{
944	struct node *node;
945	unsigned int leftmask;
946	unsigned int rightmask;
947	unsigned int bitmask;
948	int count;
949	int indent;
950
951	/ Align to a cache line (or half a cache line?). /
952	while (index % `64`)
953	index++;
954	tree->index = index;
955	indent = `1`;
956	count = `0`;
957
958	if (verbose > `0`)
959	printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
960	if (tree->childnode == LEAF) {
961	index += tree->leaf_size(tree->root);
962	goto done;
963	}
964
965	assert(tree->childnode == NODE);
966	node = tree->root;
967	leftmask = rightmask = `0`;
968	while (node) {
969	if (!node->mark)
970	goto skip;
971	count++;
972	if (node->index != index)
973	node->index = index;
974	index += node->size;
975	skip:
976	while (node) {
977	bitmask = `1` << node->bitnum;
978	if (node->mark && (leftmask & bitmask) == `0`) {
979	leftmask \|= bitmask;
980	if (node->leftnode == LEAF) {
981	assert(node->left);
982	*tree->leaf_index(tree, node->left) =
983	index;
984	index += tree->leaf_size(node->left);
985	count++;
986	} else if (node->left) {
987	assert(node->leftnode == NODE);
988	indent += `1`;
989	node = node->left;
990	break;
991	}
992	}
993	if (node->mark && (rightmask & bitmask) == `0`) {
994	rightmask \|= bitmask;
995	if (node->rightnode == LEAF) {
996	assert(node->right);
997	*tree->leaf_index(tree, node->right) = index;
998	index += tree->leaf_size(node->right);
999	count++;
1000	} else if (node->right) {
1001	assert(node->rightnode == NODE);
1002	indent += `1`;
1003	node = node->right;
1004	break;
1005	}
1006	}
1007	leftmask &= ~bitmask;
1008	rightmask &= ~bitmask;
1009	node = node->parent;
1010	indent -= `1`;
1011	}
1012	}
1013	done:
1014	/ Round up to a multiple of 16 /
1015	while (index % `16`)
1016	index++;
1017	if (verbose > `0`)
1018	printf("Final index %d\n", index);
1019	return index;
1020	}
1021
1022	/*
1023	* Mark the nodes in a subtree, helper for size_nodes().
1024	*/
1025	static int mark_subtree(struct node *node)
1026	{
1027	int changed;
1028
1029	if (!node \|\| node->mark)
1030	return `0`;
1031	node->mark = `1`;
1032	node->index = node->parent->index;
1033	changed = `1`;
1034	if (node->leftnode == NODE)
1035	changed += mark_subtree(node: node->left);
1036	if (node->rightnode == NODE)
1037	changed += mark_subtree(node: node->right);
1038	return changed;
1039	}
1040
1041	/*
1042	* Compute the size of nodes and leaves. We start by assuming that
1043	* each node needs to store a three-byte offset. The indexes of the
1044	* nodes are calculated based on that, and then this function is
1045	* called to see if the sizes of some nodes can be reduced. This is
1046	* repeated until no more changes are seen.
1047	*/
1048	static int size_nodes(struct tree *tree)
1049	{
1050	struct tree *next;
1051	struct node *node;
1052	struct node *right;
1053	struct node *n;
1054	unsigned int leftmask;
1055	unsigned int rightmask;
1056	unsigned int bitmask;
1057	unsigned int pathbits;
1058	unsigned int pathmask;
1059	unsigned int nbit;
1060	int changed;
1061	int offset;
1062	int size;
1063	int indent;
1064
1065	indent = `1`;
1066	changed = `0`;
1067	size = `0`;
1068
1069	if (verbose > `0`)
1070	printf("Sizing %s_%x\n", tree->type, tree->maxage);
1071	if (tree->childnode == LEAF)
1072	goto done;
1073
1074	assert(tree->childnode == NODE);
1075	pathbits = `0`;
1076	pathmask = `0`;
1077	node = tree->root;
1078	leftmask = rightmask = `0`;
1079	while (node) {
1080	if (!node->mark)
1081	goto skip;
1082	offset = `0`;
1083	if (!node->left \|\| !node->right) {
1084	size = `1`;
1085	} else {
1086	if (node->rightnode == NODE) {
1087	/*
1088	* If the right node is not marked,
1089	* look for a corresponding node in
1090	* the next tree. Such a node need
1091	* not exist.
1092	*/
1093	right = node->right;
1094	next = tree->next;
1095	while (!right->mark) {
1096	assert(next);
1097	n = next->root;
1098	while (n->bitnum != node->bitnum) {
1099	nbit = `1` << n->bitnum;
1100	if (!(pathmask & nbit))
1101	break;
1102	if (pathbits & nbit) {
1103	if (n->rightnode == LEAF)
1104	break;
1105	n = n->right;
1106	} else {
1107	if (n->leftnode == LEAF)
1108	break;
1109	n = n->left;
1110	}
1111	}
1112	if (n->bitnum != node->bitnum)
1113	break;
1114	n = n->right;
1115	right = n;
1116	next = next->next;
1117	}
1118	/ Make sure the right node is marked. /
1119	if (!right->mark)
1120	changed += mark_subtree(node: right);
1121	offset = right->index - node->index;
1122	} else {
1123	offset = *tree->leaf_index(tree, node->right);
1124	offset -= node->index;
1125	}
1126	assert(offset >= `0`);
1127	assert(offset <= `0xffffff`);
1128	if (offset <= `0xff`) {
1129	size = `2`;
1130	} else if (offset <= `0xffff`) {
1131	size = `3`;
1132	} else { / offset <= 0xffffff /
1133	size = `4`;
1134	}
1135	}
1136	if (node->size != size \|\| node->offset != offset) {
1137	node->size = size;
1138	node->offset = offset;
1139	changed++;
1140	}
1141	skip:
1142	while (node) {
1143	bitmask = `1` << node->bitnum;
1144	pathmask \|= bitmask;
1145	if (node->mark && (leftmask & bitmask) == `0`) {
1146	leftmask \|= bitmask;
1147	if (node->leftnode == LEAF) {
1148	assert(node->left);
1149	} else if (node->left) {
1150	assert(node->leftnode == NODE);
1151	indent += `1`;
1152	node = node->left;
1153	break;
1154	}
1155	}
1156	if (node->mark && (rightmask & bitmask) == `0`) {
1157	rightmask \|= bitmask;
1158	pathbits \|= bitmask;
1159	if (node->rightnode == LEAF) {
1160	assert(node->right);
1161	} else if (node->right) {
1162	assert(node->rightnode == NODE);
1163	indent += `1`;
1164	node = node->right;
1165	break;
1166	}
1167	}
1168	leftmask &= ~bitmask;
1169	rightmask &= ~bitmask;
1170	pathmask &= ~bitmask;
1171	pathbits &= ~bitmask;
1172	node = node->parent;
1173	indent -= `1`;
1174	}
1175	}
1176	done:
1177	if (verbose > `0`)
1178	printf("Found %d changes\n", changed);
1179	return changed;
1180	}
1181
1182	/*
1183	* Emit a trie for the given tree into the data array.
1184	*/
1185	static void emit(struct tree tree, unsigned* char *data)
1186	{
1187	struct node *node;
1188	unsigned int leftmask;
1189	unsigned int rightmask;
1190	unsigned int bitmask;
1191	int offlen;
1192	int offset;
1193	int index;
1194	int indent;
1195	int size;
1196	int bytes;
1197	int leaves;
1198	int nodes[`4`];
1199	unsigned char byte;
1200
1201	nodes[`0`] = nodes[`1`] = nodes[`2`] = nodes[`3`] = `0`;
1202	leaves = `0`;
1203	bytes = `0`;
1204	index = tree->index;
1205	data += index;
1206	indent = `1`;
1207	if (verbose > `0`)
1208	printf("Emitting %s_%x\n", tree->type, tree->maxage);
1209	if (tree->childnode == LEAF) {
1210	assert(tree->root);
1211	tree->leaf_emit(tree->root, data);
1212	size = tree->leaf_size(tree->root);
1213	index += size;
1214	leaves++;
1215	goto done;
1216	}
1217
1218	assert(tree->childnode == NODE);
1219	node = tree->root;
1220	leftmask = rightmask = `0`;
1221	while (node) {
1222	if (!node->mark)
1223	goto skip;
1224	assert(node->offset != -`1`);
1225	assert(node->index == index);
1226
1227	byte = `0`;
1228	if (node->nextbyte)
1229	byte \|= NEXTBYTE;
1230	byte \|= (node->bitnum & BITNUM);
1231	if (node->left && node->right) {
1232	if (node->leftnode == NODE)
1233	byte \|= LEFTNODE;
1234	if (node->rightnode == NODE)
1235	byte \|= RIGHTNODE;
1236	if (node->offset <= `0xff`)
1237	offlen = `1`;
1238	else if (node->offset <= `0xffff`)
1239	offlen = `2`;
1240	else
1241	offlen = `3`;
1242	nodes[offlen]++;
1243	offset = node->offset;
1244	byte \|= offlen << OFFLEN_SHIFT;
1245	*data++ = byte;
1246	index++;
1247	while (offlen--) {
1248	*data++ = offset & `0xff`;
1249	index++;
1250	offset >>= `8`;
1251	}
1252	} else if (node->left) {
1253	if (node->leftnode == NODE)
1254	byte \|= TRIENODE;
1255	nodes[`0`]++;
1256	*data++ = byte;
1257	index++;
1258	} else if (node->right) {
1259	byte \|= RIGHTNODE;
1260	if (node->rightnode == NODE)
1261	byte \|= TRIENODE;
1262	nodes[`0`]++;
1263	*data++ = byte;
1264	index++;
1265	} else {
1266	assert(`0`);
1267	}
1268	skip:
1269	while (node) {
1270	bitmask = `1` << node->bitnum;
1271	if (node->mark && (leftmask & bitmask) == `0`) {
1272	leftmask \|= bitmask;
1273	if (node->leftnode == LEAF) {
1274	assert(node->left);
1275	data = tree->leaf_emit(node->left,
1276	data);
1277	size = tree->leaf_size(node->left);
1278	index += size;
1279	bytes += size;
1280	leaves++;
1281	} else if (node->left) {
1282	assert(node->leftnode == NODE);
1283	indent += `1`;
1284	node = node->left;
1285	break;
1286	}
1287	}
1288	if (node->mark && (rightmask & bitmask) == `0`) {
1289	rightmask \|= bitmask;
1290	if (node->rightnode == LEAF) {
1291	assert(node->right);
1292	data = tree->leaf_emit(node->right,
1293	data);
1294	size = tree->leaf_size(node->right);
1295	index += size;
1296	bytes += size;
1297	leaves++;
1298	} else if (node->right) {
1299	assert(node->rightnode == NODE);
1300	indent += `1`;
1301	node = node->right;
1302	break;
1303	}
1304	}
1305	leftmask &= ~bitmask;
1306	rightmask &= ~bitmask;
1307	node = node->parent;
1308	indent -= `1`;
1309	}
1310	}
1311	done:
1312	if (verbose > `0`) {
1313	printf("Emitted %d (%d) leaves",
1314	leaves, bytes);
1315	printf(" %d (%d+%d+%d+%d) nodes",
1316	nodes[`0`] + nodes[`1`] + nodes[`2`] + nodes[`3`],
1317	nodes[`0`], nodes[`1`], nodes[`2`], nodes[`3`]);
1318	printf(" %d total\n", index - tree->index);
1319	}
1320	}
1321
1322	/ ------------------------------------------------------------------ /
1323
1324	/*
1325	* Unicode data.
1326	*
1327	* We need to keep track of the Canonical Combining Class, the Age,
1328	* and decompositions for a code point.
1329	*
1330	* For the Age, we store the index into the ages table. Effectively
1331	* this is a generation number that the table maps to a unicode
1332	* version.
1333	*
1334	* The correction field is used to indicate that this entry is in the
1335	* corrections array, which contains decompositions that were
1336	* corrected in later revisions. The value of the correction field is
1337	* the Unicode version in which the mapping was corrected.
1338	*/
1339	struct unicode_data {
1340	unsigned int code;
1341	int ccc;
1342	int gen;
1343	int correction;
1344	unsigned int *utf32nfdi;
1345	unsigned int *utf32nfdicf;
1346	char *utf8nfdi;
1347	char *utf8nfdicf;
1348	};
1349
1350	struct unicode_data unicode_data[`0x110000`];
1351	struct unicode_data *corrections;
1352	int corrections_count;
1353
1354	struct tree *nfdi_tree;
1355	struct tree *nfdicf_tree;
1356
1357	struct tree *trees;
1358	int trees_count;
1359
1360	/*
1361	* Check the corrections array to see if this entry was corrected at
1362	* some point.
1363	*/
1364	static struct unicode_data corrections_lookup(struct* unicode_data *u)
1365	{
1366	int i;
1367
1368	for (i = `0`; i != corrections_count; i++)
1369	if (u->code == corrections[i].code)
1370	return &corrections[i];
1371	return u;
1372	}
1373
1374	static int nfdi_equal(void l, void* *r)
1375	{
1376	struct unicode_data *left = l;
1377	struct unicode_data *right = r;
1378
1379	if (left->gen != right->gen)
1380	return `0`;
1381	if (left->ccc != right->ccc)
1382	return `0`;
1383	if (left->utf8nfdi && right->utf8nfdi &&
1384	strcmp(left->utf8nfdi, right->utf8nfdi) == `0`)
1385	return `1`;
1386	if (left->utf8nfdi \|\| right->utf8nfdi)
1387	return `0`;
1388	return `1`;
1389	}
1390
1391	static int nfdicf_equal(void l, void* *r)
1392	{
1393	struct unicode_data *left = l;
1394	struct unicode_data *right = r;
1395
1396	if (left->gen != right->gen)
1397	return `0`;
1398	if (left->ccc != right->ccc)
1399	return `0`;
1400	if (left->utf8nfdicf && right->utf8nfdicf &&
1401	strcmp(left->utf8nfdicf, right->utf8nfdicf) == `0`)
1402	return `1`;
1403	if (left->utf8nfdicf && right->utf8nfdicf)
1404	return `0`;
1405	if (left->utf8nfdicf \|\| right->utf8nfdicf)
1406	return `0`;
1407	if (left->utf8nfdi && right->utf8nfdi &&
1408	strcmp(left->utf8nfdi, right->utf8nfdi) == `0`)
1409	return `1`;
1410	if (left->utf8nfdi \|\| right->utf8nfdi)
1411	return `0`;
1412	return `1`;
1413	}
1414
1415	static void nfdi_print(void l, int* indent)
1416	{
1417	struct unicode_data *leaf = l;
1418
1419	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1420	leaf->code, leaf->ccc, leaf->gen);
1421
1422	if (leaf->utf8nfdi && leaf->utf8nfdi[`0`] == HANGUL)
1423	printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424	else if (leaf->utf8nfdi)
1425	printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1426
1427	printf("\n");
1428	}
1429
1430	static void nfdicf_print(void l, int* indent)
1431	{
1432	struct unicode_data *leaf = l;
1433
1434	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1435	leaf->code, leaf->ccc, leaf->gen);
1436
1437	if (leaf->utf8nfdicf)
1438	printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1439	else if (leaf->utf8nfdi && leaf->utf8nfdi[`0`] == HANGUL)
1440	printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441	else if (leaf->utf8nfdi)
1442	printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1443	printf("\n");
1444	}
1445
1446	static int nfdi_mark(void *l)
1447	{
1448	return `1`;
1449	}
1450
1451	static int nfdicf_mark(void *l)
1452	{
1453	struct unicode_data *leaf = l;
1454
1455	if (leaf->utf8nfdicf)
1456	return `1`;
1457	return `0`;
1458	}
1459
1460	static int correction_mark(void *l)
1461	{
1462	struct unicode_data *leaf = l;
1463
1464	return leaf->correction;
1465	}
1466
1467	static int nfdi_size(void *l)
1468	{
1469	struct unicode_data *leaf = l;
1470	int size = `2`;
1471
1472	if (HANGUL_SYLLABLE(leaf->code))
1473	size += `1`;
1474	else if (leaf->utf8nfdi)
1475	size += strlen(leaf->utf8nfdi) + `1`;
1476	return size;
1477	}
1478
1479	static int nfdicf_size(void *l)
1480	{
1481	struct unicode_data *leaf = l;
1482	int size = `2`;
1483
1484	if (HANGUL_SYLLABLE(leaf->code))
1485	size += `1`;
1486	else if (leaf->utf8nfdicf)
1487	size += strlen(leaf->utf8nfdicf) + `1`;
1488	else if (leaf->utf8nfdi)
1489	size += strlen(leaf->utf8nfdi) + `1`;
1490	return size;
1491	}
1492
1493	static int nfdi_index(struct* tree tree, void* *l)
1494	{
1495	struct unicode_data *leaf = l;
1496
1497	return &tree->leafindex[leaf->code];
1498	}
1499
1500	static int nfdicf_index(struct* tree tree, void* *l)
1501	{
1502	struct unicode_data *leaf = l;
1503
1504	return &tree->leafindex[leaf->code];
1505	}
1506
1507	static unsigned char nfdi_emit(void* l, unsigned* char *data)
1508	{
1509	struct unicode_data *leaf = l;
1510	unsigned char *s;
1511
1512	*data++ = leaf->gen;
1513
1514	if (HANGUL_SYLLABLE(leaf->code)) {
1515	*data++ = DECOMPOSE;
1516	*data++ = HANGUL;
1517	} else if (leaf->utf8nfdi) {
1518	*data++ = DECOMPOSE;
1519	s = (unsigned char*)leaf->utf8nfdi;
1520	while ((data++ = s++) != `0`)
1521	;
1522	} else {
1523	*data++ = leaf->ccc;
1524	}
1525	return data;
1526	}
1527
1528	static unsigned char nfdicf_emit(void* l, unsigned* char *data)
1529	{
1530	struct unicode_data *leaf = l;
1531	unsigned char *s;
1532
1533	*data++ = leaf->gen;
1534
1535	if (HANGUL_SYLLABLE(leaf->code)) {
1536	*data++ = DECOMPOSE;
1537	*data++ = HANGUL;
1538	} else if (leaf->utf8nfdicf) {
1539	*data++ = DECOMPOSE;
1540	s = (unsigned char*)leaf->utf8nfdicf;
1541	while ((data++ = s++) != `0`)
1542	;
1543	} else if (leaf->utf8nfdi) {
1544	*data++ = DECOMPOSE;
1545	s = (unsigned char*)leaf->utf8nfdi;
1546	while ((data++ = s++) != `0`)
1547	;
1548	} else {
1549	*data++ = leaf->ccc;
1550	}
1551	return data;
1552	}
1553
1554	static void utf8_create(struct unicode_data *data)
1555	{
1556	char utf[`18`*`4`+`1`];
1557	char *u;
1558	unsigned int *um;
1559	int i;
1560
1561	if (data->utf8nfdi) {
1562	assert(data->utf8nfdi[`0`] == HANGUL);
1563	return;
1564	}
1565
1566	u = utf;
1567	um = data->utf32nfdi;
1568	if (um) {
1569	for (i = `0`; um[i]; i++)
1570	u += utf8encode(str: u, val: um[i]);
1571	*u = `'\0'`;
1572	data->utf8nfdi = strdup(utf);
1573	}
1574	u = utf;
1575	um = data->utf32nfdicf;
1576	if (um) {
1577	for (i = `0`; um[i]; i++)
1578	u += utf8encode(str: u, val: um[i]);
1579	*u = `'\0'`;
1580	if (!data->utf8nfdi \|\| strcmp(data->utf8nfdi, utf))
1581	data->utf8nfdicf = strdup(utf);
1582	}
1583	}
1584
1585	static void utf8_init(void)
1586	{
1587	unsigned int unichar;
1588	int i;
1589
1590	for (unichar = `0`; unichar != `0x110000`; unichar++)
1591	utf8_create(data: &unicode_data[unichar]);
1592
1593	for (i = `0`; i != corrections_count; i++)
1594	utf8_create(data: &corrections[i]);
1595	}
1596
1597	static void trees_init(void)
1598	{
1599	struct unicode_data *data;
1600	unsigned int maxage;
1601	unsigned int nextage;
1602	int count;
1603	int i;
1604	int j;
1605
1606	/ Count the number of different ages. /
1607	count = `0`;
1608	nextage = (unsigned int)-`1`;
1609	do {
1610	maxage = nextage;
1611	nextage = `0`;
1612	for (i = `0`; i <= corrections_count; i++) {
1613	data = &corrections[i];
1614	if (nextage < data->correction &&
1615	data->correction < maxage)
1616	nextage = data->correction;
1617	}
1618	count++;
1619	} while (nextage);
1620
1621	/ Two trees per age: nfdi and nfdicf /
1622	trees_count = count * `2`;
1623	trees = calloc(trees_count, sizeof(struct tree));
1624
1625	/ Assign ages to the trees. /
1626	count = trees_count;
1627	nextage = (unsigned int)-`1`;
1628	do {
1629	maxage = nextage;
1630	trees[--count].maxage = maxage;
1631	trees[--count].maxage = maxage;
1632	nextage = `0`;
1633	for (i = `0`; i <= corrections_count; i++) {
1634	data = &corrections[i];
1635	if (nextage < data->correction &&
1636	data->correction < maxage)
1637	nextage = data->correction;
1638	}
1639	} while (nextage);
1640
1641	/ The ages assigned above are off by one. /
1642	for (i = `0`; i != trees_count; i++) {
1643	j = `0`;
1644	while (ages[j] < trees[i].maxage)
1645	j++;
1646	trees[i].maxage = ages[j-`1`];
1647	}
1648
1649	/ Set up the forwarding between trees. /
1650	trees[trees_count-`2`].next = &trees[trees_count-`1`];
1651	trees[trees_count-`1`].leaf_mark = nfdi_mark;
1652	trees[trees_count-`2`].leaf_mark = nfdicf_mark;
1653	for (i = `0`; i != trees_count-`2`; i += `2`) {
1654	trees[i].next = &trees[trees_count-`2`];
1655	trees[i].leaf_mark = correction_mark;
1656	trees[i+`1`].next = &trees[trees_count-`1`];
1657	trees[i+`1`].leaf_mark = correction_mark;
1658	}
1659
1660	/ Assign the callouts. /
1661	for (i = `0`; i != trees_count; i += `2`) {
1662	trees[i].type = "nfdicf";
1663	trees[i].leaf_equal = nfdicf_equal;
1664	trees[i].leaf_print = nfdicf_print;
1665	trees[i].leaf_size = nfdicf_size;
1666	trees[i].leaf_index = nfdicf_index;
1667	trees[i].leaf_emit = nfdicf_emit;
1668
1669	trees[i+`1`].type = "nfdi";
1670	trees[i+`1`].leaf_equal = nfdi_equal;
1671	trees[i+`1`].leaf_print = nfdi_print;
1672	trees[i+`1`].leaf_size = nfdi_size;
1673	trees[i+`1`].leaf_index = nfdi_index;
1674	trees[i+`1`].leaf_emit = nfdi_emit;
1675	}
1676
1677	/ Finish init. /
1678	for (i = `0`; i != trees_count; i++)
1679	trees[i].childnode = NODE;
1680	}
1681
1682	static void trees_populate(void)
1683	{
1684	struct unicode_data *data;
1685	unsigned int unichar;
1686	char keyval[`4`];
1687	int keylen;
1688	int i;
1689
1690	for (i = `0`; i != trees_count; i++) {
1691	if (verbose > `0`) {
1692	printf("Populating %s_%x\n",
1693	trees[i].type, trees[i].maxage);
1694	}
1695	for (unichar = `0`; unichar != `0x110000`; unichar++) {
1696	if (unicode_data[unichar].gen < `0`)
1697	continue;
1698	keylen = utf8encode(str: keyval, val: unichar);
1699	data = corrections_lookup(u: &unicode_data[unichar]);
1700	if (data->correction <= trees[i].maxage)
1701	data = &unicode_data[unichar];
1702	insert(tree: &trees[i], key: keyval, keylen, leaf: data);
1703	}
1704	}
1705	}
1706
1707	static void trees_reduce(void)
1708	{
1709	int i;
1710	int size;
1711	int changed;
1712
1713	for (i = `0`; i != trees_count; i++)
1714	prune(tree: &trees[i]);
1715	for (i = `0`; i != trees_count; i++)
1716	mark_nodes(tree: &trees[i]);
1717	do {
1718	size = `0`;
1719	for (i = `0`; i != trees_count; i++)
1720	size = index_nodes(tree: &trees[i], index: size);
1721	changed = `0`;
1722	for (i = `0`; i != trees_count; i++)
1723	changed += size_nodes(tree: &trees[i]);
1724	} while (changed);
1725
1726	utf8data = calloc(size, `1`);
1727	utf8data_size = size;
1728	for (i = `0`; i != trees_count; i++)
1729	emit(tree: &trees[i], data: utf8data);
1730
1731	if (verbose > `0`) {
1732	for (i = `0`; i != trees_count; i++) {
1733	printf("%s_%x idx %d\n",
1734	trees[i].type, trees[i].maxage, trees[i].index);
1735	}
1736	}
1737
1738	nfdi = utf8data + trees[trees_count-`1`].index;
1739	nfdicf = utf8data + trees[trees_count-`2`].index;
1740
1741	nfdi_tree = &trees[trees_count-`1`];
1742	nfdicf_tree = &trees[trees_count-`2`];
1743	}
1744
1745	static void verify(struct tree *tree)
1746	{
1747	struct unicode_data *data;
1748	utf8leaf_t *leaf;
1749	unsigned int unichar;
1750	char key[`4`];
1751	unsigned char hangul[UTF8HANGULLEAF];
1752	int report;
1753	int nocf;
1754
1755	if (verbose > `0`)
1756	printf("Verifying %s_%x\n", tree->type, tree->maxage);
1757	nocf = strcmp(tree->type, "nfdicf");
1758
1759	for (unichar = `0`; unichar != `0x110000`; unichar++) {
1760	report = `0`;
1761	data = corrections_lookup(u: &unicode_data[unichar]);
1762	if (data->correction <= tree->maxage)
1763	data = &unicode_data[unichar];
1764	utf8encode(str: key,val: unichar);
1765	leaf = utf8lookup(tree, hangul, key);
1766
1767	if (!leaf) {
1768	if (data->gen != -`1`)
1769	report++;
1770	if (unichar < `0xd800` \|\| unichar > `0xdfff`)
1771	report++;
1772	} else {
1773	if (unichar >= `0xd800` && unichar <= `0xdfff`)
1774	report++;
1775	if (data->gen == -`1`)
1776	report++;
1777	if (data->gen != LEAF_GEN(leaf))
1778	report++;
1779	if (LEAF_CCC(leaf) == DECOMPOSE) {
1780	if (HANGUL_SYLLABLE(data->code)) {
1781	if (data->utf8nfdi[`0`] != HANGUL)
1782	report++;
1783	} else if (nocf) {
1784	if (!data->utf8nfdi) {
1785	report++;
1786	} else if (strcmp(data->utf8nfdi,
1787	LEAF_STR(leaf))) {
1788	report++;
1789	}
1790	} else {
1791	if (!data->utf8nfdicf &&
1792	!data->utf8nfdi) {
1793	report++;
1794	} else if (data->utf8nfdicf) {
1795	if (strcmp(data->utf8nfdicf,
1796	LEAF_STR(leaf)))
1797	report++;
1798	} else if (strcmp(data->utf8nfdi,
1799	LEAF_STR(leaf))) {
1800	report++;
1801	}
1802	}
1803	} else if (data->ccc != LEAF_CCC(leaf)) {
1804	report++;
1805	}
1806	}
1807	if (report) {
1808	printf("%X code %X gen %d ccc %d"
1809	" nfdi -> \"%s\"",
1810	unichar, data->code, data->gen,
1811	data->ccc,
1812	data->utf8nfdi);
1813	if (leaf) {
1814	printf(" gen %d ccc %d"
1815	" nfdi -> \"%s\"",
1816	LEAF_GEN(leaf),
1817	LEAF_CCC(leaf),
1818	LEAF_CCC(leaf) == DECOMPOSE ?
1819	LEAF_STR(leaf) : "");
1820	}
1821	printf("\n");
1822	}
1823	}
1824	}
1825
1826	static void trees_verify(void)
1827	{
1828	int i;
1829
1830	for (i = `0`; i != trees_count; i++)
1831	verify(tree: &trees[i]);
1832	}
1833
1834	/ ------------------------------------------------------------------ /
1835
1836	static void help(void)
1837	{
1838	printf("Usage: %s [options]\n", argv0);
1839	printf("\n");
1840	printf("This program creates an a data trie used for parsing and\n");
1841	printf("normalization of UTF-8 strings. The trie is derived from\n");
1842	printf("a set of input files from the Unicode character database\n");
1843	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1844	printf("\n");
1845	printf("The generated tree supports two normalization forms:\n");
1846	printf("\n");
1847	printf("\tnfdi:\n");
1848	printf("\t- Apply unicode normalization form NFD.\n");
1849	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1850	printf("\n");
1851	printf("\tnfdicf:\n");
1852	printf("\t- Apply unicode normalization form NFD.\n");
1853	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854	printf("\t- Apply a full casefold (C + F).\n");
1855	printf("\n");
1856	printf("These forms were chosen as being most useful when dealing\n");
1857	printf("with file names: NFD catches most cases where characters\n");
1858	printf("should be considered equivalent. The ignorables are mostly\n");
1859	printf("invisible, making names hard to type.\n");
1860	printf("\n");
1861	printf("The options to specify the files to be used are listed\n");
1862	printf("below with their default values, which are the names used\n");
1863	printf("by version 11.0.0 of the Unicode Character Database.\n");
1864	printf("\n");
1865	printf("The input files:\n");
1866	printf("\t-a %s\n", AGE_NAME);
1867	printf("\t-c %s\n", CCC_NAME);
1868	printf("\t-p %s\n", PROP_NAME);
1869	printf("\t-d %s\n", DATA_NAME);
1870	printf("\t-f %s\n", FOLD_NAME);
1871	printf("\t-n %s\n", NORM_NAME);
1872	printf("\n");
1873	printf("Additionally, the generated tables are tested using:\n");
1874	printf("\t-t %s\n", TEST_NAME);
1875	printf("\n");
1876	printf("Finally, the output file:\n");
1877	printf("\t-o %s\n", UTF8_NAME);
1878	printf("\n");
1879	}
1880
1881	static void usage(void)
1882	{
1883	help();
1884	exit(`1`);
1885	}
1886
1887	static void open_fail(const char name, int* error)
1888	{
1889	printf("Error %d opening %s: %s\n", error, name, strerror(error));
1890	exit(`1`);
1891	}
1892
1893	static void file_fail(const char *filename)
1894	{
1895	printf("Error parsing %s\n", filename);
1896	exit(`1`);
1897	}
1898
1899	static void line_fail(const char filename, const* char *line)
1900	{
1901	printf("Error parsing %s:%s\n", filename, line);
1902	exit(`1`);
1903	}
1904
1905	/ ------------------------------------------------------------------ /
1906
1907	static void print_utf32(unsigned int *utf32str)
1908	{
1909	int i;
1910
1911	for (i = `0`; utf32str[i]; i++)
1912	printf(" %X", utf32str[i]);
1913	}
1914
1915	static void print_utf32nfdi(unsigned int unichar)
1916	{
1917	printf(" %X ->", unichar);
1918	print_utf32(utf32str: unicode_data[unichar].utf32nfdi);
1919	printf("\n");
1920	}
1921
1922	static void print_utf32nfdicf(unsigned int unichar)
1923	{
1924	printf(" %X ->", unichar);
1925	print_utf32(utf32str: unicode_data[unichar].utf32nfdicf);
1926	printf("\n");
1927	}
1928
1929	/ ------------------------------------------------------------------ /
1930
1931	static void age_init(void)
1932	{
1933	FILE *file;
1934	unsigned int first;
1935	unsigned int last;
1936	unsigned int unichar;
1937	unsigned int major;
1938	unsigned int minor;
1939	unsigned int revision;
1940	int gen;
1941	int count;
1942	int ret;
1943
1944	if (verbose > `0`)
1945	printf("Parsing %s\n", age_name);
1946
1947	file = fopen(age_name, "r");
1948	if (!file)
1949	open_fail(name: age_name, error: errno);
1950	count = `0`;
1951
1952	gen = `0`;
1953	while (fgets(line, LINESIZE, file)) {
1954	ret = sscanf(line, "# Age=V%d_%d_%d",
1955	&major, &minor, &revision);
1956	if (ret == `3`) {
1957	ages_count++;
1958	if (verbose > `1`)
1959	printf(" Age V%d_%d_%d\n",
1960	major, minor, revision);
1961	if (!age_valid(major, minor, revision))
1962	line_fail(filename: age_name, line);
1963	continue;
1964	}
1965	ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1966	if (ret == `2`) {
1967	ages_count++;
1968	if (verbose > `1`)
1969	printf(" Age V%d_%d\n", major, minor);
1970	if (!age_valid(major, minor, revision: `0`))
1971	line_fail(filename: age_name, line);
1972	continue;
1973	}
1974	}
1975
1976	/ We must have found something above. /
1977	if (verbose > `1`)
1978	printf("%d age entries\n", ages_count);
1979	if (ages_count == `0` \|\| ages_count > MAXGEN)
1980	file_fail(filename: age_name);
1981
1982	/ There is a 0 entry. /
1983	ages_count++;
1984	ages = calloc(ages_count + `1`, sizeof(*ages));
1985	/ And a guard entry. /
1986	ages[ages_count] = (unsigned int)-`1`;
1987
1988	rewind(file);
1989	count = `0`;
1990	gen = `0`;
1991	while (fgets(line, LINESIZE, file)) {
1992	ret = sscanf(line, "# Age=V%d_%d_%d",
1993	&major, &minor, &revision);
1994	if (ret == `3`) {
1995	ages[++gen] =
1996	UNICODE_AGE(major, minor, revision);
1997	if (verbose > `1`)
1998	printf(" Age V%d_%d_%d = gen %d\n",
1999	major, minor, revision, gen);
2000	if (!age_valid(major, minor, revision))
2001	line_fail(filename: age_name, line);
2002	continue;
2003	}
2004	ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
2005	if (ret == `2`) {
2006	ages[++gen] = UNICODE_AGE(major, minor, `0`);
2007	if (verbose > `1`)
2008	printf(" Age V%d_%d = %d\n",
2009	major, minor, gen);
2010	if (!age_valid(major, minor, revision: `0`))
2011	line_fail(filename: age_name, line);
2012	continue;
2013	}
2014	ret = sscanf(line, "%X..%X ; %d.%d #",
2015	&first, &last, &major, &minor);
2016	if (ret == `4`) {
2017	for (unichar = first; unichar <= last; unichar++)
2018	unicode_data[unichar].gen = gen;
2019	count += `1` + last - first;
2020	if (verbose > `1`)
2021	printf(" %X..%X gen %d\n", first, last, gen);
2022	if (!utf32valid(unichar: first) \|\| !utf32valid(unichar: last))
2023	line_fail(filename: age_name, line);
2024	continue;
2025	}
2026	ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2027	if (ret == `3`) {
2028	unicode_data[unichar].gen = gen;
2029	count++;
2030	if (verbose > `1`)
2031	printf(" %X gen %d\n", unichar, gen);
2032	if (!utf32valid(unichar))
2033	line_fail(filename: age_name, line);
2034	continue;
2035	}
2036	}
2037	unicode_maxage = ages[gen];
2038	fclose(file);
2039
2040	/ Nix surrogate block /
2041	if (verbose > `1`)
2042	printf(" Removing surrogate block D800..DFFF\n");
2043	for (unichar = `0xd800`; unichar <= `0xdfff`; unichar++)
2044	unicode_data[unichar].gen = -`1`;
2045
2046	if (verbose > `0`)
2047	printf("Found %d entries\n", count);
2048	if (count == `0`)
2049	file_fail(filename: age_name);
2050	}
2051
2052	static void ccc_init(void)
2053	{
2054	FILE *file;
2055	unsigned int first;
2056	unsigned int last;
2057	unsigned int unichar;
2058	unsigned int value;
2059	int count;
2060	int ret;
2061
2062	if (verbose > `0`)
2063	printf("Parsing %s\n", ccc_name);
2064
2065	file = fopen(ccc_name, "r");
2066	if (!file)
2067	open_fail(name: ccc_name, error: errno);
2068
2069	count = `0`;
2070	while (fgets(line, LINESIZE, file)) {
2071	ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2072	if (ret == `3`) {
2073	for (unichar = first; unichar <= last; unichar++) {
2074	unicode_data[unichar].ccc = value;
2075	count++;
2076	}
2077	if (verbose > `1`)
2078	printf(" %X..%X ccc %d\n", first, last, value);
2079	if (!utf32valid(unichar: first) \|\| !utf32valid(unichar: last))
2080	line_fail(filename: ccc_name, line);
2081	continue;
2082	}
2083	ret = sscanf(line, "%X ; %d #", &unichar, &value);
2084	if (ret == `2`) {
2085	unicode_data[unichar].ccc = value;
2086	count++;
2087	if (verbose > `1`)
2088	printf(" %X ccc %d\n", unichar, value);
2089	if (!utf32valid(unichar))
2090	line_fail(filename: ccc_name, line);
2091	continue;
2092	}
2093	}
2094	fclose(file);
2095
2096	if (verbose > `0`)
2097	printf("Found %d entries\n", count);
2098	if (count == `0`)
2099	file_fail(filename: ccc_name);
2100	}
2101
2102	static int ignore_compatibility_form(char *type)
2103	{
2104	int i;
2105	char *ignored_types[] = {"font", "noBreak", "initial", "medial",
2106	"final", "isolated", "circle", "super",
2107	"sub", "vertical", "wide", "narrow",
2108	"small", "square", "fraction", "compat"};
2109
2110	for (i = `0` ; i < ARRAY_SIZE(ignored_types); i++)
2111	if (strcmp(type, ignored_types[i]) == `0`)
2112	return `1`;
2113	return `0`;
2114	}
2115
2116	static void nfdi_init(void)
2117	{
2118	FILE *file;
2119	unsigned int unichar;
2120	unsigned int mapping[`19`]; / Magic - guaranteed not to be exceeded. /
2121	char *s;
2122	char *type;
2123	unsigned int *um;
2124	int count;
2125	int i;
2126	int ret;
2127
2128	if (verbose > `0`)
2129	printf("Parsing %s\n", data_name);
2130	file = fopen(data_name, "r");
2131	if (!file)
2132	open_fail(name: data_name, error: errno);
2133
2134	count = `0`;
2135	while (fgets(line, LINESIZE, file)) {
2136	ret = sscanf(line, "%X;%[^;];%[^;];%[^;];%[^;];%[^;];",
2137	&unichar, buf0);
2138	if (ret != `2`)
2139	continue;
2140	if (!utf32valid(unichar))
2141	line_fail(filename: data_name, line);
2142
2143	s = buf0;
2144	/ skip over <tag> /
2145	if (*s == `'<'`) {
2146	type = ++s;
2147	while (*++s != `'>'`);
2148	*s++ = `'\0'`;
2149	if(ignore_compatibility_form(type))
2150	continue;
2151	}
2152	/ decode the decomposition into UTF-32 /
2153	i = `0`;
2154	while (*s) {
2155	mapping[i] = strtoul(s, &s, `16`);
2156	if (!utf32valid(unichar: mapping[i]))
2157	line_fail(filename: data_name, line);
2158	i++;
2159	}
2160	mapping[i++] = `0`;
2161
2162	um = malloc(i * sizeof(unsigned int));
2163	memcpy(um, mapping, i * sizeof(unsigned int));
2164	unicode_data[unichar].utf32nfdi = um;
2165
2166	if (verbose > `1`)
2167	print_utf32nfdi(unichar);
2168	count++;
2169	}
2170	fclose(file);
2171	if (verbose > `0`)
2172	printf("Found %d entries\n", count);
2173	if (count == `0`)
2174	file_fail(filename: data_name);
2175	}
2176
2177	static void nfdicf_init(void)
2178	{
2179	FILE *file;
2180	unsigned int unichar;
2181	unsigned int mapping[`19`]; / Magic - guaranteed not to be exceeded. /
2182	char status;
2183	char *s;
2184	unsigned int *um;
2185	int i;
2186	int count;
2187	int ret;
2188
2189	if (verbose > `0`)
2190	printf("Parsing %s\n", fold_name);
2191	file = fopen(fold_name, "r");
2192	if (!file)
2193	open_fail(name: fold_name, error: errno);
2194
2195	count = `0`;
2196	while (fgets(line, LINESIZE, file)) {
2197	ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2198	if (ret != `3`)
2199	continue;
2200	if (!utf32valid(unichar))
2201	line_fail(filename: fold_name, line);
2202	/ Use the C+F casefold. /
2203	if (status != `'C'` && status != `'F'`)
2204	continue;
2205	s = buf0;
2206	if (*s == `'<'`)
2207	while (*s++ != `' '`)
2208	;
2209	i = `0`;
2210	while (*s) {
2211	mapping[i] = strtoul(s, &s, `16`);
2212	if (!utf32valid(unichar: mapping[i]))
2213	line_fail(filename: fold_name, line);
2214	i++;
2215	}
2216	mapping[i++] = `0`;
2217
2218	um = malloc(i * sizeof(unsigned int));
2219	memcpy(um, mapping, i * sizeof(unsigned int));
2220	unicode_data[unichar].utf32nfdicf = um;
2221
2222	if (verbose > `1`)
2223	print_utf32nfdicf(unichar);
2224	count++;
2225	}
2226	fclose(file);
2227	if (verbose > `0`)
2228	printf("Found %d entries\n", count);
2229	if (count == `0`)
2230	file_fail(filename: fold_name);
2231	}
2232
2233	static void ignore_init(void)
2234	{
2235	FILE *file;
2236	unsigned int unichar;
2237	unsigned int first;
2238	unsigned int last;
2239	unsigned int *um;
2240	int count;
2241	int ret;
2242
2243	if (verbose > `0`)
2244	printf("Parsing %s\n", prop_name);
2245	file = fopen(prop_name, "r");
2246	if (!file)
2247	open_fail(name: prop_name, error: errno);
2248	assert(file);
2249	count = `0`;
2250	while (fgets(line, LINESIZE, file)) {
2251	ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2252	if (ret == `3`) {
2253	if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2254	continue;
2255	if (!utf32valid(unichar: first) \|\| !utf32valid(unichar: last))
2256	line_fail(filename: prop_name, line);
2257	for (unichar = first; unichar <= last; unichar++) {
2258	free(unicode_data[unichar].utf32nfdi);
2259	um = malloc(sizeof(unsigned int));
2260	*um = `0`;
2261	unicode_data[unichar].utf32nfdi = um;
2262	free(unicode_data[unichar].utf32nfdicf);
2263	um = malloc(sizeof(unsigned int));
2264	*um = `0`;
2265	unicode_data[unichar].utf32nfdicf = um;
2266	count++;
2267	}
2268	if (verbose > `1`)
2269	printf(" %X..%X Default_Ignorable_Code_Point\n",
2270	first, last);
2271	continue;
2272	}
2273	ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2274	if (ret == `2`) {
2275	if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2276	continue;
2277	if (!utf32valid(unichar))
2278	line_fail(filename: prop_name, line);
2279	free(unicode_data[unichar].utf32nfdi);
2280	um = malloc(sizeof(unsigned int));
2281	*um = `0`;
2282	unicode_data[unichar].utf32nfdi = um;
2283	free(unicode_data[unichar].utf32nfdicf);
2284	um = malloc(sizeof(unsigned int));
2285	*um = `0`;
2286	unicode_data[unichar].utf32nfdicf = um;
2287	if (verbose > `1`)
2288	printf(" %X Default_Ignorable_Code_Point\n",
2289	unichar);
2290	count++;
2291	continue;
2292	}
2293	}
2294	fclose(file);
2295
2296	if (verbose > `0`)
2297	printf("Found %d entries\n", count);
2298	if (count == `0`)
2299	file_fail(filename: prop_name);
2300	}
2301
2302	static void corrections_init(void)
2303	{
2304	FILE *file;
2305	unsigned int unichar;
2306	unsigned int major;
2307	unsigned int minor;
2308	unsigned int revision;
2309	unsigned int age;
2310	unsigned int *um;
2311	unsigned int mapping[`19`]; / Magic - guaranteed not to be exceeded. /
2312	char *s;
2313	int i;
2314	int count;
2315	int ret;
2316
2317	if (verbose > `0`)
2318	printf("Parsing %s\n", norm_name);
2319	file = fopen(norm_name, "r");
2320	if (!file)
2321	open_fail(norm_name, errno);
2322
2323	count = `0`;
2324	while (fgets(line, LINESIZE, file)) {
2325	ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2326	&unichar, buf0, buf1,
2327	&major, &minor, &revision);
2328	if (ret != `6`)
2329	continue;
2330	if (!utf32valid(unichar) \|\| !age_valid(major, minor, revision))
2331	line_fail(filename: norm_name, line);
2332	count++;
2333	}
2334	corrections = calloc(count, sizeof(struct unicode_data));
2335	corrections_count = count;
2336	rewind(file);
2337
2338	count = `0`;
2339	while (fgets(line, LINESIZE, file)) {
2340	ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2341	&unichar, buf0, buf1,
2342	&major, &minor, &revision);
2343	if (ret != `6`)
2344	continue;
2345	if (!utf32valid(unichar) \|\| !age_valid(major, minor, revision))
2346	line_fail(filename: norm_name, line);
2347	corrections[count] = unicode_data[unichar];
2348	assert(corrections[count].code == unichar);
2349	age = UNICODE_AGE(major, minor, revision);
2350	corrections[count].correction = age;
2351
2352	i = `0`;
2353	s = buf0;
2354	while (*s) {
2355	mapping[i] = strtoul(s, &s, `16`);
2356	if (!utf32valid(unichar: mapping[i]))
2357	line_fail(filename: norm_name, line);
2358	i++;
2359	}
2360	mapping[i++] = `0`;
2361
2362	um = malloc(i * sizeof(unsigned int));
2363	memcpy(um, mapping, i * sizeof(unsigned int));
2364	corrections[count].utf32nfdi = um;
2365
2366	if (verbose > `1`)
2367	printf(" %X -> %s -> %s V%d_%d_%d\n",
2368	unichar, buf0, buf1, major, minor, revision);
2369	count++;
2370	}
2371	fclose(file);
2372
2373	if (verbose > `0`)
2374	printf("Found %d entries\n", count);
2375	if (count == `0`)
2376	file_fail(filename: norm_name);
2377	}
2378
2379	/ ------------------------------------------------------------------ /
2380
2381	/*
2382	* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2383	*
2384	* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2385	* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2386	*
2387	* SBase = 0xAC00
2388	* LBase = 0x1100
2389	* VBase = 0x1161
2390	* TBase = 0x11A7
2391	* LCount = 19
2392	* VCount = 21
2393	* TCount = 28
2394	* NCount = 588 (VCount * TCount)
2395	* SCount = 11172 (LCount * NCount)
2396	*
2397	* Decomposition:
2398	* SIndex = s - SBase
2399	*
2400	* LV (Canonical/Full)
2401	* LIndex = SIndex / NCount
2402	* VIndex = (Sindex % NCount) / TCount
2403	* LPart = LBase + LIndex
2404	* VPart = VBase + VIndex
2405	*
2406	* LVT (Canonical)
2407	* LVIndex = (SIndex / TCount) * TCount
2408	* TIndex = (Sindex % TCount)
2409	* LVPart = SBase + LVIndex
2410	* TPart = TBase + TIndex
2411	*
2412	* LVT (Full)
2413	* LIndex = SIndex / NCount
2414	* VIndex = (Sindex % NCount) / TCount
2415	* TIndex = (Sindex % TCount)
2416	* LPart = LBase + LIndex
2417	* VPart = VBase + VIndex
2418	* if (TIndex == 0) {
2419	* d = <LPart, VPart>
2420	* } else {
2421	* TPart = TBase + TIndex
2422	* d = <LPart, VPart, TPart>
2423	* }
2424	*
2425	*/
2426
2427	static void hangul_decompose(void)
2428	{
2429	unsigned int sb = `0xAC00`;
2430	unsigned int lb = `0x1100`;
2431	unsigned int vb = `0x1161`;
2432	unsigned int tb = `0x11a7`;
2433	/ unsigned int lc = 19; /
2434	unsigned int vc = `21`;
2435	unsigned int tc = `28`;
2436	unsigned int nc = (vc * tc);
2437	/ unsigned int sc = (lc * nc); /
2438	unsigned int unichar;
2439	unsigned int mapping[`4`];
2440	unsigned int *um;
2441	int count;
2442	int i;
2443
2444	if (verbose > `0`)
2445	printf("Decomposing hangul\n");
2446	/ Hangul /
2447	count = `0`;
2448	for (unichar = `0xAC00`; unichar <= `0xD7A3`; unichar++) {
2449	unsigned int si = unichar - sb;
2450	unsigned int li = si / nc;
2451	unsigned int vi = (si % nc) / tc;
2452	unsigned int ti = si % tc;
2453
2454	i = `0`;
2455	mapping[i++] = lb + li;
2456	mapping[i++] = vb + vi;
2457	if (ti)
2458	mapping[i++] = tb + ti;
2459	mapping[i++] = `0`;
2460
2461	assert(!unicode_data[unichar].utf32nfdi);
2462	um = malloc(i * sizeof(unsigned int));
2463	memcpy(um, mapping, i * sizeof(unsigned int));
2464	unicode_data[unichar].utf32nfdi = um;
2465
2466	assert(!unicode_data[unichar].utf32nfdicf);
2467	um = malloc(i * sizeof(unsigned int));
2468	memcpy(um, mapping, i * sizeof(unsigned int));
2469	unicode_data[unichar].utf32nfdicf = um;
2470
2471	/*
2472	* Add a cookie as a reminder that the hangul syllable
2473	* decompositions must not be stored in the generated
2474	* trie.
2475	*/
2476	unicode_data[unichar].utf8nfdi = malloc(`2`);
2477	unicode_data[unichar].utf8nfdi[`0`] = HANGUL;
2478	unicode_data[unichar].utf8nfdi[`1`] = `'\0'`;
2479
2480	if (verbose > `1`)
2481	print_utf32nfdi(unichar);
2482
2483	count++;
2484	}
2485	if (verbose > `0`)
2486	printf("Created %d entries\n", count);
2487	}
2488
2489	static void nfdi_decompose(void)
2490	{
2491	unsigned int unichar;
2492	unsigned int mapping[`19`]; / Magic - guaranteed not to be exceeded. /
2493	unsigned int *um;
2494	unsigned int *dc;
2495	int count;
2496	int i;
2497	int j;
2498	int ret;
2499
2500	if (verbose > `0`)
2501	printf("Decomposing nfdi\n");
2502
2503	count = `0`;
2504	for (unichar = `0`; unichar != `0x110000`; unichar++) {
2505	if (!unicode_data[unichar].utf32nfdi)
2506	continue;
2507	for (;;) {
2508	ret = `1`;
2509	i = `0`;
2510	um = unicode_data[unichar].utf32nfdi;
2511	while (*um) {
2512	dc = unicode_data[*um].utf32nfdi;
2513	if (dc) {
2514	for (j = `0`; dc[j]; j++)
2515	mapping[i++] = dc[j];
2516	ret = `0`;
2517	} else {
2518	mapping[i++] = *um;
2519	}
2520	um++;
2521	}
2522	mapping[i++] = `0`;
2523	if (ret)
2524	break;
2525	free(unicode_data[unichar].utf32nfdi);
2526	um = malloc(i * sizeof(unsigned int));
2527	memcpy(um, mapping, i * sizeof(unsigned int));
2528	unicode_data[unichar].utf32nfdi = um;
2529	}
2530	/ Add this decomposition to nfdicf if there is no entry. /
2531	if (!unicode_data[unichar].utf32nfdicf) {
2532	um = malloc(i * sizeof(unsigned int));
2533	memcpy(um, mapping, i * sizeof(unsigned int));
2534	unicode_data[unichar].utf32nfdicf = um;
2535	}
2536	if (verbose > `1`)
2537	print_utf32nfdi(unichar);
2538	count++;
2539	}
2540	if (verbose > `0`)
2541	printf("Processed %d entries\n", count);
2542	}
2543
2544	static void nfdicf_decompose(void)
2545	{
2546	unsigned int unichar;
2547	unsigned int mapping[`19`]; / Magic - guaranteed not to be exceeded. /
2548	unsigned int *um;
2549	unsigned int *dc;
2550	int count;
2551	int i;
2552	int j;
2553	int ret;
2554
2555	if (verbose > `0`)
2556	printf("Decomposing nfdicf\n");
2557	count = `0`;
2558	for (unichar = `0`; unichar != `0x110000`; unichar++) {
2559	if (!unicode_data[unichar].utf32nfdicf)
2560	continue;
2561	for (;;) {
2562	ret = `1`;
2563	i = `0`;
2564	um = unicode_data[unichar].utf32nfdicf;
2565	while (*um) {
2566	dc = unicode_data[*um].utf32nfdicf;
2567	if (dc) {
2568	for (j = `0`; dc[j]; j++)
2569	mapping[i++] = dc[j];
2570	ret = `0`;
2571	} else {
2572	mapping[i++] = *um;
2573	}
2574	um++;
2575	}
2576	mapping[i++] = `0`;
2577	if (ret)
2578	break;
2579	free(unicode_data[unichar].utf32nfdicf);
2580	um = malloc(i * sizeof(unsigned int));
2581	memcpy(um, mapping, i * sizeof(unsigned int));
2582	unicode_data[unichar].utf32nfdicf = um;
2583	}
2584	if (verbose > `1`)
2585	print_utf32nfdicf(unichar);
2586	count++;
2587	}
2588	if (verbose > `0`)
2589	printf("Processed %d entries\n", count);
2590	}
2591
2592	/ ------------------------------------------------------------------ /
2593
2594	int utf8agemax(struct tree , const* char *);
2595	int utf8nagemax(struct tree , const* char *, size_t);
2596	int utf8agemin(struct tree , const* char *);
2597	int utf8nagemin(struct tree , const* char *, size_t);
2598	ssize_t utf8len(struct tree , const* char *);
2599	ssize_t utf8nlen(struct tree , const* char *, size_t);
2600	struct utf8cursor;
2601	int utf8cursor(struct utf8cursor , struct* tree , const* char *);
2602	int utf8ncursor(struct utf8cursor , struct* tree , const* char *, size_t);
2603	int utf8byte(struct utf8cursor *);
2604
2605	/*
2606	* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2607	*
2608	* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2609	* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2610	*
2611	* SBase = 0xAC00
2612	* LBase = 0x1100
2613	* VBase = 0x1161
2614	* TBase = 0x11A7
2615	* LCount = 19
2616	* VCount = 21
2617	* TCount = 28
2618	* NCount = 588 (VCount * TCount)
2619	* SCount = 11172 (LCount * NCount)
2620	*
2621	* Decomposition:
2622	* SIndex = s - SBase
2623	*
2624	* LV (Canonical/Full)
2625	* LIndex = SIndex / NCount
2626	* VIndex = (Sindex % NCount) / TCount
2627	* LPart = LBase + LIndex
2628	* VPart = VBase + VIndex
2629	*
2630	* LVT (Canonical)
2631	* LVIndex = (SIndex / TCount) * TCount
2632	* TIndex = (Sindex % TCount)
2633	* LVPart = SBase + LVIndex
2634	* TPart = TBase + TIndex
2635	*
2636	* LVT (Full)
2637	* LIndex = SIndex / NCount
2638	* VIndex = (Sindex % NCount) / TCount
2639	* TIndex = (Sindex % TCount)
2640	* LPart = LBase + LIndex
2641	* VPart = VBase + VIndex
2642	* if (TIndex == 0) {
2643	* d = <LPart, VPart>
2644	* } else {
2645	* TPart = TBase + TIndex
2646	* d = <LPart, VPart, TPart>
2647	* }
2648	*/
2649
2650	/ Constants /
2651	#define SB (0xAC00)
2652	#define LB (0x1100)
2653	#define VB (0x1161)
2654	#define TB (0x11A7)
2655	#define LC (19)
2656	#define VC (21)
2657	#define TC (28)
2658	#define NC (VC * TC)
2659	#define SC (LC * NC)
2660
2661	/ Algorithmic decomposition of hangul syllable. /
2662	static utf8leaf_t utf8hangul(const* char str, unsigned* char *hangul)
2663	{
2664	unsigned int si;
2665	unsigned int li;
2666	unsigned int vi;
2667	unsigned int ti;
2668	unsigned char *h;
2669
2670	/ Calculate the SI, LI, VI, and TI values. /
2671	si = utf8decode(str) - SB;
2672	li = si / NC;
2673	vi = (si % NC) / TC;
2674	ti = si % TC;
2675
2676	/ Fill in base of leaf. /
2677	h = hangul;
2678	LEAF_GEN(h) = `2`;
2679	LEAF_CCC(h) = DECOMPOSE;
2680	h += `2`;
2681
2682	/ Add LPart, a 3-byte UTF-8 sequence. /
2683	h += utf8encode(str: (char *)h, val: li + LB);
2684
2685	/ Add VPart, a 3-byte UTF-8 sequence. /
2686	h += utf8encode(str: (char *)h, val: vi + VB);
2687
2688	/ Add TPart if required, also a 3-byte UTF-8 sequence. /
2689	if (ti)
2690	h += utf8encode(str: (char *)h, val: ti + TB);
2691
2692	/ Terminate string. /
2693	h[`0`] = `'\0'`;
2694
2695	return hangul;
2696	}
2697
2698	/*
2699	* Use trie to scan s, touching at most len bytes.
2700	* Returns the leaf if one exists, NULL otherwise.
2701	*
2702	* A non-NULL return guarantees that the UTF-8 sequence starting at s
2703	* is well-formed and corresponds to a known unicode code point. The
2704	* shorthand for this will be "is valid UTF-8 unicode".
2705	*/
2706	static utf8leaf_t utf8nlookup(struct* tree tree, unsigned* char *hangul,
2707	const char *s, size_t len)
2708	{
2709	utf8trie_t *trie;
2710	int offlen;
2711	int offset;
2712	int mask;
2713	int node;
2714
2715	if (!tree)
2716	return NULL;
2717	if (len == `0`)
2718	return NULL;
2719	node = `1`;
2720	trie = utf8data + tree->index;
2721	while (node) {
2722	offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2723	if (*trie & NEXTBYTE) {
2724	if (--len == `0`)
2725	return NULL;
2726	s++;
2727	}
2728	mask = `1` << (*trie & BITNUM);
2729	if (*s & mask) {
2730	/ Right leg /
2731	if (offlen) {
2732	/ Right node at offset of trie /
2733	node = (*trie & RIGHTNODE);
2734	offset = trie[offlen];
2735	while (--offlen) {
2736	offset <<= `8`;
2737	offset \|= trie[offlen];
2738	}
2739	trie += offset;
2740	} else if (*trie & RIGHTPATH) {
2741	/ Right node after this node /
2742	node = (*trie & TRIENODE);
2743	trie++;
2744	} else {
2745	/ No right node. /
2746	return NULL;
2747	}
2748	} else {
2749	/ Left leg /
2750	if (offlen) {
2751	/ Left node after this node. /
2752	node = (*trie & LEFTNODE);
2753	trie += offlen + `1`;
2754	} else if (*trie & RIGHTPATH) {
2755	/ No left node. /
2756	return NULL;
2757	} else {
2758	/ Left node after this node /
2759	node = (*trie & TRIENODE);
2760	trie++;
2761	}
2762	}
2763	}
2764	/*
2765	* Hangul decomposition is done algorithmically. These are the
2766	* codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2767	* always 3 bytes long, so s has been advanced twice, and the
2768	* start of the sequence is at s-2.
2769	*/
2770	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[`0`] == HANGUL)
2771	trie = utf8hangul(str: s - `2`, hangul);
2772	return trie;
2773	}
2774
2775	/*
2776	* Use trie to scan s.
2777	* Returns the leaf if one exists, NULL otherwise.
2778	*
2779	* Forwards to trie_nlookup().
2780	*/
2781	static utf8leaf_t utf8lookup(struct* tree tree, unsigned* char *hangul,
2782	const char *s)
2783	{
2784	return utf8nlookup(tree, hangul, s, (size_t)-`1`);
2785	}
2786
2787	/*
2788	* Return the number of bytes used by the current UTF-8 sequence.
2789	* Assumes the input points to the first byte of a valid UTF-8
2790	* sequence.
2791	*/
2792	static inline int utf8clen(const char *s)
2793	{
2794	unsigned char c = *s;
2795	return `1` + (c >= `0xC0`) + (c >= `0xE0`) + (c >= `0xF0`);
2796	}
2797
2798	/*
2799	* Maximum age of any character in s.
2800	* Return -1 if s is not valid UTF-8 unicode.
2801	* Return 0 if only non-assigned code points are used.
2802	*/
2803	int utf8agemax(struct tree tree, const* char *s)
2804	{
2805	utf8leaf_t *leaf;
2806	int age = `0`;
2807	int leaf_age;
2808	unsigned char hangul[UTF8HANGULLEAF];
2809
2810	if (!tree)
2811	return -`1`;
2812
2813	while (*s) {
2814	leaf = utf8lookup(tree, hangul, s);
2815	if (!leaf)
2816	return -`1`;
2817	leaf_age = ages[LEAF_GEN(leaf)];
2818	if (leaf_age <= tree->maxage && leaf_age > age)
2819	age = leaf_age;
2820	s += utf8clen(s);
2821	}
2822	return age;
2823	}
2824
2825	/*
2826	* Minimum age of any character in s.
2827	* Return -1 if s is not valid UTF-8 unicode.
2828	* Return 0 if non-assigned code points are used.
2829	*/
2830	int utf8agemin(struct tree tree, const* char *s)
2831	{
2832	utf8leaf_t *leaf;
2833	int age;
2834	int leaf_age;
2835	unsigned char hangul[UTF8HANGULLEAF];
2836
2837	if (!tree)
2838	return -`1`;
2839	age = tree->maxage;
2840	while (*s) {
2841	leaf = utf8lookup(tree, hangul, s);
2842	if (!leaf)
2843	return -`1`;
2844	leaf_age = ages[LEAF_GEN(leaf)];
2845	if (leaf_age <= tree->maxage && leaf_age < age)
2846	age = leaf_age;
2847	s += utf8clen(s);
2848	}
2849	return age;
2850	}
2851
2852	/*
2853	* Maximum age of any character in s, touch at most len bytes.
2854	* Return -1 if s is not valid UTF-8 unicode.
2855	*/
2856	int utf8nagemax(struct tree tree, const* char *s, size_t len)
2857	{
2858	utf8leaf_t *leaf;
2859	int age = `0`;
2860	int leaf_age;
2861	unsigned char hangul[UTF8HANGULLEAF];
2862
2863	if (!tree)
2864	return -`1`;
2865
2866	while (len && *s) {
2867	leaf = utf8nlookup(tree, hangul, s, size_t: len);
2868	if (!leaf)
2869	return -`1`;
2870	leaf_age = ages[LEAF_GEN(leaf)];
2871	if (leaf_age <= tree->maxage && leaf_age > age)
2872	age = leaf_age;
2873	len -= utf8clen(s);
2874	s += utf8clen(s);
2875	}
2876	return age;
2877	}
2878
2879	/*
2880	* Maximum age of any character in s, touch at most len bytes.
2881	* Return -1 if s is not valid UTF-8 unicode.
2882	*/
2883	int utf8nagemin(struct tree tree, const* char *s, size_t len)
2884	{
2885	utf8leaf_t *leaf;
2886	int leaf_age;
2887	int age;
2888	unsigned char hangul[UTF8HANGULLEAF];
2889
2890	if (!tree)
2891	return -`1`;
2892	age = tree->maxage;
2893	while (len && *s) {
2894	leaf = utf8nlookup(tree, hangul, s, size_t: len);
2895	if (!leaf)
2896	return -`1`;
2897	leaf_age = ages[LEAF_GEN(leaf)];
2898	if (leaf_age <= tree->maxage && leaf_age < age)
2899	age = leaf_age;
2900	len -= utf8clen(s);
2901	s += utf8clen(s);
2902	}
2903	return age;
2904	}
2905
2906	/*
2907	* Length of the normalization of s.
2908	* Return -1 if s is not valid UTF-8 unicode.
2909	*
2910	* A string of Default_Ignorable_Code_Point has length 0.
2911	*/
2912	ssize_t utf8len(struct tree tree, const* char *s)
2913	{
2914	utf8leaf_t *leaf;
2915	size_t ret = `0`;
2916	unsigned char hangul[UTF8HANGULLEAF];
2917
2918	if (!tree)
2919	return -`1`;
2920	while (*s) {
2921	leaf = utf8lookup(tree, hangul, s);
2922	if (!leaf)
2923	return -`1`;
2924	if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925	ret += utf8clen(s);
2926	else if (LEAF_CCC(leaf) == DECOMPOSE)
2927	ret += strlen(LEAF_STR(leaf));
2928	else
2929	ret += utf8clen(s);
2930	s += utf8clen(s);
2931	}
2932	return ret;
2933	}
2934
2935	/*
2936	* Length of the normalization of s, touch at most len bytes.
2937	* Return -1 if s is not valid UTF-8 unicode.
2938	*/
2939	ssize_t utf8nlen(struct tree tree, const* char *s, size_t len)
2940	{
2941	utf8leaf_t *leaf;
2942	size_t ret = `0`;
2943	unsigned char hangul[UTF8HANGULLEAF];
2944
2945	if (!tree)
2946	return -`1`;
2947	while (len && *s) {
2948	leaf = utf8nlookup(tree, hangul, s, size_t: len);
2949	if (!leaf)
2950	return -`1`;
2951	if (ages[LEAF_GEN(leaf)] > tree->maxage)
2952	ret += utf8clen(s);
2953	else if (LEAF_CCC(leaf) == DECOMPOSE)
2954	ret += strlen(LEAF_STR(leaf));
2955	else
2956	ret += utf8clen(s);
2957	len -= utf8clen(s);
2958	s += utf8clen(s);
2959	}
2960	return ret;
2961	}
2962
2963	/*
2964	* Cursor structure used by the normalizer.
2965	*/
2966	struct utf8cursor {
2967	struct tree *tree;
2968	const char *s;
2969	const char *p;
2970	const char *ss;
2971	const char *sp;
2972	unsigned int len;
2973	unsigned int slen;
2974	short int ccc;
2975	short int nccc;
2976	unsigned int unichar;
2977	unsigned char hangul[UTF8HANGULLEAF];
2978	};
2979
2980	/*
2981	* Set up an utf8cursor for use by utf8byte().
2982	*
2983	* s : string.
2984	* len : length of s.
2985	* u8c : pointer to cursor.
2986	* trie : utf8trie_t to use for normalization.
2987	*
2988	* Returns -1 on error, 0 on success.
2989	*/
2990	int utf8ncursor(struct utf8cursor u8c, struct* tree tree, const* char *s,
2991	size_t len)
2992	{
2993	if (!tree)
2994	return -`1`;
2995	if (!s)
2996	return -`1`;
2997	u8c->tree = tree;
2998	u8c->s = s;
2999	u8c->p = NULL;
3000	u8c->ss = NULL;
3001	u8c->sp = NULL;
3002	u8c->len = len;
3003	u8c->slen = `0`;
3004	u8c->ccc = STOPPER;
3005	u8c->nccc = STOPPER;
3006	u8c->unichar = `0`;
3007	/ Check we didn't clobber the maximum length. /
3008	if (u8c->len != len)
3009	return -`1`;
3010	/ The first byte of s may not be an utf8 continuation. /
3011	if (len > `0` && (*s & `0xC0`) == `0x80`)
3012	return -`1`;
3013	return `0`;
3014	}
3015
3016	/*
3017	* Set up an utf8cursor for use by utf8byte().
3018	*
3019	* s : NUL-terminated string.
3020	* u8c : pointer to cursor.
3021	* trie : utf8trie_t to use for normalization.
3022	*
3023	* Returns -1 on error, 0 on success.
3024	*/
3025	int utf8cursor(struct utf8cursor u8c, struct* tree tree, const* char *s)
3026	{
3027	return utf8ncursor(u8c, tree, s, size_t: (unsigned int)-`1`);
3028	}
3029
3030	/*
3031	* Get one byte from the normalized form of the string described by u8c.
3032	*
3033	* Returns the byte cast to an unsigned char on succes, and -1 on failure.
3034	*
3035	* The cursor keeps track of the location in the string in u8c->s.
3036	* When a character is decomposed, the current location is stored in
3037	* u8c->p, and u8c->s is set to the start of the decomposition. Note
3038	* that bytes from a decomposition do not count against u8c->len.
3039	*
3040	* Characters are emitted if they match the current CCC in u8c->ccc.
3041	* Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3042	* and the function returns 0 in that case.
3043	*
3044	* Sorting by CCC is done by repeatedly scanning the string. The
3045	* values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3046	* the start of the scan. The first pass finds the lowest CCC to be
3047	* emitted and stores it in u8c->nccc, the second pass emits the
3048	* characters with this CCC and finds the next lowest CCC. This limits
3049	* the number of passes to 1 + the number of different CCCs in the
3050	* sequence being scanned.
3051	*
3052	* Therefore:
3053	* u8c->p != NULL -> a decomposition is being scanned.
3054	* u8c->ss != NULL -> this is a repeating scan.
3055	* u8c->ccc == -1 -> this is the first scan of a repeating scan.
3056	*/
3057	int utf8byte(struct utf8cursor *u8c)
3058	{
3059	utf8leaf_t *leaf;
3060	int ccc;
3061
3062	for (;;) {
3063	/ Check for the end of a decomposed character. /
3064	if (u8c->p && *u8c->s == `'\0'`) {
3065	u8c->s = u8c->p;
3066	u8c->p = NULL;
3067	}
3068
3069	/ Check for end-of-string. /
3070	if (!u8c->p && (u8c->len == `0` \|\| *u8c->s == `'\0'`)) {
3071	/ There is no next byte. /
3072	if (u8c->ccc == STOPPER)
3073	return `0`;
3074	/ End-of-string during a scan counts as a stopper. /
3075	ccc = STOPPER;
3076	goto ccc_mismatch;
3077	} else if ((*u8c->s & `0xC0`) == `0x80`) {
3078	/ This is a continuation of the current character. /
3079	if (!u8c->p)
3080	u8c->len--;
3081	return (unsigned char)*u8c->s++;
3082	}
3083
3084	/ Look up the data for the current character. /
3085	if (u8c->p) {
3086	leaf = utf8lookup(tree: u8c->tree, hangul: u8c->hangul, s: u8c->s);
3087	} else {
3088	leaf = utf8nlookup(u8c->tree, u8c->hangul,
3089	u8c->s, size_t: u8c->len);
3090	}
3091
3092	/ No leaf found implies that the input is a binary blob. /
3093	if (!leaf)
3094	return -`1`;
3095
3096	/ Characters that are too new have CCC 0. /
3097	if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3098	ccc = STOPPER;
3099	} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3100	u8c->len -= utf8clen(s: u8c->s);
3101	u8c->p = u8c->s + utf8clen(s: u8c->s);
3102	u8c->s = LEAF_STR(leaf);
3103	/ Empty decomposition implies CCC 0. /
3104	if (*u8c->s == `'\0'`) {
3105	if (u8c->ccc == STOPPER)
3106	continue;
3107	ccc = STOPPER;
3108	goto ccc_mismatch;
3109	}
3110	leaf = utf8lookup(tree: u8c->tree, hangul: u8c->hangul, s: u8c->s);
3111	ccc = LEAF_CCC(leaf);
3112	}
3113	u8c->unichar = utf8decode(str: u8c->s);
3114
3115	/*
3116	* If this is not a stopper, then see if it updates
3117	* the next canonical class to be emitted.
3118	*/
3119	if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3120	u8c->nccc = ccc;
3121
3122	/*
3123	* Return the current byte if this is the current
3124	* combining class.
3125	*/
3126	if (ccc == u8c->ccc) {
3127	if (!u8c->p)
3128	u8c->len--;
3129	return (unsigned char)*u8c->s++;
3130	}
3131
3132	/ Current combining class mismatch. /
3133	ccc_mismatch:
3134	if (u8c->nccc == STOPPER) {
3135	/*
3136	* Scan forward for the first canonical class
3137	* to be emitted. Save the position from
3138	* which to restart.
3139	*/
3140	assert(u8c->ccc == STOPPER);
3141	u8c->ccc = MINCCC - `1`;
3142	u8c->nccc = ccc;
3143	u8c->sp = u8c->p;
3144	u8c->ss = u8c->s;
3145	u8c->slen = u8c->len;
3146	if (!u8c->p)
3147	u8c->len -= utf8clen(s: u8c->s);
3148	u8c->s += utf8clen(s: u8c->s);
3149	} else if (ccc != STOPPER) {
3150	/ Not a stopper, and not the ccc we're emitting. /
3151	if (!u8c->p)
3152	u8c->len -= utf8clen(s: u8c->s);
3153	u8c->s += utf8clen(s: u8c->s);
3154	} else if (u8c->nccc != MAXCCC + `1`) {
3155	/ At a stopper, restart for next ccc. /
3156	u8c->ccc = u8c->nccc;
3157	u8c->nccc = MAXCCC + `1`;
3158	u8c->s = u8c->ss;
3159	u8c->p = u8c->sp;
3160	u8c->len = u8c->slen;
3161	} else {
3162	/ All done, proceed from here. /
3163	u8c->ccc = STOPPER;
3164	u8c->nccc = STOPPER;
3165	u8c->sp = NULL;
3166	u8c->ss = NULL;
3167	u8c->slen = `0`;
3168	}
3169	}
3170	}
3171
3172	/ ------------------------------------------------------------------ /
3173
3174	static int normalize_line(struct tree *tree)
3175	{
3176	char *s;
3177	char *t;
3178	int c;
3179	struct utf8cursor u8c;
3180
3181	/ First test: null-terminated string. /
3182	s = buf2;
3183	t = buf3;
3184	if (utf8cursor(u8c: &u8c, tree, s))
3185	return -`1`;
3186	while ((c = utf8byte(u8c: &u8c)) > `0`)
3187	if (c != (unsigned char)*t++)
3188	return -`1`;
3189	if (c < `0`)
3190	return -`1`;
3191	if (*t != `0`)
3192	return -`1`;
3193
3194	/ Second test: length-limited string. /
3195	s = buf2;
3196	/ Replace NUL with a value that will cause an error if seen. /
3197	s[strlen(s) + `1`] = -`1`;
3198	t = buf3;
3199	if (utf8cursor(u8c: &u8c, tree, s))
3200	return -`1`;
3201	while ((c = utf8byte(u8c: &u8c)) > `0`)
3202	if (c != (unsigned char)*t++)
3203	return -`1`;
3204	if (c < `0`)
3205	return -`1`;
3206	if (*t != `0`)
3207	return -`1`;
3208
3209	return `0`;
3210	}
3211
3212	static void normalization_test(void)
3213	{
3214	FILE *file;
3215	unsigned int unichar;
3216	struct unicode_data *data;
3217	char *s;
3218	char *t;
3219	int ret;
3220	int ignorables;
3221	int tests = `0`;
3222	int failures = `0`;
3223
3224	if (verbose > `0`)
3225	printf("Parsing %s\n", test_name);
3226	/ Step one, read data from file. /
3227	file = fopen(test_name, "r");
3228	if (!file)
3229	open_fail(test_name, errno);
3230
3231	while (fgets(line, LINESIZE, file)) {
3232	ret = sscanf(line, "%[^;];%[^;];%[^;];%[^;];%*[^;];",
3233	buf0, buf1);
3234	if (ret != `2` \|\| *line == `'#'`)
3235	continue;
3236	s = buf0;
3237	t = buf2;
3238	while (*s) {
3239	unichar = strtoul(s, &s, `16`);
3240	t += utf8encode(str: t, val: unichar);
3241	}
3242	*t = `'\0'`;
3243
3244	ignorables = `0`;
3245	s = buf1;
3246	t = buf3;
3247	while (*s) {
3248	unichar = strtoul(s, &s, `16`);
3249	data = &unicode_data[unichar];
3250	if (data->utf8nfdi && !*data->utf8nfdi)
3251	ignorables = `1`;
3252	else
3253	t += utf8encode(str: t, val: unichar);
3254	}
3255	*t = `'\0'`;
3256
3257	tests++;
3258	if (normalize_line(tree: nfdi_tree) < `0`) {
3259	printf("Line %s -> %s", buf0, buf1);
3260	if (ignorables)
3261	printf(" (ignorables removed)");
3262	printf(" failure\n");
3263	failures++;
3264	}
3265	}
3266	fclose(file);
3267	if (verbose > `0`)
3268	printf("Ran %d tests with %d failures\n", tests, failures);
3269	if (failures)
3270	file_fail(filename: test_name);
3271	}
3272
3273	/ ------------------------------------------------------------------ /
3274
3275	static void write_file(void)
3276	{
3277	FILE *file;
3278	int i;
3279	int j;
3280	int t;
3281	int gen;
3282
3283	if (verbose > `0`)
3284	printf("Writing %s\n", utf8_name);
3285	file = fopen(utf8_name, "w");
3286	if (!file)
3287	open_fail(utf8_name, errno);
3288
3289	fprintf(file, "/* This file is generated code, do not edit. */\n");
3290	fprintf(file, "\n");
3291	fprintf(file, "#include <linux/module.h>\n");
3292	fprintf(file, "#include <linux/kernel.h>\n");
3293	fprintf(file, "#include \"utf8n.h\"\n");
3294	fprintf(file, "\n");
3295	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3296	for (i = `0`; i != ages_count; i++)
3297	fprintf(file, "\t%#x%s\n", ages[i],
3298	ages[i] == unicode_maxage ? "" : ",");
3299	fprintf(file, "};\n");
3300	fprintf(file, "\n");
3301	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3302	t = `0`;
3303	for (gen = `0`; gen < ages_count; gen++) {
3304	fprintf(file, "\t{ %#x, %d }%s\n",
3305	ages[gen], trees[t].index,
3306	ages[gen] == unicode_maxage ? "" : ",");
3307	if (trees[t].maxage == ages[gen])
3308	t += `2`;
3309	}
3310	fprintf(file, "};\n");
3311	fprintf(file, "\n");
3312	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3313	t = `1`;
3314	for (gen = `0`; gen < ages_count; gen++) {
3315	fprintf(file, "\t{ %#x, %d }%s\n",
3316	ages[gen], trees[t].index,
3317	ages[gen] == unicode_maxage ? "" : ",");
3318	if (trees[t].maxage == ages[gen])
3319	t += `2`;
3320	}
3321	fprintf(file, "};\n");
3322	fprintf(file, "\n");
3323	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3324	utf8data_size);
3325	t = `0`;
3326	for (i = `0`; i != utf8data_size; i += `16`) {
3327	if (i == trees[t].index) {
3328	fprintf(file, "\t/* %s_%x */\n",
3329	trees[t].type, trees[t].maxage);
3330	if (t < trees_count-`1`)
3331	t++;
3332	}
3333	fprintf(file, "\t");
3334	for (j = i; j != i + `16`; j++)
3335	fprintf(file, "0x%.2x%s", utf8data[j],
3336	(j < utf8data_size -`1` ? "," : ""));
3337	fprintf(file, "\n");
3338	}
3339	fprintf(file, "};\n");
3340	fprintf(file, "\n");
3341	fprintf(file, "struct utf8data_table utf8_data_table = {\n");
3342	fprintf(file, "\t.utf8agetab = utf8agetab,\n");
3343	fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
3344	fprintf(file, "\n");
3345	fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
3346	fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
3347	fprintf(file, "\n");
3348	fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
3349	fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
3350	fprintf(file, "\n");
3351	fprintf(file, "\t.utf8data = utf8data,\n");
3352	fprintf(file, "};\n");
3353	fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
3354	fprintf(file, "\n");
3355	fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
3356	fclose(file);
3357	}
3358
3359	/ ------------------------------------------------------------------ /
3360
3361	int main(int argc, char *argv[])
3362	{
3363	unsigned int unichar;
3364	int opt;
3365
3366	argv0 = argv[`0`];
3367
3368	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -`1`) {
3369	switch (opt) {
3370	case `'a'`:
3371	age_name = optarg;
3372	break;
3373	case `'c'`:
3374	ccc_name = optarg;
3375	break;
3376	case `'d'`:
3377	data_name = optarg;
3378	break;
3379	case `'f'`:
3380	fold_name = optarg;
3381	break;
3382	case `'n'`:
3383	norm_name = optarg;
3384	break;
3385	case `'o'`:
3386	utf8_name = optarg;
3387	break;
3388	case `'p'`:
3389	prop_name = optarg;
3390	break;
3391	case `'t'`:
3392	test_name = optarg;
3393	break;
3394	case `'v'`:
3395	verbose++;
3396	break;
3397	case `'h'`:
3398	help();
3399	exit(`0`);
3400	default:
3401	usage();
3402	}
3403	}
3404
3405	if (verbose > `1`)
3406	help();
3407	for (unichar = `0`; unichar != `0x110000`; unichar++)
3408	unicode_data[unichar].code = unichar;
3409	age_init();
3410	ccc_init();
3411	nfdi_init();
3412	nfdicf_init();
3413	ignore_init();
3414	corrections_init();
3415	hangul_decompose();
3416	nfdi_decompose();
3417	nfdicf_decompose();
3418	utf8_init();
3419	trees_init();
3420	trees_populate();
3421	trees_reduce();
3422	trees_verify();
3423	/ Prevent "unused function" warning. /
3424	(void)lookup(tree: nfdi_tree, key: " ");
3425	if (verbose > `2`)
3426	tree_walk(tree: nfdi_tree);
3427	if (verbose > `2`)
3428	tree_walk(tree: nfdicf_tree);
3429	normalization_test();
3430	write_file();
3431
3432	return `0`;
3433	}
3434

source code of linux/fs/unicode/mkutf8data.c