UnicodeNameMappingGenerator.cpp source code [llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp]

1	//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
10	// using UnicodeData.txt and NameAliases.txt available at
11	// https://unicode.org/Public/15.1.0/ucd/
12	//===----------------------------------------------------------------------===//
13
14	#include "llvm/ADT/STLExtras.h"
15	#include "llvm/ADT/StringExtras.h"
16	#include "llvm/ADT/StringRef.h"
17	#include <algorithm>
18	#include <array>
19	#include <deque>
20	#include <fstream>
21	#include <memory>
22	#include <optional>
23	#include <set>
24	#include <string>
25	#include <unordered_map>
26	#include <utility>
27	#include <vector>
28
29	static const llvm::StringRef Letters =
30	" _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
31
32	// Collect names UnicodeData.txt and AliasNames.txt
33	// There may be multiple names per code points.
34	static std::unordered_multimap<char32_t, std::string>
35	loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
36	std::unordered_multimap<char32_t, std::string> CollectedCharacters;
37	auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
38	std::ifstream InputFile(File);
39	for (std::string Line; getline(is&: InputFile, str&: Line);) {
40	if (Line.empty() \|\| !isxdigit(Line [`0`]))
41	continue;
42	auto FirstSemiPos = Line.find(c: `';'`);
43	if (FirstSemiPos == std::string::npos)
44	continue;
45	auto SecondSemiPos = Line.find(c: `';'`, pos: FirstSemiPos + `1`);
46	if (SecondSemiPos == std::string::npos)
47	continue;
48	unsigned long long CodePoint;
49	if (llvm::getAsUnsignedInteger(
50	Str: llvm::StringRef (Line.c_str(), FirstSemiPos), Radix: `16`, Result&: CodePoint)) {
51	continue;
52	}
53
54	std::string Name =
55	Line.substr(pos: FirstSemiPos + `1`, n: SecondSemiPos - FirstSemiPos - `1`);
56
57	if (!Name.empty() && Name [`0`] == `'<'`) {
58	// Ignore ranges of characters, as their name is either absent or
59	// generated.
60	continue;
61	}
62
63	// Some aliases are ignored for compatibility with C++
64	if (IsAliasFile) {
65	std::string Kind = Line.substr(pos: SecondSemiPos + `1`);
66	if (Kind != "control" && Kind != "correction" && Kind != "alternate")
67	continue;
68	}
69
70	auto InsertUnique = [&](char32_t CP, std::string Name) {
71	auto It = CollectedCharacters.find(x: CP);
72	while (It != std::end(cont&: CollectedCharacters) && It ->first == CP) {
73	if (It ->second == Name)
74	return;
75	++It;
76	}
77	CollectedCharacters.insert(x: {CP, std::move(Name)});
78	};
79	InsertUnique(CodePoint, std::move(Name));
80	}
81	};
82
83	FromFile (NamesFile);
84	FromFile (AliasesFile, true);
85	return CollectedCharacters;
86	}
87
88	class Trie {
89	struct Node;
90
91	public:
92	// When inserting named codepoint
93	// We create a node per character in the name.
94	// SPARKLE becomes S <- P <- A <- R <- K <- L <- E
95	// Once all characters are inserted, the tree is compacted
96	void insert(llvm::StringRef Name, char32_t Codepoint) {
97	Node *N = Root.get();
98	bool IsBeforeMedial = false;
99	for (auto ChIt = Name.begin(); ChIt != Name.end();
100	ChIt += (IsBeforeMedial ? `3` : `1`)) {
101	char Ch = *ChIt;
102	assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name");
103
104	std::string Label(`1`, Ch);
105
106	// We need to ensure a node never ends or starts by
107	// a medial hyphen as this would break the
108	// loose matching algorithm.
109	IsBeforeMedial = llvm::isAlnum(C: Ch) && ChIt + `1` != Name.end() &&
110	*(ChIt + `1`) == `'-'` && ChIt + `2` != Name.end() &&
111	llvm::isAlnum(C: *(ChIt + `2`));
112	if (IsBeforeMedial)
113	Label.assign(first: ChIt, last: ChIt + `3`);
114
115	auto It = llvm::find_if(Range&: N->Children,
116	P: [&](const auto &C) { return C->Name == Label; });
117	if (It == N->Children.end()) {
118	It = N->Children.insert(position: It, x: std::make_unique<Node>(args&: Label, args&: N));
119	}
120	N = It ->get();
121	}
122	N->Value = Codepoint;
123	}
124
125	void compact() { compact(N: Root.get()); }
126
127	// This creates 2 arrays of bytes from the tree:
128	// A serialized dictionary of node labels,
129	// And the nodes themselves.
130	// The name of each label is found by indexing into the dictionary.
131	// The longest names are inserted first into the dictionary,
132	// in the hope it will contain shorter labels as substring,
133	// thereby reducing duplication.
134	// We could theorically be more clever by trying to minimizing the size
135	// of the dictionary.
136	std::pair<std::string, std::vector<uint8_t>> serialize() {
137	std::set<std::string> Names = this->getNameFragments();
138	std::vector<std::string> Sorted(Names.begin(), Names.end());
139	llvm::sort(C&: Sorted, Comp: [](const auto &a, const auto &b) {
140	return a.size() > b.size();
141	});
142	std::string Dict(Letters.begin(), Letters.end());
143	Dict.reserve(res: `50000`);
144	for (const std::string &Name : Sorted) {
145	if (Name.size() <= `1`)
146	continue;
147	if (Dict.find(str: Name) != std::string::npos)
148	continue;
149	Dict += Name;
150	}
151
152	if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
153	fprintf(stderr, format: "Dictionary too big to be serialized");
154	exit(status: `1`);
155	}
156
157	auto Bytes = dumpIndex(Dict);
158	return {Dict, Bytes};
159	}
160
161	std::set<std::string> getNameFragments() {
162	std::set<std::string> Keys;
163	collectKeys(N: Root.get(), Keys);
164	return Keys;
165	}
166
167	// Maps a valid char in an Unicode character name
168	// To a 6 bits index.
169	static uint8_t letter(char C) {
170	auto Pos = Letters.find(C);
171	assert(Pos != std::string::npos &&
172	"Invalid letter in Unicode character name");
173	return Pos;
174	}
175
176	// clang-format off
177	// +================+============+======================+=============+========+===+==============+===============+
178	// \| 0 \| 1 \| 2-7 (6) \| 8-23 \| 24-44 \| \| 46 \| 47 \|
179	// +================+============+======================+=============+========+===+==============+===============+
180	// \| Has Value \| Has Long Name \| Letter OR Name Size \| Dict Index \| Value \| \| Has Sibling \| Has Children \|
181	// +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
182	// clang-format on
183
184	std::vector<uint8_t> dumpIndex(const std::string &Dict) {
185	struct ChildrenOffset {
186	Node *FirstChild;
187	std::size_t Offset;
188	bool HasValue;
189	};
190
191	// Keep track of the start of each node
192	// position in the serialized data.
193	std::unordered_map<Node *, int32_t> Offsets;
194
195	// Keep track of where to write the index
196	// of the first children
197	std::vector<ChildrenOffset> ChildrenOffsets;
198	std::unordered_map<Node , bool*> SiblingTracker;
199	std::deque<Node *> AllNodes;
200	std::vector<uint8_t> Bytes;
201	Bytes.reserve(n: `250'000`);
202	// This leading byte is used by the reading code to detect the root node.
203	Bytes.push_back(x: `0`);
204
205	auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
206	for (std::size_t Index = `0`; Index < Children.size(); Index++) {
207	const std::unique_ptr<Node> &Child = Children[Index];
208	AllNodes.push_back(x: Child.get());
209	if (Index != Children.size() - `1`)
210	SiblingTracker [Child.get()] = true;
211	}
212	};
213	CollectChildren(Root ->Children);
214
215	while (!AllNodes.empty()) {
216	const std::size_t Offset = Bytes.size();
217	Node *const N = AllNodes.front();
218	AllNodes.pop_front();
219
220	assert(!N->Name.empty());
221	Offsets [N] = Offset;
222
223	uint8_t FirstByte = (!!N->Value) ? `0x80` : `0`;
224	// Single letter node are indexed in 6 bits
225	if (N->Name.size() == `1`) {
226	FirstByte \|= letter(C: N->Name [`0`]);
227	Bytes.push_back(x: FirstByte);
228	} else {
229	// Otherwise we use a 16 bits index
230	FirstByte = FirstByte \| uint8_t(N->Name.size()) \| `0x40`;
231	Bytes.push_back(x: FirstByte);
232	auto PosInDict = Dict.find(str: N->Name);
233	assert(PosInDict != std::string::npos);
234	uint8_t Low = PosInDict;
235	uint8_t High = ((PosInDict >> `8`) & `0xFF`);
236	Bytes.push_back(x: High);
237	Bytes.push_back(x: Low);
238	}
239
240	const bool HasSibling = SiblingTracker.count(x: N) != `0`;
241	const bool HasChildren = N->Children.size() != `0`;
242
243	if (!!N->Value) {
244	uint32_t Value = (*(N->Value) << `3`);
245	uint8_t H = ((Value >> `16`) & `0xFF`);
246	uint8_t M = ((Value >> `8`) & `0xFF`);
247	uint8_t L = (Value & `0xFF`) \| uint8_t(HasSibling ? `0x01` : `0`) \|
248	uint8_t(HasChildren ? `0x02` : `0`);
249
250	Bytes.push_back(x: H);
251	Bytes.push_back(x: M);
252	Bytes.push_back(x: L);
253
254	if (HasChildren) {
255	ChildrenOffsets.push_back(
256	x: ChildrenOffset{.FirstChild: N->Children [`0`].get(), .Offset: Bytes.size(), .HasValue: true});
257	// index of the first children
258	Bytes.push_back(x: `0x00`);
259	Bytes.push_back(x: `0x00`);
260	Bytes.push_back(x: `0x00`);
261	}
262	} else {
263	// When there is no value (that's most intermediate nodes)
264	// Dispense of the 3 values bytes, and only store
265	// 1 byte to track whether the node has sibling and children
266	// + 2 bytes for the index of the first children if necessary.
267	// That index also uses bytes 0-6 of the previous byte.
268	uint8_t Byte =
269	uint8_t(HasSibling ? `0x80` : `0`) \| uint8_t(HasChildren ? `0x40` : `0`);
270	Bytes.push_back(x: Byte);
271	if (HasChildren) {
272	ChildrenOffsets.emplace_back(
273	args: ChildrenOffset{.FirstChild: N->Children [`0`].get(), .Offset: Bytes.size() - `1`, .HasValue: false});
274	Bytes.push_back(x: `0x00`);
275	Bytes.push_back(x: `0x00`);
276	}
277	}
278	CollectChildren(N->Children);
279	}
280
281	// Once all the nodes are in the inndex
282	// Fill the bytes we left to indicate the position
283	// of the children
284	for (const ChildrenOffset &Parent : ChildrenOffsets) {
285	const auto It = Offsets.find(x: Parent.FirstChild);
286	assert(It != Offsets.end());
287	std::size_t Pos = It ->second;
288	if (Parent.HasValue) {
289	Bytes [Parent.Offset] = ((Pos >> `16`) & `0xFF`);
290	} else {
291	Bytes [Parent.Offset] =
292	Bytes [Parent.Offset] \| uint8_t((Pos >> `16`) & `0xFF`);
293	}
294	Bytes [Parent.Offset + `1`] = ((Pos >> `8`) & `0xFF`);
295	Bytes [Parent.Offset + `2`] = Pos & `0xFF`;
296	}
297
298	// Add some padding so that the deserialization code
299	// doesn't try to read past the enf of the array.
300	Bytes.push_back(x: `0`);
301	Bytes.push_back(x: `0`);
302	Bytes.push_back(x: `0`);
303	Bytes.push_back(x: `0`);
304	Bytes.push_back(x: `0`);
305	Bytes.push_back(x: `0`);
306
307	return Bytes;
308	}
309
310	private:
311	void collectKeys(Node *N, std::set<std::string> &Keys) {
312	Keys.insert(x: N->Name);
313	for (const std::unique_ptr<Node> &Child : N->Children) {
314	collectKeys(N: Child.get(), Keys);
315	}
316	}
317
318	// Merge sequences of 1-character nodes
319	// This greatly reduce the total number of nodes,
320	// and therefore the size of the index.
321	// When the tree gets serialized, we only have 5 bytes to store the
322	// size of a name. Overlong names (>32 characters) are therefore
323	// kep into separate nodes
324	void compact(Node *N) {
325	for (auto &&Child : N->Children) {
326	compact(N: Child.get());
327	}
328	if (N->Parent && N->Parent->Children.size() == `1` && !N->Parent->Value &&
329	(N->Parent->Name.size() + N->Name.size() <= `32`)) {
330	N->Parent->Value = N->Value;
331	N->Parent->Name += N->Name;
332	N->Parent->Children = std::move(N->Children);
333	for (std::unique_ptr<Node> &c : N->Parent->Children) {
334	c ->Parent = N->Parent;
335	}
336	}
337	}
338	struct Node {
339	Node(std::string Name, Node Parent = nullptr*)
340	: Name (Name), Parent(Parent) {}
341
342	std::vector<std::unique_ptr<Node>> Children;
343	std::string Name;
344	Node Parent = nullptr*;
345	std::optional<char32_t> Value;
346	};
347
348	std::unique_ptr<Node> Root = std::make_unique<Node>(args: "");
349	};
350
351	extern const char *UnicodeLicense;
352
353	int main(int argc, char **argv) {
354	printf(format: "Unicode name -> codepoint mapping generator\n"
355	"Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
356	argv[`0`]);
357	printf(format: "NameAliases.txt can be found at "
358	"https://unicode.org/Public/15.1.0/ucd/NameAliases.txt\n"
359	"UnicodeData.txt can be found at "
360	"https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt\n\n");
361
362	if (argc != `4`)
363	return EXIT_FAILURE;
364
365	FILE *Out = fopen(filename: argv[`3`], modes: "w");
366	if (!Out) {
367	printf(format: "Error creating output file.\n");
368	return EXIT_FAILURE;
369	}
370
371	Trie T;
372	uint32_t NameCount = `0`;
373	std::size_t LongestName = `0`;
374	auto Entries = loadDataFiles(NamesFile: argv[`1`], AliasesFile: argv[`2`]);
375	for (const std::pair<const char32_t, std::string> &Entry : Entries) {
376	char32_t Codepoint = Entry.first;
377	const std::string &Name = Entry.second;
378	// Ignore names which are not valid.
379	if (Name.empty() \|\|
380	!llvm::all_of(Range: Name, P: [](char C) { return Letters.contains(C); })) {
381	continue;
382	}
383	printf(format: "%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
384	T.insert(Name, Codepoint);
385	LongestName =
386	std::max(a: LongestName, b: std::size_t(llvm::count_if(Range: Name, P: llvm::isAlnum)));
387	NameCount++;
388	}
389	T.compact();
390
391	std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
392	const std::string &Dict = Data.first;
393	const std::vector<uint8_t> &Tree = Data.second;
394
395	fprintf(stream: Out, format: R"(
396	//===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
397	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
398	// See https://llvm.org/LICENSE.txt for license information.
399	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
400	//
401	//===----------------------------------------------------------------------===//
402	//
403	// This file implements mapping the name of a unicode code point to its value.
404	//
405	// This file was generated using %s.
406	// Do not edit manually.
407	//
408	//===----------------------------------------------------------------------===//
409	%s
410
411
412
413	#include "llvm/Support/Compiler.h"
414	#include <cstddef>
415	#include <cstdint>
416	)",
417	argv[`0`], UnicodeLicense);
418
419	fprintf(stream: Out,
420	format: "namespace llvm { namespace sys { namespace unicode { \n"
421	"extern const char *UnicodeNameToCodepointDict;\n"
422	"extern const uint8_t *UnicodeNameToCodepointIndex;\n"
423	"extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
424	"extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
425
426	fprintf(stream: Out, format: "const char* UnicodeNameToCodepointDict = \"%s\";\n",
427	Dict.c_str());
428
429	fprintf(stream: Out, format: "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
430	Tree.size() + `1`);
431
432	for (auto Byte : Tree) {
433	fprintf(stream: Out, format: "0x%02x,", Byte);
434	}
435
436	fprintf(stream: Out, format: "0};");
437	fprintf(stream: Out, format: "const uint8_t* UnicodeNameToCodepointIndex = "
438	"UnicodeNameToCodepointIndex_; \n");
439	fprintf(stream: Out, format: "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
440	Tree.size() + `1`);
441	fprintf(stream: Out,
442	format: "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
443	LongestName);
444	fprintf(stream: Out, format: "\n}}}\n");
445	fclose(stream: Out);
446	printf(format: "Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
447	argv[`3`], NameCount, Tree.size() / `1024.0`, Dict.size() / `1024.0`);
448	}
449
450	const char *UnicodeLicense = R"(
451	/*
452	UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
453
454	See Terms of Use <https://www.unicode.org/copyright.html>
455	for definitions of Unicode Inc.’s Data Files and Software.
456
457	NOTICE TO USER: Carefully read the following legal agreement.
458	BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
459	DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
460	YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
461	TERMS AND CONDITIONS OF THIS AGREEMENT.
462	IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
463	THE DATA FILES OR SOFTWARE.
464
465	COPYRIGHT AND PERMISSION NOTICE
466
467	Copyright © 1991-2022 Unicode, Inc. All rights reserved.
468	Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
469
470	Permission is hereby granted, free of charge, to any person obtaining
471	a copy of the Unicode data files and any associated documentation
472	(the "Data Files") or Unicode software and any associated documentation
473	(the "Software") to deal in the Data Files or Software
474	without restriction, including without limitation the rights to use,
475	copy, modify, merge, publish, distribute, and/or sell copies of
476	the Data Files or Software, and to permit persons to whom the Data Files
477	or Software are furnished to do so, provided that either
478	(a) this copyright and permission notice appear with all copies
479	of the Data Files or Software, or
480	(b) this copyright and permission notice appear in associated
481	Documentation.
482
483	THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
484	ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
485	WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
486	NONINFRINGEMENT OF THIRD PARTY RIGHTS.
487	IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
488	NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
489	DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
490	DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
491	TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
492	PERFORMANCE OF THE DATA FILES OR SOFTWARE.
493
494	Except as contained in this notice, the name of a copyright holder
495	shall not be used in advertising or otherwise to promote the sale,
496	use or other dealings in these Data Files or Software without prior
497	written authorization of the copyright holder.
498	*/
499	)";
500

source code of llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp