1 | //===- unittests/Support/UnicodeTest.cpp - Unicode.h tests ----------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "llvm/Support/Unicode.h" |
10 | #include "llvm/ADT/StringExtras.h" |
11 | #include "llvm/ADT/edit_distance.h" |
12 | #include "llvm/Support/ConvertUTF.h" |
13 | #include "gmock/gmock.h" |
14 | #include "gtest/gtest.h" |
15 | |
16 | namespace llvm { |
17 | namespace sys { |
18 | namespace unicode { |
19 | namespace { |
20 | |
21 | TEST(Unicode, columnWidthUTF8) { |
22 | EXPECT_EQ(0, columnWidthUTF8("" )); |
23 | EXPECT_EQ(1, columnWidthUTF8(" " )); |
24 | EXPECT_EQ(1, columnWidthUTF8("a" )); |
25 | EXPECT_EQ(1, columnWidthUTF8("~" )); |
26 | |
27 | EXPECT_EQ(6, columnWidthUTF8("abcdef" )); |
28 | |
29 | EXPECT_EQ(-1, columnWidthUTF8("\x01" )); |
30 | EXPECT_EQ(-1, columnWidthUTF8("\t" )); |
31 | EXPECT_EQ(-1, columnWidthUTF8("aaaaaaaaaa\x01" )); |
32 | EXPECT_EQ(-1, columnWidthUTF8("\342\200\213" )); // 200B ZERO WIDTH SPACE |
33 | |
34 | // 00AD SOFT HYPHEN is displayed on most terminals as a space or a dash. Some |
35 | // text editors display it only when a line is broken at it, some use it as a |
36 | // line-break hint, but don't display. We choose terminal-oriented |
37 | // interpretation. |
38 | EXPECT_EQ(1, columnWidthUTF8("\302\255" )); |
39 | |
40 | EXPECT_EQ(0, columnWidthUTF8("\314\200" )); // 0300 COMBINING GRAVE ACCENT |
41 | EXPECT_EQ(1, columnWidthUTF8("\340\270\201" )); // 0E01 THAI CHARACTER KO KAI |
42 | EXPECT_EQ(2, columnWidthUTF8("\344\270\200" )); // CJK UNIFIED IDEOGRAPH-4E00 |
43 | |
44 | EXPECT_EQ(4, columnWidthUTF8("\344\270\200\344\270\200" )); |
45 | EXPECT_EQ(3, columnWidthUTF8("q\344\270\200" )); |
46 | EXPECT_EQ(3, columnWidthUTF8("\314\200\340\270\201\344\270\200" )); |
47 | |
48 | EXPECT_EQ(2, columnWidthUTF8("\342\214\232" )); // U+231A WATCH (emoji) |
49 | EXPECT_EQ(2, columnWidthUTF8("\360\237\253\233" )); // U+1FADB PEA POD (Unicode 15 emoji) |
50 | EXPECT_EQ(2, columnWidthUTF8("\360\233\204\262" )); // U+1B132 HIRAGANA LETTER SMALL KO |
51 | EXPECT_EQ(2, columnWidthUTF8("\360\227\201\202" )); // U+17042 TANGUT IDEOGRAPH |
52 | |
53 | // Invalid UTF-8 strings, columnWidthUTF8 should error out. |
54 | EXPECT_EQ(-2, columnWidthUTF8("\344" )); |
55 | EXPECT_EQ(-2, columnWidthUTF8("\344\270" )); |
56 | EXPECT_EQ(-2, columnWidthUTF8("\344\270\033" )); |
57 | EXPECT_EQ(-2, columnWidthUTF8("\344\270\300" )); |
58 | EXPECT_EQ(-2, columnWidthUTF8("\377\366\355" )); |
59 | |
60 | EXPECT_EQ(-2, columnWidthUTF8("qwer\344" )); |
61 | EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270" )); |
62 | EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\033" )); |
63 | EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\300" )); |
64 | EXPECT_EQ(-2, columnWidthUTF8("qwer\377\366\355" )); |
65 | |
66 | // UTF-8 sequences longer than 4 bytes correspond to unallocated Unicode |
67 | // characters. |
68 | EXPECT_EQ(-2, columnWidthUTF8("\370\200\200\200\200" )); // U+200000 |
69 | EXPECT_EQ(-2, columnWidthUTF8("\374\200\200\200\200\200" )); // U+4000000 |
70 | } |
71 | |
72 | TEST(Unicode, isPrintable) { |
73 | EXPECT_FALSE(isPrintable(0)); // <control-0000>-<control-001F> |
74 | EXPECT_FALSE(isPrintable(0x01)); |
75 | EXPECT_FALSE(isPrintable(0x1F)); |
76 | EXPECT_TRUE(isPrintable(' ')); |
77 | EXPECT_TRUE(isPrintable('A')); |
78 | EXPECT_TRUE(isPrintable('~')); |
79 | EXPECT_FALSE(isPrintable(0x7F)); // <control-007F>..<control-009F> |
80 | EXPECT_FALSE(isPrintable(0x90)); |
81 | EXPECT_FALSE(isPrintable(0x9F)); |
82 | |
83 | EXPECT_TRUE(isPrintable(0xAC)); |
84 | EXPECT_TRUE(isPrintable(0xAD)); // SOFT HYPHEN is displayed on most terminals |
85 | // as either a space or a dash. |
86 | EXPECT_TRUE(isPrintable(0xAE)); |
87 | |
88 | EXPECT_TRUE(isPrintable(0x0377)); // GREEK SMALL LETTER PAMPHYLIAN DIGAMMA |
89 | EXPECT_FALSE(isPrintable(0x0378)); // <reserved-0378>..<reserved-0379> |
90 | |
91 | EXPECT_FALSE(isPrintable(0x0600)); // ARABIC NUMBER SIGN |
92 | |
93 | EXPECT_FALSE(isPrintable(0x1FFFF)); // <reserved-1F774>..<noncharacter-1FFFF> |
94 | EXPECT_TRUE(isPrintable(0x20000)); // CJK UNIFIED IDEOGRAPH-20000 |
95 | |
96 | EXPECT_FALSE(isPrintable(0x10FFFF)); // noncharacter |
97 | |
98 | // test the validity of a fast path in columnWidthUTF8 |
99 | for (unsigned char c = 0; c < 128; ++c) { |
100 | const UTF8 buf8[2] = {c, 0}; |
101 | const UTF8 *Target8 = &buf8[0]; |
102 | UTF32 buf32[1]; |
103 | UTF32 *Target32 = &buf32[0]; |
104 | auto status = ConvertUTF8toUTF32(sourceStart: &Target8, sourceEnd: Target8 + 1, targetStart: &Target32, |
105 | targetEnd: Target32 + 1, flags: strictConversion); |
106 | EXPECT_EQ(status, conversionOK); |
107 | EXPECT_EQ((columnWidthUTF8(reinterpret_cast<const char *>(buf8)) == 1), |
108 | (bool)isPrintable(buf32[0])); |
109 | } |
110 | } |
111 | |
112 | TEST(Unicode, nameToCodepointStrict) { |
113 | auto map = [](StringRef Str) { |
114 | return nameToCodepointStrict(Name: Str).value_or(u: 0xFFFF'FFFF); |
115 | }; |
116 | |
117 | // generated codepoints |
118 | EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400" )); |
119 | EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF" )); |
120 | EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00" )); |
121 | EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC" )); |
122 | EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000" )); |
123 | EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD" )); |
124 | EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700" )); |
125 | EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740" )); |
126 | EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D" )); |
127 | EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820" )); |
128 | EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1" )); |
129 | EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0" )); |
130 | EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0" )); |
131 | EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000" )); |
132 | EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A" )); |
133 | EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000" )); |
134 | EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7" )); |
135 | EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00" )); |
136 | EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08" )); |
137 | EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00" )); |
138 | EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5" )); |
139 | EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170" )); |
140 | EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB" )); |
141 | EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900" )); |
142 | EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D" )); |
143 | EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70" )); |
144 | EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9" )); |
145 | EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800" )); |
146 | EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D" )); |
147 | EXPECT_EQ(0x31350u, map("CJK UNIFIED IDEOGRAPH-31350" )); // Unicode 15.0 |
148 | EXPECT_EQ(0x2EBF0u, map("CJK UNIFIED IDEOGRAPH-2EBF0" )); // Unicode 15.1 |
149 | |
150 | EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA" )); |
151 | EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS" )); |
152 | EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH" )); |
153 | EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB" )); |
154 | EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA" )); |
155 | EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A" )); |
156 | EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E" )); |
157 | EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I" )); |
158 | |
159 | EXPECT_EQ(0x1F984u, map("UNICORN FACE" )); |
160 | EXPECT_EQ(0x00640u, map("ARABIC TATWEEL" )); |
161 | EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU" )); |
162 | EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001" )); |
163 | EXPECT_EQ(0x02235u, map("BECAUSE" )); |
164 | EXPECT_EQ(0x1F514u, map("BELL" )); |
165 | EXPECT_EQ(0x1F9A9u, map("FLAMINGO" )); |
166 | EXPECT_EQ(0x1F9A9u, map("FLAMINGO" )); |
167 | EXPECT_EQ(0x1F402u, map("OX" )); // 2 characters |
168 | EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " |
169 | "ABOVE WITH ALEF MAKSURA ISOLATED FORM" )); |
170 | EXPECT_EQ(0x11F04u, map("KAWI LETTER A" )); // Unicode 15.0 |
171 | EXPECT_EQ(0x1FA77u, map("PINK HEART" )); // Unicode 15.0 |
172 | EXPECT_EQ(0x2FFFu, |
173 | map("IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION" )); // Unicode 15.1 |
174 | |
175 | // Aliases |
176 | EXPECT_EQ(0x0000u, map("NULL" )); |
177 | EXPECT_EQ(0x0007u, map("ALERT" )); |
178 | EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION" )); |
179 | EXPECT_EQ(0x0009u, map("CHARACTER TABULATION" )); |
180 | EXPECT_EQ(0x000Au, map("LINE FEED" )); |
181 | EXPECT_EQ(0x000Au, map("NEW LINE" )); |
182 | EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION" )); |
183 | EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION" )); |
184 | EXPECT_EQ(0x2118u, |
185 | map("WEIERSTRASS ELLIPTIC FUNCTION" )); // correction |
186 | EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P" )); // correction |
187 | EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK" )); // alternate |
188 | EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE" )); // alternate |
189 | |
190 | // Should perform exact case match |
191 | EXPECT_EQ(0xFFFFFFFFu, map("" )); |
192 | EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER" )); |
193 | EXPECT_EQ(0xFFFFFFFFu, map("unicorn face" )); |
194 | EXPECT_EQ(0xFFFFFFFFu, map("UNICORN FaCE" )); |
195 | EXPECT_EQ(0xFFFFFFFFu, map("UNICORNFaCE" )); |
196 | EXPECT_EQ(0xFFFFFFFFu, map("UNICORN" )); |
197 | EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE i" )); |
198 | EXPECT_EQ(0xFFFFFFFFu, map("hANGUL SYLLABLE i" )); |
199 | EXPECT_EQ(0xFFFFFFFFu, map("HANGULSYLLABLEI" )); |
200 | EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE" )); |
201 | EXPECT_EQ(0xFFFFFFFFu, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D" )); |
202 | EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d" )); |
203 | EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D" )); |
204 | EXPECT_EQ(0xFFFFFFFF, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER" )); |
205 | EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1" )); |
206 | EXPECT_EQ(0xFFFFFFFFu, map("ZERO WIDTH NO BREAK SPACE" )); |
207 | |
208 | // Should not support abbreviations or figments |
209 | EXPECT_EQ(0xFFFFFFFFu, map("FVS1" )); |
210 | EXPECT_EQ(0xFFFFFFFFu, map("HIGH OCTET PRESET" )); |
211 | EXPECT_EQ(0xFFFFFFFFu, map("BEL" )); |
212 | } |
213 | |
214 | TEST(Unicode, nameToCodepointLoose) { |
215 | auto map = [](StringRef Str) { |
216 | auto Opt = nameToCodepointLooseMatching(Name: Str); |
217 | if (!Opt) |
218 | return char32_t(0xFFFF'FFFF); |
219 | return Opt->CodePoint; |
220 | }; |
221 | |
222 | // generated codepoints |
223 | EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF" )); |
224 | EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00" )); |
225 | EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC" )); |
226 | EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000" )); |
227 | EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD" )); |
228 | EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700" )); |
229 | EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740" )); |
230 | EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400" )); |
231 | EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D" )); |
232 | EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820" )); |
233 | EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1" )); |
234 | EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0" )); |
235 | EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0" )); |
236 | EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000" )); |
237 | EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A" )); |
238 | EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000" )); |
239 | EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7" )); |
240 | EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00" )); |
241 | EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08" )); |
242 | EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00" )); |
243 | EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5" )); |
244 | EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170" )); |
245 | EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB" )); |
246 | EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900" )); |
247 | EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D" )); |
248 | EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70" )); |
249 | EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9" )); |
250 | EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800" )); |
251 | EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D" )); |
252 | |
253 | EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA" )); |
254 | EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS" )); |
255 | EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH" )); |
256 | EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB" )); |
257 | EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA" )); |
258 | EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A" )); |
259 | EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E" )); |
260 | EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I" )); |
261 | |
262 | EXPECT_EQ(0x1F984u, map("UNICORN FACE" )); |
263 | EXPECT_EQ(0x00640u, map("ARABIC TATWEEL" )); |
264 | EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU" )); |
265 | EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001" )); |
266 | EXPECT_EQ(0x02235u, map("BECAUSE" )); |
267 | EXPECT_EQ(0x1F514u, map("BELL" )); |
268 | EXPECT_EQ(0x1F9A9u, map("FLAMINGO" )); |
269 | EXPECT_EQ(0x1F402u, map("OX" )); // 2 characters |
270 | EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " |
271 | "ABOVE WITH ALEF MAKSURA ISOLATED FORM" )); |
272 | |
273 | // Aliases |
274 | EXPECT_EQ(0x0000u, map("NULL" )); |
275 | EXPECT_EQ(0x0007u, map("ALERT" )); |
276 | EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION" )); |
277 | EXPECT_EQ(0x0009u, map("CHARACTER TABULATION" )); |
278 | EXPECT_EQ(0x000Au, map("LINE FEED" )); |
279 | EXPECT_EQ(0x000Au, map("NEW LINE" )); |
280 | EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION" )); |
281 | EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION" )); |
282 | EXPECT_EQ(0x2118u, |
283 | map("WEIERSTRASS ELLIPTIC FUNCTION" )); // correction |
284 | EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P" )); // correction |
285 | EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK" )); // alternate |
286 | EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE" )); // alternate |
287 | EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO BREAK SPACE" )); // alternate |
288 | |
289 | // Should perform loose matching |
290 | EXPECT_EQ(0xFFFFFFFFu, map("" )); |
291 | EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER" )); |
292 | EXPECT_EQ(0x0001F984u, map("unicorn face" )); |
293 | EXPECT_EQ(0x0001F984u, map("UNICORN FaCE" )); |
294 | EXPECT_EQ(0x0001F984u, map("UNICORNFaCE" )); |
295 | EXPECT_EQ(0xFFFFFFFFu, map("UNICORN" )); |
296 | EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE i" )); |
297 | EXPECT_EQ(0xC774u, map("hANGUL SYLLABLE i" )); |
298 | EXPECT_EQ(0xC774u, map("HANGULSYLLABLEI" )); |
299 | EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE" )); |
300 | |
301 | EXPECT_EQ(0x2FA1Du, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D" )); |
302 | EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d" )); |
303 | EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D" )); |
304 | |
305 | EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER" )); |
306 | EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1" )); |
307 | |
308 | // https://unicode.org/reports/tr44/#Matching_Names |
309 | // UAX44-LM2: Medial hypens are ignored, non medial hyphens are not |
310 | EXPECT_EQ(0x1FBC5u, map("S-T-I-C-K-F-I-G-U-R-E" )); |
311 | EXPECT_EQ(0xFFFFFFFFu, map("-STICK FIGURE" )); |
312 | EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE-" )); |
313 | EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE -" )); |
314 | EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE --" )); |
315 | EXPECT_EQ(0xFFFFFFFFu, map("STICK--FIGURE" )); |
316 | |
317 | EXPECT_EQ(0x0F68u, map("TIBETAN LETTER A" )); |
318 | EXPECT_EQ(0x0F68u, map("TIBETAN LETTERA" )); |
319 | EXPECT_EQ(0x0F68u, map("TIBETAN LETTER-A" )); |
320 | EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A" )); |
321 | EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A" )); |
322 | |
323 | // GH64161 |
324 | EXPECT_EQ(0x202Du, map("LEFT-TO-RIGHT OVERRIDE" )); |
325 | EXPECT_EQ(0x202Du, map("LEFT TO RIGHT OVERRIDE" )); |
326 | EXPECT_EQ(0x202Du, map("LEFTTORIGHTOVERRIDE" )); |
327 | EXPECT_EQ(0x202Du, map("LEFT-TO-RIGHT-OVERRIDE" )); |
328 | EXPECT_FALSE(nameToCodepointLooseMatching("-LEFT-TO-RIGHT OVERRIDE" )); |
329 | EXPECT_FALSE(nameToCodepointLooseMatching("LEFT-TO-RIGHT OVERRIDE-" )); |
330 | |
331 | // special case |
332 | EXPECT_EQ(0x1180u, map("HANGUL JUNGSEONG O-E" )); |
333 | EXPECT_EQ(0x116Cu, map("HANGUL JUNGSEONG OE" )); |
334 | |
335 | // names that are prefix to existing characters should not match |
336 | EXPECT_FALSE(nameToCodepointLooseMatching("B" )); |
337 | EXPECT_FALSE(nameToCodepointLooseMatching("BE" )); |
338 | EXPECT_FALSE(nameToCodepointLooseMatching("BEE" )); |
339 | EXPECT_FALSE(nameToCodepointLooseMatching("BEET" )); |
340 | EXPECT_FALSE(nameToCodepointLooseMatching("BEETL" )); |
341 | EXPECT_TRUE(nameToCodepointLooseMatching("BEETLE" )); |
342 | } |
343 | |
344 | } // namespace |
345 | |
346 | bool operator==(MatchForCodepointName a, MatchForCodepointName b) { |
347 | return a.Name == b.Name && a.Distance == b.Distance && a.Value == b.Value; |
348 | } |
349 | |
350 | namespace { |
351 | |
352 | TEST(Unicode, nearestMatchesForCodepointName) { |
353 | auto Normalize = [](StringRef Name) { |
354 | std::string Out; |
355 | Out.reserve(res: Name.size()); |
356 | for (char C : Name) { |
357 | if (isAlnum(C)) |
358 | Out.push_back(c: toUpper(x: C)); |
359 | } |
360 | return Out; |
361 | }; |
362 | |
363 | auto L = [&](StringRef name) { |
364 | auto v = nearestMatchesForCodepointName(Pattern: name, MaxMatchesCount: 3); |
365 | for (auto &r : v) { |
366 | auto A = Normalize(r.Name); |
367 | auto B = Normalize(name); |
368 | EXPECT_EQ(StringRef(A).edit_distance(B, true), r.Distance); |
369 | } |
370 | return v; |
371 | }; |
372 | using ::testing::ElementsAre; |
373 | using M = MatchForCodepointName; |
374 | |
375 | ASSERT_THAT(L("" ), ElementsAre(M{"OX" , 2, 0x1F402}, M{"ANT" , 3, 0x1F41C}, |
376 | M{"ARC" , 3, 0x2312})); |
377 | // shortest name |
378 | ASSERT_THAT(L("OX" ), ElementsAre(M{"OX" , 0, 0x1F402}, M{"AXE" , 2, 0x1FA93}, |
379 | M{"BOY" , 2, 0x1F466})); |
380 | |
381 | // longest name |
382 | ASSERT_THAT(L("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA ABOVE WITH ALEF " |
383 | "MAKSURA INITIAL FORM" ), |
384 | ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " |
385 | "ABOVE WITH ALEF MAKSURA INITIAL FORM" , |
386 | 0, 0xFBFB}, |
387 | M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " |
388 | "ABOVE WITH ALEF MAKSURA FINAL FORM" , |
389 | 4, 0xFBFA}, |
390 | M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " |
391 | "ABOVE WITH ALEF MAKSURA ISOLATED FORM" , |
392 | 7, 0xFBF9})); |
393 | |
394 | // same result with underscore, spaces, etc |
395 | ASSERT_THAT(L("______ARABICLIGATUREUIGHUR KIRGHIZ YEH with HAMZA ABOVE WITH " |
396 | "ALEF MAKsURAINITIAL form_" ), |
397 | ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " |
398 | "ABOVE WITH ALEF MAKSURA INITIAL FORM" , |
399 | 0, 0xFBFB}, |
400 | M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " |
401 | "ABOVE WITH ALEF MAKSURA FINAL FORM" , |
402 | 4, 0xFBFA}, |
403 | M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " |
404 | "ABOVE WITH ALEF MAKSURA ISOLATED FORM" , |
405 | 7, 0xFBF9})); |
406 | |
407 | ASSERT_THAT(L("GREEK CAPITAL LETTER LAMBDA" ), |
408 | ElementsAre(M{"GREEK CAPITAL LETTER LAMDA" , 1, 0x39B}, |
409 | M{"GREEK CAPITAL LETTER GAMMA" , 3, 0x0393}, |
410 | M{"GREEK CAPITAL LETTER ALPHA" , 4, 0x0391})); |
411 | |
412 | ASSERT_THAT(L("greekcapitalletter-lambda" ), |
413 | ElementsAre(M{"GREEK CAPITAL LETTER LAMDA" , 1, 0x39B}, |
414 | M{"GREEK CAPITAL LETTER GAMMA" , 3, 0x0393}, |
415 | M{"GREEK CAPITAL LETTER ALPHA" , 4, 0x0391})); |
416 | |
417 | // typo http://www.unicode.org/notes/tn27/tn27-5.html |
418 | ASSERT_THAT( |
419 | L("PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET" ), |
420 | ElementsAre( |
421 | M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET" , 0, |
422 | 0xFE18}, // typo |
423 | M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET" , 2, |
424 | 0xFE18}, // correction |
425 | M{"PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET" , 6, |
426 | 0xFE17})); |
427 | |
428 | // typo http://www.unicode.org/notes/tn27/tn27-5.html |
429 | ASSERT_THAT( |
430 | L("BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS" ), |
431 | ElementsAre( |
432 | M{"BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS" , 0, 0x1D0C5}, |
433 | M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS" , 2, 0x1D0C5}, |
434 | M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA SYNAFI" , 7, |
435 | 0x1D0C6})); |
436 | } |
437 | |
438 | } // namespace |
439 | } // namespace unicode |
440 | } // namespace sys |
441 | } // namespace llvm |
442 | |