//! Character inclusion in binary-property, General_Category value, or Script Unicode sets.
2//!
3//! We rely on dead code elimination to remove the tables that aren't needed.
4
5#![allow(bad_style)]
6#![allow(clippy::all)]
7
8use alloc::boxed::Box;
9
// Expands one property table into its public API.
//
// Given a table module name, a name for the generated list, and a comma-terminated list
// of set identifiers, this:
//   * declares the (out-of-line) module `$tables`,
//   * emits one `pub fn <IDENT>(c: char) -> bool` per identifier, answering via
//     `$tables::<IDENT>.contains_char(c)`,
//   * emits `pub static $names: &[&str]` holding the identifiers as strings, in the
//     order they were listed.
macro_rules! property_functions {
    ($tables:ident, $names:ident, [$($set:ident,)*]) => {
        #[allow(unused)]
        mod $tables;

        // One predicate per set, e.g. `unicode::ALPHABETIC('a')`.
        $(
            pub fn $set(c: char) -> bool {
                self::$tables::$set.contains_char(c)
            }
        )*

        pub static $names: &[&str] = &[$(stringify!($set),)*];
    };
}
26
// Front-end for `property_functions!`.
//
// Accepts any number of `mod <tables>; static <NAMES> = [ ... ];` groups and forwards each
// group to `property_functions!` on its own. Two list shapes are understood.
macro_rules! char_property_functions {
    // Shape 1: a list of bare identifiers, used when the property names are chosen by hand.
    {$(
        mod $tables:ident;
        static $names:ident = [$($set:ident,)*];
    )*} => {
        $(property_functions!($tables, $names, [$($set,)*]);)*
    };
    // Shape 2: a list of `("Name", IDENT)` pairs, as copied from the `BY_NAME` values that
    // `ucd-generate` produces. The first half of each pair is matched and then discarded;
    // only the identifier is forwarded.
    {$(
        mod $tables:ident;
        static $names:ident = [$(($_label:tt, $set:ident),)*];
    )*} => {
        $(property_functions!($tables, $names, [$($set,)*]);)*
    };
}
51
52char_property_functions! {
53 mod binary;
54 static BINARY_PROPERTY_NAMES = [
55 // ASCII_HEX_DIGIT, // let this one be stripped out -- the full trie is wasteful for ASCII
56 ALPHABETIC, BIDI_CONTROL, CASE_IGNORABLE, CASED, CHANGES_WHEN_CASEFOLDED,
57 CHANGES_WHEN_CASEMAPPED, CHANGES_WHEN_LOWERCASED, CHANGES_WHEN_TITLECASED,
58 CHANGES_WHEN_UPPERCASED, DASH, DEFAULT_IGNORABLE_CODE_POINT, DEPRECATED, DIACRITIC,
59 EMOJI, EMOJI_COMPONENT, EMOJI_MODIFIER, EMOJI_MODIFIER_BASE, EMOJI_PRESENTATION, EXTENDED_PICTOGRAPHIC,
60 EXTENDER, GRAPHEME_BASE, GRAPHEME_EXTEND, GRAPHEME_LINK, HEX_DIGIT, HYPHEN,
61 IDS_BINARY_OPERATOR, IDS_TRINARY_OPERATOR, ID_CONTINUE, ID_START, IDEOGRAPHIC, JOIN_CONTROL,
62 LOGICAL_ORDER_EXCEPTION, LOWERCASE, MATH, NONCHARACTER_CODE_POINT, OTHER_ALPHABETIC,
63 OTHER_DEFAULT_IGNORABLE_CODE_POINT, OTHER_GRAPHEME_EXTEND, OTHER_ID_CONTINUE,
64 OTHER_ID_START, OTHER_LOWERCASE, OTHER_MATH, OTHER_UPPERCASE, PATTERN_SYNTAX,
65 PATTERN_WHITE_SPACE, PREPENDED_CONCATENATION_MARK, QUOTATION_MARK, RADICAL,
66 REGIONAL_INDICATOR, SENTENCE_TERMINAL, SOFT_DOTTED, TERMINAL_PUNCTUATION, UNIFIED_IDEOGRAPH,
67 UPPERCASE, VARIATION_SELECTOR, WHITE_SPACE, XID_CONTINUE, XID_START,
68 ];
69}
70
71char_property_functions! {
72 mod category;
73 // Copy from category::BY_NAME
74 static CATEGORY_PROPERTY_NAMES = [
75 ("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION),
76 ("Connector_Punctuation", CONNECTOR_PUNCTUATION), ("Control", CONTROL),
77 ("Currency_Symbol", CURRENCY_SYMBOL),
78 ("Dash_Punctuation", DASH_PUNCTUATION), ("Decimal_Number", DECIMAL_NUMBER),
79 ("Enclosing_Mark", ENCLOSING_MARK),
80 ("Final_Punctuation", FINAL_PUNCTUATION), ("Format", FORMAT),
81 ("Initial_Punctuation", INITIAL_PUNCTUATION), ("Letter", LETTER),
82 ("Letter_Number", LETTER_NUMBER), ("Line_Separator", LINE_SEPARATOR),
83 ("Lowercase_Letter", LOWERCASE_LETTER), ("Mark", MARK),
84 ("Math_Symbol", MATH_SYMBOL), ("Modifier_Letter", MODIFIER_LETTER),
85 ("Modifier_Symbol", MODIFIER_SYMBOL), ("Nonspacing_Mark", NONSPACING_MARK),
86 ("Number", NUMBER), ("Open_Punctuation", OPEN_PUNCTUATION),
87 ("Other", OTHER), ("Other_Letter", OTHER_LETTER),
88 ("Other_Number", OTHER_NUMBER), ("Other_Punctuation", OTHER_PUNCTUATION),
89 ("Other_Symbol", OTHER_SYMBOL),
90 ("Paragraph_Separator", PARAGRAPH_SEPARATOR), ("Private_Use", PRIVATE_USE),
91 ("Punctuation", PUNCTUATION), ("Separator", SEPARATOR),
92 ("Space_Separator", SPACE_SEPARATOR), ("Spacing_Mark", SPACING_MARK),
93 ("Surrogate", SURROGATE), ("Symbol", SYMBOL),
94 ("Titlecase_Letter", TITLECASE_LETTER), ("Unassigned", UNASSIGNED),
95 ("Uppercase_Letter", UPPERCASE_LETTER),
96 ];
97
98 mod script;
99 // Copy from script::BY_NAME
100 static SCRIPT_PROPERTY_NAMES = [
101 ("Adlam", ADLAM),
102 ("Ahom", AHOM),
103 ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS),
104 ("Arabic", ARABIC),
105 ("Armenian", ARMENIAN),
106 ("Avestan", AVESTAN),
107 ("Balinese", BALINESE),
108 ("Bamum", BAMUM),
109 ("Bassa_Vah", BASSA_VAH),
110 ("Batak", BATAK),
111 ("Bengali", BENGALI),
112 ("Bhaiksuki", BHAIKSUKI),
113 ("Bopomofo", BOPOMOFO),
114 ("Brahmi", BRAHMI),
115 ("Braille", BRAILLE),
116 ("Buginese", BUGINESE),
117 ("Buhid", BUHID),
118 ("Canadian_Aboriginal", CANADIAN_ABORIGINAL),
119 ("Carian", CARIAN),
120 ("Caucasian_Albanian", CAUCASIAN_ALBANIAN),
121 ("Chakma", CHAKMA),
122 ("Cham", CHAM),
123 ("Cherokee", CHEROKEE),
124 ("Chorasmian", CHORASMIAN),
125 ("Common", COMMON),
126 ("Coptic", COPTIC),
127 ("Cuneiform", CUNEIFORM),
128 ("Cypriot", CYPRIOT),
129 ("Cypro_Minoan", CYPRO_MINOAN),
130 ("Cyrillic", CYRILLIC),
131 ("Deseret", DESERET),
132 ("Devanagari", DEVANAGARI),
133 ("Dives_Akuru", DIVES_AKURU),
134 ("Dogra", DOGRA),
135 ("Duployan", DUPLOYAN),
136 ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS),
137 ("Elbasan", ELBASAN),
138 ("Elymaic", ELYMAIC),
139 ("Ethiopic", ETHIOPIC),
140 ("Georgian", GEORGIAN),
141 ("Glagolitic", GLAGOLITIC),
142 ("Gothic", GOTHIC),
143 ("Grantha", GRANTHA),
144 ("Greek", GREEK),
145 ("Gujarati", GUJARATI),
146 ("Gunjala_Gondi", GUNJALA_GONDI),
147 ("Gurmukhi", GURMUKHI),
148 ("Han", HAN),
149 ("Hangul", HANGUL),
150 ("Hanifi_Rohingya", HANIFI_ROHINGYA),
151 ("Hanunoo", HANUNOO),
152 ("Hatran", HATRAN),
153 ("Hebrew", HEBREW),
154 ("Hiragana", HIRAGANA),
155 ("Imperial_Aramaic", IMPERIAL_ARAMAIC),
156 ("Inherited", INHERITED),
157 ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI),
158 ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN),
159 ("Javanese", JAVANESE),
160 ("Kaithi", KAITHI),
161 ("Kannada", KANNADA),
162 ("Katakana", KATAKANA),
163 ("Kawi", KAWI),
164 ("Kayah_Li", KAYAH_LI),
165 ("Kharoshthi", KHAROSHTHI),
166 ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT),
167 ("Khmer", KHMER),
168 ("Khojki", KHOJKI),
169 ("Khudawadi", KHUDAWADI),
170 ("Lao", LAO),
171 ("Latin", LATIN),
172 ("Lepcha", LEPCHA),
173 ("Limbu", LIMBU),
174 ("Linear_A", LINEAR_A),
175 ("Linear_B", LINEAR_B),
176 ("Lisu", LISU),
177 ("Lycian", LYCIAN),
178 ("Lydian", LYDIAN),
179 ("Mahajani", MAHAJANI),
180 ("Makasar", MAKASAR),
181 ("Malayalam", MALAYALAM),
182 ("Mandaic", MANDAIC),
183 ("Manichaean", MANICHAEAN),
184 ("Marchen", MARCHEN),
185 ("Masaram_Gondi", MASARAM_GONDI),
186 ("Medefaidrin", MEDEFAIDRIN),
187 ("Meetei_Mayek", MEETEI_MAYEK),
188 ("Mende_Kikakui", MENDE_KIKAKUI),
189 ("Meroitic_Cursive", MEROITIC_CURSIVE),
190 ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS),
191 ("Miao", MIAO),
192 ("Modi", MODI),
193 ("Mongolian", MONGOLIAN),
194 ("Mro", MRO),
195 ("Multani", MULTANI),
196 ("Myanmar", MYANMAR),
197 ("Nabataean", NABATAEAN),
198 ("Nag_Mundari", NAG_MUNDARI),
199 ("Nandinagari", NANDINAGARI),
200 ("New_Tai_Lue", NEW_TAI_LUE),
201 ("Newa", NEWA),
202 ("Nko", NKO),
203 ("Nushu", NUSHU),
204 ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG),
205 ("Ogham", OGHAM),
206 ("Ol_Chiki", OL_CHIKI),
207 ("Old_Hungarian", OLD_HUNGARIAN),
208 ("Old_Italic", OLD_ITALIC),
209 ("Old_North_Arabian", OLD_NORTH_ARABIAN),
210 ("Old_Permic", OLD_PERMIC),
211 ("Old_Persian", OLD_PERSIAN),
212 ("Old_Sogdian", OLD_SOGDIAN),
213 ("Old_South_Arabian", OLD_SOUTH_ARABIAN),
214 ("Old_Turkic", OLD_TURKIC),
215 ("Old_Uyghur", OLD_UYGHUR),
216 ("Oriya", ORIYA),
217 ("Osage", OSAGE),
218 ("Osmanya", OSMANYA),
219 ("Pahawh_Hmong", PAHAWH_HMONG),
220 ("Palmyrene", PALMYRENE),
221 ("Pau_Cin_Hau", PAU_CIN_HAU),
222 ("Phags_Pa", PHAGS_PA),
223 ("Phoenician", PHOENICIAN),
224 ("Psalter_Pahlavi", PSALTER_PAHLAVI),
225 ("Rejang", REJANG),
226 ("Runic", RUNIC),
227 ("Samaritan", SAMARITAN),
228 ("Saurashtra", SAURASHTRA),
229 ("Sharada", SHARADA),
230 ("Shavian", SHAVIAN),
231 ("Siddham", SIDDHAM),
232 ("SignWriting", SIGNWRITING),
233 ("Sinhala", SINHALA),
234 ("Sogdian", SOGDIAN),
235 ("Sora_Sompeng", SORA_SOMPENG),
236 ("Soyombo", SOYOMBO),
237 ("Sundanese", SUNDANESE),
238 ("Syloti_Nagri", SYLOTI_NAGRI),
239 ("Syriac", SYRIAC),
240 ("Tagalog", TAGALOG),
241 ("Tagbanwa", TAGBANWA),
242 ("Tai_Le", TAI_LE),
243 ("Tai_Tham", TAI_THAM),
244 ("Tai_Viet", TAI_VIET),
245 ("Takri", TAKRI),
246 ("Tamil", TAMIL),
247 ("Tangsa", TANGSA),
248 ("Tangut", TANGUT),
249 ("Telugu", TELUGU),
250 ("Thaana", THAANA),
251 ("Thai", THAI),
252 ("Tibetan", TIBETAN),
253 ("Tifinagh", TIFINAGH),
254 ("Tirhuta", TIRHUTA),
255 ("Toto", TOTO),
256 ("Ugaritic", UGARITIC),
257 ("Vai", VAI),
258 ("Vithkuqi", VITHKUQI),
259 ("Wancho", WANCHO),
260 ("Warang_Citi", WARANG_CITI),
261 ("Yezidi", YEZIDI),
262 ("Yi", YI),
263 ("Zanabazar_Square", ZANABAZAR_SQUARE),
264 ];
265}
266
267/// Return all available unicode property names
268pub fn unicode_property_names() -> Box<dyn Iterator<Item = &'static str>> {
269 Box::new(
270 BINARY_PROPERTY_NAMESimpl Iterator
271 .iter()
272 .map(|name: &&str| *name)
273 .chain(CATEGORY_PROPERTY_NAMES.iter().map(|name: &&str| *name))
274 .chain(SCRIPT_PROPERTY_NAMES.iter().map(|name: &&str| *name)),
275 )
276}
277
278pub fn by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>> {
279 for property: &(&str, &TrieSetSlice<'_>) in binary::BY_NAME {
280 if name == property.0.to_uppercase() {
281 return Some(Box::new(move |c: char| property.1.contains_char(c)));
282 }
283 }
284
285 for property: &(&str, &TrieSetSlice<'_>) in category::BY_NAME {
286 if name == property.0.to_uppercase() {
287 return Some(Box::new(move |c: char| property.1.contains_char(c)));
288 }
289 }
290
291 for property: &(&str, &TrieSetSlice<'_>) in script::BY_NAME {
292 if name == property.0.to_uppercase() {
293 return Some(Box::new(move |c: char| property.1.contains_char(c)));
294 }
295 }
296
297 None
298}
299