| 1 | //! Character inclusion in binary or General_Category value Unicode sets. |
| 2 | //! |
| 3 | //! We rely on dead code elimination to remove the tables that aren't needed. |
| 4 | |
| 5 | #![allow (bad_style)] |
| 6 | #![allow (clippy::all)] |
| 7 | |
| 8 | use alloc::boxed::Box; |
| 9 | |
/// Declares the module `$tables` and, for every listed property identifier,
/// a public `fn IDENT(c: char) -> bool` that forwards to
/// `self::$tables::IDENT.contains_char(c)`.
///
/// Also emits a public static slice `$names` holding the stringified
/// identifiers in the order they were listed.
macro_rules! property_functions {
    ($tables:ident, $names:ident, [$($set:ident,)*]) => {
        #[allow(unused)]
        mod $tables;

        $(
            // Callable as e.g. `unicode::ALPHABETIC('a')`.
            pub fn $set(c: char) -> bool {
                self::$tables::$set.contains_char(c)
            }
        )*

        pub static $names: &[&str] = &[$(stringify!($set),)*];
    };
}
| 26 | |
/// Front end for `property_functions!` that accepts any number of
/// `mod NAME; static NAMES = [...];` groups in one of two listing forms.
macro_rules! char_property_functions {
    // Plain form: every entry is a bare property identifier, so custom
    // selections of properties can be written by hand.
    {$(
        mod $tables:ident;
        static $names:ident = [$($set:ident,)*];
    )*} => {
        $(property_functions!($tables, $names, [$($set,)*]);)*
    };
    // Pair form: entries are `(name, IDENT)` tuples, so the `BY_NAME` values
    // generated by `ucd-generate` can be pasted in unchanged. The name token
    // is matched but otherwise ignored.
    {$(
        mod $tables:ident;
        static $names:ident = [$(($_label:tt, $set:ident),)*];
    )*} => {
        $(property_functions!($tables, $names, [$($set,)*]);)*
    };
}
| 51 | |
| 52 | char_property_functions! { |
| 53 | mod binary; |
| 54 | static BINARY_PROPERTY_NAMES = [ |
| 55 | // ASCII_HEX_DIGIT, // let this one be stripped out -- the full trie is wasteful for ASCII |
| 56 | ALPHABETIC, BIDI_CONTROL, CASE_IGNORABLE, CASED, CHANGES_WHEN_CASEFOLDED, |
| 57 | CHANGES_WHEN_CASEMAPPED, CHANGES_WHEN_LOWERCASED, CHANGES_WHEN_TITLECASED, |
| 58 | CHANGES_WHEN_UPPERCASED, DASH, DEFAULT_IGNORABLE_CODE_POINT, DEPRECATED, DIACRITIC, |
| 59 | EMOJI, EMOJI_COMPONENT, EMOJI_MODIFIER, EMOJI_MODIFIER_BASE, EMOJI_PRESENTATION, EXTENDED_PICTOGRAPHIC, |
| 60 | EXTENDER, GRAPHEME_BASE, GRAPHEME_EXTEND, GRAPHEME_LINK, HEX_DIGIT, HYPHEN, |
| 61 | IDS_BINARY_OPERATOR, IDS_TRINARY_OPERATOR, ID_CONTINUE, ID_START, IDEOGRAPHIC, JOIN_CONTROL, |
| 62 | LOGICAL_ORDER_EXCEPTION, LOWERCASE, MATH, NONCHARACTER_CODE_POINT, OTHER_ALPHABETIC, |
| 63 | OTHER_DEFAULT_IGNORABLE_CODE_POINT, OTHER_GRAPHEME_EXTEND, OTHER_ID_CONTINUE, |
| 64 | OTHER_ID_START, OTHER_LOWERCASE, OTHER_MATH, OTHER_UPPERCASE, PATTERN_SYNTAX, |
| 65 | PATTERN_WHITE_SPACE, PREPENDED_CONCATENATION_MARK, QUOTATION_MARK, RADICAL, |
| 66 | REGIONAL_INDICATOR, SENTENCE_TERMINAL, SOFT_DOTTED, TERMINAL_PUNCTUATION, UNIFIED_IDEOGRAPH, |
| 67 | UPPERCASE, VARIATION_SELECTOR, WHITE_SPACE, XID_CONTINUE, XID_START, |
| 68 | ]; |
| 69 | } |
| 70 | |
| 71 | char_property_functions! { |
| 72 | mod category; |
| 73 | // Copy from category::BY_NAME |
| 74 | static CATEGORY_PROPERTY_NAMES = [ |
| 75 | ("Cased_Letter" , CASED_LETTER), ("Close_Punctuation" , CLOSE_PUNCTUATION), |
| 76 | ("Connector_Punctuation" , CONNECTOR_PUNCTUATION), ("Control" , CONTROL), |
| 77 | ("Currency_Symbol" , CURRENCY_SYMBOL), |
| 78 | ("Dash_Punctuation" , DASH_PUNCTUATION), ("Decimal_Number" , DECIMAL_NUMBER), |
| 79 | ("Enclosing_Mark" , ENCLOSING_MARK), |
| 80 | ("Final_Punctuation" , FINAL_PUNCTUATION), ("Format" , FORMAT), |
| 81 | ("Initial_Punctuation" , INITIAL_PUNCTUATION), ("Letter" , LETTER), |
| 82 | ("Letter_Number" , LETTER_NUMBER), ("Line_Separator" , LINE_SEPARATOR), |
| 83 | ("Lowercase_Letter" , LOWERCASE_LETTER), ("Mark" , MARK), |
| 84 | ("Math_Symbol" , MATH_SYMBOL), ("Modifier_Letter" , MODIFIER_LETTER), |
| 85 | ("Modifier_Symbol" , MODIFIER_SYMBOL), ("Nonspacing_Mark" , NONSPACING_MARK), |
| 86 | ("Number" , NUMBER), ("Open_Punctuation" , OPEN_PUNCTUATION), |
| 87 | ("Other" , OTHER), ("Other_Letter" , OTHER_LETTER), |
| 88 | ("Other_Number" , OTHER_NUMBER), ("Other_Punctuation" , OTHER_PUNCTUATION), |
| 89 | ("Other_Symbol" , OTHER_SYMBOL), |
| 90 | ("Paragraph_Separator" , PARAGRAPH_SEPARATOR), ("Private_Use" , PRIVATE_USE), |
| 91 | ("Punctuation" , PUNCTUATION), ("Separator" , SEPARATOR), |
| 92 | ("Space_Separator" , SPACE_SEPARATOR), ("Spacing_Mark" , SPACING_MARK), |
| 93 | ("Surrogate" , SURROGATE), ("Symbol" , SYMBOL), |
| 94 | ("Titlecase_Letter" , TITLECASE_LETTER), ("Unassigned" , UNASSIGNED), |
| 95 | ("Uppercase_Letter" , UPPERCASE_LETTER), |
| 96 | ]; |
| 97 | |
| 98 | mod script; |
| 99 | // Copy from script::BY_NAME |
| 100 | static SCRIPT_PROPERTY_NAMES = [ |
| 101 | ("Adlam" , ADLAM), |
| 102 | ("Ahom" , AHOM), |
| 103 | ("Anatolian_Hieroglyphs" , ANATOLIAN_HIEROGLYPHS), |
| 104 | ("Arabic" , ARABIC), |
| 105 | ("Armenian" , ARMENIAN), |
| 106 | ("Avestan" , AVESTAN), |
| 107 | ("Balinese" , BALINESE), |
| 108 | ("Bamum" , BAMUM), |
| 109 | ("Bassa_Vah" , BASSA_VAH), |
| 110 | ("Batak" , BATAK), |
| 111 | ("Bengali" , BENGALI), |
| 112 | ("Bhaiksuki" , BHAIKSUKI), |
| 113 | ("Bopomofo" , BOPOMOFO), |
| 114 | ("Brahmi" , BRAHMI), |
| 115 | ("Braille" , BRAILLE), |
| 116 | ("Buginese" , BUGINESE), |
| 117 | ("Buhid" , BUHID), |
| 118 | ("Canadian_Aboriginal" , CANADIAN_ABORIGINAL), |
| 119 | ("Carian" , CARIAN), |
| 120 | ("Caucasian_Albanian" , CAUCASIAN_ALBANIAN), |
| 121 | ("Chakma" , CHAKMA), |
| 122 | ("Cham" , CHAM), |
| 123 | ("Cherokee" , CHEROKEE), |
| 124 | ("Chorasmian" , CHORASMIAN), |
| 125 | ("Common" , COMMON), |
| 126 | ("Coptic" , COPTIC), |
| 127 | ("Cuneiform" , CUNEIFORM), |
| 128 | ("Cypriot" , CYPRIOT), |
| 129 | ("Cypro_Minoan" , CYPRO_MINOAN), |
| 130 | ("Cyrillic" , CYRILLIC), |
| 131 | ("Deseret" , DESERET), |
| 132 | ("Devanagari" , DEVANAGARI), |
| 133 | ("Dives_Akuru" , DIVES_AKURU), |
| 134 | ("Dogra" , DOGRA), |
| 135 | ("Duployan" , DUPLOYAN), |
| 136 | ("Egyptian_Hieroglyphs" , EGYPTIAN_HIEROGLYPHS), |
| 137 | ("Elbasan" , ELBASAN), |
| 138 | ("Elymaic" , ELYMAIC), |
| 139 | ("Ethiopic" , ETHIOPIC), |
| 140 | ("Georgian" , GEORGIAN), |
| 141 | ("Glagolitic" , GLAGOLITIC), |
| 142 | ("Gothic" , GOTHIC), |
| 143 | ("Grantha" , GRANTHA), |
| 144 | ("Greek" , GREEK), |
| 145 | ("Gujarati" , GUJARATI), |
| 146 | ("Gunjala_Gondi" , GUNJALA_GONDI), |
| 147 | ("Gurmukhi" , GURMUKHI), |
| 148 | ("Han" , HAN), |
| 149 | ("Hangul" , HANGUL), |
| 150 | ("Hanifi_Rohingya" , HANIFI_ROHINGYA), |
| 151 | ("Hanunoo" , HANUNOO), |
| 152 | ("Hatran" , HATRAN), |
| 153 | ("Hebrew" , HEBREW), |
| 154 | ("Hiragana" , HIRAGANA), |
| 155 | ("Imperial_Aramaic" , IMPERIAL_ARAMAIC), |
| 156 | ("Inherited" , INHERITED), |
| 157 | ("Inscriptional_Pahlavi" , INSCRIPTIONAL_PAHLAVI), |
| 158 | ("Inscriptional_Parthian" , INSCRIPTIONAL_PARTHIAN), |
| 159 | ("Javanese" , JAVANESE), |
| 160 | ("Kaithi" , KAITHI), |
| 161 | ("Kannada" , KANNADA), |
| 162 | ("Katakana" , KATAKANA), |
| 163 | ("Kawi" , KAWI), |
| 164 | ("Kayah_Li" , KAYAH_LI), |
| 165 | ("Kharoshthi" , KHAROSHTHI), |
| 166 | ("Khitan_Small_Script" , KHITAN_SMALL_SCRIPT), |
| 167 | ("Khmer" , KHMER), |
| 168 | ("Khojki" , KHOJKI), |
| 169 | ("Khudawadi" , KHUDAWADI), |
| 170 | ("Lao" , LAO), |
| 171 | ("Latin" , LATIN), |
| 172 | ("Lepcha" , LEPCHA), |
| 173 | ("Limbu" , LIMBU), |
| 174 | ("Linear_A" , LINEAR_A), |
| 175 | ("Linear_B" , LINEAR_B), |
| 176 | ("Lisu" , LISU), |
| 177 | ("Lycian" , LYCIAN), |
| 178 | ("Lydian" , LYDIAN), |
| 179 | ("Mahajani" , MAHAJANI), |
| 180 | ("Makasar" , MAKASAR), |
| 181 | ("Malayalam" , MALAYALAM), |
| 182 | ("Mandaic" , MANDAIC), |
| 183 | ("Manichaean" , MANICHAEAN), |
| 184 | ("Marchen" , MARCHEN), |
| 185 | ("Masaram_Gondi" , MASARAM_GONDI), |
| 186 | ("Medefaidrin" , MEDEFAIDRIN), |
| 187 | ("Meetei_Mayek" , MEETEI_MAYEK), |
| 188 | ("Mende_Kikakui" , MENDE_KIKAKUI), |
| 189 | ("Meroitic_Cursive" , MEROITIC_CURSIVE), |
| 190 | ("Meroitic_Hieroglyphs" , MEROITIC_HIEROGLYPHS), |
| 191 | ("Miao" , MIAO), |
| 192 | ("Modi" , MODI), |
| 193 | ("Mongolian" , MONGOLIAN), |
| 194 | ("Mro" , MRO), |
| 195 | ("Multani" , MULTANI), |
| 196 | ("Myanmar" , MYANMAR), |
| 197 | ("Nabataean" , NABATAEAN), |
| 198 | ("Nag_Mundari" , NAG_MUNDARI), |
| 199 | ("Nandinagari" , NANDINAGARI), |
| 200 | ("New_Tai_Lue" , NEW_TAI_LUE), |
| 201 | ("Newa" , NEWA), |
| 202 | ("Nko" , NKO), |
| 203 | ("Nushu" , NUSHU), |
| 204 | ("Nyiakeng_Puachue_Hmong" , NYIAKENG_PUACHUE_HMONG), |
| 205 | ("Ogham" , OGHAM), |
| 206 | ("Ol_Chiki" , OL_CHIKI), |
| 207 | ("Old_Hungarian" , OLD_HUNGARIAN), |
| 208 | ("Old_Italic" , OLD_ITALIC), |
| 209 | ("Old_North_Arabian" , OLD_NORTH_ARABIAN), |
| 210 | ("Old_Permic" , OLD_PERMIC), |
| 211 | ("Old_Persian" , OLD_PERSIAN), |
| 212 | ("Old_Sogdian" , OLD_SOGDIAN), |
| 213 | ("Old_South_Arabian" , OLD_SOUTH_ARABIAN), |
| 214 | ("Old_Turkic" , OLD_TURKIC), |
| 215 | ("Old_Uyghur" , OLD_UYGHUR), |
| 216 | ("Oriya" , ORIYA), |
| 217 | ("Osage" , OSAGE), |
| 218 | ("Osmanya" , OSMANYA), |
| 219 | ("Pahawh_Hmong" , PAHAWH_HMONG), |
| 220 | ("Palmyrene" , PALMYRENE), |
| 221 | ("Pau_Cin_Hau" , PAU_CIN_HAU), |
| 222 | ("Phags_Pa" , PHAGS_PA), |
| 223 | ("Phoenician" , PHOENICIAN), |
| 224 | ("Psalter_Pahlavi" , PSALTER_PAHLAVI), |
| 225 | ("Rejang" , REJANG), |
| 226 | ("Runic" , RUNIC), |
| 227 | ("Samaritan" , SAMARITAN), |
| 228 | ("Saurashtra" , SAURASHTRA), |
| 229 | ("Sharada" , SHARADA), |
| 230 | ("Shavian" , SHAVIAN), |
| 231 | ("Siddham" , SIDDHAM), |
| 232 | ("SignWriting" , SIGNWRITING), |
| 233 | ("Sinhala" , SINHALA), |
| 234 | ("Sogdian" , SOGDIAN), |
| 235 | ("Sora_Sompeng" , SORA_SOMPENG), |
| 236 | ("Soyombo" , SOYOMBO), |
| 237 | ("Sundanese" , SUNDANESE), |
| 238 | ("Syloti_Nagri" , SYLOTI_NAGRI), |
| 239 | ("Syriac" , SYRIAC), |
| 240 | ("Tagalog" , TAGALOG), |
| 241 | ("Tagbanwa" , TAGBANWA), |
| 242 | ("Tai_Le" , TAI_LE), |
| 243 | ("Tai_Tham" , TAI_THAM), |
| 244 | ("Tai_Viet" , TAI_VIET), |
| 245 | ("Takri" , TAKRI), |
| 246 | ("Tamil" , TAMIL), |
| 247 | ("Tangsa" , TANGSA), |
| 248 | ("Tangut" , TANGUT), |
| 249 | ("Telugu" , TELUGU), |
| 250 | ("Thaana" , THAANA), |
| 251 | ("Thai" , THAI), |
| 252 | ("Tibetan" , TIBETAN), |
| 253 | ("Tifinagh" , TIFINAGH), |
| 254 | ("Tirhuta" , TIRHUTA), |
| 255 | ("Toto" , TOTO), |
| 256 | ("Ugaritic" , UGARITIC), |
| 257 | ("Vai" , VAI), |
| 258 | ("Vithkuqi" , VITHKUQI), |
| 259 | ("Wancho" , WANCHO), |
| 260 | ("Warang_Citi" , WARANG_CITI), |
| 261 | ("Yezidi" , YEZIDI), |
| 262 | ("Yi" , YI), |
| 263 | ("Zanabazar_Square" , ZANABAZAR_SQUARE), |
| 264 | ]; |
| 265 | } |
| 266 | |
| 267 | /// Return all available unicode property names |
| 268 | pub fn unicode_property_names() -> Box<dyn Iterator<Item = &'static str>> { |
| 269 | Box::new( |
| 270 | BINARY_PROPERTY_NAMESimpl Iterator |
| 271 | .iter() |
| 272 | .map(|name: &&str| *name) |
| 273 | .chain(CATEGORY_PROPERTY_NAMES.iter().map(|name: &&str| *name)) |
| 274 | .chain(SCRIPT_PROPERTY_NAMES.iter().map(|name: &&str| *name)), |
| 275 | ) |
| 276 | } |
| 277 | |
| 278 | pub fn by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>> { |
| 279 | for property: &'static (&str, &TrieSetSlice<'_>) in binary::BY_NAME { |
| 280 | if name == property.0.to_uppercase() { |
| 281 | return Some(Box::new(move |c: char| property.1.contains_char(c))); |
| 282 | } |
| 283 | } |
| 284 | |
| 285 | for property: &'static (&str, &TrieSetSlice<'_>) in category::BY_NAME { |
| 286 | if name == property.0.to_uppercase() { |
| 287 | return Some(Box::new(move |c: char| property.1.contains_char(c))); |
| 288 | } |
| 289 | } |
| 290 | |
| 291 | for property: &'static (&str, &TrieSetSlice<'_>) in script::BY_NAME { |
| 292 | if name == property.0.to_uppercase() { |
| 293 | return Some(Box::new(move |c: char| property.1.contains_char(c))); |
| 294 | } |
| 295 | } |
| 296 | |
| 297 | None |
| 298 | } |
| 299 | |