1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | //! 🚧 \[Experimental\] This module is experimental and currently crate-private. Let us know if you |
6 | //! have a use case for this! |
7 | //! |
8 | //! This module contains utilities for working with properties where the specific property in use |
9 | //! is not known at compile time. |
10 | //! |
11 | //! For regex engines, [`crate::sets::load_for_ecma262_unstable()`] is a convenient API for working |
12 | //! with properties at runtime tailored for the use case of ECMA262-compatible regex engines. |
13 | |
14 | #[cfg (doc)] |
15 | use super::{maps, script, GeneralCategory, GeneralCategoryGroup, Script}; |
16 | |
/// A runtime identifier for a Unicode property.
///
/// Use this when the property to be queried is only known at runtime rather than
/// at compile time, as is the case in regex engines, for example.
///
/// This is a newtype over the raw `u32` property code. The codes are intended to be
/// identical to the values of ICU4C's `UProperty` enum.
#[allow(clippy::exhaustive_structs)] // newtype
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct UnicodeProperty(pub u32);
26 | |
27 | #[allow (non_upper_case_globals)] |
28 | #[allow (unused)] // experimental, may be made public later |
29 | impl UnicodeProperty { |
30 | /// Binary property `Alphabetic` |
31 | pub const Alphabetic: Self = UnicodeProperty(0); |
32 | /// Binary property `ASCII_Hex_Digit` |
33 | pub const AsciiHexDigit: Self = UnicodeProperty(1); |
34 | /// Binary property `Bidi_Control` |
35 | pub const BidiControl: Self = UnicodeProperty(2); |
36 | /// Binary property `Bidi_Mirrored` |
37 | pub const BidiMirrored: Self = UnicodeProperty(3); |
38 | /// Binary property `Dash` |
39 | pub const Dash: Self = UnicodeProperty(4); |
40 | /// Binary property `Default_Ignorable_Code_Point` |
41 | pub const DefaultIgnorableCodePoint: Self = UnicodeProperty(5); |
42 | /// Binary property `Deprecated` |
43 | pub const Deprecated: Self = UnicodeProperty(6); |
44 | /// Binary property `Diacritic` |
45 | pub const Diacritic: Self = UnicodeProperty(7); |
46 | /// Binary property `Extender` |
47 | pub const Extender: Self = UnicodeProperty(8); |
48 | /// Binary property `Full_Composition_Exclusion` |
49 | pub const FullCompositionExclusion: Self = UnicodeProperty(9); |
50 | /// Binary property `Grapheme_Base` |
51 | pub const GraphemeBase: Self = UnicodeProperty(10); |
52 | /// Binary property `Grapheme_Extend` |
53 | pub const GraphemeExtend: Self = UnicodeProperty(11); |
54 | /// Binary property `Grapheme_Link` |
55 | pub const GraphemeLink: Self = UnicodeProperty(12); |
56 | /// Binary property `Hex_Digit` |
57 | pub const HexDigit: Self = UnicodeProperty(13); |
58 | /// Binary property `Hyphen` |
59 | pub const Hyphen: Self = UnicodeProperty(14); |
60 | /// Binary property `ID_Continue` |
61 | pub const IdContinue: Self = UnicodeProperty(15); |
62 | /// Binary property `ID_Start` |
63 | pub const IdStart: Self = UnicodeProperty(16); |
64 | /// Binary property `Ideographic` |
65 | pub const Ideographic: Self = UnicodeProperty(17); |
66 | /// Binary property `IDS_Binary_Operator` |
67 | pub const IdsBinaryOperator: Self = UnicodeProperty(18); |
68 | /// Binary property `IDS_Trinary_Operator` |
69 | pub const IdsTrinaryOperator: Self = UnicodeProperty(19); |
70 | /// Binary property `Join_Control` |
71 | pub const JoinControl: Self = UnicodeProperty(20); |
72 | /// Binary property `Logical_Order_Exception` |
73 | pub const LogicalOrderException: Self = UnicodeProperty(21); |
74 | /// Binary property `Lowercase` |
75 | pub const Lowercase: Self = UnicodeProperty(22); |
76 | /// Binary property `Math` |
77 | pub const Math: Self = UnicodeProperty(23); |
78 | /// Binary property `Noncharacter_Code_Point` |
79 | pub const NoncharacterCodePoint: Self = UnicodeProperty(24); |
80 | /// Binary property `Quotation_Mark` |
81 | pub const QuotationMark: Self = UnicodeProperty(25); |
82 | /// Binary property `Radical` |
83 | pub const Radical: Self = UnicodeProperty(26); |
84 | /// Binary property `Soft_Dotted` |
85 | pub const SoftDotted: Self = UnicodeProperty(27); |
86 | /// Binary property `Terminal_Punctuation` |
87 | pub const TerminalPunctuation: Self = UnicodeProperty(28); |
88 | /// Binary property `Unified_Ideograph` |
89 | pub const UnifiedIdeograph: Self = UnicodeProperty(29); |
90 | /// Binary property `Uppercase` |
91 | pub const Uppercase: Self = UnicodeProperty(30); |
92 | /// Binary property `White_Space` |
93 | pub const WhiteSpace: Self = UnicodeProperty(31); |
94 | /// Binary property `XID_Continue` |
95 | pub const XidContinue: Self = UnicodeProperty(32); |
96 | /// Binary property `XID_Start` |
97 | pub const XidStart: Self = UnicodeProperty(33); |
98 | /// Binary property `Case_Sensitive` |
99 | pub const CaseSensitive: Self = UnicodeProperty(34); |
100 | /// Binary property `Sentence_Terminal` |
101 | pub const SentenceTerminal: Self = UnicodeProperty(35); |
102 | /// Binary property `Variation_Selector` |
103 | pub const VariationSelector: Self = UnicodeProperty(36); |
104 | /// Binary property `NFD_Inert` |
105 | pub const NfdInert: Self = UnicodeProperty(37); |
106 | /// Binary property `NFKD_Inert` |
107 | pub const NfkdInert: Self = UnicodeProperty(38); |
108 | /// Binary property `NFC_Inert` |
109 | pub const NfcInert: Self = UnicodeProperty(39); |
110 | /// Binary property `NFKC_Inert` |
111 | pub const NfkcInert: Self = UnicodeProperty(40); |
112 | /// Binary property `Segment_Starter` |
113 | pub const SegmentStarter: Self = UnicodeProperty(41); |
114 | /// Binary property `Pattern_Syntax` |
115 | pub const PatternSyntax: Self = UnicodeProperty(42); |
116 | /// Binary property `Pattern_White_Space` |
117 | pub const PatternWhiteSpace: Self = UnicodeProperty(43); |
118 | /// Binary property `alnum` |
119 | pub const Alnum: Self = UnicodeProperty(44); |
120 | /// Binary property `blank` |
121 | pub const Blank: Self = UnicodeProperty(45); |
122 | /// Binary property `graph` |
123 | pub const Graph: Self = UnicodeProperty(46); |
124 | /// Binary property `print` |
125 | pub const Print: Self = UnicodeProperty(47); |
126 | /// Binary property `xdigit` |
127 | pub const XDigit: Self = UnicodeProperty(48); |
128 | /// Binary property `Cased` |
129 | pub const Cased: Self = UnicodeProperty(49); |
130 | /// Binary property `Case_Ignorable` |
131 | pub const CaseIgnorable: Self = UnicodeProperty(50); |
132 | /// Binary property `Changes_When_Lowercased` |
133 | pub const ChangesWhenLowercased: Self = UnicodeProperty(51); |
134 | /// Binary property `Changes_When_Uppercased` |
135 | pub const ChangesWhenUppercased: Self = UnicodeProperty(52); |
136 | /// Binary property `Changes_When_Titlecased` |
137 | pub const ChangesWhenTitlecased: Self = UnicodeProperty(53); |
138 | /// Binary property `Changes_When_Casefolded` |
139 | pub const ChangesWhenCasefolded: Self = UnicodeProperty(54); |
140 | /// Binary property `Changes_When_Casemapped` |
141 | pub const ChangesWhenCasemapped: Self = UnicodeProperty(55); |
142 | /// Binary property `Changes_When_NFKC_Casefolded` |
143 | pub const ChangesWhenNfkcCasefolded: Self = UnicodeProperty(56); |
144 | /// Binary property `Emoji` |
145 | pub const Emoji: Self = UnicodeProperty(57); |
146 | /// Binary property `Emoji_Presentation` |
147 | pub const EmojiPresentation: Self = UnicodeProperty(58); |
148 | /// Binary property `Emoji_Modifier` |
149 | pub const EmojiModifier: Self = UnicodeProperty(59); |
150 | /// Binary property `Emoji_Modifier_Base` |
151 | pub const EmojiModifierBase: Self = UnicodeProperty(60); |
152 | /// Binary property `Emoji_Component` |
153 | pub const EmojiComponent: Self = UnicodeProperty(61); |
154 | /// Binary property `Regional_Indicator` |
155 | pub const RegionalIndicator: Self = UnicodeProperty(62); |
156 | /// Binary property `Prepended_Concatenation_Mark` |
157 | pub const PrependedConcatenationMark: Self = UnicodeProperty(63); |
158 | /// Binary property `Extended_Pictographic` |
159 | pub const ExtendedPictographic: Self = UnicodeProperty(64); |
160 | /// Binary property `Basic_Emoji` |
161 | pub const BasicEmoji: Self = UnicodeProperty(65); |
162 | /// Binary property `Emoji_Keycap_Sequence` |
163 | pub const EmojiKeycapSequence: Self = UnicodeProperty(66); |
164 | /// Binary property `RGI_Emoji_Modifier_Sequence` |
165 | pub const RgiEmojiModifierSequence: Self = UnicodeProperty(67); |
166 | /// Binary property `RGI_Emoji_Flag_Sequence` |
167 | pub const RgiEmojiFlagSequence: Self = UnicodeProperty(68); |
168 | /// Binary property `RGI_Emoji_Tag_Sequence` |
169 | pub const RgiEmojiTagSequence: Self = UnicodeProperty(69); |
170 | /// Binary property `RGI_Emoji_ZWJ_Sequence` |
171 | pub const RgiEmojiZWJSequence: Self = UnicodeProperty(70); |
172 | /// Binary property `RGI_Emoji` |
173 | pub const RgiEmoji: Self = UnicodeProperty(71); |
174 | |
175 | const BINARY_MAX: Self = Self::RgiEmoji; |
176 | |
177 | /// Enumerated property `Bidi_Class` |
178 | pub const BidiClass: Self = UnicodeProperty(0x1000); |
179 | /// Enumerated property `Block` |
180 | pub const Block: Self = UnicodeProperty(0x1001); |
181 | /// Enumerated property `Canonical_Combining_Class` |
182 | pub const CombiningClass: Self = UnicodeProperty(0x1002); |
183 | /// Enumerated property `Decomposition_Type` |
184 | pub const DecompositionType: Self = UnicodeProperty(0x1003); |
185 | /// Enumerated property `East_Asian_Width` |
186 | pub const EastAsianWidth: Self = UnicodeProperty(0x1004); |
187 | /// Enumerated property `General_Category` |
188 | pub const GeneralCategory: Self = UnicodeProperty(0x1005); |
189 | /// Enumerated property `Joining_Group` |
190 | pub const JoiningGroup: Self = UnicodeProperty(0x1006); |
191 | /// Enumerated property `Joining_Type` |
192 | pub const JoiningType: Self = UnicodeProperty(0x1007); |
193 | /// Enumerated property `Line_Break` |
194 | pub const LineBreak: Self = UnicodeProperty(0x1008); |
195 | /// Enumerated property `Numeric_Type` |
196 | pub const NumericType: Self = UnicodeProperty(0x1009); |
197 | /// Enumerated property `Script` |
198 | pub const Script: Self = UnicodeProperty(0x100A); |
199 | /// Enumerated property `Hangul_Syllable_Type` |
200 | pub const HangulSyllableType: Self = UnicodeProperty(0x100B); |
201 | /// Enumerated property `NFD_Quick_Check` |
202 | pub const NFDQuickCheck: Self = UnicodeProperty(0x100C); |
203 | /// Enumerated property `NFKD_Quick_Check` |
204 | pub const NFKDQuickCheck: Self = UnicodeProperty(0x100D); |
205 | /// Enumerated property `NFC_Quick_Check` |
206 | pub const NFCQuickCheck: Self = UnicodeProperty(0x100E); |
207 | /// Enumerated property `NFKC_Quick_Check` |
208 | pub const NFKCQuickCheck: Self = UnicodeProperty(0x100F); |
209 | /// Enumerated property `Lead_Canonical_Combining_Class` |
210 | pub const LeadCanonicalCombiningClass: Self = UnicodeProperty(0x1010); |
211 | /// Enumerated property `Trail_Canonical_Combining_Class` |
212 | pub const TrailCanonicalCombiningClass: Self = UnicodeProperty(0x1011); |
213 | /// Enumerated property `Grapheme_Cluster_Break` |
214 | pub const GraphemeClusterBreak: Self = UnicodeProperty(0x1012); |
215 | /// Enumerated property `Sentence_Break` |
216 | pub const SentenceBreak: Self = UnicodeProperty(0x1013); |
217 | /// Enumerated property `Word_Break` |
218 | pub const WordBreak: Self = UnicodeProperty(0x1014); |
219 | /// Enumerated property `Bidi_Paired_Bracket_Type` |
220 | pub const BidiPairedBracketType: Self = UnicodeProperty(0x1015); |
221 | /// Enumerated property `Indic_Positional_Category` |
222 | pub const IndicPositionalCategory: Self = UnicodeProperty(0x1016); |
223 | /// Enumerated property `Indic_Syllabic_Category` |
224 | pub const IndicSyllabicCategory: Self = UnicodeProperty(0x1017); |
225 | /// Enumerated property `Vertical_Orientation` |
226 | pub const VerticalOrientation: Self = UnicodeProperty(0x1018); |
227 | |
228 | const ENUMERATED_MAX: Self = Self::VerticalOrientation; |
229 | |
230 | /// Mask property `General_Category_Mask` |
231 | pub const GeneralCategoryMask: Self = UnicodeProperty(0x2000); |
232 | |
233 | /// Double property `Numeric_Value` |
234 | pub const NumericValue: Self = UnicodeProperty(0x3000); |
235 | |
236 | /// String property `Age` |
237 | pub const Age: Self = UnicodeProperty(0x4000); |
238 | /// String property `Bidi_Mirroring_Glyph` |
239 | pub const BidiMirroringGlyph: Self = UnicodeProperty(0x4001); |
240 | /// String property `Case_Folding` |
241 | pub const CaseFolding: Self = UnicodeProperty(0x4002); |
242 | /// String property `ISO_Comment` |
243 | pub const ISOComment: Self = UnicodeProperty(0x4003); |
244 | /// String property `Lowercase_Mapping` |
245 | pub const LowercaseMapping: Self = UnicodeProperty(0x4004); |
246 | /// String property `Name` |
247 | pub const Name: Self = UnicodeProperty(0x4005); |
248 | /// String property `Simple_Case_Folding` |
249 | pub const SimpleCaseFolding: Self = UnicodeProperty(0x4006); |
250 | /// String property `Simple_Lowercase_Mapping` |
251 | pub const SimpleLowercaseMapping: Self = UnicodeProperty(0x4007); |
252 | /// String property `Simple_Titlecase_Mapping` |
253 | pub const SimpleTitlecaseMapping: Self = UnicodeProperty(0x4008); |
254 | /// String property `Simple_Uppercase_Mapping` |
255 | pub const SimpleUppercaseMapping: Self = UnicodeProperty(0x4009); |
256 | /// String property `Titlecase_Mapping` |
257 | pub const TitlecaseMapping: Self = UnicodeProperty(0x400A); |
258 | /// String property `Unicode_1_Name` |
259 | pub const Unicode1_Name: Self = UnicodeProperty(0x400B); |
260 | /// String property `Uppercase_Mapping` |
261 | pub const UppercaseMapping: Self = UnicodeProperty(0x400C); |
262 | /// String property `Bidi_Paired_Bracket` |
263 | pub const BidiPairedBracket: Self = UnicodeProperty(0x400D); |
264 | |
265 | const STRING_MAX: Self = Self::BidiPairedBracket; |
266 | |
267 | /// Misc property `Script_Extensions` |
268 | pub const ScriptExtensions: Self = UnicodeProperty(0x7000); |
269 | } |
270 | |
271 | #[allow (unused)] // experimental, may be made public later |
272 | impl UnicodeProperty { |
273 | /// Given a property name (long, short, or alias), returns the corresponding [`UnicodeProperty`] |
274 | /// value for it provided it belongs to the [subset relevant for ECMA262 regexes][subset] |
275 | /// |
276 | /// Returns none if the name does not match any of the names in this subset. Performs |
277 | /// strict matching of names. |
278 | /// |
279 | /// If using this to implement an ECMA262-compliant regex engine, please note these caveats: |
280 | /// |
281 | /// - This only returns binary and enumerated properties, as well as [`Self::ScriptExtensions`]. |
282 | /// Lookup can be performed sufficiently with [`Self::load_ecma262_binary_property_unstable()`], |
283 | /// [`maps::load_general_category()`], [`maps::load_script()`] and [`script::load_script_with_extensions_unstable()`]. |
284 | /// - This does not handle the `Any`, `Assigned`, or `ASCII` pseudoproperties, since they are not |
285 | /// defined as properties. |
286 | /// - `Any` can be expressed as the range `[\u{0}-\u{10FFFF}]` |
287 | /// - `Assigned` can be expressed as the inverse of the set `gc=Cn` (i.e., `\P{gc=Cn}`). |
288 | /// - `ASCII` can be expressed as the range `[\u{0}-\u{7F}]` |
289 | /// - ECMA262 regexes transparently allow `General_Category_Mask` values for `GeneralCategory`. |
290 | /// This method does not return [`Self::GeneralCategoryMask`], and instead relies on the caller to use mask-related lookup |
291 | /// functions where necessary. |
292 | /// - ECMA262 regexes allow treating `General_Category` (and `gcm`) values as binary properties, |
293 | /// e.g. you can do things like `\p{Lu}` as shortform for `\p{gc=Lu}`. This method does not do so |
294 | /// since these are property values, not properties, but you can use |
295 | /// [`GeneralCategory::get_name_to_enum_mapper()`] or [`GeneralCategoryGroup::get_name_to_enum_mapper()`] |
296 | /// to handle this. |
297 | /// |
298 | /// |
299 | /// [subset]: https://tc39.es/ecma262/#table-nonbinary-unicode-properties |
300 | pub fn parse_ecma262_name(name: &str) -> Option<Self> { |
301 | let prop = match name { |
302 | "General_Category" | "gc" => Self::GeneralCategory, |
303 | "Script" | "sc" => Self::Script, |
304 | "Script_Extensions" | "scx" => Self::ScriptExtensions, |
305 | "ASCII_Hex_Digit" | "AHex" => Self::AsciiHexDigit, |
306 | "Alphabetic" | "Alpha" => Self::Alphabetic, |
307 | "Bidi_Control" | "Bidi_C" => Self::BidiControl, |
308 | "Bidi_Mirrored" | "Bidi_M" => Self::BidiMirrored, |
309 | "Case_Ignorable" | "CI" => Self::CaseIgnorable, |
310 | "Cased" => Self::Cased, |
311 | "Changes_When_Casefolded" | "CWCF" => Self::ChangesWhenCasefolded, |
312 | "Changes_When_Casemapped" | "CWCM" => Self::ChangesWhenCasemapped, |
313 | "Changes_When_Lowercased" | "CWL" => Self::ChangesWhenLowercased, |
314 | "Changes_When_NFKC_Casefolded" | "CWKCF" => Self::ChangesWhenNfkcCasefolded, |
315 | "Changes_When_Titlecased" | "CWT" => Self::ChangesWhenTitlecased, |
316 | "Changes_When_Uppercased" | "CWU" => Self::ChangesWhenUppercased, |
317 | "Dash" => Self::Dash, |
318 | "Default_Ignorable_Code_Point" | "DI" => Self::DefaultIgnorableCodePoint, |
319 | "Deprecated" | "Dep" => Self::Deprecated, |
320 | "Diacritic" | "Dia" => Self::Diacritic, |
321 | "Emoji" => Self::Emoji, |
322 | "Emoji_Component" | "EComp" => Self::EmojiComponent, |
323 | "Emoji_Modifier" | "EMod" => Self::EmojiModifier, |
324 | "Emoji_Modifier_Base" | "EBase" => Self::EmojiModifierBase, |
325 | "Emoji_Presentation" | "EPres" => Self::EmojiPresentation, |
326 | "Extended_Pictographic" | "ExtPict" => Self::ExtendedPictographic, |
327 | "Extender" | "Ext" => Self::Extender, |
328 | "Grapheme_Base" | "Gr_Base" => Self::GraphemeBase, |
329 | "Grapheme_Extend" | "Gr_Ext" => Self::GraphemeExtend, |
330 | "Hex_Digit" | "Hex" => Self::HexDigit, |
331 | "IDS_Binary_Operator" | "IDSB" => Self::IdsBinaryOperator, |
332 | "IDS_Trinary_Operator" | "IDST" => Self::IdsTrinaryOperator, |
333 | "ID_Continue" | "IDC" => Self::IdContinue, |
334 | "ID_Start" | "IDS" => Self::IdStart, |
335 | "Ideographic" | "Ideo" => Self::Ideographic, |
336 | "Join_Control" | "Join_C" => Self::JoinControl, |
337 | "Logical_Order_Exception" | "LOE" => Self::LogicalOrderException, |
338 | "Lowercase" | "Lower" => Self::Lowercase, |
339 | "Math" => Self::Math, |
340 | "Noncharacter_Code_Point" | "NChar" => Self::NoncharacterCodePoint, |
341 | "Pattern_Syntax" | "Pat_Syn" => Self::PatternSyntax, |
342 | "Pattern_White_Space" | "Pat_WS" => Self::PatternWhiteSpace, |
343 | "Quotation_Mark" | "QMark" => Self::QuotationMark, |
344 | "Radical" => Self::Radical, |
345 | "Regional_Indicator" | "RI" => Self::RegionalIndicator, |
346 | "Sentence_Terminal" | "STerm" => Self::SentenceTerminal, |
347 | "Soft_Dotted" | "SD" => Self::SoftDotted, |
348 | "Terminal_Punctuation" | "Term" => Self::TerminalPunctuation, |
349 | "Unified_Ideograph" | "UIdeo" => Self::UnifiedIdeograph, |
350 | "Uppercase" | "Upper" => Self::Uppercase, |
351 | "Variation_Selector" | "VS" => Self::VariationSelector, |
352 | "White_Space" | "space" => Self::WhiteSpace, |
353 | "XID_Continue" | "XIDC" => Self::XidContinue, |
354 | "XID_Start" | "XIDS" => Self::XidStart, |
355 | _ => return None, |
356 | }; |
357 | |
358 | Some(prop) |
359 | } |
360 | } |
361 | |