1 | use alloc::{ |
2 | string::{String, ToString}, |
3 | vec::Vec, |
4 | }; |
5 | |
6 | use crate::hir; |
7 | |
/// A table of inclusive codepoint ranges, each written as a `(start, end)`
/// pair. The tables come from a generated file, which is why the lifetime
/// is `'static`.
type Range = &'static [(char, char)];
11 | |
/// The ways in which a Unicode lookup can fail.
///
/// The `Error` trait is intentionally not implemented for this type: its
/// values are always converted into other public errors, and the type itself
/// is not exported.
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be found.
    PropertyNotFound,
    /// The requested property value could not be found.
    PropertyValueNotFound,
    /// The data for a Perl character class could not be found.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
24 | |
/// The error reported when Unicode-aware simple case folding cannot be done.
///
/// Case folding needs the Unicode case mapping tables. Those tables are only
/// missing when the `unicode-case` feature is disabled. (The feature is
/// enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

#[cfg(feature = "std")]
impl std::error::Error for CaseFoldError {}

impl core::fmt::Display for CaseFoldError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The message has no format arguments, so write it out directly.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
45 | |
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// The Unicode-aware Perl class `\w` needs its data tables. Those tables are
/// only missing when the `unicode-perl` feature is disabled. (The feature is
/// enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordError {}

impl core::fmt::Display for UnicodeWordError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The message has no format arguments, so write it out directly.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
66 | |
67 | /// A state oriented traverser of the simple case folding table. |
68 | /// |
69 | /// A case folder can be constructed via `SimpleCaseFolder::new()`, which will |
70 | /// return an error if the underlying case folding table is unavailable. |
71 | /// |
72 | /// After construction, it is expected that callers will use |
73 | /// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly |
74 | /// increasing order. For example, calling it on `b` and then on `a` is illegal |
75 | /// and will result in a panic. |
76 | /// |
77 | /// The main idea of this type is that it tries hard to make mapping lookups |
78 | /// fast by exploiting the structure of the underlying table, and the ordering |
79 | /// assumption enables this. |
80 | #[derive(Debug)] |
81 | pub struct SimpleCaseFolder { |
82 | /// The simple case fold table. It's a sorted association list, where the |
83 | /// keys are Unicode scalar values and the values are the corresponding |
84 | /// equivalence class (not including the key) of the "simple" case folded |
85 | /// Unicode scalar values. |
86 | table: &'static [(char, &'static [char])], |
87 | /// The last codepoint that was used for a lookup. |
88 | last: Option<char>, |
89 | /// The index to the entry in `table` corresponding to the smallest key `k` |
90 | /// such that `k > k0`, where `k0` is the most recent key lookup. Note that |
91 | /// in particular, `k0` may not be in the table! |
92 | next: usize, |
93 | } |
94 | |
95 | impl SimpleCaseFolder { |
96 | /// Create a new simple case folder, returning an error if the underlying |
97 | /// case folding table is unavailable. |
98 | pub fn new() -> Result<SimpleCaseFolder, CaseFoldError> { |
99 | #[cfg (not(feature = "unicode-case" ))] |
100 | { |
101 | Err(CaseFoldError(())) |
102 | } |
103 | #[cfg (feature = "unicode-case" )] |
104 | { |
105 | Ok(SimpleCaseFolder { |
106 | table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE, |
107 | last: None, |
108 | next: 0, |
109 | }) |
110 | } |
111 | } |
112 | |
113 | /// Return the equivalence class of case folded codepoints for the given |
114 | /// codepoint. The equivalence class returned never includes the codepoint |
115 | /// given. If the given codepoint has no case folded codepoints (i.e., |
116 | /// no entry in the underlying case folding table), then this returns an |
117 | /// empty slice. |
118 | /// |
119 | /// # Panics |
120 | /// |
121 | /// This panics when called with a `c` that is less than or equal to the |
122 | /// previous call. In other words, callers need to use this method with |
123 | /// strictly increasing values of `c`. |
124 | pub fn mapping(&mut self, c: char) -> &'static [char] { |
125 | if let Some(last) = self.last { |
126 | assert!( |
127 | last < c, |
128 | "got codepoint U+{:X} which occurs before \ |
129 | last codepoint U+{:X}" , |
130 | u32::from(c), |
131 | u32::from(last), |
132 | ); |
133 | } |
134 | self.last = Some(c); |
135 | if self.next >= self.table.len() { |
136 | return &[]; |
137 | } |
138 | let (k, v) = self.table[self.next]; |
139 | if k == c { |
140 | self.next += 1; |
141 | return v; |
142 | } |
143 | match self.get(c) { |
144 | Err(i) => { |
145 | self.next = i; |
146 | &[] |
147 | } |
148 | Ok(i) => { |
149 | // Since we require lookups to proceed |
150 | // in order, anything we find should be |
151 | // after whatever we thought might be |
152 | // next. Otherwise, the caller is either |
153 | // going out of order or we would have |
154 | // found our next key at 'self.next'. |
155 | assert!(i > self.next); |
156 | self.next = i + 1; |
157 | self.table[i].1 |
158 | } |
159 | } |
160 | } |
161 | |
162 | /// Returns true if and only if the given range overlaps with any region |
163 | /// of the underlying case folding table. That is, when true, there exists |
164 | /// at least one codepoint in the inclusive range `[start, end]` that has |
165 | /// a non-trivial equivalence class of case folded codepoints. Conversely, |
166 | /// when this returns false, all codepoints in the range `[start, end]` |
167 | /// correspond to the trivial equivalence class of case folded codepoints, |
168 | /// i.e., itself. |
169 | /// |
170 | /// This is useful to call before iterating over the codepoints in the |
171 | /// range and looking up the mapping for each. If you know none of the |
172 | /// mappings will return anything, then you might be able to skip doing it |
173 | /// altogether. |
174 | /// |
175 | /// # Panics |
176 | /// |
177 | /// This panics when `end < start`. |
178 | pub fn overlaps(&self, start: char, end: char) -> bool { |
179 | use core::cmp::Ordering; |
180 | |
181 | assert!(start <= end); |
182 | self.table |
183 | .binary_search_by(|&(c, _)| { |
184 | if start <= c && c <= end { |
185 | Ordering::Equal |
186 | } else if c > end { |
187 | Ordering::Greater |
188 | } else { |
189 | Ordering::Less |
190 | } |
191 | }) |
192 | .is_ok() |
193 | } |
194 | |
195 | /// Returns the index at which `c` occurs in the simple case fold table. If |
196 | /// `c` does not occur, then this returns an `i` such that `table[i-1].0 < |
197 | /// c` and `table[i].0 > c`. |
198 | fn get(&self, c: char) -> Result<usize, usize> { |
199 | self.table.binary_search_by_key(&c, |&(c1, _)| c1) |
200 | } |
201 | } |
202 | |
/// A request for a character class defined by Unicode.
///
/// A class can be requested either by a property name alone or by a
/// property name together with a property value. The name-only form usually
/// refers to Binary properties (see UTS#44, Table 8). As a special exception
/// (see UTS#18, Section 1.2), general categories (an enumeration) and
/// scripts (a catalog) are also accepted, as though each of their values
/// were a binary property.
///
/// Property names and values are always normalized and canonicalized, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name
/// and the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Request the class of a Unicode binary property whose name is a single
    /// letter.
    OneLetter(char),
    /// Request the class of a Unicode binary property.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values may also be given here as though they were
    /// binary properties.
    Binary(&'a str),
    /// Request the class of all codepoints whose property (named by
    /// `property_name`) has the given value (named by `property_value`).
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value of the property.
        property_value: &'a str,
    },
}
236 | |
237 | impl<'a> ClassQuery<'a> { |
238 | fn canonicalize(&self) -> Result<CanonicalClassQuery, Error> { |
239 | match *self { |
240 | ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), |
241 | ClassQuery::Binary(name) => self.canonical_binary(name), |
242 | ClassQuery::ByValue { property_name, property_value } => { |
243 | let property_name = symbolic_name_normalize(property_name); |
244 | let property_value = symbolic_name_normalize(property_value); |
245 | |
246 | let canon_name = match canonical_prop(&property_name)? { |
247 | None => return Err(Error::PropertyNotFound), |
248 | Some(canon_name) => canon_name, |
249 | }; |
250 | Ok(match canon_name { |
251 | "General_Category" => { |
252 | let canon = match canonical_gencat(&property_value)? { |
253 | None => return Err(Error::PropertyValueNotFound), |
254 | Some(canon) => canon, |
255 | }; |
256 | CanonicalClassQuery::GeneralCategory(canon) |
257 | } |
258 | "Script" => { |
259 | let canon = match canonical_script(&property_value)? { |
260 | None => return Err(Error::PropertyValueNotFound), |
261 | Some(canon) => canon, |
262 | }; |
263 | CanonicalClassQuery::Script(canon) |
264 | } |
265 | _ => { |
266 | let vals = match property_values(canon_name)? { |
267 | None => return Err(Error::PropertyValueNotFound), |
268 | Some(vals) => vals, |
269 | }; |
270 | let canon_val = |
271 | match canonical_value(vals, &property_value) { |
272 | None => { |
273 | return Err(Error::PropertyValueNotFound) |
274 | } |
275 | Some(canon_val) => canon_val, |
276 | }; |
277 | CanonicalClassQuery::ByValue { |
278 | property_name: canon_name, |
279 | property_value: canon_val, |
280 | } |
281 | } |
282 | }) |
283 | } |
284 | } |
285 | } |
286 | |
287 | fn canonical_binary( |
288 | &self, |
289 | name: &str, |
290 | ) -> Result<CanonicalClassQuery, Error> { |
291 | let norm = symbolic_name_normalize(name); |
292 | |
293 | // This is a special case where 'cf' refers to the 'Format' general |
294 | // category, but where the 'cf' abbreviation is also an abbreviation |
295 | // for the 'Case_Folding' property. But we want to treat it as |
296 | // a general category. (Currently, we don't even support the |
297 | // 'Case_Folding' property. But if we do in the future, users will be |
298 | // required to spell it out.) |
299 | // |
300 | // Also 'sc' refers to the 'Currency_Symbol' general category, but is |
301 | // also the abbreviation for the 'Script' property. So we avoid calling |
302 | // 'canonical_prop' for it too, which would erroneously normalize it |
303 | // to 'Script'. |
304 | // |
305 | // Another case: 'lc' is an abbreviation for the 'Cased_Letter' |
306 | // general category, but is also an abbreviation for the 'Lowercase_Mapping' |
307 | // property. We don't currently support the latter, so as with 'cf' |
308 | // above, we treat 'lc' as 'Cased_Letter'. |
309 | if norm != "cf" && norm != "sc" && norm != "lc" { |
310 | if let Some(canon) = canonical_prop(&norm)? { |
311 | return Ok(CanonicalClassQuery::Binary(canon)); |
312 | } |
313 | } |
314 | if let Some(canon) = canonical_gencat(&norm)? { |
315 | return Ok(CanonicalClassQuery::GeneralCategory(canon)); |
316 | } |
317 | if let Some(canon) = canonical_script(&norm)? { |
318 | return Ok(CanonicalClassQuery::Script(canon)); |
319 | } |
320 | Err(Error::PropertyNotFound) |
321 | } |
322 | } |
323 | |
/// The canonicalized counterpart of `ClassQuery`. Unlike `ClassQuery`, it
/// keeps binary properties apart from the flattened general categories and
/// scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, by canonical name.
    Binary(&'static str),
    /// A general category, by canonical name.
    GeneralCategory(&'static str),
    /// A script, by canonical name.
    Script(&'static str),
    /// Any other pairing of a property with a value, both canonicalized.
    ///
    /// By construction, the property name here is never General_Category or
    /// Script. Those two are covered by the variants of the same name.
    ByValue {
        /// The canonical name of the property.
        property_name: &'static str,
        /// The canonical value of the property.
        property_value: &'static str,
    },
}
348 | |
349 | /// Looks up a Unicode class given a query. If one doesn't exist, then |
350 | /// `None` is returned. |
351 | pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode, Error> { |
352 | use self::CanonicalClassQuery::*; |
353 | |
354 | match query.canonicalize()? { |
355 | Binary(name) => bool_property(name), |
356 | GeneralCategory(name) => gencat(name), |
357 | Script(name) => script(name), |
358 | ByValue { property_name: "Age" , property_value } => { |
359 | let mut class = hir::ClassUnicode::empty(); |
360 | for set in ages(property_value)? { |
361 | class.union(&hir_class(set)); |
362 | } |
363 | Ok(class) |
364 | } |
365 | ByValue { property_name: "Script_Extensions" , property_value } => { |
366 | script_extension(property_value) |
367 | } |
368 | ByValue { |
369 | property_name: "Grapheme_Cluster_Break" , |
370 | property_value, |
371 | } => gcb(property_value), |
372 | ByValue { property_name: "Sentence_Break" , property_value } => { |
373 | sb(property_value) |
374 | } |
375 | ByValue { property_name: "Word_Break" , property_value } => { |
376 | wb(property_value) |
377 | } |
378 | _ => { |
379 | // What else should we support? |
380 | Err(Error::PropertyNotFound) |
381 | } |
382 | } |
383 | } |
384 | |
385 | /// Returns a Unicode aware class for \w. |
386 | /// |
387 | /// This returns an error if the data is not available for \w. |
388 | pub fn perl_word() -> Result<hir::ClassUnicode, Error> { |
389 | #[cfg (not(feature = "unicode-perl" ))] |
390 | fn imp() -> Result<hir::ClassUnicode, Error> { |
391 | Err(Error::PerlClassNotFound) |
392 | } |
393 | |
394 | #[cfg (feature = "unicode-perl" )] |
395 | fn imp() -> Result<hir::ClassUnicode, Error> { |
396 | use crate::unicode_tables::perl_word::PERL_WORD; |
397 | Ok(hir_class(PERL_WORD)) |
398 | } |
399 | |
400 | imp() |
401 | } |
402 | |
403 | /// Returns a Unicode aware class for \s. |
404 | /// |
405 | /// This returns an error if the data is not available for \s. |
406 | pub fn perl_space() -> Result<hir::ClassUnicode, Error> { |
407 | #[cfg (not(any(feature = "unicode-perl" , feature = "unicode-bool" )))] |
408 | fn imp() -> Result<hir::ClassUnicode, Error> { |
409 | Err(Error::PerlClassNotFound) |
410 | } |
411 | |
412 | #[cfg (all(feature = "unicode-perl" , not(feature = "unicode-bool" )))] |
413 | fn imp() -> Result<hir::ClassUnicode, Error> { |
414 | use crate::unicode_tables::perl_space::WHITE_SPACE; |
415 | Ok(hir_class(WHITE_SPACE)) |
416 | } |
417 | |
418 | #[cfg (feature = "unicode-bool" )] |
419 | fn imp() -> Result<hir::ClassUnicode, Error> { |
420 | use crate::unicode_tables::property_bool::WHITE_SPACE; |
421 | Ok(hir_class(WHITE_SPACE)) |
422 | } |
423 | |
424 | imp() |
425 | } |
426 | |
427 | /// Returns a Unicode aware class for \d. |
428 | /// |
429 | /// This returns an error if the data is not available for \d. |
430 | pub fn perl_digit() -> Result<hir::ClassUnicode, Error> { |
431 | #[cfg (not(any(feature = "unicode-perl" , feature = "unicode-gencat" )))] |
432 | fn imp() -> Result<hir::ClassUnicode, Error> { |
433 | Err(Error::PerlClassNotFound) |
434 | } |
435 | |
436 | #[cfg (all(feature = "unicode-perl" , not(feature = "unicode-gencat" )))] |
437 | fn imp() -> Result<hir::ClassUnicode, Error> { |
438 | use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; |
439 | Ok(hir_class(DECIMAL_NUMBER)) |
440 | } |
441 | |
442 | #[cfg (feature = "unicode-gencat" )] |
443 | fn imp() -> Result<hir::ClassUnicode, Error> { |
444 | use crate::unicode_tables::general_category::DECIMAL_NUMBER; |
445 | Ok(hir_class(DECIMAL_NUMBER)) |
446 | } |
447 | |
448 | imp() |
449 | } |
450 | |
451 | /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. |
452 | pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { |
453 | let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges |
454 | .iter() |
455 | .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) |
456 | .collect(); |
457 | hir::ClassUnicode::new(hir_ranges) |
458 | } |
459 | |
460 | /// Returns true only if the given codepoint is in the `\w` character class. |
461 | /// |
462 | /// If the `unicode-perl` feature is not enabled, then this returns an error. |
463 | pub fn is_word_character(c: char) -> Result<bool, UnicodeWordError> { |
464 | #[cfg (not(feature = "unicode-perl" ))] |
465 | fn imp(_: char) -> Result<bool, UnicodeWordError> { |
466 | Err(UnicodeWordError(())) |
467 | } |
468 | |
469 | #[cfg (feature = "unicode-perl" )] |
470 | fn imp(c: char) -> Result<bool, UnicodeWordError> { |
471 | use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD}; |
472 | |
473 | if u8::try_from(c).map_or(false, is_word_byte) { |
474 | return Ok(true); |
475 | } |
476 | Ok(PERL_WORD |
477 | .binary_search_by(|&(start, end)| { |
478 | use core::cmp::Ordering; |
479 | |
480 | if start <= c && c <= end { |
481 | Ordering::Equal |
482 | } else if start > c { |
483 | Ordering::Greater |
484 | } else { |
485 | Ordering::Less |
486 | } |
487 | }) |
488 | .is_ok()) |
489 | } |
490 | |
491 | imp(c) |
492 | } |
493 | |
/// The table of values belonging to one specific property.
///
/// Each entry pairs a normalized property value (first element) with its
/// canonical property value (second element).
type PropertyValues = &'static [(&'static str, &'static str)];
500 | |
501 | fn canonical_gencat( |
502 | normalized_value: &str, |
503 | ) -> Result<Option<&'static str>, Error> { |
504 | Ok(match normalized_value { |
505 | "any" => Some("Any" ), |
506 | "assigned" => Some("Assigned" ), |
507 | "ascii" => Some("ASCII" ), |
508 | _ => { |
509 | let gencats = property_values("General_Category" )?.unwrap(); |
510 | canonical_value(gencats, normalized_value) |
511 | } |
512 | }) |
513 | } |
514 | |
515 | fn canonical_script( |
516 | normalized_value: &str, |
517 | ) -> Result<Option<&'static str>, Error> { |
518 | let scripts = property_values("Script" )?.unwrap(); |
519 | Ok(canonical_value(scripts, normalized_value)) |
520 | } |
521 | |
522 | /// Find the canonical property name for the given normalized property name. |
523 | /// |
524 | /// If no such property exists, then `None` is returned. |
525 | /// |
526 | /// The normalized property name must have been normalized according to |
527 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
528 | /// |
529 | /// If the property names data is not available, then an error is returned. |
530 | fn canonical_prop( |
531 | normalized_name: &str, |
532 | ) -> Result<Option<&'static str>, Error> { |
533 | #[cfg (not(any( |
534 | feature = "unicode-age" , |
535 | feature = "unicode-bool" , |
536 | feature = "unicode-gencat" , |
537 | feature = "unicode-perl" , |
538 | feature = "unicode-script" , |
539 | feature = "unicode-segment" , |
540 | )))] |
541 | fn imp(_: &str) -> Result<Option<&'static str>, Error> { |
542 | Err(Error::PropertyNotFound) |
543 | } |
544 | |
545 | #[cfg (any( |
546 | feature = "unicode-age" , |
547 | feature = "unicode-bool" , |
548 | feature = "unicode-gencat" , |
549 | feature = "unicode-perl" , |
550 | feature = "unicode-script" , |
551 | feature = "unicode-segment" , |
552 | ))] |
553 | fn imp(name: &str) -> Result<Option<&'static str>, Error> { |
554 | use crate::unicode_tables::property_names::PROPERTY_NAMES; |
555 | |
556 | Ok(PROPERTY_NAMES |
557 | .binary_search_by_key(&name, |&(n, _)| n) |
558 | .ok() |
559 | .map(|i| PROPERTY_NAMES[i].1)) |
560 | } |
561 | |
562 | imp(normalized_name) |
563 | } |
564 | |
565 | /// Find the canonical property value for the given normalized property |
566 | /// value. |
567 | /// |
568 | /// The given property values should correspond to the values for the property |
569 | /// under question, which can be found using `property_values`. |
570 | /// |
571 | /// If no such property value exists, then `None` is returned. |
572 | /// |
573 | /// The normalized property value must have been normalized according to |
574 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
575 | fn canonical_value( |
576 | vals: PropertyValues, |
577 | normalized_value: &str, |
578 | ) -> Option<&'static str> { |
579 | vals.binary_search_by_key(&normalized_value, |&(n, _)| n) |
580 | .ok() |
581 | .map(|i| vals[i].1) |
582 | } |
583 | |
584 | /// Return the table of property values for the given property name. |
585 | /// |
586 | /// If the property values data is not available, then an error is returned. |
587 | fn property_values( |
588 | canonical_property_name: &'static str, |
589 | ) -> Result<Option<PropertyValues>, Error> { |
590 | #[cfg (not(any( |
591 | feature = "unicode-age" , |
592 | feature = "unicode-bool" , |
593 | feature = "unicode-gencat" , |
594 | feature = "unicode-perl" , |
595 | feature = "unicode-script" , |
596 | feature = "unicode-segment" , |
597 | )))] |
598 | fn imp(_: &'static str) -> Result<Option<PropertyValues>, Error> { |
599 | Err(Error::PropertyValueNotFound) |
600 | } |
601 | |
602 | #[cfg (any( |
603 | feature = "unicode-age" , |
604 | feature = "unicode-bool" , |
605 | feature = "unicode-gencat" , |
606 | feature = "unicode-perl" , |
607 | feature = "unicode-script" , |
608 | feature = "unicode-segment" , |
609 | ))] |
610 | fn imp(name: &'static str) -> Result<Option<PropertyValues>, Error> { |
611 | use crate::unicode_tables::property_values::PROPERTY_VALUES; |
612 | |
613 | Ok(PROPERTY_VALUES |
614 | .binary_search_by_key(&name, |&(n, _)| n) |
615 | .ok() |
616 | .map(|i| PROPERTY_VALUES[i].1)) |
617 | } |
618 | |
619 | imp(canonical_property_name) |
620 | } |
621 | |
622 | // This is only used in some cases, but small enough to just let it be dead |
623 | // instead of figuring out (and maintaining) the right set of features. |
624 | #[allow (dead_code)] |
625 | fn property_set( |
626 | name_map: &'static [(&'static str, Range)], |
627 | canonical: &'static str, |
628 | ) -> Option<Range> { |
629 | name_map |
630 | .binary_search_by_key(&canonical, |x| x.0) |
631 | .ok() |
632 | .map(|i| name_map[i].1) |
633 | } |
634 | |
635 | /// Returns an iterator over Unicode Age sets. Each item corresponds to a set |
636 | /// of codepoints that were added in a particular revision of Unicode. The |
637 | /// iterator yields items in chronological order. |
638 | /// |
639 | /// If the given age value isn't valid or if the data isn't available, then an |
640 | /// error is returned instead. |
641 | fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> { |
642 | #[cfg (not(feature = "unicode-age" ))] |
643 | fn imp(_: &str) -> Result<impl Iterator<Item = Range>, Error> { |
644 | use core::option::IntoIter; |
645 | Err::<IntoIter<Range>, _>(Error::PropertyNotFound) |
646 | } |
647 | |
648 | #[cfg (feature = "unicode-age" )] |
649 | fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> { |
650 | use crate::unicode_tables::age; |
651 | |
652 | const AGES: &[(&str, Range)] = &[ |
653 | ("V1_1" , age::V1_1), |
654 | ("V2_0" , age::V2_0), |
655 | ("V2_1" , age::V2_1), |
656 | ("V3_0" , age::V3_0), |
657 | ("V3_1" , age::V3_1), |
658 | ("V3_2" , age::V3_2), |
659 | ("V4_0" , age::V4_0), |
660 | ("V4_1" , age::V4_1), |
661 | ("V5_0" , age::V5_0), |
662 | ("V5_1" , age::V5_1), |
663 | ("V5_2" , age::V5_2), |
664 | ("V6_0" , age::V6_0), |
665 | ("V6_1" , age::V6_1), |
666 | ("V6_2" , age::V6_2), |
667 | ("V6_3" , age::V6_3), |
668 | ("V7_0" , age::V7_0), |
669 | ("V8_0" , age::V8_0), |
670 | ("V9_0" , age::V9_0), |
671 | ("V10_0" , age::V10_0), |
672 | ("V11_0" , age::V11_0), |
673 | ("V12_0" , age::V12_0), |
674 | ("V12_1" , age::V12_1), |
675 | ("V13_0" , age::V13_0), |
676 | ("V14_0" , age::V14_0), |
677 | ("V15_0" , age::V15_0), |
678 | ]; |
679 | assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync" ); |
680 | |
681 | let pos = AGES.iter().position(|&(age, _)| canonical_age == age); |
682 | match pos { |
683 | None => Err(Error::PropertyValueNotFound), |
684 | Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)), |
685 | } |
686 | } |
687 | |
688 | imp(canonical_age) |
689 | } |
690 | |
691 | /// Returns the Unicode HIR class corresponding to the given general category. |
692 | /// |
693 | /// Name canonicalization is assumed to be performed by the caller. |
694 | /// |
695 | /// If the given general category could not be found, or if the general |
696 | /// category data is not available, then an error is returned. |
697 | fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { |
698 | #[cfg (not(feature = "unicode-gencat" ))] |
699 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
700 | Err(Error::PropertyNotFound) |
701 | } |
702 | |
703 | #[cfg (feature = "unicode-gencat" )] |
704 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
705 | use crate::unicode_tables::general_category::BY_NAME; |
706 | match name { |
707 | "ASCII" => Ok(hir_class(&[(' \0' , ' \x7F' )])), |
708 | "Any" => Ok(hir_class(&[(' \0' , ' \u{10FFFF}' )])), |
709 | "Assigned" => { |
710 | let mut cls = gencat("Unassigned" )?; |
711 | cls.negate(); |
712 | Ok(cls) |
713 | } |
714 | name => property_set(BY_NAME, name) |
715 | .map(hir_class) |
716 | .ok_or(Error::PropertyValueNotFound), |
717 | } |
718 | } |
719 | |
720 | match canonical_name { |
721 | "Decimal_Number" => perl_digit(), |
722 | name => imp(name), |
723 | } |
724 | } |
725 | |
726 | /// Returns the Unicode HIR class corresponding to the given script. |
727 | /// |
728 | /// Name canonicalization is assumed to be performed by the caller. |
729 | /// |
730 | /// If the given script could not be found, or if the script data is not |
731 | /// available, then an error is returned. |
732 | fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { |
733 | #[cfg (not(feature = "unicode-script" ))] |
734 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
735 | Err(Error::PropertyNotFound) |
736 | } |
737 | |
738 | #[cfg (feature = "unicode-script" )] |
739 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
740 | use crate::unicode_tables::script::BY_NAME; |
741 | property_set(BY_NAME, name) |
742 | .map(hir_class) |
743 | .ok_or(Error::PropertyValueNotFound) |
744 | } |
745 | |
746 | imp(canonical_name) |
747 | } |
748 | |
749 | /// Returns the Unicode HIR class corresponding to the given script extension. |
750 | /// |
751 | /// Name canonicalization is assumed to be performed by the caller. |
752 | /// |
753 | /// If the given script extension could not be found, or if the script data is |
754 | /// not available, then an error is returned. |
755 | fn script_extension( |
756 | canonical_name: &'static str, |
757 | ) -> Result<hir::ClassUnicode, Error> { |
758 | #[cfg (not(feature = "unicode-script" ))] |
759 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
760 | Err(Error::PropertyNotFound) |
761 | } |
762 | |
763 | #[cfg (feature = "unicode-script" )] |
764 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
765 | use crate::unicode_tables::script_extension::BY_NAME; |
766 | property_set(BY_NAME, name) |
767 | .map(hir_class) |
768 | .ok_or(Error::PropertyValueNotFound) |
769 | } |
770 | |
771 | imp(canonical_name) |
772 | } |
773 | |
774 | /// Returns the Unicode HIR class corresponding to the given Unicode boolean |
775 | /// property. |
776 | /// |
777 | /// Name canonicalization is assumed to be performed by the caller. |
778 | /// |
779 | /// If the given boolean property could not be found, or if the boolean |
780 | /// property data is not available, then an error is returned. |
781 | fn bool_property( |
782 | canonical_name: &'static str, |
783 | ) -> Result<hir::ClassUnicode, Error> { |
784 | #[cfg (not(feature = "unicode-bool" ))] |
785 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
786 | Err(Error::PropertyNotFound) |
787 | } |
788 | |
789 | #[cfg (feature = "unicode-bool" )] |
790 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
791 | use crate::unicode_tables::property_bool::BY_NAME; |
792 | property_set(BY_NAME, name) |
793 | .map(hir_class) |
794 | .ok_or(Error::PropertyNotFound) |
795 | } |
796 | |
797 | match canonical_name { |
798 | "Decimal_Number" => perl_digit(), |
799 | "White_Space" => perl_space(), |
800 | name => imp(name), |
801 | } |
802 | } |
803 | |
804 | /// Returns the Unicode HIR class corresponding to the given grapheme cluster |
805 | /// break property. |
806 | /// |
807 | /// Name canonicalization is assumed to be performed by the caller. |
808 | /// |
809 | /// If the given property could not be found, or if the corresponding data is |
810 | /// not available, then an error is returned. |
811 | fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { |
812 | #[cfg (not(feature = "unicode-segment" ))] |
813 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
814 | Err(Error::PropertyNotFound) |
815 | } |
816 | |
817 | #[cfg (feature = "unicode-segment" )] |
818 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
819 | use crate::unicode_tables::grapheme_cluster_break::BY_NAME; |
820 | property_set(BY_NAME, name) |
821 | .map(hir_class) |
822 | .ok_or(Error::PropertyValueNotFound) |
823 | } |
824 | |
825 | imp(canonical_name) |
826 | } |
827 | |
828 | /// Returns the Unicode HIR class corresponding to the given word break |
829 | /// property. |
830 | /// |
831 | /// Name canonicalization is assumed to be performed by the caller. |
832 | /// |
833 | /// If the given property could not be found, or if the corresponding data is |
834 | /// not available, then an error is returned. |
835 | fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { |
836 | #[cfg (not(feature = "unicode-segment" ))] |
837 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
838 | Err(Error::PropertyNotFound) |
839 | } |
840 | |
841 | #[cfg (feature = "unicode-segment" )] |
842 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
843 | use crate::unicode_tables::word_break::BY_NAME; |
844 | property_set(BY_NAME, name) |
845 | .map(hir_class) |
846 | .ok_or(Error::PropertyValueNotFound) |
847 | } |
848 | |
849 | imp(canonical_name) |
850 | } |
851 | |
852 | /// Returns the Unicode HIR class corresponding to the given sentence |
853 | /// break property. |
854 | /// |
855 | /// Name canonicalization is assumed to be performed by the caller. |
856 | /// |
857 | /// If the given property could not be found, or if the corresponding data is |
858 | /// not available, then an error is returned. |
859 | fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { |
860 | #[cfg (not(feature = "unicode-segment" ))] |
861 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
862 | Err(Error::PropertyNotFound) |
863 | } |
864 | |
865 | #[cfg (feature = "unicode-segment" )] |
866 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
867 | use crate::unicode_tables::sentence_break::BY_NAME; |
868 | property_set(BY_NAME, name) |
869 | .map(hir_class) |
870 | .ok_or(Error::PropertyValueNotFound) |
871 | } |
872 | |
873 | imp(canonical_name) |
874 | } |
875 | |
876 | /// Like symbolic_name_normalize_bytes, but operates on a string. |
877 | fn symbolic_name_normalize(x: &str) -> String { |
878 | let mut tmp = x.as_bytes().to_vec(); |
879 | let len = symbolic_name_normalize_bytes(&mut tmp).len(); |
880 | tmp.truncate(len); |
881 | // This should always succeed because `symbolic_name_normalize_bytes` |
882 | // guarantees that `&tmp[..len]` is always valid UTF-8. |
883 | // |
884 | // N.B. We could avoid the additional UTF-8 check here, but it's unlikely |
885 | // to be worth skipping the additional safety check. A benchmark must |
886 | // justify it first. |
887 | String::from_utf8(tmp).unwrap() |
888 | } |
889 | |
/// Normalizes a symbolic name in place, following UAX44-LM3.
///
/// "Symbolic names" are things like property names and property value
/// aliases. This routine should not be applied to property string values.
///
/// The normalization performed is:
///
/// * a leading `is` (in any mix of ASCII case) is dropped, but only when it
///   makes up the first two bytes of the input;
/// * spaces, underscores and hyphens are dropped;
/// * ASCII uppercase letters are lowercased;
/// * every non-ASCII byte is dropped.
///
/// The normalized bytes are compacted to the front of `slice` and that prefix
/// is returned. Bytes past the returned prefix are left in an unspecified
/// state. The returned prefix is valid UTF-8 for every possible input.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // The standard does not appear to pin down the structure of property
    // names/aliases (unlike character names), so they are assumed to be
    // ASCII-only and anything else is discarded.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let first_read = if starts_with_is { 2 } else { 0 };

    let mut written = 0;
    for read in first_read..slice.len() {
        let byte = slice[read];
        match byte {
            // Separators carry no meaning under loose matching.
            b' ' | b'_' | b'-' => {}
            // VALIDITY ARGUMENT: only ASCII bytes are ever written back, so
            // the compacted prefix is always valid UTF-8.
            _ if byte.is_ascii() => {
                slice[written] = byte.to_ascii_lowercase();
                written += 1;
            }
            // Non-ASCII bytes are dropped.
            _ => {}
        }
    }

    // Special case: `isc` is the abbreviation of ISO_Comment. Stripping the
    // `is` prefix would turn it into `c`, which is an alias for the `Other`
    // general category instead. Put the prefix back so the two stay distinct.
    if starts_with_is && slice[..written] == b"c"[..] {
        // A two-byte prefix plus one surviving byte means the buffer holds at
        // least three bytes, so this cannot go out of bounds.
        slice[..3].copy_from_slice(b"isc");
        written = 3;
    }
    &mut slice[..written]
}
944 | |
#[cfg(test)]
mod tests {
    use super::*;

    /// Yields the simple case folding entries for `c`, panicking if the case
    /// folding tables are unavailable.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        SimpleCaseFolder::new()
            .unwrap()
            .mapping(c)
            .iter()
            .copied()
    }

    /// Reports whether the inclusive range `start..=end` overlaps any entry of
    /// the case folding table.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        SimpleCaseFolder::new()
            .unwrap()
            .overlaps(start, end)
    }

    /// 'k', 'K' and the Kelvin sign all fold to one another.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // U+212A KELVIN SIGN looks just like ASCII 'K', so it is spelled as
        // an escape to keep the expectations readable.
        const KELVIN: char = '\u{212A}';

        let cases = [
            ('k', ['K', KELVIN]),
            ('K', ['k', KELVIN]),
            (KELVIN, ['K', 'k']),
        ];
        for &(input, expected) in cases.iter() {
            let folded: Vec<char> = simple_fold_ok(input).collect();
            assert_eq!(folded, expected, "folding {:?}", input);
        }
    }

    /// Plain ASCII letters fold only to their opposite case.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let cases = [('a', ['A']), ('A', ['a'])];
        for &(input, expected) in cases.iter() {
            let folded: Vec<char> = simple_fold_ok(input).collect();
            assert_eq!(folded, expected, "folding {:?}", input);
        }
    }

    /// Constructing a folder must fail when the tables are compiled out.
    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        let folder = SimpleCaseFolder::new();
        assert!(folder.is_err());
    }

    /// Range overlap checks against the case folding table.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        let overlapping = [
            ('A', 'A'),
            ('Z', 'Z'),
            ('A', 'Z'),
            ('@', 'A'),
            ('Z', '['),
            ('☃', 'Ⰰ'),
        ];
        for &(start, end) in overlapping.iter() {
            assert!(
                contains_case_map(start, end),
                "expected {:?}..={:?} to overlap the case map",
                start,
                end
            );
        }

        let disjoint = [('[', '['), ('[', '`'), ('☃', '☃')];
        for &(start, end) in disjoint.iter() {
            assert!(
                !contains_case_map(start, end),
                "expected {:?}..={:?} to miss the case map",
                start,
                end
            );
        }
    }

    /// The one-letter class `C` must canonicalize to the `Other` general
    /// category.
    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let canonical = ClassQuery::OneLetter('C').canonicalize().unwrap();
        assert_eq!(canonical, CanonicalClassQuery::GeneralCategory("Other"));
    }

    /// Loose matching: case, separators and the `is` prefix are ignored,
    /// except that `isc` survives intact.
    #[test]
    fn sym_normalize() {
        let cases = [
            ("Line_Break", "linebreak"),
            ("Line-break", "linebreak"),
            ("linebreak", "linebreak"),
            ("BA", "ba"),
            ("ba", "ba"),
            ("Greek", "greek"),
            ("isGreek", "greek"),
            ("IS_Greek", "greek"),
            ("isc", "isc"),
            ("is c", "isc"),
            ("is_c", "isc"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(
                symbolic_name_normalize(input),
                expected,
                "normalizing {:?}",
                input
            );
        }
    }

    /// Invalid UTF-8 bytes in the input are dropped rather than passed on.
    #[test]
    fn valid_utf8_symbolic() {
        let mut raw = b"abc \xFFxyz".to_vec();
        let normalized = symbolic_name_normalize_bytes(&mut raw);
        assert_eq!(normalized, b"abcxyz");
    }
}
1040 | |