1use alloc::{
2 string::{String, ToString},
3 vec::Vec,
4};
5
6use crate::hir;
7
/// A sequence of inclusive codepoint ranges, where each `(start, end)` pair
/// describes one range. The data comes from a generated file (hence the
/// static lifetime).
type Range = &'static [(char, char)];
11
/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested property name could not be resolved, or the data
    /// needed to look it up is not compiled in.
    PropertyNotFound,
    /// The property was resolved, but the requested value for it could not
    /// be, or the property value data is not compiled in.
    PropertyValueNotFound,
    /// The data backing a Perl class (`\w`, `\s` or `\d`) is not compiled
    /// in.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
24
/// The error reported when Unicode-aware simple case folding cannot be
/// performed.
///
/// It is produced when the case mapping tables that Unicode-aware case
/// folding relies on are not compiled in, which only happens when the
/// `unicode-case` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

#[cfg(feature = "std")]
impl std::error::Error for CaseFoldError {}

impl core::fmt::Display for CaseFoldError {
    /// Writes a fixed, human readable description of the failure.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The message has no interpolated arguments, so it is written
        // directly instead of going through the formatting machinery.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
45
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// It is produced when the data tables backing the Unicode-aware Perl
/// character class `\w` are not compiled in, which only happens when the
/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordError {}

impl core::fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable description of the failure.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The message has no interpolated arguments, so it is written
        // directly instead of going through the formatting machinery.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
66
67/// A state oriented traverser of the simple case folding table.
68///
69/// A case folder can be constructed via `SimpleCaseFolder::new()`, which will
70/// return an error if the underlying case folding table is unavailable.
71///
72/// After construction, it is expected that callers will use
73/// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly
74/// increasing order. For example, calling it on `b` and then on `a` is illegal
75/// and will result in a panic.
76///
77/// The main idea of this type is that it tries hard to make mapping lookups
78/// fast by exploiting the structure of the underlying table, and the ordering
79/// assumption enables this.
80#[derive(Debug)]
81pub struct SimpleCaseFolder {
82 /// The simple case fold table. It's a sorted association list, where the
83 /// keys are Unicode scalar values and the values are the corresponding
84 /// equivalence class (not including the key) of the "simple" case folded
85 /// Unicode scalar values.
86 table: &'static [(char, &'static [char])],
87 /// The last codepoint that was used for a lookup.
88 last: Option<char>,
89 /// The index to the entry in `table` corresponding to the smallest key `k`
90 /// such that `k > k0`, where `k0` is the most recent key lookup. Note that
91 /// in particular, `k0` may not be in the table!
92 next: usize,
93}
94
95impl SimpleCaseFolder {
96 /// Create a new simple case folder, returning an error if the underlying
97 /// case folding table is unavailable.
98 pub fn new() -> Result<SimpleCaseFolder, CaseFoldError> {
99 #[cfg(not(feature = "unicode-case"))]
100 {
101 Err(CaseFoldError(()))
102 }
103 #[cfg(feature = "unicode-case")]
104 {
105 Ok(SimpleCaseFolder {
106 table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE,
107 last: None,
108 next: 0,
109 })
110 }
111 }
112
113 /// Return the equivalence class of case folded codepoints for the given
114 /// codepoint. The equivalence class returned never includes the codepoint
115 /// given. If the given codepoint has no case folded codepoints (i.e.,
116 /// no entry in the underlying case folding table), then this returns an
117 /// empty slice.
118 ///
119 /// # Panics
120 ///
121 /// This panics when called with a `c` that is less than or equal to the
122 /// previous call. In other words, callers need to use this method with
123 /// strictly increasing values of `c`.
124 pub fn mapping(&mut self, c: char) -> &'static [char] {
125 if let Some(last) = self.last {
126 assert!(
127 last < c,
128 "got codepoint U+{:X} which occurs before \
129 last codepoint U+{:X}",
130 u32::from(c),
131 u32::from(last),
132 );
133 }
134 self.last = Some(c);
135 if self.next >= self.table.len() {
136 return &[];
137 }
138 let (k, v) = self.table[self.next];
139 if k == c {
140 self.next += 1;
141 return v;
142 }
143 match self.get(c) {
144 Err(i) => {
145 self.next = i;
146 &[]
147 }
148 Ok(i) => {
149 // Since we require lookups to proceed
150 // in order, anything we find should be
151 // after whatever we thought might be
152 // next. Otherwise, the caller is either
153 // going out of order or we would have
154 // found our next key at 'self.next'.
155 assert!(i > self.next);
156 self.next = i + 1;
157 self.table[i].1
158 }
159 }
160 }
161
162 /// Returns true if and only if the given range overlaps with any region
163 /// of the underlying case folding table. That is, when true, there exists
164 /// at least one codepoint in the inclusive range `[start, end]` that has
165 /// a non-trivial equivalence class of case folded codepoints. Conversely,
166 /// when this returns false, all codepoints in the range `[start, end]`
167 /// correspond to the trivial equivalence class of case folded codepoints,
168 /// i.e., itself.
169 ///
170 /// This is useful to call before iterating over the codepoints in the
171 /// range and looking up the mapping for each. If you know none of the
172 /// mappings will return anything, then you might be able to skip doing it
173 /// altogether.
174 ///
175 /// # Panics
176 ///
177 /// This panics when `end < start`.
178 pub fn overlaps(&self, start: char, end: char) -> bool {
179 use core::cmp::Ordering;
180
181 assert!(start <= end);
182 self.table
183 .binary_search_by(|&(c, _)| {
184 if start <= c && c <= end {
185 Ordering::Equal
186 } else if c > end {
187 Ordering::Greater
188 } else {
189 Ordering::Less
190 }
191 })
192 .is_ok()
193 }
194
195 /// Returns the index at which `c` occurs in the simple case fold table. If
196 /// `c` does not occur, then this returns an `i` such that `table[i-1].0 <
197 /// c` and `table[i].0 > c`.
198 fn get(&self, c: char) -> Result<usize, usize> {
199 self.table.binary_search_by_key(&c, |&(c1, _)| c1)
200 }
201}
202
/// A query for finding a character class defined by Unicode. This supports
/// either use of a property name directly, or lookup by property value. The
/// former generally refers to Binary properties (see UTS#44, Table 8), but
/// as a special exception (see UTS#18, Section 1.2) both general categories
/// (an enumeration) and scripts (a catalog) are supported as if each of their
/// possible values were a binary property.
///
/// In all circumstances, property names and values are normalized and
/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` refers to the shorter of the lifetimes of property name
/// and property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Return a class named by a single letter.
    ///
    /// The letter is resolved exactly like the name of a `Binary` query, so
    /// it may also name a general category or a script.
    OneLetter(char),
    /// Return a class corresponding to a Unicode binary property.
    ///
    /// Note that, by special exception (see UTS#18, Section 1.2), both
    /// general category values and script values are permitted here as if
    /// they were a binary property.
    Binary(&'a str),
    /// Return a class corresponding to all codepoints whose property
    /// (identified by `property_name`) corresponds to the given value
    /// (identified by `property_value`).
    ByValue {
        /// A property name.
        property_name: &'a str,
        /// A property value.
        property_value: &'a str,
    },
}
236
237impl<'a> ClassQuery<'a> {
238 fn canonicalize(&self) -> Result<CanonicalClassQuery, Error> {
239 match *self {
240 ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
241 ClassQuery::Binary(name) => self.canonical_binary(name),
242 ClassQuery::ByValue { property_name, property_value } => {
243 let property_name = symbolic_name_normalize(property_name);
244 let property_value = symbolic_name_normalize(property_value);
245
246 let canon_name = match canonical_prop(&property_name)? {
247 None => return Err(Error::PropertyNotFound),
248 Some(canon_name) => canon_name,
249 };
250 Ok(match canon_name {
251 "General_Category" => {
252 let canon = match canonical_gencat(&property_value)? {
253 None => return Err(Error::PropertyValueNotFound),
254 Some(canon) => canon,
255 };
256 CanonicalClassQuery::GeneralCategory(canon)
257 }
258 "Script" => {
259 let canon = match canonical_script(&property_value)? {
260 None => return Err(Error::PropertyValueNotFound),
261 Some(canon) => canon,
262 };
263 CanonicalClassQuery::Script(canon)
264 }
265 _ => {
266 let vals = match property_values(canon_name)? {
267 None => return Err(Error::PropertyValueNotFound),
268 Some(vals) => vals,
269 };
270 let canon_val =
271 match canonical_value(vals, &property_value) {
272 None => {
273 return Err(Error::PropertyValueNotFound)
274 }
275 Some(canon_val) => canon_val,
276 };
277 CanonicalClassQuery::ByValue {
278 property_name: canon_name,
279 property_value: canon_val,
280 }
281 }
282 })
283 }
284 }
285 }
286
287 fn canonical_binary(
288 &self,
289 name: &str,
290 ) -> Result<CanonicalClassQuery, Error> {
291 let norm = symbolic_name_normalize(name);
292
293 // This is a special case where 'cf' refers to the 'Format' general
294 // category, but where the 'cf' abbreviation is also an abbreviation
295 // for the 'Case_Folding' property. But we want to treat it as
296 // a general category. (Currently, we don't even support the
297 // 'Case_Folding' property. But if we do in the future, users will be
298 // required to spell it out.)
299 //
300 // Also 'sc' refers to the 'Currency_Symbol' general category, but is
301 // also the abbreviation for the 'Script' property. So we avoid calling
302 // 'canonical_prop' for it too, which would erroneously normalize it
303 // to 'Script'.
304 //
305 // Another case: 'lc' is an abbreviation for the 'Cased_Letter'
306 // general category, but is also an abbreviation for the 'Lowercase_Mapping'
307 // property. We don't currently support the latter, so as with 'cf'
308 // above, we treat 'lc' as 'Cased_Letter'.
309 if norm != "cf" && norm != "sc" && norm != "lc" {
310 if let Some(canon) = canonical_prop(&norm)? {
311 return Ok(CanonicalClassQuery::Binary(canon));
312 }
313 }
314 if let Some(canon) = canonical_gencat(&norm)? {
315 return Ok(CanonicalClassQuery::GeneralCategory(canon));
316 }
317 if let Some(canon) = canonical_script(&norm)? {
318 return Ok(CanonicalClassQuery::Script(canon));
319 }
320 Err(Error::PropertyNotFound)
321 }
322}
323
/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
///
/// Values of this type are produced by `ClassQuery::canonicalize` and
/// consumed by `class`.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}
348
349/// Looks up a Unicode class given a query. If one doesn't exist, then
350/// `None` is returned.
351pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode, Error> {
352 use self::CanonicalClassQuery::*;
353
354 match query.canonicalize()? {
355 Binary(name) => bool_property(name),
356 GeneralCategory(name) => gencat(name),
357 Script(name) => script(name),
358 ByValue { property_name: "Age", property_value } => {
359 let mut class = hir::ClassUnicode::empty();
360 for set in ages(property_value)? {
361 class.union(&hir_class(set));
362 }
363 Ok(class)
364 }
365 ByValue { property_name: "Script_Extensions", property_value } => {
366 script_extension(property_value)
367 }
368 ByValue {
369 property_name: "Grapheme_Cluster_Break",
370 property_value,
371 } => gcb(property_value),
372 ByValue { property_name: "Sentence_Break", property_value } => {
373 sb(property_value)
374 }
375 ByValue { property_name: "Word_Break", property_value } => {
376 wb(property_value)
377 }
378 _ => {
379 // What else should we support?
380 Err(Error::PropertyNotFound)
381 }
382 }
383}
384
385/// Returns a Unicode aware class for \w.
386///
387/// This returns an error if the data is not available for \w.
388pub fn perl_word() -> Result<hir::ClassUnicode, Error> {
389 #[cfg(not(feature = "unicode-perl"))]
390 fn imp() -> Result<hir::ClassUnicode, Error> {
391 Err(Error::PerlClassNotFound)
392 }
393
394 #[cfg(feature = "unicode-perl")]
395 fn imp() -> Result<hir::ClassUnicode, Error> {
396 use crate::unicode_tables::perl_word::PERL_WORD;
397 Ok(hir_class(PERL_WORD))
398 }
399
400 imp()
401}
402
403/// Returns a Unicode aware class for \s.
404///
405/// This returns an error if the data is not available for \s.
406pub fn perl_space() -> Result<hir::ClassUnicode, Error> {
407 #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
408 fn imp() -> Result<hir::ClassUnicode, Error> {
409 Err(Error::PerlClassNotFound)
410 }
411
412 #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
413 fn imp() -> Result<hir::ClassUnicode, Error> {
414 use crate::unicode_tables::perl_space::WHITE_SPACE;
415 Ok(hir_class(WHITE_SPACE))
416 }
417
418 #[cfg(feature = "unicode-bool")]
419 fn imp() -> Result<hir::ClassUnicode, Error> {
420 use crate::unicode_tables::property_bool::WHITE_SPACE;
421 Ok(hir_class(WHITE_SPACE))
422 }
423
424 imp()
425}
426
427/// Returns a Unicode aware class for \d.
428///
429/// This returns an error if the data is not available for \d.
430pub fn perl_digit() -> Result<hir::ClassUnicode, Error> {
431 #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
432 fn imp() -> Result<hir::ClassUnicode, Error> {
433 Err(Error::PerlClassNotFound)
434 }
435
436 #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
437 fn imp() -> Result<hir::ClassUnicode, Error> {
438 use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
439 Ok(hir_class(DECIMAL_NUMBER))
440 }
441
442 #[cfg(feature = "unicode-gencat")]
443 fn imp() -> Result<hir::ClassUnicode, Error> {
444 use crate::unicode_tables::general_category::DECIMAL_NUMBER;
445 Ok(hir_class(DECIMAL_NUMBER))
446 }
447
448 imp()
449}
450
451/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
452pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
453 let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
454 .iter()
455 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
456 .collect();
457 hir::ClassUnicode::new(hir_ranges)
458}
459
460/// Returns true only if the given codepoint is in the `\w` character class.
461///
462/// If the `unicode-perl` feature is not enabled, then this returns an error.
463pub fn is_word_character(c: char) -> Result<bool, UnicodeWordError> {
464 #[cfg(not(feature = "unicode-perl"))]
465 fn imp(_: char) -> Result<bool, UnicodeWordError> {
466 Err(UnicodeWordError(()))
467 }
468
469 #[cfg(feature = "unicode-perl")]
470 fn imp(c: char) -> Result<bool, UnicodeWordError> {
471 use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD};
472
473 if u8::try_from(c).map_or(false, is_word_byte) {
474 return Ok(true);
475 }
476 Ok(PERL_WORD
477 .binary_search_by(|&(start, end)| {
478 use core::cmp::Ordering;
479
480 if start <= c && c <= end {
481 Ordering::Equal
482 } else if start > c {
483 Ordering::Greater
484 } else {
485 Ordering::Less
486 }
487 })
488 .is_ok())
489 }
490
491 imp(c)
492}
493
/// A mapping of property values for a specific property.
///
/// The first element of each tuple is a normalized property value while the
/// second element of each tuple is the corresponding canonical property
/// value.
///
/// NOTE(review): `canonical_value` binary searches this table by its first
/// element, so the entries are presumably sorted by normalized value.
/// Confirm against the table generator.
type PropertyValues = &'static [(&'static str, &'static str)];
500
501fn canonical_gencat(
502 normalized_value: &str,
503) -> Result<Option<&'static str>, Error> {
504 Ok(match normalized_value {
505 "any" => Some("Any"),
506 "assigned" => Some("Assigned"),
507 "ascii" => Some("ASCII"),
508 _ => {
509 let gencats = property_values("General_Category")?.unwrap();
510 canonical_value(gencats, normalized_value)
511 }
512 })
513}
514
515fn canonical_script(
516 normalized_value: &str,
517) -> Result<Option<&'static str>, Error> {
518 let scripts = property_values("Script")?.unwrap();
519 Ok(canonical_value(scripts, normalized_value))
520}
521
522/// Find the canonical property name for the given normalized property name.
523///
524/// If no such property exists, then `None` is returned.
525///
526/// The normalized property name must have been normalized according to
527/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
528///
529/// If the property names data is not available, then an error is returned.
530fn canonical_prop(
531 normalized_name: &str,
532) -> Result<Option<&'static str>, Error> {
533 #[cfg(not(any(
534 feature = "unicode-age",
535 feature = "unicode-bool",
536 feature = "unicode-gencat",
537 feature = "unicode-perl",
538 feature = "unicode-script",
539 feature = "unicode-segment",
540 )))]
541 fn imp(_: &str) -> Result<Option<&'static str>, Error> {
542 Err(Error::PropertyNotFound)
543 }
544
545 #[cfg(any(
546 feature = "unicode-age",
547 feature = "unicode-bool",
548 feature = "unicode-gencat",
549 feature = "unicode-perl",
550 feature = "unicode-script",
551 feature = "unicode-segment",
552 ))]
553 fn imp(name: &str) -> Result<Option<&'static str>, Error> {
554 use crate::unicode_tables::property_names::PROPERTY_NAMES;
555
556 Ok(PROPERTY_NAMES
557 .binary_search_by_key(&name, |&(n, _)| n)
558 .ok()
559 .map(|i| PROPERTY_NAMES[i].1))
560 }
561
562 imp(normalized_name)
563}
564
565/// Find the canonical property value for the given normalized property
566/// value.
567///
568/// The given property values should correspond to the values for the property
569/// under question, which can be found using `property_values`.
570///
571/// If no such property value exists, then `None` is returned.
572///
573/// The normalized property value must have been normalized according to
574/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
575fn canonical_value(
576 vals: PropertyValues,
577 normalized_value: &str,
578) -> Option<&'static str> {
579 vals.binary_search_by_key(&normalized_value, |&(n, _)| n)
580 .ok()
581 .map(|i| vals[i].1)
582}
583
584/// Return the table of property values for the given property name.
585///
586/// If the property values data is not available, then an error is returned.
587fn property_values(
588 canonical_property_name: &'static str,
589) -> Result<Option<PropertyValues>, Error> {
590 #[cfg(not(any(
591 feature = "unicode-age",
592 feature = "unicode-bool",
593 feature = "unicode-gencat",
594 feature = "unicode-perl",
595 feature = "unicode-script",
596 feature = "unicode-segment",
597 )))]
598 fn imp(_: &'static str) -> Result<Option<PropertyValues>, Error> {
599 Err(Error::PropertyValueNotFound)
600 }
601
602 #[cfg(any(
603 feature = "unicode-age",
604 feature = "unicode-bool",
605 feature = "unicode-gencat",
606 feature = "unicode-perl",
607 feature = "unicode-script",
608 feature = "unicode-segment",
609 ))]
610 fn imp(name: &'static str) -> Result<Option<PropertyValues>, Error> {
611 use crate::unicode_tables::property_values::PROPERTY_VALUES;
612
613 Ok(PROPERTY_VALUES
614 .binary_search_by_key(&name, |&(n, _)| n)
615 .ok()
616 .map(|i| PROPERTY_VALUES[i].1))
617 }
618
619 imp(canonical_property_name)
620}
621
622// This is only used in some cases, but small enough to just let it be dead
623// instead of figuring out (and maintaining) the right set of features.
624#[allow(dead_code)]
625fn property_set(
626 name_map: &'static [(&'static str, Range)],
627 canonical: &'static str,
628) -> Option<Range> {
629 name_map
630 .binary_search_by_key(&canonical, |x| x.0)
631 .ok()
632 .map(|i| name_map[i].1)
633}
634
635/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
636/// of codepoints that were added in a particular revision of Unicode. The
637/// iterator yields items in chronological order.
638///
639/// If the given age value isn't valid or if the data isn't available, then an
640/// error is returned instead.
641fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
642 #[cfg(not(feature = "unicode-age"))]
643 fn imp(_: &str) -> Result<impl Iterator<Item = Range>, Error> {
644 use core::option::IntoIter;
645 Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
646 }
647
648 #[cfg(feature = "unicode-age")]
649 fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
650 use crate::unicode_tables::age;
651
652 const AGES: &[(&str, Range)] = &[
653 ("V1_1", age::V1_1),
654 ("V2_0", age::V2_0),
655 ("V2_1", age::V2_1),
656 ("V3_0", age::V3_0),
657 ("V3_1", age::V3_1),
658 ("V3_2", age::V3_2),
659 ("V4_0", age::V4_0),
660 ("V4_1", age::V4_1),
661 ("V5_0", age::V5_0),
662 ("V5_1", age::V5_1),
663 ("V5_2", age::V5_2),
664 ("V6_0", age::V6_0),
665 ("V6_1", age::V6_1),
666 ("V6_2", age::V6_2),
667 ("V6_3", age::V6_3),
668 ("V7_0", age::V7_0),
669 ("V8_0", age::V8_0),
670 ("V9_0", age::V9_0),
671 ("V10_0", age::V10_0),
672 ("V11_0", age::V11_0),
673 ("V12_0", age::V12_0),
674 ("V12_1", age::V12_1),
675 ("V13_0", age::V13_0),
676 ("V14_0", age::V14_0),
677 ("V15_0", age::V15_0),
678 ];
679 assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
680
681 let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
682 match pos {
683 None => Err(Error::PropertyValueNotFound),
684 Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)),
685 }
686 }
687
688 imp(canonical_age)
689}
690
691/// Returns the Unicode HIR class corresponding to the given general category.
692///
693/// Name canonicalization is assumed to be performed by the caller.
694///
695/// If the given general category could not be found, or if the general
696/// category data is not available, then an error is returned.
697fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
698 #[cfg(not(feature = "unicode-gencat"))]
699 fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
700 Err(Error::PropertyNotFound)
701 }
702
703 #[cfg(feature = "unicode-gencat")]
704 fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
705 use crate::unicode_tables::general_category::BY_NAME;
706 match name {
707 "ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
708 "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
709 "Assigned" => {
710 let mut cls = gencat("Unassigned")?;
711 cls.negate();
712 Ok(cls)
713 }
714 name => property_set(BY_NAME, name)
715 .map(hir_class)
716 .ok_or(Error::PropertyValueNotFound),
717 }
718 }
719
720 match canonical_name {
721 "Decimal_Number" => perl_digit(),
722 name => imp(name),
723 }
724}
725
726/// Returns the Unicode HIR class corresponding to the given script.
727///
728/// Name canonicalization is assumed to be performed by the caller.
729///
730/// If the given script could not be found, or if the script data is not
731/// available, then an error is returned.
732fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
733 #[cfg(not(feature = "unicode-script"))]
734 fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
735 Err(Error::PropertyNotFound)
736 }
737
738 #[cfg(feature = "unicode-script")]
739 fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
740 use crate::unicode_tables::script::BY_NAME;
741 property_set(BY_NAME, name)
742 .map(hir_class)
743 .ok_or(Error::PropertyValueNotFound)
744 }
745
746 imp(canonical_name)
747}
748
749/// Returns the Unicode HIR class corresponding to the given script extension.
750///
751/// Name canonicalization is assumed to be performed by the caller.
752///
753/// If the given script extension could not be found, or if the script data is
754/// not available, then an error is returned.
755fn script_extension(
756 canonical_name: &'static str,
757) -> Result<hir::ClassUnicode, Error> {
758 #[cfg(not(feature = "unicode-script"))]
759 fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
760 Err(Error::PropertyNotFound)
761 }
762
763 #[cfg(feature = "unicode-script")]
764 fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
765 use crate::unicode_tables::script_extension::BY_NAME;
766 property_set(BY_NAME, name)
767 .map(hir_class)
768 .ok_or(Error::PropertyValueNotFound)
769 }
770
771 imp(canonical_name)
772}
773
774/// Returns the Unicode HIR class corresponding to the given Unicode boolean
775/// property.
776///
777/// Name canonicalization is assumed to be performed by the caller.
778///
779/// If the given boolean property could not be found, or if the boolean
780/// property data is not available, then an error is returned.
781fn bool_property(
782 canonical_name: &'static str,
783) -> Result<hir::ClassUnicode, Error> {
784 #[cfg(not(feature = "unicode-bool"))]
785 fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
786 Err(Error::PropertyNotFound)
787 }
788
789 #[cfg(feature = "unicode-bool")]
790 fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
791 use crate::unicode_tables::property_bool::BY_NAME;
792 property_set(BY_NAME, name)
793 .map(hir_class)
794 .ok_or(Error::PropertyNotFound)
795 }
796
797 match canonical_name {
798 "Decimal_Number" => perl_digit(),
799 "White_Space" => perl_space(),
800 name => imp(name),
801 }
802}
803
804/// Returns the Unicode HIR class corresponding to the given grapheme cluster
805/// break property.
806///
807/// Name canonicalization is assumed to be performed by the caller.
808///
809/// If the given property could not be found, or if the corresponding data is
810/// not available, then an error is returned.
811fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
812 #[cfg(not(feature = "unicode-segment"))]
813 fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
814 Err(Error::PropertyNotFound)
815 }
816
817 #[cfg(feature = "unicode-segment")]
818 fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
819 use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
820 property_set(BY_NAME, name)
821 .map(hir_class)
822 .ok_or(Error::PropertyValueNotFound)
823 }
824
825 imp(canonical_name)
826}
827
828/// Returns the Unicode HIR class corresponding to the given word break
829/// property.
830///
831/// Name canonicalization is assumed to be performed by the caller.
832///
833/// If the given property could not be found, or if the corresponding data is
834/// not available, then an error is returned.
835fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
836 #[cfg(not(feature = "unicode-segment"))]
837 fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
838 Err(Error::PropertyNotFound)
839 }
840
841 #[cfg(feature = "unicode-segment")]
842 fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
843 use crate::unicode_tables::word_break::BY_NAME;
844 property_set(BY_NAME, name)
845 .map(hir_class)
846 .ok_or(Error::PropertyValueNotFound)
847 }
848
849 imp(canonical_name)
850}
851
852/// Returns the Unicode HIR class corresponding to the given sentence
853/// break property.
854///
855/// Name canonicalization is assumed to be performed by the caller.
856///
857/// If the given property could not be found, or if the corresponding data is
858/// not available, then an error is returned.
859fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
860 #[cfg(not(feature = "unicode-segment"))]
861 fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
862 Err(Error::PropertyNotFound)
863 }
864
865 #[cfg(feature = "unicode-segment")]
866 fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
867 use crate::unicode_tables::sentence_break::BY_NAME;
868 property_set(BY_NAME, name)
869 .map(hir_class)
870 .ok_or(Error::PropertyValueNotFound)
871 }
872
873 imp(canonical_name)
874}
875
876/// Like symbolic_name_normalize_bytes, but operates on a string.
877fn symbolic_name_normalize(x: &str) -> String {
878 let mut tmp = x.as_bytes().to_vec();
879 let len = symbolic_name_normalize_bytes(&mut tmp).len();
880 tmp.truncate(len);
881 // This should always succeed because `symbolic_name_normalize_bytes`
882 // guarantees that `&tmp[..len]` is always valid UTF-8.
883 //
884 // N.B. We could avoid the additional UTF-8 check here, but it's unlikely
885 // to be worth skipping the additional safety check. A benchmark must
886 // justify it first.
887 String::from_utf8(tmp).unwrap()
888}
889
/// Normalize the given symbolic name in place according to UAX44-LM3 and
/// return the normalized prefix of `slice`.
///
/// A "symbolic name" is typically a property name or a property value
/// alias. This should not be applied to property string values.
///
/// Normalization drops a leading `is` (in any case), drops spaces,
/// underscores and hyphens, lowercases ASCII letters and discards every
/// non-ASCII byte. The returned slice is therefore valid UTF-8 for all
/// possible values of `slice`.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // I couldn't find a place in the standard that specified that property
    // names/aliases had a particular structure (unlike character names), but
    // we assume that it's ASCII only and drop anything that isn't ASCII.

    // Ignore any "is" prefix, regardless of its case.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let start = if starts_with_is { 2 } else { 0 };

    let mut next_write = 0;
    for i in start..slice.len() {
        // VALIDITY ARGUMENT: only ASCII bytes are ever written back, which
        // guarantees that the resulting slice is valid UTF-8.
        let b = slice[i];
        match b {
            // Separators are dropped.
            b' ' | b'_' | b'-' => {}
            // ASCII is kept, with upper case letters lowered.
            _ if b.is_ascii() => {
                slice[next_write] = b.to_ascii_lowercase();
                next_write += 1;
            }
            // Non-ASCII bytes are dropped.
            _ => {}
        }
    }
    // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally
    // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross
    // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it
    // is actually an alias for the 'Other' general category.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        next_write = 3;
    }
    &mut slice[..next_write]
}
944
#[cfg(test)]
mod tests {
    use super::*;

    /// U+212A KELVIN SIGN. It shares a simple case folding class with `K`
    /// and `k`, but looks just like ASCII `K` in source code, so it is
    /// always spelled with an escape here.
    #[cfg(feature = "unicode-case")]
    const KELVIN_SIGN: char = '\u{212A}';

    /// Looks up the case folding class of `c` with a fresh folder.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        SimpleCaseFolder::new().unwrap().mapping(c).iter().copied()
    }

    /// Reports whether `[start, end]` contains a codepoint with a case
    /// mapping.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        SimpleCaseFolder::new().unwrap().overlaps(start, end)
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, alloc::vec!['K', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, alloc::vec!['k', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok(KELVIN_SIGN).collect();
        assert_eq!(xs, alloc::vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, alloc::vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, alloc::vec!['a']);
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(SimpleCaseFolder::new().is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The lone 0xFF byte is not ASCII and must be dropped.
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}
1040