1use std::error;
2use std::fmt;
3use std::result;
4
5use crate::hir;
6
/// A type alias for results of Unicode class lookups in this module.
///
/// The error type is this module's [`Error`].
pub type Result<T> = result::Result<T, Error>;
9
/// A sequence of inclusive `(start, end)` codepoint ranges from a generated
/// file (hence the static lifetime).
type Range = &'static [(char, char)];
13
/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be resolved, or the data needed to
    /// resolve it is not compiled in.
    PropertyNotFound,
    /// The property was resolved, but the requested value for it could not
    /// be, or the property value tables are not compiled in.
    PropertyValueNotFound,
    /// The data for a Perl character class (`\w`, `\s` or `\d`) is not
    /// compiled in.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
26
/// A type alias for results of Unicode simple case folding operations.
///
/// The error type is [`CaseFoldError`].
pub type FoldResult<T> = result::Result<T, CaseFoldError>;
29
/// The error reported when Unicode-aware simple case folding cannot be
/// performed.
///
/// Case folding needs the Unicode case mapping tables, which are compiled in
/// only when the `unicode-case` feature is enabled. (That feature is enabled
/// by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

impl error::Error for CaseFoldError {}

impl fmt::Display for CaseFoldError {
    /// Writes a fixed, human-readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The message has no interpolated parts, so it is written directly.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
49
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// The Perl character class `\w` needs Unicode data tables that are compiled
/// in only when the `unicode-perl` feature is enabled. (That feature is
/// enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl error::Error for UnicodeWordError {}

impl fmt::Display for UnicodeWordError {
    /// Writes a fixed, human-readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The message has no interpolated parts, so it is written directly.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
69
70/// Return an iterator over the equivalence class of simple case mappings
71/// for the given codepoint. The equivalence class does not include the
72/// given codepoint.
73///
74/// If the equivalence class is empty, then this returns the next scalar
75/// value that has a non-empty equivalence class, if it exists. If no such
76/// scalar value exists, then `None` is returned. The point of this behavior
77/// is to permit callers to avoid calling `simple_fold` more than they need
78/// to, since there is some cost to fetching the equivalence class.
79///
80/// This returns an error if the Unicode case folding tables are not available.
81pub fn simple_fold(
82 c: char,
83) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> {
84 #[cfg(not(feature = "unicode-case"))]
85 fn imp(
86 _: char,
87 ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
88 {
89 use std::option::IntoIter;
90 Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(()))
91 }
92
93 #[cfg(feature = "unicode-case")]
94 fn imp(
95 c: char,
96 ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
97 {
98 use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
99
100 Ok(CASE_FOLDING_SIMPLE
101 .binary_search_by_key(&c, |&(c1, _)| c1)
102 .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied())
103 .map_err(|i| {
104 if i >= CASE_FOLDING_SIMPLE.len() {
105 None
106 } else {
107 Some(CASE_FOLDING_SIMPLE[i].0)
108 }
109 }))
110 }
111
112 imp(c)
113}
114
115/// Returns true if and only if the given (inclusive) range contains at least
116/// one Unicode scalar value that has a non-empty non-trivial simple case
117/// mapping.
118///
119/// This function panics if `end < start`.
120///
121/// This returns an error if the Unicode case folding tables are not available.
122pub fn contains_simple_case_mapping(
123 start: char,
124 end: char,
125) -> FoldResult<bool> {
126 #[cfg(not(feature = "unicode-case"))]
127 fn imp(_: char, _: char) -> FoldResult<bool> {
128 Err(CaseFoldError(()))
129 }
130
131 #[cfg(feature = "unicode-case")]
132 fn imp(start: char, end: char) -> FoldResult<bool> {
133 use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
134 use std::cmp::Ordering;
135
136 assert!(start <= end);
137 Ok(CASE_FOLDING_SIMPLE
138 .binary_search_by(|&(c, _)| {
139 if start <= c && c <= end {
140 Ordering::Equal
141 } else if c > end {
142 Ordering::Greater
143 } else {
144 Ordering::Less
145 }
146 })
147 .is_ok())
148 }
149
150 imp(start, end)
151}
152
/// A query for finding a character class defined by Unicode. This supports
/// either use of a property name directly, or lookup by property value. The
/// former generally refers to Binary properties (see UTS#44, Table 8), but
/// as a special exception (see UTS#18, Section 1.2) both general categories
/// (an enumeration) and scripts (a catalog) are supported as if each of their
/// possible values were a binary property.
///
/// In all circumstances, property names and values are normalized and
/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` refers to the shorter of the lifetimes of property name
/// and property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Return a class corresponding to a Unicode binary property, named by
    /// a single letter.
    ///
    /// The letter is resolved exactly like a `Binary` query whose name is
    /// that one character.
    OneLetter(char),
    /// Return a class corresponding to a Unicode binary property.
    ///
    /// Note that, by special exception (see UTS#18, Section 1.2), both
    /// general category values and script values are permitted here as if
    /// they were a binary property.
    Binary(&'a str),
    /// Return a class corresponding to all codepoints whose property
    /// (identified by `property_name`) corresponds to the given value
    /// (identified by `property_value`).
    ByValue {
        /// A property name.
        property_name: &'a str,
        /// A property value.
        property_value: &'a str,
    },
}
186
187impl<'a> ClassQuery<'a> {
188 fn canonicalize(&self) -> Result<CanonicalClassQuery> {
189 match *self {
190 ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
191 ClassQuery::Binary(name) => self.canonical_binary(name),
192 ClassQuery::ByValue { property_name, property_value } => {
193 let property_name = symbolic_name_normalize(property_name);
194 let property_value = symbolic_name_normalize(property_value);
195
196 let canon_name = match canonical_prop(&property_name)? {
197 None => return Err(Error::PropertyNotFound),
198 Some(canon_name) => canon_name,
199 };
200 Ok(match canon_name {
201 "General_Category" => {
202 let canon = match canonical_gencat(&property_value)? {
203 None => return Err(Error::PropertyValueNotFound),
204 Some(canon) => canon,
205 };
206 CanonicalClassQuery::GeneralCategory(canon)
207 }
208 "Script" => {
209 let canon = match canonical_script(&property_value)? {
210 None => return Err(Error::PropertyValueNotFound),
211 Some(canon) => canon,
212 };
213 CanonicalClassQuery::Script(canon)
214 }
215 _ => {
216 let vals = match property_values(canon_name)? {
217 None => return Err(Error::PropertyValueNotFound),
218 Some(vals) => vals,
219 };
220 let canon_val =
221 match canonical_value(vals, &property_value) {
222 None => {
223 return Err(Error::PropertyValueNotFound)
224 }
225 Some(canon_val) => canon_val,
226 };
227 CanonicalClassQuery::ByValue {
228 property_name: canon_name,
229 property_value: canon_val,
230 }
231 }
232 })
233 }
234 }
235 }
236
237 fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
238 let norm = symbolic_name_normalize(name);
239
240 // This is a special case where 'cf' refers to the 'Format' general
241 // category, but where the 'cf' abbreviation is also an abbreviation
242 // for the 'Case_Folding' property. But we want to treat it as
243 // a general category. (Currently, we don't even support the
244 // 'Case_Folding' property. But if we do in the future, users will be
245 // required to spell it out.)
246 if norm != "cf" {
247 if let Some(canon) = canonical_prop(&norm)? {
248 return Ok(CanonicalClassQuery::Binary(canon));
249 }
250 }
251 if let Some(canon) = canonical_gencat(&norm)? {
252 return Ok(CanonicalClassQuery::GeneralCategory(canon));
253 }
254 if let Some(canon) = canonical_script(&norm)? {
255 return Ok(CanonicalClassQuery::Script(canon));
256 }
257 Err(Error::PropertyNotFound)
258 }
259}
260
/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
///
/// All names are `&'static str` because they come from the generated name
/// tables (or from fixed literals), not from the user's input.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}
285
286/// Looks up a Unicode class given a query. If one doesn't exist, then
287/// `None` is returned.
288pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode> {
289 use self::CanonicalClassQuery::*;
290
291 match query.canonicalize()? {
292 Binary(name) => bool_property(name),
293 GeneralCategory(name) => gencat(name),
294 Script(name) => script(name),
295 ByValue { property_name: "Age", property_value } => {
296 let mut class = hir::ClassUnicode::empty();
297 for set in ages(property_value)? {
298 class.union(&hir_class(set));
299 }
300 Ok(class)
301 }
302 ByValue { property_name: "Script_Extensions", property_value } => {
303 script_extension(property_value)
304 }
305 ByValue {
306 property_name: "Grapheme_Cluster_Break",
307 property_value,
308 } => gcb(property_value),
309 ByValue { property_name: "Sentence_Break", property_value } => {
310 sb(property_value)
311 }
312 ByValue { property_name: "Word_Break", property_value } => {
313 wb(property_value)
314 }
315 _ => {
316 // What else should we support?
317 Err(Error::PropertyNotFound)
318 }
319 }
320}
321
322/// Returns a Unicode aware class for \w.
323///
324/// This returns an error if the data is not available for \w.
325pub fn perl_word() -> Result<hir::ClassUnicode> {
326 #[cfg(not(feature = "unicode-perl"))]
327 fn imp() -> Result<hir::ClassUnicode> {
328 Err(Error::PerlClassNotFound)
329 }
330
331 #[cfg(feature = "unicode-perl")]
332 fn imp() -> Result<hir::ClassUnicode> {
333 use crate::unicode_tables::perl_word::PERL_WORD;
334 Ok(hir_class(PERL_WORD))
335 }
336
337 imp()
338}
339
340/// Returns a Unicode aware class for \s.
341///
342/// This returns an error if the data is not available for \s.
343pub fn perl_space() -> Result<hir::ClassUnicode> {
344 #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
345 fn imp() -> Result<hir::ClassUnicode> {
346 Err(Error::PerlClassNotFound)
347 }
348
349 #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
350 fn imp() -> Result<hir::ClassUnicode> {
351 use crate::unicode_tables::perl_space::WHITE_SPACE;
352 Ok(hir_class(WHITE_SPACE))
353 }
354
355 #[cfg(feature = "unicode-bool")]
356 fn imp() -> Result<hir::ClassUnicode> {
357 use crate::unicode_tables::property_bool::WHITE_SPACE;
358 Ok(hir_class(WHITE_SPACE))
359 }
360
361 imp()
362}
363
364/// Returns a Unicode aware class for \d.
365///
366/// This returns an error if the data is not available for \d.
367pub fn perl_digit() -> Result<hir::ClassUnicode> {
368 #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
369 fn imp() -> Result<hir::ClassUnicode> {
370 Err(Error::PerlClassNotFound)
371 }
372
373 #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
374 fn imp() -> Result<hir::ClassUnicode> {
375 use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
376 Ok(hir_class(DECIMAL_NUMBER))
377 }
378
379 #[cfg(feature = "unicode-gencat")]
380 fn imp() -> Result<hir::ClassUnicode> {
381 use crate::unicode_tables::general_category::DECIMAL_NUMBER;
382 Ok(hir_class(DECIMAL_NUMBER))
383 }
384
385 imp()
386}
387
388/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
389pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
390 let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
391 .iter()
392 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
393 .collect();
394 hir::ClassUnicode::new(hir_ranges)
395}
396
397/// Returns true only if the given codepoint is in the `\w` character class.
398///
399/// If the `unicode-perl` feature is not enabled, then this returns an error.
400pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {
401 #[cfg(not(feature = "unicode-perl"))]
402 fn imp(_: char) -> result::Result<bool, UnicodeWordError> {
403 Err(UnicodeWordError(()))
404 }
405
406 #[cfg(feature = "unicode-perl")]
407 fn imp(c: char) -> result::Result<bool, UnicodeWordError> {
408 use crate::is_word_byte;
409 use crate::unicode_tables::perl_word::PERL_WORD;
410 use std::cmp::Ordering;
411
412 if c <= 0x7F as char && is_word_byte(c as u8) {
413 return Ok(true);
414 }
415 Ok(PERL_WORD
416 .binary_search_by(|&(start, end)| {
417 if start <= c && c <= end {
418 Ordering::Equal
419 } else if start > c {
420 Ordering::Greater
421 } else {
422 Ordering::Less
423 }
424 })
425 .is_ok())
426 }
427
428 imp(c)
429}
430
/// A mapping of property values for a specific property.
///
/// The first element of each tuple is a normalized property value while the
/// second element of each tuple is the corresponding canonical property
/// value.
///
/// Lookups in this module binary search on the first element, so the table
/// is presumably sorted by normalized value (the tables are generated
/// elsewhere; verify there).
type PropertyValues = &'static [(&'static str, &'static str)];
437
438fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> {
439 Ok(match normalized_value {
440 "any" => Some("Any"),
441 "assigned" => Some("Assigned"),
442 "ascii" => Some("ASCII"),
443 _ => {
444 let gencats = property_values("General_Category")?.unwrap();
445 canonical_value(gencats, normalized_value)
446 }
447 })
448}
449
450fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> {
451 let scripts = property_values("Script")?.unwrap();
452 Ok(canonical_value(scripts, normalized_value))
453}
454
455/// Find the canonical property name for the given normalized property name.
456///
457/// If no such property exists, then `None` is returned.
458///
459/// The normalized property name must have been normalized according to
460/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
461///
462/// If the property names data is not available, then an error is returned.
463fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
464 #[cfg(not(any(
465 feature = "unicode-age",
466 feature = "unicode-bool",
467 feature = "unicode-gencat",
468 feature = "unicode-perl",
469 feature = "unicode-script",
470 feature = "unicode-segment",
471 )))]
472 fn imp(_: &str) -> Result<Option<&'static str>> {
473 Err(Error::PropertyNotFound)
474 }
475
476 #[cfg(any(
477 feature = "unicode-age",
478 feature = "unicode-bool",
479 feature = "unicode-gencat",
480 feature = "unicode-perl",
481 feature = "unicode-script",
482 feature = "unicode-segment",
483 ))]
484 fn imp(name: &str) -> Result<Option<&'static str>> {
485 use crate::unicode_tables::property_names::PROPERTY_NAMES;
486
487 Ok(PROPERTY_NAMES
488 .binary_search_by_key(&name, |&(n, _)| n)
489 .ok()
490 .map(|i| PROPERTY_NAMES[i].1))
491 }
492
493 imp(normalized_name)
494}
495
496/// Find the canonical property value for the given normalized property
497/// value.
498///
499/// The given property values should correspond to the values for the property
500/// under question, which can be found using `property_values`.
501///
502/// If no such property value exists, then `None` is returned.
503///
504/// The normalized property value must have been normalized according to
505/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
506fn canonical_value(
507 vals: PropertyValues,
508 normalized_value: &str,
509) -> Option<&'static str> {
510 vals.binary_search_by_key(&normalized_value, |&(n, _)| n)
511 .ok()
512 .map(|i| vals[i].1)
513}
514
515/// Return the table of property values for the given property name.
516///
517/// If the property values data is not available, then an error is returned.
518fn property_values(
519 canonical_property_name: &'static str,
520) -> Result<Option<PropertyValues>> {
521 #[cfg(not(any(
522 feature = "unicode-age",
523 feature = "unicode-bool",
524 feature = "unicode-gencat",
525 feature = "unicode-perl",
526 feature = "unicode-script",
527 feature = "unicode-segment",
528 )))]
529 fn imp(_: &'static str) -> Result<Option<PropertyValues>> {
530 Err(Error::PropertyValueNotFound)
531 }
532
533 #[cfg(any(
534 feature = "unicode-age",
535 feature = "unicode-bool",
536 feature = "unicode-gencat",
537 feature = "unicode-perl",
538 feature = "unicode-script",
539 feature = "unicode-segment",
540 ))]
541 fn imp(name: &'static str) -> Result<Option<PropertyValues>> {
542 use crate::unicode_tables::property_values::PROPERTY_VALUES;
543
544 Ok(PROPERTY_VALUES
545 .binary_search_by_key(&name, |&(n, _)| n)
546 .ok()
547 .map(|i| PROPERTY_VALUES[i].1))
548 }
549
550 imp(canonical_property_name)
551}
552
553// This is only used in some cases, but small enough to just let it be dead
554// instead of figuring out (and maintaining) the right set of features.
555#[allow(dead_code)]
556fn property_set(
557 name_map: &'static [(&'static str, Range)],
558 canonical: &'static str,
559) -> Option<Range> {
560 name_map
561 .binary_search_by_key(&canonical, |x| x.0)
562 .ok()
563 .map(|i| name_map[i].1)
564}
565
566/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
567/// of codepoints that were added in a particular revision of Unicode. The
568/// iterator yields items in chronological order.
569///
570/// If the given age value isn't valid or if the data isn't available, then an
571/// error is returned instead.
572fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
573 #[cfg(not(feature = "unicode-age"))]
574 fn imp(_: &str) -> Result<impl Iterator<Item = Range>> {
575 use std::option::IntoIter;
576 Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
577 }
578
579 #[cfg(feature = "unicode-age")]
580 fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
581 use crate::unicode_tables::age;
582
583 const AGES: &[(&str, Range)] = &[
584 ("V1_1", age::V1_1),
585 ("V2_0", age::V2_0),
586 ("V2_1", age::V2_1),
587 ("V3_0", age::V3_0),
588 ("V3_1", age::V3_1),
589 ("V3_2", age::V3_2),
590 ("V4_0", age::V4_0),
591 ("V4_1", age::V4_1),
592 ("V5_0", age::V5_0),
593 ("V5_1", age::V5_1),
594 ("V5_2", age::V5_2),
595 ("V6_0", age::V6_0),
596 ("V6_1", age::V6_1),
597 ("V6_2", age::V6_2),
598 ("V6_3", age::V6_3),
599 ("V7_0", age::V7_0),
600 ("V8_0", age::V8_0),
601 ("V9_0", age::V9_0),
602 ("V10_0", age::V10_0),
603 ("V11_0", age::V11_0),
604 ("V12_0", age::V12_0),
605 ("V12_1", age::V12_1),
606 ("V13_0", age::V13_0),
607 ("V14_0", age::V14_0),
608 ("V15_0", age::V15_0),
609 ];
610 assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
611
612 let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
613 match pos {
614 None => Err(Error::PropertyValueNotFound),
615 Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)),
616 }
617 }
618
619 imp(canonical_age)
620}
621
622/// Returns the Unicode HIR class corresponding to the given general category.
623///
624/// Name canonicalization is assumed to be performed by the caller.
625///
626/// If the given general category could not be found, or if the general
627/// category data is not available, then an error is returned.
628fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
629 #[cfg(not(feature = "unicode-gencat"))]
630 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
631 Err(Error::PropertyNotFound)
632 }
633
634 #[cfg(feature = "unicode-gencat")]
635 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
636 use crate::unicode_tables::general_category::BY_NAME;
637 match name {
638 "ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
639 "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
640 "Assigned" => {
641 let mut cls = gencat("Unassigned")?;
642 cls.negate();
643 Ok(cls)
644 }
645 name => property_set(BY_NAME, name)
646 .map(hir_class)
647 .ok_or(Error::PropertyValueNotFound),
648 }
649 }
650
651 match canonical_name {
652 "Decimal_Number" => perl_digit(),
653 name => imp(name),
654 }
655}
656
657/// Returns the Unicode HIR class corresponding to the given script.
658///
659/// Name canonicalization is assumed to be performed by the caller.
660///
661/// If the given script could not be found, or if the script data is not
662/// available, then an error is returned.
663fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
664 #[cfg(not(feature = "unicode-script"))]
665 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
666 Err(Error::PropertyNotFound)
667 }
668
669 #[cfg(feature = "unicode-script")]
670 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
671 use crate::unicode_tables::script::BY_NAME;
672 property_set(BY_NAME, name)
673 .map(hir_class)
674 .ok_or(Error::PropertyValueNotFound)
675 }
676
677 imp(canonical_name)
678}
679
680/// Returns the Unicode HIR class corresponding to the given script extension.
681///
682/// Name canonicalization is assumed to be performed by the caller.
683///
684/// If the given script extension could not be found, or if the script data is
685/// not available, then an error is returned.
686fn script_extension(
687 canonical_name: &'static str,
688) -> Result<hir::ClassUnicode> {
689 #[cfg(not(feature = "unicode-script"))]
690 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
691 Err(Error::PropertyNotFound)
692 }
693
694 #[cfg(feature = "unicode-script")]
695 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
696 use crate::unicode_tables::script_extension::BY_NAME;
697 property_set(BY_NAME, name)
698 .map(hir_class)
699 .ok_or(Error::PropertyValueNotFound)
700 }
701
702 imp(canonical_name)
703}
704
705/// Returns the Unicode HIR class corresponding to the given Unicode boolean
706/// property.
707///
708/// Name canonicalization is assumed to be performed by the caller.
709///
710/// If the given boolean property could not be found, or if the boolean
711/// property data is not available, then an error is returned.
712fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
713 #[cfg(not(feature = "unicode-bool"))]
714 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
715 Err(Error::PropertyNotFound)
716 }
717
718 #[cfg(feature = "unicode-bool")]
719 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
720 use crate::unicode_tables::property_bool::BY_NAME;
721 property_set(BY_NAME, name)
722 .map(hir_class)
723 .ok_or(Error::PropertyNotFound)
724 }
725
726 match canonical_name {
727 "Decimal_Number" => perl_digit(),
728 "White_Space" => perl_space(),
729 name => imp(name),
730 }
731}
732
733/// Returns the Unicode HIR class corresponding to the given grapheme cluster
734/// break property.
735///
736/// Name canonicalization is assumed to be performed by the caller.
737///
738/// If the given property could not be found, or if the corresponding data is
739/// not available, then an error is returned.
740fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
741 #[cfg(not(feature = "unicode-segment"))]
742 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
743 Err(Error::PropertyNotFound)
744 }
745
746 #[cfg(feature = "unicode-segment")]
747 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
748 use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
749 property_set(BY_NAME, name)
750 .map(hir_class)
751 .ok_or(Error::PropertyValueNotFound)
752 }
753
754 imp(canonical_name)
755}
756
757/// Returns the Unicode HIR class corresponding to the given word break
758/// property.
759///
760/// Name canonicalization is assumed to be performed by the caller.
761///
762/// If the given property could not be found, or if the corresponding data is
763/// not available, then an error is returned.
764fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
765 #[cfg(not(feature = "unicode-segment"))]
766 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
767 Err(Error::PropertyNotFound)
768 }
769
770 #[cfg(feature = "unicode-segment")]
771 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
772 use crate::unicode_tables::word_break::BY_NAME;
773 property_set(BY_NAME, name)
774 .map(hir_class)
775 .ok_or(Error::PropertyValueNotFound)
776 }
777
778 imp(canonical_name)
779}
780
781/// Returns the Unicode HIR class corresponding to the given sentence
782/// break property.
783///
784/// Name canonicalization is assumed to be performed by the caller.
785///
786/// If the given property could not be found, or if the corresponding data is
787/// not available, then an error is returned.
788fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
789 #[cfg(not(feature = "unicode-segment"))]
790 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
791 Err(Error::PropertyNotFound)
792 }
793
794 #[cfg(feature = "unicode-segment")]
795 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
796 use crate::unicode_tables::sentence_break::BY_NAME;
797 property_set(BY_NAME, name)
798 .map(hir_class)
799 .ok_or(Error::PropertyValueNotFound)
800 }
801
802 imp(canonical_name)
803}
804
805/// Like symbolic_name_normalize_bytes, but operates on a string.
806fn symbolic_name_normalize(x: &str) -> String {
807 let mut tmp = x.as_bytes().to_vec();
808 let len = symbolic_name_normalize_bytes(&mut tmp).len();
809 tmp.truncate(len);
810 // This should always succeed because `symbolic_name_normalize_bytes`
811 // guarantees that `&tmp[..len]` is always valid UTF-8.
812 //
813 // N.B. We could avoid the additional UTF-8 check here, but it's unlikely
814 // to be worth skipping the additional safety check. A benchmark must
815 // justify it first.
816 String::from_utf8(tmp).unwrap()
817}
818
/// Normalize the given symbolic name in place according to UAX44-LM3.
///
/// A "symbolic name" is typically a property name or a property value
/// alias. This should not be applied to property string values.
///
/// Normalization drops a leading `is` (in any letter case), drops spaces,
/// underscores and hyphens, lowercases ASCII letters and discards every
/// non-ASCII byte. The result is written to the front of `slice` and
/// returned as a sub-slice, so it may be shorter than the input.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible
/// values of `slice`.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // No place in the standard was found that gives property names/aliases
    // a particular structure (unlike character names), so they are assumed
    // to be ASCII only and anything that isn't ASCII is dropped.

    // Ignore any "is" prefix.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let start = if starts_with_is { 2 } else { 0 };

    let mut next_write = 0;
    for read in start..slice.len() {
        // VALIDITY ARGUMENT: the output holds only ASCII bytes, because
        // every non-ASCII byte is skipped. That makes it valid UTF-8.
        let b = slice[read];
        match b {
            b' ' | b'_' | b'-' => {}
            _ if b.is_ascii() => {
                slice[next_write] = b.to_ascii_lowercase();
                next_write += 1;
            }
            _ => {}
        }
    }

    // Special case: ISO_Comment has a 'isc' abbreviation. Since 'is'
    // prefixes are generally ignored, 'isc' would be reduced to 'c', which
    // is actually an alias for the 'Other' general category. Restore it.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        next_write = 3;
    }
    &mut slice[..next_write]
}
873
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// U+212A KELVIN SIGN. It looks exactly like an ASCII `K` in most
    /// fonts, so it is always written as an escape in these tests.
    #[cfg(feature = "unicode-case")]
    const KELVIN_SIGN: char = '\u{212A}';

    /// Folds `c`, panicking unless it has a non-empty equivalence class.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Folds `c`, panicking unless it has no equivalence class. Returns the
    /// "next codepoint with a mapping" hint.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // 'k', 'K' and the Kelvin sign form one equivalence class. Each
        // lookup yields the other two members.
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok(KELVIN_SIGN).collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        // U+2603 SNOWMAN has no mapping; U+2C00 is the next codepoint that
        // does.
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn sym_normalize_edge_cases() {
        let sym_norm = symbolic_name_normalize;

        // Empty input, and input consisting only of the ignored prefix.
        assert_eq!(sym_norm(""), "");
        assert_eq!(sym_norm("is"), "");
        // A lone 'c' without the 'is' prefix must stay as it is.
        assert_eq!(sym_norm("c"), "c");
        // Non-ASCII characters are dropped entirely.
        assert_eq!(sym_norm("Gr\u{00E9}ek"), "grek");
    }

    #[test]
    fn valid_utf8_symbolic() {
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}
1002