1 | use std::error; |
2 | use std::fmt; |
3 | use std::result; |
4 | |
5 | use crate::hir; |
6 | |
/// A type alias for results whose error is this module's `Error`, i.e. an
/// error specific to Unicode handling of classes.
pub type Result<T> = result::Result<T, Error>;
9 | |
/// A table of inclusive codepoint ranges, each stored as a `(start, end)`
/// pair. The tables come from a generated file (hence the static lifetime).
type Range = &'static [(char, char)];
13 | |
/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be resolved, or the data needed to
    /// resolve it is not compiled in.
    PropertyNotFound,
    /// The property was resolved but the requested value for it was not, or
    /// the data needed to resolve the value is not compiled in.
    PropertyValueNotFound,
    /// The data backing a Perl class (`\w`, `\s` or `\d`) is not compiled in.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
26 | |
/// A type alias for results whose error is `CaseFoldError`, i.e. an error
/// specific to Unicode case folding.
pub type FoldResult<T> = result::Result<T, CaseFoldError>;
29 | |
/// The error reported when Unicode-aware simple case folding cannot be done.
///
/// Case folding needs the Unicode case mapping tables. Those tables are only
/// missing when the `unicode-case` feature is disabled. (The feature is
/// enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

impl error::Error for CaseFoldError {}

impl fmt::Display for CaseFoldError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The message has no placeholders, so it is emitted verbatim.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
49 | |
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// The Unicode-aware Perl class `\w` needs its data tables. Those tables are
/// only missing when the `unicode-perl` feature is disabled. (The feature is
/// enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl error::Error for UnicodeWordError {}

impl fmt::Display for UnicodeWordError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The message has no placeholders, so it is emitted verbatim.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
69 | |
70 | /// Return an iterator over the equivalence class of simple case mappings |
71 | /// for the given codepoint. The equivalence class does not include the |
72 | /// given codepoint. |
73 | /// |
74 | /// If the equivalence class is empty, then this returns the next scalar |
75 | /// value that has a non-empty equivalence class, if it exists. If no such |
76 | /// scalar value exists, then `None` is returned. The point of this behavior |
77 | /// is to permit callers to avoid calling `simple_fold` more than they need |
78 | /// to, since there is some cost to fetching the equivalence class. |
79 | /// |
80 | /// This returns an error if the Unicode case folding tables are not available. |
81 | pub fn simple_fold( |
82 | c: char, |
83 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> { |
84 | #[cfg (not(feature = "unicode-case" ))] |
85 | fn imp( |
86 | _: char, |
87 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> |
88 | { |
89 | use std::option::IntoIter; |
90 | Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(())) |
91 | } |
92 | |
93 | #[cfg (feature = "unicode-case" )] |
94 | fn imp( |
95 | c: char, |
96 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> |
97 | { |
98 | use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; |
99 | |
100 | Ok(CASE_FOLDING_SIMPLE |
101 | .binary_search_by_key(&c, |&(c1, _)| c1) |
102 | .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied()) |
103 | .map_err(|i| { |
104 | if i >= CASE_FOLDING_SIMPLE.len() { |
105 | None |
106 | } else { |
107 | Some(CASE_FOLDING_SIMPLE[i].0) |
108 | } |
109 | })) |
110 | } |
111 | |
112 | imp(c) |
113 | } |
114 | |
115 | /// Returns true if and only if the given (inclusive) range contains at least |
116 | /// one Unicode scalar value that has a non-empty non-trivial simple case |
117 | /// mapping. |
118 | /// |
119 | /// This function panics if `end < start`. |
120 | /// |
121 | /// This returns an error if the Unicode case folding tables are not available. |
122 | pub fn contains_simple_case_mapping( |
123 | start: char, |
124 | end: char, |
125 | ) -> FoldResult<bool> { |
126 | #[cfg (not(feature = "unicode-case" ))] |
127 | fn imp(_: char, _: char) -> FoldResult<bool> { |
128 | Err(CaseFoldError(())) |
129 | } |
130 | |
131 | #[cfg (feature = "unicode-case" )] |
132 | fn imp(start: char, end: char) -> FoldResult<bool> { |
133 | use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; |
134 | use std::cmp::Ordering; |
135 | |
136 | assert!(start <= end); |
137 | Ok(CASE_FOLDING_SIMPLE |
138 | .binary_search_by(|&(c, _)| { |
139 | if start <= c && c <= end { |
140 | Ordering::Equal |
141 | } else if c > end { |
142 | Ordering::Greater |
143 | } else { |
144 | Ordering::Less |
145 | } |
146 | }) |
147 | .is_ok()) |
148 | } |
149 | |
150 | imp(start, end) |
151 | } |
152 | |
/// A query for finding a character class defined by Unicode. This supports
/// either use of a property name directly, or lookup by property value. The
/// former generally refers to Binary properties (see UTS#44, Table 8), but
/// as a special exception (see UTS#18, Section 1.2) both general categories
/// (an enumeration) and scripts (a catalog) are supported as if each of their
/// possible values were a binary property.
///
/// In all circumstances, property names and values are normalized and
/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` refers to the shorter of the lifetimes of property name
/// and property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Return a class corresponding to a Unicode binary property, named by
    /// a single letter. The letter is resolved exactly like a `Binary` name.
    OneLetter(char),
    /// Return a class corresponding to a Unicode binary property.
    ///
    /// Note that, by special exception (see UTS#18, Section 1.2), both
    /// general category values and script values are permitted here as if
    /// they were a binary property.
    Binary(&'a str),
    /// Return a class corresponding to all codepoints whose property
    /// (identified by `property_name`) corresponds to the given value
    /// (identified by `property_value`).
    ByValue {
        /// A property name.
        property_name: &'a str,
        /// A property value.
        property_value: &'a str,
    },
}
186 | |
187 | impl<'a> ClassQuery<'a> { |
188 | fn canonicalize(&self) -> Result<CanonicalClassQuery> { |
189 | match *self { |
190 | ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), |
191 | ClassQuery::Binary(name) => self.canonical_binary(name), |
192 | ClassQuery::ByValue { property_name, property_value } => { |
193 | let property_name = symbolic_name_normalize(property_name); |
194 | let property_value = symbolic_name_normalize(property_value); |
195 | |
196 | let canon_name = match canonical_prop(&property_name)? { |
197 | None => return Err(Error::PropertyNotFound), |
198 | Some(canon_name) => canon_name, |
199 | }; |
200 | Ok(match canon_name { |
201 | "General_Category" => { |
202 | let canon = match canonical_gencat(&property_value)? { |
203 | None => return Err(Error::PropertyValueNotFound), |
204 | Some(canon) => canon, |
205 | }; |
206 | CanonicalClassQuery::GeneralCategory(canon) |
207 | } |
208 | "Script" => { |
209 | let canon = match canonical_script(&property_value)? { |
210 | None => return Err(Error::PropertyValueNotFound), |
211 | Some(canon) => canon, |
212 | }; |
213 | CanonicalClassQuery::Script(canon) |
214 | } |
215 | _ => { |
216 | let vals = match property_values(canon_name)? { |
217 | None => return Err(Error::PropertyValueNotFound), |
218 | Some(vals) => vals, |
219 | }; |
220 | let canon_val = |
221 | match canonical_value(vals, &property_value) { |
222 | None => { |
223 | return Err(Error::PropertyValueNotFound) |
224 | } |
225 | Some(canon_val) => canon_val, |
226 | }; |
227 | CanonicalClassQuery::ByValue { |
228 | property_name: canon_name, |
229 | property_value: canon_val, |
230 | } |
231 | } |
232 | }) |
233 | } |
234 | } |
235 | } |
236 | |
237 | fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> { |
238 | let norm = symbolic_name_normalize(name); |
239 | |
240 | // This is a special case where 'cf' refers to the 'Format' general |
241 | // category, but where the 'cf' abbreviation is also an abbreviation |
242 | // for the 'Case_Folding' property. But we want to treat it as |
243 | // a general category. (Currently, we don't even support the |
244 | // 'Case_Folding' property. But if we do in the future, users will be |
245 | // required to spell it out.) |
246 | if norm != "cf" { |
247 | if let Some(canon) = canonical_prop(&norm)? { |
248 | return Ok(CanonicalClassQuery::Binary(canon)); |
249 | } |
250 | } |
251 | if let Some(canon) = canonical_gencat(&norm)? { |
252 | return Ok(CanonicalClassQuery::GeneralCategory(canon)); |
253 | } |
254 | if let Some(canon) = canonical_script(&norm)? { |
255 | return Ok(CanonicalClassQuery::Script(canon)); |
256 | } |
257 | Err(Error::PropertyNotFound) |
258 | } |
259 | } |
260 | |
/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
///
/// Values of this type are produced by `ClassQuery::canonicalize` and
/// consumed by `class`.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}
285 | |
286 | /// Looks up a Unicode class given a query. If one doesn't exist, then |
287 | /// `None` is returned. |
288 | pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode> { |
289 | use self::CanonicalClassQuery::*; |
290 | |
291 | match query.canonicalize()? { |
292 | Binary(name) => bool_property(name), |
293 | GeneralCategory(name) => gencat(name), |
294 | Script(name) => script(name), |
295 | ByValue { property_name: "Age" , property_value } => { |
296 | let mut class = hir::ClassUnicode::empty(); |
297 | for set in ages(property_value)? { |
298 | class.union(&hir_class(set)); |
299 | } |
300 | Ok(class) |
301 | } |
302 | ByValue { property_name: "Script_Extensions" , property_value } => { |
303 | script_extension(property_value) |
304 | } |
305 | ByValue { |
306 | property_name: "Grapheme_Cluster_Break" , |
307 | property_value, |
308 | } => gcb(property_value), |
309 | ByValue { property_name: "Sentence_Break" , property_value } => { |
310 | sb(property_value) |
311 | } |
312 | ByValue { property_name: "Word_Break" , property_value } => { |
313 | wb(property_value) |
314 | } |
315 | _ => { |
316 | // What else should we support? |
317 | Err(Error::PropertyNotFound) |
318 | } |
319 | } |
320 | } |
321 | |
322 | /// Returns a Unicode aware class for \w. |
323 | /// |
324 | /// This returns an error if the data is not available for \w. |
325 | pub fn perl_word() -> Result<hir::ClassUnicode> { |
326 | #[cfg (not(feature = "unicode-perl" ))] |
327 | fn imp() -> Result<hir::ClassUnicode> { |
328 | Err(Error::PerlClassNotFound) |
329 | } |
330 | |
331 | #[cfg (feature = "unicode-perl" )] |
332 | fn imp() -> Result<hir::ClassUnicode> { |
333 | use crate::unicode_tables::perl_word::PERL_WORD; |
334 | Ok(hir_class(PERL_WORD)) |
335 | } |
336 | |
337 | imp() |
338 | } |
339 | |
340 | /// Returns a Unicode aware class for \s. |
341 | /// |
342 | /// This returns an error if the data is not available for \s. |
343 | pub fn perl_space() -> Result<hir::ClassUnicode> { |
344 | #[cfg (not(any(feature = "unicode-perl" , feature = "unicode-bool" )))] |
345 | fn imp() -> Result<hir::ClassUnicode> { |
346 | Err(Error::PerlClassNotFound) |
347 | } |
348 | |
349 | #[cfg (all(feature = "unicode-perl" , not(feature = "unicode-bool" )))] |
350 | fn imp() -> Result<hir::ClassUnicode> { |
351 | use crate::unicode_tables::perl_space::WHITE_SPACE; |
352 | Ok(hir_class(WHITE_SPACE)) |
353 | } |
354 | |
355 | #[cfg (feature = "unicode-bool" )] |
356 | fn imp() -> Result<hir::ClassUnicode> { |
357 | use crate::unicode_tables::property_bool::WHITE_SPACE; |
358 | Ok(hir_class(WHITE_SPACE)) |
359 | } |
360 | |
361 | imp() |
362 | } |
363 | |
364 | /// Returns a Unicode aware class for \d. |
365 | /// |
366 | /// This returns an error if the data is not available for \d. |
367 | pub fn perl_digit() -> Result<hir::ClassUnicode> { |
368 | #[cfg (not(any(feature = "unicode-perl" , feature = "unicode-gencat" )))] |
369 | fn imp() -> Result<hir::ClassUnicode> { |
370 | Err(Error::PerlClassNotFound) |
371 | } |
372 | |
373 | #[cfg (all(feature = "unicode-perl" , not(feature = "unicode-gencat" )))] |
374 | fn imp() -> Result<hir::ClassUnicode> { |
375 | use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; |
376 | Ok(hir_class(DECIMAL_NUMBER)) |
377 | } |
378 | |
379 | #[cfg (feature = "unicode-gencat" )] |
380 | fn imp() -> Result<hir::ClassUnicode> { |
381 | use crate::unicode_tables::general_category::DECIMAL_NUMBER; |
382 | Ok(hir_class(DECIMAL_NUMBER)) |
383 | } |
384 | |
385 | imp() |
386 | } |
387 | |
388 | /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. |
389 | pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { |
390 | let hir_ranges: Vec<hir::ClassUnicodeRange> = rangesimpl Iterator |
391 | .iter() |
392 | .map(|&(s: char, e: char)| hir::ClassUnicodeRange::new(start:s, end:e)) |
393 | .collect(); |
394 | hir::ClassUnicode::new(hir_ranges) |
395 | } |
396 | |
397 | /// Returns true only if the given codepoint is in the `\w` character class. |
398 | /// |
399 | /// If the `unicode-perl` feature is not enabled, then this returns an error. |
400 | pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> { |
401 | #[cfg (not(feature = "unicode-perl" ))] |
402 | fn imp(_: char) -> result::Result<bool, UnicodeWordError> { |
403 | Err(UnicodeWordError(())) |
404 | } |
405 | |
406 | #[cfg (feature = "unicode-perl" )] |
407 | fn imp(c: char) -> result::Result<bool, UnicodeWordError> { |
408 | use crate::is_word_byte; |
409 | use crate::unicode_tables::perl_word::PERL_WORD; |
410 | use std::cmp::Ordering; |
411 | |
412 | if c <= 0x7F as char && is_word_byte(c as u8) { |
413 | return Ok(true); |
414 | } |
415 | Ok(PERL_WORD |
416 | .binary_search_by(|&(start, end)| { |
417 | if start <= c && c <= end { |
418 | Ordering::Equal |
419 | } else if start > c { |
420 | Ordering::Greater |
421 | } else { |
422 | Ordering::Less |
423 | } |
424 | }) |
425 | .is_ok()) |
426 | } |
427 | |
428 | imp(c) |
429 | } |
430 | |
/// A mapping of property values for a specific property.
///
/// The first element of each tuple is a normalized property value while the
/// second element of each tuple is the corresponding canonical property
/// value.
///
/// `canonical_value` binary searches this table by its first element.
type PropertyValues = &'static [(&'static str, &'static str)];
437 | |
438 | fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> { |
439 | Ok(match normalized_value { |
440 | "any" => Some("Any" ), |
441 | "assigned" => Some("Assigned" ), |
442 | "ascii" => Some("ASCII" ), |
443 | _ => { |
444 | let gencats: &[(&str, &str)] = property_values(canonical_property_name:"General_Category" )?.unwrap(); |
445 | canonical_value(vals:gencats, normalized_value) |
446 | } |
447 | }) |
448 | } |
449 | |
450 | fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> { |
451 | let scripts: &[(&str, &str)] = property_values(canonical_property_name:"Script" )?.unwrap(); |
452 | Ok(canonical_value(vals:scripts, normalized_value)) |
453 | } |
454 | |
455 | /// Find the canonical property name for the given normalized property name. |
456 | /// |
457 | /// If no such property exists, then `None` is returned. |
458 | /// |
459 | /// The normalized property name must have been normalized according to |
460 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
461 | /// |
462 | /// If the property names data is not available, then an error is returned. |
463 | fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> { |
464 | #[cfg (not(any( |
465 | feature = "unicode-age" , |
466 | feature = "unicode-bool" , |
467 | feature = "unicode-gencat" , |
468 | feature = "unicode-perl" , |
469 | feature = "unicode-script" , |
470 | feature = "unicode-segment" , |
471 | )))] |
472 | fn imp(_: &str) -> Result<Option<&'static str>> { |
473 | Err(Error::PropertyNotFound) |
474 | } |
475 | |
476 | #[cfg (any( |
477 | feature = "unicode-age" , |
478 | feature = "unicode-bool" , |
479 | feature = "unicode-gencat" , |
480 | feature = "unicode-perl" , |
481 | feature = "unicode-script" , |
482 | feature = "unicode-segment" , |
483 | ))] |
484 | fn imp(name: &str) -> Result<Option<&'static str>> { |
485 | use crate::unicode_tables::property_names::PROPERTY_NAMES; |
486 | |
487 | Ok(PROPERTY_NAMES |
488 | .binary_search_by_key(&name, |&(n, _)| n) |
489 | .ok() |
490 | .map(|i| PROPERTY_NAMES[i].1)) |
491 | } |
492 | |
493 | imp(normalized_name) |
494 | } |
495 | |
496 | /// Find the canonical property value for the given normalized property |
497 | /// value. |
498 | /// |
499 | /// The given property values should correspond to the values for the property |
500 | /// under question, which can be found using `property_values`. |
501 | /// |
502 | /// If no such property value exists, then `None` is returned. |
503 | /// |
504 | /// The normalized property value must have been normalized according to |
505 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
506 | fn canonical_value( |
507 | vals: PropertyValues, |
508 | normalized_value: &str, |
509 | ) -> Option<&'static str> { |
510 | valsOption.binary_search_by_key(&normalized_value, |&(n: &str, _)| n) |
511 | .ok() |
512 | .map(|i: usize| vals[i].1) |
513 | } |
514 | |
515 | /// Return the table of property values for the given property name. |
516 | /// |
517 | /// If the property values data is not available, then an error is returned. |
518 | fn property_values( |
519 | canonical_property_name: &'static str, |
520 | ) -> Result<Option<PropertyValues>> { |
521 | #[cfg (not(any( |
522 | feature = "unicode-age" , |
523 | feature = "unicode-bool" , |
524 | feature = "unicode-gencat" , |
525 | feature = "unicode-perl" , |
526 | feature = "unicode-script" , |
527 | feature = "unicode-segment" , |
528 | )))] |
529 | fn imp(_: &'static str) -> Result<Option<PropertyValues>> { |
530 | Err(Error::PropertyValueNotFound) |
531 | } |
532 | |
533 | #[cfg (any( |
534 | feature = "unicode-age" , |
535 | feature = "unicode-bool" , |
536 | feature = "unicode-gencat" , |
537 | feature = "unicode-perl" , |
538 | feature = "unicode-script" , |
539 | feature = "unicode-segment" , |
540 | ))] |
541 | fn imp(name: &'static str) -> Result<Option<PropertyValues>> { |
542 | use crate::unicode_tables::property_values::PROPERTY_VALUES; |
543 | |
544 | Ok(PROPERTY_VALUES |
545 | .binary_search_by_key(&name, |&(n, _)| n) |
546 | .ok() |
547 | .map(|i| PROPERTY_VALUES[i].1)) |
548 | } |
549 | |
550 | imp(canonical_property_name) |
551 | } |
552 | |
553 | // This is only used in some cases, but small enough to just let it be dead |
554 | // instead of figuring out (and maintaining) the right set of features. |
555 | #[allow (dead_code)] |
556 | fn property_set( |
557 | name_map: &'static [(&'static str, Range)], |
558 | canonical: &'static str, |
559 | ) -> Option<Range> { |
560 | name_mapOption |
561 | .binary_search_by_key(&canonical, |x: &(&str, &[(char, char)])| x.0) |
562 | .ok() |
563 | .map(|i: usize| name_map[i].1) |
564 | } |
565 | |
566 | /// Returns an iterator over Unicode Age sets. Each item corresponds to a set |
567 | /// of codepoints that were added in a particular revision of Unicode. The |
568 | /// iterator yields items in chronological order. |
569 | /// |
570 | /// If the given age value isn't valid or if the data isn't available, then an |
571 | /// error is returned instead. |
572 | fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { |
573 | #[cfg (not(feature = "unicode-age" ))] |
574 | fn imp(_: &str) -> Result<impl Iterator<Item = Range>> { |
575 | use std::option::IntoIter; |
576 | Err::<IntoIter<Range>, _>(Error::PropertyNotFound) |
577 | } |
578 | |
579 | #[cfg (feature = "unicode-age" )] |
580 | fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { |
581 | use crate::unicode_tables::age; |
582 | |
583 | const AGES: &[(&str, Range)] = &[ |
584 | ("V1_1" , age::V1_1), |
585 | ("V2_0" , age::V2_0), |
586 | ("V2_1" , age::V2_1), |
587 | ("V3_0" , age::V3_0), |
588 | ("V3_1" , age::V3_1), |
589 | ("V3_2" , age::V3_2), |
590 | ("V4_0" , age::V4_0), |
591 | ("V4_1" , age::V4_1), |
592 | ("V5_0" , age::V5_0), |
593 | ("V5_1" , age::V5_1), |
594 | ("V5_2" , age::V5_2), |
595 | ("V6_0" , age::V6_0), |
596 | ("V6_1" , age::V6_1), |
597 | ("V6_2" , age::V6_2), |
598 | ("V6_3" , age::V6_3), |
599 | ("V7_0" , age::V7_0), |
600 | ("V8_0" , age::V8_0), |
601 | ("V9_0" , age::V9_0), |
602 | ("V10_0" , age::V10_0), |
603 | ("V11_0" , age::V11_0), |
604 | ("V12_0" , age::V12_0), |
605 | ("V12_1" , age::V12_1), |
606 | ("V13_0" , age::V13_0), |
607 | ("V14_0" , age::V14_0), |
608 | ("V15_0" , age::V15_0), |
609 | ]; |
610 | assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync" ); |
611 | |
612 | let pos = AGES.iter().position(|&(age, _)| canonical_age == age); |
613 | match pos { |
614 | None => Err(Error::PropertyValueNotFound), |
615 | Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)), |
616 | } |
617 | } |
618 | |
619 | imp(canonical_age) |
620 | } |
621 | |
622 | /// Returns the Unicode HIR class corresponding to the given general category. |
623 | /// |
624 | /// Name canonicalization is assumed to be performed by the caller. |
625 | /// |
626 | /// If the given general category could not be found, or if the general |
627 | /// category data is not available, then an error is returned. |
628 | fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
629 | #[cfg (not(feature = "unicode-gencat" ))] |
630 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
631 | Err(Error::PropertyNotFound) |
632 | } |
633 | |
634 | #[cfg (feature = "unicode-gencat" )] |
635 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
636 | use crate::unicode_tables::general_category::BY_NAME; |
637 | match name { |
638 | "ASCII" => Ok(hir_class(&[(' \0' , ' \x7F' )])), |
639 | "Any" => Ok(hir_class(&[(' \0' , ' \u{10FFFF}' )])), |
640 | "Assigned" => { |
641 | let mut cls = gencat("Unassigned" )?; |
642 | cls.negate(); |
643 | Ok(cls) |
644 | } |
645 | name => property_set(BY_NAME, name) |
646 | .map(hir_class) |
647 | .ok_or(Error::PropertyValueNotFound), |
648 | } |
649 | } |
650 | |
651 | match canonical_name { |
652 | "Decimal_Number" => perl_digit(), |
653 | name => imp(name), |
654 | } |
655 | } |
656 | |
657 | /// Returns the Unicode HIR class corresponding to the given script. |
658 | /// |
659 | /// Name canonicalization is assumed to be performed by the caller. |
660 | /// |
661 | /// If the given script could not be found, or if the script data is not |
662 | /// available, then an error is returned. |
663 | fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
664 | #[cfg (not(feature = "unicode-script" ))] |
665 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
666 | Err(Error::PropertyNotFound) |
667 | } |
668 | |
669 | #[cfg (feature = "unicode-script" )] |
670 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
671 | use crate::unicode_tables::script::BY_NAME; |
672 | property_set(BY_NAME, name) |
673 | .map(hir_class) |
674 | .ok_or(err:Error::PropertyValueNotFound) |
675 | } |
676 | |
677 | imp(canonical_name) |
678 | } |
679 | |
680 | /// Returns the Unicode HIR class corresponding to the given script extension. |
681 | /// |
682 | /// Name canonicalization is assumed to be performed by the caller. |
683 | /// |
684 | /// If the given script extension could not be found, or if the script data is |
685 | /// not available, then an error is returned. |
686 | fn script_extension( |
687 | canonical_name: &'static str, |
688 | ) -> Result<hir::ClassUnicode> { |
689 | #[cfg (not(feature = "unicode-script" ))] |
690 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
691 | Err(Error::PropertyNotFound) |
692 | } |
693 | |
694 | #[cfg (feature = "unicode-script" )] |
695 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
696 | use crate::unicode_tables::script_extension::BY_NAME; |
697 | property_set(BY_NAME, name) |
698 | .map(hir_class) |
699 | .ok_or(err:Error::PropertyValueNotFound) |
700 | } |
701 | |
702 | imp(canonical_name) |
703 | } |
704 | |
705 | /// Returns the Unicode HIR class corresponding to the given Unicode boolean |
706 | /// property. |
707 | /// |
708 | /// Name canonicalization is assumed to be performed by the caller. |
709 | /// |
710 | /// If the given boolean property could not be found, or if the boolean |
711 | /// property data is not available, then an error is returned. |
712 | fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
713 | #[cfg (not(feature = "unicode-bool" ))] |
714 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
715 | Err(Error::PropertyNotFound) |
716 | } |
717 | |
718 | #[cfg (feature = "unicode-bool" )] |
719 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
720 | use crate::unicode_tables::property_bool::BY_NAME; |
721 | property_set(BY_NAME, name) |
722 | .map(hir_class) |
723 | .ok_or(err:Error::PropertyNotFound) |
724 | } |
725 | |
726 | match canonical_name { |
727 | "Decimal_Number" => perl_digit(), |
728 | "White_Space" => perl_space(), |
729 | name: &str => imp(name), |
730 | } |
731 | } |
732 | |
733 | /// Returns the Unicode HIR class corresponding to the given grapheme cluster |
734 | /// break property. |
735 | /// |
736 | /// Name canonicalization is assumed to be performed by the caller. |
737 | /// |
738 | /// If the given property could not be found, or if the corresponding data is |
739 | /// not available, then an error is returned. |
740 | fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
741 | #[cfg (not(feature = "unicode-segment" ))] |
742 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
743 | Err(Error::PropertyNotFound) |
744 | } |
745 | |
746 | #[cfg (feature = "unicode-segment" )] |
747 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
748 | use crate::unicode_tables::grapheme_cluster_break::BY_NAME; |
749 | property_set(BY_NAME, name) |
750 | .map(hir_class) |
751 | .ok_or(err:Error::PropertyValueNotFound) |
752 | } |
753 | |
754 | imp(canonical_name) |
755 | } |
756 | |
757 | /// Returns the Unicode HIR class corresponding to the given word break |
758 | /// property. |
759 | /// |
760 | /// Name canonicalization is assumed to be performed by the caller. |
761 | /// |
762 | /// If the given property could not be found, or if the corresponding data is |
763 | /// not available, then an error is returned. |
764 | fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
765 | #[cfg (not(feature = "unicode-segment" ))] |
766 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
767 | Err(Error::PropertyNotFound) |
768 | } |
769 | |
770 | #[cfg (feature = "unicode-segment" )] |
771 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
772 | use crate::unicode_tables::word_break::BY_NAME; |
773 | property_set(BY_NAME, name) |
774 | .map(hir_class) |
775 | .ok_or(err:Error::PropertyValueNotFound) |
776 | } |
777 | |
778 | imp(canonical_name) |
779 | } |
780 | |
781 | /// Returns the Unicode HIR class corresponding to the given sentence |
782 | /// break property. |
783 | /// |
784 | /// Name canonicalization is assumed to be performed by the caller. |
785 | /// |
786 | /// If the given property could not be found, or if the corresponding data is |
787 | /// not available, then an error is returned. |
788 | fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
789 | #[cfg (not(feature = "unicode-segment" ))] |
790 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
791 | Err(Error::PropertyNotFound) |
792 | } |
793 | |
794 | #[cfg (feature = "unicode-segment" )] |
795 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
796 | use crate::unicode_tables::sentence_break::BY_NAME; |
797 | property_set(BY_NAME, name) |
798 | .map(hir_class) |
799 | .ok_or(err:Error::PropertyValueNotFound) |
800 | } |
801 | |
802 | imp(canonical_name) |
803 | } |
804 | |
805 | /// Like symbolic_name_normalize_bytes, but operates on a string. |
806 | fn symbolic_name_normalize(x: &str) -> String { |
807 | let mut tmp: Vec = x.as_bytes().to_vec(); |
808 | let len: usize = symbolic_name_normalize_bytes(&mut tmp).len(); |
809 | tmp.truncate(len); |
810 | // This should always succeed because `symbolic_name_normalize_bytes` |
811 | // guarantees that `&tmp[..len]` is always valid UTF-8. |
812 | // |
813 | // N.B. We could avoid the additional UTF-8 check here, but it's unlikely |
814 | // to be worth skipping the additional safety check. A benchmark must |
815 | // justify it first. |
816 | String::from_utf8(vec:tmp).unwrap() |
817 | } |
818 | |
/// Normalize a symbolic name in place, following UAX44-LM3.
///
/// "Symbolic names" are typically property names and property value
/// aliases. This must not be applied to property string values.
///
/// Normalization drops a leading "is" (in any casing), drops spaces,
/// underscores and hyphens, lowercases ASCII letters and discards every
/// non-ASCII byte. The normalized name is written to the front of `slice`
/// and that prefix is returned.
///
/// The returned slice is valid UTF-8 for every possible input.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // I couldn't find a place in the standard that specified that property
    // names/aliases had a particular structure (unlike character names), but
    // we assume that it's ASCII only and drop anything that isn't ASCII.

    // Ignore any "is" prefix.
    let has_is_prefix =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let first = if has_is_prefix { 2 } else { 0 };

    // VALIDITY ARGUMENT: only ASCII bytes are ever kept, so the prefix that
    // is returned is always valid UTF-8.
    let mut kept = 0;
    for read in first..slice.len() {
        let byte = slice[read];
        match byte {
            b' ' | b'_' | b'-' => {}
            b'A'..=b'Z' => {
                slice[kept] = byte.to_ascii_lowercase();
                kept += 1;
            }
            _ if byte.is_ascii() => {
                slice[kept] = byte;
                kept += 1;
            }
            _ => {}
        }
    }

    // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally
    // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross
    // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it
    // is actually an alias for the 'Other' general category. Restore the
    // full 'isc' so the two stay distinct.
    if has_is_prefix && kept == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        kept = 3;
    }
    &mut slice[..kept]
}
873 | |
/// Unit tests for simple case folding, case-mapping range queries, class
/// query canonicalization and symbolic name normalization.
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// Unwraps `simple_fold(c)` down to the iterator it yields on success.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Unwraps `simple_fold(c)` down to the value carried by its inner `Err`,
    /// failing the test if an iterator is returned instead.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    /// Unwraps the result of `contains_simple_case_mapping(start, end)`.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // U+212A KELVIN SIGN is spelled as an escape because it is visually
        // indistinguishable from ASCII 'K' and does not survive text
        // normalization.
        const KELVIN: char = '\u{212A}';

        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', KELVIN]);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', KELVIN]);

        let xs: Vec<char> = simple_fold_ok(KELVIN).collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        // "isc" must survive the "is" prefix stripping intact.
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The input is deliberately not valid UTF-8; the 0xFF byte (and the
        // space) must be dropped so that the output is.
        let mut x = b"abc \xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}
1002 | |