| 1 | use std::error; | 
| 2 | use std::fmt; | 
|---|
| 3 | use std::result; | 
|---|
| 4 |  | 
|---|
| 5 | use crate::hir; | 
|---|
| 6 |  | 
|---|
| 7 | /// A type alias for errors specific to Unicode handling of classes. | 
|---|
| 8 | pub type Result<T> = result::Result<T, Error>; | 
|---|
| 9 |  | 
|---|
/// A table of inclusive `(start, end)` codepoint ranges. The tables come from
/// a generated file, which is why the slice has a `'static` lifetime.
type Range = &'static [(char, char)];
|---|
| 13 |  | 
|---|
/// The ways in which a Unicode class lookup can fail.
///
/// The `Error` trait is deliberately not implemented here: this type isn't
/// exported, and its values always get converted into other public errors.
#[derive(Debug)]
pub enum Error {
    /// The requested property is unknown, or its data is not available.
    PropertyNotFound,
    /// The requested property value is unknown, or its data is not available.
    PropertyValueNotFound,
    // Never constructed when unicode-perl is enabled, hence the allow.
    #[allow(dead_code)]
    PerlClassNotFound,
}
|---|
| 26 |  | 
|---|
| 27 | /// A type alias for errors specific to Unicode case folding. | 
|---|
| 28 | pub type FoldResult<T> = result::Result<T, CaseFoldError>; | 
|---|
| 29 |  | 
|---|
/// The error reported when Unicode-aware simple case folding cannot be done.
///
/// Case folding needs the Unicode case mapping tables. Those tables are
/// unavailable only when the `unicode-case` feature is disabled (it is
/// enabled by default), and that is the situation this error describes.
#[derive(Debug)]
pub struct CaseFoldError(());

impl error::Error for CaseFoldError {}

impl fmt::Display for CaseFoldError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
|---|
| 49 |  | 
|---|
/// An error that occurs when the Unicode-aware `\w` class is unavailable.
///
/// This error can occur when the data tables necessary for the Unicode aware
/// Perl character class `\w` are unavailable. This only occurs when the
/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl error::Error for UnicodeWordError {}

impl fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The class is spelled `\w`; the escaped backslash must be directly
        // followed by the `w` so the message does not read `\ w`.
        write!(
            f,
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)"
        )
    }
}
|---|
| 69 |  | 
|---|
| 70 | /// Return an iterator over the equivalence class of simple case mappings | 
|---|
| 71 | /// for the given codepoint. The equivalence class does not include the | 
|---|
| 72 | /// given codepoint. | 
|---|
| 73 | /// | 
|---|
| 74 | /// If the equivalence class is empty, then this returns the next scalar | 
|---|
| 75 | /// value that has a non-empty equivalence class, if it exists. If no such | 
|---|
| 76 | /// scalar value exists, then `None` is returned. The point of this behavior | 
|---|
| 77 | /// is to permit callers to avoid calling `simple_fold` more than they need | 
|---|
| 78 | /// to, since there is some cost to fetching the equivalence class. | 
|---|
| 79 | /// | 
|---|
| 80 | /// This returns an error if the Unicode case folding tables are not available. | 
|---|
| 81 | pub fn simple_fold( | 
|---|
| 82 | c: char, | 
|---|
| 83 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> { | 
|---|
| 84 | #[ cfg(not(feature = "unicode-case"))] | 
|---|
| 85 | fn imp( | 
|---|
| 86 | _: char, | 
|---|
| 87 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> | 
|---|
| 88 | { | 
|---|
| 89 | use std::option::IntoIter; | 
|---|
| 90 | Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(())) | 
|---|
| 91 | } | 
|---|
| 92 |  | 
|---|
| 93 | #[ cfg(feature = "unicode-case")] | 
|---|
| 94 | fn imp( | 
|---|
| 95 | c: char, | 
|---|
| 96 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> | 
|---|
| 97 | { | 
|---|
| 98 | use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; | 
|---|
| 99 |  | 
|---|
| 100 | Ok(CASE_FOLDING_SIMPLE | 
|---|
| 101 | .binary_search_by_key(&c, |&(c1, _)| c1) | 
|---|
| 102 | .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied()) | 
|---|
| 103 | .map_err(|i| { | 
|---|
| 104 | if i >= CASE_FOLDING_SIMPLE.len() { | 
|---|
| 105 | None | 
|---|
| 106 | } else { | 
|---|
| 107 | Some(CASE_FOLDING_SIMPLE[i].0) | 
|---|
| 108 | } | 
|---|
| 109 | })) | 
|---|
| 110 | } | 
|---|
| 111 |  | 
|---|
| 112 | imp(c) | 
|---|
| 113 | } | 
|---|
| 114 |  | 
|---|
| 115 | /// Returns true if and only if the given (inclusive) range contains at least | 
|---|
| 116 | /// one Unicode scalar value that has a non-empty non-trivial simple case | 
|---|
| 117 | /// mapping. | 
|---|
| 118 | /// | 
|---|
| 119 | /// This function panics if `end < start`. | 
|---|
| 120 | /// | 
|---|
| 121 | /// This returns an error if the Unicode case folding tables are not available. | 
|---|
| 122 | pub fn contains_simple_case_mapping( | 
|---|
| 123 | start: char, | 
|---|
| 124 | end: char, | 
|---|
| 125 | ) -> FoldResult<bool> { | 
|---|
| 126 | #[ cfg(not(feature = "unicode-case"))] | 
|---|
| 127 | fn imp(_: char, _: char) -> FoldResult<bool> { | 
|---|
| 128 | Err(CaseFoldError(())) | 
|---|
| 129 | } | 
|---|
| 130 |  | 
|---|
| 131 | #[ cfg(feature = "unicode-case")] | 
|---|
| 132 | fn imp(start: char, end: char) -> FoldResult<bool> { | 
|---|
| 133 | use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; | 
|---|
| 134 | use std::cmp::Ordering; | 
|---|
| 135 |  | 
|---|
| 136 | assert!(start <= end); | 
|---|
| 137 | Ok(CASE_FOLDING_SIMPLE | 
|---|
| 138 | .binary_search_by(|&(c, _)| { | 
|---|
| 139 | if start <= c && c <= end { | 
|---|
| 140 | Ordering::Equal | 
|---|
| 141 | } else if c > end { | 
|---|
| 142 | Ordering::Greater | 
|---|
| 143 | } else { | 
|---|
| 144 | Ordering::Less | 
|---|
| 145 | } | 
|---|
| 146 | }) | 
|---|
| 147 | .is_ok()) | 
|---|
| 148 | } | 
|---|
| 149 |  | 
|---|
| 150 | imp(start, end) | 
|---|
| 151 | } | 
|---|
| 152 |  | 
|---|
/// A request for a character class defined by Unicode.
///
/// A class can be requested either by naming a property directly or by
/// naming a property together with one of its values. Naming a property
/// directly is generally meant for Binary properties (see UTS#44, Table 8).
/// As a special exception (see UTS#18, Section 1.2), general categories (an
/// enumeration) and scripts (a catalog) are also accepted in that form, as
/// if each of their possible values were a binary property.
///
/// Property names and values are always normalized and canonicalized, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name
/// and the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Request the class of a Unicode binary property that is named by a
    /// single letter.
    OneLetter(char),
    /// Request the class of a Unicode binary property.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values may also be given here, as if they were
    /// binary properties.
    Binary(&'a str),
    /// Request the class of all codepoints whose property (named by
    /// `property_name`) has the given value (named by `property_value`).
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value of the property.
        property_value: &'a str,
    },
}
|---|
| 186 |  | 
|---|
| 187 | impl<'a> ClassQuery<'a> { | 
|---|
| 188 | fn canonicalize(&self) -> Result<CanonicalClassQuery> { | 
|---|
| 189 | match *self { | 
|---|
| 190 | ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), | 
|---|
| 191 | ClassQuery::Binary(name) => self.canonical_binary(name), | 
|---|
| 192 | ClassQuery::ByValue { property_name, property_value } => { | 
|---|
| 193 | let property_name = symbolic_name_normalize(property_name); | 
|---|
| 194 | let property_value = symbolic_name_normalize(property_value); | 
|---|
| 195 |  | 
|---|
| 196 | let canon_name = match canonical_prop(&property_name)? { | 
|---|
| 197 | None => return Err(Error::PropertyNotFound), | 
|---|
| 198 | Some(canon_name) => canon_name, | 
|---|
| 199 | }; | 
|---|
| 200 | Ok(match canon_name { | 
|---|
| 201 | "General_Category"=> { | 
|---|
| 202 | let canon = match canonical_gencat(&property_value)? { | 
|---|
| 203 | None => return Err(Error::PropertyValueNotFound), | 
|---|
| 204 | Some(canon) => canon, | 
|---|
| 205 | }; | 
|---|
| 206 | CanonicalClassQuery::GeneralCategory(canon) | 
|---|
| 207 | } | 
|---|
| 208 | "Script"=> { | 
|---|
| 209 | let canon = match canonical_script(&property_value)? { | 
|---|
| 210 | None => return Err(Error::PropertyValueNotFound), | 
|---|
| 211 | Some(canon) => canon, | 
|---|
| 212 | }; | 
|---|
| 213 | CanonicalClassQuery::Script(canon) | 
|---|
| 214 | } | 
|---|
| 215 | _ => { | 
|---|
| 216 | let vals = match property_values(canon_name)? { | 
|---|
| 217 | None => return Err(Error::PropertyValueNotFound), | 
|---|
| 218 | Some(vals) => vals, | 
|---|
| 219 | }; | 
|---|
| 220 | let canon_val = | 
|---|
| 221 | match canonical_value(vals, &property_value) { | 
|---|
| 222 | None => { | 
|---|
| 223 | return Err(Error::PropertyValueNotFound) | 
|---|
| 224 | } | 
|---|
| 225 | Some(canon_val) => canon_val, | 
|---|
| 226 | }; | 
|---|
| 227 | CanonicalClassQuery::ByValue { | 
|---|
| 228 | property_name: canon_name, | 
|---|
| 229 | property_value: canon_val, | 
|---|
| 230 | } | 
|---|
| 231 | } | 
|---|
| 232 | }) | 
|---|
| 233 | } | 
|---|
| 234 | } | 
|---|
| 235 | } | 
|---|
| 236 |  | 
|---|
| 237 | fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> { | 
|---|
| 238 | let norm = symbolic_name_normalize(name); | 
|---|
| 239 |  | 
|---|
| 240 | // This is a special case where 'cf' refers to the 'Format' general | 
|---|
| 241 | // category, but where the 'cf' abbreviation is also an abbreviation | 
|---|
| 242 | // for the 'Case_Folding' property. But we want to treat it as | 
|---|
| 243 | // a general category. (Currently, we don't even support the | 
|---|
| 244 | // 'Case_Folding' property. But if we do in the future, users will be | 
|---|
| 245 | // required to spell it out.) | 
|---|
| 246 | if norm != "cf"{ | 
|---|
| 247 | if let Some(canon) = canonical_prop(&norm)? { | 
|---|
| 248 | return Ok(CanonicalClassQuery::Binary(canon)); | 
|---|
| 249 | } | 
|---|
| 250 | } | 
|---|
| 251 | if let Some(canon) = canonical_gencat(&norm)? { | 
|---|
| 252 | return Ok(CanonicalClassQuery::GeneralCategory(canon)); | 
|---|
| 253 | } | 
|---|
| 254 | if let Some(canon) = canonical_script(&norm)? { | 
|---|
| 255 | return Ok(CanonicalClassQuery::Script(canon)); | 
|---|
| 256 | } | 
|---|
| 257 | Err(Error::PropertyNotFound) | 
|---|
| 258 | } | 
|---|
| 259 | } | 
|---|
| 260 |  | 
|---|
/// The canonicalized counterpart of `ClassQuery`.
///
/// All names and values stored here have been canonicalized. Unlike
/// `ClassQuery`, this type keeps binary properties apart from the flattened
/// general categories and scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, identified by its canonical name.
    Binary(&'static str),
    /// A general category, identified by its canonical name.
    GeneralCategory(&'static str),
    /// A script, identified by its canonical name.
    Script(&'static str),
    /// Any other pairing of a property with one of its values, both already
    /// canonicalized.
    ///
    /// By construction, the property name stored here is never
    /// General_Category or Script; those two are covered by the variants of
    /// the same name.
    ByValue {
        /// The canonical name of the property.
        property_name: &'static str,
        /// The canonical value of the property.
        property_value: &'static str,
    },
}
|---|
| 285 |  | 
|---|
| 286 | /// Looks up a Unicode class given a query. If one doesn't exist, then | 
|---|
| 287 | /// `None` is returned. | 
|---|
| 288 | pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode> { | 
|---|
| 289 | use self::CanonicalClassQuery::*; | 
|---|
| 290 |  | 
|---|
| 291 | match query.canonicalize()? { | 
|---|
| 292 | Binary(name) => bool_property(name), | 
|---|
| 293 | GeneralCategory(name) => gencat(name), | 
|---|
| 294 | Script(name) => script(name), | 
|---|
| 295 | ByValue { property_name: "Age", property_value } => { | 
|---|
| 296 | let mut class = hir::ClassUnicode::empty(); | 
|---|
| 297 | for set in ages(property_value)? { | 
|---|
| 298 | class.union(&hir_class(set)); | 
|---|
| 299 | } | 
|---|
| 300 | Ok(class) | 
|---|
| 301 | } | 
|---|
| 302 | ByValue { property_name: "Script_Extensions", property_value } => { | 
|---|
| 303 | script_extension(property_value) | 
|---|
| 304 | } | 
|---|
| 305 | ByValue { | 
|---|
| 306 | property_name: "Grapheme_Cluster_Break", | 
|---|
| 307 | property_value, | 
|---|
| 308 | } => gcb(property_value), | 
|---|
| 309 | ByValue { property_name: "Sentence_Break", property_value } => { | 
|---|
| 310 | sb(property_value) | 
|---|
| 311 | } | 
|---|
| 312 | ByValue { property_name: "Word_Break", property_value } => { | 
|---|
| 313 | wb(property_value) | 
|---|
| 314 | } | 
|---|
| 315 | _ => { | 
|---|
| 316 | // What else should we support? | 
|---|
| 317 | Err(Error::PropertyNotFound) | 
|---|
| 318 | } | 
|---|
| 319 | } | 
|---|
| 320 | } | 
|---|
| 321 |  | 
|---|
| 322 | /// Returns a Unicode aware class for \w. | 
|---|
| 323 | /// | 
|---|
| 324 | /// This returns an error if the data is not available for \w. | 
|---|
| 325 | pub fn perl_word() -> Result<hir::ClassUnicode> { | 
|---|
| 326 | #[ cfg(not(feature = "unicode-perl"))] | 
|---|
| 327 | fn imp() -> Result<hir::ClassUnicode> { | 
|---|
| 328 | Err(Error::PerlClassNotFound) | 
|---|
| 329 | } | 
|---|
| 330 |  | 
|---|
| 331 | #[ cfg(feature = "unicode-perl")] | 
|---|
| 332 | fn imp() -> Result<hir::ClassUnicode> { | 
|---|
| 333 | use crate::unicode_tables::perl_word::PERL_WORD; | 
|---|
| 334 | Ok(hir_class(PERL_WORD)) | 
|---|
| 335 | } | 
|---|
| 336 |  | 
|---|
| 337 | imp() | 
|---|
| 338 | } | 
|---|
| 339 |  | 
|---|
| 340 | /// Returns a Unicode aware class for \s. | 
|---|
| 341 | /// | 
|---|
| 342 | /// This returns an error if the data is not available for \s. | 
|---|
| 343 | pub fn perl_space() -> Result<hir::ClassUnicode> { | 
|---|
| 344 | #[ cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] | 
|---|
| 345 | fn imp() -> Result<hir::ClassUnicode> { | 
|---|
| 346 | Err(Error::PerlClassNotFound) | 
|---|
| 347 | } | 
|---|
| 348 |  | 
|---|
| 349 | #[ cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] | 
|---|
| 350 | fn imp() -> Result<hir::ClassUnicode> { | 
|---|
| 351 | use crate::unicode_tables::perl_space::WHITE_SPACE; | 
|---|
| 352 | Ok(hir_class(WHITE_SPACE)) | 
|---|
| 353 | } | 
|---|
| 354 |  | 
|---|
| 355 | #[ cfg(feature = "unicode-bool")] | 
|---|
| 356 | fn imp() -> Result<hir::ClassUnicode> { | 
|---|
| 357 | use crate::unicode_tables::property_bool::WHITE_SPACE; | 
|---|
| 358 | Ok(hir_class(WHITE_SPACE)) | 
|---|
| 359 | } | 
|---|
| 360 |  | 
|---|
| 361 | imp() | 
|---|
| 362 | } | 
|---|
| 363 |  | 
|---|
| 364 | /// Returns a Unicode aware class for \d. | 
|---|
| 365 | /// | 
|---|
| 366 | /// This returns an error if the data is not available for \d. | 
|---|
| 367 | pub fn perl_digit() -> Result<hir::ClassUnicode> { | 
|---|
| 368 | #[ cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] | 
|---|
| 369 | fn imp() -> Result<hir::ClassUnicode> { | 
|---|
| 370 | Err(Error::PerlClassNotFound) | 
|---|
| 371 | } | 
|---|
| 372 |  | 
|---|
| 373 | #[ cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] | 
|---|
| 374 | fn imp() -> Result<hir::ClassUnicode> { | 
|---|
| 375 | use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; | 
|---|
| 376 | Ok(hir_class(DECIMAL_NUMBER)) | 
|---|
| 377 | } | 
|---|
| 378 |  | 
|---|
| 379 | #[ cfg(feature = "unicode-gencat")] | 
|---|
| 380 | fn imp() -> Result<hir::ClassUnicode> { | 
|---|
| 381 | use crate::unicode_tables::general_category::DECIMAL_NUMBER; | 
|---|
| 382 | Ok(hir_class(DECIMAL_NUMBER)) | 
|---|
| 383 | } | 
|---|
| 384 |  | 
|---|
| 385 | imp() | 
|---|
| 386 | } | 
|---|
| 387 |  | 
|---|
| 388 | /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. | 
|---|
| 389 | pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { | 
|---|
| 390 | let hir_ranges: Vec<hir::ClassUnicodeRange> = rangesimpl Iterator  | 
|---|
| 391 | .iter() | 
|---|
| 392 | .map(|&(s: char, e: char)| hir::ClassUnicodeRange::new(start:s, end:e)) | 
|---|
| 393 | .collect(); | 
|---|
| 394 | hir::ClassUnicode::new(hir_ranges) | 
|---|
| 395 | } | 
|---|
| 396 |  | 
|---|
| 397 | /// Returns true only if the given codepoint is in the `\w` character class. | 
|---|
| 398 | /// | 
|---|
| 399 | /// If the `unicode-perl` feature is not enabled, then this returns an error. | 
|---|
| 400 | pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> { | 
|---|
| 401 | #[ cfg(not(feature = "unicode-perl"))] | 
|---|
| 402 | fn imp(_: char) -> result::Result<bool, UnicodeWordError> { | 
|---|
| 403 | Err(UnicodeWordError(())) | 
|---|
| 404 | } | 
|---|
| 405 |  | 
|---|
| 406 | #[ cfg(feature = "unicode-perl")] | 
|---|
| 407 | fn imp(c: char) -> result::Result<bool, UnicodeWordError> { | 
|---|
| 408 | use crate::is_word_byte; | 
|---|
| 409 | use crate::unicode_tables::perl_word::PERL_WORD; | 
|---|
| 410 | use std::cmp::Ordering; | 
|---|
| 411 |  | 
|---|
| 412 | if c <= 0x7F as char && is_word_byte(c as u8) { | 
|---|
| 413 | return Ok(true); | 
|---|
| 414 | } | 
|---|
| 415 | Ok(PERL_WORD | 
|---|
| 416 | .binary_search_by(|&(start, end)| { | 
|---|
| 417 | if start <= c && c <= end { | 
|---|
| 418 | Ordering::Equal | 
|---|
| 419 | } else if start > c { | 
|---|
| 420 | Ordering::Greater | 
|---|
| 421 | } else { | 
|---|
| 422 | Ordering::Less | 
|---|
| 423 | } | 
|---|
| 424 | }) | 
|---|
| 425 | .is_ok()) | 
|---|
| 426 | } | 
|---|
| 427 |  | 
|---|
| 428 | imp(c) | 
|---|
| 429 | } | 
|---|
| 430 |  | 
|---|
/// The table of values belonging to one specific property.
///
/// Each entry is a pair: its first element is a normalized property value
/// and its second element is the canonical property value it corresponds to.
type PropertyValues = &'static [(&'static str, &'static str)];
|---|
| 437 |  | 
|---|
| 438 | fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> { | 
|---|
| 439 | Ok(match normalized_value { | 
|---|
| 440 | "any"=> Some( "Any"), | 
|---|
| 441 | "assigned"=> Some( "Assigned"), | 
|---|
| 442 | "ascii"=> Some( "ASCII"), | 
|---|
| 443 | _ => { | 
|---|
| 444 | let gencats: &'static [(&str, &str)] = property_values(canonical_property_name: "General_Category")?.unwrap(); | 
|---|
| 445 | canonical_value(vals:gencats, normalized_value) | 
|---|
| 446 | } | 
|---|
| 447 | }) | 
|---|
| 448 | } | 
|---|
| 449 |  | 
|---|
| 450 | fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> { | 
|---|
| 451 | let scripts: &'static [(&str, &str)] = property_values(canonical_property_name: "Script")?.unwrap(); | 
|---|
| 452 | Ok(canonical_value(vals:scripts, normalized_value)) | 
|---|
| 453 | } | 
|---|
| 454 |  | 
|---|
| 455 | /// Find the canonical property name for the given normalized property name. | 
|---|
| 456 | /// | 
|---|
| 457 | /// If no such property exists, then `None` is returned. | 
|---|
| 458 | /// | 
|---|
| 459 | /// The normalized property name must have been normalized according to | 
|---|
| 460 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. | 
|---|
| 461 | /// | 
|---|
| 462 | /// If the property names data is not available, then an error is returned. | 
|---|
| 463 | fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> { | 
|---|
| 464 | #[ cfg(not(any( | 
|---|
| 465 | feature = "unicode-age", | 
|---|
| 466 | feature = "unicode-bool", | 
|---|
| 467 | feature = "unicode-gencat", | 
|---|
| 468 | feature = "unicode-perl", | 
|---|
| 469 | feature = "unicode-script", | 
|---|
| 470 | feature = "unicode-segment", | 
|---|
| 471 | )))] | 
|---|
| 472 | fn imp(_: &str) -> Result<Option<&'static str>> { | 
|---|
| 473 | Err(Error::PropertyNotFound) | 
|---|
| 474 | } | 
|---|
| 475 |  | 
|---|
| 476 | #[ cfg(any( | 
|---|
| 477 | feature = "unicode-age", | 
|---|
| 478 | feature = "unicode-bool", | 
|---|
| 479 | feature = "unicode-gencat", | 
|---|
| 480 | feature = "unicode-perl", | 
|---|
| 481 | feature = "unicode-script", | 
|---|
| 482 | feature = "unicode-segment", | 
|---|
| 483 | ))] | 
|---|
| 484 | fn imp(name: &str) -> Result<Option<&'static str>> { | 
|---|
| 485 | use crate::unicode_tables::property_names::PROPERTY_NAMES; | 
|---|
| 486 |  | 
|---|
| 487 | Ok(PROPERTY_NAMES | 
|---|
| 488 | .binary_search_by_key(&name, |&(n, _)| n) | 
|---|
| 489 | .ok() | 
|---|
| 490 | .map(|i| PROPERTY_NAMES[i].1)) | 
|---|
| 491 | } | 
|---|
| 492 |  | 
|---|
| 493 | imp(normalized_name) | 
|---|
| 494 | } | 
|---|
| 495 |  | 
|---|
| 496 | /// Find the canonical property value for the given normalized property | 
|---|
| 497 | /// value. | 
|---|
| 498 | /// | 
|---|
| 499 | /// The given property values should correspond to the values for the property | 
|---|
| 500 | /// under question, which can be found using `property_values`. | 
|---|
| 501 | /// | 
|---|
| 502 | /// If no such property value exists, then `None` is returned. | 
|---|
| 503 | /// | 
|---|
| 504 | /// The normalized property value must have been normalized according to | 
|---|
| 505 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. | 
|---|
| 506 | fn canonical_value( | 
|---|
| 507 | vals: PropertyValues, | 
|---|
| 508 | normalized_value: &str, | 
|---|
| 509 | ) -> Option<&'static str> { | 
|---|
| 510 | valsOption.binary_search_by_key(&normalized_value, |&(n: &str, _)| n) | 
|---|
| 511 | .ok() | 
|---|
| 512 | .map(|i: usize| vals[i].1) | 
|---|
| 513 | } | 
|---|
| 514 |  | 
|---|
| 515 | /// Return the table of property values for the given property name. | 
|---|
| 516 | /// | 
|---|
| 517 | /// If the property values data is not available, then an error is returned. | 
|---|
| 518 | fn property_values( | 
|---|
| 519 | canonical_property_name: &'static str, | 
|---|
| 520 | ) -> Result<Option<PropertyValues>> { | 
|---|
| 521 | #[ cfg(not(any( | 
|---|
| 522 | feature = "unicode-age", | 
|---|
| 523 | feature = "unicode-bool", | 
|---|
| 524 | feature = "unicode-gencat", | 
|---|
| 525 | feature = "unicode-perl", | 
|---|
| 526 | feature = "unicode-script", | 
|---|
| 527 | feature = "unicode-segment", | 
|---|
| 528 | )))] | 
|---|
| 529 | fn imp(_: &'static str) -> Result<Option<PropertyValues>> { | 
|---|
| 530 | Err(Error::PropertyValueNotFound) | 
|---|
| 531 | } | 
|---|
| 532 |  | 
|---|
| 533 | #[ cfg(any( | 
|---|
| 534 | feature = "unicode-age", | 
|---|
| 535 | feature = "unicode-bool", | 
|---|
| 536 | feature = "unicode-gencat", | 
|---|
| 537 | feature = "unicode-perl", | 
|---|
| 538 | feature = "unicode-script", | 
|---|
| 539 | feature = "unicode-segment", | 
|---|
| 540 | ))] | 
|---|
| 541 | fn imp(name: &'static str) -> Result<Option<PropertyValues>> { | 
|---|
| 542 | use crate::unicode_tables::property_values::PROPERTY_VALUES; | 
|---|
| 543 |  | 
|---|
| 544 | Ok(PROPERTY_VALUES | 
|---|
| 545 | .binary_search_by_key(&name, |&(n, _)| n) | 
|---|
| 546 | .ok() | 
|---|
| 547 | .map(|i| PROPERTY_VALUES[i].1)) | 
|---|
| 548 | } | 
|---|
| 549 |  | 
|---|
| 550 | imp(canonical_property_name) | 
|---|
| 551 | } | 
|---|
| 552 |  | 
|---|
| 553 | // This is only used in some cases, but small enough to just let it be dead | 
|---|
| 554 | // instead of figuring out (and maintaining) the right set of features. | 
|---|
| 555 | #[ allow(dead_code)] | 
|---|
| 556 | fn property_set( | 
|---|
| 557 | name_map: &'static [(&'static str, Range)], | 
|---|
| 558 | canonical: &'static str, | 
|---|
| 559 | ) -> Option<Range> { | 
|---|
| 560 | name_mapOption | 
|---|
| 561 | .binary_search_by_key(&canonical, |x: &(&str, &[(char, char)])| x.0) | 
|---|
| 562 | .ok() | 
|---|
| 563 | .map(|i: usize| name_map[i].1) | 
|---|
| 564 | } | 
|---|
| 565 |  | 
|---|
| 566 | /// Returns an iterator over Unicode Age sets. Each item corresponds to a set | 
|---|
| 567 | /// of codepoints that were added in a particular revision of Unicode. The | 
|---|
| 568 | /// iterator yields items in chronological order. | 
|---|
| 569 | /// | 
|---|
| 570 | /// If the given age value isn't valid or if the data isn't available, then an | 
|---|
| 571 | /// error is returned instead. | 
|---|
| 572 | fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { | 
|---|
| 573 | #[ cfg(not(feature = "unicode-age"))] | 
|---|
| 574 | fn imp(_: &str) -> Result<impl Iterator<Item = Range>> { | 
|---|
| 575 | use std::option::IntoIter; | 
|---|
| 576 | Err::<IntoIter<Range>, _>(Error::PropertyNotFound) | 
|---|
| 577 | } | 
|---|
| 578 |  | 
|---|
| 579 | #[ cfg(feature = "unicode-age")] | 
|---|
| 580 | fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { | 
|---|
| 581 | use crate::unicode_tables::age; | 
|---|
| 582 |  | 
|---|
| 583 | const AGES: &[(&str, Range)] = &[ | 
|---|
| 584 | ( "V1_1", age::V1_1), | 
|---|
| 585 | ( "V2_0", age::V2_0), | 
|---|
| 586 | ( "V2_1", age::V2_1), | 
|---|
| 587 | ( "V3_0", age::V3_0), | 
|---|
| 588 | ( "V3_1", age::V3_1), | 
|---|
| 589 | ( "V3_2", age::V3_2), | 
|---|
| 590 | ( "V4_0", age::V4_0), | 
|---|
| 591 | ( "V4_1", age::V4_1), | 
|---|
| 592 | ( "V5_0", age::V5_0), | 
|---|
| 593 | ( "V5_1", age::V5_1), | 
|---|
| 594 | ( "V5_2", age::V5_2), | 
|---|
| 595 | ( "V6_0", age::V6_0), | 
|---|
| 596 | ( "V6_1", age::V6_1), | 
|---|
| 597 | ( "V6_2", age::V6_2), | 
|---|
| 598 | ( "V6_3", age::V6_3), | 
|---|
| 599 | ( "V7_0", age::V7_0), | 
|---|
| 600 | ( "V8_0", age::V8_0), | 
|---|
| 601 | ( "V9_0", age::V9_0), | 
|---|
| 602 | ( "V10_0", age::V10_0), | 
|---|
| 603 | ( "V11_0", age::V11_0), | 
|---|
| 604 | ( "V12_0", age::V12_0), | 
|---|
| 605 | ( "V12_1", age::V12_1), | 
|---|
| 606 | ( "V13_0", age::V13_0), | 
|---|
| 607 | ( "V14_0", age::V14_0), | 
|---|
| 608 | ( "V15_0", age::V15_0), | 
|---|
| 609 | ]; | 
|---|
| 610 | assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); | 
|---|
| 611 |  | 
|---|
| 612 | let pos = AGES.iter().position(|&(age, _)| canonical_age == age); | 
|---|
| 613 | match pos { | 
|---|
| 614 | None => Err(Error::PropertyValueNotFound), | 
|---|
| 615 | Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)), | 
|---|
| 616 | } | 
|---|
| 617 | } | 
|---|
| 618 |  | 
|---|
| 619 | imp(canonical_age) | 
|---|
| 620 | } | 
|---|
| 621 |  | 
|---|
| 622 | /// Returns the Unicode HIR class corresponding to the given general category. | 
|---|
| 623 | /// | 
|---|
| 624 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 625 | /// | 
|---|
| 626 | /// If the given general category could not be found, or if the general | 
|---|
| 627 | /// category data is not available, then an error is returned. | 
|---|
| 628 | fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 629 | #[ cfg(not(feature = "unicode-gencat"))] | 
|---|
| 630 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 631 | Err(Error::PropertyNotFound) | 
|---|
| 632 | } | 
|---|
| 633 |  | 
|---|
| 634 | #[ cfg(feature = "unicode-gencat")] | 
|---|
| 635 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 636 | use crate::unicode_tables::general_category::BY_NAME; | 
|---|
| 637 | match name { | 
|---|
| 638 | "ASCII"=> Ok(hir_class(&[( '\0 ', '\x7F ')])), | 
|---|
| 639 | "Any"=> Ok(hir_class(&[( '\0 ', '\u{10FFFF} ')])), | 
|---|
| 640 | "Assigned"=> { | 
|---|
| 641 | let mut cls = gencat( "Unassigned")?; | 
|---|
| 642 | cls.negate(); | 
|---|
| 643 | Ok(cls) | 
|---|
| 644 | } | 
|---|
| 645 | name => property_set(BY_NAME, name) | 
|---|
| 646 | .map(hir_class) | 
|---|
| 647 | .ok_or(Error::PropertyValueNotFound), | 
|---|
| 648 | } | 
|---|
| 649 | } | 
|---|
| 650 |  | 
|---|
| 651 | match canonical_name { | 
|---|
| 652 | "Decimal_Number"=> perl_digit(), | 
|---|
| 653 | name => imp(name), | 
|---|
| 654 | } | 
|---|
| 655 | } | 
|---|
| 656 |  | 
|---|
| 657 | /// Returns the Unicode HIR class corresponding to the given script. | 
|---|
| 658 | /// | 
|---|
| 659 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 660 | /// | 
|---|
| 661 | /// If the given script could not be found, or if the script data is not | 
|---|
| 662 | /// available, then an error is returned. | 
|---|
| 663 | fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 664 | #[ cfg(not(feature = "unicode-script"))] | 
|---|
| 665 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 666 | Err(Error::PropertyNotFound) | 
|---|
| 667 | } | 
|---|
| 668 |  | 
|---|
| 669 | #[ cfg(feature = "unicode-script")] | 
|---|
| 670 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 671 | use crate::unicode_tables::script::BY_NAME; | 
|---|
| 672 | property_set(BY_NAME, name) | 
|---|
| 673 | .map(hir_class) | 
|---|
| 674 | .ok_or(err:Error::PropertyValueNotFound) | 
|---|
| 675 | } | 
|---|
| 676 |  | 
|---|
| 677 | imp(canonical_name) | 
|---|
| 678 | } | 
|---|
| 679 |  | 
|---|
| 680 | /// Returns the Unicode HIR class corresponding to the given script extension. | 
|---|
| 681 | /// | 
|---|
| 682 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 683 | /// | 
|---|
| 684 | /// If the given script extension could not be found, or if the script data is | 
|---|
| 685 | /// not available, then an error is returned. | 
|---|
| 686 | fn script_extension( | 
|---|
| 687 | canonical_name: &'static str, | 
|---|
| 688 | ) -> Result<hir::ClassUnicode> { | 
|---|
| 689 | #[ cfg(not(feature = "unicode-script"))] | 
|---|
| 690 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 691 | Err(Error::PropertyNotFound) | 
|---|
| 692 | } | 
|---|
| 693 |  | 
|---|
| 694 | #[ cfg(feature = "unicode-script")] | 
|---|
| 695 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 696 | use crate::unicode_tables::script_extension::BY_NAME; | 
|---|
| 697 | property_set(BY_NAME, name) | 
|---|
| 698 | .map(hir_class) | 
|---|
| 699 | .ok_or(err:Error::PropertyValueNotFound) | 
|---|
| 700 | } | 
|---|
| 701 |  | 
|---|
| 702 | imp(canonical_name) | 
|---|
| 703 | } | 
|---|
| 704 |  | 
|---|
| 705 | /// Returns the Unicode HIR class corresponding to the given Unicode boolean | 
|---|
| 706 | /// property. | 
|---|
| 707 | /// | 
|---|
| 708 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 709 | /// | 
|---|
| 710 | /// If the given boolean property could not be found, or if the boolean | 
|---|
| 711 | /// property data is not available, then an error is returned. | 
|---|
| 712 | fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 713 | #[ cfg(not(feature = "unicode-bool"))] | 
|---|
| 714 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 715 | Err(Error::PropertyNotFound) | 
|---|
| 716 | } | 
|---|
| 717 |  | 
|---|
| 718 | #[ cfg(feature = "unicode-bool")] | 
|---|
| 719 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 720 | use crate::unicode_tables::property_bool::BY_NAME; | 
|---|
| 721 | property_set(BY_NAME, name) | 
|---|
| 722 | .map(hir_class) | 
|---|
| 723 | .ok_or(err:Error::PropertyNotFound) | 
|---|
| 724 | } | 
|---|
| 725 |  | 
|---|
| 726 | match canonical_name { | 
|---|
| 727 | "Decimal_Number"=> perl_digit(), | 
|---|
| 728 | "White_Space"=> perl_space(), | 
|---|
| 729 | name: &'static str => imp(name), | 
|---|
| 730 | } | 
|---|
| 731 | } | 
|---|
| 732 |  | 
|---|
| 733 | /// Returns the Unicode HIR class corresponding to the given grapheme cluster | 
|---|
| 734 | /// break property. | 
|---|
| 735 | /// | 
|---|
| 736 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 737 | /// | 
|---|
| 738 | /// If the given property could not be found, or if the corresponding data is | 
|---|
| 739 | /// not available, then an error is returned. | 
|---|
| 740 | fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 741 | #[ cfg(not(feature = "unicode-segment"))] | 
|---|
| 742 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 743 | Err(Error::PropertyNotFound) | 
|---|
| 744 | } | 
|---|
| 745 |  | 
|---|
| 746 | #[ cfg(feature = "unicode-segment")] | 
|---|
| 747 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 748 | use crate::unicode_tables::grapheme_cluster_break::BY_NAME; | 
|---|
| 749 | property_set(BY_NAME, name) | 
|---|
| 750 | .map(hir_class) | 
|---|
| 751 | .ok_or(err:Error::PropertyValueNotFound) | 
|---|
| 752 | } | 
|---|
| 753 |  | 
|---|
| 754 | imp(canonical_name) | 
|---|
| 755 | } | 
|---|
| 756 |  | 
|---|
| 757 | /// Returns the Unicode HIR class corresponding to the given word break | 
|---|
| 758 | /// property. | 
|---|
| 759 | /// | 
|---|
| 760 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 761 | /// | 
|---|
| 762 | /// If the given property could not be found, or if the corresponding data is | 
|---|
| 763 | /// not available, then an error is returned. | 
|---|
| 764 | fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 765 | #[ cfg(not(feature = "unicode-segment"))] | 
|---|
| 766 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 767 | Err(Error::PropertyNotFound) | 
|---|
| 768 | } | 
|---|
| 769 |  | 
|---|
| 770 | #[ cfg(feature = "unicode-segment")] | 
|---|
| 771 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 772 | use crate::unicode_tables::word_break::BY_NAME; | 
|---|
| 773 | property_set(BY_NAME, name) | 
|---|
| 774 | .map(hir_class) | 
|---|
| 775 | .ok_or(err:Error::PropertyValueNotFound) | 
|---|
| 776 | } | 
|---|
| 777 |  | 
|---|
| 778 | imp(canonical_name) | 
|---|
| 779 | } | 
|---|
| 780 |  | 
|---|
| 781 | /// Returns the Unicode HIR class corresponding to the given sentence | 
|---|
| 782 | /// break property. | 
|---|
| 783 | /// | 
|---|
| 784 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 785 | /// | 
|---|
| 786 | /// If the given property could not be found, or if the corresponding data is | 
|---|
| 787 | /// not available, then an error is returned. | 
|---|
| 788 | fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 789 | #[ cfg(not(feature = "unicode-segment"))] | 
|---|
| 790 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 791 | Err(Error::PropertyNotFound) | 
|---|
| 792 | } | 
|---|
| 793 |  | 
|---|
| 794 | #[ cfg(feature = "unicode-segment")] | 
|---|
| 795 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | 
|---|
| 796 | use crate::unicode_tables::sentence_break::BY_NAME; | 
|---|
| 797 | property_set(BY_NAME, name) | 
|---|
| 798 | .map(hir_class) | 
|---|
| 799 | .ok_or(err:Error::PropertyValueNotFound) | 
|---|
| 800 | } | 
|---|
| 801 |  | 
|---|
| 802 | imp(canonical_name) | 
|---|
| 803 | } | 
|---|
| 804 |  | 
|---|
| 805 | /// Like symbolic_name_normalize_bytes, but operates on a string. | 
|---|
| 806 | fn symbolic_name_normalize(x: &str) -> String { | 
|---|
| 807 | let mut tmp: Vec = x.as_bytes().to_vec(); | 
|---|
| 808 | let len: usize = symbolic_name_normalize_bytes(&mut tmp).len(); | 
|---|
| 809 | tmp.truncate(len); | 
|---|
| 810 | // This should always succeed because `symbolic_name_normalize_bytes` | 
|---|
| 811 | // guarantees that `&tmp[..len]` is always valid UTF-8. | 
|---|
| 812 | // | 
|---|
| 813 | // N.B. We could avoid the additional UTF-8 check here, but it's unlikely | 
|---|
| 814 | // to be worth skipping the additional safety check. A benchmark must | 
|---|
| 815 | // justify it first. | 
|---|
| 816 | String::from_utf8(vec:tmp).unwrap() | 
|---|
| 817 | } | 
|---|
| 818 |  | 
|---|
/// Normalizes a symbolic name in place following UAX44-LM3 and returns the
/// normalized prefix of the given buffer.
///
/// "Symbolic names" are typically property names and property value
/// aliases. This routine should not be applied to property string values.
///
/// Normalization strips a leading "is" (in any ASCII casing), removes
/// spaces, underscores and hyphens, lowercases ASCII letters and discards
/// every non-ASCII byte. As a consequence, the returned slice is valid UTF-8
/// no matter what `slice` contains.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // The standard does not appear to mandate a particular structure for
    // property names/aliases (unlike character names), so they are treated
    // as ASCII-only and anything else is thrown away.

    // A leading "is" is ignored, whatever its casing.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let first_read = if starts_with_is { 2 } else { 0 };

    // Compact the surviving bytes towards the front of the buffer. The write
    // cursor never overtakes the read cursor, so no unread byte is clobbered.
    let mut written = 0;
    for read in first_read..slice.len() {
        let byte = slice[read];
        match byte {
            // Separators carry no meaning under loose matching.
            b' ' | b'_' | b'-' => {}
            // VALIDITY ARGUMENT: only ASCII bytes are ever kept, so the
            // output is guaranteed to be valid UTF-8.
            _ if byte.is_ascii() => {
                slice[written] = byte.to_ascii_lowercase();
                written += 1;
            }
            // Non-ASCII bytes are dropped.
            _ => {}
        }
    }

    // Special case: 'isc' is the abbreviation of ISO_Comment. Stripping the
    // "is" prefix would reduce it to 'c', which is really an alias for the
    // 'Other' general category, so the full 'isc' spelling is restored.
    if starts_with_is && written == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        written = 3;
    }
    &mut slice[..written]
}
|---|
| 873 |  | 
|---|
/// Unit tests for simple case folding, case-mapping range checks, class
/// query canonicalization and symbolic name normalization.
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// Folds `c`, panicking unless both layers of `simple_fold`'s result are
    /// `Ok`, and returns the resulting iterator.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Folds `c`, expecting the inner result to be `Err`, and returns the
    /// value carried by that error.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    /// Unwrapping shorthand for `contains_simple_case_mapping`.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // U+212A is KELVIN SIGN, which folds together with 'k' and 'K'. It is
        // written as an escape because it is easily confused with (or
        // normalized into) ASCII 'K'.
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('\u{212A}').collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        // None of these inputs has a case mapping of its own; the expected
        // values suggest the error carries the next codepoint that does.
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The stray 0xFF byte is not ASCII and must be dropped so that the
        // normalized output is still valid UTF-8.
        let mut x = b"abc\xFF xyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}
|---|
| 1002 |  | 
|---|