| 1 | use alloc::{ | 
| 2 | string::{String, ToString}, | 
|---|
| 3 | vec::Vec, | 
|---|
| 4 | }; | 
|---|
| 5 |  | 
|---|
| 6 | use crate::hir; | 
|---|
| 7 |  | 
|---|
/// A static slice of inclusive `(start, end)` codepoint ranges. The data
/// comes from a generated file, which is why the slice carries the `'static`
/// lifetime.
type Range = &'static [(char, char)];
|---|
| 11 |  | 
|---|
/// The internal error type for Unicode lookups.
///
/// The `Error` trait is intentionally not implemented: this type isn't
/// exported, and its values always get converted into other public errors.
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be resolved.
    PropertyNotFound,
    /// The requested property value could not be resolved.
    PropertyValueNotFound,
    /// The data for a Perl character class is unavailable.
    // Never constructed when unicode-perl is enabled, hence the allowance.
    #[allow(dead_code)]
    PerlClassNotFound,
}
|---|
| 24 |  | 
|---|
/// The error reported when Unicode-aware simple case folding can't be done.
///
/// Case folding needs the Unicode case mapping tables, and those are only
/// compiled in when the `unicode-case` feature is enabled (which it is by
/// default). When the feature is disabled, this error is what callers get.
#[derive(Debug)]
pub struct CaseFoldError(());

#[cfg(feature = "std")]
impl std::error::Error for CaseFoldError {}

impl core::fmt::Display for CaseFoldError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The message has no format arguments, so it is written directly.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
|---|
| 45 |  | 
|---|
/// An error that occurs when the Unicode-aware `\w` class is unavailable.
///
/// This error can occur when the data tables necessary for the Unicode aware
/// Perl character class `\w` are unavailable. This only occurs when the
/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordError {}

impl core::fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // `\\w` renders as the two characters `\w`, i.e. the name of the
        // Perl class. There must be no space between the backslash and `w`.
        write!(
            f,
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)"
        )
    }
}
|---|
| 66 |  | 
|---|
| 67 | /// A state oriented traverser of the simple case folding table. | 
|---|
| 68 | /// | 
|---|
| 69 | /// A case folder can be constructed via `SimpleCaseFolder::new()`, which will | 
|---|
| 70 | /// return an error if the underlying case folding table is unavailable. | 
|---|
| 71 | /// | 
|---|
| 72 | /// After construction, it is expected that callers will use | 
|---|
| 73 | /// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly | 
|---|
| 74 | /// increasing order. For example, calling it on `b` and then on `a` is illegal | 
|---|
| 75 | /// and will result in a panic. | 
|---|
| 76 | /// | 
|---|
| 77 | /// The main idea of this type is that it tries hard to make mapping lookups | 
|---|
| 78 | /// fast by exploiting the structure of the underlying table, and the ordering | 
|---|
| 79 | /// assumption enables this. | 
|---|
| 80 | #[ derive(Debug)] | 
|---|
| 81 | pub struct SimpleCaseFolder { | 
|---|
| 82 | /// The simple case fold table. It's a sorted association list, where the | 
|---|
| 83 | /// keys are Unicode scalar values and the values are the corresponding | 
|---|
| 84 | /// equivalence class (not including the key) of the "simple" case folded | 
|---|
| 85 | /// Unicode scalar values. | 
|---|
| 86 | table: &'static [(char, &'static [char])], | 
|---|
| 87 | /// The last codepoint that was used for a lookup. | 
|---|
| 88 | last: Option<char>, | 
|---|
| 89 | /// The index to the entry in `table` corresponding to the smallest key `k` | 
|---|
| 90 | /// such that `k > k0`, where `k0` is the most recent key lookup. Note that | 
|---|
| 91 | /// in particular, `k0` may not be in the table! | 
|---|
| 92 | next: usize, | 
|---|
| 93 | } | 
|---|
| 94 |  | 
|---|
| 95 | impl SimpleCaseFolder { | 
|---|
| 96 | /// Create a new simple case folder, returning an error if the underlying | 
|---|
| 97 | /// case folding table is unavailable. | 
|---|
| 98 | pub fn new() -> Result<SimpleCaseFolder, CaseFoldError> { | 
|---|
| 99 | #[ cfg(not(feature = "unicode-case"))] | 
|---|
| 100 | { | 
|---|
| 101 | Err(CaseFoldError(())) | 
|---|
| 102 | } | 
|---|
| 103 | #[ cfg(feature = "unicode-case")] | 
|---|
| 104 | { | 
|---|
| 105 | Ok(SimpleCaseFolder { | 
|---|
| 106 | table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE, | 
|---|
| 107 | last: None, | 
|---|
| 108 | next: 0, | 
|---|
| 109 | }) | 
|---|
| 110 | } | 
|---|
| 111 | } | 
|---|
| 112 |  | 
|---|
| 113 | /// Return the equivalence class of case folded codepoints for the given | 
|---|
| 114 | /// codepoint. The equivalence class returned never includes the codepoint | 
|---|
| 115 | /// given. If the given codepoint has no case folded codepoints (i.e., | 
|---|
| 116 | /// no entry in the underlying case folding table), then this returns an | 
|---|
| 117 | /// empty slice. | 
|---|
| 118 | /// | 
|---|
| 119 | /// # Panics | 
|---|
| 120 | /// | 
|---|
| 121 | /// This panics when called with a `c` that is less than or equal to the | 
|---|
| 122 | /// previous call. In other words, callers need to use this method with | 
|---|
| 123 | /// strictly increasing values of `c`. | 
|---|
| 124 | pub fn mapping(&mut self, c: char) -> &'static [char] { | 
|---|
| 125 | if let Some(last) = self.last { | 
|---|
| 126 | assert!( | 
|---|
| 127 | last < c, | 
|---|
| 128 | "got codepoint U+{:X}  which occurs before \ | 
|---|
| 129 |                  last codepoint U+{:X} ", | 
|---|
| 130 | u32::from(c), | 
|---|
| 131 | u32::from(last), | 
|---|
| 132 | ); | 
|---|
| 133 | } | 
|---|
| 134 | self.last = Some(c); | 
|---|
| 135 | if self.next >= self.table.len() { | 
|---|
| 136 | return &[]; | 
|---|
| 137 | } | 
|---|
| 138 | let (k, v) = self.table[self.next]; | 
|---|
| 139 | if k == c { | 
|---|
| 140 | self.next += 1; | 
|---|
| 141 | return v; | 
|---|
| 142 | } | 
|---|
| 143 | match self.get(c) { | 
|---|
| 144 | Err(i) => { | 
|---|
| 145 | self.next = i; | 
|---|
| 146 | &[] | 
|---|
| 147 | } | 
|---|
| 148 | Ok(i) => { | 
|---|
| 149 | // Since we require lookups to proceed | 
|---|
| 150 | // in order, anything we find should be | 
|---|
| 151 | // after whatever we thought might be | 
|---|
| 152 | // next. Otherwise, the caller is either | 
|---|
| 153 | // going out of order or we would have | 
|---|
| 154 | // found our next key at 'self.next'. | 
|---|
| 155 | assert!(i > self.next); | 
|---|
| 156 | self.next = i + 1; | 
|---|
| 157 | self.table[i].1 | 
|---|
| 158 | } | 
|---|
| 159 | } | 
|---|
| 160 | } | 
|---|
| 161 |  | 
|---|
| 162 | /// Returns true if and only if the given range overlaps with any region | 
|---|
| 163 | /// of the underlying case folding table. That is, when true, there exists | 
|---|
| 164 | /// at least one codepoint in the inclusive range `[start, end]` that has | 
|---|
| 165 | /// a non-trivial equivalence class of case folded codepoints. Conversely, | 
|---|
| 166 | /// when this returns false, all codepoints in the range `[start, end]` | 
|---|
| 167 | /// correspond to the trivial equivalence class of case folded codepoints, | 
|---|
| 168 | /// i.e., itself. | 
|---|
| 169 | /// | 
|---|
| 170 | /// This is useful to call before iterating over the codepoints in the | 
|---|
| 171 | /// range and looking up the mapping for each. If you know none of the | 
|---|
| 172 | /// mappings will return anything, then you might be able to skip doing it | 
|---|
| 173 | /// altogether. | 
|---|
| 174 | /// | 
|---|
| 175 | /// # Panics | 
|---|
| 176 | /// | 
|---|
| 177 | /// This panics when `end < start`. | 
|---|
| 178 | pub fn overlaps(&self, start: char, end: char) -> bool { | 
|---|
| 179 | use core::cmp::Ordering; | 
|---|
| 180 |  | 
|---|
| 181 | assert!(start <= end); | 
|---|
| 182 | self.table | 
|---|
| 183 | .binary_search_by(|&(c, _)| { | 
|---|
| 184 | if start <= c && c <= end { | 
|---|
| 185 | Ordering::Equal | 
|---|
| 186 | } else if c > end { | 
|---|
| 187 | Ordering::Greater | 
|---|
| 188 | } else { | 
|---|
| 189 | Ordering::Less | 
|---|
| 190 | } | 
|---|
| 191 | }) | 
|---|
| 192 | .is_ok() | 
|---|
| 193 | } | 
|---|
| 194 |  | 
|---|
| 195 | /// Returns the index at which `c` occurs in the simple case fold table. If | 
|---|
| 196 | /// `c` does not occur, then this returns an `i` such that `table[i-1].0 < | 
|---|
| 197 | /// c` and `table[i].0 > c`. | 
|---|
| 198 | fn get(&self, c: char) -> Result<usize, usize> { | 
|---|
| 199 | self.table.binary_search_by_key(&c, |&(c1, _)| c1) | 
|---|
| 200 | } | 
|---|
| 201 | } | 
|---|
| 202 |  | 
|---|
/// Describes which Unicode-defined character class a caller is asking for.
///
/// A class can be requested either by a property name alone or by a
/// property name paired with a property value. Lookup by name alone is
/// generally meant for Binary properties (see UTS#44, Table 8). As a special
/// exception (see UTS#18, Section 1.2), general categories (an enumeration)
/// and scripts (a catalog) may also be named directly, as though each of
/// their values were itself a binary property.
///
/// Names and values are always normalized and canonicalized before use, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name
/// and the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// A Unicode binary property identified by a single letter.
    OneLetter(char),
    /// A Unicode binary property identified by name.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values are also accepted here, as if they were
    /// binary properties.
    Binary(&'a str),
    /// All codepoints whose property `property_name` has the value
    /// `property_value`.
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value the property must have.
        property_value: &'a str,
    },
}
|---|
| 236 |  | 
|---|
| 237 | impl<'a> ClassQuery<'a> { | 
|---|
| 238 | fn canonicalize(&self) -> Result<CanonicalClassQuery, Error> { | 
|---|
| 239 | match *self { | 
|---|
| 240 | ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), | 
|---|
| 241 | ClassQuery::Binary(name) => self.canonical_binary(name), | 
|---|
| 242 | ClassQuery::ByValue { property_name, property_value } => { | 
|---|
| 243 | let property_name = symbolic_name_normalize(property_name); | 
|---|
| 244 | let property_value = symbolic_name_normalize(property_value); | 
|---|
| 245 |  | 
|---|
| 246 | let canon_name = match canonical_prop(&property_name)? { | 
|---|
| 247 | None => return Err(Error::PropertyNotFound), | 
|---|
| 248 | Some(canon_name) => canon_name, | 
|---|
| 249 | }; | 
|---|
| 250 | Ok(match canon_name { | 
|---|
| 251 | "General_Category"=> { | 
|---|
| 252 | let canon = match canonical_gencat(&property_value)? { | 
|---|
| 253 | None => return Err(Error::PropertyValueNotFound), | 
|---|
| 254 | Some(canon) => canon, | 
|---|
| 255 | }; | 
|---|
| 256 | CanonicalClassQuery::GeneralCategory(canon) | 
|---|
| 257 | } | 
|---|
| 258 | "Script"=> { | 
|---|
| 259 | let canon = match canonical_script(&property_value)? { | 
|---|
| 260 | None => return Err(Error::PropertyValueNotFound), | 
|---|
| 261 | Some(canon) => canon, | 
|---|
| 262 | }; | 
|---|
| 263 | CanonicalClassQuery::Script(canon) | 
|---|
| 264 | } | 
|---|
| 265 | _ => { | 
|---|
| 266 | let vals = match property_values(canon_name)? { | 
|---|
| 267 | None => return Err(Error::PropertyValueNotFound), | 
|---|
| 268 | Some(vals) => vals, | 
|---|
| 269 | }; | 
|---|
| 270 | let canon_val = | 
|---|
| 271 | match canonical_value(vals, &property_value) { | 
|---|
| 272 | None => { | 
|---|
| 273 | return Err(Error::PropertyValueNotFound) | 
|---|
| 274 | } | 
|---|
| 275 | Some(canon_val) => canon_val, | 
|---|
| 276 | }; | 
|---|
| 277 | CanonicalClassQuery::ByValue { | 
|---|
| 278 | property_name: canon_name, | 
|---|
| 279 | property_value: canon_val, | 
|---|
| 280 | } | 
|---|
| 281 | } | 
|---|
| 282 | }) | 
|---|
| 283 | } | 
|---|
| 284 | } | 
|---|
| 285 | } | 
|---|
| 286 |  | 
|---|
| 287 | fn canonical_binary( | 
|---|
| 288 | &self, | 
|---|
| 289 | name: &str, | 
|---|
| 290 | ) -> Result<CanonicalClassQuery, Error> { | 
|---|
| 291 | let norm = symbolic_name_normalize(name); | 
|---|
| 292 |  | 
|---|
| 293 | // This is a special case where 'cf' refers to the 'Format' general | 
|---|
| 294 | // category, but where the 'cf' abbreviation is also an abbreviation | 
|---|
| 295 | // for the 'Case_Folding' property. But we want to treat it as | 
|---|
| 296 | // a general category. (Currently, we don't even support the | 
|---|
| 297 | // 'Case_Folding' property. But if we do in the future, users will be | 
|---|
| 298 | // required to spell it out.) | 
|---|
| 299 | // | 
|---|
| 300 | // Also 'sc' refers to the 'Currency_Symbol' general category, but is | 
|---|
| 301 | // also the abbreviation for the 'Script' property. So we avoid calling | 
|---|
| 302 | // 'canonical_prop' for it too, which would erroneously normalize it | 
|---|
| 303 | // to 'Script'. | 
|---|
| 304 | // | 
|---|
| 305 | // Another case: 'lc' is an abbreviation for the 'Cased_Letter' | 
|---|
| 306 | // general category, but is also an abbreviation for the 'Lowercase_Mapping' | 
|---|
| 307 | // property. We don't currently support the latter, so as with 'cf' | 
|---|
| 308 | // above, we treat 'lc' as 'Cased_Letter'. | 
|---|
| 309 | if norm != "cf"&& norm != "sc"&& norm != "lc"{ | 
|---|
| 310 | if let Some(canon) = canonical_prop(&norm)? { | 
|---|
| 311 | return Ok(CanonicalClassQuery::Binary(canon)); | 
|---|
| 312 | } | 
|---|
| 313 | } | 
|---|
| 314 | if let Some(canon) = canonical_gencat(&norm)? { | 
|---|
| 315 | return Ok(CanonicalClassQuery::GeneralCategory(canon)); | 
|---|
| 316 | } | 
|---|
| 317 | if let Some(canon) = canonical_script(&norm)? { | 
|---|
| 318 | return Ok(CanonicalClassQuery::Script(canon)); | 
|---|
| 319 | } | 
|---|
| 320 | Err(Error::PropertyNotFound) | 
|---|
| 321 | } | 
|---|
| 322 | } | 
|---|
| 323 |  | 
|---|
/// The canonicalized counterpart of `ClassQuery`.
///
/// Besides carrying canonical names, this type separates true binary
/// properties from general categories and scripts that were requested as if
/// they were binary properties.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, by canonical name.
    Binary(&'static str),
    /// A general category, by canonical name.
    GeneralCategory(&'static str),
    /// A script, by canonical name.
    Script(&'static str),
    /// A canonicalized property name paired with a canonicalized value.
    ///
    /// By construction, the property name here is never General_Category or
    /// Script; those are represented by the variants of the same name.
    ByValue {
        /// The canonical name of the property.
        property_name: &'static str,
        /// The canonical value of the property.
        property_value: &'static str,
    },
}
|---|
| 348 |  | 
|---|
| 349 | /// Looks up a Unicode class given a query. If one doesn't exist, then | 
|---|
| 350 | /// `None` is returned. | 
|---|
| 351 | pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 352 | use self::CanonicalClassQuery::*; | 
|---|
| 353 |  | 
|---|
| 354 | match query.canonicalize()? { | 
|---|
| 355 | Binary(name) => bool_property(name), | 
|---|
| 356 | GeneralCategory(name) => gencat(name), | 
|---|
| 357 | Script(name) => script(name), | 
|---|
| 358 | ByValue { property_name: "Age", property_value } => { | 
|---|
| 359 | let mut class = hir::ClassUnicode::empty(); | 
|---|
| 360 | for set in ages(property_value)? { | 
|---|
| 361 | class.union(&hir_class(set)); | 
|---|
| 362 | } | 
|---|
| 363 | Ok(class) | 
|---|
| 364 | } | 
|---|
| 365 | ByValue { property_name: "Script_Extensions", property_value } => { | 
|---|
| 366 | script_extension(property_value) | 
|---|
| 367 | } | 
|---|
| 368 | ByValue { | 
|---|
| 369 | property_name: "Grapheme_Cluster_Break", | 
|---|
| 370 | property_value, | 
|---|
| 371 | } => gcb(property_value), | 
|---|
| 372 | ByValue { property_name: "Sentence_Break", property_value } => { | 
|---|
| 373 | sb(property_value) | 
|---|
| 374 | } | 
|---|
| 375 | ByValue { property_name: "Word_Break", property_value } => { | 
|---|
| 376 | wb(property_value) | 
|---|
| 377 | } | 
|---|
| 378 | _ => { | 
|---|
| 379 | // What else should we support? | 
|---|
| 380 | Err(Error::PropertyNotFound) | 
|---|
| 381 | } | 
|---|
| 382 | } | 
|---|
| 383 | } | 
|---|
| 384 |  | 
|---|
| 385 | /// Returns a Unicode aware class for \w. | 
|---|
| 386 | /// | 
|---|
| 387 | /// This returns an error if the data is not available for \w. | 
|---|
| 388 | pub fn perl_word() -> Result<hir::ClassUnicode, Error> { | 
|---|
| 389 | #[ cfg(not(feature = "unicode-perl"))] | 
|---|
| 390 | fn imp() -> Result<hir::ClassUnicode, Error> { | 
|---|
| 391 | Err(Error::PerlClassNotFound) | 
|---|
| 392 | } | 
|---|
| 393 |  | 
|---|
| 394 | #[ cfg(feature = "unicode-perl")] | 
|---|
| 395 | fn imp() -> Result<hir::ClassUnicode, Error> { | 
|---|
| 396 | use crate::unicode_tables::perl_word::PERL_WORD; | 
|---|
| 397 | Ok(hir_class(PERL_WORD)) | 
|---|
| 398 | } | 
|---|
| 399 |  | 
|---|
| 400 | imp() | 
|---|
| 401 | } | 
|---|
| 402 |  | 
|---|
| 403 | /// Returns a Unicode aware class for \s. | 
|---|
| 404 | /// | 
|---|
| 405 | /// This returns an error if the data is not available for \s. | 
|---|
| 406 | pub fn perl_space() -> Result<hir::ClassUnicode, Error> { | 
|---|
| 407 | #[ cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] | 
|---|
| 408 | fn imp() -> Result<hir::ClassUnicode, Error> { | 
|---|
| 409 | Err(Error::PerlClassNotFound) | 
|---|
| 410 | } | 
|---|
| 411 |  | 
|---|
| 412 | #[ cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] | 
|---|
| 413 | fn imp() -> Result<hir::ClassUnicode, Error> { | 
|---|
| 414 | use crate::unicode_tables::perl_space::WHITE_SPACE; | 
|---|
| 415 | Ok(hir_class(WHITE_SPACE)) | 
|---|
| 416 | } | 
|---|
| 417 |  | 
|---|
| 418 | #[ cfg(feature = "unicode-bool")] | 
|---|
| 419 | fn imp() -> Result<hir::ClassUnicode, Error> { | 
|---|
| 420 | use crate::unicode_tables::property_bool::WHITE_SPACE; | 
|---|
| 421 | Ok(hir_class(WHITE_SPACE)) | 
|---|
| 422 | } | 
|---|
| 423 |  | 
|---|
| 424 | imp() | 
|---|
| 425 | } | 
|---|
| 426 |  | 
|---|
| 427 | /// Returns a Unicode aware class for \d. | 
|---|
| 428 | /// | 
|---|
| 429 | /// This returns an error if the data is not available for \d. | 
|---|
| 430 | pub fn perl_digit() -> Result<hir::ClassUnicode, Error> { | 
|---|
| 431 | #[ cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] | 
|---|
| 432 | fn imp() -> Result<hir::ClassUnicode, Error> { | 
|---|
| 433 | Err(Error::PerlClassNotFound) | 
|---|
| 434 | } | 
|---|
| 435 |  | 
|---|
| 436 | #[ cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] | 
|---|
| 437 | fn imp() -> Result<hir::ClassUnicode, Error> { | 
|---|
| 438 | use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; | 
|---|
| 439 | Ok(hir_class(DECIMAL_NUMBER)) | 
|---|
| 440 | } | 
|---|
| 441 |  | 
|---|
| 442 | #[ cfg(feature = "unicode-gencat")] | 
|---|
| 443 | fn imp() -> Result<hir::ClassUnicode, Error> { | 
|---|
| 444 | use crate::unicode_tables::general_category::DECIMAL_NUMBER; | 
|---|
| 445 | Ok(hir_class(DECIMAL_NUMBER)) | 
|---|
| 446 | } | 
|---|
| 447 |  | 
|---|
| 448 | imp() | 
|---|
| 449 | } | 
|---|
| 450 |  | 
|---|
| 451 | /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. | 
|---|
| 452 | pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { | 
|---|
| 453 | let hir_ranges: Vec<hir::ClassUnicodeRange> = rangesimpl Iterator  | 
|---|
| 454 | .iter() | 
|---|
| 455 | .map(|&(s: char, e: char)| hir::ClassUnicodeRange::new(start:s, end:e)) | 
|---|
| 456 | .collect(); | 
|---|
| 457 | hir::ClassUnicode::new(hir_ranges) | 
|---|
| 458 | } | 
|---|
| 459 |  | 
|---|
| 460 | /// Returns true only if the given codepoint is in the `\w` character class. | 
|---|
| 461 | /// | 
|---|
| 462 | /// If the `unicode-perl` feature is not enabled, then this returns an error. | 
|---|
| 463 | pub fn is_word_character(c: char) -> Result<bool, UnicodeWordError> { | 
|---|
| 464 | #[ cfg(not(feature = "unicode-perl"))] | 
|---|
| 465 | fn imp(_: char) -> Result<bool, UnicodeWordError> { | 
|---|
| 466 | Err(UnicodeWordError(())) | 
|---|
| 467 | } | 
|---|
| 468 |  | 
|---|
| 469 | #[ cfg(feature = "unicode-perl")] | 
|---|
| 470 | fn imp(c: char) -> Result<bool, UnicodeWordError> { | 
|---|
| 471 | use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD}; | 
|---|
| 472 |  | 
|---|
| 473 | if u8::try_from(c).map_or(false, is_word_byte) { | 
|---|
| 474 | return Ok(true); | 
|---|
| 475 | } | 
|---|
| 476 | Ok(PERL_WORD | 
|---|
| 477 | .binary_search_by(|&(start, end)| { | 
|---|
| 478 | use core::cmp::Ordering; | 
|---|
| 479 |  | 
|---|
| 480 | if start <= c && c <= end { | 
|---|
| 481 | Ordering::Equal | 
|---|
| 482 | } else if start > c { | 
|---|
| 483 | Ordering::Greater | 
|---|
| 484 | } else { | 
|---|
| 485 | Ordering::Less | 
|---|
| 486 | } | 
|---|
| 487 | }) | 
|---|
| 488 | .is_ok()) | 
|---|
| 489 | } | 
|---|
| 490 |  | 
|---|
| 491 | imp(c) | 
|---|
| 492 | } | 
|---|
| 493 |  | 
|---|
/// The table of values belonging to one particular property.
///
/// Every entry is a pair: first the normalized spelling of a property value,
/// then the canonical property value it stands for.
type PropertyValues = &'static [(&'static str, &'static str)];
|---|
| 500 |  | 
|---|
| 501 | fn canonical_gencat( | 
|---|
| 502 | normalized_value: &str, | 
|---|
| 503 | ) -> Result<Option<&'static str>, Error> { | 
|---|
| 504 | Ok(match normalized_value { | 
|---|
| 505 | "any"=> Some( "Any"), | 
|---|
| 506 | "assigned"=> Some( "Assigned"), | 
|---|
| 507 | "ascii"=> Some( "ASCII"), | 
|---|
| 508 | _ => { | 
|---|
| 509 | let gencats: &'static [(&str, &str)] = property_values(canonical_property_name: "General_Category")?.unwrap(); | 
|---|
| 510 | canonical_value(vals:gencats, normalized_value) | 
|---|
| 511 | } | 
|---|
| 512 | }) | 
|---|
| 513 | } | 
|---|
| 514 |  | 
|---|
| 515 | fn canonical_script( | 
|---|
| 516 | normalized_value: &str, | 
|---|
| 517 | ) -> Result<Option<&'static str>, Error> { | 
|---|
| 518 | let scripts: &'static [(&str, &str)] = property_values(canonical_property_name: "Script")?.unwrap(); | 
|---|
| 519 | Ok(canonical_value(vals:scripts, normalized_value)) | 
|---|
| 520 | } | 
|---|
| 521 |  | 
|---|
| 522 | /// Find the canonical property name for the given normalized property name. | 
|---|
| 523 | /// | 
|---|
| 524 | /// If no such property exists, then `None` is returned. | 
|---|
| 525 | /// | 
|---|
| 526 | /// The normalized property name must have been normalized according to | 
|---|
| 527 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. | 
|---|
| 528 | /// | 
|---|
| 529 | /// If the property names data is not available, then an error is returned. | 
|---|
| 530 | fn canonical_prop( | 
|---|
| 531 | normalized_name: &str, | 
|---|
| 532 | ) -> Result<Option<&'static str>, Error> { | 
|---|
| 533 | #[ cfg(not(any( | 
|---|
| 534 | feature = "unicode-age", | 
|---|
| 535 | feature = "unicode-bool", | 
|---|
| 536 | feature = "unicode-gencat", | 
|---|
| 537 | feature = "unicode-perl", | 
|---|
| 538 | feature = "unicode-script", | 
|---|
| 539 | feature = "unicode-segment", | 
|---|
| 540 | )))] | 
|---|
| 541 | fn imp(_: &str) -> Result<Option<&'static str>, Error> { | 
|---|
| 542 | Err(Error::PropertyNotFound) | 
|---|
| 543 | } | 
|---|
| 544 |  | 
|---|
| 545 | #[ cfg(any( | 
|---|
| 546 | feature = "unicode-age", | 
|---|
| 547 | feature = "unicode-bool", | 
|---|
| 548 | feature = "unicode-gencat", | 
|---|
| 549 | feature = "unicode-perl", | 
|---|
| 550 | feature = "unicode-script", | 
|---|
| 551 | feature = "unicode-segment", | 
|---|
| 552 | ))] | 
|---|
| 553 | fn imp(name: &str) -> Result<Option<&'static str>, Error> { | 
|---|
| 554 | use crate::unicode_tables::property_names::PROPERTY_NAMES; | 
|---|
| 555 |  | 
|---|
| 556 | Ok(PROPERTY_NAMES | 
|---|
| 557 | .binary_search_by_key(&name, |&(n, _)| n) | 
|---|
| 558 | .ok() | 
|---|
| 559 | .map(|i| PROPERTY_NAMES[i].1)) | 
|---|
| 560 | } | 
|---|
| 561 |  | 
|---|
| 562 | imp(normalized_name) | 
|---|
| 563 | } | 
|---|
| 564 |  | 
|---|
| 565 | /// Find the canonical property value for the given normalized property | 
|---|
| 566 | /// value. | 
|---|
| 567 | /// | 
|---|
| 568 | /// The given property values should correspond to the values for the property | 
|---|
| 569 | /// under question, which can be found using `property_values`. | 
|---|
| 570 | /// | 
|---|
| 571 | /// If no such property value exists, then `None` is returned. | 
|---|
| 572 | /// | 
|---|
| 573 | /// The normalized property value must have been normalized according to | 
|---|
| 574 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. | 
|---|
| 575 | fn canonical_value( | 
|---|
| 576 | vals: PropertyValues, | 
|---|
| 577 | normalized_value: &str, | 
|---|
| 578 | ) -> Option<&'static str> { | 
|---|
| 579 | valsOption.binary_search_by_key(&normalized_value, |&(n: &str, _)| n) | 
|---|
| 580 | .ok() | 
|---|
| 581 | .map(|i: usize| vals[i].1) | 
|---|
| 582 | } | 
|---|
| 583 |  | 
|---|
| 584 | /// Return the table of property values for the given property name. | 
|---|
| 585 | /// | 
|---|
| 586 | /// If the property values data is not available, then an error is returned. | 
|---|
| 587 | fn property_values( | 
|---|
| 588 | canonical_property_name: &'static str, | 
|---|
| 589 | ) -> Result<Option<PropertyValues>, Error> { | 
|---|
| 590 | #[ cfg(not(any( | 
|---|
| 591 | feature = "unicode-age", | 
|---|
| 592 | feature = "unicode-bool", | 
|---|
| 593 | feature = "unicode-gencat", | 
|---|
| 594 | feature = "unicode-perl", | 
|---|
| 595 | feature = "unicode-script", | 
|---|
| 596 | feature = "unicode-segment", | 
|---|
| 597 | )))] | 
|---|
| 598 | fn imp(_: &'static str) -> Result<Option<PropertyValues>, Error> { | 
|---|
| 599 | Err(Error::PropertyValueNotFound) | 
|---|
| 600 | } | 
|---|
| 601 |  | 
|---|
| 602 | #[ cfg(any( | 
|---|
| 603 | feature = "unicode-age", | 
|---|
| 604 | feature = "unicode-bool", | 
|---|
| 605 | feature = "unicode-gencat", | 
|---|
| 606 | feature = "unicode-perl", | 
|---|
| 607 | feature = "unicode-script", | 
|---|
| 608 | feature = "unicode-segment", | 
|---|
| 609 | ))] | 
|---|
| 610 | fn imp(name: &'static str) -> Result<Option<PropertyValues>, Error> { | 
|---|
| 611 | use crate::unicode_tables::property_values::PROPERTY_VALUES; | 
|---|
| 612 |  | 
|---|
| 613 | Ok(PROPERTY_VALUES | 
|---|
| 614 | .binary_search_by_key(&name, |&(n, _)| n) | 
|---|
| 615 | .ok() | 
|---|
| 616 | .map(|i| PROPERTY_VALUES[i].1)) | 
|---|
| 617 | } | 
|---|
| 618 |  | 
|---|
| 619 | imp(canonical_property_name) | 
|---|
| 620 | } | 
|---|
| 621 |  | 
|---|
| 622 | // This is only used in some cases, but small enough to just let it be dead | 
|---|
| 623 | // instead of figuring out (and maintaining) the right set of features. | 
|---|
| 624 | #[ allow(dead_code)] | 
|---|
| 625 | fn property_set( | 
|---|
| 626 | name_map: &'static [(&'static str, Range)], | 
|---|
| 627 | canonical: &'static str, | 
|---|
| 628 | ) -> Option<Range> { | 
|---|
| 629 | name_mapOption | 
|---|
| 630 | .binary_search_by_key(&canonical, |x: &(&str, &[(char, char)])| x.0) | 
|---|
| 631 | .ok() | 
|---|
| 632 | .map(|i: usize| name_map[i].1) | 
|---|
| 633 | } | 
|---|
| 634 |  | 
|---|
| 635 | /// Returns an iterator over Unicode Age sets. Each item corresponds to a set | 
|---|
| 636 | /// of codepoints that were added in a particular revision of Unicode. The | 
|---|
| 637 | /// iterator yields items in chronological order. | 
|---|
| 638 | /// | 
|---|
| 639 | /// If the given age value isn't valid or if the data isn't available, then an | 
|---|
| 640 | /// error is returned instead. | 
|---|
| 641 | fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> { | 
|---|
| 642 | #[ cfg(not(feature = "unicode-age"))] | 
|---|
| 643 | fn imp(_: &str) -> Result<impl Iterator<Item = Range>, Error> { | 
|---|
| 644 | use core::option::IntoIter; | 
|---|
| 645 | Err::<IntoIter<Range>, _>(Error::PropertyNotFound) | 
|---|
| 646 | } | 
|---|
| 647 |  | 
|---|
| 648 | #[ cfg(feature = "unicode-age")] | 
|---|
| 649 | fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> { | 
|---|
| 650 | use crate::unicode_tables::age; | 
|---|
| 651 |  | 
|---|
| 652 | const AGES: &[(&str, Range)] = &[ | 
|---|
| 653 | ( "V1_1", age::V1_1), | 
|---|
| 654 | ( "V2_0", age::V2_0), | 
|---|
| 655 | ( "V2_1", age::V2_1), | 
|---|
| 656 | ( "V3_0", age::V3_0), | 
|---|
| 657 | ( "V3_1", age::V3_1), | 
|---|
| 658 | ( "V3_2", age::V3_2), | 
|---|
| 659 | ( "V4_0", age::V4_0), | 
|---|
| 660 | ( "V4_1", age::V4_1), | 
|---|
| 661 | ( "V5_0", age::V5_0), | 
|---|
| 662 | ( "V5_1", age::V5_1), | 
|---|
| 663 | ( "V5_2", age::V5_2), | 
|---|
| 664 | ( "V6_0", age::V6_0), | 
|---|
| 665 | ( "V6_1", age::V6_1), | 
|---|
| 666 | ( "V6_2", age::V6_2), | 
|---|
| 667 | ( "V6_3", age::V6_3), | 
|---|
| 668 | ( "V7_0", age::V7_0), | 
|---|
| 669 | ( "V8_0", age::V8_0), | 
|---|
| 670 | ( "V9_0", age::V9_0), | 
|---|
| 671 | ( "V10_0", age::V10_0), | 
|---|
| 672 | ( "V11_0", age::V11_0), | 
|---|
| 673 | ( "V12_0", age::V12_0), | 
|---|
| 674 | ( "V12_1", age::V12_1), | 
|---|
| 675 | ( "V13_0", age::V13_0), | 
|---|
| 676 | ( "V14_0", age::V14_0), | 
|---|
| 677 | ( "V15_0", age::V15_0), | 
|---|
| 678 | ( "V15_1", age::V15_1), | 
|---|
| 679 | ( "V16_0", age::V16_0), | 
|---|
| 680 | ]; | 
|---|
| 681 | assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); | 
|---|
| 682 |  | 
|---|
| 683 | let pos = AGES.iter().position(|&(age, _)| canonical_age == age); | 
|---|
| 684 | match pos { | 
|---|
| 685 | None => Err(Error::PropertyValueNotFound), | 
|---|
| 686 | Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)), | 
|---|
| 687 | } | 
|---|
| 688 | } | 
|---|
| 689 |  | 
|---|
| 690 | imp(canonical_age) | 
|---|
| 691 | } | 
|---|
| 692 |  | 
|---|
| 693 | /// Returns the Unicode HIR class corresponding to the given general category. | 
|---|
| 694 | /// | 
|---|
| 695 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 696 | /// | 
|---|
| 697 | /// If the given general category could not be found, or if the general | 
|---|
| 698 | /// category data is not available, then an error is returned. | 
|---|
| 699 | fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 700 | #[ cfg(not(feature = "unicode-gencat"))] | 
|---|
| 701 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 702 | Err(Error::PropertyNotFound) | 
|---|
| 703 | } | 
|---|
| 704 |  | 
|---|
| 705 | #[ cfg(feature = "unicode-gencat")] | 
|---|
| 706 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 707 | use crate::unicode_tables::general_category::BY_NAME; | 
|---|
| 708 | match name { | 
|---|
| 709 | "ASCII"=> Ok(hir_class(&[( '\0 ', '\x7F ')])), | 
|---|
| 710 | "Any"=> Ok(hir_class(&[( '\0 ', '\u{10FFFF} ')])), | 
|---|
| 711 | "Assigned"=> { | 
|---|
| 712 | let mut cls = gencat( "Unassigned")?; | 
|---|
| 713 | cls.negate(); | 
|---|
| 714 | Ok(cls) | 
|---|
| 715 | } | 
|---|
| 716 | name => property_set(BY_NAME, name) | 
|---|
| 717 | .map(hir_class) | 
|---|
| 718 | .ok_or(Error::PropertyValueNotFound), | 
|---|
| 719 | } | 
|---|
| 720 | } | 
|---|
| 721 |  | 
|---|
| 722 | match canonical_name { | 
|---|
| 723 | "Decimal_Number"=> perl_digit(), | 
|---|
| 724 | name => imp(name), | 
|---|
| 725 | } | 
|---|
| 726 | } | 
|---|
| 727 |  | 
|---|
| 728 | /// Returns the Unicode HIR class corresponding to the given script. | 
|---|
| 729 | /// | 
|---|
| 730 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 731 | /// | 
|---|
| 732 | /// If the given script could not be found, or if the script data is not | 
|---|
| 733 | /// available, then an error is returned. | 
|---|
| 734 | fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 735 | #[ cfg(not(feature = "unicode-script"))] | 
|---|
| 736 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 737 | Err(Error::PropertyNotFound) | 
|---|
| 738 | } | 
|---|
| 739 |  | 
|---|
| 740 | #[ cfg(feature = "unicode-script")] | 
|---|
| 741 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 742 | use crate::unicode_tables::script::BY_NAME; | 
|---|
| 743 | property_set(BY_NAME, name) | 
|---|
| 744 | .map(hir_class) | 
|---|
| 745 | .ok_or(err:Error::PropertyValueNotFound) | 
|---|
| 746 | } | 
|---|
| 747 |  | 
|---|
| 748 | imp(canonical_name) | 
|---|
| 749 | } | 
|---|
| 750 |  | 
|---|
| 751 | /// Returns the Unicode HIR class corresponding to the given script extension. | 
|---|
| 752 | /// | 
|---|
| 753 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 754 | /// | 
|---|
| 755 | /// If the given script extension could not be found, or if the script data is | 
|---|
| 756 | /// not available, then an error is returned. | 
|---|
| 757 | fn script_extension( | 
|---|
| 758 | canonical_name: &'static str, | 
|---|
| 759 | ) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 760 | #[ cfg(not(feature = "unicode-script"))] | 
|---|
| 761 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 762 | Err(Error::PropertyNotFound) | 
|---|
| 763 | } | 
|---|
| 764 |  | 
|---|
| 765 | #[ cfg(feature = "unicode-script")] | 
|---|
| 766 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 767 | use crate::unicode_tables::script_extension::BY_NAME; | 
|---|
| 768 | property_set(BY_NAME, name) | 
|---|
| 769 | .map(hir_class) | 
|---|
| 770 | .ok_or(err:Error::PropertyValueNotFound) | 
|---|
| 771 | } | 
|---|
| 772 |  | 
|---|
| 773 | imp(canonical_name) | 
|---|
| 774 | } | 
|---|
| 775 |  | 
|---|
| 776 | /// Returns the Unicode HIR class corresponding to the given Unicode boolean | 
|---|
| 777 | /// property. | 
|---|
| 778 | /// | 
|---|
| 779 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 780 | /// | 
|---|
| 781 | /// If the given boolean property could not be found, or if the boolean | 
|---|
| 782 | /// property data is not available, then an error is returned. | 
|---|
| 783 | fn bool_property( | 
|---|
| 784 | canonical_name: &'static str, | 
|---|
| 785 | ) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 786 | #[ cfg(not(feature = "unicode-bool"))] | 
|---|
| 787 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 788 | Err(Error::PropertyNotFound) | 
|---|
| 789 | } | 
|---|
| 790 |  | 
|---|
| 791 | #[ cfg(feature = "unicode-bool")] | 
|---|
| 792 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 793 | use crate::unicode_tables::property_bool::BY_NAME; | 
|---|
| 794 | property_set(BY_NAME, name) | 
|---|
| 795 | .map(hir_class) | 
|---|
| 796 | .ok_or(err:Error::PropertyNotFound) | 
|---|
| 797 | } | 
|---|
| 798 |  | 
|---|
| 799 | match canonical_name { | 
|---|
| 800 | "Decimal_Number"=> perl_digit(), | 
|---|
| 801 | "White_Space"=> perl_space(), | 
|---|
| 802 | name: &'static str => imp(name), | 
|---|
| 803 | } | 
|---|
| 804 | } | 
|---|
| 805 |  | 
|---|
| 806 | /// Returns the Unicode HIR class corresponding to the given grapheme cluster | 
|---|
| 807 | /// break property. | 
|---|
| 808 | /// | 
|---|
| 809 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 810 | /// | 
|---|
| 811 | /// If the given property could not be found, or if the corresponding data is | 
|---|
| 812 | /// not available, then an error is returned. | 
|---|
| 813 | fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 814 | #[ cfg(not(feature = "unicode-segment"))] | 
|---|
| 815 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 816 | Err(Error::PropertyNotFound) | 
|---|
| 817 | } | 
|---|
| 818 |  | 
|---|
| 819 | #[ cfg(feature = "unicode-segment")] | 
|---|
| 820 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 821 | use crate::unicode_tables::grapheme_cluster_break::BY_NAME; | 
|---|
| 822 | property_set(BY_NAME, name) | 
|---|
| 823 | .map(hir_class) | 
|---|
| 824 | .ok_or(err:Error::PropertyValueNotFound) | 
|---|
| 825 | } | 
|---|
| 826 |  | 
|---|
| 827 | imp(canonical_name) | 
|---|
| 828 | } | 
|---|
| 829 |  | 
|---|
| 830 | /// Returns the Unicode HIR class corresponding to the given word break | 
|---|
| 831 | /// property. | 
|---|
| 832 | /// | 
|---|
| 833 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 834 | /// | 
|---|
| 835 | /// If the given property could not be found, or if the corresponding data is | 
|---|
| 836 | /// not available, then an error is returned. | 
|---|
| 837 | fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 838 | #[ cfg(not(feature = "unicode-segment"))] | 
|---|
| 839 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 840 | Err(Error::PropertyNotFound) | 
|---|
| 841 | } | 
|---|
| 842 |  | 
|---|
| 843 | #[ cfg(feature = "unicode-segment")] | 
|---|
| 844 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 845 | use crate::unicode_tables::word_break::BY_NAME; | 
|---|
| 846 | property_set(BY_NAME, name) | 
|---|
| 847 | .map(hir_class) | 
|---|
| 848 | .ok_or(err:Error::PropertyValueNotFound) | 
|---|
| 849 | } | 
|---|
| 850 |  | 
|---|
| 851 | imp(canonical_name) | 
|---|
| 852 | } | 
|---|
| 853 |  | 
|---|
| 854 | /// Returns the Unicode HIR class corresponding to the given sentence | 
|---|
| 855 | /// break property. | 
|---|
| 856 | /// | 
|---|
| 857 | /// Name canonicalization is assumed to be performed by the caller. | 
|---|
| 858 | /// | 
|---|
| 859 | /// If the given property could not be found, or if the corresponding data is | 
|---|
| 860 | /// not available, then an error is returned. | 
|---|
| 861 | fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 862 | #[ cfg(not(feature = "unicode-segment"))] | 
|---|
| 863 | fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 864 | Err(Error::PropertyNotFound) | 
|---|
| 865 | } | 
|---|
| 866 |  | 
|---|
| 867 | #[ cfg(feature = "unicode-segment")] | 
|---|
| 868 | fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { | 
|---|
| 869 | use crate::unicode_tables::sentence_break::BY_NAME; | 
|---|
| 870 | property_set(BY_NAME, name) | 
|---|
| 871 | .map(hir_class) | 
|---|
| 872 | .ok_or(err:Error::PropertyValueNotFound) | 
|---|
| 873 | } | 
|---|
| 874 |  | 
|---|
| 875 | imp(canonical_name) | 
|---|
| 876 | } | 
|---|
| 877 |  | 
|---|
| 878 | /// Like symbolic_name_normalize_bytes, but operates on a string. | 
|---|
| 879 | fn symbolic_name_normalize(x: &str) -> String { | 
|---|
| 880 | let mut tmp: Vec = x.as_bytes().to_vec(); | 
|---|
| 881 | let len: usize = symbolic_name_normalize_bytes(&mut tmp).len(); | 
|---|
| 882 | tmp.truncate(len); | 
|---|
| 883 | // This should always succeed because `symbolic_name_normalize_bytes` | 
|---|
| 884 | // guarantees that `&tmp[..len]` is always valid UTF-8. | 
|---|
| 885 | // | 
|---|
| 886 | // N.B. We could avoid the additional UTF-8 check here, but it's unlikely | 
|---|
| 887 | // to be worth skipping the additional safety check. A benchmark must | 
|---|
| 888 | // justify it first. | 
|---|
| 889 | String::from_utf8(vec:tmp).unwrap() | 
|---|
| 890 | } | 
|---|
| 891 |  | 
|---|
/// Normalizes a symbolic name in place following UAX44-LM3 and returns the
/// normalized prefix of the buffer.
///
/// Symbolic names are things like property names and property value
/// aliases. This must not be used on property string values.
///
/// Whatever bytes `slice` holds on entry, the returned sub-slice is valid
/// UTF-8.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // Nothing in the standard seems to pin down the structure of property
    // names/aliases (unlike character names), so the assumption made here is
    // that they are ASCII only; every non-ASCII byte is discarded.

    // A leading "is", in any mix of upper and lower case, is ignored.
    let starts_with_is = slice
        .get(..2)
        .map_or(false, |prefix| prefix.eq_ignore_ascii_case(b"is"));
    let start = if starts_with_is { 2 } else { 0 };

    let mut next_write = 0;
    for i in start..slice.len() {
        // VALIDITY ARGUMENT: only bytes in the ASCII range are ever copied
        // into the output region, and ASCII-only data is valid UTF-8.
        let byte = slice[i];
        match byte {
            // Separators carry no meaning and are removed.
            b' ' | b'_' | b'-' => {}
            b'A'..=b'Z' => {
                slice[next_write] = byte.to_ascii_lowercase();
                next_write += 1;
            }
            0x00..=0x7F => {
                slice[next_write] = byte;
                next_write += 1;
            }
            // Non-ASCII bytes are dropped.
            _ => {}
        }
    }

    // Special case: 'isc' is an abbreviation of ISO_Comment. Stripping the
    // 'is' prefix would leave just 'c', which is an alias for the 'Other'
    // general category rather than for ISO_Comment, so the prefix is put
    // back when that is all that remains.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        next_write = 3;
    }
    &mut slice[..next_write]
}
|---|
| 946 |  | 
|---|
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects the simple case folding mapping reported for `c`.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        SimpleCaseFolder::new().unwrap().mapping(c).iter().copied()
    }

    /// Reports whether the case folder says `start..=end` overlaps any
    /// case mapping.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        SimpleCaseFolder::new().unwrap().overlaps(start, end)
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // U+212A KELVIN SIGN is written as an escape because it is visually
        // indistinguishable from ASCII 'K'; spelling it literally makes the
        // three cases below look contradictory.
        const KELVIN: char = '\u{212A}';

        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, alloc::vec!['K', KELVIN]);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, alloc::vec!['k', KELVIN]);

        let xs: Vec<char> = simple_fold_ok(KELVIN).collect();
        assert_eq!(xs, alloc::vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, alloc::vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, alloc::vec!['a']);
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(SimpleCaseFolder::new().is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The non-ASCII byte and the space must both be dropped.
        let mut x = b"abc\xFF xyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}
|---|
| 1042 |  | 
|---|