// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! The collection of code for locale canonicalization.

use crate::provider::*;
use crate::LocaleTransformError;
use alloc::vec::Vec;
use core::cmp::Ordering;

use crate::LocaleExpander;
use crate::TransformResult;
use icu_locid::extensions::Extensions;
use icu_locid::subtags::{Language, Region, Script};
use icu_locid::{
    extensions::unicode::key,
    subtags::{language, Variant, Variants},
    LanguageIdentifier, Locale,
};
use icu_provider::prelude::*;
use tinystr::TinyAsciiStr;

/// Implements the algorithm defined in *[UTS #35: Annex C, LocaleId Canonicalization]*.
///
/// # Examples
///
/// ```
/// use icu::locid::Locale;
/// use icu::locid_transform::{LocaleCanonicalizer, TransformResult};
///
/// let lc = LocaleCanonicalizer::new();
///
/// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc".parse().unwrap();
/// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
/// assert_eq!(locale, "ja-Latn-alalc97-fonipa".parse().unwrap());
/// ```
///
/// [UTS #35: Annex C, LocaleId Canonicalization]: http://unicode.org/reports/tr35/#LocaleId_Canonicalization
#[derive(Debug)]
pub struct LocaleCanonicalizer {
    /// Data to support canonicalization.
    aliases: DataPayload<AliasesV2Marker>,
    /// Likely subtags implementation for delegation.
    expander: LocaleExpander,
}

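// Returns whether an alias rule, decomposed into (language, script, region,
// variants), matches `source`. An empty language or a `None` script/region in
// the rule matches anything; the rule's variants must be a subset of the
// source's variants. For example, a rule for "und-hepburn" matches
// "ja-heploc-hepburn", but a rule for "und-hepburn-heploc" does not match
// "ja-hepburn".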
fn uts35_rule_matches<'a, I>(
    source: &LanguageIdentifier,
    language: Language,
    script: Option<Script>,
    region: Option<Region>,
    raw_variants: I,
) -> bool
where
    I: Iterator<Item = &'a str>,
{
    (language.is_empty() || language == source.language)
        && (script.is_none() || script == source.script)
        && (region.is_none() || region == source.region)
        && {
            // Checks if variants are a subset of source variants.
            // As both iterators are sorted, this can be done linearly.
            let mut source_variants = source.variants.iter();
            'outer: for raw_variant in raw_variants {
                for source_variant in source_variants.by_ref() {
                    match source_variant.strict_cmp(raw_variant.as_bytes()) {
                        Ordering::Equal => {
                            // The source_variant is equal, move to next raw_variant
                            continue 'outer;
                        }
                        Ordering::Less => {
                            // The source_variant is smaller, take the next source_variant
                        }
                        Ordering::Greater => {
                            // The source_variant is greater,
                            // raw_variants is not a subset of source_variants
                            return false;
                        }
                    }
                }
                // There are raw_variants left after we exhausted source_variants
                return false;
            }
            true
        }
}

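// Applies a UTS #35 alias rule to `source` in place. Fields covered by the
// rule type are replaced outright; otherwise `replacement` only fills in
// fields that are missing from `source`. If the rule has variants, they are
// removed from the source's variants and the replacement's variants merged in.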
fn uts35_replacement<'a, I>(
    source: &mut LanguageIdentifier,
    ruletype_has_language: bool,
    ruletype_has_script: bool,
    ruletype_has_region: bool,
    ruletype_variants: Option<I>,
    replacement: &LanguageIdentifier,
) where
    I: Iterator<Item = &'a str>,
{
    if ruletype_has_language || (source.language.is_empty() && !replacement.language.is_empty()) {
        source.language = replacement.language;
    }
    if ruletype_has_script || (source.script.is_none() && replacement.script.is_some()) {
        source.script = replacement.script;
    }
    if ruletype_has_region || (source.region.is_none() && replacement.region.is_some()) {
        source.region = replacement.region;
    }
    if let Some(skips) = ruletype_variants {
        // The rule matches if the ruletype variants are a subset of the source variants.
        // This means ja-Latn-fonipa-hepburn-heploc matches against the rule for
        // hepburn-heploc and is canonicalized to ja-Latn-alalc97-fonipa.

        // We're merging three sorted deduped iterators into a new sequence:
        // sources - skips + replacements
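        // For example: sources = [fonipa, hepburn, heploc], skips = [hepburn, heploc],
        // replacements = [alalc97] yields variants = [alalc97, fonipa].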

        let mut sources = source.variants.iter().peekable();
        let mut replacements = replacement.variants.iter().peekable();
        let mut skips = skips.peekable();

        let mut variants: Vec<Variant> = Vec::new();

        loop {
            match (sources.peek(), skips.peek(), replacements.peek()) {
                (Some(&source), Some(skip), _)
                    if source.strict_cmp(skip.as_bytes()) == Ordering::Greater =>
                {
                    skips.next();
                }
                (Some(&source), Some(skip), _)
                    if source.strict_cmp(skip.as_bytes()) == Ordering::Equal =>
                {
                    skips.next();
                    sources.next();
                }
                (Some(&source), _, Some(&replacement))
                    if replacement.cmp(source) == Ordering::Less =>
                {
                    variants.push(*replacement);
                    replacements.next();
                }
                (Some(&source), _, Some(&replacement))
                    if replacement.cmp(source) == Ordering::Equal =>
                {
                    variants.push(*source);
                    sources.next();
                    replacements.next();
                }
                (Some(&source), _, _) => {
                    variants.push(*source);
                    sources.next();
                }
                (None, _, Some(&replacement)) => {
                    variants.push(*replacement);
                    replacements.next();
                }
                (None, _, None) => {
                    break;
                }
            }
        }
        source.variants = Variants::from_vec_unchecked(variants);
    }
}

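// Applies the language-only alias rules to `langid`, using the two-letter or
// three-letter lookup table depending on the length of the language subtag.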
#[inline]
fn uts35_check_language_rules(
    langid: &mut LanguageIdentifier,
    alias_data: &DataPayload<AliasesV2Marker>,
) -> TransformResult {
    if !langid.language.is_empty() {
        let lang: TinyAsciiStr<3> = langid.language.into();
        let replacement = if lang.len() == 2 {
            alias_data
                .get()
                .language_len2
                .get(&lang.resize().to_unvalidated())
        } else {
            alias_data.get().language_len3.get(&lang.to_unvalidated())
        };

        if let Some(replacement) = replacement {
            if let Ok(new_langid) = replacement.parse() {
                uts35_replacement::<core::iter::Empty<&str>>(
                    langid,
                    true,
                    false,
                    false,
                    None,
                    &new_langid,
                );
                return TransformResult::Modified;
            }
        }
    }

    TransformResult::Unmodified
}

#[cfg(feature = "compiled_data")]
impl Default for LocaleCanonicalizer {
    fn default() -> Self {
        Self::new()
    }
}

impl LocaleCanonicalizer {
    /// A constructor which creates a [`LocaleCanonicalizer`] from compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        Self::new_with_expander(LocaleExpander::new_extended())
    }

    // Note: This is a custom impl because the bounds on LocaleExpander::try_new_unstable changed
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(ANY, Self::new)]
    pub fn try_new_with_any_provider(
        provider: &(impl AnyProvider + ?Sized),
    ) -> Result<Self, LocaleTransformError> {
        let expander = LocaleExpander::try_new_with_any_provider(provider)?;
        Self::try_new_with_expander_compat(&provider.as_downcasting(), expander)
    }

    // Note: This is a custom impl because the bounds on LocaleExpander::try_new_unstable changed
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(BUFFER, Self::new)]
    #[cfg(feature = "serde")]
    pub fn try_new_with_buffer_provider(
        provider: &(impl BufferProvider + ?Sized),
    ) -> Result<Self, LocaleTransformError> {
        let expander = LocaleExpander::try_new_with_buffer_provider(provider)?;
        Self::try_new_with_expander_compat(&provider.as_deserializing(), expander)
    }

    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<P>(provider: &P) -> Result<Self, LocaleTransformError>
    where
        P: DataProvider<AliasesV2Marker>
            + DataProvider<LikelySubtagsForLanguageV1Marker>
            + DataProvider<LikelySubtagsForScriptRegionV1Marker>
            + ?Sized,
    {
        let expander = LocaleExpander::try_new_unstable(provider)?;
        Self::try_new_with_expander_unstable(provider, expander)
    }

    /// Creates a [`LocaleCanonicalizer`] with a custom [`LocaleExpander`] and compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
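    ///
    /// # Examples
    ///
    /// A minimal construction sketch, pairing the canonicalizer with the same
    /// extended expander that [`LocaleCanonicalizer::new`] uses:
    ///
    /// ```
    /// use icu::locid_transform::{LocaleCanonicalizer, LocaleExpander};
    ///
    /// let lc = LocaleCanonicalizer::new_with_expander(LocaleExpander::new_extended());
    /// ```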
    #[cfg(feature = "compiled_data")]
    pub const fn new_with_expander(expander: LocaleExpander) -> Self {
        Self {
            aliases: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_ALIASES_V2,
            ),
            expander,
        }
    }

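    // Tries the newer AliasesV2 key first; if the provider cannot supply it,
    // falls back to AliasesV1 and projects that payload into the V2 shape.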
    fn try_new_with_expander_compat<P>(
        provider: &P,
        expander: LocaleExpander,
    ) -> Result<Self, LocaleTransformError>
    where
        P: DataProvider<AliasesV2Marker> + DataProvider<AliasesV1Marker> + ?Sized,
    {
        let payload_v2: Result<DataPayload<AliasesV2Marker>, _> = provider
            .load(Default::default())
            .and_then(DataResponse::take_payload);
        let aliases = if let Ok(payload) = payload_v2 {
            payload
        } else {
            let payload_v1: DataPayload<AliasesV1Marker> = provider
                .load(Default::default())
                .and_then(DataResponse::take_payload)?;
            payload_v1.try_map_project(|st, _| st.try_into())?
        };

        Ok(Self { aliases, expander })
    }

    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_with_expander)]
    pub fn try_new_with_expander_unstable<P>(
        provider: &P,
        expander: LocaleExpander,
    ) -> Result<Self, LocaleTransformError>
    where
        P: DataProvider<AliasesV2Marker> + ?Sized,
    {
        let aliases: DataPayload<AliasesV2Marker> =
            provider.load(Default::default())?.take_payload()?;

        Ok(Self { aliases, expander })
    }

    #[doc = icu_provider::gen_any_buffer_unstable_docs!(ANY, Self::new_with_expander)]
    pub fn try_new_with_expander_with_any_provider(
        provider: &(impl AnyProvider + ?Sized),
        options: LocaleExpander,
    ) -> Result<Self, LocaleTransformError> {
        Self::try_new_with_expander_compat(&provider.as_downcasting(), options)
    }

    #[cfg(feature = "serde")]
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(BUFFER, Self::new_with_expander)]
    pub fn try_new_with_expander_with_buffer_provider(
        provider: &(impl BufferProvider + ?Sized),
        options: LocaleExpander,
    ) -> Result<Self, LocaleTransformError> {
        Self::try_new_with_expander_compat(&provider.as_deserializing(), options)
    }

    /// The canonicalize method potentially updates a passed-in locale in place,
    /// depending on the results of running the canonicalization algorithm
    /// from <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>.
    ///
    /// Some BCP47 canonicalization data is not part of the CLDR json package. Because
    /// of this, some canonicalizations are not performed, e.g. the canonicalization of
    /// `und-u-ca-islamicc` to `und-u-ca-islamic-civil`. This will be fixed in a future
    /// release once the missing data has been added to the CLDR json data. See:
    /// <https://github.com/unicode-org/icu4x/issues/746>
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locid::Locale;
    /// use icu::locid_transform::{LocaleCanonicalizer, TransformResult};
    ///
    /// let lc = LocaleCanonicalizer::new();
    ///
    /// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc".parse().unwrap();
    /// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
    /// assert_eq!(locale, "ja-Latn-alalc97-fonipa".parse().unwrap());
    /// ```
    pub fn canonicalize(&self, locale: &mut Locale) -> TransformResult {
        let mut result = TransformResult::Unmodified;

        // This loops until we get a 'fixed point', where applying the rules does not
        // result in any more changes.
        loop {
            // These are linear searches due to the ordering imposed by the canonicalization
            // rules, where rules with more variants should be considered first. With the
            // current data in CLDR, we will only do this for locales which have variants,
            // or new rules which we haven't special-cased yet (of which there are fewer
            // than 20).
            let modified = if locale.id.variants.is_empty() {
                self.canonicalize_absolute_language_fallbacks(&mut locale.id)
            } else {
                self.canonicalize_language_variant_fallbacks(&mut locale.id)
            };
            if modified {
                result = TransformResult::Modified;
                continue;
            }

            if !locale.id.language.is_empty() {
                // If the region is specified, check sgn-region rules first
                if let Some(region) = locale.id.region {
                    if locale.id.language == language!("sgn") {
                        if let Some(&sgn_lang) = self
                            .aliases
                            .get()
                            .sgn_region
                            .get(&region.into_tinystr().to_unvalidated())
                        {
                            uts35_replacement::<core::iter::Empty<&str>>(
                                &mut locale.id,
                                true,
                                false,
                                true,
                                None,
                                &sgn_lang.into(),
                            );
                            result = TransformResult::Modified;
                            continue;
                        }
                    }
                }

                if uts35_check_language_rules(&mut locale.id, &self.aliases)
                    == TransformResult::Modified
                {
                    result = TransformResult::Modified;
                    continue;
                }
            }

            if let Some(script) = locale.id.script {
                if let Some(&replacement) = self
                    .aliases
                    .get()
                    .script
                    .get(&script.into_tinystr().to_unvalidated())
                {
                    locale.id.script = Some(replacement);
                    result = TransformResult::Modified;
                    continue;
                }
            }

            if let Some(region) = locale.id.region {
                let replacement = if region.is_alphabetic() {
                    self.aliases
                        .get()
                        .region_alpha
                        .get(&region.into_tinystr().resize().to_unvalidated())
                } else {
                    self.aliases
                        .get()
                        .region_num
                        .get(&region.into_tinystr().to_unvalidated())
                };
                if let Some(&replacement) = replacement {
                    locale.id.region = Some(replacement);
                    result = TransformResult::Modified;
                    continue;
                }

                if let Some(regions) = self
                    .aliases
                    .get()
                    .complex_region
                    .get(&region.into_tinystr().to_unvalidated())
                {
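                    // A complex region rule lists several candidate regions (e.g. for a
                    // region that was split); likely-subtags maximization of the language
                    // and script picks among them, falling back to the first entry.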
                    // Skip if regions are empty
                    if let Some(default_region) = regions.get(0) {
                        let mut maximized = LanguageIdentifier {
                            language: locale.id.language,
                            script: locale.id.script,
                            region: None,
                            variants: Variants::default(),
                        };

                        locale.id.region = Some(
                            match (self.expander.maximize(&mut maximized), maximized.region) {
                                (TransformResult::Modified, Some(candidate))
                                    if regions.iter().any(|x| x == candidate) =>
                                {
                                    candidate
                                }
                                _ => default_region,
                            },
                        );
                        result = TransformResult::Modified;
                        continue;
                    }
                }
            }

            if !locale.id.variants.is_empty() {
                let mut modified = Vec::with_capacity(0);
                for (idx, &variant) in locale.id.variants.iter().enumerate() {
                    if let Some(&updated) = self
                        .aliases
                        .get()
                        .variant
                        .get(&variant.into_tinystr().to_unvalidated())
                    {
                        if modified.is_empty() {
                            modified = locale.id.variants.to_vec();
                        }
                        #[allow(clippy::indexing_slicing)]
                        let _ = core::mem::replace(&mut modified[idx], updated);
                    }
                }

                if !modified.is_empty() {
                    modified.sort();
                    modified.dedup();
                    locale.id.variants = Variants::from_vec_unchecked(modified);
                    result = TransformResult::Modified;
                    continue;
                }
            }

            // Nothing matched in this iteration, we're done.
            break;
        }

        if !locale.extensions.transform.is_empty() || !locale.extensions.unicode.is_empty() {
            self.canonicalize_extensions(&mut locale.extensions, &mut result);
        }
        result
    }

    fn canonicalize_extensions(&self, extensions: &mut Extensions, result: &mut TransformResult) {
        // Handle Locale extensions in their own loops, because these rules do not interact
        // with each other.
        if let Some(ref mut lang) = extensions.transform.lang {
            while uts35_check_language_rules(lang, &self.aliases) == TransformResult::Modified {
                *result = TransformResult::Modified;
            }
        }

        if !extensions.unicode.keywords.is_empty() {
            for key in [key!("rg"), key!("sd")] {
                if let Some(value) = extensions.unicode.keywords.get_mut(&key) {
                    if let &[only_value] = value.as_tinystr_slice() {
                        if let Some(modified_value) = self
                            .aliases
                            .get()
                            .subdivision
                            .get(&only_value.resize().to_unvalidated())
                        {
                            if let Ok(modified_value) = modified_value.parse() {
                                *value = modified_value;
                                *result = TransformResult::Modified;
                            }
                        }
                    }
                }
            }
        }
    }

    fn canonicalize_language_variant_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
        // These language/variant combinations have around 20 rules
        for LanguageStrStrPair(lang, raw_variants, raw_to) in self
            .aliases
            .get()
            .language_variants
            .iter()
            .map(zerofrom::ZeroFrom::zero_from)
        {
            let raw_variants = raw_variants.split('-');
            // if is_iter_sorted(raw_variants.clone()) { // can we sort at construction?
            if uts35_rule_matches(lid, lang, None, None, raw_variants.clone()) {
                if let Ok(to) = raw_to.parse() {
                    uts35_replacement(lid, !lang.is_empty(), false, false, Some(raw_variants), &to);
                    return true;
                }
            }
        }
        false
    }

    fn canonicalize_absolute_language_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
        for StrStrPair(raw_from, raw_to) in self
            .aliases
            .get()
            .language
            .iter()
            .map(zerofrom::ZeroFrom::zero_from)
        {
            if let Ok(from) = raw_from.parse::<LanguageIdentifier>() {
                if uts35_rule_matches(
                    lid,
                    from.language,
                    from.script,
                    from.region,
                    from.variants.iter().map(Variant::as_str),
                ) {
                    if let Ok(to) = raw_to.parse() {
                        uts35_replacement(
                            lid,
                            !from.language.is_empty(),
                            from.script.is_some(),
                            from.region.is_some(),
                            Some(from.variants.iter().map(Variant::as_str)),
                            &to,
                        );
                        return true;
                    }
                }
            }
        }
        false
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_uts35_rule_matches() {
        for (source, rule, result) in [
            ("ja", "und", true),
            ("und-heploc-hepburn", "und-hepburn", true),
            ("ja-heploc-hepburn", "und-hepburn", true),
            ("ja-hepburn", "und-hepburn-heploc", false),
        ] {
            let source = source.parse().unwrap();
            let rule = rule.parse::<LanguageIdentifier>().unwrap();
            assert_eq!(
                uts35_rule_matches(
                    &source,
                    rule.language,
                    rule.script,
                    rule.region,
                    rule.variants.iter().map(Variant::as_str),
                ),
                result,
                "{}",
                source
            );
        }
    }

    #[test]
    fn test_uts35_replacement() {
        for (locale, rule_0, rule_1, result) in [
            (
                "ja-Latn-fonipa-hepburn-heploc",
                "und-hepburn-heploc",
                "und-alalc97",
                "ja-Latn-alalc97-fonipa",
            ),
            ("sgn-DD", "und-DD", "und-DE", "sgn-DE"),
            ("sgn-DE", "sgn-DE", "gsg", "gsg"),
        ] {
            let mut locale: Locale = locale.parse().unwrap();
            let rule_0 = rule_0.parse::<LanguageIdentifier>().unwrap();
            let rule_1 = rule_1.parse().unwrap();
            let result = result.parse::<Locale>().unwrap();
            uts35_replacement(
                &mut locale.id,
                !rule_0.language.is_empty(),
                rule_0.script.is_some(),
                rule_0.region.is_some(),
                Some(rule_0.variants.iter().map(Variant::as_str)),
                &rule_1,
            );
            assert_eq!(result, locale);
        }
    }
}

#[cfg(feature = "serde")]
#[cfg(test)]
mod tests {
    use super::*;
    use icu_locid::locale;

    struct RejectByKeyProvider {
        keys: Vec<DataKey>,
    }

    impl AnyProvider for RejectByKeyProvider {
        fn load_any(&self, key: DataKey, _: DataRequest) -> Result<AnyResponse, DataError> {
            use alloc::borrow::Cow;

            println!("{:#?}", key);
            if self.keys.contains(&key) {
                return Err(DataErrorKind::MissingDataKey.with_str_context("rejected"));
            }

            let aliases_v2 = crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_ALIASES_V2;
            let l = crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_LIKELYSUBTAGS_L_V1;
            let ext = crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_LIKELYSUBTAGS_EXT_V1;
            let sr = crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_LIKELYSUBTAGS_SR_V1;

            let payload = if key.hashed() == AliasesV1Marker::KEY.hashed() {
                let aliases_v1 = AliasesV1 {
                    language_variants: zerovec::VarZeroVec::from(&[StrStrPair(
                        Cow::Borrowed("aa-saaho"),
                        Cow::Borrowed("ssy"),
                    )]),
                    ..Default::default()
                };
                DataPayload::<AliasesV1Marker>::from_owned(aliases_v1).wrap_into_any_payload()
            } else if key.hashed() == AliasesV2Marker::KEY.hashed() {
                DataPayload::<AliasesV2Marker>::from_static_ref(aliases_v2).wrap_into_any_payload()
            } else if key.hashed() == LikelySubtagsForLanguageV1Marker::KEY.hashed() {
                DataPayload::<LikelySubtagsForLanguageV1Marker>::from_static_ref(l)
                    .wrap_into_any_payload()
            } else if key.hashed() == LikelySubtagsExtendedV1Marker::KEY.hashed() {
                DataPayload::<LikelySubtagsExtendedV1Marker>::from_static_ref(ext)
                    .wrap_into_any_payload()
            } else if key.hashed() == LikelySubtagsForScriptRegionV1Marker::KEY.hashed() {
                DataPayload::<LikelySubtagsForScriptRegionV1Marker>::from_static_ref(sr)
                    .wrap_into_any_payload()
            } else {
                return Err(DataErrorKind::MissingDataKey.into_error());
            };

            Ok(AnyResponse {
                payload: Some(payload),
                metadata: Default::default(),
            })
        }
    }

    #[test]
    fn test_old_keys() {
        let provider = RejectByKeyProvider {
            keys: vec![AliasesV2Marker::KEY],
        };
        let lc = LocaleCanonicalizer::try_new_with_any_provider(&provider)
            .expect("should create with old keys");
        let mut locale = locale!("aa-saaho");
        assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
        assert_eq!(locale, locale!("ssy"));
    }

    #[test]
    fn test_new_keys() {
        let provider = RejectByKeyProvider {
            keys: vec![AliasesV1Marker::KEY],
        };
        let lc = LocaleCanonicalizer::try_new_with_any_provider(&provider)
            .expect("should create with new keys");
        let mut locale = locale!("aa-saaho");
        assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
        assert_eq!(locale, locale!("ssy"));
    }

    #[test]
    fn test_no_keys() {
        let provider = RejectByKeyProvider {
            keys: vec![AliasesV1Marker::KEY, AliasesV2Marker::KEY],
        };
        if LocaleCanonicalizer::try_new_with_any_provider(&provider).is_ok() {
            panic!("should not create: no data present")
        };
    }
}