| 1 | // This file is part of ICU4X. For terms of use, please see the file |
| 2 | // called LICENSE at the top level of the ICU4X source tree |
| 3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
| 4 | |
| 5 | use icu_locid::extensions::unicode::{key, Key}; |
| 6 | use icu_locid::subtags::Language; |
| 7 | use icu_locid::LanguageIdentifier; |
| 8 | use icu_provider::FallbackPriority; |
| 9 | |
| 10 | use super::*; |
| 11 | |
/// Unicode extension key of the subdivision keyword (`-u-sd`).
///
/// Normalization always retains this keyword, and each fallback chain removes it in a
/// dedicated step (backing up its value so that it can be restored later).
const SUBDIVISION_KEY: Key = key!("sd" );
| 13 | |
| 14 | impl<'a> LocaleFallbackerWithConfig<'a> { |
| 15 | pub(crate) fn normalize(&self, locale: &mut DataLocale) { |
| 16 | let language = locale.language(); |
| 17 | // 1. Populate the region (required for region fallback only) |
| 18 | if self.config.priority == FallbackPriority::Region && locale.region().is_none() { |
| 19 | // 1a. First look for region based on language+script |
| 20 | if let Some(script) = locale.script() { |
| 21 | locale.set_region( |
| 22 | self.likely_subtags |
| 23 | .ls2r |
| 24 | .get_2d( |
| 25 | &language.into_tinystr().to_unvalidated(), |
| 26 | &script.into_tinystr().to_unvalidated(), |
| 27 | ) |
| 28 | .copied(), |
| 29 | ); |
| 30 | } |
| 31 | // 1b. If that fails, try language only |
| 32 | if locale.region().is_none() { |
| 33 | locale.set_region( |
| 34 | self.likely_subtags |
| 35 | .l2r |
| 36 | .get(&language.into_tinystr().to_unvalidated()) |
| 37 | .copied(), |
| 38 | ); |
| 39 | } |
| 40 | } |
| 41 | // 2. Remove the script if it is implied by the other subtags |
| 42 | if let Some(script) = locale.script() { |
| 43 | let default_script = self |
| 44 | .likely_subtags |
| 45 | .l2s |
| 46 | .get_copied(&language.into_tinystr().to_unvalidated()) |
| 47 | .unwrap_or(DEFAULT_SCRIPT); |
| 48 | if let Some(region) = locale.region() { |
| 49 | if script |
| 50 | == self |
| 51 | .likely_subtags |
| 52 | .lr2s |
| 53 | .get_copied_2d( |
| 54 | &language.into_tinystr().to_unvalidated(), |
| 55 | ®ion.into_tinystr().to_unvalidated(), |
| 56 | ) |
| 57 | .unwrap_or(default_script) |
| 58 | { |
| 59 | locale.set_script(None); |
| 60 | } |
| 61 | } else if script == default_script { |
| 62 | locale.set_script(None); |
| 63 | } |
| 64 | } |
| 65 | // 3. Remove irrelevant extension subtags |
| 66 | locale.retain_unicode_ext(|key| { |
| 67 | match *key { |
| 68 | // Always retain -u-sd |
| 69 | SUBDIVISION_KEY => true, |
| 70 | // Retain the query-specific keyword |
| 71 | _ if Some(*key) == self.config.extension_key => true, |
| 72 | // Drop all others |
| 73 | _ => false, |
| 74 | } |
| 75 | }); |
| 76 | // 4. If there is an invalid "sd" subtag, drop it |
| 77 | // For now, ignore it, and let fallback do it for us |
| 78 | } |
| 79 | } |
| 80 | |
| 81 | impl<'a> LocaleFallbackIteratorInner<'a> { |
| 82 | pub fn step(&mut self, locale: &mut DataLocale) { |
| 83 | match self.config.priority { |
| 84 | FallbackPriority::Language => self.step_language(locale), |
| 85 | FallbackPriority::Region => self.step_region(locale), |
| 86 | // TODO(#1964): Change the collation fallback rules to be different |
| 87 | // from the language fallback fules. |
| 88 | FallbackPriority::Collation => self.step_language(locale), |
| 89 | // This case should not normally happen, but `FallbackPriority` is non_exhaustive. |
| 90 | // Make it go directly to `und`. |
| 91 | _ => { |
| 92 | debug_assert!( |
| 93 | false, |
| 94 | "Unknown FallbackPriority: {:?}" , |
| 95 | self.config.priority |
| 96 | ); |
| 97 | *locale = Default::default() |
| 98 | } |
| 99 | } |
| 100 | } |
| 101 | |
| 102 | fn step_language(&mut self, locale: &mut DataLocale) { |
| 103 | // 1. Remove the extension fallback keyword |
| 104 | if let Some(extension_key) = self.config.extension_key { |
| 105 | if let Some(value) = locale.remove_unicode_ext(&extension_key) { |
| 106 | self.backup_extension = Some(value); |
| 107 | return; |
| 108 | } |
| 109 | } |
| 110 | // 2. Remove the subdivision keyword |
| 111 | if let Some(value) = locale.remove_unicode_ext(&SUBDIVISION_KEY) { |
| 112 | self.backup_subdivision = Some(value); |
| 113 | return; |
| 114 | } |
| 115 | // 3. Assert that the locale is a language identifier |
| 116 | debug_assert!(!locale.has_unicode_ext()); |
| 117 | // 4. Remove variants |
| 118 | if locale.has_variants() { |
| 119 | self.backup_variants = Some(locale.clear_variants()); |
| 120 | return; |
| 121 | } |
| 122 | // 5. Check for parent override |
| 123 | if let Some(parent) = self.get_explicit_parent(locale) { |
| 124 | locale.set_langid(parent); |
| 125 | self.restore_extensions_variants(locale); |
| 126 | return; |
| 127 | } |
| 128 | // 6. Add the script subtag if necessary |
| 129 | if locale.script().is_none() { |
| 130 | if let Some(region) = locale.region() { |
| 131 | let language = locale.language(); |
| 132 | if let Some(script) = self.likely_subtags.lr2s.get_copied_2d( |
| 133 | &language.into_tinystr().to_unvalidated(), |
| 134 | ®ion.into_tinystr().to_unvalidated(), |
| 135 | ) { |
| 136 | locale.set_script(Some(script)); |
| 137 | self.restore_extensions_variants(locale); |
| 138 | return; |
| 139 | } |
| 140 | } |
| 141 | } |
| 142 | // 7. Remove region |
| 143 | if locale.region().is_some() { |
| 144 | locale.set_region(None); |
| 145 | self.restore_extensions_variants(locale); |
| 146 | return; |
| 147 | } |
| 148 | // 8. Remove language+script |
| 149 | debug_assert!(!locale.language().is_empty()); // don't call .step() on und |
| 150 | locale.set_script(None); |
| 151 | locale.set_language(Language::UND); |
| 152 | } |
| 153 | |
| 154 | fn step_region(&mut self, locale: &mut DataLocale) { |
| 155 | // 1. Remove the extension fallback keyword |
| 156 | if let Some(extension_key) = self.config.extension_key { |
| 157 | if let Some(value) = locale.remove_unicode_ext(&extension_key) { |
| 158 | self.backup_extension = Some(value); |
| 159 | return; |
| 160 | } |
| 161 | } |
| 162 | // 2. Remove the subdivision keyword |
| 163 | if let Some(value) = locale.remove_unicode_ext(&SUBDIVISION_KEY) { |
| 164 | self.backup_subdivision = Some(value); |
| 165 | return; |
| 166 | } |
| 167 | // 3. Assert that the locale is a language identifier |
| 168 | debug_assert!(!locale.has_unicode_ext()); |
| 169 | // 4. Remove variants |
| 170 | if locale.has_variants() { |
| 171 | self.backup_variants = Some(locale.clear_variants()); |
| 172 | return; |
| 173 | } |
| 174 | // 5. Remove language+script |
| 175 | if !locale.language().is_empty() || locale.script().is_some() { |
| 176 | locale.set_script(None); |
| 177 | locale.set_language(Language::UND); |
| 178 | self.restore_extensions_variants(locale); |
| 179 | return; |
| 180 | } |
| 181 | // 6. Remove region |
| 182 | debug_assert!(locale.region().is_some()); // don't call .step() on und |
| 183 | locale.set_region(None); |
| 184 | } |
| 185 | |
| 186 | fn restore_extensions_variants(&mut self, locale: &mut DataLocale) { |
| 187 | if let Some(value) = self.backup_extension.take() { |
| 188 | #[allow (clippy::unwrap_used)] // not reachable unless extension_key is present |
| 189 | locale.set_unicode_ext(self.config.extension_key.unwrap(), value); |
| 190 | } |
| 191 | if let Some(value) = self.backup_subdivision.take() { |
| 192 | locale.set_unicode_ext(SUBDIVISION_KEY, value); |
| 193 | } |
| 194 | if let Some(variants) = self.backup_variants.take() { |
| 195 | locale.set_variants(variants); |
| 196 | } |
| 197 | } |
| 198 | |
| 199 | fn get_explicit_parent(&self, locale: &DataLocale) -> Option<LanguageIdentifier> { |
| 200 | self.supplement |
| 201 | .and_then(|supplement| { |
| 202 | supplement |
| 203 | .parents |
| 204 | .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse()) |
| 205 | }) |
| 206 | .or_else(|| { |
| 207 | self.parents |
| 208 | .parents |
| 209 | .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse()) |
| 210 | }) |
| 211 | .map(LanguageIdentifier::from) |
| 212 | } |
| 213 | } |
| 214 | |
#[cfg(test)]
mod tests {
    use super::*;
    use writeable::Writeable;

    /// Fallback walks through Unicode extension keywords, while [auxiliary keys] stay attached
    /// to the locale at every step.
    ///
    /// [auxiliary keys]: icu_provider::AuxiliaryKeys
    #[test]
    fn test_aux_key_fallback() {
        let mut chain = LocaleFallbacker::new()
            .for_config(Default::default())
            .fallback_for("en-US-u-sd-usca-x-aux".parse().unwrap());

        // Every entry before the final one is checked and then stepped past.
        for expected in [
            "en-US-u-sd-usca-x-aux",
            "en-US-x-aux",
            "en-u-sd-usca-x-aux",
            "en-x-aux",
        ] {
            assert_eq!(chain.get().to_string(), expected);
            chain.step();
        }
        // The chain ends at `und`, still carrying the auxiliary key.
        assert_eq!(chain.get().to_string(), "und-x-aux");
        assert!(chain.get().is_und());
    }

    /// One input locale together with the fallback chains expected for it.
    struct TestCase {
        /// Locale string that is parsed and fed to the fallbacker.
        input: &'static str,
        /// Selects the fallbacker built with `new()` instead of `new_without_data()`.
        requires_data: bool,
        /// Value for the config's `extension_key`.
        extension_key: Option<Key>,
        /// Value for the config's `fallback_supplement`.
        fallback_supplement: Option<LocaleFallbackSupplement>,
        // Note: The first entry in the chain is the normalized locale
        /// Expected chain under language priority; the trailing `und` is implied.
        expected_language_chain: &'static [&'static str],
        /// Expected chain under region priority; the trailing `und` is implied.
        expected_region_chain: &'static [&'static str],
    }

    // TODO: Consider loading these from a JSON file
    const TEST_CASES: &[TestCase] = &[
        TestCase {
            input: "en-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-u-sd-usca", "en", "und-u-sd-usca"],
        },
        TestCase {
            input: "en-US-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-US-fonipa-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: Some(key!("hc")),
            fallback_supplement: None,
            expected_language_chain: &[
                "en-US-fonipa-u-hc-h12-sd-usca",
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "en-fonipa-u-hc-h12-sd-usca",
                "en-fonipa-u-sd-usca",
                "en-fonipa",
                "en",
            ],
            expected_region_chain: &[
                "en-US-fonipa-u-hc-h12-sd-usca",
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "und-US-fonipa-u-hc-h12-sd-usca",
                "und-US-fonipa-u-sd-usca",
                "und-US-fonipa",
                "und-US",
            ],
        },
        TestCase {
            input: "en-u-hc-h12-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-Latn-u-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-Latn-US-u-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            // TODO(#4413): -u-rg is not yet supported; when it is, this test should be updated
            input: "en-u-rg-gbxxxx",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en"],
            expected_region_chain: &["en"],
        },
        TestCase {
            input: "sr-ME",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-ME", "sr-Latn-ME", "sr-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-Latn-ME",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-ME", "sr-Latn-ME", "sr-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-ME-fonipa",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &[
                "sr-ME-fonipa",
                "sr-ME",
                "sr-Latn-ME-fonipa",
                "sr-Latn-ME",
                "sr-Latn-fonipa",
                "sr-Latn",
            ],
            expected_region_chain: &["sr-ME-fonipa", "sr-ME", "und-ME-fonipa", "und-ME"],
        },
        TestCase {
            input: "sr-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-RS", "sr"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Cyrl-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-RS", "sr"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Latn-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-Latn-RS", "sr-Latn"],
            expected_region_chain: &["sr-Latn-RS", "und-RS"],
        },
        TestCase {
            input: "de-Latn-LI",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["de-LI", "de"],
            expected_region_chain: &["de-LI", "und-LI"],
        },
        TestCase {
            input: "ca-ES-valencia",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["ca-ES-valencia", "ca-ES", "ca-valencia", "ca"],
            expected_region_chain: &["ca-ES-valencia", "ca-ES", "und-ES-valencia", "und-ES"],
        },
        TestCase {
            input: "es-AR",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["es-AR", "es-419", "es"],
            expected_region_chain: &["es-AR", "und-AR"],
        },
        TestCase {
            input: "hi-IN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["hi-IN", "hi"],
            expected_region_chain: &["hi-IN", "und-IN"],
        },
        TestCase {
            input: "hi-Latn-IN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["hi-Latn-IN", "hi-Latn", "en-IN", "en-001", "en"],
            expected_region_chain: &["hi-Latn-IN", "und-IN"],
        },
        TestCase {
            input: "zh-CN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            // Note: "zh-Hans" is not reachable because it is the default script for "zh".
            // The fallback algorithm does not visit the language-script bundle when the
            // script is the default for the language
            expected_language_chain: &["zh-CN", "zh"],
            expected_region_chain: &["zh-CN", "und-CN"],
        },
        TestCase {
            input: "zh-TW",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["zh-TW", "zh-Hant-TW", "zh-Hant"],
            expected_region_chain: &["zh-TW", "und-TW"],
        },
        TestCase {
            input: "yue-HK",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["yue-HK", "yue"],
            expected_region_chain: &["yue-HK", "und-HK"],
        },
        TestCase {
            input: "yue-HK",
            requires_data: true,
            extension_key: None,
            fallback_supplement: Some(LocaleFallbackSupplement::Collation),
            expected_language_chain: &["yue-HK", "yue", "zh-Hant", "zh"],
            expected_region_chain: &["yue-HK", "und-HK"],
        },
    ];

    /// Runs every entry of `TEST_CASES` under both language and region priority and checks
    /// that the iterator yields exactly the expected chain followed by `und`.
    #[test]
    fn test_fallback() {
        let no_data_owner = LocaleFallbacker::new_without_data();
        let without_data = no_data_owner.as_borrowed();
        let with_data = LocaleFallbacker::new();
        for case in TEST_CASES.iter() {
            let chains_by_priority = [
                (
                    LocaleFallbackPriority::Language,
                    case.expected_language_chain,
                ),
                (LocaleFallbackPriority::Region, case.expected_region_chain),
            ];
            for (priority, expected_chain) in chains_by_priority {
                // The config is filled in field by field, starting from the default value.
                let mut config = LocaleFallbackConfig::default();
                config.priority = priority;
                config.extension_key = case.extension_key;
                config.fallback_supplement = case.fallback_supplement;

                let fallbacker = if case.requires_data {
                    with_data
                } else {
                    without_data
                };
                let mut chain = fallbacker
                    .for_config(config)
                    .fallback_for(case.input.parse().unwrap());

                for &expected in expected_chain {
                    let actual = chain.get().write_to_string();
                    assert_eq!(expected, &*actual, "{:?} ({:?})", case.input, priority);
                    chain.step();
                }
                // After the listed entries are exhausted, the iterator must have reached `und`.
                let terminal = chain.get().write_to_string();
                assert_eq!("und", &*terminal, "{:?} ({:?})", case.input, priority);
            }
        }
    }
}
| 509 | |