1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | use icu_locid::extensions::unicode::{key, Key}; |
6 | use icu_locid::subtags::Language; |
7 | use icu_locid::LanguageIdentifier; |
8 | use icu_provider::FallbackPriority; |
9 | |
10 | use super::*; |
11 | |
/// The `sd` Unicode extension key (`-u-sd`, referred to in this file as the subdivision
/// keyword). `normalize` always retains it, and the fallback steps remove it and later
/// restore it from `backup_subdivision`.
12 | const SUBDIVISION_KEY: Key = key!("sd" ); |
13 | |
14 | impl<'a> LocaleFallbackerWithConfig<'a> { |
15 | pub(crate) fn normalize(&self, locale: &mut DataLocale) { |
16 | let language = locale.language(); |
17 | // 1. Populate the region (required for region fallback only) |
18 | if self.config.priority == FallbackPriority::Region && locale.region().is_none() { |
19 | // 1a. First look for region based on language+script |
20 | if let Some(script) = locale.script() { |
21 | locale.set_region( |
22 | self.likely_subtags |
23 | .ls2r |
24 | .get_2d( |
25 | &language.into_tinystr().to_unvalidated(), |
26 | &script.into_tinystr().to_unvalidated(), |
27 | ) |
28 | .copied(), |
29 | ); |
30 | } |
31 | // 1b. If that fails, try language only |
32 | if locale.region().is_none() { |
33 | locale.set_region( |
34 | self.likely_subtags |
35 | .l2r |
36 | .get(&language.into_tinystr().to_unvalidated()) |
37 | .copied(), |
38 | ); |
39 | } |
40 | } |
41 | // 2. Remove the script if it is implied by the other subtags |
42 | if let Some(script) = locale.script() { |
43 | let default_script = self |
44 | .likely_subtags |
45 | .l2s |
46 | .get_copied(&language.into_tinystr().to_unvalidated()) |
47 | .unwrap_or(DEFAULT_SCRIPT); |
48 | if let Some(region) = locale.region() { |
49 | if script |
50 | == self |
51 | .likely_subtags |
52 | .lr2s |
53 | .get_copied_2d( |
54 | &language.into_tinystr().to_unvalidated(), |
55 | ®ion.into_tinystr().to_unvalidated(), |
56 | ) |
57 | .unwrap_or(default_script) |
58 | { |
59 | locale.set_script(None); |
60 | } |
61 | } else if script == default_script { |
62 | locale.set_script(None); |
63 | } |
64 | } |
65 | // 3. Remove irrelevant extension subtags |
66 | locale.retain_unicode_ext(|key| { |
67 | match *key { |
68 | // Always retain -u-sd |
69 | SUBDIVISION_KEY => true, |
70 | // Retain the query-specific keyword |
71 | _ if Some(*key) == self.config.extension_key => true, |
72 | // Drop all others |
73 | _ => false, |
74 | } |
75 | }); |
76 | // 4. If there is an invalid "sd" subtag, drop it |
77 | // For now, ignore it, and let fallback do it for us |
78 | } |
79 | } |
80 | |
81 | impl<'a> LocaleFallbackIteratorInner<'a> { |
82 | pub fn step(&mut self, locale: &mut DataLocale) { |
83 | match self.config.priority { |
84 | FallbackPriority::Language => self.step_language(locale), |
85 | FallbackPriority::Region => self.step_region(locale), |
86 | // TODO(#1964): Change the collation fallback rules to be different |
87 | // from the language fallback fules. |
88 | FallbackPriority::Collation => self.step_language(locale), |
89 | // This case should not normally happen, but `FallbackPriority` is non_exhaustive. |
90 | // Make it go directly to `und`. |
91 | _ => { |
92 | debug_assert!( |
93 | false, |
94 | "Unknown FallbackPriority: {:?}" , |
95 | self.config.priority |
96 | ); |
97 | *locale = Default::default() |
98 | } |
99 | } |
100 | } |
101 | |
102 | fn step_language(&mut self, locale: &mut DataLocale) { |
103 | // 1. Remove the extension fallback keyword |
104 | if let Some(extension_key) = self.config.extension_key { |
105 | if let Some(value) = locale.remove_unicode_ext(&extension_key) { |
106 | self.backup_extension = Some(value); |
107 | return; |
108 | } |
109 | } |
110 | // 2. Remove the subdivision keyword |
111 | if let Some(value) = locale.remove_unicode_ext(&SUBDIVISION_KEY) { |
112 | self.backup_subdivision = Some(value); |
113 | return; |
114 | } |
115 | // 3. Assert that the locale is a language identifier |
116 | debug_assert!(!locale.has_unicode_ext()); |
117 | // 4. Remove variants |
118 | if locale.has_variants() { |
119 | self.backup_variants = Some(locale.clear_variants()); |
120 | return; |
121 | } |
122 | // 5. Check for parent override |
123 | if let Some(parent) = self.get_explicit_parent(locale) { |
124 | locale.set_langid(parent); |
125 | self.restore_extensions_variants(locale); |
126 | return; |
127 | } |
128 | // 6. Add the script subtag if necessary |
129 | if locale.script().is_none() { |
130 | if let Some(region) = locale.region() { |
131 | let language = locale.language(); |
132 | if let Some(script) = self.likely_subtags.lr2s.get_copied_2d( |
133 | &language.into_tinystr().to_unvalidated(), |
134 | ®ion.into_tinystr().to_unvalidated(), |
135 | ) { |
136 | locale.set_script(Some(script)); |
137 | self.restore_extensions_variants(locale); |
138 | return; |
139 | } |
140 | } |
141 | } |
142 | // 7. Remove region |
143 | if locale.region().is_some() { |
144 | locale.set_region(None); |
145 | self.restore_extensions_variants(locale); |
146 | return; |
147 | } |
148 | // 8. Remove language+script |
149 | debug_assert!(!locale.language().is_empty()); // don't call .step() on und |
150 | locale.set_script(None); |
151 | locale.set_language(Language::UND); |
152 | } |
153 | |
154 | fn step_region(&mut self, locale: &mut DataLocale) { |
155 | // 1. Remove the extension fallback keyword |
156 | if let Some(extension_key) = self.config.extension_key { |
157 | if let Some(value) = locale.remove_unicode_ext(&extension_key) { |
158 | self.backup_extension = Some(value); |
159 | return; |
160 | } |
161 | } |
162 | // 2. Remove the subdivision keyword |
163 | if let Some(value) = locale.remove_unicode_ext(&SUBDIVISION_KEY) { |
164 | self.backup_subdivision = Some(value); |
165 | return; |
166 | } |
167 | // 3. Assert that the locale is a language identifier |
168 | debug_assert!(!locale.has_unicode_ext()); |
169 | // 4. Remove variants |
170 | if locale.has_variants() { |
171 | self.backup_variants = Some(locale.clear_variants()); |
172 | return; |
173 | } |
174 | // 5. Remove language+script |
175 | if !locale.language().is_empty() || locale.script().is_some() { |
176 | locale.set_script(None); |
177 | locale.set_language(Language::UND); |
178 | self.restore_extensions_variants(locale); |
179 | return; |
180 | } |
181 | // 6. Remove region |
182 | debug_assert!(locale.region().is_some()); // don't call .step() on und |
183 | locale.set_region(None); |
184 | } |
185 | |
186 | fn restore_extensions_variants(&mut self, locale: &mut DataLocale) { |
187 | if let Some(value) = self.backup_extension.take() { |
188 | #[allow (clippy::unwrap_used)] // not reachable unless extension_key is present |
189 | locale.set_unicode_ext(self.config.extension_key.unwrap(), value); |
190 | } |
191 | if let Some(value) = self.backup_subdivision.take() { |
192 | locale.set_unicode_ext(SUBDIVISION_KEY, value); |
193 | } |
194 | if let Some(variants) = self.backup_variants.take() { |
195 | locale.set_variants(variants); |
196 | } |
197 | } |
198 | |
199 | fn get_explicit_parent(&self, locale: &DataLocale) -> Option<LanguageIdentifier> { |
200 | self.supplement |
201 | .and_then(|supplement| { |
202 | supplement |
203 | .parents |
204 | .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse()) |
205 | }) |
206 | .or_else(|| { |
207 | self.parents |
208 | .parents |
209 | .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse()) |
210 | }) |
211 | .map(LanguageIdentifier::from) |
212 | } |
213 | } |
214 | |
#[cfg(test)]
mod tests {
    use super::*;
    use writeable::Writeable;

    /// Unicode extension keywords take part in fallback, but [auxiliary keys] are not modified.
    ///
    /// [auxiliary keys]: icu_provider::AuxiliaryKeys
    #[test]
    fn test_aux_key_fallback() {
        let fallbacker = LocaleFallbacker::new();
        let mut fallback_iterator = fallbacker
            .for_config(Default::default())
            .fallback_for("en-US-u-sd-usca-x-aux".parse().unwrap());

        // Every locale visited before the end of the chain; the `-x-aux` suffix must
        // survive each step.
        let intermediate_locales = [
            "en-US-u-sd-usca-x-aux",
            "en-US-x-aux",
            "en-u-sd-usca-x-aux",
            "en-x-aux",
        ];
        for expected in intermediate_locales {
            assert_eq!(fallback_iterator.get().to_string(), expected);
            fallback_iterator.step();
        }

        // The end of the chain still carries the auxiliary key but counts as `und`.
        assert_eq!(fallback_iterator.get().to_string(), "und-x-aux");
        assert!(fallback_iterator.get().is_und());
    }

    /// One table-driven scenario for `test_fallback`.
    struct TestCase {
        /// Locale string that is parsed and handed to the fallbacker.
        input: &'static str,
        /// `true` to run against `LocaleFallbacker::new()`, `false` to run against
        /// `LocaleFallbacker::new_without_data()`.
        requires_data: bool,
        /// Value for `LocaleFallbackConfig::extension_key`.
        extension_key: Option<Key>,
        /// Value for `LocaleFallbackConfig::fallback_supplement`.
        fallback_supplement: Option<LocaleFallbackSupplement>,
        // Note: The first entry in the chain is the normalized locale
        /// Locales expected, in order, under language priority (the final `und` is implicit).
        expected_language_chain: &'static [&'static str],
        /// Locales expected, in order, under region priority (the final `und` is implicit).
        expected_region_chain: &'static [&'static str],
    }

    // TODO: Consider loading these from a JSON file
    /// Scenarios exercised by `test_fallback`.
    const TEST_CASES: &[TestCase] = &[
        TestCase {
            input: "en-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-u-sd-usca", "en", "und-u-sd-usca"],
        },
        TestCase {
            input: "en-US-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-US-fonipa-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: Some(key!("hc")),
            fallback_supplement: None,
            expected_language_chain: &[
                "en-US-fonipa-u-hc-h12-sd-usca",
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "en-fonipa-u-hc-h12-sd-usca",
                "en-fonipa-u-sd-usca",
                "en-fonipa",
                "en",
            ],
            expected_region_chain: &[
                "en-US-fonipa-u-hc-h12-sd-usca",
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "und-US-fonipa-u-hc-h12-sd-usca",
                "und-US-fonipa-u-sd-usca",
                "und-US-fonipa",
                "und-US",
            ],
        },
        TestCase {
            input: "en-u-hc-h12-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-Latn-u-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-Latn-US-u-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            // TODO(#4413): -u-rg is not yet supported; when it is, this test should be updated
            input: "en-u-rg-gbxxxx",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en"],
            expected_region_chain: &["en"],
        },
        TestCase {
            input: "sr-ME",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-ME", "sr-Latn-ME", "sr-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-Latn-ME",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-ME", "sr-Latn-ME", "sr-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-ME-fonipa",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &[
                "sr-ME-fonipa",
                "sr-ME",
                "sr-Latn-ME-fonipa",
                "sr-Latn-ME",
                "sr-Latn-fonipa",
                "sr-Latn",
            ],
            expected_region_chain: &["sr-ME-fonipa", "sr-ME", "und-ME-fonipa", "und-ME"],
        },
        TestCase {
            input: "sr-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-RS", "sr"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Cyrl-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-RS", "sr"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Latn-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-Latn-RS", "sr-Latn"],
            expected_region_chain: &["sr-Latn-RS", "und-RS"],
        },
        TestCase {
            input: "de-Latn-LI",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["de-LI", "de"],
            expected_region_chain: &["de-LI", "und-LI"],
        },
        TestCase {
            input: "ca-ES-valencia",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["ca-ES-valencia", "ca-ES", "ca-valencia", "ca"],
            expected_region_chain: &["ca-ES-valencia", "ca-ES", "und-ES-valencia", "und-ES"],
        },
        TestCase {
            input: "es-AR",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["es-AR", "es-419", "es"],
            expected_region_chain: &["es-AR", "und-AR"],
        },
        TestCase {
            input: "hi-IN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["hi-IN", "hi"],
            expected_region_chain: &["hi-IN", "und-IN"],
        },
        TestCase {
            input: "hi-Latn-IN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["hi-Latn-IN", "hi-Latn", "en-IN", "en-001", "en"],
            expected_region_chain: &["hi-Latn-IN", "und-IN"],
        },
        TestCase {
            input: "zh-CN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            // Note: "zh-Hans" is not reachable because it is the default script for "zh".
            // The fallback algorithm does not visit the language-script bundle when the
            // script is the default for the language
            expected_language_chain: &["zh-CN", "zh"],
            expected_region_chain: &["zh-CN", "und-CN"],
        },
        TestCase {
            input: "zh-TW",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["zh-TW", "zh-Hant-TW", "zh-Hant"],
            expected_region_chain: &["zh-TW", "und-TW"],
        },
        TestCase {
            input: "yue-HK",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["yue-HK", "yue"],
            expected_region_chain: &["yue-HK", "und-HK"],
        },
        TestCase {
            input: "yue-HK",
            requires_data: true,
            extension_key: None,
            fallback_supplement: Some(LocaleFallbackSupplement::Collation),
            expected_language_chain: &["yue-HK", "yue", "zh-Hant", "zh"],
            expected_region_chain: &["yue-HK", "und-HK"],
        },
    ];

    /// Runs every entry of `TEST_CASES` under both language and region priority and
    /// checks that the iterator visits exactly the expected chain and then lands on `und`.
    #[test]
    fn test_fallback() {
        let fallbacker_no_data = LocaleFallbacker::new_without_data();
        let fallbacker_no_data = fallbacker_no_data.as_borrowed();
        let fallbacker_with_data = LocaleFallbacker::new();
        for cas in TEST_CASES {
            let fallbacker = if cas.requires_data {
                fallbacker_with_data
            } else {
                fallbacker_no_data
            };
            let chains_by_priority = [
                (
                    LocaleFallbackPriority::Language,
                    cas.expected_language_chain,
                ),
                (LocaleFallbackPriority::Region, cas.expected_region_chain),
            ];
            for (priority, expected_chain) in chains_by_priority {
                let mut config = LocaleFallbackConfig::default();
                config.priority = priority;
                config.extension_key = cas.extension_key;
                config.fallback_supplement = cas.fallback_supplement;

                let mut it = fallbacker
                    .for_config(config)
                    .fallback_for(cas.input.parse().unwrap());

                // The iterator starts on the normalized locale, so check first, then step.
                for &expected in expected_chain {
                    assert_eq!(
                        expected,
                        &*it.get().write_to_string(),
                        "{:?} ({:?})",
                        cas.input,
                        priority
                    );
                    it.step();
                }

                // After the last expected entry the chain must have reached the root.
                assert_eq!(
                    "und",
                    &*it.get().write_to_string(),
                    "{:?} ({:?})",
                    cas.input,
                    priority
                );
            }
        }
    }
}
509 | |