1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | use icu_locid::extensions::unicode::{key, Key}; |
6 | use icu_locid::subtags::Language; |
7 | use icu_locid::LanguageIdentifier; |
8 | use icu_provider::FallbackPriority; |
9 | |
10 | use super::*; |
11 | |
12 | const SUBDIVISION_KEY: Key = key!("sd" ); |
13 | |
14 | impl<'a> LocaleFallbackerWithConfig<'a> { |
15 | pub(crate) fn normalize(&self, locale: &mut DataLocale) { |
16 | let language = locale.language(); |
17 | // 1. Populate the region (required for region fallback only) |
18 | if self.config.priority == FallbackPriority::Region && locale.region().is_none() { |
19 | // 1a. First look for region based on language+script |
20 | if let Some(script) = locale.script() { |
21 | locale.set_region( |
22 | self.likely_subtags |
23 | .ls2r |
24 | .get_2d( |
25 | &language.into_tinystr().to_unvalidated(), |
26 | &script.into_tinystr().to_unvalidated(), |
27 | ) |
28 | .copied(), |
29 | ); |
30 | } |
31 | // 1b. If that fails, try language only |
32 | if locale.region().is_none() { |
33 | locale.set_region( |
34 | self.likely_subtags |
35 | .l2r |
36 | .get(&language.into_tinystr().to_unvalidated()) |
37 | .copied(), |
38 | ); |
39 | } |
40 | } |
41 | // 2. Remove the script if it is implied by the other subtags |
42 | if let Some(script) = locale.script() { |
43 | let default_script = self |
44 | .likely_subtags |
45 | .l2s |
46 | .get_copied(&language.into_tinystr().to_unvalidated()) |
47 | .unwrap_or(DEFAULT_SCRIPT); |
48 | if let Some(region) = locale.region() { |
49 | if script |
50 | == self |
51 | .likely_subtags |
52 | .lr2s |
53 | .get_copied_2d( |
54 | &language.into_tinystr().to_unvalidated(), |
55 | ®ion.into_tinystr().to_unvalidated(), |
56 | ) |
57 | .unwrap_or(default_script) |
58 | { |
59 | locale.set_script(None); |
60 | } |
61 | } else if script == default_script { |
62 | locale.set_script(None); |
63 | } |
64 | } |
65 | // 3. Remove irrelevant extension subtags |
66 | locale.retain_unicode_ext(|key| { |
67 | match *key { |
68 | // Always retain -u-sd |
69 | SUBDIVISION_KEY => true, |
70 | // Retain the query-specific keyword |
71 | _ if Some(*key) == self.config.extension_key => true, |
72 | // Drop all others |
73 | _ => false, |
74 | } |
75 | }); |
76 | // 4. If there is an invalid "sd" subtag, drop it |
77 | // For now, ignore it, and let fallback do it for us |
78 | } |
79 | } |
80 | |
81 | impl<'a> LocaleFallbackIteratorInner<'a> { |
82 | pub fn step(&mut self, locale: &mut DataLocale) { |
83 | match self.config.priority { |
84 | FallbackPriority::Language => self.step_language(locale), |
85 | FallbackPriority::Region => self.step_region(locale), |
86 | // TODO(#1964): Change the collation fallback rules to be different |
87 | // from the language fallback fules. |
88 | FallbackPriority::Collation => self.step_language(locale), |
89 | // This case should not normally happen, but `FallbackPriority` is non_exhaustive. |
90 | // Make it go directly to `und`. |
91 | _ => { |
92 | debug_assert!( |
93 | false, |
94 | "Unknown FallbackPriority: {:?}" , |
95 | self.config.priority |
96 | ); |
97 | *locale = Default::default() |
98 | } |
99 | } |
100 | } |
101 | |
102 | fn step_language(&mut self, locale: &mut DataLocale) { |
103 | // 1. Remove the extension fallback keyword |
104 | if let Some(extension_key) = self.config.extension_key { |
105 | if let Some(value) = locale.remove_unicode_ext(&extension_key) { |
106 | self.backup_extension = Some(value); |
107 | return; |
108 | } |
109 | } |
110 | // 2. Remove the subdivision keyword |
111 | if let Some(value) = locale.remove_unicode_ext(&SUBDIVISION_KEY) { |
112 | self.backup_subdivision = Some(value); |
113 | return; |
114 | } |
115 | // 3. Assert that the locale is a language identifier |
116 | debug_assert!(!locale.has_unicode_ext()); |
117 | // 4. Remove variants |
118 | if locale.has_variants() { |
119 | self.backup_variants = Some(locale.clear_variants()); |
120 | return; |
121 | } |
122 | // 5. Check for parent override |
123 | if let Some(parent) = self.get_explicit_parent(locale) { |
124 | locale.set_langid(parent); |
125 | self.restore_extensions_variants(locale); |
126 | return; |
127 | } |
128 | // 6. Add the script subtag if necessary |
129 | if locale.script().is_none() { |
130 | if let Some(region) = locale.region() { |
131 | let language = locale.language(); |
132 | if let Some(script) = self.likely_subtags.lr2s.get_copied_2d( |
133 | &language.into_tinystr().to_unvalidated(), |
134 | ®ion.into_tinystr().to_unvalidated(), |
135 | ) { |
136 | locale.set_script(Some(script)); |
137 | self.restore_extensions_variants(locale); |
138 | return; |
139 | } |
140 | } |
141 | } |
142 | // 7. Remove region |
143 | if locale.region().is_some() { |
144 | locale.set_region(None); |
145 | self.restore_extensions_variants(locale); |
146 | return; |
147 | } |
148 | // 8. Remove language+script |
149 | debug_assert!(!locale.language().is_empty()); // don't call .step() on und |
150 | locale.set_script(None); |
151 | locale.set_language(Language::UND); |
152 | } |
153 | |
154 | fn step_region(&mut self, locale: &mut DataLocale) { |
155 | // 1. Remove the extension fallback keyword |
156 | if let Some(extension_key) = self.config.extension_key { |
157 | if let Some(value) = locale.remove_unicode_ext(&extension_key) { |
158 | self.backup_extension = Some(value); |
159 | return; |
160 | } |
161 | } |
162 | // 2. Remove the subdivision keyword |
163 | if let Some(value) = locale.remove_unicode_ext(&SUBDIVISION_KEY) { |
164 | self.backup_subdivision = Some(value); |
165 | return; |
166 | } |
167 | // 3. Assert that the locale is a language identifier |
168 | debug_assert!(!locale.has_unicode_ext()); |
169 | // 4. Remove variants |
170 | if locale.has_variants() { |
171 | self.backup_variants = Some(locale.clear_variants()); |
172 | return; |
173 | } |
174 | // 5. Remove language+script |
175 | if !locale.language().is_empty() || locale.script().is_some() { |
176 | locale.set_script(None); |
177 | locale.set_language(Language::UND); |
178 | self.restore_extensions_variants(locale); |
179 | return; |
180 | } |
181 | // 6. Remove region |
182 | debug_assert!(locale.region().is_some()); // don't call .step() on und |
183 | locale.set_region(None); |
184 | } |
185 | |
186 | fn restore_extensions_variants(&mut self, locale: &mut DataLocale) { |
187 | if let Some(value) = self.backup_extension.take() { |
188 | #[allow (clippy::unwrap_used)] // not reachable unless extension_key is present |
189 | locale.set_unicode_ext(self.config.extension_key.unwrap(), value); |
190 | } |
191 | if let Some(value) = self.backup_subdivision.take() { |
192 | locale.set_unicode_ext(SUBDIVISION_KEY, value); |
193 | } |
194 | if let Some(variants) = self.backup_variants.take() { |
195 | locale.set_variants(variants); |
196 | } |
197 | } |
198 | |
199 | fn get_explicit_parent(&self, locale: &DataLocale) -> Option<LanguageIdentifier> { |
200 | self.supplement |
201 | .and_then(|supplement| { |
202 | supplement |
203 | .parents |
204 | .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse()) |
205 | }) |
206 | .or_else(|| { |
207 | self.parents |
208 | .parents |
209 | .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse()) |
210 | }) |
211 | .map(LanguageIdentifier::from) |
212 | } |
213 | } |
214 | |
#[cfg(test)]
mod tests {
    use super::*;
    use icu_locid::Locale;
    use std::str::FromStr;
    use writeable::Writeable;

    /// One fallback scenario: an input locale, the configuration to run it
    /// with, and the chains expected under language and region priority.
    struct TestCase {
        /// Locale string handed to the fallbacker.
        input: &'static str,
        /// Whether the fallbacker with data is used instead of the data-less one.
        requires_data: bool,
        /// Value for `LocaleFallbackConfig::extension_key`.
        extension_key: Option<Key>,
        /// Value for `LocaleFallbackConfig::fallback_supplement`.
        fallback_supplement: Option<LocaleFallbackSupplement>,
        // Note: The first entry in the chain is the normalized locale
        expected_language_chain: &'static [&'static str],
        expected_region_chain: &'static [&'static str],
    }

    // TODO: Consider loading these from a JSON file
    const TEST_CASES: &[TestCase] = &[
        TestCase {
            input: "en-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-u-sd-usca", "en", "und-u-sd-usca"],
        },
        TestCase {
            input: "en-US-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-US-fonipa-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: Some(key!("hc")),
            fallback_supplement: None,
            expected_language_chain: &[
                "en-US-fonipa-u-hc-h12-sd-usca",
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "en-fonipa-u-hc-h12-sd-usca",
                "en-fonipa-u-sd-usca",
                "en-fonipa",
                "en",
            ],
            expected_region_chain: &[
                "en-US-fonipa-u-hc-h12-sd-usca",
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "und-US-fonipa-u-hc-h12-sd-usca",
                "und-US-fonipa-u-sd-usca",
                "und-US-fonipa",
                "und-US",
            ],
        },
        TestCase {
            input: "en-u-hc-h12-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-Latn-u-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-Latn-US-u-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            // NOTE: -u-rg is not yet supported; when it is, this test should be updated
            input: "en-u-rg-gbxxxx",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en"],
            expected_region_chain: &["en"],
        },
        TestCase {
            input: "sr-ME",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-ME", "sr-Latn-ME", "sr-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-Latn-ME",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-ME", "sr-Latn-ME", "sr-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-ME-fonipa",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &[
                "sr-ME-fonipa",
                "sr-ME",
                "sr-Latn-ME-fonipa",
                "sr-Latn-ME",
                "sr-Latn-fonipa",
                "sr-Latn",
            ],
            expected_region_chain: &["sr-ME-fonipa", "sr-ME", "und-ME-fonipa", "und-ME"],
        },
        TestCase {
            input: "sr-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-RS", "sr"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Cyrl-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-RS", "sr"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Latn-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-Latn-RS", "sr-Latn"],
            expected_region_chain: &["sr-Latn-RS", "und-RS"],
        },
        TestCase {
            input: "de-Latn-LI",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["de-LI", "de"],
            expected_region_chain: &["de-LI", "und-LI"],
        },
        TestCase {
            input: "ca-ES-valencia",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["ca-ES-valencia", "ca-ES", "ca-valencia", "ca"],
            expected_region_chain: &["ca-ES-valencia", "ca-ES", "und-ES-valencia", "und-ES"],
        },
        TestCase {
            input: "es-AR",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["es-AR", "es-419", "es"],
            expected_region_chain: &["es-AR", "und-AR"],
        },
        TestCase {
            input: "hi-IN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["hi-IN", "hi"],
            expected_region_chain: &["hi-IN", "und-IN"],
        },
        TestCase {
            input: "hi-Latn-IN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["hi-Latn-IN", "hi-Latn", "en-IN", "en-001", "en"],
            expected_region_chain: &["hi-Latn-IN", "und-IN"],
        },
        TestCase {
            input: "zh-CN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            // Note: "zh-Hans" is not reachable because it is the default script for "zh".
            // The fallback algorithm does not visit the language-script bundle when the
            // script is the default for the language
            expected_language_chain: &["zh-CN", "zh"],
            expected_region_chain: &["zh-CN", "und-CN"],
        },
        TestCase {
            input: "zh-TW",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["zh-TW", "zh-Hant-TW", "zh-Hant"],
            expected_region_chain: &["zh-TW", "und-TW"],
        },
        TestCase {
            input: "yue-HK",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["yue-HK", "yue"],
            expected_region_chain: &["yue-HK", "und-HK"],
        },
        TestCase {
            input: "yue-HK",
            requires_data: true,
            extension_key: None,
            fallback_supplement: Some(LocaleFallbackSupplement::Collation),
            // TODO(#1964): add "zh" as a target.
            expected_language_chain: &["yue-HK", "yue", "zh-Hant"],
            expected_region_chain: &["yue-HK", "und-HK"],
        },
    ];

    /// Runs every entry of `TEST_CASES` under both language and region
    /// priority, checking each locale the iterator yields against the expected
    /// chain and then checking that the iterator has reached `und`.
    #[test]
    fn test_fallback() {
        let owned_no_data = LocaleFallbacker::new_without_data();
        let without_data = owned_no_data.as_borrowed();
        let with_data = LocaleFallbacker::new();

        for cas in TEST_CASES {
            let chains = [
                (
                    LocaleFallbackPriority::Language,
                    cas.expected_language_chain,
                ),
                (LocaleFallbackPriority::Region, cas.expected_region_chain),
            ];
            for (priority, expected_chain) in chains {
                let mut config = LocaleFallbackConfig::default();
                config.priority = priority;
                config.extension_key = cas.extension_key;
                config.fallback_supplement = cas.fallback_supplement;

                let fallbacker = if cas.requires_data {
                    with_data
                } else {
                    without_data
                };
                let locale = Locale::from_str(cas.input).unwrap();
                let mut it = fallbacker.for_config(config).fallback_for(locale.into());

                // Each expected entry must match the iterator's current locale
                // before the iterator is advanced.
                for expected in expected_chain.iter().copied() {
                    assert_eq!(
                        expected,
                        &*it.get().write_to_string(),
                        " {:?} ( {:?})",
                        cas.input,
                        priority
                    );
                    it.step();
                }
                // Once the expected chain is exhausted the iterator must be at the root.
                assert_eq!(
                    "und",
                    &*it.get().write_to_string(),
                    " {:?} ( {:?})",
                    cas.input,
                    priority
                );
            }
        }
    }
}
488 | |