1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use icu_locid::extensions::unicode::{key, Key};
6use icu_locid::subtags::Language;
7use icu_locid::LanguageIdentifier;
8use icu_provider::FallbackPriority;
9
10use super::*;
11
/// Unicode extension key of the subdivision keyword (`-u-sd`).
///
/// Normalization always retains this keyword, and the fallback steps remove
/// and later restore it independently of the configured extension key.
const SUBDIVISION_KEY: Key = key!("sd");
13
14impl<'a> LocaleFallbackerWithConfig<'a> {
15 pub(crate) fn normalize(&self, locale: &mut DataLocale) {
16 let language = locale.language();
17 // 1. Populate the region (required for region fallback only)
18 if self.config.priority == FallbackPriority::Region && locale.region().is_none() {
19 // 1a. First look for region based on language+script
20 if let Some(script) = locale.script() {
21 locale.set_region(
22 self.likely_subtags
23 .ls2r
24 .get_2d(
25 &language.into_tinystr().to_unvalidated(),
26 &script.into_tinystr().to_unvalidated(),
27 )
28 .copied(),
29 );
30 }
31 // 1b. If that fails, try language only
32 if locale.region().is_none() {
33 locale.set_region(
34 self.likely_subtags
35 .l2r
36 .get(&language.into_tinystr().to_unvalidated())
37 .copied(),
38 );
39 }
40 }
41 // 2. Remove the script if it is implied by the other subtags
42 if let Some(script) = locale.script() {
43 let default_script = self
44 .likely_subtags
45 .l2s
46 .get_copied(&language.into_tinystr().to_unvalidated())
47 .unwrap_or(DEFAULT_SCRIPT);
48 if let Some(region) = locale.region() {
49 if script
50 == self
51 .likely_subtags
52 .lr2s
53 .get_copied_2d(
54 &language.into_tinystr().to_unvalidated(),
55 &region.into_tinystr().to_unvalidated(),
56 )
57 .unwrap_or(default_script)
58 {
59 locale.set_script(None);
60 }
61 } else if script == default_script {
62 locale.set_script(None);
63 }
64 }
65 // 3. Remove irrelevant extension subtags
66 locale.retain_unicode_ext(|key| {
67 match *key {
68 // Always retain -u-sd
69 SUBDIVISION_KEY => true,
70 // Retain the query-specific keyword
71 _ if Some(*key) == self.config.extension_key => true,
72 // Drop all others
73 _ => false,
74 }
75 });
76 // 4. If there is an invalid "sd" subtag, drop it
77 // For now, ignore it, and let fallback do it for us
78 }
79}
80
81impl<'a> LocaleFallbackIteratorInner<'a> {
82 pub fn step(&mut self, locale: &mut DataLocale) {
83 match self.config.priority {
84 FallbackPriority::Language => self.step_language(locale),
85 FallbackPriority::Region => self.step_region(locale),
86 // TODO(#1964): Change the collation fallback rules to be different
87 // from the language fallback fules.
88 FallbackPriority::Collation => self.step_language(locale),
89 // This case should not normally happen, but `FallbackPriority` is non_exhaustive.
90 // Make it go directly to `und`.
91 _ => {
92 debug_assert!(
93 false,
94 "Unknown FallbackPriority: {:?}",
95 self.config.priority
96 );
97 *locale = Default::default()
98 }
99 }
100 }
101
102 fn step_language(&mut self, locale: &mut DataLocale) {
103 // 1. Remove the extension fallback keyword
104 if let Some(extension_key) = self.config.extension_key {
105 if let Some(value) = locale.remove_unicode_ext(&extension_key) {
106 self.backup_extension = Some(value);
107 return;
108 }
109 }
110 // 2. Remove the subdivision keyword
111 if let Some(value) = locale.remove_unicode_ext(&SUBDIVISION_KEY) {
112 self.backup_subdivision = Some(value);
113 return;
114 }
115 // 3. Assert that the locale is a language identifier
116 debug_assert!(!locale.has_unicode_ext());
117 // 4. Remove variants
118 if locale.has_variants() {
119 self.backup_variants = Some(locale.clear_variants());
120 return;
121 }
122 // 5. Check for parent override
123 if let Some(parent) = self.get_explicit_parent(locale) {
124 locale.set_langid(parent);
125 self.restore_extensions_variants(locale);
126 return;
127 }
128 // 6. Add the script subtag if necessary
129 if locale.script().is_none() {
130 if let Some(region) = locale.region() {
131 let language = locale.language();
132 if let Some(script) = self.likely_subtags.lr2s.get_copied_2d(
133 &language.into_tinystr().to_unvalidated(),
134 &region.into_tinystr().to_unvalidated(),
135 ) {
136 locale.set_script(Some(script));
137 self.restore_extensions_variants(locale);
138 return;
139 }
140 }
141 }
142 // 7. Remove region
143 if locale.region().is_some() {
144 locale.set_region(None);
145 self.restore_extensions_variants(locale);
146 return;
147 }
148 // 8. Remove language+script
149 debug_assert!(!locale.language().is_empty()); // don't call .step() on und
150 locale.set_script(None);
151 locale.set_language(Language::UND);
152 }
153
154 fn step_region(&mut self, locale: &mut DataLocale) {
155 // 1. Remove the extension fallback keyword
156 if let Some(extension_key) = self.config.extension_key {
157 if let Some(value) = locale.remove_unicode_ext(&extension_key) {
158 self.backup_extension = Some(value);
159 return;
160 }
161 }
162 // 2. Remove the subdivision keyword
163 if let Some(value) = locale.remove_unicode_ext(&SUBDIVISION_KEY) {
164 self.backup_subdivision = Some(value);
165 return;
166 }
167 // 3. Assert that the locale is a language identifier
168 debug_assert!(!locale.has_unicode_ext());
169 // 4. Remove variants
170 if locale.has_variants() {
171 self.backup_variants = Some(locale.clear_variants());
172 return;
173 }
174 // 5. Remove language+script
175 if !locale.language().is_empty() || locale.script().is_some() {
176 locale.set_script(None);
177 locale.set_language(Language::UND);
178 self.restore_extensions_variants(locale);
179 return;
180 }
181 // 6. Remove region
182 debug_assert!(locale.region().is_some()); // don't call .step() on und
183 locale.set_region(None);
184 }
185
186 fn restore_extensions_variants(&mut self, locale: &mut DataLocale) {
187 if let Some(value) = self.backup_extension.take() {
188 #[allow(clippy::unwrap_used)] // not reachable unless extension_key is present
189 locale.set_unicode_ext(self.config.extension_key.unwrap(), value);
190 }
191 if let Some(value) = self.backup_subdivision.take() {
192 locale.set_unicode_ext(SUBDIVISION_KEY, value);
193 }
194 if let Some(variants) = self.backup_variants.take() {
195 locale.set_variants(variants);
196 }
197 }
198
199 fn get_explicit_parent(&self, locale: &DataLocale) -> Option<LanguageIdentifier> {
200 self.supplement
201 .and_then(|supplement| {
202 supplement
203 .parents
204 .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse())
205 })
206 .or_else(|| {
207 self.parents
208 .parents
209 .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse())
210 })
211 .map(LanguageIdentifier::from)
212 }
213}
214
#[cfg(test)]
mod tests {
    use super::*;
    use icu_locid::Locale;
    use std::str::FromStr;
    use writeable::Writeable;

    /// One table-driven scenario for `test_fallback`.
    struct TestCase {
        /// Locale string handed to the fallback iterator.
        input: &'static str,
        /// Selects the data-backed fallbacker when `true`, the data-less one otherwise.
        requires_data: bool,
        /// Copied into `LocaleFallbackConfig::extension_key`.
        extension_key: Option<Key>,
        /// Copied into `LocaleFallbackConfig::fallback_supplement`.
        fallback_supplement: Option<LocaleFallbackSupplement>,
        // Note: The first entry in each chain is the normalized locale
        /// Locales expected, in order, under language priority (excluding the final `und`).
        expected_language_chain: &'static [&'static str],
        /// Locales expected, in order, under region priority (excluding the final `und`).
        expected_region_chain: &'static [&'static str],
    }

    // TODO: Consider loading these from a JSON file
    const TEST_CASES: &[TestCase] = &[
        TestCase {
            input: "en-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-u-sd-usca", "en", "und-u-sd-usca"],
        },
        TestCase {
            input: "en-US-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-US-fonipa-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: Some(key!("hc")),
            fallback_supplement: None,
            expected_language_chain: &[
                "en-US-fonipa-u-hc-h12-sd-usca",
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "en-fonipa-u-hc-h12-sd-usca",
                "en-fonipa-u-sd-usca",
                "en-fonipa",
                "en",
            ],
            expected_region_chain: &[
                "en-US-fonipa-u-hc-h12-sd-usca",
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "und-US-fonipa-u-hc-h12-sd-usca",
                "und-US-fonipa-u-sd-usca",
                "und-US-fonipa",
                "und-US",
            ],
        },
        TestCase {
            input: "en-u-hc-h12-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-Latn-u-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-Latn-US-u-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            // NOTE: -u-rg is not yet supported; when it is, this test should be updated
            input: "en-u-rg-gbxxxx",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en"],
            expected_region_chain: &["en"],
        },
        TestCase {
            input: "sr-ME",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-ME", "sr-Latn-ME", "sr-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-Latn-ME",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-ME", "sr-Latn-ME", "sr-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-ME-fonipa",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &[
                "sr-ME-fonipa",
                "sr-ME",
                "sr-Latn-ME-fonipa",
                "sr-Latn-ME",
                "sr-Latn-fonipa",
                "sr-Latn",
            ],
            expected_region_chain: &["sr-ME-fonipa", "sr-ME", "und-ME-fonipa", "und-ME"],
        },
        TestCase {
            input: "sr-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-RS", "sr"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Cyrl-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-RS", "sr"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Latn-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-Latn-RS", "sr-Latn"],
            expected_region_chain: &["sr-Latn-RS", "und-RS"],
        },
        TestCase {
            input: "de-Latn-LI",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["de-LI", "de"],
            expected_region_chain: &["de-LI", "und-LI"],
        },
        TestCase {
            input: "ca-ES-valencia",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["ca-ES-valencia", "ca-ES", "ca-valencia", "ca"],
            expected_region_chain: &["ca-ES-valencia", "ca-ES", "und-ES-valencia", "und-ES"],
        },
        TestCase {
            input: "es-AR",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["es-AR", "es-419", "es"],
            expected_region_chain: &["es-AR", "und-AR"],
        },
        TestCase {
            input: "hi-IN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["hi-IN", "hi"],
            expected_region_chain: &["hi-IN", "und-IN"],
        },
        TestCase {
            input: "hi-Latn-IN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["hi-Latn-IN", "hi-Latn", "en-IN", "en-001", "en"],
            expected_region_chain: &["hi-Latn-IN", "und-IN"],
        },
        TestCase {
            input: "zh-CN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            // Note: "zh-Hans" is not reachable because it is the default script for "zh".
            // The fallback algorithm does not visit the language-script bundle when the
            // script is the default for the language
            expected_language_chain: &["zh-CN", "zh"],
            expected_region_chain: &["zh-CN", "und-CN"],
        },
        TestCase {
            input: "zh-TW",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["zh-TW", "zh-Hant-TW", "zh-Hant"],
            expected_region_chain: &["zh-TW", "und-TW"],
        },
        TestCase {
            input: "yue-HK",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["yue-HK", "yue"],
            expected_region_chain: &["yue-HK", "und-HK"],
        },
        TestCase {
            input: "yue-HK",
            requires_data: true,
            extension_key: None,
            fallback_supplement: Some(LocaleFallbackSupplement::Collation),
            // TODO(#1964): add "zh" as a target.
            expected_language_chain: &["yue-HK", "yue", "zh-Hant"],
            expected_region_chain: &["yue-HK", "und-HK"],
        },
    ];

    /// Walks every test case under both language and region priority and
    /// checks that the iterator yields exactly the expected chain, followed
    /// by `und`.
    #[test]
    fn test_fallback() {
        let owned_without_data = LocaleFallbacker::new_without_data();
        let without_data = owned_without_data.as_borrowed();
        let with_data = LocaleFallbacker::new();

        for case in TEST_CASES {
            let chains = [
                (
                    LocaleFallbackPriority::Language,
                    case.expected_language_chain,
                ),
                (LocaleFallbackPriority::Region, case.expected_region_chain),
            ];
            for (priority, expected_chain) in chains {
                let mut config = LocaleFallbackConfig::default();
                config.priority = priority;
                config.extension_key = case.extension_key;
                config.fallback_supplement = case.fallback_supplement;

                let fallbacker = if case.requires_data {
                    with_data
                } else {
                    without_data
                };
                let locale = Locale::from_str(case.input).unwrap();
                let mut iter = fallbacker.for_config(config).fallback_for(locale.into());

                // Each expected entry must match the current locale before stepping on.
                for &expected in expected_chain {
                    assert_eq!(
                        expected,
                        &*iter.get().write_to_string(),
                        "{:?} ({:?})",
                        case.input,
                        priority
                    );
                    iter.step();
                }
                // Once the chain is exhausted the iterator must rest on the root locale.
                assert_eq!(
                    "und",
                    &*iter.get().write_to_string(),
                    "{:?} ({:?})",
                    case.input,
                    priority
                );
            }
        }
    }
}
488