1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | //! The collection of code for locale canonicalization. |
6 | |
7 | use crate::provider::*; |
8 | use crate::LocaleTransformError; |
9 | use alloc::vec::Vec; |
10 | use core::cmp::Ordering; |
11 | |
12 | use crate::LocaleExpander; |
13 | use crate::TransformResult; |
14 | use icu_locid::subtags::{Language, Region, Script}; |
15 | use icu_locid::{ |
16 | extensions::unicode::key, |
17 | subtags::{language, Variant, Variants}, |
18 | LanguageIdentifier, Locale, |
19 | }; |
20 | use icu_provider::prelude::*; |
21 | use tinystr::TinyAsciiStr; |
22 | |
23 | /// Implements the algorithm defined in *[UTS #35: Annex C, LocaleId Canonicalization]*. |
24 | /// |
25 | /// # Examples |
26 | /// |
27 | /// ``` |
28 | /// use icu_locid::Locale; |
29 | /// use icu_locid_transform::{LocaleCanonicalizer, TransformResult}; |
30 | /// |
31 | /// let lc = LocaleCanonicalizer::new(); |
32 | /// |
33 | /// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc" .parse().unwrap(); |
34 | /// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified); |
35 | /// assert_eq!(locale, "ja-Latn-alalc97-fonipa" .parse().unwrap()); |
36 | /// ``` |
37 | /// |
38 | /// [UTS #35: Annex C, LocaleId Canonicalization]: http://unicode.org/reports/tr35/#LocaleId_Canonicalization |
39 | #[derive (Debug)] |
40 | pub struct LocaleCanonicalizer { |
41 | /// Data to support canonicalization. |
42 | aliases: DataPayload<AliasesV1Marker>, |
43 | /// Likely subtags implementation for delegation. |
44 | expander: LocaleExpander, |
45 | } |
46 | |
47 | #[inline ] |
48 | fn uts35_rule_matches<'a, I>( |
49 | source: &Locale, |
50 | language: Language, |
51 | script: Option<Script>, |
52 | region: Option<Region>, |
53 | raw_variants: I, |
54 | ) -> bool |
55 | where |
56 | I: Iterator<Item = &'a str>, |
57 | { |
58 | (language.is_empty() || language == source.id.language) |
59 | && (script.is_none() || script == source.id.script) |
60 | && (region.is_none() || region == source.id.region) |
61 | && { |
62 | // Checks if variants are a subset of source variants. |
63 | // As both iterators are sorted, this can be done linearly. |
64 | let mut source_variants = source.id.variants.iter(); |
65 | 'outer: for it in raw_variants { |
66 | for cand in source_variants.by_ref() { |
67 | match cand.strict_cmp(it.as_bytes()) { |
68 | Ordering::Equal => { |
69 | continue 'outer; |
70 | } |
71 | Ordering::Less => {} |
72 | _ => { |
73 | return false; |
74 | } |
75 | } |
76 | } |
77 | return false; |
78 | } |
79 | true |
80 | } |
81 | } |
82 | |
83 | fn uts35_replacement<'a, I>( |
84 | source: &mut Locale, |
85 | ruletype_has_language: bool, |
86 | ruletype_has_script: bool, |
87 | ruletype_has_region: bool, |
88 | ruletype_variants: Option<I>, |
89 | replacement: &LanguageIdentifier, |
90 | ) where |
91 | I: Iterator<Item = &'a str>, |
92 | { |
93 | if ruletype_has_language || (source.id.language.is_empty() && !replacement.language.is_empty()) |
94 | { |
95 | source.id.language = replacement.language; |
96 | } |
97 | if ruletype_has_script || (source.id.script.is_none() && replacement.script.is_some()) { |
98 | source.id.script = replacement.script; |
99 | } |
100 | if ruletype_has_region || (source.id.region.is_none() && replacement.region.is_some()) { |
101 | source.id.region = replacement.region; |
102 | } |
103 | if let Some(skips) = ruletype_variants { |
104 | // The rule matches if the ruletype variants are a subset of the source variants. |
105 | // This means ja-Latn-fonipa-hepburn-heploc matches against the rule for |
106 | // hepburn-heploc and is canonicalized to ja-Latn-alalc97-fonipa |
107 | |
108 | // We're merging three sorted deduped iterators into a new sequence: |
109 | // sources - skips + replacements |
110 | |
111 | let mut sources = source.id.variants.iter().copied().peekable(); |
112 | let mut replacements = replacement.variants.iter().copied().peekable(); |
113 | let mut skips = skips.peekable(); |
114 | |
115 | let mut variants: Vec<Variant> = Vec::new(); |
116 | |
117 | loop { |
118 | match (sources.peek(), skips.peek(), replacements.peek()) { |
119 | (Some(&source), Some(skip), _) |
120 | if source.strict_cmp(skip.as_bytes()) == Ordering::Greater => |
121 | { |
122 | skips.next(); |
123 | } |
124 | (Some(&source), Some(skip), _) |
125 | if source.strict_cmp(skip.as_bytes()) == Ordering::Equal => |
126 | { |
127 | skips.next(); |
128 | sources.next(); |
129 | } |
130 | (Some(&source), _, Some(&replacement)) |
131 | if replacement.cmp(&source) == Ordering::Less => |
132 | { |
133 | variants.push(replacement); |
134 | replacements.next(); |
135 | } |
136 | (Some(&source), _, Some(&replacement)) |
137 | if replacement.cmp(&source) == Ordering::Equal => |
138 | { |
139 | variants.push(source); |
140 | sources.next(); |
141 | replacements.next(); |
142 | } |
143 | (Some(&source), _, _) => { |
144 | variants.push(source); |
145 | sources.next(); |
146 | } |
147 | (None, _, Some(&replacement)) => { |
148 | variants.push(replacement); |
149 | replacements.next(); |
150 | } |
151 | (None, _, None) => { |
152 | break; |
153 | } |
154 | } |
155 | } |
156 | source.id.variants = Variants::from_vec_unchecked(variants); |
157 | } |
158 | } |
159 | |
160 | #[inline ] |
161 | fn uts35_check_language_rules( |
162 | locale: &mut Locale, |
163 | alias_data: &DataPayload<AliasesV1Marker>, |
164 | ) -> TransformResult { |
165 | if !locale.id.language.is_empty() { |
166 | let lang: TinyAsciiStr<3> = locale.id.language.into(); |
167 | let replacement: Option<{unknown}> = if lang.len() == 2 { |
168 | alias_data&{unknown} |
169 | .get() |
170 | .language_len2 |
171 | .get(&lang.resize().to_unvalidated()) |
172 | } else { |
173 | alias_data.get().language_len3.get(&lang.to_unvalidated()) |
174 | }; |
175 | |
176 | if let Some(replacement) = replacement { |
177 | if let Ok(langid: LanguageIdentifier) = replacement.parse() { |
178 | uts35_replacement::<core::iter::Empty<&str>>( |
179 | source:locale, ruletype_has_language:true, ruletype_has_script:false, ruletype_has_region:false, ruletype_variants:None, &langid, |
180 | ); |
181 | return TransformResult::Modified; |
182 | } |
183 | } |
184 | } |
185 | |
186 | TransformResult::Unmodified |
187 | } |
188 | |
/// Returns `true` when the items of `iter` are in non-decreasing order.
///
/// Empty and single-item iterators count as sorted, and equal neighbours are
/// allowed. The function returns `false` at the first pair where the earlier
/// item compares greater than the later one.
fn is_iter_sorted<I, T>(mut iter: I) -> bool
where
    I: Iterator<Item = T>,
    T: PartialOrd,
{
    if let Some(mut last) = iter.next() {
        for curr in iter {
            if last > curr {
                return false;
            }
            last = curr;
        }
    }
    true
}
204 | |
/// The default canonicalizer is the one returned by [`LocaleCanonicalizer::new`],
/// so this impl exists only when the `compiled_data` Cargo feature is enabled.
#[cfg(feature = "compiled_data")]
impl Default for LocaleCanonicalizer {
    fn default() -> Self {
        LocaleCanonicalizer::new()
    }
}
211 | |
212 | impl LocaleCanonicalizer { |
213 | /// A constructor which creates a [`LocaleCanonicalizer`] from compiled data. |
214 | /// |
215 | /// ✨ *Enabled with the `compiled_data` Cargo feature.* |
216 | /// |
217 | /// [📚 Help choosing a constructor](icu_provider::constructors) |
218 | #[cfg (feature = "compiled_data" )] |
219 | pub const fn new() -> Self { |
220 | Self::new_with_expander(LocaleExpander::new_extended()) |
221 | } |
222 | |
223 | // Note: This is a custom impl because the bounds on LocaleExpander::try_new_unstable changed |
224 | #[doc = icu_provider::gen_any_buffer_unstable_docs!(ANY, Self::new)] |
225 | pub fn try_new_with_any_provider( |
226 | provider: &(impl AnyProvider + ?Sized), |
227 | ) -> Result<LocaleCanonicalizer, LocaleTransformError> { |
228 | let expander = LocaleExpander::try_new_with_any_provider(provider)?; |
229 | Self::try_new_with_expander_unstable(&provider.as_downcasting(), expander) |
230 | } |
231 | |
232 | // Note: This is a custom impl because the bounds on LocaleExpander::try_new_unstable changed |
233 | #[doc = icu_provider::gen_any_buffer_unstable_docs!(BUFFER, Self::new)] |
234 | #[cfg (feature = "serde" )] |
235 | pub fn try_new_with_buffer_provider( |
236 | provider: &(impl BufferProvider + ?Sized), |
237 | ) -> Result<LocaleCanonicalizer, LocaleTransformError> { |
238 | let expander = LocaleExpander::try_new_with_buffer_provider(provider)?; |
239 | Self::try_new_with_expander_unstable(&provider.as_deserializing(), expander) |
240 | } |
241 | |
242 | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)] |
243 | pub fn try_new_unstable<P>(provider: &P) -> Result<LocaleCanonicalizer, LocaleTransformError> |
244 | where |
245 | P: DataProvider<AliasesV1Marker> |
246 | + DataProvider<LikelySubtagsForLanguageV1Marker> |
247 | + DataProvider<LikelySubtagsForScriptRegionV1Marker> |
248 | + ?Sized, |
249 | { |
250 | let expander = LocaleExpander::try_new_unstable(provider)?; |
251 | Self::try_new_with_expander_unstable(provider, expander) |
252 | } |
253 | |
254 | /// Creates a [`LocaleCanonicalizer`] with a custom [`LocaleExpander`] and compiled data. |
255 | /// |
256 | /// ✨ *Enabled with the `compiled_data` Cargo feature.* |
257 | /// |
258 | /// [📚 Help choosing a constructor](icu_provider::constructors) |
259 | #[cfg (feature = "compiled_data" )] |
260 | pub const fn new_with_expander(expander: LocaleExpander) -> Self { |
261 | Self { |
262 | aliases: DataPayload::from_static_ref( |
263 | crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_ALIASES_V1, |
264 | ), |
265 | expander, |
266 | } |
267 | } |
268 | |
269 | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_with_expander)] |
270 | pub fn try_new_with_expander_unstable<P>( |
271 | provider: &P, |
272 | expander: LocaleExpander, |
273 | ) -> Result<LocaleCanonicalizer, LocaleTransformError> |
274 | where |
275 | P: DataProvider<AliasesV1Marker> + ?Sized, |
276 | { |
277 | let aliases: DataPayload<AliasesV1Marker> = |
278 | provider.load(Default::default())?.take_payload()?; |
279 | |
280 | Ok(LocaleCanonicalizer { aliases, expander }) |
281 | } |
282 | |
283 | icu_provider::gen_any_buffer_data_constructors!( |
284 | locale: skip, |
285 | options: LocaleExpander, |
286 | error: LocaleTransformError, |
287 | #[cfg (skip)] |
288 | functions: [ |
289 | new_with_expander, |
290 | try_new_with_expander_with_any_provider, |
291 | try_new_with_expander_with_buffer_provider, |
292 | try_new_with_expander_unstable, |
293 | Self, |
294 | ] |
295 | ); |
296 | |
297 | /// The canonicalize method potentially updates a passed in locale in place |
298 | /// depending up the results of running the canonicalization algorithm |
299 | /// from <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>. |
300 | /// |
301 | /// Some BCP47 canonicalization data is not part of the CLDR json package. Because |
302 | /// of this, some canonicalizations are not performed, e.g. the canonicalization of |
303 | /// `und-u-ca-islamicc` to `und-u-ca-islamic-civil`. This will be fixed in a future |
304 | /// release once the missing data has been added to the CLDR json data. See: |
305 | /// <https://github.com/unicode-org/icu4x/issues/746> |
306 | /// |
307 | /// # Examples |
308 | /// |
309 | /// ``` |
310 | /// use icu_locid::Locale; |
311 | /// use icu_locid_transform::{LocaleCanonicalizer, TransformResult}; |
312 | /// |
313 | /// let lc = LocaleCanonicalizer::new(); |
314 | /// |
315 | /// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc" .parse().unwrap(); |
316 | /// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified); |
317 | /// assert_eq!(locale, "ja-Latn-alalc97-fonipa" .parse().unwrap()); |
318 | /// ``` |
319 | pub fn canonicalize(&self, locale: &mut Locale) -> TransformResult { |
320 | let mut result = TransformResult::Unmodified; |
321 | |
322 | // This loops until we get a 'fixed point', where applying the rules do not |
323 | // result in any more changes. |
324 | 'outer: loop { |
325 | // These are linear searches due to the ordering imposed by the canonicalization |
326 | // rules, where rules with more variants should be considered first. With the |
327 | // current data in CLDR, we will only do this for locales which have variants, |
328 | // or new rules which we haven't special-cased yet (of which there are fewer |
329 | // than 20). |
330 | if !locale.id.variants.is_empty() { |
331 | // These language/variant comibnations have around 20 rules |
332 | for StrStrPair(raw_lang_variants, raw_to) in self |
333 | .aliases |
334 | .get() |
335 | .language_variants |
336 | .iter() |
337 | .map(zerofrom::ZeroFrom::zero_from) |
338 | { |
339 | let (raw_lang, raw_variants) = { |
340 | let mut subtags = raw_lang_variants.split('-' ); |
341 | ( |
342 | // str::split can't return empty iterators |
343 | unsafe { subtags.next().unwrap_unchecked() }, |
344 | subtags, |
345 | ) |
346 | }; |
347 | if is_iter_sorted(raw_variants.clone()) { |
348 | if let Ok(lang) = raw_lang.parse::<Language>() { |
349 | if uts35_rule_matches(locale, lang, None, None, raw_variants.clone()) { |
350 | if let Ok(to) = raw_to.parse() { |
351 | uts35_replacement( |
352 | locale, |
353 | !lang.is_empty(), |
354 | false, |
355 | false, |
356 | Some(raw_variants), |
357 | &to, |
358 | ); |
359 | result = TransformResult::Modified; |
360 | continue 'outer; |
361 | } |
362 | } |
363 | } |
364 | } |
365 | } |
366 | } else { |
367 | // These are absolute fallbacks, and currently empty. |
368 | for StrStrPair(raw_from, raw_to) in self |
369 | .aliases |
370 | .get() |
371 | .language |
372 | .iter() |
373 | .map(zerofrom::ZeroFrom::zero_from) |
374 | { |
375 | if let Ok(from) = raw_from.parse::<LanguageIdentifier>() { |
376 | if uts35_rule_matches( |
377 | locale, |
378 | from.language, |
379 | from.script, |
380 | from.region, |
381 | from.variants.iter().map(Variant::as_str), |
382 | ) { |
383 | if let Ok(to) = raw_to.parse() { |
384 | uts35_replacement( |
385 | locale, |
386 | !from.language.is_empty(), |
387 | from.script.is_some(), |
388 | from.region.is_some(), |
389 | Some(from.variants.iter().map(Variant::as_str)), |
390 | &to, |
391 | ); |
392 | result = TransformResult::Modified; |
393 | continue 'outer; |
394 | } |
395 | } |
396 | } |
397 | } |
398 | } |
399 | |
400 | if !locale.id.language.is_empty() { |
401 | // If the region is specified, check sgn-region rules first |
402 | if let Some(region) = locale.id.region { |
403 | if locale.id.language == language!("sgn" ) { |
404 | if let Some(&sgn_lang) = self |
405 | .aliases |
406 | .get() |
407 | .sgn_region |
408 | .get(®ion.into_tinystr().to_unvalidated()) |
409 | { |
410 | uts35_replacement::<core::iter::Empty<&str>>( |
411 | locale, |
412 | true, |
413 | false, |
414 | true, |
415 | None, |
416 | &sgn_lang.into(), |
417 | ); |
418 | result = TransformResult::Modified; |
419 | continue; |
420 | } |
421 | } |
422 | } |
423 | |
424 | if uts35_check_language_rules(locale, &self.aliases) == TransformResult::Modified { |
425 | result = TransformResult::Modified; |
426 | continue; |
427 | } |
428 | } |
429 | |
430 | if let Some(script) = locale.id.script { |
431 | if let Some(&replacement) = self |
432 | .aliases |
433 | .get() |
434 | .script |
435 | .get(&script.into_tinystr().to_unvalidated()) |
436 | { |
437 | locale.id.script = Some(replacement); |
438 | result = TransformResult::Modified; |
439 | continue; |
440 | } |
441 | } |
442 | |
443 | if let Some(region) = locale.id.region { |
444 | let replacement = if region.is_alphabetic() { |
445 | self.aliases |
446 | .get() |
447 | .region_alpha |
448 | .get(®ion.into_tinystr().resize().to_unvalidated()) |
449 | } else { |
450 | self.aliases |
451 | .get() |
452 | .region_num |
453 | .get(®ion.into_tinystr().to_unvalidated()) |
454 | }; |
455 | if let Some(&replacement) = replacement { |
456 | locale.id.region = Some(replacement); |
457 | result = TransformResult::Modified; |
458 | continue; |
459 | } |
460 | |
461 | if let Some(regions) = self |
462 | .aliases |
463 | .get() |
464 | .complex_region |
465 | .get(®ion.into_tinystr().to_unvalidated()) |
466 | { |
467 | // Skip if regions are empty |
468 | if let Some(default_region) = regions.get(0) { |
469 | let mut maximized = LanguageIdentifier { |
470 | language: locale.id.language, |
471 | script: locale.id.script, |
472 | region: None, |
473 | variants: Variants::default(), |
474 | }; |
475 | |
476 | locale.id.region = Some( |
477 | match (self.expander.maximize(&mut maximized), maximized.region) { |
478 | (TransformResult::Modified, Some(candidate)) |
479 | if regions.iter().any(|x| x == candidate) => |
480 | { |
481 | candidate |
482 | } |
483 | _ => default_region, |
484 | }, |
485 | ); |
486 | result = TransformResult::Modified; |
487 | continue; |
488 | } |
489 | } |
490 | } |
491 | |
492 | if !locale.id.variants.is_empty() { |
493 | let mut modified = Vec::new(); |
494 | let mut unmodified = Vec::new(); |
495 | for &variant in locale.id.variants.iter() { |
496 | if let Some(&updated) = self |
497 | .aliases |
498 | .get() |
499 | .variant |
500 | .get(&variant.into_tinystr().to_unvalidated()) |
501 | { |
502 | modified.push(updated); |
503 | } else { |
504 | unmodified.push(variant); |
505 | } |
506 | } |
507 | |
508 | if !modified.is_empty() { |
509 | modified.extend(unmodified); |
510 | modified.sort(); |
511 | modified.dedup(); |
512 | locale.id.variants = Variants::from_vec_unchecked(modified); |
513 | result = TransformResult::Modified; |
514 | continue; |
515 | } |
516 | } |
517 | |
518 | // Nothing matched in this iteration, we're done. |
519 | break; |
520 | } |
521 | |
522 | // Handle Locale extensions in their own loops, because these rules do not interact |
523 | // with each other. |
524 | if let Some(lang) = &locale.extensions.transform.lang { |
525 | let mut tlang: Locale = lang.clone().into(); |
526 | let mut matched = false; |
527 | loop { |
528 | if uts35_check_language_rules(&mut tlang, &self.aliases) |
529 | == TransformResult::Modified |
530 | { |
531 | result = TransformResult::Modified; |
532 | matched = true; |
533 | continue; |
534 | } |
535 | |
536 | break; |
537 | } |
538 | |
539 | if matched { |
540 | locale.extensions.transform.lang = Some(tlang.id); |
541 | } |
542 | } |
543 | |
544 | // The `rg` region override and `sd` regional subdivision keys may contain |
545 | // language codes that require canonicalization. |
546 | for key in &[key!("rg" ), key!("sd" )] { |
547 | if let Some(value) = locale.extensions.unicode.keywords.get_mut(key) { |
548 | if let &[only_value] = value.as_tinystr_slice() { |
549 | if let Some(modified_value) = self |
550 | .aliases |
551 | .get() |
552 | .subdivision |
553 | .get(&only_value.resize().to_unvalidated()) |
554 | { |
555 | if let Ok(modified_value) = modified_value.parse() { |
556 | *value = modified_value; |
557 | result = TransformResult::Modified; |
558 | } |
559 | } |
560 | } |
561 | } |
562 | } |
563 | |
564 | result |
565 | } |
566 | } |
567 | |
/// Exercises `uts35_rule_matches` with `(source locale, rule, expected)`
/// triples. Each rule string is parsed as a `LanguageIdentifier` and its
/// language, script, region and variants are passed as the rule to match.
#[test]
fn test_uts35_rule_matches() {
    for (source, rule, result) in [
        ("ja", "und", true),
        ("und-heploc-hepburn", "und-hepburn", true),
        ("ja-heploc-hepburn", "und-hepburn", true),
        ("ja-hepburn", "und-hepburn-heploc", false),
    ] {
        let source: Locale = source.parse().unwrap();
        let rule = rule.parse::<LanguageIdentifier>().unwrap();
        assert_eq!(
            uts35_rule_matches(
                &source,
                rule.language,
                rule.script,
                rule.region,
                rule.variants.iter().map(Variant::as_str),
            ),
            result,
            " {source}"
        );
    }
}
591 | |
/// Exercises `uts35_replacement` directly: each case applies one rule
/// (rule type -> rule replacement) to an input locale and compares the
/// outcome with the expected locale.
#[test]
fn test_uts35_replacement() {
    // (input locale, rule type, rule replacement, expected locale)
    let cases = [
        (
            "ja-Latn-fonipa-hepburn-heploc",
            "und-hepburn-heploc",
            "und-alalc97",
            "ja-Latn-alalc97-fonipa",
        ),
        ("sgn-DD", "und-DD", "und-DE", "sgn-DE"),
        ("sgn-DE", "sgn-DE", "gsg", "gsg"),
    ];

    for (input, rule_type, rule_replacement, expected) in cases {
        let mut actual: Locale = input.parse().unwrap();
        let from: LanguageIdentifier = rule_type.parse().unwrap();
        let to: LanguageIdentifier = rule_replacement.parse().unwrap();
        let expected: Locale = expected.parse().unwrap();

        let has_language = !from.language.is_empty();
        let has_script = from.script.is_some();
        let has_region = from.region.is_some();
        uts35_replacement(
            &mut actual,
            has_language,
            has_script,
            has_region,
            Some(from.variants.iter().map(Variant::as_str)),
            &to,
        );

        assert_eq!(expected, actual);
    }
}
619 | |