1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! The collection of code for locale canonicalization.
6
7use crate::provider::*;
8use crate::LocaleTransformError;
9use alloc::vec::Vec;
10use core::cmp::Ordering;
11
12use crate::LocaleExpander;
13use crate::TransformResult;
14use icu_locid::subtags::{Language, Region, Script};
15use icu_locid::{
16 extensions::unicode::key,
17 subtags::{language, Variant, Variants},
18 LanguageIdentifier, Locale,
19};
20use icu_provider::prelude::*;
21use tinystr::TinyAsciiStr;
22
/// Implements the algorithm defined in *[UTS #35: Annex C, LocaleId Canonicalization]*.
///
/// The canonicalizer looks up the language, script, region and variant subtags of a
/// locale (plus the transform-extension language and the `rg`/`sd` Unicode keywords)
/// in alias data and rewrites them in place.
///
/// # Examples
///
/// ```
/// use icu_locid::Locale;
/// use icu_locid_transform::{LocaleCanonicalizer, TransformResult};
///
/// let lc = LocaleCanonicalizer::new();
///
/// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc".parse().unwrap();
/// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
/// assert_eq!(locale, "ja-Latn-alalc97-fonipa".parse().unwrap());
/// ```
///
/// [UTS #35: Annex C, LocaleId Canonicalization]: http://unicode.org/reports/tr35/#LocaleId_Canonicalization
#[derive(Debug)]
pub struct LocaleCanonicalizer {
    /// Data to support canonicalization: the alias tables consulted by every rule.
    aliases: DataPayload<AliasesV1Marker>,
    /// Likely subtags implementation for delegation. It is used to pick a region
    /// when a region alias offers several candidate replacements.
    expander: LocaleExpander,
}
46
47#[inline]
48fn uts35_rule_matches<'a, I>(
49 source: &Locale,
50 language: Language,
51 script: Option<Script>,
52 region: Option<Region>,
53 raw_variants: I,
54) -> bool
55where
56 I: Iterator<Item = &'a str>,
57{
58 (language.is_empty() || language == source.id.language)
59 && (script.is_none() || script == source.id.script)
60 && (region.is_none() || region == source.id.region)
61 && {
62 // Checks if variants are a subset of source variants.
63 // As both iterators are sorted, this can be done linearly.
64 let mut source_variants = source.id.variants.iter();
65 'outer: for it in raw_variants {
66 for cand in source_variants.by_ref() {
67 match cand.strict_cmp(it.as_bytes()) {
68 Ordering::Equal => {
69 continue 'outer;
70 }
71 Ordering::Less => {}
72 _ => {
73 return false;
74 }
75 }
76 }
77 return false;
78 }
79 true
80 }
81}
82
83fn uts35_replacement<'a, I>(
84 source: &mut Locale,
85 ruletype_has_language: bool,
86 ruletype_has_script: bool,
87 ruletype_has_region: bool,
88 ruletype_variants: Option<I>,
89 replacement: &LanguageIdentifier,
90) where
91 I: Iterator<Item = &'a str>,
92{
93 if ruletype_has_language || (source.id.language.is_empty() && !replacement.language.is_empty())
94 {
95 source.id.language = replacement.language;
96 }
97 if ruletype_has_script || (source.id.script.is_none() && replacement.script.is_some()) {
98 source.id.script = replacement.script;
99 }
100 if ruletype_has_region || (source.id.region.is_none() && replacement.region.is_some()) {
101 source.id.region = replacement.region;
102 }
103 if let Some(skips) = ruletype_variants {
104 // The rule matches if the ruletype variants are a subset of the source variants.
105 // This means ja-Latn-fonipa-hepburn-heploc matches against the rule for
106 // hepburn-heploc and is canonicalized to ja-Latn-alalc97-fonipa
107
108 // We're merging three sorted deduped iterators into a new sequence:
109 // sources - skips + replacements
110
111 let mut sources = source.id.variants.iter().copied().peekable();
112 let mut replacements = replacement.variants.iter().copied().peekable();
113 let mut skips = skips.peekable();
114
115 let mut variants: Vec<Variant> = Vec::new();
116
117 loop {
118 match (sources.peek(), skips.peek(), replacements.peek()) {
119 (Some(&source), Some(skip), _)
120 if source.strict_cmp(skip.as_bytes()) == Ordering::Greater =>
121 {
122 skips.next();
123 }
124 (Some(&source), Some(skip), _)
125 if source.strict_cmp(skip.as_bytes()) == Ordering::Equal =>
126 {
127 skips.next();
128 sources.next();
129 }
130 (Some(&source), _, Some(&replacement))
131 if replacement.cmp(&source) == Ordering::Less =>
132 {
133 variants.push(replacement);
134 replacements.next();
135 }
136 (Some(&source), _, Some(&replacement))
137 if replacement.cmp(&source) == Ordering::Equal =>
138 {
139 variants.push(source);
140 sources.next();
141 replacements.next();
142 }
143 (Some(&source), _, _) => {
144 variants.push(source);
145 sources.next();
146 }
147 (None, _, Some(&replacement)) => {
148 variants.push(replacement);
149 replacements.next();
150 }
151 (None, _, None) => {
152 break;
153 }
154 }
155 }
156 source.id.variants = Variants::from_vec_unchecked(variants);
157 }
158}
159
160#[inline]
161fn uts35_check_language_rules(
162 locale: &mut Locale,
163 alias_data: &DataPayload<AliasesV1Marker>,
164) -> TransformResult {
165 if !locale.id.language.is_empty() {
166 let lang: TinyAsciiStr<3> = locale.id.language.into();
167 let replacement: Option<{unknown}> = if lang.len() == 2 {
168 alias_data&{unknown}
169 .get()
170 .language_len2
171 .get(&lang.resize().to_unvalidated())
172 } else {
173 alias_data.get().language_len3.get(&lang.to_unvalidated())
174 };
175
176 if let Some(replacement) = replacement {
177 if let Ok(langid: LanguageIdentifier) = replacement.parse() {
178 uts35_replacement::<core::iter::Empty<&str>>(
179 source:locale, ruletype_has_language:true, ruletype_has_script:false, ruletype_has_region:false, ruletype_variants:None, &langid,
180 );
181 return TransformResult::Modified;
182 }
183 }
184 }
185
186 TransformResult::Unmodified
187}
188
/// Returns `true` when the items of `iter` are in non-decreasing order.
///
/// Empty and single-item iterators count as sorted, and equal neighbours are
/// allowed. Only a pair for which `previous > current` holds makes the result
/// `false`, so pairs that are incomparable under `PartialOrd` do not.
fn is_iter_sorted<I, T>(mut iter: I) -> bool
where
    I: Iterator<Item = T>,
    T: PartialOrd,
{
    if let Some(mut last) = iter.next() {
        for curr in iter {
            if last > curr {
                return false;
            }
            last = curr;
        }
    }
    true
}
204
/// The default canonicalizer is the one built from compiled data.
#[cfg(feature = "compiled_data")]
impl Default for LocaleCanonicalizer {
    /// Equivalent to [`LocaleCanonicalizer::new`].
    fn default() -> Self {
        LocaleCanonicalizer::new()
    }
}
211
212impl LocaleCanonicalizer {
213 /// A constructor which creates a [`LocaleCanonicalizer`] from compiled data.
214 ///
215 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
216 ///
217 /// [📚 Help choosing a constructor](icu_provider::constructors)
218 #[cfg(feature = "compiled_data")]
219 pub const fn new() -> Self {
220 Self::new_with_expander(LocaleExpander::new_extended())
221 }
222
223 // Note: This is a custom impl because the bounds on LocaleExpander::try_new_unstable changed
224 #[doc = icu_provider::gen_any_buffer_unstable_docs!(ANY, Self::new)]
225 pub fn try_new_with_any_provider(
226 provider: &(impl AnyProvider + ?Sized),
227 ) -> Result<LocaleCanonicalizer, LocaleTransformError> {
228 let expander = LocaleExpander::try_new_with_any_provider(provider)?;
229 Self::try_new_with_expander_unstable(&provider.as_downcasting(), expander)
230 }
231
232 // Note: This is a custom impl because the bounds on LocaleExpander::try_new_unstable changed
233 #[doc = icu_provider::gen_any_buffer_unstable_docs!(BUFFER, Self::new)]
234 #[cfg(feature = "serde")]
235 pub fn try_new_with_buffer_provider(
236 provider: &(impl BufferProvider + ?Sized),
237 ) -> Result<LocaleCanonicalizer, LocaleTransformError> {
238 let expander = LocaleExpander::try_new_with_buffer_provider(provider)?;
239 Self::try_new_with_expander_unstable(&provider.as_deserializing(), expander)
240 }
241
242 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
243 pub fn try_new_unstable<P>(provider: &P) -> Result<LocaleCanonicalizer, LocaleTransformError>
244 where
245 P: DataProvider<AliasesV1Marker>
246 + DataProvider<LikelySubtagsForLanguageV1Marker>
247 + DataProvider<LikelySubtagsForScriptRegionV1Marker>
248 + ?Sized,
249 {
250 let expander = LocaleExpander::try_new_unstable(provider)?;
251 Self::try_new_with_expander_unstable(provider, expander)
252 }
253
254 /// Creates a [`LocaleCanonicalizer`] with a custom [`LocaleExpander`] and compiled data.
255 ///
256 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
257 ///
258 /// [📚 Help choosing a constructor](icu_provider::constructors)
259 #[cfg(feature = "compiled_data")]
260 pub const fn new_with_expander(expander: LocaleExpander) -> Self {
261 Self {
262 aliases: DataPayload::from_static_ref(
263 crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_ALIASES_V1,
264 ),
265 expander,
266 }
267 }
268
269 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_with_expander)]
270 pub fn try_new_with_expander_unstable<P>(
271 provider: &P,
272 expander: LocaleExpander,
273 ) -> Result<LocaleCanonicalizer, LocaleTransformError>
274 where
275 P: DataProvider<AliasesV1Marker> + ?Sized,
276 {
277 let aliases: DataPayload<AliasesV1Marker> =
278 provider.load(Default::default())?.take_payload()?;
279
280 Ok(LocaleCanonicalizer { aliases, expander })
281 }
282
283 icu_provider::gen_any_buffer_data_constructors!(
284 locale: skip,
285 options: LocaleExpander,
286 error: LocaleTransformError,
287 #[cfg(skip)]
288 functions: [
289 new_with_expander,
290 try_new_with_expander_with_any_provider,
291 try_new_with_expander_with_buffer_provider,
292 try_new_with_expander_unstable,
293 Self,
294 ]
295 );
296
297 /// The canonicalize method potentially updates a passed in locale in place
298 /// depending up the results of running the canonicalization algorithm
299 /// from <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>.
300 ///
301 /// Some BCP47 canonicalization data is not part of the CLDR json package. Because
302 /// of this, some canonicalizations are not performed, e.g. the canonicalization of
303 /// `und-u-ca-islamicc` to `und-u-ca-islamic-civil`. This will be fixed in a future
304 /// release once the missing data has been added to the CLDR json data. See:
305 /// <https://github.com/unicode-org/icu4x/issues/746>
306 ///
307 /// # Examples
308 ///
309 /// ```
310 /// use icu_locid::Locale;
311 /// use icu_locid_transform::{LocaleCanonicalizer, TransformResult};
312 ///
313 /// let lc = LocaleCanonicalizer::new();
314 ///
315 /// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc".parse().unwrap();
316 /// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
317 /// assert_eq!(locale, "ja-Latn-alalc97-fonipa".parse().unwrap());
318 /// ```
319 pub fn canonicalize(&self, locale: &mut Locale) -> TransformResult {
320 let mut result = TransformResult::Unmodified;
321
322 // This loops until we get a 'fixed point', where applying the rules do not
323 // result in any more changes.
324 'outer: loop {
325 // These are linear searches due to the ordering imposed by the canonicalization
326 // rules, where rules with more variants should be considered first. With the
327 // current data in CLDR, we will only do this for locales which have variants,
328 // or new rules which we haven't special-cased yet (of which there are fewer
329 // than 20).
330 if !locale.id.variants.is_empty() {
331 // These language/variant comibnations have around 20 rules
332 for StrStrPair(raw_lang_variants, raw_to) in self
333 .aliases
334 .get()
335 .language_variants
336 .iter()
337 .map(zerofrom::ZeroFrom::zero_from)
338 {
339 let (raw_lang, raw_variants) = {
340 let mut subtags = raw_lang_variants.split('-');
341 (
342 // str::split can't return empty iterators
343 unsafe { subtags.next().unwrap_unchecked() },
344 subtags,
345 )
346 };
347 if is_iter_sorted(raw_variants.clone()) {
348 if let Ok(lang) = raw_lang.parse::<Language>() {
349 if uts35_rule_matches(locale, lang, None, None, raw_variants.clone()) {
350 if let Ok(to) = raw_to.parse() {
351 uts35_replacement(
352 locale,
353 !lang.is_empty(),
354 false,
355 false,
356 Some(raw_variants),
357 &to,
358 );
359 result = TransformResult::Modified;
360 continue 'outer;
361 }
362 }
363 }
364 }
365 }
366 } else {
367 // These are absolute fallbacks, and currently empty.
368 for StrStrPair(raw_from, raw_to) in self
369 .aliases
370 .get()
371 .language
372 .iter()
373 .map(zerofrom::ZeroFrom::zero_from)
374 {
375 if let Ok(from) = raw_from.parse::<LanguageIdentifier>() {
376 if uts35_rule_matches(
377 locale,
378 from.language,
379 from.script,
380 from.region,
381 from.variants.iter().map(Variant::as_str),
382 ) {
383 if let Ok(to) = raw_to.parse() {
384 uts35_replacement(
385 locale,
386 !from.language.is_empty(),
387 from.script.is_some(),
388 from.region.is_some(),
389 Some(from.variants.iter().map(Variant::as_str)),
390 &to,
391 );
392 result = TransformResult::Modified;
393 continue 'outer;
394 }
395 }
396 }
397 }
398 }
399
400 if !locale.id.language.is_empty() {
401 // If the region is specified, check sgn-region rules first
402 if let Some(region) = locale.id.region {
403 if locale.id.language == language!("sgn") {
404 if let Some(&sgn_lang) = self
405 .aliases
406 .get()
407 .sgn_region
408 .get(&region.into_tinystr().to_unvalidated())
409 {
410 uts35_replacement::<core::iter::Empty<&str>>(
411 locale,
412 true,
413 false,
414 true,
415 None,
416 &sgn_lang.into(),
417 );
418 result = TransformResult::Modified;
419 continue;
420 }
421 }
422 }
423
424 if uts35_check_language_rules(locale, &self.aliases) == TransformResult::Modified {
425 result = TransformResult::Modified;
426 continue;
427 }
428 }
429
430 if let Some(script) = locale.id.script {
431 if let Some(&replacement) = self
432 .aliases
433 .get()
434 .script
435 .get(&script.into_tinystr().to_unvalidated())
436 {
437 locale.id.script = Some(replacement);
438 result = TransformResult::Modified;
439 continue;
440 }
441 }
442
443 if let Some(region) = locale.id.region {
444 let replacement = if region.is_alphabetic() {
445 self.aliases
446 .get()
447 .region_alpha
448 .get(&region.into_tinystr().resize().to_unvalidated())
449 } else {
450 self.aliases
451 .get()
452 .region_num
453 .get(&region.into_tinystr().to_unvalidated())
454 };
455 if let Some(&replacement) = replacement {
456 locale.id.region = Some(replacement);
457 result = TransformResult::Modified;
458 continue;
459 }
460
461 if let Some(regions) = self
462 .aliases
463 .get()
464 .complex_region
465 .get(&region.into_tinystr().to_unvalidated())
466 {
467 // Skip if regions are empty
468 if let Some(default_region) = regions.get(0) {
469 let mut maximized = LanguageIdentifier {
470 language: locale.id.language,
471 script: locale.id.script,
472 region: None,
473 variants: Variants::default(),
474 };
475
476 locale.id.region = Some(
477 match (self.expander.maximize(&mut maximized), maximized.region) {
478 (TransformResult::Modified, Some(candidate))
479 if regions.iter().any(|x| x == candidate) =>
480 {
481 candidate
482 }
483 _ => default_region,
484 },
485 );
486 result = TransformResult::Modified;
487 continue;
488 }
489 }
490 }
491
492 if !locale.id.variants.is_empty() {
493 let mut modified = Vec::new();
494 let mut unmodified = Vec::new();
495 for &variant in locale.id.variants.iter() {
496 if let Some(&updated) = self
497 .aliases
498 .get()
499 .variant
500 .get(&variant.into_tinystr().to_unvalidated())
501 {
502 modified.push(updated);
503 } else {
504 unmodified.push(variant);
505 }
506 }
507
508 if !modified.is_empty() {
509 modified.extend(unmodified);
510 modified.sort();
511 modified.dedup();
512 locale.id.variants = Variants::from_vec_unchecked(modified);
513 result = TransformResult::Modified;
514 continue;
515 }
516 }
517
518 // Nothing matched in this iteration, we're done.
519 break;
520 }
521
522 // Handle Locale extensions in their own loops, because these rules do not interact
523 // with each other.
524 if let Some(lang) = &locale.extensions.transform.lang {
525 let mut tlang: Locale = lang.clone().into();
526 let mut matched = false;
527 loop {
528 if uts35_check_language_rules(&mut tlang, &self.aliases)
529 == TransformResult::Modified
530 {
531 result = TransformResult::Modified;
532 matched = true;
533 continue;
534 }
535
536 break;
537 }
538
539 if matched {
540 locale.extensions.transform.lang = Some(tlang.id);
541 }
542 }
543
544 // The `rg` region override and `sd` regional subdivision keys may contain
545 // language codes that require canonicalization.
546 for key in &[key!("rg"), key!("sd")] {
547 if let Some(value) = locale.extensions.unicode.keywords.get_mut(key) {
548 if let &[only_value] = value.as_tinystr_slice() {
549 if let Some(modified_value) = self
550 .aliases
551 .get()
552 .subdivision
553 .get(&only_value.resize().to_unvalidated())
554 {
555 if let Ok(modified_value) = modified_value.parse() {
556 *value = modified_value;
557 result = TransformResult::Modified;
558 }
559 }
560 }
561 }
562 }
563
564 result
565 }
566}
567
/// Checks `uts35_rule_matches` against `(source locale, rule, expected result)` cases,
/// including sources whose variants are written out of order and a rule that
/// asks for more variants than the source has.
#[test]
fn test_uts35_rule_matches() {
    for (source, rule, result) in [
        ("ja", "und", true),
        ("und-heploc-hepburn", "und-hepburn", true),
        ("ja-heploc-hepburn", "und-hepburn", true),
        ("ja-hepburn", "und-hepburn-heploc", false),
    ] {
        let source: Locale = source.parse().unwrap();
        let rule = rule.parse::<LanguageIdentifier>().unwrap();
        assert_eq!(
            uts35_rule_matches(
                &source,
                rule.language,
                rule.script,
                rule.region,
                rule.variants.iter().map(Variant::as_str),
            ),
            result,
            "{source}"
        );
    }
}
591
/// Checks `uts35_replacement` on variant, region and language replacement rules.
#[test]
fn test_uts35_replacement() {
    // Each case is (input locale, rule "from" side, rule "to" side, expected locale).
    let cases = [
        (
            "ja-Latn-fonipa-hepburn-heploc",
            "und-hepburn-heploc",
            "und-alalc97",
            "ja-Latn-alalc97-fonipa",
        ),
        ("sgn-DD", "und-DD", "und-DE", "sgn-DE"),
        ("sgn-DE", "sgn-DE", "gsg", "gsg"),
    ];

    for (input, from, to, expected) in cases {
        let mut actual: Locale = input.parse().unwrap();
        let from: LanguageIdentifier = from.parse().unwrap();
        let to: LanguageIdentifier = to.parse().unwrap();
        let expected: Locale = expected.parse().unwrap();

        // The rule type is derived from which fields the "from" side specifies.
        let has_language = !from.language.is_empty();
        let has_script = from.script.is_some();
        let has_region = from.region.is_some();
        let rule_variants = from.variants.iter().map(Variant::as_str);

        uts35_replacement(
            &mut actual,
            has_language,
            has_script,
            has_region,
            Some(rule_variants),
            &to,
        );
        assert_eq!(expected, actual);
    }
}
619