canonicalizer.rs source code [crates/icu_locid_transform/src/provider/canonicalizer.rs]

1	// This file is part of ICU4X. For terms of use, please see the file
2	// called LICENSE at the top level of the ICU4X source tree
3	// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5	use super::*;
6	use icu_locid::subtags::{Language, Region, Script, Variant};
7	use icu_provider::prelude::*;
8	use tinystr::UnvalidatedTinyAsciiStr;
9	use zerovec::{VarZeroVec, ZeroMap, ZeroSlice};
10
11	#[icu_provider::data_struct(marker(AliasesV1Marker, "locid_transform/aliases@1", singleton))]
12	#[derive(PartialEq, Clone, Default)]
13	#[cfg_attr(
14	feature = "datagen",
15	derive(serde::Serialize, databake::Bake),
16	databake(path = icu_locid_transform::provider),
17	)]
18	#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
19	#[yoke(prove_covariance_manually)]
20	/// This alias data is used for locale canonicalization. Each field defines a
21	/// mapping from an old identifier to a new identifier, based upon the rules in
22	/// from <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>. The data
23	/// is stored in sorted order, allowing for binary search to identify rules to
24	/// apply. It is broken down into smaller vectors based upon some characteristic
25	/// of the data, to help avoid unnecessary searches. For example, the `sgn_region`
26	/// field contains aliases for sign language and region, so that it is not
27	/// necessary to search the data unless the input is a sign language.
28	///
29	/// The algorithm in tr35 is not guaranteed to terminate on data other than what
30	/// is currently in CLDR. For this reason, it is not a good idea to attempt to add
31	/// or modify aliases for use in this structure.
32	///
33	/// <div class="stab unstable">
34	/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
35	/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
36	/// to be stable, their Rust representation might not be. Use with caution.
37	/// </div>
38	// TODO: Use validated types as value types
39	#[derive(Debug)]
40	pub struct AliasesV1<'data> {
41	/// `[language(-variant)+\] -> [langid]`
42	/// This is not a map as it's searched linearly according to the canonicalization rules.
43	#[cfg_attr(feature = "serde", serde(borrow))]
44	pub language_variants: VarZeroVec<'data, UnvalidatedLanguageIdentifierPair>,
45	/// `sgn-[region] -> [language]`
46	#[cfg_attr(feature = "serde", serde(borrow))]
47	pub sgn_region: ZeroMap<'data, UnvalidatedRegion, Language>,
48	/// `[language{2}] -> [langid]`
49	#[cfg_attr(feature = "serde", serde(borrow))]
50	pub language_len2: ZeroMap<'data, UnvalidatedTinyAsciiStr<`2`>, UnvalidatedLanguageIdentifier>,
51	/// `[language{3}] -> [langid]`
52	#[cfg_attr(feature = "serde", serde(borrow))]
53	pub language_len3: ZeroMap<'data, UnvalidatedLanguage, UnvalidatedLanguageIdentifier>,
54	/// `[langid] -> [langid]`
55	/// This is not a map as it's searched linearly according to the canonicalization rules.
56	#[cfg_attr(feature = "serde", serde(borrow))]
57	pub language: VarZeroVec<'data, UnvalidatedLanguageIdentifierPair>,
58
59	/// `[script] -> [script]`
60	#[cfg_attr(feature = "serde", serde(borrow))]
61	pub script: ZeroMap<'data, UnvalidatedScript, Script>,
62
63	/// `[region{2}] -> [region]`
64	#[cfg_attr(feature = "serde", serde(borrow))]
65	pub region_alpha: ZeroMap<'data, UnvalidatedTinyAsciiStr<`2`>, Region>,
66	/// `[region{3}] -> [region]`
67	#[cfg_attr(feature = "serde", serde(borrow))]
68	pub region_num: ZeroMap<'data, UnvalidatedRegion, Region>,
69
70	/// `[region] -> [region]+`
71	#[cfg_attr(feature = "serde", serde(borrow))]
72	pub complex_region: ZeroMap<'data, UnvalidatedRegion, ZeroSlice<Region>>,
73
74	/// `[variant] -> [variant]`
75	#[cfg_attr(feature = "serde", serde(borrow))]
76	pub variant: ZeroMap<'data, UnvalidatedVariant, Variant>,
77
78	/// `[value{7}] -> [value{7}]`
79	#[cfg_attr(feature = "serde", serde(borrow))]
80	pub subdivision: ZeroMap<'data, UnvalidatedSubdivision, SemivalidatedSubdivision>,
81	}
82
#[cfg(feature = "datagen")]
impl<'data> From<AliasesV2<'data>> for AliasesV1<'data> {
    /// Converts V2 alias data into the V1 layout.
    ///
    /// The only field whose representation differs is `language_variants`:
    /// each V2 entry stores the language and the variant string separately,
    /// while V1 stores them as a single `language-variant` string. The two
    /// parts are therefore joined with `-`; every other field is moved over
    /// unchanged.
    fn from(value: AliasesV2<'data>) -> Self {
        let AliasesV2 {
            language_variants: v2_language_variants,
            sgn_region,
            language_len2,
            language_len3,
            language,
            script,
            region_alpha,
            region_num,
            complex_region,
            variant,
            subdivision,
        } = value;

        let mut joined: alloc::vec::Vec<StrStrPair> =
            alloc::vec::Vec::with_capacity(v2_language_variants.len());
        for ule in v2_language_variants.iter() {
            let pair: LanguageStrStrPair = zerofrom::ZeroFrom::zero_from(ule);
            // Rebuild the V1 key: "<language>-<variants>".
            let langid = alloc::format!("{0}-{1}", pair.0, pair.1);
            joined.push(StrStrPair(langid.into(), pair.2));
        }

        Self {
            // Re-encode into an owned buffer; `joined` only borrows from the input.
            language_variants: VarZeroVec::from(&joined),
            sgn_region,
            language_len2,
            language_len3,
            language,
            script,
            region_alpha,
            region_num,
            complex_region,
            variant,
            subdivision,
        }
    }
}
111
112	impl<'data> TryFrom<AliasesV1<'data>> for AliasesV2<'data> {
113	type Error = icu_provider::DataError;
114
115	fn try_from(value: AliasesV1<'data>) -> Result<Self, Self::Error> {
116	#[allow(unused_imports)]
117	use alloc::borrow::ToOwned;
118
119	let language_variants = value
120	.language_variants
121	.iter()
122	.map(zerofrom::ZeroFrom::zero_from)
123	.map(\|v: StrStrPair\| -> Result<LanguageStrStrPair, DataError> {
124	let (lang, variant) =
125	v.0.split_once('-')
126	.ok_or_else(\|\| DataError::custom("Each pair should be language-variant"))?;
127	let lang: Language = lang
128	.parse()
129	.map_err(\|_\| DataError::custom("Language should be a valid language subtag"))?;
130	Ok(LanguageStrStrPair(lang, variant.to_owned().into(), v.1))
131	})
132	.collect::<Result<alloc::vec::Vec<_>, _>>()?;
133
134	Ok(Self {
135	language_variants: VarZeroVec::from(&language_variants),
136	sgn_region: value.sgn_region,
137	language_len2: value.language_len2,
138	language_len3: value.language_len3,
139	language: value.language,
140	script: value.script,
141	region_alpha: value.region_alpha,
142	region_num: value.region_num,
143	complex_region: value.complex_region,
144	variant: value.variant,
145	subdivision: value.subdivision,
146	})
147	}
148	}
149
150	#[icu_provider::data_struct(marker(AliasesV2Marker, "locid_transform/aliases@2", singleton))]
151	#[derive(PartialEq, Clone, Default)]
152	#[cfg_attr(
153	feature = "datagen",
154	derive(serde::Serialize, databake::Bake),
155	databake(path = icu_locid_transform::provider),
156	)]
157	#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
158	#[yoke(prove_covariance_manually)]
159	/// This alias data is used for locale canonicalization. Each field defines a
160	/// mapping from an old identifier to a new identifier, based upon the rules in
161	/// from <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>. The data
162	/// is stored in sorted order, allowing for binary search to identify rules to
163	/// apply. It is broken down into smaller vectors based upon some characteristic
164	/// of the data, to help avoid unnecessary searches. For example, the `sgn_region`
165	/// field contains aliases for sign language and region, so that it is not
166	/// necessary to search the data unless the input is a sign language.
167	///
168	/// The algorithm in tr35 is not guaranteed to terminate on data other than what
169	/// is currently in CLDR. For this reason, it is not a good idea to attempt to add
170	/// or modify aliases for use in this structure.
171	///
172	/// <div class="stab unstable">
173	/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
174	/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
175	/// to be stable, their Rust representation might not be. Use with caution.
176	/// </div>
177	// TODO: Use validated types as value types
178	// Notice: V2 improves the alignment of `language_variants` speeding up canonicalization by upon
179	// to 40%. See https://github.com/unicode-org/icu4x/pull/2935 for details.
180	#[derive(Debug)]
181	pub struct AliasesV2<'data> {
182	/// `[language, variant(-variant)] -> [langid]`*
183	/// This is not a map as it's searched linearly according to the canonicalization rules.
184	#[cfg_attr(feature = "serde", serde(borrow))]
185	pub language_variants: VarZeroVec<'data, UnvalidatedLanguageVariantsPair>,
186	/// `sgn-[region] -> [language]`
187	#[cfg_attr(feature = "serde", serde(borrow))]
188	pub sgn_region: ZeroMap<'data, UnvalidatedRegion, Language>,
189	/// `[language{2}] -> [langid]`
190	#[cfg_attr(feature = "serde", serde(borrow))]
191	pub language_len2: ZeroMap<'data, UnvalidatedTinyAsciiStr<`2`>, UnvalidatedLanguageIdentifier>,
192	/// `[language{3}] -> [langid]`
193	#[cfg_attr(feature = "serde", serde(borrow))]
194	pub language_len3: ZeroMap<'data, UnvalidatedLanguage, UnvalidatedLanguageIdentifier>,
195	/// `[langid] -> [langid]`
196	/// This is not a map as it's searched linearly according to the canonicalization rules.
197	#[cfg_attr(feature = "serde", serde(borrow))]
198	pub language: VarZeroVec<'data, UnvalidatedLanguageIdentifierPair>,
199
200	/// `[script] -> [script]`
201	#[cfg_attr(feature = "serde", serde(borrow))]
202	pub script: ZeroMap<'data, UnvalidatedScript, Script>,
203
204	/// `[region{2}] -> [region]`
205	#[cfg_attr(feature = "serde", serde(borrow))]
206	pub region_alpha: ZeroMap<'data, UnvalidatedTinyAsciiStr<`2`>, Region>,
207	/// `[region{3}] -> [region]`
208	#[cfg_attr(feature = "serde", serde(borrow))]
209	pub region_num: ZeroMap<'data, UnvalidatedRegion, Region>,
210
211	/// `[region] -> [region]+`
212	#[cfg_attr(feature = "serde", serde(borrow))]
213	pub complex_region: ZeroMap<'data, UnvalidatedRegion, ZeroSlice<Region>>,
214
215	/// `[variant] -> [variant]`
216	#[cfg_attr(feature = "serde", serde(borrow))]
217	pub variant: ZeroMap<'data, UnvalidatedVariant, Variant>,
218
219	/// `[value{7}] -> [value{7}]`
220	#[cfg_attr(feature = "serde", serde(borrow))]
221	pub subdivision: ZeroMap<'data, UnvalidatedSubdivision, SemivalidatedSubdivision>,
222	}
223