| 1 | // This file is part of ICU4X. For terms of use, please see the file |
| 2 | // called LICENSE at the top level of the ICU4X source tree |
| 3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
| 4 | |
| 5 | use super::*; |
| 6 | use icu_locid::subtags::{Language, Region, Script, Variant}; |
| 7 | use icu_provider::prelude::*; |
| 8 | use tinystr::UnvalidatedTinyAsciiStr; |
| 9 | use zerovec::{VarZeroVec, ZeroMap, ZeroSlice}; |
| 10 | |
#[icu_provider::data_struct(marker(AliasesV1Marker, "locid_transform/aliases@1", singleton))]
#[derive(PartialEq, Clone, Default)]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize, databake::Bake),
    databake(path = icu_locid_transform::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
/// This alias data is used for locale canonicalization. Each field defines a
/// mapping from an old identifier to a new identifier, based upon the rules in
/// <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>. The data
/// is stored in sorted order, allowing for binary search to identify rules to
/// apply. It is broken down into smaller vectors based upon some characteristic
/// of the data, to help avoid unnecessary searches. For example, the `sgn_region`
/// field contains aliases for sign language and region, so that it is not
/// necessary to search the data unless the input is a sign language.
///
/// The algorithm in tr35 is not guaranteed to terminate on data other than what
/// is currently in CLDR. For this reason, it is not a good idea to attempt to add
/// or modify aliases for use in this structure.
///
/// This is the first version of the struct. [`AliasesV2`] differs only in the
/// element type of `language_variants`; conversions between the two versions are
/// provided through `TryFrom<AliasesV1>` and, with the `datagen` feature,
/// `From<AliasesV2>`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
// TODO: Use validated types as value types
#[derive(Debug)]
pub struct AliasesV1<'data> {
    /// `[language(-variant)+] -> [langid]`
    /// This is not a map as it's searched linearly according to the canonicalization rules.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub language_variants: VarZeroVec<'data, UnvalidatedLanguageIdentifierPair>,
    /// `sgn-[region] -> [language]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub sgn_region: ZeroMap<'data, UnvalidatedRegion, Language>,
    /// `[language{2}] -> [langid]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub language_len2: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, UnvalidatedLanguageIdentifier>,
    /// `[language{3}] -> [langid]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub language_len3: ZeroMap<'data, UnvalidatedLanguage, UnvalidatedLanguageIdentifier>,
    /// `[langid] -> [langid]`
    /// This is not a map as it's searched linearly according to the canonicalization rules.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub language: VarZeroVec<'data, UnvalidatedLanguageIdentifierPair>,

    /// `[script] -> [script]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub script: ZeroMap<'data, UnvalidatedScript, Script>,

    /// `[region{2}] -> [region]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub region_alpha: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, Region>,
    /// `[region{3}] -> [region]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub region_num: ZeroMap<'data, UnvalidatedRegion, Region>,

    /// `[region] -> [region]+`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub complex_region: ZeroMap<'data, UnvalidatedRegion, ZeroSlice<Region>>,

    /// `[variant] -> [variant]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub variant: ZeroMap<'data, UnvalidatedVariant, Variant>,

    /// `[value{7}] -> [value{7}]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub subdivision: ZeroMap<'data, UnvalidatedSubdivision, SemivalidatedSubdivision>,
}
| 82 | |
/// Converts the V2 alias data back into the V1 layout.
///
/// Only `language_variants` needs rebuilding: each V2 entry keeps its language
/// apart from the variants, while V1 stores both joined by `-` in a single
/// string. All remaining fields are moved across untouched.
#[cfg(feature = "datagen")]
impl<'data> From<AliasesV2<'data>> for AliasesV1<'data> {
    fn from(value: AliasesV2<'data>) -> Self {
        let AliasesV2 {
            language_variants,
            sgn_region,
            language_len2,
            language_len3,
            language,
            script,
            region_alpha,
            region_num,
            complex_region,
            variant,
            subdivision,
        } = value;

        // Re-join every `(language, variants, replacement)` triple into the
        // `("language-variants", replacement)` pair that V1 stores.
        let mut joined_pairs = alloc::vec::Vec::with_capacity(language_variants.len());
        for ule in language_variants.iter() {
            let entry: LanguageStrStrPair = zerofrom::ZeroFrom::zero_from(ule);
            let key = alloc::format!("{0}-{1}", entry.0, entry.1);
            joined_pairs.push(StrStrPair(key.into(), entry.2));
        }

        Self {
            language_variants: VarZeroVec::from(&joined_pairs),
            sgn_region,
            language_len2,
            language_len3,
            language,
            script,
            region_alpha,
            region_num,
            complex_region,
            variant,
            subdivision,
        }
    }
}
| 111 | |
| 112 | impl<'data> TryFrom<AliasesV1<'data>> for AliasesV2<'data> { |
| 113 | type Error = icu_provider::DataError; |
| 114 | |
| 115 | fn try_from(value: AliasesV1<'data>) -> Result<Self, Self::Error> { |
| 116 | #[allow (unused_imports)] |
| 117 | use alloc::borrow::ToOwned; |
| 118 | |
| 119 | let language_variants = value |
| 120 | .language_variants |
| 121 | .iter() |
| 122 | .map(zerofrom::ZeroFrom::zero_from) |
| 123 | .map(|v: StrStrPair| -> Result<LanguageStrStrPair, DataError> { |
| 124 | let (lang, variant) = |
| 125 | v.0.split_once('-' ) |
| 126 | .ok_or_else(|| DataError::custom("Each pair should be language-variant" ))?; |
| 127 | let lang: Language = lang |
| 128 | .parse() |
| 129 | .map_err(|_| DataError::custom("Language should be a valid language subtag" ))?; |
| 130 | Ok(LanguageStrStrPair(lang, variant.to_owned().into(), v.1)) |
| 131 | }) |
| 132 | .collect::<Result<alloc::vec::Vec<_>, _>>()?; |
| 133 | |
| 134 | Ok(Self { |
| 135 | language_variants: VarZeroVec::from(&language_variants), |
| 136 | sgn_region: value.sgn_region, |
| 137 | language_len2: value.language_len2, |
| 138 | language_len3: value.language_len3, |
| 139 | language: value.language, |
| 140 | script: value.script, |
| 141 | region_alpha: value.region_alpha, |
| 142 | region_num: value.region_num, |
| 143 | complex_region: value.complex_region, |
| 144 | variant: value.variant, |
| 145 | subdivision: value.subdivision, |
| 146 | }) |
| 147 | } |
| 148 | } |
| 149 | |
#[icu_provider::data_struct(marker(AliasesV2Marker, "locid_transform/aliases@2", singleton))]
#[derive(PartialEq, Clone, Default)]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize, databake::Bake),
    databake(path = icu_locid_transform::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
/// This alias data is used for locale canonicalization. Each field defines a
/// mapping from an old identifier to a new identifier, based upon the rules in
/// <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>. The data
/// is stored in sorted order, allowing for binary search to identify rules to
/// apply. It is broken down into smaller vectors based upon some characteristic
/// of the data, to help avoid unnecessary searches. For example, the `sgn_region`
/// field contains aliases for sign language and region, so that it is not
/// necessary to search the data unless the input is a sign language.
///
/// The algorithm in tr35 is not guaranteed to terminate on data other than what
/// is currently in CLDR. For this reason, it is not a good idea to attempt to add
/// or modify aliases for use in this structure.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
// TODO: Use validated types as value types
// Notice: V2 improves the alignment of `language_variants`, speeding up canonicalization by up
// to 40%. See https://github.com/unicode-org/icu4x/pull/2935 for details.
#[derive(Debug)]
pub struct AliasesV2<'data> {
    /// `[language, variant(-variant)*] -> [langid]`
    /// This is not a map as it's searched linearly according to the canonicalization rules.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub language_variants: VarZeroVec<'data, UnvalidatedLanguageVariantsPair>,
    /// `sgn-[region] -> [language]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub sgn_region: ZeroMap<'data, UnvalidatedRegion, Language>,
    /// `[language{2}] -> [langid]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub language_len2: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, UnvalidatedLanguageIdentifier>,
    /// `[language{3}] -> [langid]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub language_len3: ZeroMap<'data, UnvalidatedLanguage, UnvalidatedLanguageIdentifier>,
    /// `[langid] -> [langid]`
    /// This is not a map as it's searched linearly according to the canonicalization rules.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub language: VarZeroVec<'data, UnvalidatedLanguageIdentifierPair>,

    /// `[script] -> [script]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub script: ZeroMap<'data, UnvalidatedScript, Script>,

    /// `[region{2}] -> [region]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub region_alpha: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, Region>,
    /// `[region{3}] -> [region]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub region_num: ZeroMap<'data, UnvalidatedRegion, Region>,

    /// `[region] -> [region]+`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub complex_region: ZeroMap<'data, UnvalidatedRegion, ZeroSlice<Region>>,

    /// `[variant] -> [variant]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub variant: ZeroMap<'data, UnvalidatedVariant, Variant>,

    /// `[value{7}] -> [value{7}]`
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub subdivision: ZeroMap<'data, UnvalidatedSubdivision, SemivalidatedSubdivision>,
}
| 223 | |