1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | //! Unicode Extensions provide a mechanism to extend the [`LanguageIdentifier`] with |
6 | //! additional bits of information - a combination of a [`LanguageIdentifier`] and [`Extensions`] |
7 | //! is called [`Locale`]. |
8 | //! |
9 | //! There are four types of extensions: |
10 | //! |
11 | //! * [`Unicode Extensions`] - marked as `u`. |
12 | //! * [`Transform Extensions`] - marked as `t`. |
13 | //! * [`Private Use Extensions`] - marked as `x`. |
14 | //! * [`Other Extensions`] - marked as any `a-z` except for `u`, `t` and `x`. |
15 | //! |
16 | //! One can think of extensions as a bag of extra information on top of the four basic [`subtags`]. |
17 | //! |
18 | //! Notice: `Other` extension type is currently not supported. |
19 | //! |
20 | //! # Examples |
21 | //! |
22 | //! ``` |
23 | //! use icu::locid::extensions::unicode::{Key, Value}; |
24 | //! use icu::locid::Locale; |
25 | //! |
26 | //! let loc: Locale = "en-US-u-ca-buddhist-t-en-US-h0-hybrid-x-foo" |
27 | //! .parse() |
28 | //! .expect("Failed to parse." ); |
29 | //! |
30 | //! assert_eq!(loc.id.language, "en" .parse().unwrap()); |
31 | //! assert_eq!(loc.id.script, None); |
32 | //! assert_eq!(loc.id.region, Some("US" .parse().unwrap())); |
33 | //! assert_eq!(loc.id.variants.len(), 0); |
34 | //! |
35 | //! let key: Key = "ca" .parse().expect("Parsing key failed." ); |
36 | //! let value: Value = "buddhist" .parse().expect("Parsing value failed." ); |
37 | //! assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value)); |
38 | //! ``` |
39 | //! |
40 | //! [`LanguageIdentifier`]: super::LanguageIdentifier |
41 | //! [`Locale`]: super::Locale |
42 | //! [`subtags`]: super::subtags |
43 | //! [`Other Extensions`]: other |
44 | //! [`Private Use Extensions`]: private |
45 | //! [`Transform Extensions`]: transform |
46 | //! [`Unicode Extensions`]: unicode |
47 | pub mod other; |
48 | pub mod private; |
49 | pub mod transform; |
50 | pub mod unicode; |
51 | |
52 | use other::Other; |
53 | use private::Private; |
54 | use transform::Transform; |
55 | use unicode::Unicode; |
56 | |
57 | use alloc::vec::Vec; |
58 | |
59 | use crate::parser::ParserError; |
60 | use crate::parser::SubtagIterator; |
61 | |
/// Identifies which kind of extension a singleton subtag introduces.
///
/// Variants are declared in the order `Transform`, `Unicode`, `Private`,
/// `Other`; the derived ordering follows that declaration order.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum ExtensionType {
    /// The transform extension, introduced by the singleton `t`.
    Transform,
    /// The Unicode extension, introduced by the singleton `u`.
    Unicode,
    /// The private-use extension, introduced by the singleton `x`.
    Private,
    /// Any extension other than `t`, `u` and `x`; the payload is its
    /// singleton byte.
    Other(u8),
}
75 | |
76 | impl ExtensionType { |
77 | pub(crate) const fn try_from_byte(key: u8) -> Result<Self, ParserError> { |
78 | let key: u8 = key.to_ascii_lowercase(); |
79 | match key { |
80 | b'u' => Ok(Self::Unicode), |
81 | b't' => Ok(Self::Transform), |
82 | b'x' => Ok(Self::Private), |
83 | b'a' ..=b'z' => Ok(Self::Other(key)), |
84 | _ => Err(ParserError::InvalidExtension), |
85 | } |
86 | } |
87 | |
88 | pub(crate) const fn try_from_bytes_manual_slice( |
89 | bytes: &[u8], |
90 | start: usize, |
91 | end: usize, |
92 | ) -> Result<Self, ParserError> { |
93 | if end - start != 1 { |
94 | return Err(ParserError::InvalidExtension); |
95 | } |
96 | #[allow (clippy::indexing_slicing)] |
97 | Self::try_from_byte(key:bytes[start]) |
98 | } |
99 | } |
100 | |
101 | /// A map of extensions associated with a given [`Locale`](crate::Locale). |
102 | #[derive (Debug, Default, PartialEq, Eq, Clone, Hash)] |
103 | #[non_exhaustive ] |
104 | pub struct Extensions { |
105 | /// A representation of the data for a Unicode extension, when present in the locale identifier. |
106 | pub unicode: Unicode, |
107 | /// A representation of the data for a transform extension, when present in the locale identifier. |
108 | pub transform: Transform, |
109 | /// A representation of the data for a private-use extension, when present in the locale identifier. |
110 | pub private: Private, |
111 | /// A sequence of any other extensions that are present in the locale identifier but are not formally |
112 | /// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`], |
113 | /// and [`Private`] are. |
114 | pub other: Vec<Other>, |
115 | } |
116 | |
117 | impl Extensions { |
118 | /// Returns a new empty map of extensions. Same as [`default()`](Default::default()), but is `const`. |
119 | /// |
120 | /// # Examples |
121 | /// |
122 | /// ``` |
123 | /// use icu::locid::extensions::Extensions; |
124 | /// |
125 | /// assert_eq!(Extensions::new(), Extensions::default()); |
126 | /// ``` |
127 | #[inline ] |
128 | pub const fn new() -> Self { |
129 | Self { |
130 | unicode: Unicode::new(), |
131 | transform: Transform::new(), |
132 | private: Private::new(), |
133 | other: Vec::new(), |
134 | } |
135 | } |
136 | |
137 | /// Function to create a new map of extensions containing exactly one unicode extension, callable in `const` |
138 | /// context. |
139 | #[inline ] |
140 | pub const fn from_unicode(unicode: Unicode) -> Self { |
141 | Self { |
142 | unicode, |
143 | transform: Transform::new(), |
144 | private: Private::new(), |
145 | other: Vec::new(), |
146 | } |
147 | } |
148 | |
149 | /// Returns whether there are no extensions present. |
150 | /// |
151 | /// # Examples |
152 | /// |
153 | /// ``` |
154 | /// use icu::locid::Locale; |
155 | /// |
156 | /// let loc: Locale = "en-US-u-foo" .parse().expect("Parsing failed." ); |
157 | /// |
158 | /// assert!(!loc.extensions.is_empty()); |
159 | /// ``` |
160 | pub fn is_empty(&self) -> bool { |
161 | self.unicode.is_empty() |
162 | && self.transform.is_empty() |
163 | && self.private.is_empty() |
164 | && self.other.is_empty() |
165 | } |
166 | |
167 | /// Retains the specified extension types, clearing all others. |
168 | /// |
169 | /// # Examples |
170 | /// |
171 | /// ``` |
172 | /// use icu::locid::extensions::ExtensionType; |
173 | /// use icu::locid::Locale; |
174 | /// |
175 | /// let loc: Locale = |
176 | /// "und-a-hello-t-mul-u-world-z-zzz-x-extra" .parse().unwrap(); |
177 | /// |
178 | /// let mut only_unicode = loc.clone(); |
179 | /// only_unicode |
180 | /// .extensions |
181 | /// .retain_by_type(|t| t == ExtensionType::Unicode); |
182 | /// assert_eq!(only_unicode, "und-u-world" .parse().unwrap()); |
183 | /// |
184 | /// let mut only_t_z = loc.clone(); |
185 | /// only_t_z.extensions.retain_by_type(|t| { |
186 | /// t == ExtensionType::Transform || t == ExtensionType::Other(b'z' ) |
187 | /// }); |
188 | /// assert_eq!(only_t_z, "und-t-mul-z-zzz" .parse().unwrap()); |
189 | /// ``` |
190 | pub fn retain_by_type<F>(&mut self, mut predicate: F) |
191 | where |
192 | F: FnMut(ExtensionType) -> bool, |
193 | { |
194 | if !predicate(ExtensionType::Unicode) { |
195 | self.unicode.clear(); |
196 | } |
197 | if !predicate(ExtensionType::Transform) { |
198 | self.transform.clear(); |
199 | } |
200 | if !predicate(ExtensionType::Private) { |
201 | self.private.clear(); |
202 | } |
203 | self.other |
204 | .retain(|o| predicate(ExtensionType::Other(o.get_ext_byte()))); |
205 | } |
206 | |
207 | pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParserError> { |
208 | let mut unicode = None; |
209 | let mut transform = None; |
210 | let mut private = None; |
211 | let mut other = Vec::new(); |
212 | |
213 | while let Some(subtag) = iter.next() { |
214 | if subtag.is_empty() { |
215 | return Err(ParserError::InvalidExtension); |
216 | } |
217 | match subtag.first().map(|b| ExtensionType::try_from_byte(*b)) { |
218 | Some(Ok(ExtensionType::Unicode)) => { |
219 | if unicode.is_some() { |
220 | return Err(ParserError::DuplicatedExtension); |
221 | } |
222 | unicode = Some(Unicode::try_from_iter(iter)?); |
223 | } |
224 | Some(Ok(ExtensionType::Transform)) => { |
225 | if transform.is_some() { |
226 | return Err(ParserError::DuplicatedExtension); |
227 | } |
228 | transform = Some(Transform::try_from_iter(iter)?); |
229 | } |
230 | Some(Ok(ExtensionType::Private)) => { |
231 | if private.is_some() { |
232 | return Err(ParserError::DuplicatedExtension); |
233 | } |
234 | private = Some(Private::try_from_iter(iter)?); |
235 | } |
236 | Some(Ok(ExtensionType::Other(ext))) => { |
237 | if other.iter().any(|o: &Other| o.get_ext_byte() == ext) { |
238 | return Err(ParserError::DuplicatedExtension); |
239 | } |
240 | let parsed = Other::try_from_iter(ext, iter)?; |
241 | if let Err(idx) = other.binary_search(&parsed) { |
242 | other.insert(idx, parsed); |
243 | } else { |
244 | return Err(ParserError::InvalidExtension); |
245 | } |
246 | } |
247 | _ => return Err(ParserError::InvalidExtension), |
248 | } |
249 | } |
250 | |
251 | Ok(Self { |
252 | unicode: unicode.unwrap_or_default(), |
253 | transform: transform.unwrap_or_default(), |
254 | private: private.unwrap_or_default(), |
255 | other, |
256 | }) |
257 | } |
258 | |
259 | pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> |
260 | where |
261 | F: FnMut(&str) -> Result<(), E>, |
262 | { |
263 | let mut wrote_tu = false; |
264 | // Alphabetic by singleton |
265 | self.other.iter().try_for_each(|other| { |
266 | if other.get_ext() > 't' && !wrote_tu { |
267 | // Since 't' and 'u' are next to each other in alphabetical |
268 | // order, write both now. |
269 | self.transform.for_each_subtag_str(f)?; |
270 | self.unicode.for_each_subtag_str(f)?; |
271 | wrote_tu = true; |
272 | } |
273 | other.for_each_subtag_str(f)?; |
274 | Ok(()) |
275 | })?; |
276 | |
277 | if !wrote_tu { |
278 | self.transform.for_each_subtag_str(f)?; |
279 | self.unicode.for_each_subtag_str(f)?; |
280 | } |
281 | |
282 | // Private must be written last, since it allows single character |
283 | // keys. Extensions must also be written in alphabetical order, |
284 | // which would seem to imply that other extensions `y` and `z` are |
285 | // invalid, but this is not specified. |
286 | self.private.for_each_subtag_str(f)?; |
287 | Ok(()) |
288 | } |
289 | } |
290 | |
// NOTE(review): this macro is defined elsewhere in the crate. Judging by its
// name it derives the `Writeable` implementation for `Extensions` from
// `Extensions::for_each_subtag_str` without emitting a generated test, which
// would explain the hand-written `test_writeable` below — confirm against the
// macro definition.
impl_writeable_for_each_subtag_str_no_test!(Extensions);
292 | |
/// Checks the serialized form of `Extensions` for an empty value and for
/// locales carrying transform, unicode, private-use and other extensions.
#[test]
fn test_writeable() {
    use crate::Locale;
    use writeable::assert_writeable_eq;

    // Parses a complete locale string and hands back only its extensions.
    let extensions_of = |locale: &str| locale.parse::<Locale>().unwrap().extensions;

    assert_writeable_eq!(Extensions::new(), "");
    assert_writeable_eq!(extensions_of("my-t-my-d0-zawgyi"), "t-my-d0-zawgyi");
    assert_writeable_eq!(
        extensions_of("ar-SA-u-ca-islamic-civil"),
        "u-ca-islamic-civil",
    );
    assert_writeable_eq!(extensions_of("en-001-x-foo-bar"), "x-foo-bar");
    assert_writeable_eq!(extensions_of("und-t-m0-true"), "t-m0-true");
    assert_writeable_eq!(
        extensions_of("und-a-foo-t-foo-u-foo-w-foo-z-foo-x-foo"),
        "a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
    );
}
325 | |