1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | //! Unicode Extensions provide a mechanism to extend the [`LanguageIdentifier`] with |
6 | //! additional bits of information - a combination of a [`LanguageIdentifier`] and [`Extensions`] |
7 | //! is called [`Locale`]. |
8 | //! |
9 | //! There are four types of extensions: |
10 | //! |
11 | //! * [`Unicode Extensions`] - marked as `u`. |
12 | //! * [`Transform Extensions`] - marked as `t`. |
13 | //! * [`Private Use Extensions`] - marked as `x`. |
//! * [`Other Extensions`] - marked as any `a-z` except for `u`, `t` and `x`.
15 | //! |
16 | //! One can think of extensions as a bag of extra information on top of basic 4 [`subtags`]. |
17 | //! |
18 | //! Notice: `Other` extension type is currently not supported. |
19 | //! |
20 | //! # Examples |
21 | //! |
22 | //! ``` |
23 | //! use icu::locid::extensions::unicode::{Key, Value}; |
24 | //! use icu::locid::Locale; |
25 | //! |
26 | //! let loc: Locale = "en-US-u-ca-buddhist-t-en-us-h0-hybrid-x-foo" |
27 | //! .parse() |
28 | //! .expect("Failed to parse." ); |
29 | //! |
30 | //! assert_eq!(loc.id.language, "en" .parse().unwrap()); |
31 | //! assert_eq!(loc.id.script, None); |
32 | //! assert_eq!(loc.id.region, Some("US" .parse().unwrap())); |
33 | //! assert_eq!(loc.id.variants.len(), 0); |
34 | //! |
35 | //! let key: Key = "ca" .parse().expect("Parsing key failed." ); |
36 | //! let value: Value = "buddhist" .parse().expect("Parsing value failed." ); |
37 | //! assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value)); |
38 | //! ``` |
39 | //! |
40 | //! [`LanguageIdentifier`]: super::LanguageIdentifier |
41 | //! [`Locale`]: super::Locale |
42 | //! [`subtags`]: super::subtags |
43 | //! [`Other Extensions`]: other |
44 | //! [`Private Use Extensions`]: private |
45 | //! [`Transform Extensions`]: transform |
46 | //! [`Unicode Extensions`]: unicode |
47 | pub mod other; |
48 | pub mod private; |
49 | pub mod transform; |
50 | pub mod unicode; |
51 | |
52 | use core::cmp::Ordering; |
53 | |
54 | use other::Other; |
55 | use private::Private; |
56 | use transform::Transform; |
57 | use unicode::Unicode; |
58 | |
59 | use alloc::vec::Vec; |
60 | |
61 | use crate::parser::ParserError; |
62 | use crate::parser::SubtagIterator; |
63 | use crate::subtags; |
64 | |
/// Identifies which kind of extension a singleton subtag introduces.
// NOTE: the derived `PartialOrd`/`Ord` follow declaration order, so the
// variants must not be reordered.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum ExtensionType {
    /// The Transform extension, introduced by the singleton `t`.
    Transform,
    /// The Unicode extension, introduced by the singleton `u`.
    Unicode,
    /// The Private Use extension, introduced by the singleton `x`.
    Private,
    /// Any other extension; the payload is the singleton byte that introduces it.
    Other(u8),
}
78 | |
79 | impl ExtensionType { |
80 | pub(crate) const fn try_from_byte(key: u8) -> Result<Self, ParserError> { |
81 | let key: u8 = key.to_ascii_lowercase(); |
82 | match key { |
83 | b'u' => Ok(Self::Unicode), |
84 | b't' => Ok(Self::Transform), |
85 | b'x' => Ok(Self::Private), |
86 | b'a' ..=b'z' => Ok(Self::Other(key)), |
87 | _ => Err(ParserError::InvalidExtension), |
88 | } |
89 | } |
90 | |
91 | pub(crate) const fn try_from_bytes_manual_slice( |
92 | bytes: &[u8], |
93 | start: usize, |
94 | end: usize, |
95 | ) -> Result<Self, ParserError> { |
96 | if end - start != 1 { |
97 | return Err(ParserError::InvalidExtension); |
98 | } |
99 | #[allow (clippy::indexing_slicing)] |
100 | Self::try_from_byte(key:bytes[start]) |
101 | } |
102 | } |
103 | |
104 | /// A map of extensions associated with a given [`Locale`](crate::Locale). |
105 | #[derive (Debug, Default, PartialEq, Eq, Clone, Hash)] |
106 | #[non_exhaustive ] |
107 | pub struct Extensions { |
108 | /// A representation of the data for a Unicode extension, when present in the locale identifier. |
109 | pub unicode: Unicode, |
110 | /// A representation of the data for a transform extension, when present in the locale identifier. |
111 | pub transform: Transform, |
112 | /// A representation of the data for a private-use extension, when present in the locale identifier. |
113 | pub private: Private, |
114 | /// A sequence of any other extensions that are present in the locale identifier but are not formally |
115 | /// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`], |
116 | /// and [`Private`] are. |
117 | pub other: Vec<Other>, |
118 | } |
119 | |
120 | impl Extensions { |
121 | /// Returns a new empty map of extensions. Same as [`default()`](Default::default()), but is `const`. |
122 | /// |
123 | /// # Examples |
124 | /// |
125 | /// ``` |
126 | /// use icu::locid::extensions::Extensions; |
127 | /// |
128 | /// assert_eq!(Extensions::new(), Extensions::default()); |
129 | /// ``` |
130 | #[inline ] |
131 | pub const fn new() -> Self { |
132 | Self { |
133 | unicode: Unicode::new(), |
134 | transform: Transform::new(), |
135 | private: Private::new(), |
136 | other: Vec::new(), |
137 | } |
138 | } |
139 | |
140 | /// Function to create a new map of extensions containing exactly one unicode extension, callable in `const` |
141 | /// context. |
142 | #[inline ] |
143 | pub const fn from_unicode(unicode: Unicode) -> Self { |
144 | Self { |
145 | unicode, |
146 | transform: Transform::new(), |
147 | private: Private::new(), |
148 | other: Vec::new(), |
149 | } |
150 | } |
151 | |
152 | /// Returns whether there are no extensions present. |
153 | /// |
154 | /// # Examples |
155 | /// |
156 | /// ``` |
157 | /// use icu::locid::Locale; |
158 | /// |
159 | /// let loc: Locale = "en-US-u-foo" .parse().expect("Parsing failed." ); |
160 | /// |
161 | /// assert!(!loc.extensions.is_empty()); |
162 | /// ``` |
163 | pub fn is_empty(&self) -> bool { |
164 | self.unicode.is_empty() |
165 | && self.transform.is_empty() |
166 | && self.private.is_empty() |
167 | && self.other.is_empty() |
168 | } |
169 | |
170 | #[allow (clippy::type_complexity)] |
171 | pub(crate) fn as_tuple( |
172 | &self, |
173 | ) -> ( |
174 | (&unicode::Attributes, &unicode::Keywords), |
175 | ( |
176 | Option<( |
177 | subtags::Language, |
178 | Option<subtags::Script>, |
179 | Option<subtags::Region>, |
180 | &subtags::Variants, |
181 | )>, |
182 | &transform::Fields, |
183 | ), |
184 | &private::Private, |
185 | &[other::Other], |
186 | ) { |
187 | ( |
188 | self.unicode.as_tuple(), |
189 | self.transform.as_tuple(), |
190 | &self.private, |
191 | &self.other, |
192 | ) |
193 | } |
194 | |
195 | /// Returns an ordering suitable for use in [`BTreeSet`]. |
196 | /// |
197 | /// The ordering may or may not be equivalent to string ordering, and it |
198 | /// may or may not be stable across ICU4X releases. |
199 | /// |
200 | /// [`BTreeSet`]: alloc::collections::BTreeSet |
201 | pub fn total_cmp(&self, other: &Self) -> Ordering { |
202 | self.as_tuple().cmp(&other.as_tuple()) |
203 | } |
204 | |
205 | /// Retains the specified extension types, clearing all others. |
206 | /// |
207 | /// # Examples |
208 | /// |
209 | /// ``` |
210 | /// use icu::locid::extensions::ExtensionType; |
211 | /// use icu::locid::Locale; |
212 | /// |
213 | /// let loc: Locale = |
214 | /// "und-a-hello-t-mul-u-world-z-zzz-x-extra" .parse().unwrap(); |
215 | /// |
216 | /// let mut only_unicode = loc.clone(); |
217 | /// only_unicode |
218 | /// .extensions |
219 | /// .retain_by_type(|t| t == ExtensionType::Unicode); |
220 | /// assert_eq!(only_unicode, "und-u-world" .parse().unwrap()); |
221 | /// |
222 | /// let mut only_t_z = loc.clone(); |
223 | /// only_t_z.extensions.retain_by_type(|t| { |
224 | /// t == ExtensionType::Transform || t == ExtensionType::Other(b'z' ) |
225 | /// }); |
226 | /// assert_eq!(only_t_z, "und-t-mul-z-zzz" .parse().unwrap()); |
227 | /// ``` |
228 | pub fn retain_by_type<F>(&mut self, mut predicate: F) |
229 | where |
230 | F: FnMut(ExtensionType) -> bool, |
231 | { |
232 | if !predicate(ExtensionType::Unicode) { |
233 | self.unicode.clear(); |
234 | } |
235 | if !predicate(ExtensionType::Transform) { |
236 | self.transform.clear(); |
237 | } |
238 | if !predicate(ExtensionType::Private) { |
239 | self.private.clear(); |
240 | } |
241 | self.other |
242 | .retain(|o| predicate(ExtensionType::Other(o.get_ext_byte()))); |
243 | } |
244 | |
245 | pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParserError> { |
246 | let mut unicode = None; |
247 | let mut transform = None; |
248 | let mut private = None; |
249 | let mut other = Vec::new(); |
250 | |
251 | while let Some(subtag) = iter.next() { |
252 | if subtag.is_empty() { |
253 | return Err(ParserError::InvalidExtension); |
254 | } |
255 | match subtag.first().map(|b| ExtensionType::try_from_byte(*b)) { |
256 | Some(Ok(ExtensionType::Unicode)) => { |
257 | if unicode.is_some() { |
258 | return Err(ParserError::DuplicatedExtension); |
259 | } |
260 | unicode = Some(Unicode::try_from_iter(iter)?); |
261 | } |
262 | Some(Ok(ExtensionType::Transform)) => { |
263 | if transform.is_some() { |
264 | return Err(ParserError::DuplicatedExtension); |
265 | } |
266 | transform = Some(Transform::try_from_iter(iter)?); |
267 | } |
268 | Some(Ok(ExtensionType::Private)) => { |
269 | if private.is_some() { |
270 | return Err(ParserError::DuplicatedExtension); |
271 | } |
272 | private = Some(Private::try_from_iter(iter)?); |
273 | } |
274 | Some(Ok(ExtensionType::Other(ext))) => { |
275 | if other.iter().any(|o: &Other| o.get_ext_byte() == ext) { |
276 | return Err(ParserError::DuplicatedExtension); |
277 | } |
278 | let parsed = Other::try_from_iter(ext, iter)?; |
279 | if let Err(idx) = other.binary_search(&parsed) { |
280 | other.insert(idx, parsed); |
281 | } else { |
282 | return Err(ParserError::InvalidExtension); |
283 | } |
284 | } |
285 | _ => return Err(ParserError::InvalidExtension), |
286 | } |
287 | } |
288 | |
289 | Ok(Self { |
290 | unicode: unicode.unwrap_or_default(), |
291 | transform: transform.unwrap_or_default(), |
292 | private: private.unwrap_or_default(), |
293 | other, |
294 | }) |
295 | } |
296 | |
297 | pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> |
298 | where |
299 | F: FnMut(&str) -> Result<(), E>, |
300 | { |
301 | let mut wrote_tu = false; |
302 | // Alphabetic by singleton |
303 | self.other.iter().try_for_each(|other| { |
304 | if other.get_ext() > 't' && !wrote_tu { |
305 | // Since 't' and 'u' are next to each other in alphabetical |
306 | // order, write both now. |
307 | self.transform.for_each_subtag_str(f)?; |
308 | self.unicode.for_each_subtag_str(f)?; |
309 | wrote_tu = true; |
310 | } |
311 | other.for_each_subtag_str(f)?; |
312 | Ok(()) |
313 | })?; |
314 | |
315 | if !wrote_tu { |
316 | self.transform.for_each_subtag_str(f)?; |
317 | self.unicode.for_each_subtag_str(f)?; |
318 | } |
319 | |
320 | // Private must be written last, since it allows single character |
321 | // keys. Extensions must also be written in alphabetical order, |
322 | // which would seem to imply that other extensions `y` and `z` are |
323 | // invalid, but this is not specified. |
324 | self.private.for_each_subtag_str(f)?; |
325 | Ok(()) |
326 | } |
327 | } |
328 | |
// NOTE(review): this crate-local macro presumably derives the `Writeable`
// (and `Display`) implementations for `Extensions` from
// `Extensions::for_each_subtag_str`; the `_no_test` variant appears to omit
// the macro's generated test, which `test_writeable` below covers by hand.
// Confirm against the macro definition.
impl_writeable_for_each_subtag_str_no_test!(Extensions);
330 | |
/// Checks the serialized form of `Extensions`, both empty and as parsed from
/// a range of locale identifiers.
#[test]
fn test_writeable() {
    use crate::Locale;
    use writeable::assert_writeable_eq;

    // An empty set of extensions writes nothing at all.
    assert_writeable_eq!(Extensions::new(), "");

    // (locale identifier to parse, expected serialization of its extensions)
    const CASES: &[(&str, &str)] = &[
        ("my-t-my-d0-zawgyi", "t-my-d0-zawgyi"),
        ("ar-SA-u-ca-islamic-civil", "u-ca-islamic-civil"),
        ("en-001-x-foo-bar", "x-foo-bar"),
        ("und-t-m0-true", "t-m0-true"),
        (
            "und-a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
            "a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
        ),
    ];

    for &(tag, expected) in CASES {
        let locale = tag.parse::<Locale>().unwrap();
        assert_writeable_eq!(locale.extensions, expected);
    }
}
363 | |