1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Unicode Extensions provide a mechanism to extend the [`LanguageIdentifier`] with
6//! additional bits of information - a combination of a [`LanguageIdentifier`] and [`Extensions`]
7//! is called [`Locale`].
8//!
9//! There are four types of extensions:
10//!
11//! * [`Unicode Extensions`] - marked as `u`.
12//! * [`Transform Extensions`] - marked as `t`.
13//! * [`Private Use Extensions`] - marked as `x`.
//! * [`Other Extensions`] - marked as any `a-z` except for `u`, `t` and `x`.
15//!
//! One can think of extensions as a bag of extra information on top of the four basic [`subtags`].
17//!
18//! Notice: `Other` extension type is currently not supported.
19//!
20//! # Examples
21//!
22//! ```
23//! use icu::locid::extensions::unicode::{Key, Value};
24//! use icu::locid::Locale;
25//!
26//! let loc: Locale = "en-US-u-ca-buddhist-t-en-US-h0-hybrid-x-foo"
27//! .parse()
28//! .expect("Failed to parse.");
29//!
30//! assert_eq!(loc.id.language, "en".parse().unwrap());
31//! assert_eq!(loc.id.script, None);
32//! assert_eq!(loc.id.region, Some("US".parse().unwrap()));
33//! assert_eq!(loc.id.variants.len(), 0);
34//!
35//! let key: Key = "ca".parse().expect("Parsing key failed.");
36//! let value: Value = "buddhist".parse().expect("Parsing value failed.");
37//! assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value));
38//! ```
39//!
40//! [`LanguageIdentifier`]: super::LanguageIdentifier
41//! [`Locale`]: super::Locale
42//! [`subtags`]: super::subtags
43//! [`Other Extensions`]: other
44//! [`Private Use Extensions`]: private
45//! [`Transform Extensions`]: transform
46//! [`Unicode Extensions`]: unicode
47pub mod other;
48pub mod private;
49pub mod transform;
50pub mod unicode;
51
52use other::Other;
53use private::Private;
54use transform::Transform;
55use unicode::Unicode;
56
57use alloc::vec::Vec;
58
59use crate::parser::ParserError;
60use crate::parser::SubtagIterator;
61
/// The kind of an extension, as identified by its single-letter singleton.
///
/// The variants are declared in the order `Transform`, `Unicode`, `Private`,
/// `Other`; the derived [`Ord`] and [`PartialOrd`] implementations follow that
/// declaration order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum ExtensionType {
    /// The transform extension, introduced by the singleton `t`.
    Transform,
    /// The Unicode extension, introduced by the singleton `u`.
    Unicode,
    /// The private-use extension, introduced by the singleton `x`.
    Private,
    /// Any other extension; the payload is the singleton byte that introduced it.
    Other(u8),
}
75
76impl ExtensionType {
77 pub(crate) const fn try_from_byte(key: u8) -> Result<Self, ParserError> {
78 let key: u8 = key.to_ascii_lowercase();
79 match key {
80 b'u' => Ok(Self::Unicode),
81 b't' => Ok(Self::Transform),
82 b'x' => Ok(Self::Private),
83 b'a'..=b'z' => Ok(Self::Other(key)),
84 _ => Err(ParserError::InvalidExtension),
85 }
86 }
87
88 pub(crate) const fn try_from_bytes_manual_slice(
89 bytes: &[u8],
90 start: usize,
91 end: usize,
92 ) -> Result<Self, ParserError> {
93 if end - start != 1 {
94 return Err(ParserError::InvalidExtension);
95 }
96 #[allow(clippy::indexing_slicing)]
97 Self::try_from_byte(key:bytes[start])
98 }
99}
100
101/// A map of extensions associated with a given [`Locale`](crate::Locale).
102#[derive(Debug, Default, PartialEq, Eq, Clone, Hash)]
103#[non_exhaustive]
104pub struct Extensions {
105 /// A representation of the data for a Unicode extension, when present in the locale identifier.
106 pub unicode: Unicode,
107 /// A representation of the data for a transform extension, when present in the locale identifier.
108 pub transform: Transform,
109 /// A representation of the data for a private-use extension, when present in the locale identifier.
110 pub private: Private,
111 /// A sequence of any other extensions that are present in the locale identifier but are not formally
112 /// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`],
113 /// and [`Private`] are.
114 pub other: Vec<Other>,
115}
116
117impl Extensions {
118 /// Returns a new empty map of extensions. Same as [`default()`](Default::default()), but is `const`.
119 ///
120 /// # Examples
121 ///
122 /// ```
123 /// use icu::locid::extensions::Extensions;
124 ///
125 /// assert_eq!(Extensions::new(), Extensions::default());
126 /// ```
127 #[inline]
128 pub const fn new() -> Self {
129 Self {
130 unicode: Unicode::new(),
131 transform: Transform::new(),
132 private: Private::new(),
133 other: Vec::new(),
134 }
135 }
136
137 /// Function to create a new map of extensions containing exactly one unicode extension, callable in `const`
138 /// context.
139 #[inline]
140 pub const fn from_unicode(unicode: Unicode) -> Self {
141 Self {
142 unicode,
143 transform: Transform::new(),
144 private: Private::new(),
145 other: Vec::new(),
146 }
147 }
148
149 /// Returns whether there are no extensions present.
150 ///
151 /// # Examples
152 ///
153 /// ```
154 /// use icu::locid::Locale;
155 ///
156 /// let loc: Locale = "en-US-u-foo".parse().expect("Parsing failed.");
157 ///
158 /// assert!(!loc.extensions.is_empty());
159 /// ```
160 pub fn is_empty(&self) -> bool {
161 self.unicode.is_empty()
162 && self.transform.is_empty()
163 && self.private.is_empty()
164 && self.other.is_empty()
165 }
166
167 /// Retains the specified extension types, clearing all others.
168 ///
169 /// # Examples
170 ///
171 /// ```
172 /// use icu::locid::extensions::ExtensionType;
173 /// use icu::locid::Locale;
174 ///
175 /// let loc: Locale =
176 /// "und-a-hello-t-mul-u-world-z-zzz-x-extra".parse().unwrap();
177 ///
178 /// let mut only_unicode = loc.clone();
179 /// only_unicode
180 /// .extensions
181 /// .retain_by_type(|t| t == ExtensionType::Unicode);
182 /// assert_eq!(only_unicode, "und-u-world".parse().unwrap());
183 ///
184 /// let mut only_t_z = loc.clone();
185 /// only_t_z.extensions.retain_by_type(|t| {
186 /// t == ExtensionType::Transform || t == ExtensionType::Other(b'z')
187 /// });
188 /// assert_eq!(only_t_z, "und-t-mul-z-zzz".parse().unwrap());
189 /// ```
190 pub fn retain_by_type<F>(&mut self, mut predicate: F)
191 where
192 F: FnMut(ExtensionType) -> bool,
193 {
194 if !predicate(ExtensionType::Unicode) {
195 self.unicode.clear();
196 }
197 if !predicate(ExtensionType::Transform) {
198 self.transform.clear();
199 }
200 if !predicate(ExtensionType::Private) {
201 self.private.clear();
202 }
203 self.other
204 .retain(|o| predicate(ExtensionType::Other(o.get_ext_byte())));
205 }
206
207 pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParserError> {
208 let mut unicode = None;
209 let mut transform = None;
210 let mut private = None;
211 let mut other = Vec::new();
212
213 while let Some(subtag) = iter.next() {
214 if subtag.is_empty() {
215 return Err(ParserError::InvalidExtension);
216 }
217 match subtag.first().map(|b| ExtensionType::try_from_byte(*b)) {
218 Some(Ok(ExtensionType::Unicode)) => {
219 if unicode.is_some() {
220 return Err(ParserError::DuplicatedExtension);
221 }
222 unicode = Some(Unicode::try_from_iter(iter)?);
223 }
224 Some(Ok(ExtensionType::Transform)) => {
225 if transform.is_some() {
226 return Err(ParserError::DuplicatedExtension);
227 }
228 transform = Some(Transform::try_from_iter(iter)?);
229 }
230 Some(Ok(ExtensionType::Private)) => {
231 if private.is_some() {
232 return Err(ParserError::DuplicatedExtension);
233 }
234 private = Some(Private::try_from_iter(iter)?);
235 }
236 Some(Ok(ExtensionType::Other(ext))) => {
237 if other.iter().any(|o: &Other| o.get_ext_byte() == ext) {
238 return Err(ParserError::DuplicatedExtension);
239 }
240 let parsed = Other::try_from_iter(ext, iter)?;
241 if let Err(idx) = other.binary_search(&parsed) {
242 other.insert(idx, parsed);
243 } else {
244 return Err(ParserError::InvalidExtension);
245 }
246 }
247 _ => return Err(ParserError::InvalidExtension),
248 }
249 }
250
251 Ok(Self {
252 unicode: unicode.unwrap_or_default(),
253 transform: transform.unwrap_or_default(),
254 private: private.unwrap_or_default(),
255 other,
256 })
257 }
258
259 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
260 where
261 F: FnMut(&str) -> Result<(), E>,
262 {
263 let mut wrote_tu = false;
264 // Alphabetic by singleton
265 self.other.iter().try_for_each(|other| {
266 if other.get_ext() > 't' && !wrote_tu {
267 // Since 't' and 'u' are next to each other in alphabetical
268 // order, write both now.
269 self.transform.for_each_subtag_str(f)?;
270 self.unicode.for_each_subtag_str(f)?;
271 wrote_tu = true;
272 }
273 other.for_each_subtag_str(f)?;
274 Ok(())
275 })?;
276
277 if !wrote_tu {
278 self.transform.for_each_subtag_str(f)?;
279 self.unicode.for_each_subtag_str(f)?;
280 }
281
282 // Private must be written last, since it allows single character
283 // keys. Extensions must also be written in alphabetical order,
284 // which would seem to imply that other extensions `y` and `z` are
285 // invalid, but this is not specified.
286 self.private.for_each_subtag_str(f)?;
287 Ok(())
288 }
289}
290
291impl_writeable_for_each_subtag_str_no_test!(Extensions);
292
#[test]
fn test_writeable() {
    use crate::Locale;
    use writeable::assert_writeable_eq;

    // An empty extension set writes as the empty string.
    assert_writeable_eq!(Extensions::new(), "");

    // Each pair is (locale to parse, expected output for its extensions alone).
    let cases: [(&str, &str); 5] = [
        ("my-t-my-d0-zawgyi", "t-my-d0-zawgyi"),
        ("ar-SA-u-ca-islamic-civil", "u-ca-islamic-civil"),
        ("en-001-x-foo-bar", "x-foo-bar"),
        ("und-t-m0-true", "t-m0-true"),
        (
            "und-a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
            "a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
        ),
    ];

    for &(input, expected) in cases.iter() {
        let locale = input.parse::<Locale>().unwrap();
        assert_writeable_eq!(locale.extensions, expected);
    }
}
325