| 1 | // This file is part of ICU4X. For terms of use, please see the file |
| 2 | // called LICENSE at the top level of the ICU4X source tree |
| 3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
| 4 | |
| 5 | //! Unicode Extensions provide a mechanism to extend the [`LanguageIdentifier`] with |
| 6 | //! additional bits of information - a combination of a [`LanguageIdentifier`] and [`Extensions`] |
| 7 | //! is called [`Locale`]. |
| 8 | //! |
| 9 | //! There are four types of extensions: |
| 10 | //! |
| 11 | //! * [`Unicode Extensions`] - marked as `u`. |
| 12 | //! * [`Transform Extensions`] - marked as `t`. |
| 13 | //! * [`Private Use Extensions`] - marked as `x`. |
//! * [`Other Extensions`] - marked as any `a-z` except for `u`, `t` and `x`.
| 15 | //! |
//! One can think of extensions as a bag of extra information on top of the basic four [`subtags`].
| 17 | //! |
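//! Notice: `Other` extensions are parsed and preserved (see the `other` field of [`Extensions`]),
//! but they have no dedicated structured representation the way `u`, `t` and `x` do.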
| 19 | //! |
| 20 | //! # Examples |
| 21 | //! |
| 22 | //! ``` |
| 23 | //! use icu::locid::extensions::unicode::{Key, Value}; |
| 24 | //! use icu::locid::Locale; |
| 25 | //! |
| 26 | //! let loc: Locale = "en-US-u-ca-buddhist-t-en-us-h0-hybrid-x-foo" |
| 27 | //! .parse() |
| 28 | //! .expect("Failed to parse." ); |
| 29 | //! |
| 30 | //! assert_eq!(loc.id.language, "en" .parse().unwrap()); |
| 31 | //! assert_eq!(loc.id.script, None); |
| 32 | //! assert_eq!(loc.id.region, Some("US" .parse().unwrap())); |
| 33 | //! assert_eq!(loc.id.variants.len(), 0); |
| 34 | //! |
| 35 | //! let key: Key = "ca" .parse().expect("Parsing key failed." ); |
| 36 | //! let value: Value = "buddhist" .parse().expect("Parsing value failed." ); |
| 37 | //! assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value)); |
| 38 | //! ``` |
| 39 | //! |
| 40 | //! [`LanguageIdentifier`]: super::LanguageIdentifier |
| 41 | //! [`Locale`]: super::Locale |
| 42 | //! [`subtags`]: super::subtags |
| 43 | //! [`Other Extensions`]: other |
| 44 | //! [`Private Use Extensions`]: private |
| 45 | //! [`Transform Extensions`]: transform |
| 46 | //! [`Unicode Extensions`]: unicode |
| 47 | pub mod other; |
| 48 | pub mod private; |
| 49 | pub mod transform; |
| 50 | pub mod unicode; |
| 51 | |
| 52 | use core::cmp::Ordering; |
| 53 | |
| 54 | use other::Other; |
| 55 | use private::Private; |
| 56 | use transform::Transform; |
| 57 | use unicode::Unicode; |
| 58 | |
| 59 | use alloc::vec::Vec; |
| 60 | |
| 61 | use crate::parser::ParserError; |
| 62 | use crate::parser::SubtagIterator; |
| 63 | use crate::subtags; |
| 64 | |
/// Identifies which kind of extension a singleton subtag introduces.
///
/// The declaration order of the variants is significant: the derived
/// `PartialOrd`/`Ord` implementations compare variants in this order.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum ExtensionType {
    /// The transform extension, introduced by the singleton `t`.
    Transform,
    /// The Unicode extension, introduced by the singleton `u`.
    Unicode,
    /// The private-use extension, introduced by the singleton `x`.
    Private,
    /// Any other extension; carries the singleton byte that introduced it.
    Other(u8),
}
| 78 | |
| 79 | impl ExtensionType { |
| 80 | pub(crate) const fn try_from_byte(key: u8) -> Result<Self, ParserError> { |
| 81 | let key: u8 = key.to_ascii_lowercase(); |
| 82 | match key { |
| 83 | b'u' => Ok(Self::Unicode), |
| 84 | b't' => Ok(Self::Transform), |
| 85 | b'x' => Ok(Self::Private), |
| 86 | b'a' ..=b'z' => Ok(Self::Other(key)), |
| 87 | _ => Err(ParserError::InvalidExtension), |
| 88 | } |
| 89 | } |
| 90 | |
| 91 | pub(crate) const fn try_from_bytes_manual_slice( |
| 92 | bytes: &[u8], |
| 93 | start: usize, |
| 94 | end: usize, |
| 95 | ) -> Result<Self, ParserError> { |
| 96 | if end - start != 1 { |
| 97 | return Err(ParserError::InvalidExtension); |
| 98 | } |
| 99 | #[allow (clippy::indexing_slicing)] |
| 100 | Self::try_from_byte(key:bytes[start]) |
| 101 | } |
| 102 | } |
| 103 | |
| 104 | /// A map of extensions associated with a given [`Locale`](crate::Locale). |
| 105 | #[derive (Debug, Default, PartialEq, Eq, Clone, Hash)] |
| 106 | #[non_exhaustive ] |
| 107 | pub struct Extensions { |
| 108 | /// A representation of the data for a Unicode extension, when present in the locale identifier. |
| 109 | pub unicode: Unicode, |
| 110 | /// A representation of the data for a transform extension, when present in the locale identifier. |
| 111 | pub transform: Transform, |
| 112 | /// A representation of the data for a private-use extension, when present in the locale identifier. |
| 113 | pub private: Private, |
| 114 | /// A sequence of any other extensions that are present in the locale identifier but are not formally |
| 115 | /// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`], |
| 116 | /// and [`Private`] are. |
| 117 | pub other: Vec<Other>, |
| 118 | } |
| 119 | |
| 120 | impl Extensions { |
| 121 | /// Returns a new empty map of extensions. Same as [`default()`](Default::default()), but is `const`. |
| 122 | /// |
| 123 | /// # Examples |
| 124 | /// |
| 125 | /// ``` |
| 126 | /// use icu::locid::extensions::Extensions; |
| 127 | /// |
| 128 | /// assert_eq!(Extensions::new(), Extensions::default()); |
| 129 | /// ``` |
| 130 | #[inline ] |
| 131 | pub const fn new() -> Self { |
| 132 | Self { |
| 133 | unicode: Unicode::new(), |
| 134 | transform: Transform::new(), |
| 135 | private: Private::new(), |
| 136 | other: Vec::new(), |
| 137 | } |
| 138 | } |
| 139 | |
| 140 | /// Function to create a new map of extensions containing exactly one unicode extension, callable in `const` |
| 141 | /// context. |
| 142 | #[inline ] |
| 143 | pub const fn from_unicode(unicode: Unicode) -> Self { |
| 144 | Self { |
| 145 | unicode, |
| 146 | transform: Transform::new(), |
| 147 | private: Private::new(), |
| 148 | other: Vec::new(), |
| 149 | } |
| 150 | } |
| 151 | |
| 152 | /// Returns whether there are no extensions present. |
| 153 | /// |
| 154 | /// # Examples |
| 155 | /// |
| 156 | /// ``` |
| 157 | /// use icu::locid::Locale; |
| 158 | /// |
| 159 | /// let loc: Locale = "en-US-u-foo" .parse().expect("Parsing failed." ); |
| 160 | /// |
| 161 | /// assert!(!loc.extensions.is_empty()); |
| 162 | /// ``` |
| 163 | pub fn is_empty(&self) -> bool { |
| 164 | self.unicode.is_empty() |
| 165 | && self.transform.is_empty() |
| 166 | && self.private.is_empty() |
| 167 | && self.other.is_empty() |
| 168 | } |
| 169 | |
| 170 | #[allow (clippy::type_complexity)] |
| 171 | pub(crate) fn as_tuple( |
| 172 | &self, |
| 173 | ) -> ( |
| 174 | (&unicode::Attributes, &unicode::Keywords), |
| 175 | ( |
| 176 | Option<( |
| 177 | subtags::Language, |
| 178 | Option<subtags::Script>, |
| 179 | Option<subtags::Region>, |
| 180 | &subtags::Variants, |
| 181 | )>, |
| 182 | &transform::Fields, |
| 183 | ), |
| 184 | &private::Private, |
| 185 | &[other::Other], |
| 186 | ) { |
| 187 | ( |
| 188 | self.unicode.as_tuple(), |
| 189 | self.transform.as_tuple(), |
| 190 | &self.private, |
| 191 | &self.other, |
| 192 | ) |
| 193 | } |
| 194 | |
| 195 | /// Returns an ordering suitable for use in [`BTreeSet`]. |
| 196 | /// |
| 197 | /// The ordering may or may not be equivalent to string ordering, and it |
| 198 | /// may or may not be stable across ICU4X releases. |
| 199 | /// |
| 200 | /// [`BTreeSet`]: alloc::collections::BTreeSet |
| 201 | pub fn total_cmp(&self, other: &Self) -> Ordering { |
| 202 | self.as_tuple().cmp(&other.as_tuple()) |
| 203 | } |
| 204 | |
| 205 | /// Retains the specified extension types, clearing all others. |
| 206 | /// |
| 207 | /// # Examples |
| 208 | /// |
| 209 | /// ``` |
| 210 | /// use icu::locid::extensions::ExtensionType; |
| 211 | /// use icu::locid::Locale; |
| 212 | /// |
| 213 | /// let loc: Locale = |
| 214 | /// "und-a-hello-t-mul-u-world-z-zzz-x-extra" .parse().unwrap(); |
| 215 | /// |
| 216 | /// let mut only_unicode = loc.clone(); |
| 217 | /// only_unicode |
| 218 | /// .extensions |
| 219 | /// .retain_by_type(|t| t == ExtensionType::Unicode); |
| 220 | /// assert_eq!(only_unicode, "und-u-world" .parse().unwrap()); |
| 221 | /// |
| 222 | /// let mut only_t_z = loc.clone(); |
| 223 | /// only_t_z.extensions.retain_by_type(|t| { |
| 224 | /// t == ExtensionType::Transform || t == ExtensionType::Other(b'z' ) |
| 225 | /// }); |
| 226 | /// assert_eq!(only_t_z, "und-t-mul-z-zzz" .parse().unwrap()); |
| 227 | /// ``` |
| 228 | pub fn retain_by_type<F>(&mut self, mut predicate: F) |
| 229 | where |
| 230 | F: FnMut(ExtensionType) -> bool, |
| 231 | { |
| 232 | if !predicate(ExtensionType::Unicode) { |
| 233 | self.unicode.clear(); |
| 234 | } |
| 235 | if !predicate(ExtensionType::Transform) { |
| 236 | self.transform.clear(); |
| 237 | } |
| 238 | if !predicate(ExtensionType::Private) { |
| 239 | self.private.clear(); |
| 240 | } |
| 241 | self.other |
| 242 | .retain(|o| predicate(ExtensionType::Other(o.get_ext_byte()))); |
| 243 | } |
| 244 | |
| 245 | pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParserError> { |
| 246 | let mut unicode = None; |
| 247 | let mut transform = None; |
| 248 | let mut private = None; |
| 249 | let mut other = Vec::new(); |
| 250 | |
| 251 | while let Some(subtag) = iter.next() { |
| 252 | if subtag.is_empty() { |
| 253 | return Err(ParserError::InvalidExtension); |
| 254 | } |
| 255 | match subtag.first().map(|b| ExtensionType::try_from_byte(*b)) { |
| 256 | Some(Ok(ExtensionType::Unicode)) => { |
| 257 | if unicode.is_some() { |
| 258 | return Err(ParserError::DuplicatedExtension); |
| 259 | } |
| 260 | unicode = Some(Unicode::try_from_iter(iter)?); |
| 261 | } |
| 262 | Some(Ok(ExtensionType::Transform)) => { |
| 263 | if transform.is_some() { |
| 264 | return Err(ParserError::DuplicatedExtension); |
| 265 | } |
| 266 | transform = Some(Transform::try_from_iter(iter)?); |
| 267 | } |
| 268 | Some(Ok(ExtensionType::Private)) => { |
| 269 | if private.is_some() { |
| 270 | return Err(ParserError::DuplicatedExtension); |
| 271 | } |
| 272 | private = Some(Private::try_from_iter(iter)?); |
| 273 | } |
| 274 | Some(Ok(ExtensionType::Other(ext))) => { |
| 275 | if other.iter().any(|o: &Other| o.get_ext_byte() == ext) { |
| 276 | return Err(ParserError::DuplicatedExtension); |
| 277 | } |
| 278 | let parsed = Other::try_from_iter(ext, iter)?; |
| 279 | if let Err(idx) = other.binary_search(&parsed) { |
| 280 | other.insert(idx, parsed); |
| 281 | } else { |
| 282 | return Err(ParserError::InvalidExtension); |
| 283 | } |
| 284 | } |
| 285 | _ => return Err(ParserError::InvalidExtension), |
| 286 | } |
| 287 | } |
| 288 | |
| 289 | Ok(Self { |
| 290 | unicode: unicode.unwrap_or_default(), |
| 291 | transform: transform.unwrap_or_default(), |
| 292 | private: private.unwrap_or_default(), |
| 293 | other, |
| 294 | }) |
| 295 | } |
| 296 | |
| 297 | pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> |
| 298 | where |
| 299 | F: FnMut(&str) -> Result<(), E>, |
| 300 | { |
| 301 | let mut wrote_tu = false; |
| 302 | // Alphabetic by singleton |
| 303 | self.other.iter().try_for_each(|other| { |
| 304 | if other.get_ext() > 't' && !wrote_tu { |
| 305 | // Since 't' and 'u' are next to each other in alphabetical |
| 306 | // order, write both now. |
| 307 | self.transform.for_each_subtag_str(f)?; |
| 308 | self.unicode.for_each_subtag_str(f)?; |
| 309 | wrote_tu = true; |
| 310 | } |
| 311 | other.for_each_subtag_str(f)?; |
| 312 | Ok(()) |
| 313 | })?; |
| 314 | |
| 315 | if !wrote_tu { |
| 316 | self.transform.for_each_subtag_str(f)?; |
| 317 | self.unicode.for_each_subtag_str(f)?; |
| 318 | } |
| 319 | |
| 320 | // Private must be written last, since it allows single character |
| 321 | // keys. Extensions must also be written in alphabetical order, |
| 322 | // which would seem to imply that other extensions `y` and `z` are |
| 323 | // invalid, but this is not specified. |
| 324 | self.private.for_each_subtag_str(f)?; |
| 325 | Ok(()) |
| 326 | } |
| 327 | } |
| 328 | |
// Macro defined elsewhere in the crate. NOTE(review): presumably it generates the
// `Writeable`/`Display` impls for `Extensions` on top of `for_each_subtag_str`,
// and the `_no_test` suffix suggests it emits no test of its own (hence the
// hand-written `test_writeable` below) — confirm against the macro definition.
impl_writeable_for_each_subtag_str_no_test!(Extensions);
| 330 | |
/// Checks that the extensions of parsed locales are written back as the
/// expected canonical strings.
#[test]
fn test_writeable() {
    use crate::Locale;
    use writeable::assert_writeable_eq;

    // An empty set of extensions writes nothing.
    assert_writeable_eq!(Extensions::new(), "");

    // (locale to parse, expected serialization of its extensions)
    let cases: [(&str, &str); 5] = [
        ("my-t-my-d0-zawgyi", "t-my-d0-zawgyi"),
        ("ar-SA-u-ca-islamic-civil", "u-ca-islamic-civil"),
        ("en-001-x-foo-bar", "x-foo-bar"),
        ("und-t-m0-true", "t-m0-true"),
        (
            "und-a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
            "a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
        ),
    ];

    for &(input, expected) in &cases {
        let locale = input.parse::<Locale>().unwrap();
        assert_writeable_eq!(locale.extensions, expected);
    }
}
| 363 | |