1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | use crate::ordering::SubtagOrderingResult; |
6 | use crate::parser::{ |
7 | parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension, |
8 | ParserError, ParserMode, SubtagIterator, |
9 | }; |
10 | use crate::{extensions, subtags, LanguageIdentifier}; |
11 | use alloc::string::String; |
12 | use core::cmp::Ordering; |
13 | use core::str::FromStr; |
14 | use tinystr::TinyAsciiStr; |
15 | use writeable::Writeable; |
16 | |
17 | /// A core struct representing a [`Unicode Locale Identifier`]. |
18 | /// |
19 | /// A locale is made of two parts: |
20 | /// * Unicode Language Identifier |
21 | /// * A set of Unicode Extensions |
22 | /// |
23 | /// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and |
24 | /// on top of that is able to parse, manipulate and serialize unicode extension fields. |
25 | /// |
26 | /// |
27 | /// # Examples |
28 | /// |
29 | /// ``` |
30 | /// use icu_locid::{ |
31 | /// extensions::unicode::{key, value}, |
32 | /// locale, |
33 | /// subtags::{language, region}, |
34 | /// }; |
35 | /// |
36 | /// let loc = locale!("en-US-u-ca-buddhist" ); |
37 | /// |
38 | /// assert_eq!(loc.id.language, language!("en" )); |
39 | /// assert_eq!(loc.id.script, None); |
40 | /// assert_eq!(loc.id.region, Some(region!("US" ))); |
41 | /// assert_eq!(loc.id.variants.len(), 0); |
42 | /// assert_eq!( |
43 | /// loc.extensions.unicode.keywords.get(&key!("ca" )), |
44 | /// Some(&value!("buddhist" )) |
45 | /// ); |
46 | /// ``` |
47 | /// |
48 | /// # Parsing |
49 | /// |
50 | /// Unicode recognizes three levels of standard conformance for a locale: |
51 | /// |
52 | /// * *well-formed* - syntactically correct |
53 | /// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types... |
54 | /// * *canonical* - valid and no deprecated codes or structure. |
55 | /// |
56 | /// At the moment parsing normalizes a well-formed locale identifier converting |
57 | /// `_` separators to `-` and adjusting casing to conform to the Unicode standard. |
58 | /// |
59 | /// Any bogus subtags will cause the parsing to fail with an error. |
60 | /// |
61 | /// No subtag validation or alias resolution is performed. |
62 | /// |
63 | /// # Examples |
64 | /// |
65 | /// ``` |
66 | /// use icu::locid::{subtags::*, Locale}; |
67 | /// |
68 | /// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12" |
69 | /// .parse() |
70 | /// .expect("Failed to parse." ); |
71 | /// |
72 | /// assert_eq!(loc.id.language, "en" .parse::<Language>().unwrap()); |
73 | /// assert_eq!(loc.id.script, "Latn" .parse::<Script>().ok()); |
74 | /// assert_eq!(loc.id.region, "US" .parse::<Region>().ok()); |
75 | /// assert_eq!( |
76 | /// loc.id.variants.get(0), |
77 | /// "valencia" .parse::<Variant>().ok().as_ref() |
78 | /// ); |
79 | /// ``` |
80 | /// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier |
81 | #[derive (Default, PartialEq, Eq, Clone, Hash)] |
82 | #[allow (clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro) |
83 | pub struct Locale { |
84 | /// The basic language/script/region components in the locale identifier along with any variants. |
85 | pub id: LanguageIdentifier, |
86 | /// Any extensions present in the locale identifier. |
87 | pub extensions: extensions::Extensions, |
88 | } |
89 | |
/// Pins the in-memory size of the locale building blocks so that an accidental
/// layout change shows up as a test failure.
#[test]
fn test_sizes() {
    use core::mem::size_of;

    // Language identifier and its subtags.
    assert_eq!(size_of::<subtags::Language>(), 3);
    assert_eq!(size_of::<subtags::Script>(), 4);
    assert_eq!(size_of::<subtags::Region>(), 3);
    assert_eq!(size_of::<subtags::Variant>(), 8);
    assert_eq!(size_of::<subtags::Variants>(), 16);
    assert_eq!(size_of::<LanguageIdentifier>(), 32);

    // Transform extension.
    assert_eq!(size_of::<extensions::transform::Transform>(), 56);
    assert_eq!(size_of::<Option<LanguageIdentifier>>(), 32);
    assert_eq!(size_of::<extensions::transform::Fields>(), 24);

    // Unicode, other and private extensions, then the aggregate.
    assert_eq!(size_of::<extensions::unicode::Attributes>(), 16);
    assert_eq!(size_of::<extensions::unicode::Keywords>(), 24);
    assert_eq!(size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(size_of::<extensions::private::Private>(), 16);
    assert_eq!(size_of::<extensions::Extensions>(), 136);

    // The complete locale.
    assert_eq!(size_of::<Locale>(), 168);
}
111 | |
112 | impl Locale { |
113 | /// A constructor which takes a utf8 slice, parses it and |
114 | /// produces a well-formed [`Locale`]. |
115 | /// |
116 | /// # Examples |
117 | /// |
118 | /// ``` |
119 | /// use icu::locid::Locale; |
120 | /// |
121 | /// Locale::try_from_bytes(b"en-US-u-hc-h12" ).unwrap(); |
122 | /// ``` |
123 | pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> { |
124 | parse_locale(v) |
125 | } |
126 | |
127 | /// The default undefined locale "und". Same as [`default()`](Default::default()). |
128 | /// |
129 | /// # Examples |
130 | /// |
131 | /// ``` |
132 | /// use icu::locid::Locale; |
133 | /// |
134 | /// assert_eq!(Locale::default(), Locale::UND); |
135 | /// ``` |
136 | pub const UND: Self = Self { |
137 | id: LanguageIdentifier::UND, |
138 | extensions: extensions::Extensions::new(), |
139 | }; |
140 | |
141 | /// This is a best-effort operation that performs all available levels of canonicalization. |
142 | /// |
143 | /// At the moment the operation will normalize casing and the separator, but in the future |
144 | /// it may also validate and update from deprecated subtags to canonical ones. |
145 | /// |
146 | /// # Examples |
147 | /// |
148 | /// ``` |
149 | /// use icu::locid::Locale; |
150 | /// |
151 | /// assert_eq!( |
152 | /// Locale::canonicalize("pL_latn_pl-U-HC-H12" ).as_deref(), |
153 | /// Ok("pl-Latn-PL-u-hc-h12" ) |
154 | /// ); |
155 | /// ``` |
156 | pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> { |
157 | let locale = Self::try_from_bytes(input.as_ref())?; |
158 | Ok(locale.write_to_string().into_owned()) |
159 | } |
160 | |
161 | /// Compare this [`Locale`] with BCP-47 bytes. |
162 | /// |
163 | /// The return value is equivalent to what would happen if you first converted this |
164 | /// [`Locale`] to a BCP-47 string and then performed a byte comparison. |
165 | /// |
166 | /// This function is case-sensitive and results in a *total order*, so it is appropriate for |
167 | /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`. |
168 | /// |
169 | /// # Examples |
170 | /// |
171 | /// ``` |
172 | /// use icu::locid::Locale; |
173 | /// use std::cmp::Ordering; |
174 | /// |
175 | /// let bcp47_strings: &[&str] = &[ |
176 | /// "pl-Latn-PL" , |
177 | /// "und" , |
178 | /// "und-fonipa" , |
179 | /// "und-t-m0-true" , |
180 | /// "und-u-ca-hebrew" , |
181 | /// "und-u-ca-japanese" , |
182 | /// "zh" , |
183 | /// ]; |
184 | /// |
185 | /// for ab in bcp47_strings.windows(2) { |
186 | /// let a = ab[0]; |
187 | /// let b = ab[1]; |
188 | /// assert!(a.cmp(b) == Ordering::Less); |
189 | /// let a_loc = a.parse::<Locale>().unwrap(); |
190 | /// assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal); |
191 | /// assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less); |
192 | /// } |
193 | /// ``` |
194 | pub fn strict_cmp(&self, other: &[u8]) -> Ordering { |
195 | self.strict_cmp_iter(other.split(|b| *b == b'-' )).end() |
196 | } |
197 | |
198 | /// Compare this [`Locale`] with an iterator of BCP-47 subtags. |
199 | /// |
200 | /// This function has the same equality semantics as [`Locale::strict_cmp`]. It is intended as |
201 | /// a more modular version that allows multiple subtag iterators to be chained together. |
202 | /// |
203 | /// For an additional example, see [`SubtagOrderingResult`]. |
204 | /// |
205 | /// # Examples |
206 | /// |
207 | /// ``` |
208 | /// use icu::locid::locale; |
209 | /// use std::cmp::Ordering; |
210 | /// |
211 | /// let subtags: &[&[u8]] = |
212 | /// &[b"ca" , b"ES" , b"valencia" , b"u" , b"ca" , b"hebrew" ]; |
213 | /// |
214 | /// let loc = locale!("ca-ES-valencia-u-ca-hebrew" ); |
215 | /// assert_eq!( |
216 | /// Ordering::Equal, |
217 | /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
218 | /// ); |
219 | /// |
220 | /// let loc = locale!("ca-ES-valencia" ); |
221 | /// assert_eq!( |
222 | /// Ordering::Less, |
223 | /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
224 | /// ); |
225 | /// |
226 | /// let loc = locale!("ca-ES-valencia-u-nu-arab" ); |
227 | /// assert_eq!( |
228 | /// Ordering::Greater, |
229 | /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
230 | /// ); |
231 | /// ``` |
232 | pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I> |
233 | where |
234 | I: Iterator<Item = &'l [u8]>, |
235 | { |
236 | let r = self.for_each_subtag_str(&mut |subtag| { |
237 | if let Some(other) = subtags.next() { |
238 | match subtag.as_bytes().cmp(other) { |
239 | Ordering::Equal => Ok(()), |
240 | not_equal => Err(not_equal), |
241 | } |
242 | } else { |
243 | Err(Ordering::Greater) |
244 | } |
245 | }); |
246 | match r { |
247 | Ok(_) => SubtagOrderingResult::Subtags(subtags), |
248 | Err(o) => SubtagOrderingResult::Ordering(o), |
249 | } |
250 | } |
251 | |
252 | /// Compare this `Locale` with a potentially unnormalized BCP-47 string. |
253 | /// |
254 | /// The return value is equivalent to what would happen if you first parsed the |
255 | /// BCP-47 string to a `Locale` and then performed a structural comparison. |
256 | /// |
257 | /// # Examples |
258 | /// |
259 | /// ``` |
260 | /// use icu::locid::Locale; |
261 | /// use std::cmp::Ordering; |
262 | /// |
263 | /// let bcp47_strings: &[&str] = &[ |
264 | /// "pl-LaTn-pL" , |
265 | /// "uNd" , |
266 | /// "UND-FONIPA" , |
267 | /// "UnD-t-m0-TrUe" , |
268 | /// "uNd-u-CA-Japanese" , |
269 | /// "ZH" , |
270 | /// ]; |
271 | /// |
272 | /// for a in bcp47_strings { |
273 | /// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a)); |
274 | /// } |
275 | /// ``` |
276 | pub fn normalizing_eq(&self, other: &str) -> bool { |
277 | macro_rules! subtag_matches { |
278 | ($T:ty, $iter:ident, $expected:expr) => { |
279 | $iter |
280 | .next() |
281 | .map(|b| <$T>::try_from_bytes(b) == Ok($expected)) |
282 | .unwrap_or(false) |
283 | }; |
284 | } |
285 | |
286 | let mut iter = SubtagIterator::new(other.as_bytes()); |
287 | if !subtag_matches!(subtags::Language, iter, self.id.language) { |
288 | return false; |
289 | } |
290 | if let Some(ref script) = self.id.script { |
291 | if !subtag_matches!(subtags::Script, iter, *script) { |
292 | return false; |
293 | } |
294 | } |
295 | if let Some(ref region) = self.id.region { |
296 | if !subtag_matches!(subtags::Region, iter, *region) { |
297 | return false; |
298 | } |
299 | } |
300 | for variant in self.id.variants.iter() { |
301 | if !subtag_matches!(subtags::Variant, iter, *variant) { |
302 | return false; |
303 | } |
304 | } |
305 | if !self.extensions.is_empty() { |
306 | match extensions::Extensions::try_from_iter(&mut iter) { |
307 | Ok(exts) => { |
308 | if self.extensions != exts { |
309 | return false; |
310 | } |
311 | } |
312 | Err(_) => { |
313 | return false; |
314 | } |
315 | } |
316 | } |
317 | iter.next().is_none() |
318 | } |
319 | |
320 | #[doc (hidden)] |
321 | #[allow (clippy::type_complexity)] |
322 | pub const fn try_from_bytes_with_single_variant_single_keyword_unicode_extension( |
323 | v: &[u8], |
324 | ) -> Result< |
325 | ( |
326 | subtags::Language, |
327 | Option<subtags::Script>, |
328 | Option<subtags::Region>, |
329 | Option<subtags::Variant>, |
330 | Option<(extensions::unicode::Key, Option<TinyAsciiStr<8>>)>, |
331 | ), |
332 | ParserError, |
333 | > { |
334 | parse_locale_with_single_variant_single_keyword_unicode_keyword_extension( |
335 | v, |
336 | ParserMode::Locale, |
337 | ) |
338 | } |
339 | |
340 | pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> |
341 | where |
342 | F: FnMut(&str) -> Result<(), E>, |
343 | { |
344 | self.id.for_each_subtag_str(f)?; |
345 | self.extensions.for_each_subtag_str(f)?; |
346 | Ok(()) |
347 | } |
348 | } |
349 | |
350 | impl FromStr for Locale { |
351 | type Err = ParserError; |
352 | |
353 | fn from_str(source: &str) -> Result<Self, Self::Err> { |
354 | Self::try_from_bytes(source.as_bytes()) |
355 | } |
356 | } |
357 | |
358 | impl From<LanguageIdentifier> for Locale { |
359 | fn from(id: LanguageIdentifier) -> Self { |
360 | Self { |
361 | id, |
362 | extensions: extensions::Extensions::default(), |
363 | } |
364 | } |
365 | } |
366 | |
367 | impl From<Locale> for LanguageIdentifier { |
368 | fn from(loc: Locale) -> Self { |
369 | loc.id |
370 | } |
371 | } |
372 | |
373 | impl AsRef<LanguageIdentifier> for Locale { |
374 | fn as_ref(&self) -> &LanguageIdentifier { |
375 | &self.id |
376 | } |
377 | } |
378 | |
379 | impl AsMut<LanguageIdentifier> for Locale { |
380 | fn as_mut(&mut self) -> &mut LanguageIdentifier { |
381 | &mut self.id |
382 | } |
383 | } |
384 | |
385 | impl core::fmt::Debug for Locale { |
386 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
387 | writeable::Writeable::write_to(self, sink:f) |
388 | } |
389 | } |
390 | |
// NOTE(review): this macro is defined elsewhere in the crate. Judging by its name and by
// the `write_to` / `write_to_string` calls made on `Locale` in this file, it presumably
// generates the `Writeable` implementation by visiting each subtag - confirm at the macro
// definition. The trailing `condition => expression` argument looks like a shortcut that
// serializes only `id` when the locale has no extensions.
impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.write_to_string());
392 | |
/// Checks that already-normalized locale strings serialize back to themselves.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    assert_writeable_eq!(Locale::UND, "und");

    // Each tag is parsed and must then be written out unchanged.
    let round_trip_tags = [
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
        "my-t-my-d0-zawgyi",
        "ar-SA-u-ca-islamic-civil",
        "en-001-x-foo-bar",
        "und-t-m0-true",
    ];
    for tag in round_trip_tags.iter().copied() {
        assert_writeable_eq!(tag.parse::<Locale>().unwrap(), tag);
    }
}
422 | |
423 | /// # Examples |
424 | /// |
425 | /// ``` |
426 | /// use icu::locid::Locale; |
427 | /// use icu::locid::{locale, subtags::language}; |
428 | /// |
429 | /// assert_eq!(Locale::from(language!("en" )), locale!("en" )); |
430 | /// ``` |
431 | impl From<subtags::Language> for Locale { |
432 | fn from(language: subtags::Language) -> Self { |
433 | Self { |
434 | id: language.into(), |
435 | ..Default::default() |
436 | } |
437 | } |
438 | } |
439 | |
440 | /// # Examples |
441 | /// |
442 | /// ``` |
443 | /// use icu::locid::Locale; |
444 | /// use icu::locid::{locale, subtags::script}; |
445 | /// |
446 | /// assert_eq!(Locale::from(Some(script!("latn" ))), locale!("und-Latn" )); |
447 | /// ``` |
448 | impl From<Option<subtags::Script>> for Locale { |
449 | fn from(script: Option<subtags::Script>) -> Self { |
450 | Self { |
451 | id: script.into(), |
452 | ..Default::default() |
453 | } |
454 | } |
455 | } |
456 | |
457 | /// # Examples |
458 | /// |
459 | /// ``` |
460 | /// use icu::locid::Locale; |
461 | /// use icu::locid::{locale, subtags::region}; |
462 | /// |
463 | /// assert_eq!(Locale::from(Some(region!("US" ))), locale!("und-US" )); |
464 | /// ``` |
465 | impl From<Option<subtags::Region>> for Locale { |
466 | fn from(region: Option<subtags::Region>) -> Self { |
467 | Self { |
468 | id: region.into(), |
469 | ..Default::default() |
470 | } |
471 | } |
472 | } |
473 | |
474 | /// # Examples |
475 | /// |
476 | /// ``` |
477 | /// use icu::locid::Locale; |
478 | /// use icu::locid::{ |
479 | /// locale, |
480 | /// subtags::{language, region, script}, |
481 | /// }; |
482 | /// |
483 | /// assert_eq!( |
484 | /// Locale::from(( |
485 | /// language!("en" ), |
486 | /// Some(script!("Latn" )), |
487 | /// Some(region!("US" )) |
488 | /// )), |
489 | /// locale!("en-Latn-US" ) |
490 | /// ); |
491 | /// ``` |
492 | impl |
493 | From<( |
494 | subtags::Language, |
495 | Option<subtags::Script>, |
496 | Option<subtags::Region>, |
497 | )> for Locale |
498 | { |
499 | fn from( |
500 | lsr: ( |
501 | subtags::Language, |
502 | Option<subtags::Script>, |
503 | Option<subtags::Region>, |
504 | ), |
505 | ) -> Self { |
506 | Self { |
507 | id: lsr.into(), |
508 | ..Default::default() |
509 | } |
510 | } |
511 | } |
512 | |