1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | #[allow (deprecated)] |
6 | use crate::ordering::SubtagOrderingResult; |
7 | use crate::parser::{ |
8 | parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension, |
9 | ParserError, ParserMode, SubtagIterator, |
10 | }; |
11 | use crate::{extensions, subtags, LanguageIdentifier}; |
12 | use alloc::string::String; |
13 | use core::cmp::Ordering; |
14 | use core::str::FromStr; |
15 | use tinystr::TinyAsciiStr; |
16 | use writeable::Writeable; |
17 | |
18 | /// A core struct representing a [`Unicode Locale Identifier`]. |
19 | /// |
20 | /// A locale is made of two parts: |
21 | /// * Unicode Language Identifier |
22 | /// * A set of Unicode Extensions |
23 | /// |
24 | /// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and |
25 | /// on top of that is able to parse, manipulate and serialize unicode extension fields. |
26 | /// |
27 | /// |
28 | /// # Examples |
29 | /// |
30 | /// ``` |
31 | /// use icu::locid::{ |
32 | /// extensions::unicode::{key, value}, |
33 | /// locale, |
34 | /// subtags::{language, region}, |
35 | /// }; |
36 | /// |
37 | /// let loc = locale!("en-US-u-ca-buddhist" ); |
38 | /// |
39 | /// assert_eq!(loc.id.language, language!("en" )); |
40 | /// assert_eq!(loc.id.script, None); |
41 | /// assert_eq!(loc.id.region, Some(region!("US" ))); |
42 | /// assert_eq!(loc.id.variants.len(), 0); |
43 | /// assert_eq!( |
44 | /// loc.extensions.unicode.keywords.get(&key!("ca" )), |
45 | /// Some(&value!("buddhist" )) |
46 | /// ); |
47 | /// ``` |
48 | /// |
49 | /// # Parsing |
50 | /// |
51 | /// Unicode recognizes three levels of standard conformance for a locale: |
52 | /// |
53 | /// * *well-formed* - syntactically correct |
54 | /// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types... |
55 | /// * *canonical* - valid and no deprecated codes or structure. |
56 | /// |
57 | /// At the moment parsing normalizes a well-formed locale identifier converting |
58 | /// `_` separators to `-` and adjusting casing to conform to the Unicode standard. |
59 | /// |
60 | /// Any bogus subtags will cause the parsing to fail with an error. |
61 | /// |
62 | /// No subtag validation or alias resolution is performed. |
63 | /// |
64 | /// # Examples |
65 | /// |
66 | /// ``` |
67 | /// use icu::locid::{subtags::*, Locale}; |
68 | /// |
69 | /// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12" |
70 | /// .parse() |
71 | /// .expect("Failed to parse." ); |
72 | /// |
73 | /// assert_eq!(loc.id.language, "en" .parse::<Language>().unwrap()); |
74 | /// assert_eq!(loc.id.script, "Latn" .parse::<Script>().ok()); |
75 | /// assert_eq!(loc.id.region, "US" .parse::<Region>().ok()); |
76 | /// assert_eq!( |
77 | /// loc.id.variants.get(0), |
78 | /// "valencia" .parse::<Variant>().ok().as_ref() |
79 | /// ); |
80 | /// ``` |
81 | /// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier |
82 | #[derive (Default, PartialEq, Eq, Clone, Hash)] |
83 | #[allow (clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro) |
84 | pub struct Locale { |
85 | /// The basic language/script/region components in the locale identifier along with any variants. |
86 | pub id: LanguageIdentifier, |
87 | /// Any extensions present in the locale identifier. |
88 | pub extensions: extensions::Extensions, |
89 | } |
90 | |
/// Pins the in-memory size of the locale types so that an accidental size change
/// shows up as a test failure.
#[test]
fn test_sizes() {
    use core::mem::size_of;

    // Language identifier and its subtags.
    assert_eq!(size_of::<subtags::Language>(), 3);
    assert_eq!(size_of::<subtags::Script>(), 4);
    assert_eq!(size_of::<subtags::Region>(), 3);
    assert_eq!(size_of::<subtags::Variant>(), 8);
    assert_eq!(size_of::<subtags::Variants>(), 16);
    assert_eq!(size_of::<LanguageIdentifier>(), 32);

    // Transform extension.
    assert_eq!(size_of::<extensions::transform::Transform>(), 56);
    assert_eq!(size_of::<Option<LanguageIdentifier>>(), 32);
    assert_eq!(size_of::<extensions::transform::Fields>(), 24);

    // Unicode, other and private extensions, and the aggregate.
    assert_eq!(size_of::<extensions::unicode::Attributes>(), 16);
    assert_eq!(size_of::<extensions::unicode::Keywords>(), 24);
    assert_eq!(size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(size_of::<extensions::private::Private>(), 16);
    assert_eq!(size_of::<extensions::Extensions>(), 136);

    // The full locale.
    assert_eq!(size_of::<Locale>(), 168);
}
112 | |
113 | impl Locale { |
114 | /// A constructor which takes a utf8 slice, parses it and |
115 | /// produces a well-formed [`Locale`]. |
116 | /// |
117 | /// # Examples |
118 | /// |
119 | /// ``` |
120 | /// use icu::locid::Locale; |
121 | /// |
122 | /// Locale::try_from_bytes(b"en-US-u-hc-h12" ).unwrap(); |
123 | /// ``` |
124 | pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> { |
125 | parse_locale(v) |
126 | } |
127 | |
128 | /// The default undefined locale "und". Same as [`default()`](Default::default()). |
129 | /// |
130 | /// # Examples |
131 | /// |
132 | /// ``` |
133 | /// use icu::locid::Locale; |
134 | /// |
135 | /// assert_eq!(Locale::default(), Locale::UND); |
136 | /// ``` |
137 | pub const UND: Self = Self { |
138 | id: LanguageIdentifier::UND, |
139 | extensions: extensions::Extensions::new(), |
140 | }; |
141 | |
142 | /// This is a best-effort operation that performs all available levels of canonicalization. |
143 | /// |
144 | /// At the moment the operation will normalize casing and the separator, but in the future |
145 | /// it may also validate and update from deprecated subtags to canonical ones. |
146 | /// |
147 | /// # Examples |
148 | /// |
149 | /// ``` |
150 | /// use icu::locid::Locale; |
151 | /// |
152 | /// assert_eq!( |
153 | /// Locale::canonicalize("pL_latn_pl-U-HC-H12" ).as_deref(), |
154 | /// Ok("pl-Latn-PL-u-hc-h12" ) |
155 | /// ); |
156 | /// ``` |
157 | pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> { |
158 | let locale = Self::try_from_bytes(input.as_ref())?; |
159 | Ok(locale.write_to_string().into_owned()) |
160 | } |
161 | |
162 | /// Compare this [`Locale`] with BCP-47 bytes. |
163 | /// |
164 | /// The return value is equivalent to what would happen if you first converted this |
165 | /// [`Locale`] to a BCP-47 string and then performed a byte comparison. |
166 | /// |
167 | /// This function is case-sensitive and results in a *total order*, so it is appropriate for |
168 | /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`. |
169 | /// |
170 | /// # Examples |
171 | /// |
172 | /// ``` |
173 | /// use icu::locid::Locale; |
174 | /// use std::cmp::Ordering; |
175 | /// |
176 | /// let bcp47_strings: &[&str] = &[ |
177 | /// "pl-Latn-PL" , |
178 | /// "und" , |
179 | /// "und-fonipa" , |
180 | /// "und-t-m0-true" , |
181 | /// "und-u-ca-hebrew" , |
182 | /// "und-u-ca-japanese" , |
183 | /// "zh" , |
184 | /// ]; |
185 | /// |
186 | /// for ab in bcp47_strings.windows(2) { |
187 | /// let a = ab[0]; |
188 | /// let b = ab[1]; |
189 | /// assert!(a.cmp(b) == Ordering::Less); |
190 | /// let a_loc = a.parse::<Locale>().unwrap(); |
191 | /// assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal); |
192 | /// assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less); |
193 | /// } |
194 | /// ``` |
195 | pub fn strict_cmp(&self, other: &[u8]) -> Ordering { |
196 | self.writeable_cmp_bytes(other) |
197 | } |
198 | |
199 | #[allow (clippy::type_complexity)] |
200 | pub(crate) fn as_tuple( |
201 | &self, |
202 | ) -> ( |
203 | ( |
204 | subtags::Language, |
205 | Option<subtags::Script>, |
206 | Option<subtags::Region>, |
207 | &subtags::Variants, |
208 | ), |
209 | ( |
210 | ( |
211 | &extensions::unicode::Attributes, |
212 | &extensions::unicode::Keywords, |
213 | ), |
214 | ( |
215 | Option<( |
216 | subtags::Language, |
217 | Option<subtags::Script>, |
218 | Option<subtags::Region>, |
219 | &subtags::Variants, |
220 | )>, |
221 | &extensions::transform::Fields, |
222 | ), |
223 | &extensions::private::Private, |
224 | &[extensions::other::Other], |
225 | ), |
226 | ) { |
227 | (self.id.as_tuple(), self.extensions.as_tuple()) |
228 | } |
229 | |
230 | /// Returns an ordering suitable for use in [`BTreeSet`]. |
231 | /// |
232 | /// The ordering may or may not be equivalent to string ordering, and it |
233 | /// may or may not be stable across ICU4X releases. |
234 | /// |
235 | /// [`BTreeSet`]: alloc::collections::BTreeSet |
236 | pub fn total_cmp(&self, other: &Self) -> Ordering { |
237 | self.as_tuple().cmp(&other.as_tuple()) |
238 | } |
239 | |
240 | /// Compare this [`Locale`] with an iterator of BCP-47 subtags. |
241 | /// |
242 | /// This function has the same equality semantics as [`Locale::strict_cmp`]. It is intended as |
243 | /// a more modular version that allows multiple subtag iterators to be chained together. |
244 | /// |
245 | /// For an additional example, see [`SubtagOrderingResult`]. |
246 | /// |
247 | /// # Examples |
248 | /// |
249 | /// ``` |
250 | /// use icu::locid::locale; |
251 | /// use std::cmp::Ordering; |
252 | /// |
253 | /// let subtags: &[&[u8]] = |
254 | /// &[b"ca" , b"ES" , b"valencia" , b"u" , b"ca" , b"hebrew" ]; |
255 | /// |
256 | /// let loc = locale!("ca-ES-valencia-u-ca-hebrew" ); |
257 | /// assert_eq!( |
258 | /// Ordering::Equal, |
259 | /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
260 | /// ); |
261 | /// |
262 | /// let loc = locale!("ca-ES-valencia" ); |
263 | /// assert_eq!( |
264 | /// Ordering::Less, |
265 | /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
266 | /// ); |
267 | /// |
268 | /// let loc = locale!("ca-ES-valencia-u-nu-arab" ); |
269 | /// assert_eq!( |
270 | /// Ordering::Greater, |
271 | /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
272 | /// ); |
273 | /// ``` |
274 | #[deprecated (since = "1.5.0" , note = "if you need this, please file an issue" )] |
275 | #[allow (deprecated)] |
276 | pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I> |
277 | where |
278 | I: Iterator<Item = &'l [u8]>, |
279 | { |
280 | let r = self.for_each_subtag_str(&mut |subtag| { |
281 | if let Some(other) = subtags.next() { |
282 | match subtag.as_bytes().cmp(other) { |
283 | Ordering::Equal => Ok(()), |
284 | not_equal => Err(not_equal), |
285 | } |
286 | } else { |
287 | Err(Ordering::Greater) |
288 | } |
289 | }); |
290 | match r { |
291 | Ok(_) => SubtagOrderingResult::Subtags(subtags), |
292 | Err(o) => SubtagOrderingResult::Ordering(o), |
293 | } |
294 | } |
295 | |
296 | /// Compare this `Locale` with a potentially unnormalized BCP-47 string. |
297 | /// |
298 | /// The return value is equivalent to what would happen if you first parsed the |
299 | /// BCP-47 string to a `Locale` and then performed a structural comparison. |
300 | /// |
301 | /// # Examples |
302 | /// |
303 | /// ``` |
304 | /// use icu::locid::Locale; |
305 | /// |
306 | /// let bcp47_strings: &[&str] = &[ |
307 | /// "pl-LaTn-pL" , |
308 | /// "uNd" , |
309 | /// "UND-FONIPA" , |
310 | /// "UnD-t-m0-TrUe" , |
311 | /// "uNd-u-CA-Japanese" , |
312 | /// "ZH" , |
313 | /// ]; |
314 | /// |
315 | /// for a in bcp47_strings { |
316 | /// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a)); |
317 | /// } |
318 | /// ``` |
319 | pub fn normalizing_eq(&self, other: &str) -> bool { |
320 | macro_rules! subtag_matches { |
321 | ($T:ty, $iter:ident, $expected:expr) => { |
322 | $iter |
323 | .next() |
324 | .map(|b| <$T>::try_from_bytes(b) == Ok($expected)) |
325 | .unwrap_or(false) |
326 | }; |
327 | } |
328 | |
329 | let mut iter = SubtagIterator::new(other.as_bytes()); |
330 | if !subtag_matches!(subtags::Language, iter, self.id.language) { |
331 | return false; |
332 | } |
333 | if let Some(ref script) = self.id.script { |
334 | if !subtag_matches!(subtags::Script, iter, *script) { |
335 | return false; |
336 | } |
337 | } |
338 | if let Some(ref region) = self.id.region { |
339 | if !subtag_matches!(subtags::Region, iter, *region) { |
340 | return false; |
341 | } |
342 | } |
343 | for variant in self.id.variants.iter() { |
344 | if !subtag_matches!(subtags::Variant, iter, *variant) { |
345 | return false; |
346 | } |
347 | } |
348 | if !self.extensions.is_empty() { |
349 | match extensions::Extensions::try_from_iter(&mut iter) { |
350 | Ok(exts) => { |
351 | if self.extensions != exts { |
352 | return false; |
353 | } |
354 | } |
355 | Err(_) => { |
356 | return false; |
357 | } |
358 | } |
359 | } |
360 | iter.next().is_none() |
361 | } |
362 | |
363 | #[doc (hidden)] |
364 | #[allow (clippy::type_complexity)] |
365 | pub const fn try_from_bytes_with_single_variant_single_keyword_unicode_extension( |
366 | v: &[u8], |
367 | ) -> Result< |
368 | ( |
369 | subtags::Language, |
370 | Option<subtags::Script>, |
371 | Option<subtags::Region>, |
372 | Option<subtags::Variant>, |
373 | Option<(extensions::unicode::Key, Option<TinyAsciiStr<8>>)>, |
374 | ), |
375 | ParserError, |
376 | > { |
377 | parse_locale_with_single_variant_single_keyword_unicode_keyword_extension( |
378 | v, |
379 | ParserMode::Locale, |
380 | ) |
381 | } |
382 | |
383 | pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> |
384 | where |
385 | F: FnMut(&str) -> Result<(), E>, |
386 | { |
387 | self.id.for_each_subtag_str(f)?; |
388 | self.extensions.for_each_subtag_str(f)?; |
389 | Ok(()) |
390 | } |
391 | } |
392 | |
393 | impl FromStr for Locale { |
394 | type Err = ParserError; |
395 | |
396 | fn from_str(source: &str) -> Result<Self, Self::Err> { |
397 | Self::try_from_bytes(source.as_bytes()) |
398 | } |
399 | } |
400 | |
401 | impl From<LanguageIdentifier> for Locale { |
402 | fn from(id: LanguageIdentifier) -> Self { |
403 | Self { |
404 | id, |
405 | extensions: extensions::Extensions::default(), |
406 | } |
407 | } |
408 | } |
409 | |
410 | impl From<Locale> for LanguageIdentifier { |
411 | fn from(loc: Locale) -> Self { |
412 | loc.id |
413 | } |
414 | } |
415 | |
416 | impl AsRef<LanguageIdentifier> for Locale { |
417 | #[inline (always)] |
418 | fn as_ref(&self) -> &LanguageIdentifier { |
419 | &self.id |
420 | } |
421 | } |
422 | |
423 | impl AsMut<LanguageIdentifier> for Locale { |
424 | fn as_mut(&mut self) -> &mut LanguageIdentifier { |
425 | &mut self.id |
426 | } |
427 | } |
428 | |
429 | impl core::fmt::Debug for Locale { |
430 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
431 | writeable::Writeable::write_to(self, sink:f) |
432 | } |
433 | } |
434 | |
435 | impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.write_to_string()); |
436 | |
437 | #[test ] |
438 | fn test_writeable() { |
439 | use writeable::assert_writeable_eq; |
440 | assert_writeable_eq!(Locale::UND, "und" ); |
441 | assert_writeable_eq!("und-001" .parse::<Locale>().unwrap(), "und-001" ); |
442 | assert_writeable_eq!("und-Mymr" .parse::<Locale>().unwrap(), "und-Mymr" ); |
443 | assert_writeable_eq!("my-Mymr-MM" .parse::<Locale>().unwrap(), "my-Mymr-MM" ); |
444 | assert_writeable_eq!( |
445 | "my-Mymr-MM-posix" .parse::<Locale>().unwrap(), |
446 | "my-Mymr-MM-posix" , |
447 | ); |
448 | assert_writeable_eq!( |
449 | "zh-macos-posix" .parse::<Locale>().unwrap(), |
450 | "zh-macos-posix" , |
451 | ); |
452 | assert_writeable_eq!( |
453 | "my-t-my-d0-zawgyi" .parse::<Locale>().unwrap(), |
454 | "my-t-my-d0-zawgyi" , |
455 | ); |
456 | assert_writeable_eq!( |
457 | "ar-SA-u-ca-islamic-civil" .parse::<Locale>().unwrap(), |
458 | "ar-SA-u-ca-islamic-civil" , |
459 | ); |
460 | assert_writeable_eq!( |
461 | "en-001-x-foo-bar" .parse::<Locale>().unwrap(), |
462 | "en-001-x-foo-bar" , |
463 | ); |
464 | assert_writeable_eq!("und-t-m0-true" .parse::<Locale>().unwrap(), "und-t-m0-true" ,); |
465 | } |
466 | |
467 | /// # Examples |
468 | /// |
469 | /// ``` |
470 | /// use icu::locid::Locale; |
471 | /// use icu::locid::{locale, subtags::language}; |
472 | /// |
473 | /// assert_eq!(Locale::from(language!("en" )), locale!("en" )); |
474 | /// ``` |
475 | impl From<subtags::Language> for Locale { |
476 | fn from(language: subtags::Language) -> Self { |
477 | Self { |
478 | id: language.into(), |
479 | ..Default::default() |
480 | } |
481 | } |
482 | } |
483 | |
484 | /// # Examples |
485 | /// |
486 | /// ``` |
487 | /// use icu::locid::Locale; |
488 | /// use icu::locid::{locale, subtags::script}; |
489 | /// |
490 | /// assert_eq!(Locale::from(Some(script!("latn" ))), locale!("und-Latn" )); |
491 | /// ``` |
492 | impl From<Option<subtags::Script>> for Locale { |
493 | fn from(script: Option<subtags::Script>) -> Self { |
494 | Self { |
495 | id: script.into(), |
496 | ..Default::default() |
497 | } |
498 | } |
499 | } |
500 | |
501 | /// # Examples |
502 | /// |
503 | /// ``` |
504 | /// use icu::locid::Locale; |
505 | /// use icu::locid::{locale, subtags::region}; |
506 | /// |
507 | /// assert_eq!(Locale::from(Some(region!("US" ))), locale!("und-US" )); |
508 | /// ``` |
509 | impl From<Option<subtags::Region>> for Locale { |
510 | fn from(region: Option<subtags::Region>) -> Self { |
511 | Self { |
512 | id: region.into(), |
513 | ..Default::default() |
514 | } |
515 | } |
516 | } |
517 | |
518 | /// # Examples |
519 | /// |
520 | /// ``` |
521 | /// use icu::locid::Locale; |
522 | /// use icu::locid::{ |
523 | /// locale, |
524 | /// subtags::{language, region, script}, |
525 | /// }; |
526 | /// |
527 | /// assert_eq!( |
528 | /// Locale::from(( |
529 | /// language!("en" ), |
530 | /// Some(script!("Latn" )), |
531 | /// Some(region!("US" )) |
532 | /// )), |
533 | /// locale!("en-Latn-US" ) |
534 | /// ); |
535 | /// ``` |
536 | impl |
537 | From<( |
538 | subtags::Language, |
539 | Option<subtags::Script>, |
540 | Option<subtags::Region>, |
541 | )> for Locale |
542 | { |
543 | fn from( |
544 | lsr: ( |
545 | subtags::Language, |
546 | Option<subtags::Script>, |
547 | Option<subtags::Region>, |
548 | ), |
549 | ) -> Self { |
550 | Self { |
551 | id: lsr.into(), |
552 | ..Default::default() |
553 | } |
554 | } |
555 | } |
556 | |