1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::ordering::SubtagOrderingResult;
6use crate::parser::{
7 parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension,
8 ParserError, ParserMode, SubtagIterator,
9};
10use crate::{extensions, subtags, LanguageIdentifier};
11use alloc::string::String;
12use core::cmp::Ordering;
13use core::str::FromStr;
14use tinystr::TinyAsciiStr;
15use writeable::Writeable;
16
17/// A core struct representing a [`Unicode Locale Identifier`].
18///
19/// A locale is made of two parts:
20/// * Unicode Language Identifier
21/// * A set of Unicode Extensions
22///
23/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
24/// on top of that is able to parse, manipulate and serialize unicode extension fields.
25///
26///
27/// # Examples
28///
29/// ```
30/// use icu_locid::{
31/// extensions::unicode::{key, value},
32/// locale,
33/// subtags::{language, region},
34/// };
35///
36/// let loc = locale!("en-US-u-ca-buddhist");
37///
38/// assert_eq!(loc.id.language, language!("en"));
39/// assert_eq!(loc.id.script, None);
40/// assert_eq!(loc.id.region, Some(region!("US")));
41/// assert_eq!(loc.id.variants.len(), 0);
42/// assert_eq!(
43/// loc.extensions.unicode.keywords.get(&key!("ca")),
44/// Some(&value!("buddhist"))
45/// );
46/// ```
47///
48/// # Parsing
49///
50/// Unicode recognizes three levels of standard conformance for a locale:
51///
52/// * *well-formed* - syntactically correct
53/// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
54/// * *canonical* - valid and no deprecated codes or structure.
55///
56/// At the moment parsing normalizes a well-formed locale identifier converting
57/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
58///
59/// Any bogus subtags will cause the parsing to fail with an error.
60///
61/// No subtag validation or alias resolution is performed.
62///
63/// # Examples
64///
65/// ```
66/// use icu::locid::{subtags::*, Locale};
67///
68/// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12"
69/// .parse()
70/// .expect("Failed to parse.");
71///
72/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
73/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
74/// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
75/// assert_eq!(
76/// loc.id.variants.get(0),
77/// "valencia".parse::<Variant>().ok().as_ref()
78/// );
79/// ```
80/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
81#[derive(Default, PartialEq, Eq, Clone, Hash)]
82#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
83pub struct Locale {
84 /// The basic language/script/region components in the locale identifier along with any variants.
85 pub id: LanguageIdentifier,
86 /// Any extensions present in the locale identifier.
87 pub extensions: extensions::Extensions,
88}
89
/// Pins the in-memory size (in bytes) of the locale building blocks so that an
/// accidental layout regression shows up as a test failure.
#[test]
fn test_sizes() {
    use core::mem::size_of;

    // Language identifier and its subtags.
    assert_eq!(size_of::<subtags::Language>(), 3);
    assert_eq!(size_of::<subtags::Script>(), 4);
    assert_eq!(size_of::<subtags::Region>(), 3);
    assert_eq!(size_of::<subtags::Variant>(), 8);
    assert_eq!(size_of::<subtags::Variants>(), 16);
    assert_eq!(size_of::<LanguageIdentifier>(), 32);

    // Transform extension.
    assert_eq!(size_of::<extensions::transform::Transform>(), 56);
    assert_eq!(size_of::<Option<LanguageIdentifier>>(), 32);
    assert_eq!(size_of::<extensions::transform::Fields>(), 24);

    // Unicode, other and private-use extensions, and the aggregate.
    assert_eq!(size_of::<extensions::unicode::Attributes>(), 16);
    assert_eq!(size_of::<extensions::unicode::Keywords>(), 24);
    assert_eq!(size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(size_of::<extensions::private::Private>(), 16);
    assert_eq!(size_of::<extensions::Extensions>(), 136);

    // The full locale: identifier plus extensions.
    assert_eq!(size_of::<Locale>(), 168);
}
111
112impl Locale {
113 /// A constructor which takes a utf8 slice, parses it and
114 /// produces a well-formed [`Locale`].
115 ///
116 /// # Examples
117 ///
118 /// ```
119 /// use icu::locid::Locale;
120 ///
121 /// Locale::try_from_bytes(b"en-US-u-hc-h12").unwrap();
122 /// ```
123 pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
124 parse_locale(v)
125 }
126
127 /// The default undefined locale "und". Same as [`default()`](Default::default()).
128 ///
129 /// # Examples
130 ///
131 /// ```
132 /// use icu::locid::Locale;
133 ///
134 /// assert_eq!(Locale::default(), Locale::UND);
135 /// ```
136 pub const UND: Self = Self {
137 id: LanguageIdentifier::UND,
138 extensions: extensions::Extensions::new(),
139 };
140
141 /// This is a best-effort operation that performs all available levels of canonicalization.
142 ///
143 /// At the moment the operation will normalize casing and the separator, but in the future
144 /// it may also validate and update from deprecated subtags to canonical ones.
145 ///
146 /// # Examples
147 ///
148 /// ```
149 /// use icu::locid::Locale;
150 ///
151 /// assert_eq!(
152 /// Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(),
153 /// Ok("pl-Latn-PL-u-hc-h12")
154 /// );
155 /// ```
156 pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
157 let locale = Self::try_from_bytes(input.as_ref())?;
158 Ok(locale.write_to_string().into_owned())
159 }
160
161 /// Compare this [`Locale`] with BCP-47 bytes.
162 ///
163 /// The return value is equivalent to what would happen if you first converted this
164 /// [`Locale`] to a BCP-47 string and then performed a byte comparison.
165 ///
166 /// This function is case-sensitive and results in a *total order*, so it is appropriate for
167 /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
168 ///
169 /// # Examples
170 ///
171 /// ```
172 /// use icu::locid::Locale;
173 /// use std::cmp::Ordering;
174 ///
175 /// let bcp47_strings: &[&str] = &[
176 /// "pl-Latn-PL",
177 /// "und",
178 /// "und-fonipa",
179 /// "und-t-m0-true",
180 /// "und-u-ca-hebrew",
181 /// "und-u-ca-japanese",
182 /// "zh",
183 /// ];
184 ///
185 /// for ab in bcp47_strings.windows(2) {
186 /// let a = ab[0];
187 /// let b = ab[1];
188 /// assert!(a.cmp(b) == Ordering::Less);
189 /// let a_loc = a.parse::<Locale>().unwrap();
190 /// assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal);
191 /// assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less);
192 /// }
193 /// ```
194 pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
195 self.strict_cmp_iter(other.split(|b| *b == b'-')).end()
196 }
197
198 /// Compare this [`Locale`] with an iterator of BCP-47 subtags.
199 ///
200 /// This function has the same equality semantics as [`Locale::strict_cmp`]. It is intended as
201 /// a more modular version that allows multiple subtag iterators to be chained together.
202 ///
203 /// For an additional example, see [`SubtagOrderingResult`].
204 ///
205 /// # Examples
206 ///
207 /// ```
208 /// use icu::locid::locale;
209 /// use std::cmp::Ordering;
210 ///
211 /// let subtags: &[&[u8]] =
212 /// &[b"ca", b"ES", b"valencia", b"u", b"ca", b"hebrew"];
213 ///
214 /// let loc = locale!("ca-ES-valencia-u-ca-hebrew");
215 /// assert_eq!(
216 /// Ordering::Equal,
217 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
218 /// );
219 ///
220 /// let loc = locale!("ca-ES-valencia");
221 /// assert_eq!(
222 /// Ordering::Less,
223 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
224 /// );
225 ///
226 /// let loc = locale!("ca-ES-valencia-u-nu-arab");
227 /// assert_eq!(
228 /// Ordering::Greater,
229 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
230 /// );
231 /// ```
232 pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
233 where
234 I: Iterator<Item = &'l [u8]>,
235 {
236 let r = self.for_each_subtag_str(&mut |subtag| {
237 if let Some(other) = subtags.next() {
238 match subtag.as_bytes().cmp(other) {
239 Ordering::Equal => Ok(()),
240 not_equal => Err(not_equal),
241 }
242 } else {
243 Err(Ordering::Greater)
244 }
245 });
246 match r {
247 Ok(_) => SubtagOrderingResult::Subtags(subtags),
248 Err(o) => SubtagOrderingResult::Ordering(o),
249 }
250 }
251
252 /// Compare this `Locale` with a potentially unnormalized BCP-47 string.
253 ///
254 /// The return value is equivalent to what would happen if you first parsed the
255 /// BCP-47 string to a `Locale` and then performed a structural comparison.
256 ///
257 /// # Examples
258 ///
259 /// ```
260 /// use icu::locid::Locale;
261 /// use std::cmp::Ordering;
262 ///
263 /// let bcp47_strings: &[&str] = &[
264 /// "pl-LaTn-pL",
265 /// "uNd",
266 /// "UND-FONIPA",
267 /// "UnD-t-m0-TrUe",
268 /// "uNd-u-CA-Japanese",
269 /// "ZH",
270 /// ];
271 ///
272 /// for a in bcp47_strings {
273 /// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
274 /// }
275 /// ```
276 pub fn normalizing_eq(&self, other: &str) -> bool {
277 macro_rules! subtag_matches {
278 ($T:ty, $iter:ident, $expected:expr) => {
279 $iter
280 .next()
281 .map(|b| <$T>::try_from_bytes(b) == Ok($expected))
282 .unwrap_or(false)
283 };
284 }
285
286 let mut iter = SubtagIterator::new(other.as_bytes());
287 if !subtag_matches!(subtags::Language, iter, self.id.language) {
288 return false;
289 }
290 if let Some(ref script) = self.id.script {
291 if !subtag_matches!(subtags::Script, iter, *script) {
292 return false;
293 }
294 }
295 if let Some(ref region) = self.id.region {
296 if !subtag_matches!(subtags::Region, iter, *region) {
297 return false;
298 }
299 }
300 for variant in self.id.variants.iter() {
301 if !subtag_matches!(subtags::Variant, iter, *variant) {
302 return false;
303 }
304 }
305 if !self.extensions.is_empty() {
306 match extensions::Extensions::try_from_iter(&mut iter) {
307 Ok(exts) => {
308 if self.extensions != exts {
309 return false;
310 }
311 }
312 Err(_) => {
313 return false;
314 }
315 }
316 }
317 iter.next().is_none()
318 }
319
320 #[doc(hidden)]
321 #[allow(clippy::type_complexity)]
322 pub const fn try_from_bytes_with_single_variant_single_keyword_unicode_extension(
323 v: &[u8],
324 ) -> Result<
325 (
326 subtags::Language,
327 Option<subtags::Script>,
328 Option<subtags::Region>,
329 Option<subtags::Variant>,
330 Option<(extensions::unicode::Key, Option<TinyAsciiStr<8>>)>,
331 ),
332 ParserError,
333 > {
334 parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
335 v,
336 ParserMode::Locale,
337 )
338 }
339
340 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
341 where
342 F: FnMut(&str) -> Result<(), E>,
343 {
344 self.id.for_each_subtag_str(f)?;
345 self.extensions.for_each_subtag_str(f)?;
346 Ok(())
347 }
348}
349
350impl FromStr for Locale {
351 type Err = ParserError;
352
353 fn from_str(source: &str) -> Result<Self, Self::Err> {
354 Self::try_from_bytes(source.as_bytes())
355 }
356}
357
358impl From<LanguageIdentifier> for Locale {
359 fn from(id: LanguageIdentifier) -> Self {
360 Self {
361 id,
362 extensions: extensions::Extensions::default(),
363 }
364 }
365}
366
367impl From<Locale> for LanguageIdentifier {
368 fn from(loc: Locale) -> Self {
369 loc.id
370 }
371}
372
373impl AsRef<LanguageIdentifier> for Locale {
374 fn as_ref(&self) -> &LanguageIdentifier {
375 &self.id
376 }
377}
378
379impl AsMut<LanguageIdentifier> for Locale {
380 fn as_mut(&mut self) -> &mut LanguageIdentifier {
381 &mut self.id
382 }
383}
384
385impl core::fmt::Debug for Locale {
386 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
387 writeable::Writeable::write_to(self, sink:f)
388 }
389}
390
391impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.write_to_string());
392
393#[test]
394fn test_writeable() {
395 use writeable::assert_writeable_eq;
396 assert_writeable_eq!(Locale::UND, "und");
397 assert_writeable_eq!("und-001".parse::<Locale>().unwrap(), "und-001");
398 assert_writeable_eq!("und-Mymr".parse::<Locale>().unwrap(), "und-Mymr");
399 assert_writeable_eq!("my-Mymr-MM".parse::<Locale>().unwrap(), "my-Mymr-MM");
400 assert_writeable_eq!(
401 "my-Mymr-MM-posix".parse::<Locale>().unwrap(),
402 "my-Mymr-MM-posix",
403 );
404 assert_writeable_eq!(
405 "zh-macos-posix".parse::<Locale>().unwrap(),
406 "zh-macos-posix",
407 );
408 assert_writeable_eq!(
409 "my-t-my-d0-zawgyi".parse::<Locale>().unwrap(),
410 "my-t-my-d0-zawgyi",
411 );
412 assert_writeable_eq!(
413 "ar-SA-u-ca-islamic-civil".parse::<Locale>().unwrap(),
414 "ar-SA-u-ca-islamic-civil",
415 );
416 assert_writeable_eq!(
417 "en-001-x-foo-bar".parse::<Locale>().unwrap(),
418 "en-001-x-foo-bar",
419 );
420 assert_writeable_eq!("und-t-m0-true".parse::<Locale>().unwrap(), "und-t-m0-true",);
421}
422
423/// # Examples
424///
425/// ```
426/// use icu::locid::Locale;
427/// use icu::locid::{locale, subtags::language};
428///
429/// assert_eq!(Locale::from(language!("en")), locale!("en"));
430/// ```
431impl From<subtags::Language> for Locale {
432 fn from(language: subtags::Language) -> Self {
433 Self {
434 id: language.into(),
435 ..Default::default()
436 }
437 }
438}
439
440/// # Examples
441///
442/// ```
443/// use icu::locid::Locale;
444/// use icu::locid::{locale, subtags::script};
445///
446/// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
447/// ```
448impl From<Option<subtags::Script>> for Locale {
449 fn from(script: Option<subtags::Script>) -> Self {
450 Self {
451 id: script.into(),
452 ..Default::default()
453 }
454 }
455}
456
457/// # Examples
458///
459/// ```
460/// use icu::locid::Locale;
461/// use icu::locid::{locale, subtags::region};
462///
463/// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
464/// ```
465impl From<Option<subtags::Region>> for Locale {
466 fn from(region: Option<subtags::Region>) -> Self {
467 Self {
468 id: region.into(),
469 ..Default::default()
470 }
471 }
472}
473
474/// # Examples
475///
476/// ```
477/// use icu::locid::Locale;
478/// use icu::locid::{
479/// locale,
480/// subtags::{language, region, script},
481/// };
482///
483/// assert_eq!(
484/// Locale::from((
485/// language!("en"),
486/// Some(script!("Latn")),
487/// Some(region!("US"))
488/// )),
489/// locale!("en-Latn-US")
490/// );
491/// ```
492impl
493 From<(
494 subtags::Language,
495 Option<subtags::Script>,
496 Option<subtags::Region>,
497 )> for Locale
498{
499 fn from(
500 lsr: (
501 subtags::Language,
502 Option<subtags::Script>,
503 Option<subtags::Region>,
504 ),
505 ) -> Self {
506 Self {
507 id: lsr.into(),
508 ..Default::default()
509 }
510 }
511}
512