locale.rs source code [crates/icu_locid/src/locale.rs]

1	// This file is part of ICU4X. For terms of use, please see the file
2	// called LICENSE at the top level of the ICU4X source tree
3	// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5	#[allow(deprecated)]
6	use crate::ordering::SubtagOrderingResult;
7	use crate::parser::{
8	parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension,
9	ParserError, ParserMode, SubtagIterator,
10	};
11	use crate::{extensions, subtags, LanguageIdentifier};
12	use alloc::string::String;
13	use core::cmp::Ordering;
14	use core::str::FromStr;
15	use tinystr::TinyAsciiStr;
16	use writeable::Writeable;
17
18	/// A core struct representing a [`Unicode Locale Identifier`].
19	///
20	/// A locale is made of two parts:
21	/// Unicode Language Identifier*
22	/// A set of Unicode Extensions*
23	///
24	/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
25	/// on top of that is able to parse, manipulate and serialize unicode extension fields.
26	///
27	///
28	/// # Examples
29	///
30	/// ```
31	/// use icu::locid::{
32	/// extensions::unicode::{key, value},
33	/// locale,
34	/// subtags::{language, region},
35	/// };
36	///
37	/// let loc = locale!("en-US-u-ca-buddhist");
38	///
39	/// assert_eq!(loc.id.language, language!("en"));
40	/// assert_eq!(loc.id.script, None);
41	/// assert_eq!(loc.id.region, Some(region!("US")));
42	/// assert_eq!(loc.id.variants.len(), `0`);
43	/// assert_eq!(
44	/// loc.extensions.unicode.keywords.get(&key!("ca")),
45	/// Some(&value!("buddhist"))
46	/// );
47	/// ```
48	///
49	/// # Parsing
50	///
51	/// Unicode recognizes three levels of standard conformance for a locale:
52	///
53	/// well-formed - syntactically correct*
54	/// valid - well-formed and only uses registered language subtags, extensions, keywords, types...*
55	/// canonical - valid and no deprecated codes or structure.*
56	///
57	/// At the moment parsing normalizes a well-formed locale identifier converting
58	/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
59	///
60	/// Any bogus subtags will cause the parsing to fail with an error.
61	///
62	/// No subtag validation or alias resolution is performed.
63	///
64	/// # Examples
65	///
66	/// ```
67	/// use icu::locid::{subtags::*, Locale};
68	///
69	/// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12"
70	/// .parse()
71	/// .expect("Failed to parse.");
72	///
73	/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
74	/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
75	/// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
76	/// assert_eq!(
77	/// loc.id.variants.get(`0`),
78	/// "valencia".parse::<Variant>().ok().as_ref()
79	/// );
80	/// ```
81	/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
82	#[derive(Default, PartialEq, Eq, Clone, Hash)]
83	#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
84	pub struct Locale {
85	/// The basic language/script/region components in the locale identifier along with any variants.
86	pub id: LanguageIdentifier,
87	/// Any extensions present in the locale identifier.
88	pub extensions: extensions::Extensions,
89	}
90
/// Pins the size in bytes of [`Locale`] and of the types it is assembled from.
///
/// A failing assertion means the layout of the named type changed.
#[test]
fn test_sizes() {
    // Language identifier and its subtags.
    assert_eq!(core::mem::size_of::<subtags::Language>(), 3);
    assert_eq!(core::mem::size_of::<subtags::Script>(), 4);
    assert_eq!(core::mem::size_of::<subtags::Region>(), 3);
    assert_eq!(core::mem::size_of::<subtags::Variant>(), 8);
    assert_eq!(core::mem::size_of::<subtags::Variants>(), 16);
    assert_eq!(core::mem::size_of::<LanguageIdentifier>(), 32);

    // Transform extension: wrapping the identifier in `Option` must not grow it.
    assert_eq!(core::mem::size_of::<extensions::transform::Transform>(), 56);
    assert_eq!(core::mem::size_of::<Option<LanguageIdentifier>>(), 32);
    assert_eq!(core::mem::size_of::<extensions::transform::Fields>(), 24);

    // Remaining extension kinds and the aggregate that holds them all.
    assert_eq!(core::mem::size_of::<extensions::unicode::Attributes>(), 16);
    assert_eq!(core::mem::size_of::<extensions::unicode::Keywords>(), 24);
    assert_eq!(core::mem::size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(core::mem::size_of::<extensions::private::Private>(), 16);
    assert_eq!(core::mem::size_of::<extensions::Extensions>(), 136);

    assert_eq!(core::mem::size_of::<Locale>(), 168);
}
112
113	impl Locale {
114	/// A constructor which takes a utf8 slice, parses it and
115	/// produces a well-formed [`Locale`].
116	///
117	/// # Examples
118	///
119	/// ```
120	/// use icu::locid::Locale;
121	///
122	/// Locale::try_from_bytes(b"en-US-u-hc-h12").unwrap();
123	/// ```
124	pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
125	parse_locale(v)
126	}
127
128	/// The default undefined locale "und". Same as [`default()`](Default::default()).
129	///
130	/// # Examples
131	///
132	/// ```
133	/// use icu::locid::Locale;
134	///
135	/// assert_eq!(Locale::default(), Locale::UND);
136	/// ```
137	pub const UND: Self = Self {
138	id: LanguageIdentifier::UND,
139	extensions: extensions::Extensions::new(),
140	};
141
142	/// This is a best-effort operation that performs all available levels of canonicalization.
143	///
144	/// At the moment the operation will normalize casing and the separator, but in the future
145	/// it may also validate and update from deprecated subtags to canonical ones.
146	///
147	/// # Examples
148	///
149	/// ```
150	/// use icu::locid::Locale;
151	///
152	/// assert_eq!(
153	/// Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(),
154	/// Ok("pl-Latn-PL-u-hc-h12")
155	/// );
156	/// ```
157	pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
158	let locale = Self::try_from_bytes(input.as_ref())?;
159	Ok(locale.write_to_string().into_owned())
160	}
161
162	/// Compare this [`Locale`] with BCP-47 bytes.
163	///
164	/// The return value is equivalent to what would happen if you first converted this
165	/// [`Locale`] to a BCP-47 string and then performed a byte comparison.
166	///
167	/// This function is case-sensitive and results in a total order, so it is appropriate for
168	/// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
169	///
170	/// # Examples
171	///
172	/// ```
173	/// use icu::locid::Locale;
174	/// use std::cmp::Ordering;
175	///
176	/// let bcp47_strings: &[&str] = &[
177	/// "pl-Latn-PL",
178	/// "und",
179	/// "und-fonipa",
180	/// "und-t-m0-true",
181	/// "und-u-ca-hebrew",
182	/// "und-u-ca-japanese",
183	/// "zh",
184	/// ];
185	///
186	/// for ab in bcp47_strings.windows(`2`) {
187	/// let a = ab[`0`];
188	/// let b = ab[`1`];
189	/// assert!(a.cmp(b) == Ordering::Less);
190	/// let a_loc = a.parse::<Locale>().unwrap();
191	/// assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal);
192	/// assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less);
193	/// }
194	/// ```
195	pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
196	self.writeable_cmp_bytes(other)
197	}
198
199	#[allow(clippy::type_complexity)]
200	pub(crate) fn as_tuple(
201	&self,
202	) -> (
203	(
204	subtags::Language,
205	Option<subtags::Script>,
206	Option<subtags::Region>,
207	&subtags::Variants,
208	),
209	(
210	(
211	&extensions::unicode::Attributes,
212	&extensions::unicode::Keywords,
213	),
214	(
215	Option<(
216	subtags::Language,
217	Option<subtags::Script>,
218	Option<subtags::Region>,
219	&subtags::Variants,
220	)>,
221	&extensions::transform::Fields,
222	),
223	&extensions::private::Private,
224	&[extensions::other::Other],
225	),
226	) {
227	(self.id.as_tuple(), self.extensions.as_tuple())
228	}
229
230	/// Returns an ordering suitable for use in [`BTreeSet`].
231	///
232	/// The ordering may or may not be equivalent to string ordering, and it
233	/// may or may not be stable across ICU4X releases.
234	///
235	/// [`BTreeSet`]: alloc::collections::BTreeSet
236	pub fn total_cmp(&self, other: &Self) -> Ordering {
237	self.as_tuple().cmp(&other.as_tuple())
238	}
239
240	/// Compare this [`Locale`] with an iterator of BCP-47 subtags.
241	///
242	/// This function has the same equality semantics as [`Locale::strict_cmp`]. It is intended as
243	/// a more modular version that allows multiple subtag iterators to be chained together.
244	///
245	/// For an additional example, see [`SubtagOrderingResult`].
246	///
247	/// # Examples
248	///
249	/// ```
250	/// use icu::locid::locale;
251	/// use std::cmp::Ordering;
252	///
253	/// let subtags: &[&[u8]] =
254	/// &[b"ca", b"ES", b"valencia", b"u", b"ca", b"hebrew"];
255	///
256	/// let loc = locale!("ca-ES-valencia-u-ca-hebrew");
257	/// assert_eq!(
258	/// Ordering::Equal,
259	/// loc.strict_cmp_iter(subtags.iter().copied()).end()
260	/// );
261	///
262	/// let loc = locale!("ca-ES-valencia");
263	/// assert_eq!(
264	/// Ordering::Less,
265	/// loc.strict_cmp_iter(subtags.iter().copied()).end()
266	/// );
267	///
268	/// let loc = locale!("ca-ES-valencia-u-nu-arab");
269	/// assert_eq!(
270	/// Ordering::Greater,
271	/// loc.strict_cmp_iter(subtags.iter().copied()).end()
272	/// );
273	/// ```
274	#[deprecated(since = "1.5.0", note = "if you need this, please file an issue")]
275	#[allow(deprecated)]
276	pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
277	where
278	I: Iterator<Item = &'l [u8]>,
279	{
280	let r = self.for_each_subtag_str(&mut \|subtag\| {
281	if let Some(other) = subtags.next() {
282	match subtag.as_bytes().cmp(other) {
283	Ordering::Equal => Ok(()),
284	not_equal => Err(not_equal),
285	}
286	} else {
287	Err(Ordering::Greater)
288	}
289	});
290	match r {
291	Ok(_) => SubtagOrderingResult::Subtags(subtags),
292	Err(o) => SubtagOrderingResult::Ordering(o),
293	}
294	}
295
296	/// Compare this `Locale` with a potentially unnormalized BCP-47 string.
297	///
298	/// The return value is equivalent to what would happen if you first parsed the
299	/// BCP-47 string to a `Locale` and then performed a structural comparison.
300	///
301	/// # Examples
302	///
303	/// ```
304	/// use icu::locid::Locale;
305	///
306	/// let bcp47_strings: &[&str] = &[
307	/// "pl-LaTn-pL",
308	/// "uNd",
309	/// "UND-FONIPA",
310	/// "UnD-t-m0-TrUe",
311	/// "uNd-u-CA-Japanese",
312	/// "ZH",
313	/// ];
314	///
315	/// for a in bcp47_strings {
316	/// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
317	/// }
318	/// ```
319	pub fn normalizing_eq(&self, other: &str) -> bool {
320	macro_rules! subtag_matches {
321	($T:ty, $iter:ident, $expected:expr) => {
322	$iter
323	.next()
324	.map(\|b\| <$T>::try_from_bytes(b) == Ok($expected))
325	.unwrap_or(`false`)
326	};
327	}
328
329	let mut iter = SubtagIterator::new(other.as_bytes());
330	if !subtag_matches!(subtags::Language, iter, self.id.language) {
331	return `false`;
332	}
333	if let Some(ref script) = self.id.script {
334	if !subtag_matches!(subtags::Script, iter, *script) {
335	return `false`;
336	}
337	}
338	if let Some(ref region) = self.id.region {
339	if !subtag_matches!(subtags::Region, iter, *region) {
340	return `false`;
341	}
342	}
343	for variant in self.id.variants.iter() {
344	if !subtag_matches!(subtags::Variant, iter, *variant) {
345	return `false`;
346	}
347	}
348	if !self.extensions.is_empty() {
349	match extensions::Extensions::try_from_iter(&mut iter) {
350	Ok(exts) => {
351	if self.extensions != exts {
352	return `false`;
353	}
354	}
355	Err(_) => {
356	return `false`;
357	}
358	}
359	}
360	iter.next().is_none()
361	}
362
363	#[doc(hidden)]
364	#[allow(clippy::type_complexity)]
365	pub const fn try_from_bytes_with_single_variant_single_keyword_unicode_extension(
366	v: &[u8],
367	) -> Result<
368	(
369	subtags::Language,
370	Option<subtags::Script>,
371	Option<subtags::Region>,
372	Option<subtags::Variant>,
373	Option<(extensions::unicode::Key, Option<TinyAsciiStr<`8`>>)>,
374	),
375	ParserError,
376	> {
377	parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
378	v,
379	ParserMode::Locale,
380	)
381	}
382
383	pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
384	where
385	F: FnMut(&str) -> Result<(), E>,
386	{
387	self.id.for_each_subtag_str(f)?;
388	self.extensions.for_each_subtag_str(f)?;
389	Ok(())
390	}
391	}
392
393	impl FromStr for Locale {
394	type Err = ParserError;
395
396	fn from_str(source: &str) -> Result<Self, Self::Err> {
397	Self::try_from_bytes(source.as_bytes())
398	}
399	}
400
401	impl From<LanguageIdentifier> for Locale {
402	fn from(id: LanguageIdentifier) -> Self {
403	Self {
404	id,
405	extensions: extensions::Extensions::default(),
406	}
407	}
408	}
409
410	impl From<Locale> for LanguageIdentifier {
411	fn from(loc: Locale) -> Self {
412	loc.id
413	}
414	}
415
416	impl AsRef<LanguageIdentifier> for Locale {
417	#[inline(always)]
418	fn as_ref(&self) -> &LanguageIdentifier {
419	&self.id
420	}
421	}
422
423	impl AsMut<LanguageIdentifier> for Locale {
424	fn as_mut(&mut self) -> &mut LanguageIdentifier {
425	&mut self.id
426	}
427	}
428
429	impl core::fmt::Debug for Locale {
430	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
431	writeable::Writeable::write_to(self, sink:f)
432	}
433	}
434
435	impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.write_to_string());
436
/// Checks that locales write themselves back out as the expected BCP-47 string.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    // The constant is checked directly; everything else is parsed first.
    assert_writeable_eq!(Locale::UND, "und");

    // Each entry is already in normalized form, so it must survive
    // parse-then-write unchanged.
    let round_trips: [&str; 9] = [
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
        "my-t-my-d0-zawgyi",
        "ar-SA-u-ca-islamic-civil",
        "en-001-x-foo-bar",
        "und-t-m0-true",
    ];
    for &expected in &round_trips {
        let parsed = expected.parse::<Locale>().unwrap();
        assert_writeable_eq!(parsed, expected);
    }
}
466
467	/// # Examples
468	///
469	/// ```
470	/// use icu::locid::Locale;
471	/// use icu::locid::{locale, subtags::language};
472	///
473	/// assert_eq!(Locale::from(language!("en")), locale!("en"));
474	/// ```
475	impl From<subtags::Language> for Locale {
476	fn from(language: subtags::Language) -> Self {
477	Self {
478	id: language.into(),
479	..Default::default()
480	}
481	}
482	}
483
484	/// # Examples
485	///
486	/// ```
487	/// use icu::locid::Locale;
488	/// use icu::locid::{locale, subtags::script};
489	///
490	/// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
491	/// ```
492	impl From<Option<subtags::Script>> for Locale {
493	fn from(script: Option<subtags::Script>) -> Self {
494	Self {
495	id: script.into(),
496	..Default::default()
497	}
498	}
499	}
500
501	/// # Examples
502	///
503	/// ```
504	/// use icu::locid::Locale;
505	/// use icu::locid::{locale, subtags::region};
506	///
507	/// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
508	/// ```
509	impl From<Option<subtags::Region>> for Locale {
510	fn from(region: Option<subtags::Region>) -> Self {
511	Self {
512	id: region.into(),
513	..Default::default()
514	}
515	}
516	}
517
518	/// # Examples
519	///
520	/// ```
521	/// use icu::locid::Locale;
522	/// use icu::locid::{
523	/// locale,
524	/// subtags::{language, region, script},
525	/// };
526	///
527	/// assert_eq!(
528	/// Locale::from((
529	/// language!("en"),
530	/// Some(script!("Latn")),
531	/// Some(region!("US"))
532	/// )),
533	/// locale!("en-Latn-US")
534	/// );
535	/// ```
536	impl
537	From<(
538	subtags::Language,
539	Option<subtags::Script>,
540	Option<subtags::Region>,
541	)> for Locale
542	{
543	fn from(
544	lsr: (
545	subtags::Language,
546	Option<subtags::Script>,
547	Option<subtags::Region>,
548	),
549	) -> Self {
550	Self {
551	id: lsr.into(),
552	..Default::default()
553	}
554	}
555	}
556