langid.rs source code [crates/icu_locid/src/langid.rs]

1	// This file is part of ICU4X. For terms of use, please see the file
2	// called LICENSE at the top level of the ICU4X source tree
3	// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5	use core::cmp::Ordering;
6	use core::str::FromStr;
7
8	use crate::ordering::SubtagOrderingResult;
9	use crate::parser::{
10	parse_language_identifier, parse_language_identifier_with_single_variant, ParserError,
11	ParserMode, SubtagIterator,
12	};
13	use crate::subtags;
14	use alloc::string::String;
15	use writeable::Writeable;
16
17	/// A core struct representing a [`Unicode BCP47 Language Identifier`].
18	///
19	/// # Examples
20	///
21	/// ```
22	/// use icu::locid::{
23	/// langid,
24	/// subtags::{language, region},
25	/// };
26	///
27	/// let li = langid!("en-US");
28	///
29	/// assert_eq!(li.language, language!("en"));
30	/// assert_eq!(li.script, None);
31	/// assert_eq!(li.region, Some(region!("US")));
32	/// assert_eq!(li.variants.len(), `0`);
33	/// ```
34	///
35	/// # Parsing
36	///
37	/// Unicode recognizes three levels of standard conformance for any language identifier:
38	///
39	/// well-formed - syntactically correct*
40	/// valid - well-formed and only uses registered language, region, script and variant subtags...*
41	/// canonical - valid and no deprecated codes or structure.*
42	///
43	/// At the moment parsing normalizes a well-formed language identifier converting
44	/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
45	///
46	/// Any bogus subtags will cause the parsing to fail with an error.
47	/// No subtag validation is performed.
48	///
49	/// # Examples
50	///
51	/// ```
52	/// use icu::locid::{
53	/// langid,
54	/// subtags::{language, region, script, variant},
55	/// };
56	///
57	/// let li = langid!("eN_latn_Us-Valencia");
58	///
59	/// assert_eq!(li.language, language!("en"));
60	/// assert_eq!(li.script, Some(script!("Latn")));
61	/// assert_eq!(li.region, Some(region!("US")));
62	/// assert_eq!(li.variants.get(`0`), Some(&variant!("valencia")));
63	/// ```
64	///
65	/// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
66	#[derive(Default, PartialEq, Eq, Clone, Hash)]
67	#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
68	pub struct LanguageIdentifier {
69	/// Language subtag of the language identifier.
70	pub language: subtags::Language,
71	/// Script subtag of the language identifier.
72	pub script: Option<subtags::Script>,
73	/// Region subtag of the language identifier.
74	pub region: Option<subtags::Region>,
75	/// Variant subtags of the language identifier.
76	pub variants: subtags::Variants,
77	}
78
79	impl LanguageIdentifier {
80	/// A constructor which takes a utf8 slice, parses it and
81	/// produces a well-formed [`LanguageIdentifier`].
82	///
83	/// # Examples
84	///
85	/// ```
86	/// use icu::locid::LanguageIdentifier;
87	///
88	/// LanguageIdentifier::try_from_bytes(b"en-US").expect("Parsing failed");
89	/// ```
90	pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
91	parse_language_identifier(v, ParserMode::LanguageIdentifier)
92	}
93
94	#[doc(hidden)]
95	#[allow(clippy::type_complexity)]
96	// The return type should be `Result<Self, ParserError>` once the `const_precise_live_drops`
97	// is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
98	pub const fn try_from_bytes_with_single_variant(
99	v: &[u8],
100	) -> Result<
101	(
102	subtags::Language,
103	Option<subtags::Script>,
104	Option<subtags::Region>,
105	Option<subtags::Variant>,
106	),
107	ParserError,
108	> {
109	parse_language_identifier_with_single_variant(v, ParserMode::LanguageIdentifier)
110	}
111
112	/// A constructor which takes a utf8 slice which may contain extension keys,
113	/// parses it and produces a well-formed [`LanguageIdentifier`].
114	///
115	/// # Examples
116	///
117	/// ```
118	/// use icu::locid::{langid, LanguageIdentifier};
119	///
120	/// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
121	/// .expect("Parsing failed.");
122	///
123	/// assert_eq!(li, langid!("en-US"));
124	/// ```
125	///
126	/// This method should be used for input that may be a locale identifier.
127	/// All extensions will be lost.
128	pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParserError> {
129	parse_language_identifier(v, ParserMode::Locale)
130	}
131
132	/// The default undefined language "und". Same as [`default()`](Default::default()).
133	///
134	/// # Examples
135	///
136	/// ```
137	/// use icu::locid::LanguageIdentifier;
138	///
139	/// assert_eq!(LanguageIdentifier::default(), LanguageIdentifier::UND);
140	/// ```
141	pub const UND: Self = Self {
142	language: subtags::Language::UND,
143	script: None,
144	region: None,
145	variants: subtags::Variants::new(),
146	};
147
148	/// This is a best-effort operation that performs all available levels of canonicalization.
149	///
150	/// At the moment the operation will normalize casing and the separator, but in the future
151	/// it may also validate and update from deprecated subtags to canonical ones.
152	///
153	/// # Examples
154	///
155	/// ```
156	/// use icu::locid::LanguageIdentifier;
157	///
158	/// assert_eq!(
159	/// LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
160	/// Ok("pl-Latn-PL")
161	/// );
162	/// ```
163	pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
164	let lang_id = Self::try_from_bytes(input.as_ref())?;
165	Ok(lang_id.write_to_string().into_owned())
166	}
167
168	/// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
169	///
170	/// The return value is equivalent to what would happen if you first converted this
171	/// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
172	///
173	/// This function is case-sensitive and results in a total order, so it is appropriate for
174	/// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
175	///
176	/// # Examples
177	///
178	/// ```
179	/// use icu::locid::LanguageIdentifier;
180	/// use std::cmp::Ordering;
181	///
182	/// let bcp47_strings: &[&str] = &[
183	/// "pl-Latn-PL",
184	/// "und",
185	/// "und-Adlm",
186	/// "und-GB",
187	/// "und-ZA",
188	/// "und-fonipa",
189	/// "zh",
190	/// ];
191	///
192	/// for ab in bcp47_strings.windows(`2`) {
193	/// let a = ab[`0`];
194	/// let b = ab[`1`];
195	/// assert!(a.cmp(b) == Ordering::Less);
196	/// let a_langid = a.parse::<LanguageIdentifier>().unwrap();
197	/// assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal);
198	/// assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less);
199	/// }
200	/// ```
201	pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
202	self.strict_cmp_iter(other.split(\|b\| *b == b'-')).end()
203	}
204
205	/// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags.
206	///
207	/// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as
208	/// a more modular version that allows multiple subtag iterators to be chained together.
209	///
210	/// For an additional example, see [`SubtagOrderingResult`].
211	///
212	/// # Examples
213	///
214	/// ```
215	/// use icu::locid::LanguageIdentifier;
216	/// use std::cmp::Ordering;
217	///
218	/// let subtags: &[&[u8]] = &[b"ca", b"ES", b"valencia"];
219	///
220	/// let loc = "ca-ES-valencia".parse::<LanguageIdentifier>().unwrap();
221	/// assert_eq!(
222	/// Ordering::Equal,
223	/// loc.strict_cmp_iter(subtags.iter().copied()).end()
224	/// );
225	///
226	/// let loc = "ca-ES".parse::<LanguageIdentifier>().unwrap();
227	/// assert_eq!(
228	/// Ordering::Less,
229	/// loc.strict_cmp_iter(subtags.iter().copied()).end()
230	/// );
231	///
232	/// let loc = "ca-ZA".parse::<LanguageIdentifier>().unwrap();
233	/// assert_eq!(
234	/// Ordering::Greater,
235	/// loc.strict_cmp_iter(subtags.iter().copied()).end()
236	/// );
237	/// ```
238	pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
239	where
240	I: Iterator<Item = &'l [u8]>,
241	{
242	let r = self.for_each_subtag_str(&mut \|subtag\| {
243	if let Some(other) = subtags.next() {
244	match subtag.as_bytes().cmp(other) {
245	Ordering::Equal => Ok(()),
246	not_equal => Err(not_equal),
247	}
248	} else {
249	Err(Ordering::Greater)
250	}
251	});
252	match r {
253	Ok(_) => SubtagOrderingResult::Subtags(subtags),
254	Err(o) => SubtagOrderingResult::Ordering(o),
255	}
256	}
257
258	/// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
259	///
260	/// The return value is equivalent to what would happen if you first parsed the
261	/// BCP-47 string to a `LanguageIdentifier` and then performed a structural comparison.
262	///
263	/// # Examples
264	///
265	/// ```
266	/// use icu::locid::LanguageIdentifier;
267	/// use std::cmp::Ordering;
268	///
269	/// let bcp47_strings: &[&str] = &[
270	/// "pl-LaTn-pL",
271	/// "uNd",
272	/// "UnD-adlm",
273	/// "uNd-GB",
274	/// "UND-FONIPA",
275	/// "ZH",
276	/// ];
277	///
278	/// for a in bcp47_strings {
279	/// assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
280	/// }
281	/// ```
282	pub fn normalizing_eq(&self, other: &str) -> bool {
283	macro_rules! subtag_matches {
284	($T:ty, $iter:ident, $expected:expr) => {
285	$iter
286	.next()
287	.map(\|b\| <$T>::try_from_bytes(b) == Ok($expected))
288	.unwrap_or(`false`)
289	};
290	}
291
292	let mut iter = SubtagIterator::new(other.as_bytes());
293	if !subtag_matches!(subtags::Language, iter, self.language) {
294	return `false`;
295	}
296	if let Some(ref script) = self.script {
297	if !subtag_matches!(subtags::Script, iter, *script) {
298	return `false`;
299	}
300	}
301	if let Some(ref region) = self.region {
302	if !subtag_matches!(subtags::Region, iter, *region) {
303	return `false`;
304	}
305	}
306	for variant in self.variants.iter() {
307	if !subtag_matches!(subtags::Variant, iter, *variant) {
308	return `false`;
309	}
310	}
311	iter.next().is_none()
312	}
313
314	pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
315	where
316	F: FnMut(&str) -> Result<(), E>,
317	{
318	f(self.language.as_str())?;
319	if let Some(ref script) = self.script {
320	f(script.as_str())?;
321	}
322	if let Some(ref region) = self.region {
323	f(region.as_str())?;
324	}
325	for variant in self.variants.iter() {
326	f(variant.as_str())?;
327	}
328	Ok(())
329	}
330	}
331
332	impl AsRef<LanguageIdentifier> for LanguageIdentifier {
333	fn as_ref(&self) -> &Self {
334	self
335	}
336	}
337
338	impl AsMut<LanguageIdentifier> for LanguageIdentifier {
339	fn as_mut(&mut self) -> &mut Self {
340	self
341	}
342	}
343
344	impl core::fmt::Debug for LanguageIdentifier {
345	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
346	core::fmt::Display::fmt(&self, f)
347	}
348	}
349
350	impl FromStr for LanguageIdentifier {
351	type Err = ParserError;
352
353	fn from_str(source: &str) -> Result<Self, Self::Err> {
354	Self::try_from_bytes(source.as_bytes())
355	}
356	}
357
358	impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string());
359
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    // The `UND` constant is checked directly, without going through the parser.
    assert_writeable_eq!(LanguageIdentifier::UND, "und");

    // Each of these tags must be written back unchanged after being parsed.
    for &tag in &[
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
    ] {
        let parsed = tag.parse::<LanguageIdentifier>().unwrap();
        assert_writeable_eq!(parsed, tag);
    }
}
382
383	/// # Examples
384	///
385	/// ```
386	/// use icu::locid::{langid, subtags::language, LanguageIdentifier};
387	///
388	/// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
389	/// ```
390	impl From<subtags::Language> for LanguageIdentifier {
391	fn from(language: subtags::Language) -> Self {
392	Self {
393	language,
394	..Default::default()
395	}
396	}
397	}
398
399	/// # Examples
400	///
401	/// ```
402	/// use icu::locid::{langid, subtags::script, LanguageIdentifier};
403	///
404	/// assert_eq!(
405	/// LanguageIdentifier::from(Some(script!("latn"))),
406	/// langid!("und-Latn")
407	/// );
408	/// ```
409	impl From<Option<subtags::Script>> for LanguageIdentifier {
410	fn from(script: Option<subtags::Script>) -> Self {
411	Self {
412	script,
413	..Default::default()
414	}
415	}
416	}
417
418	/// # Examples
419	///
420	/// ```
421	/// use icu::locid::{langid, subtags::region, LanguageIdentifier};
422	///
423	/// assert_eq!(
424	/// LanguageIdentifier::from(Some(region!("US"))),
425	/// langid!("und-US")
426	/// );
427	/// ```
428	impl From<Option<subtags::Region>> for LanguageIdentifier {
429	fn from(region: Option<subtags::Region>) -> Self {
430	Self {
431	region,
432	..Default::default()
433	}
434	}
435	}
436
437	/// Convert from an LSR tuple to a [`LanguageIdentifier`].
438	///
439	/// # Examples
440	///
441	/// ```
442	/// use icu::locid::{
443	/// langid,
444	/// subtags::{language, region, script},
445	/// LanguageIdentifier,
446	/// };
447	///
448	/// let lang = language!("en");
449	/// let script = script!("Latn");
450	/// let region = region!("US");
451	/// assert_eq!(
452	/// LanguageIdentifier::from((lang, Some(script), Some(region))),
453	/// langid!("en-Latn-US")
454	/// );
455	/// ```
456	impl
457	From<(
458	subtags::Language,
459	Option<subtags::Script>,
460	Option<subtags::Region>,
461	)> for LanguageIdentifier
462	{
463	fn from(
464	lsr: (
465	subtags::Language,
466	Option<subtags::Script>,
467	Option<subtags::Region>,
468	),
469	) -> Self {
470	Self {
471	language: lsr.0,
472	script: lsr.1,
473	region: lsr.2,
474	..Default::default()
475	}
476	}
477	}
478
479	/// Convert from a [`LanguageIdentifier`] to an LSR tuple.
480	///
481	/// # Examples
482	///
483	/// ```
484	/// use icu::locid::{
485	/// langid,
486	/// subtags::{language, region, script},
487	/// };
488	///
489	/// let lid = langid!("en-Latn-US");
490	/// let (lang, script, region) = (&lid).into();
491	///
492	/// assert_eq!(lang, language!("en"));
493	/// assert_eq!(script, Some(script!("Latn")));
494	/// assert_eq!(region, Some(region!("US")));
495	/// ```
496	impl From<&LanguageIdentifier>
497	for (
498	subtags::Language,
499	Option<subtags::Script>,
500	Option<subtags::Region>,
501	)
502	{
503	fn from(langid: &LanguageIdentifier) -> Self {
504	(langid.language, langid.script, langid.region)
505	}
506	}
507