1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use core::cmp::Ordering;
6use core::str::FromStr;
7
8use crate::ordering::SubtagOrderingResult;
9use crate::parser::{
10 parse_language_identifier, parse_language_identifier_with_single_variant, ParserError,
11 ParserMode, SubtagIterator,
12};
13use crate::subtags;
14use alloc::string::String;
15use writeable::Writeable;
16
17/// A core struct representing a [`Unicode BCP47 Language Identifier`].
18///
19/// # Examples
20///
21/// ```
22/// use icu::locid::{
23/// langid,
24/// subtags::{language, region},
25/// };
26///
27/// let li = langid!("en-US");
28///
29/// assert_eq!(li.language, language!("en"));
30/// assert_eq!(li.script, None);
31/// assert_eq!(li.region, Some(region!("US")));
32/// assert_eq!(li.variants.len(), 0);
33/// ```
34///
35/// # Parsing
36///
37/// Unicode recognizes three levels of standard conformance for any language identifier:
38///
39/// * *well-formed* - syntactically correct
40/// * *valid* - well-formed and only uses registered language, region, script and variant subtags...
41/// * *canonical* - valid and no deprecated codes or structure.
42///
43/// At the moment parsing normalizes a well-formed language identifier converting
44/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
45///
46/// Any bogus subtags will cause the parsing to fail with an error.
47/// No subtag validation is performed.
48///
49/// # Examples
50///
51/// ```
52/// use icu::locid::{
53/// langid,
54/// subtags::{language, region, script, variant},
55/// };
56///
57/// let li = langid!("eN_latn_Us-Valencia");
58///
59/// assert_eq!(li.language, language!("en"));
60/// assert_eq!(li.script, Some(script!("Latn")));
61/// assert_eq!(li.region, Some(region!("US")));
62/// assert_eq!(li.variants.get(0), Some(&variant!("valencia")));
63/// ```
64///
65/// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
66#[derive(Default, PartialEq, Eq, Clone, Hash)]
67#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
68pub struct LanguageIdentifier {
69 /// Language subtag of the language identifier.
70 pub language: subtags::Language,
71 /// Script subtag of the language identifier.
72 pub script: Option<subtags::Script>,
73 /// Region subtag of the language identifier.
74 pub region: Option<subtags::Region>,
75 /// Variant subtags of the language identifier.
76 pub variants: subtags::Variants,
77}
78
79impl LanguageIdentifier {
80 /// A constructor which takes a utf8 slice, parses it and
81 /// produces a well-formed [`LanguageIdentifier`].
82 ///
83 /// # Examples
84 ///
85 /// ```
86 /// use icu::locid::LanguageIdentifier;
87 ///
88 /// LanguageIdentifier::try_from_bytes(b"en-US").expect("Parsing failed");
89 /// ```
90 pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
91 parse_language_identifier(v, ParserMode::LanguageIdentifier)
92 }
93
94 #[doc(hidden)]
95 #[allow(clippy::type_complexity)]
96 // The return type should be `Result<Self, ParserError>` once the `const_precise_live_drops`
97 // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
98 pub const fn try_from_bytes_with_single_variant(
99 v: &[u8],
100 ) -> Result<
101 (
102 subtags::Language,
103 Option<subtags::Script>,
104 Option<subtags::Region>,
105 Option<subtags::Variant>,
106 ),
107 ParserError,
108 > {
109 parse_language_identifier_with_single_variant(v, ParserMode::LanguageIdentifier)
110 }
111
112 /// A constructor which takes a utf8 slice which may contain extension keys,
113 /// parses it and produces a well-formed [`LanguageIdentifier`].
114 ///
115 /// # Examples
116 ///
117 /// ```
118 /// use icu::locid::{langid, LanguageIdentifier};
119 ///
120 /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
121 /// .expect("Parsing failed.");
122 ///
123 /// assert_eq!(li, langid!("en-US"));
124 /// ```
125 ///
126 /// This method should be used for input that may be a locale identifier.
127 /// All extensions will be lost.
128 pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParserError> {
129 parse_language_identifier(v, ParserMode::Locale)
130 }
131
132 /// The default undefined language "und". Same as [`default()`](Default::default()).
133 ///
134 /// # Examples
135 ///
136 /// ```
137 /// use icu::locid::LanguageIdentifier;
138 ///
139 /// assert_eq!(LanguageIdentifier::default(), LanguageIdentifier::UND);
140 /// ```
141 pub const UND: Self = Self {
142 language: subtags::Language::UND,
143 script: None,
144 region: None,
145 variants: subtags::Variants::new(),
146 };
147
148 /// This is a best-effort operation that performs all available levels of canonicalization.
149 ///
150 /// At the moment the operation will normalize casing and the separator, but in the future
151 /// it may also validate and update from deprecated subtags to canonical ones.
152 ///
153 /// # Examples
154 ///
155 /// ```
156 /// use icu::locid::LanguageIdentifier;
157 ///
158 /// assert_eq!(
159 /// LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
160 /// Ok("pl-Latn-PL")
161 /// );
162 /// ```
163 pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
164 let lang_id = Self::try_from_bytes(input.as_ref())?;
165 Ok(lang_id.write_to_string().into_owned())
166 }
167
168 /// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
169 ///
170 /// The return value is equivalent to what would happen if you first converted this
171 /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
172 ///
173 /// This function is case-sensitive and results in a *total order*, so it is appropriate for
174 /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
175 ///
176 /// # Examples
177 ///
178 /// ```
179 /// use icu::locid::LanguageIdentifier;
180 /// use std::cmp::Ordering;
181 ///
182 /// let bcp47_strings: &[&str] = &[
183 /// "pl-Latn-PL",
184 /// "und",
185 /// "und-Adlm",
186 /// "und-GB",
187 /// "und-ZA",
188 /// "und-fonipa",
189 /// "zh",
190 /// ];
191 ///
192 /// for ab in bcp47_strings.windows(2) {
193 /// let a = ab[0];
194 /// let b = ab[1];
195 /// assert!(a.cmp(b) == Ordering::Less);
196 /// let a_langid = a.parse::<LanguageIdentifier>().unwrap();
197 /// assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal);
198 /// assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less);
199 /// }
200 /// ```
201 pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
202 self.strict_cmp_iter(other.split(|b| *b == b'-')).end()
203 }
204
205 /// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags.
206 ///
207 /// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as
208 /// a more modular version that allows multiple subtag iterators to be chained together.
209 ///
210 /// For an additional example, see [`SubtagOrderingResult`].
211 ///
212 /// # Examples
213 ///
214 /// ```
215 /// use icu::locid::LanguageIdentifier;
216 /// use std::cmp::Ordering;
217 ///
218 /// let subtags: &[&[u8]] = &[b"ca", b"ES", b"valencia"];
219 ///
220 /// let loc = "ca-ES-valencia".parse::<LanguageIdentifier>().unwrap();
221 /// assert_eq!(
222 /// Ordering::Equal,
223 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
224 /// );
225 ///
226 /// let loc = "ca-ES".parse::<LanguageIdentifier>().unwrap();
227 /// assert_eq!(
228 /// Ordering::Less,
229 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
230 /// );
231 ///
232 /// let loc = "ca-ZA".parse::<LanguageIdentifier>().unwrap();
233 /// assert_eq!(
234 /// Ordering::Greater,
235 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
236 /// );
237 /// ```
238 pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
239 where
240 I: Iterator<Item = &'l [u8]>,
241 {
242 let r = self.for_each_subtag_str(&mut |subtag| {
243 if let Some(other) = subtags.next() {
244 match subtag.as_bytes().cmp(other) {
245 Ordering::Equal => Ok(()),
246 not_equal => Err(not_equal),
247 }
248 } else {
249 Err(Ordering::Greater)
250 }
251 });
252 match r {
253 Ok(_) => SubtagOrderingResult::Subtags(subtags),
254 Err(o) => SubtagOrderingResult::Ordering(o),
255 }
256 }
257
258 /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
259 ///
260 /// The return value is equivalent to what would happen if you first parsed the
261 /// BCP-47 string to a `LanguageIdentifier` and then performed a structural comparison.
262 ///
263 /// # Examples
264 ///
265 /// ```
266 /// use icu::locid::LanguageIdentifier;
267 /// use std::cmp::Ordering;
268 ///
269 /// let bcp47_strings: &[&str] = &[
270 /// "pl-LaTn-pL",
271 /// "uNd",
272 /// "UnD-adlm",
273 /// "uNd-GB",
274 /// "UND-FONIPA",
275 /// "ZH",
276 /// ];
277 ///
278 /// for a in bcp47_strings {
279 /// assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
280 /// }
281 /// ```
282 pub fn normalizing_eq(&self, other: &str) -> bool {
283 macro_rules! subtag_matches {
284 ($T:ty, $iter:ident, $expected:expr) => {
285 $iter
286 .next()
287 .map(|b| <$T>::try_from_bytes(b) == Ok($expected))
288 .unwrap_or(false)
289 };
290 }
291
292 let mut iter = SubtagIterator::new(other.as_bytes());
293 if !subtag_matches!(subtags::Language, iter, self.language) {
294 return false;
295 }
296 if let Some(ref script) = self.script {
297 if !subtag_matches!(subtags::Script, iter, *script) {
298 return false;
299 }
300 }
301 if let Some(ref region) = self.region {
302 if !subtag_matches!(subtags::Region, iter, *region) {
303 return false;
304 }
305 }
306 for variant in self.variants.iter() {
307 if !subtag_matches!(subtags::Variant, iter, *variant) {
308 return false;
309 }
310 }
311 iter.next().is_none()
312 }
313
314 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
315 where
316 F: FnMut(&str) -> Result<(), E>,
317 {
318 f(self.language.as_str())?;
319 if let Some(ref script) = self.script {
320 f(script.as_str())?;
321 }
322 if let Some(ref region) = self.region {
323 f(region.as_str())?;
324 }
325 for variant in self.variants.iter() {
326 f(variant.as_str())?;
327 }
328 Ok(())
329 }
330}
331
332impl AsRef<LanguageIdentifier> for LanguageIdentifier {
333 fn as_ref(&self) -> &Self {
334 self
335 }
336}
337
338impl AsMut<LanguageIdentifier> for LanguageIdentifier {
339 fn as_mut(&mut self) -> &mut Self {
340 self
341 }
342}
343
344impl core::fmt::Debug for LanguageIdentifier {
345 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
346 core::fmt::Display::fmt(&self, f)
347 }
348}
349
350impl FromStr for LanguageIdentifier {
351 type Err = ParserError;
352
353 fn from_str(source: &str) -> Result<Self, Self::Err> {
354 Self::try_from_bytes(source.as_bytes())
355 }
356}
357
358impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string());
359
/// Checks that serializing a parsed identifier reproduces its canonical string.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    assert_writeable_eq!(LanguageIdentifier::UND, "und");

    // Each of these is already in normalized form, so it must round-trip unchanged.
    let round_trip_tags: &[&str] = &[
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
    ];
    for &tag in round_trip_tags {
        assert_writeable_eq!(tag.parse::<LanguageIdentifier>().unwrap(), tag);
    }
}
382
383/// # Examples
384///
385/// ```
386/// use icu::locid::{langid, subtags::language, LanguageIdentifier};
387///
388/// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
389/// ```
390impl From<subtags::Language> for LanguageIdentifier {
391 fn from(language: subtags::Language) -> Self {
392 Self {
393 language,
394 ..Default::default()
395 }
396 }
397}
398
399/// # Examples
400///
401/// ```
402/// use icu::locid::{langid, subtags::script, LanguageIdentifier};
403///
404/// assert_eq!(
405/// LanguageIdentifier::from(Some(script!("latn"))),
406/// langid!("und-Latn")
407/// );
408/// ```
409impl From<Option<subtags::Script>> for LanguageIdentifier {
410 fn from(script: Option<subtags::Script>) -> Self {
411 Self {
412 script,
413 ..Default::default()
414 }
415 }
416}
417
418/// # Examples
419///
420/// ```
421/// use icu::locid::{langid, subtags::region, LanguageIdentifier};
422///
423/// assert_eq!(
424/// LanguageIdentifier::from(Some(region!("US"))),
425/// langid!("und-US")
426/// );
427/// ```
428impl From<Option<subtags::Region>> for LanguageIdentifier {
429 fn from(region: Option<subtags::Region>) -> Self {
430 Self {
431 region,
432 ..Default::default()
433 }
434 }
435}
436
437/// Convert from an LSR tuple to a [`LanguageIdentifier`].
438///
439/// # Examples
440///
441/// ```
442/// use icu::locid::{
443/// langid,
444/// subtags::{language, region, script},
445/// LanguageIdentifier,
446/// };
447///
448/// let lang = language!("en");
449/// let script = script!("Latn");
450/// let region = region!("US");
451/// assert_eq!(
452/// LanguageIdentifier::from((lang, Some(script), Some(region))),
453/// langid!("en-Latn-US")
454/// );
455/// ```
456impl
457 From<(
458 subtags::Language,
459 Option<subtags::Script>,
460 Option<subtags::Region>,
461 )> for LanguageIdentifier
462{
463 fn from(
464 lsr: (
465 subtags::Language,
466 Option<subtags::Script>,
467 Option<subtags::Region>,
468 ),
469 ) -> Self {
470 Self {
471 language: lsr.0,
472 script: lsr.1,
473 region: lsr.2,
474 ..Default::default()
475 }
476 }
477}
478
479/// Convert from a [`LanguageIdentifier`] to an LSR tuple.
480///
481/// # Examples
482///
483/// ```
484/// use icu::locid::{
485/// langid,
486/// subtags::{language, region, script},
487/// };
488///
489/// let lid = langid!("en-Latn-US");
490/// let (lang, script, region) = (&lid).into();
491///
492/// assert_eq!(lang, language!("en"));
493/// assert_eq!(script, Some(script!("Latn")));
494/// assert_eq!(region, Some(region!("US")));
495/// ```
496impl From<&LanguageIdentifier>
497 for (
498 subtags::Language,
499 Option<subtags::Script>,
500 Option<subtags::Region>,
501 )
502{
503 fn from(langid: &LanguageIdentifier) -> Self {
504 (langid.language, langid.script, langid.region)
505 }
506}
507