1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | use core::cmp::Ordering; |
6 | use core::str::FromStr; |
7 | |
8 | use crate::ordering::SubtagOrderingResult; |
9 | use crate::parser::{ |
10 | parse_language_identifier, parse_language_identifier_with_single_variant, ParserError, |
11 | ParserMode, SubtagIterator, |
12 | }; |
13 | use crate::subtags; |
14 | use alloc::string::String; |
15 | use writeable::Writeable; |
16 | |
17 | /// A core struct representing a [`Unicode BCP47 Language Identifier`]. |
18 | /// |
19 | /// # Examples |
20 | /// |
21 | /// ``` |
22 | /// use icu::locid::{ |
23 | /// langid, |
24 | /// subtags::{language, region}, |
25 | /// }; |
26 | /// |
27 | /// let li = langid!("en-US" ); |
28 | /// |
29 | /// assert_eq!(li.language, language!("en" )); |
30 | /// assert_eq!(li.script, None); |
31 | /// assert_eq!(li.region, Some(region!("US" ))); |
32 | /// assert_eq!(li.variants.len(), 0); |
33 | /// ``` |
34 | /// |
35 | /// # Parsing |
36 | /// |
37 | /// Unicode recognizes three levels of standard conformance for any language identifier: |
38 | /// |
39 | /// * *well-formed* - syntactically correct |
40 | /// * *valid* - well-formed and only uses registered language, region, script and variant subtags... |
41 | /// * *canonical* - valid and no deprecated codes or structure. |
42 | /// |
43 | /// At the moment parsing normalizes a well-formed language identifier converting |
44 | /// `_` separators to `-` and adjusting casing to conform to the Unicode standard. |
45 | /// |
46 | /// Any bogus subtags will cause the parsing to fail with an error. |
47 | /// No subtag validation is performed. |
48 | /// |
49 | /// # Examples |
50 | /// |
51 | /// ``` |
52 | /// use icu::locid::{ |
53 | /// langid, |
54 | /// subtags::{language, region, script, variant}, |
55 | /// }; |
56 | /// |
57 | /// let li = langid!("eN_latn_Us-Valencia" ); |
58 | /// |
59 | /// assert_eq!(li.language, language!("en" )); |
60 | /// assert_eq!(li.script, Some(script!("Latn" ))); |
61 | /// assert_eq!(li.region, Some(region!("US" ))); |
62 | /// assert_eq!(li.variants.get(0), Some(&variant!("valencia" ))); |
63 | /// ``` |
64 | /// |
65 | /// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier |
66 | #[derive (Default, PartialEq, Eq, Clone, Hash)] |
67 | #[allow (clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro) |
68 | pub struct LanguageIdentifier { |
69 | /// Language subtag of the language identifier. |
70 | pub language: subtags::Language, |
71 | /// Script subtag of the language identifier. |
72 | pub script: Option<subtags::Script>, |
73 | /// Region subtag of the language identifier. |
74 | pub region: Option<subtags::Region>, |
75 | /// Variant subtags of the language identifier. |
76 | pub variants: subtags::Variants, |
77 | } |
78 | |
79 | impl LanguageIdentifier { |
80 | /// A constructor which takes a utf8 slice, parses it and |
81 | /// produces a well-formed [`LanguageIdentifier`]. |
82 | /// |
83 | /// # Examples |
84 | /// |
85 | /// ``` |
86 | /// use icu::locid::LanguageIdentifier; |
87 | /// |
88 | /// LanguageIdentifier::try_from_bytes(b"en-US" ).expect("Parsing failed" ); |
89 | /// ``` |
90 | pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> { |
91 | parse_language_identifier(v, ParserMode::LanguageIdentifier) |
92 | } |
93 | |
94 | #[doc (hidden)] |
95 | #[allow (clippy::type_complexity)] |
96 | // The return type should be `Result<Self, ParserError>` once the `const_precise_live_drops` |
97 | // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)). |
98 | pub const fn try_from_bytes_with_single_variant( |
99 | v: &[u8], |
100 | ) -> Result< |
101 | ( |
102 | subtags::Language, |
103 | Option<subtags::Script>, |
104 | Option<subtags::Region>, |
105 | Option<subtags::Variant>, |
106 | ), |
107 | ParserError, |
108 | > { |
109 | parse_language_identifier_with_single_variant(v, ParserMode::LanguageIdentifier) |
110 | } |
111 | |
112 | /// A constructor which takes a utf8 slice which may contain extension keys, |
113 | /// parses it and produces a well-formed [`LanguageIdentifier`]. |
114 | /// |
115 | /// # Examples |
116 | /// |
117 | /// ``` |
118 | /// use icu::locid::{langid, LanguageIdentifier}; |
119 | /// |
120 | /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix" ) |
121 | /// .expect("Parsing failed." ); |
122 | /// |
123 | /// assert_eq!(li, langid!("en-US" )); |
124 | /// ``` |
125 | /// |
126 | /// This method should be used for input that may be a locale identifier. |
127 | /// All extensions will be lost. |
128 | pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParserError> { |
129 | parse_language_identifier(v, ParserMode::Locale) |
130 | } |
131 | |
132 | /// The default undefined language "und". Same as [`default()`](Default::default()). |
133 | /// |
134 | /// # Examples |
135 | /// |
136 | /// ``` |
137 | /// use icu::locid::LanguageIdentifier; |
138 | /// |
139 | /// assert_eq!(LanguageIdentifier::default(), LanguageIdentifier::UND); |
140 | /// ``` |
141 | pub const UND: Self = Self { |
142 | language: subtags::Language::UND, |
143 | script: None, |
144 | region: None, |
145 | variants: subtags::Variants::new(), |
146 | }; |
147 | |
148 | /// This is a best-effort operation that performs all available levels of canonicalization. |
149 | /// |
150 | /// At the moment the operation will normalize casing and the separator, but in the future |
151 | /// it may also validate and update from deprecated subtags to canonical ones. |
152 | /// |
153 | /// # Examples |
154 | /// |
155 | /// ``` |
156 | /// use icu::locid::LanguageIdentifier; |
157 | /// |
158 | /// assert_eq!( |
159 | /// LanguageIdentifier::canonicalize("pL_latn_pl" ).as_deref(), |
160 | /// Ok("pl-Latn-PL" ) |
161 | /// ); |
162 | /// ``` |
163 | pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> { |
164 | let lang_id = Self::try_from_bytes(input.as_ref())?; |
165 | Ok(lang_id.write_to_string().into_owned()) |
166 | } |
167 | |
168 | /// Compare this [`LanguageIdentifier`] with BCP-47 bytes. |
169 | /// |
170 | /// The return value is equivalent to what would happen if you first converted this |
171 | /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison. |
172 | /// |
173 | /// This function is case-sensitive and results in a *total order*, so it is appropriate for |
174 | /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`. |
175 | /// |
176 | /// # Examples |
177 | /// |
178 | /// ``` |
179 | /// use icu::locid::LanguageIdentifier; |
180 | /// use std::cmp::Ordering; |
181 | /// |
182 | /// let bcp47_strings: &[&str] = &[ |
183 | /// "pl-Latn-PL" , |
184 | /// "und" , |
185 | /// "und-Adlm" , |
186 | /// "und-GB" , |
187 | /// "und-ZA" , |
188 | /// "und-fonipa" , |
189 | /// "zh" , |
190 | /// ]; |
191 | /// |
192 | /// for ab in bcp47_strings.windows(2) { |
193 | /// let a = ab[0]; |
194 | /// let b = ab[1]; |
195 | /// assert!(a.cmp(b) == Ordering::Less); |
196 | /// let a_langid = a.parse::<LanguageIdentifier>().unwrap(); |
197 | /// assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal); |
198 | /// assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less); |
199 | /// } |
200 | /// ``` |
201 | pub fn strict_cmp(&self, other: &[u8]) -> Ordering { |
202 | self.strict_cmp_iter(other.split(|b| *b == b'-' )).end() |
203 | } |
204 | |
205 | /// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags. |
206 | /// |
207 | /// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as |
208 | /// a more modular version that allows multiple subtag iterators to be chained together. |
209 | /// |
210 | /// For an additional example, see [`SubtagOrderingResult`]. |
211 | /// |
212 | /// # Examples |
213 | /// |
214 | /// ``` |
215 | /// use icu::locid::LanguageIdentifier; |
216 | /// use std::cmp::Ordering; |
217 | /// |
218 | /// let subtags: &[&[u8]] = &[b"ca" , b"ES" , b"valencia" ]; |
219 | /// |
220 | /// let loc = "ca-ES-valencia" .parse::<LanguageIdentifier>().unwrap(); |
221 | /// assert_eq!( |
222 | /// Ordering::Equal, |
223 | /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
224 | /// ); |
225 | /// |
226 | /// let loc = "ca-ES" .parse::<LanguageIdentifier>().unwrap(); |
227 | /// assert_eq!( |
228 | /// Ordering::Less, |
229 | /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
230 | /// ); |
231 | /// |
232 | /// let loc = "ca-ZA" .parse::<LanguageIdentifier>().unwrap(); |
233 | /// assert_eq!( |
234 | /// Ordering::Greater, |
235 | /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
236 | /// ); |
237 | /// ``` |
238 | pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I> |
239 | where |
240 | I: Iterator<Item = &'l [u8]>, |
241 | { |
242 | let r = self.for_each_subtag_str(&mut |subtag| { |
243 | if let Some(other) = subtags.next() { |
244 | match subtag.as_bytes().cmp(other) { |
245 | Ordering::Equal => Ok(()), |
246 | not_equal => Err(not_equal), |
247 | } |
248 | } else { |
249 | Err(Ordering::Greater) |
250 | } |
251 | }); |
252 | match r { |
253 | Ok(_) => SubtagOrderingResult::Subtags(subtags), |
254 | Err(o) => SubtagOrderingResult::Ordering(o), |
255 | } |
256 | } |
257 | |
258 | /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string. |
259 | /// |
260 | /// The return value is equivalent to what would happen if you first parsed the |
261 | /// BCP-47 string to a `LanguageIdentifier` and then performed a structural comparison. |
262 | /// |
263 | /// # Examples |
264 | /// |
265 | /// ``` |
266 | /// use icu::locid::LanguageIdentifier; |
267 | /// use std::cmp::Ordering; |
268 | /// |
269 | /// let bcp47_strings: &[&str] = &[ |
270 | /// "pl-LaTn-pL" , |
271 | /// "uNd" , |
272 | /// "UnD-adlm" , |
273 | /// "uNd-GB" , |
274 | /// "UND-FONIPA" , |
275 | /// "ZH" , |
276 | /// ]; |
277 | /// |
278 | /// for a in bcp47_strings { |
279 | /// assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a)); |
280 | /// } |
281 | /// ``` |
282 | pub fn normalizing_eq(&self, other: &str) -> bool { |
283 | macro_rules! subtag_matches { |
284 | ($T:ty, $iter:ident, $expected:expr) => { |
285 | $iter |
286 | .next() |
287 | .map(|b| <$T>::try_from_bytes(b) == Ok($expected)) |
288 | .unwrap_or(false) |
289 | }; |
290 | } |
291 | |
292 | let mut iter = SubtagIterator::new(other.as_bytes()); |
293 | if !subtag_matches!(subtags::Language, iter, self.language) { |
294 | return false; |
295 | } |
296 | if let Some(ref script) = self.script { |
297 | if !subtag_matches!(subtags::Script, iter, *script) { |
298 | return false; |
299 | } |
300 | } |
301 | if let Some(ref region) = self.region { |
302 | if !subtag_matches!(subtags::Region, iter, *region) { |
303 | return false; |
304 | } |
305 | } |
306 | for variant in self.variants.iter() { |
307 | if !subtag_matches!(subtags::Variant, iter, *variant) { |
308 | return false; |
309 | } |
310 | } |
311 | iter.next().is_none() |
312 | } |
313 | |
314 | pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> |
315 | where |
316 | F: FnMut(&str) -> Result<(), E>, |
317 | { |
318 | f(self.language.as_str())?; |
319 | if let Some(ref script) = self.script { |
320 | f(script.as_str())?; |
321 | } |
322 | if let Some(ref region) = self.region { |
323 | f(region.as_str())?; |
324 | } |
325 | for variant in self.variants.iter() { |
326 | f(variant.as_str())?; |
327 | } |
328 | Ok(()) |
329 | } |
330 | } |
331 | |
332 | impl AsRef<LanguageIdentifier> for LanguageIdentifier { |
333 | fn as_ref(&self) -> &Self { |
334 | self |
335 | } |
336 | } |
337 | |
338 | impl AsMut<LanguageIdentifier> for LanguageIdentifier { |
339 | fn as_mut(&mut self) -> &mut Self { |
340 | self |
341 | } |
342 | } |
343 | |
344 | impl core::fmt::Debug for LanguageIdentifier { |
345 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
346 | core::fmt::Display::fmt(&self, f) |
347 | } |
348 | } |
349 | |
350 | impl FromStr for LanguageIdentifier { |
351 | type Err = ParserError; |
352 | |
353 | fn from_str(source: &str) -> Result<Self, Self::Err> { |
354 | Self::try_from_bytes(source.as_bytes()) |
355 | } |
356 | } |
357 | |
358 | impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string()); |
359 | |
/// Checks that the `Writeable` output of a language identifier matches its
/// BCP-47 string, both for `UND` and for parsed identifiers.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    assert_writeable_eq!(LanguageIdentifier::UND, "und");

    // Each of these already-normalized tags must round-trip unchanged.
    for tag in [
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
    ] {
        assert_writeable_eq!(tag.parse::<LanguageIdentifier>().unwrap(), tag);
    }
}
382 | |
383 | /// # Examples |
384 | /// |
385 | /// ``` |
386 | /// use icu::locid::{langid, subtags::language, LanguageIdentifier}; |
387 | /// |
388 | /// assert_eq!(LanguageIdentifier::from(language!("en" )), langid!("en" )); |
389 | /// ``` |
390 | impl From<subtags::Language> for LanguageIdentifier { |
391 | fn from(language: subtags::Language) -> Self { |
392 | Self { |
393 | language, |
394 | ..Default::default() |
395 | } |
396 | } |
397 | } |
398 | |
399 | /// # Examples |
400 | /// |
401 | /// ``` |
402 | /// use icu::locid::{langid, subtags::script, LanguageIdentifier}; |
403 | /// |
404 | /// assert_eq!( |
405 | /// LanguageIdentifier::from(Some(script!("latn" ))), |
406 | /// langid!("und-Latn" ) |
407 | /// ); |
408 | /// ``` |
409 | impl From<Option<subtags::Script>> for LanguageIdentifier { |
410 | fn from(script: Option<subtags::Script>) -> Self { |
411 | Self { |
412 | script, |
413 | ..Default::default() |
414 | } |
415 | } |
416 | } |
417 | |
418 | /// # Examples |
419 | /// |
420 | /// ``` |
421 | /// use icu::locid::{langid, subtags::region, LanguageIdentifier}; |
422 | /// |
423 | /// assert_eq!( |
424 | /// LanguageIdentifier::from(Some(region!("US" ))), |
425 | /// langid!("und-US" ) |
426 | /// ); |
427 | /// ``` |
428 | impl From<Option<subtags::Region>> for LanguageIdentifier { |
429 | fn from(region: Option<subtags::Region>) -> Self { |
430 | Self { |
431 | region, |
432 | ..Default::default() |
433 | } |
434 | } |
435 | } |
436 | |
437 | /// Convert from an LSR tuple to a [`LanguageIdentifier`]. |
438 | /// |
439 | /// # Examples |
440 | /// |
441 | /// ``` |
442 | /// use icu::locid::{ |
443 | /// langid, |
444 | /// subtags::{language, region, script}, |
445 | /// LanguageIdentifier, |
446 | /// }; |
447 | /// |
448 | /// let lang = language!("en" ); |
449 | /// let script = script!("Latn" ); |
450 | /// let region = region!("US" ); |
451 | /// assert_eq!( |
452 | /// LanguageIdentifier::from((lang, Some(script), Some(region))), |
453 | /// langid!("en-Latn-US" ) |
454 | /// ); |
455 | /// ``` |
456 | impl |
457 | From<( |
458 | subtags::Language, |
459 | Option<subtags::Script>, |
460 | Option<subtags::Region>, |
461 | )> for LanguageIdentifier |
462 | { |
463 | fn from( |
464 | lsr: ( |
465 | subtags::Language, |
466 | Option<subtags::Script>, |
467 | Option<subtags::Region>, |
468 | ), |
469 | ) -> Self { |
470 | Self { |
471 | language: lsr.0, |
472 | script: lsr.1, |
473 | region: lsr.2, |
474 | ..Default::default() |
475 | } |
476 | } |
477 | } |
478 | |
479 | /// Convert from a [`LanguageIdentifier`] to an LSR tuple. |
480 | /// |
481 | /// # Examples |
482 | /// |
483 | /// ``` |
484 | /// use icu::locid::{ |
485 | /// langid, |
486 | /// subtags::{language, region, script}, |
487 | /// }; |
488 | /// |
489 | /// let lid = langid!("en-Latn-US" ); |
490 | /// let (lang, script, region) = (&lid).into(); |
491 | /// |
492 | /// assert_eq!(lang, language!("en" )); |
493 | /// assert_eq!(script, Some(script!("Latn" ))); |
494 | /// assert_eq!(region, Some(region!("US" ))); |
495 | /// ``` |
496 | impl From<&LanguageIdentifier> |
497 | for ( |
498 | subtags::Language, |
499 | Option<subtags::Script>, |
500 | Option<subtags::Region>, |
501 | ) |
502 | { |
503 | fn from(langid: &LanguageIdentifier) -> Self { |
504 | (langid.language, langid.script, langid.region) |
505 | } |
506 | } |
507 | |