1mod errors;
2mod layout_table;
3#[cfg(feature = "likelysubtags")]
4pub mod likelysubtags;
5#[doc(hidden)]
6pub mod parser;
7#[cfg(feature = "serde")]
8mod serde;
9pub mod subtags;
10
11pub use crate::errors::LanguageIdentifierError;
12use std::fmt::Write;
13use std::iter::Peekable;
14use std::str::FromStr;
15
/// The direction in which characters of a script are laid out.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CharacterDirection {
    /// Right to left.
    ///
    /// Used in languages such as Arabic, Hebrew, Fula, Kurdish etc.
    RTL,
    /// Left to right.
    ///
    /// Used in languages such as French, Spanish, English, German etc.
    LTR,
    /// Top to bottom.
    ///
    /// Used in Traditional Mongolian.
    TTB,
}
32
/// Owned subtags of a `LanguageIdentifier`, in the order
/// `(language, script, region, variants)`.
///
/// This is the return type of `LanguageIdentifier::into_parts`.
type PartsTuple = (
    subtags::Language,
    Option<subtags::Script>,
    Option<subtags::Region>,
    Vec<subtags::Variant>,
);
39
/// `LanguageIdentifier` is a core struct representing a Unicode Language Identifier.
///
/// It is made of a language subtag, an optional script subtag, an optional
/// region subtag and a list of variant subtags.
///
/// # Examples
///
/// ```
/// use unic_langid_impl::LanguageIdentifier;
///
/// let li: LanguageIdentifier = "en-US".parse()
///     .expect("Failed to parse.");
///
/// assert_eq!(li.language, "en");
/// assert_eq!(li.script, None);
/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
/// assert_eq!(li.variants().len(), 0);
/// ```
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for any language identifier:
///
///  * *well-formed* - syntactically correct
///  * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
///  * *canonical* - valid and no deprecated codes or structure.
///
/// At the moment parsing normalizes a well-formed language identifier converting
/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
///
/// Any bogus subtags will cause the parsing to fail with an error.
/// No subtag validation is performed.
///
/// # Examples
///
/// ```
/// use unic_langid_impl::LanguageIdentifier;
///
/// let li: LanguageIdentifier = "eN_latn_Us-Valencia".parse()
///     .expect("Failed to parse.");
///
/// assert_eq!(li.language, "en");
/// assert_eq!(li.script.as_ref().map(Into::into), Some("Latn"));
/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
/// assert_eq!(li.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
/// ```
#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
pub struct LanguageIdentifier {
    /// Language subtag (always present).
    pub language: subtags::Language,
    /// Script subtag, if any.
    pub script: Option<subtags::Script>,
    /// Region subtag, if any.
    pub region: Option<subtags::Region>,
    // Variant subtags. The safe constructors and `set_variants` store `None`
    // for an empty list and otherwise a sorted, deduplicated slice;
    // `from_raw_parts_unchecked` stores whatever the caller passes.
    variants: Option<Box<[subtags::Variant]>>,
}
90
91impl LanguageIdentifier {
92 /// A constructor which takes a utf8 slice, parses it and
93 /// produces a well-formed `LanguageIdentifier`.
94 ///
95 /// # Examples
96 ///
97 /// ```
98 /// use unic_langid_impl::LanguageIdentifier;
99 ///
100 /// let li = LanguageIdentifier::from_bytes("en-US".as_bytes())
101 /// .expect("Parsing failed.");
102 ///
103 /// assert_eq!(li.to_string(), "en-US");
104 /// ```
105 pub fn from_bytes(v: &[u8]) -> Result<Self, LanguageIdentifierError> {
106 Ok(parser::parse_language_identifier(v)?)
107 }
108
109 /// A constructor which takes optional subtags as `AsRef<[u8]>`, parses them and
110 /// produces a well-formed `LanguageIdentifier`.
111 ///
112 /// # Examples
113 ///
114 /// ```
115 /// use unic_langid_impl::LanguageIdentifier;
116 ///
117 /// let li = LanguageIdentifier::from_parts(
118 /// "fr".parse().expect("Parsing failed."),
119 /// None,
120 /// Some("CA".parse().expect("Parsing failed.")),
121 /// &[]
122 /// );
123 ///
124 /// assert_eq!(li.to_string(), "fr-CA");
125 /// ```
126 pub fn from_parts(
127 language: subtags::Language,
128 script: Option<subtags::Script>,
129 region: Option<subtags::Region>,
130 variants: &[subtags::Variant],
131 ) -> Self {
132 let variants = if !variants.is_empty() {
133 let mut v = variants.to_vec();
134 v.sort_unstable();
135 v.dedup();
136 Some(v.into_boxed_slice())
137 } else {
138 None
139 };
140
141 Self {
142 language,
143 script,
144 region,
145 variants,
146 }
147 }
148
149 /// # Unchecked
150 ///
151 /// This function accepts subtags expecting variants
152 /// to be deduplicated and ordered.
153 pub const fn from_raw_parts_unchecked(
154 language: subtags::Language,
155 script: Option<subtags::Script>,
156 region: Option<subtags::Region>,
157 variants: Option<Box<[subtags::Variant]>>,
158 ) -> Self {
159 Self {
160 language,
161 script,
162 region,
163 variants,
164 }
165 }
166
167 #[doc(hidden)]
168 /// This method is used by `unic-locale` to handle partial
169 /// subtag iterator.
170 ///
171 /// Not stable.
172 pub fn try_from_iter<'a>(
173 iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>,
174 allow_extension: bool,
175 ) -> Result<LanguageIdentifier, LanguageIdentifierError> {
176 Ok(parser::parse_language_identifier_from_iter(
177 iter,
178 allow_extension,
179 )?)
180 }
181
182 /// Consumes `LanguageIdentifier` and produces raw internal representations
183 /// of all subtags in form of `u64`/`u32`.
184 ///
185 /// Primarily used for storing internal representation and restoring via
186 /// `from_raw_parts_unchecked`.
187 ///
188 /// # Examples
189 ///
190 /// ```
191 /// use unic_langid_impl::LanguageIdentifier;
192 /// use tinystr::{TinyStr8, TinyStr4};
193 ///
194 /// let li: LanguageIdentifier = "en-US".parse()
195 /// .expect("Parsing failed.");
196 ///
197 /// let (lang, script, region, variants) = li.into_parts();
198 ///
199 /// // let li2 = LanguageIdentifier::from_raw_parts_unchecked(
200 /// // lang.map(|l| unsafe { TinyStr8::new_unchecked(l) }),
201 /// // script.map(|s| unsafe { TinyStr4::new_unchecked(s) }),
202 /// // region.map(|r| unsafe { TinyStr4::new_unchecked(r) }),
203 /// // variants.map(|v| v.into_iter().map(|v| unsafe { TinyStr8::new_unchecked(*v) }).collect()),
204 /// //);
205 ///
206 /// //assert_eq!(li2.to_string(), "en-US");
207 /// ```
208 pub fn into_parts(self) -> PartsTuple {
209 (
210 self.language,
211 self.script,
212 self.region,
213 self.variants.map_or_else(Vec::new, |v| v.to_vec()),
214 )
215 }
216
217 /// Compares a `LanguageIdentifier` to another `AsRef<LanguageIdentifier`
218 /// allowing for either side to use the missing fields as wildcards.
219 ///
220 /// This allows for matching between `en` (treated as `en-*-*-*`) and `en-US`.
221 ///
222 /// # Examples
223 ///
224 /// ```
225 /// use unic_langid_impl::LanguageIdentifier;
226 ///
227 /// let li1: LanguageIdentifier = "en".parse()
228 /// .expect("Parsing failed.");
229 ///
230 /// let li2: LanguageIdentifier = "en-US".parse()
231 /// .expect("Parsing failed.");
232 ///
233 /// assert_ne!(li1, li2); // "en" != "en-US"
234 /// assert_ne!(li1.to_string(), li2.to_string()); // "en" != "en-US"
235 ///
236 /// assert_eq!(li1.matches(&li2, false, false), false); // "en" != "en-US"
237 /// assert_eq!(li1.matches(&li2, true, false), true); // "en-*-*-*" == "en-US"
238 /// assert_eq!(li1.matches(&li2, false, true), false); // "en" != "en-*-US-*"
239 /// assert_eq!(li1.matches(&li2, true, true), true); // "en-*-*-*" == "en-*-US-*"
240 /// ```
241 pub fn matches<O: AsRef<Self>>(
242 &self,
243 other: &O,
244 self_as_range: bool,
245 other_as_range: bool,
246 ) -> bool {
247 let other = other.as_ref();
248 self.language
249 .matches(other.language, self_as_range, other_as_range)
250 && subtag_matches(&self.script, &other.script, self_as_range, other_as_range)
251 && subtag_matches(&self.region, &other.region, self_as_range, other_as_range)
252 && subtags_match(
253 &self.variants,
254 &other.variants,
255 self_as_range,
256 other_as_range,
257 )
258 }
259
260 /// Returns a vector of variants subtags of the `LanguageIdentifier`.
261 ///
262 /// # Examples
263 ///
264 /// ```
265 /// use unic_langid_impl::LanguageIdentifier;
266 ///
267 /// let li1: LanguageIdentifier = "ca-ES-valencia".parse()
268 /// .expect("Parsing failed.");
269 ///
270 /// assert_eq!(li1.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
271 ///
272 /// let li2: LanguageIdentifier = "de".parse()
273 /// .expect("Parsing failed.");
274 ///
275 /// assert_eq!(li2.variants().len(), 0);
276 /// ```
277 pub fn variants(&self) -> impl ExactSizeIterator<Item = &subtags::Variant> {
278 let variants: &[_] = match self.variants {
279 Some(ref v) => v,
280 None => &[],
281 };
282
283 variants.iter()
284 }
285
286 /// Sets variant subtags of the `LanguageIdentifier`.
287 ///
288 /// # Examples
289 ///
290 /// ```
291 /// use unic_langid_impl::LanguageIdentifier;
292 ///
293 /// let mut li: LanguageIdentifier = "ca-ES".parse()
294 /// .expect("Parsing failed.");
295 ///
296 /// li.set_variants(&["valencia".parse().expect("Parsing failed.")]);
297 ///
298 /// assert_eq!(li.to_string(), "ca-ES-valencia");
299 /// ```
300 pub fn set_variants(&mut self, variants: &[subtags::Variant]) {
301 let mut v = variants.to_vec();
302
303 if v.is_empty() {
304 self.variants = None;
305 } else {
306 v.sort_unstable();
307 v.dedup();
308 self.variants = Some(v.into_boxed_slice());
309 }
310 }
311
312 /// Tests if a variant subtag is present in the `LanguageIdentifier`.
313 ///
314 /// # Examples
315 ///
316 /// ```
317 /// use unic_langid_impl::LanguageIdentifier;
318 ///
319 /// let mut li: LanguageIdentifier = "ca-ES-macos".parse()
320 /// .expect("Parsing failed.");
321 ///
322 /// assert_eq!(li.has_variant("valencia".parse().unwrap()), false);
323 /// assert_eq!(li.has_variant("macos".parse().unwrap()), true);
324 /// ```
325 pub fn has_variant(&self, variant: subtags::Variant) -> bool {
326 if let Some(variants) = &self.variants {
327 variants.contains(&variant)
328 } else {
329 false
330 }
331 }
332
333 /// Clears variant subtags of the `LanguageIdentifier`.
334 ///
335 /// # Examples
336 ///
337 /// ```
338 /// use unic_langid_impl::LanguageIdentifier;
339 ///
340 /// let mut li: LanguageIdentifier = "ca-ES-valencia".parse()
341 /// .expect("Parsing failed.");
342 ///
343 /// li.clear_variants();
344 ///
345 /// assert_eq!(li.to_string(), "ca-ES");
346 /// ```
347 pub fn clear_variants(&mut self) {
348 self.variants = None;
349 }
350
351 /// Extends the `LanguageIdentifier` adding likely subtags based
352 /// on tables provided by CLDR.
353 ///
354 /// # Examples
355 ///
356 /// ```
357 /// use unic_langid_impl::LanguageIdentifier;
358 ///
359 /// let mut li: LanguageIdentifier = "en-US".parse()
360 /// .expect("Parsing failed.");
361 ///
362 /// assert_eq!(li.maximize(), true);
363 /// assert_eq!(li.to_string(), "en-Latn-US");
364 /// ```
365 #[cfg(feature = "likelysubtags")]
366 pub fn maximize(&mut self) -> bool {
367 if let Some(new_li) = likelysubtags::maximize(self.language, self.script, self.region) {
368 self.language = new_li.0;
369 self.script = new_li.1;
370 self.region = new_li.2;
371 true
372 } else {
373 false
374 }
375 }
376
377 /// Extends the `LanguageIdentifier` removing likely subtags based
378 /// on tables provided by CLDR.
379 ///
380 /// # Examples
381 ///
382 /// ```
383 /// use unic_langid_impl::LanguageIdentifier;
384 ///
385 /// let mut li: LanguageIdentifier = "en-Latn-US".parse()
386 /// .expect("Parsing failed.");
387 ///
388 /// assert_eq!(li.minimize(), true);
389 /// assert_eq!(li.to_string(), "en");
390 /// ```
391 #[cfg(feature = "likelysubtags")]
392 pub fn minimize(&mut self) -> bool {
393 if let Some(new_li) = likelysubtags::minimize(self.language, self.script, self.region) {
394 self.language = new_li.0;
395 self.script = new_li.1;
396 self.region = new_li.2;
397 true
398 } else {
399 false
400 }
401 }
402
403 /// Returns character direction of the `LanguageIdentifier`.
404 ///
405 /// # Examples
406 ///
407 /// ```
408 /// use unic_langid_impl::{LanguageIdentifier, CharacterDirection};
409 ///
410 /// let li1: LanguageIdentifier = "es-AR".parse()
411 /// .expect("Parsing failed.");
412 /// let li2: LanguageIdentifier = "fa".parse()
413 /// .expect("Parsing failed.");
414 ///
415 /// assert_eq!(li1.character_direction(), CharacterDirection::LTR);
416 /// assert_eq!(li2.character_direction(), CharacterDirection::RTL);
417 /// ```
418 pub fn character_direction(&self) -> CharacterDirection {
419 match (self.language.into(), self.script) {
420 (_, Some(script))
421 if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) =>
422 {
423 CharacterDirection::LTR
424 }
425 (_, Some(script))
426 if layout_table::SCRIPTS_CHARACTER_DIRECTION_RTL.contains(&script.into()) =>
427 {
428 CharacterDirection::RTL
429 }
430 (_, Some(script))
431 if layout_table::SCRIPTS_CHARACTER_DIRECTION_TTB.contains(&script.into()) =>
432 {
433 CharacterDirection::TTB
434 }
435 (Some(lang), _) if layout_table::LANGS_CHARACTER_DIRECTION_RTL.contains(&lang) => {
436 #[cfg(feature = "likelysubtags")]
437 if let Some((_, Some(script), _)) =
438 likelysubtags::maximize(self.language, None, self.region)
439 {
440 if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) {
441 return CharacterDirection::LTR;
442 }
443 }
444 CharacterDirection::RTL
445 }
446 _ => CharacterDirection::LTR,
447 }
448 }
449}
450
451impl FromStr for LanguageIdentifier {
452 type Err = LanguageIdentifierError;
453
454 fn from_str(source: &str) -> Result<Self, Self::Err> {
455 Self::from_bytes(source.as_bytes())
456 }
457}
458
459impl AsRef<LanguageIdentifier> for LanguageIdentifier {
460 #[inline(always)]
461 fn as_ref(&self) -> &LanguageIdentifier {
462 self
463 }
464}
465
466impl std::fmt::Display for LanguageIdentifier {
467 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
468 self.language.fmt(f)?;
469 if let Some(ref script: &Script) = self.script {
470 f.write_char('-')?;
471 script.fmt(f)?;
472 }
473 if let Some(ref region: &Region) = self.region {
474 f.write_char('-')?;
475 region.fmt(f)?;
476 }
477 if let Some(variants: &Box<[Variant]>) = &self.variants {
478 for variant: &Variant in variants.iter() {
479 f.write_char('-')?;
480 variant.fmt(f)?;
481 }
482 }
483 Ok(())
484 }
485}
486
487impl PartialEq<&str> for LanguageIdentifier {
488 fn eq(&self, other: &&str) -> bool {
489 self.to_string().as_str() == *other
490 }
491}
492
/// Compares two optional subtags, optionally treating a missing one as a wildcard.
///
/// Returns `true` when `subtag1` is `None` and `as_range1` is set, when
/// `subtag2` is `None` and `as_range2` is set, or when both options are equal.
fn subtag_matches<P: PartialEq>(
    subtag1: &Option<P>,
    subtag2: &Option<P>,
    as_range1: bool,
    as_range2: bool,
) -> bool {
    // A missing subtag on a side that acts as a range matches anything.
    if as_range1 && subtag1.is_none() {
        return true;
    }
    if as_range2 && subtag2.is_none() {
        return true;
    }
    subtag1 == subtag2
}
501
/// Returns `true` when `subtag` holds no elements, i.e. it is either `None`
/// or `Some` of an empty slice.
fn is_option_empty<P: PartialEq>(subtag: &Option<Box<[P]>>) -> bool {
    subtag.as_ref().map_or(true, |list| list.is_empty())
}
505
506fn subtags_match<P: PartialEq>(
507 subtag1: &Option<Box<[P]>>,
508 subtag2: &Option<Box<[P]>>,
509 as_range1: bool,
510 as_range2: bool,
511) -> bool {
512 // or is some and is empty!
513 (as_range1 && is_option_empty(subtag:subtag1))
514 || (as_range2 && is_option_empty(subtag:subtag2))
515 || subtag1 == subtag2
516}
517
518/// This is a best-effort operation that performs all available levels of canonicalization.
519///
520/// At the moment the operation will normalize casing and the separator, but in the future
521/// it may also validate and update from deprecated subtags to canonical ones.
522///
523/// # Examples
524///
525/// ```
526/// use unic_langid_impl::canonicalize;
527///
528/// assert_eq!(canonicalize("pL_latn_pl"), Ok("pl-Latn-PL".to_string()));
529/// ```
530pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, LanguageIdentifierError> {
531 let lang_id: LanguageIdentifier = LanguageIdentifier::from_bytes(input.as_ref())?;
532 Ok(lang_id.to_string())
533}
534
// A subtag containing non-ASCII characters must be rejected by the parser.
#[test]
fn invalid_subtag() {
    let parsed = LanguageIdentifier::from_bytes("en-ÁÁÁÁ".as_bytes());
    assert!(parsed.is_err());
}
539