1mod errors;
2mod layout_table;
3#[cfg(feature = "likelysubtags")]
4pub mod likelysubtags;
5#[doc(hidden)]
6pub mod parser;
7#[cfg(feature = "serde")]
8mod serde;
9pub mod subtags;
10
11pub use crate::errors::LanguageIdentifierError;
12use std::fmt::Write;
13use std::iter::Peekable;
14use std::str::FromStr;
15
/// Enum representing available character direction orientations.
///
/// Returned by `LanguageIdentifier::character_direction`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum CharacterDirection {
    /// Right To Left
    ///
    /// Used in languages such as Arabic, Hebrew, Fula, Kurdish etc.
    RTL,
    /// Left To Right
    ///
    /// Used in languages such as French, Spanish, English, German etc.
    LTR,
    /// Top To Bottom
    ///
    /// Used in Traditional Mongolian
    TTB,
}
32
33type PartsTuple = (
34 subtags::Language,
35 Option<subtags::Script>,
36 Option<subtags::Region>,
37 Vec<subtags::Variant>,
38);
39
40/// `LanguageIdentifier` is a core struct representing a Unicode Language Identifier.
41///
42/// # Examples
43///
44/// ```
45/// use unic_langid_impl::LanguageIdentifier;
46///
47/// let li: LanguageIdentifier = "en-US".parse()
48/// .expect("Failed to parse.");
49///
50/// assert_eq!(li.language, "en");
51/// assert_eq!(li.script, None);
52/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
53/// assert_eq!(li.variants().len(), 0);
54/// ```
55///
56/// # Parsing
57///
58/// Unicode recognizes three levels of standard conformance for any language identifier:
59///
60/// * *well-formed* - syntactically correct
61/// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
62/// * *canonical* - valid and no deprecated codes or structure.
63///
64/// At the moment parsing normalizes a well-formed language identifier converting
65/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
66///
67/// Any bogus subtags will cause the parsing to fail with an error.
68/// No subtag validation is performed.
69///
70/// # Examples:
71///
72/// ```
73/// use unic_langid_impl::LanguageIdentifier;
74///
75/// let li: LanguageIdentifier = "eN_latn_Us-Valencia".parse()
76/// .expect("Failed to parse.");
77///
78/// assert_eq!(li.language, "en");
79/// assert_eq!(li.script.as_ref().map(Into::into), Some("Latn"));
80/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
81/// assert_eq!(li.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
82/// ```
83#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
84pub struct LanguageIdentifier {
85 pub language: subtags::Language,
86 pub script: Option<subtags::Script>,
87 pub region: Option<subtags::Region>,
88 variants: Option<Box<[subtags::Variant]>>,
89}
90
91impl LanguageIdentifier {
92 /// A constructor which takes a utf8 slice, parses it and
93 /// produces a well-formed `LanguageIdentifier`.
94 ///
95 /// # Examples
96 ///
97 /// ```
98 /// use unic_langid_impl::LanguageIdentifier;
99 ///
100 /// let li = LanguageIdentifier::from_bytes("en-US".as_bytes())
101 /// .expect("Parsing failed.");
102 ///
103 /// assert_eq!(li.to_string(), "en-US");
104 /// ```
105 pub fn from_bytes(v: &[u8]) -> Result<Self, LanguageIdentifierError> {
106 Ok(parser::parse_language_identifier(v)?)
107 }
108
109 /// A constructor which takes optional subtags as `AsRef<[u8]>`, parses them and
110 /// produces a well-formed `LanguageIdentifier`.
111 ///
112 /// # Examples
113 ///
114 /// ```
115 /// use unic_langid_impl::LanguageIdentifier;
116 ///
117 /// let li = LanguageIdentifier::from_parts(
118 /// "fr".parse().expect("Parsing failed."),
119 /// None,
120 /// Some("CA".parse().expect("Parsing failed.")),
121 /// &[]
122 /// );
123 ///
124 /// assert_eq!(li.to_string(), "fr-CA");
125 /// ```
126 pub fn from_parts(
127 language: subtags::Language,
128 script: Option<subtags::Script>,
129 region: Option<subtags::Region>,
130 variants: &[subtags::Variant],
131 ) -> Self {
132 let variants = if !variants.is_empty() {
133 let mut v = variants.to_vec();
134 v.sort_unstable();
135 v.dedup();
136 Some(v.into_boxed_slice())
137 } else {
138 None
139 };
140
141 Self {
142 language,
143 script,
144 region,
145 variants,
146 }
147 }
148
149 /// # Unchecked
150 ///
151 /// This function accepts subtags expecting variants
152 /// to be deduplicated and ordered.
153 pub const fn from_raw_parts_unchecked(
154 language: subtags::Language,
155 script: Option<subtags::Script>,
156 region: Option<subtags::Region>,
157 variants: Option<Box<[subtags::Variant]>>,
158 ) -> Self {
159 Self {
160 language,
161 script,
162 region,
163 variants,
164 }
165 }
166
167 #[doc(hidden)]
168 /// This method is used by `unic-locale` to handle partial
169 /// subtag iterator.
170 ///
171 /// Not stable.
172 pub fn try_from_iter<'a>(
173 iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>,
174 allow_extension: bool,
175 ) -> Result<LanguageIdentifier, LanguageIdentifierError> {
176 Ok(parser::parse_language_identifier_from_iter(
177 iter,
178 allow_extension,
179 )?)
180 }
181
182 /// Consumes `LanguageIdentifier` and produces raw internal representations
183 /// of all subtags in form of `u64`/`u32`.
184 ///
185 /// Primarily used for storing internal representation and restoring via
186 /// `from_raw_parts_unchecked`.
187 ///
188 /// # Examples
189 ///
190 /// ```
191 /// use unic_langid_impl::LanguageIdentifier;
192 /// use tinystr::{TinyStr8, TinyStr4};
193 ///
194 /// let li: LanguageIdentifier = "en-US".parse()
195 /// .expect("Parsing failed.");
196 ///
197 /// let (lang, script, region, variants) = li.into_parts();
198 ///
199 /// // let li2 = LanguageIdentifier::from_raw_parts_unchecked(
200 /// // lang.map(|l| unsafe { TinyStr8::new_unchecked(l) }),
201 /// // script.map(|s| unsafe { TinyStr4::new_unchecked(s) }),
202 /// // region.map(|r| unsafe { TinyStr4::new_unchecked(r) }),
203 /// // variants.map(|v| v.into_iter().map(|v| unsafe { TinyStr8::new_unchecked(*v) }).collect()),
204 /// //);
205 ///
206 /// //assert_eq!(li2.to_string(), "en-US");
207 /// ```
208 pub fn into_parts(self) -> PartsTuple {
209 (
210 self.language,
211 self.script,
212 self.region,
213 self.variants.map_or_else(Vec::new, |v| v.to_vec()),
214 )
215 }
216
217 /// Compares a `LanguageIdentifier` to another `AsRef<LanguageIdentifier`
218 /// allowing for either side to use the missing fields as wildcards.
219 ///
220 /// This allows for matching between `en` (treated as `en-*-*-*`) and `en-US`.
221 ///
222 /// # Examples
223 ///
224 /// ```
225 /// use unic_langid_impl::LanguageIdentifier;
226 ///
227 /// let li1: LanguageIdentifier = "en".parse()
228 /// .expect("Parsing failed.");
229 ///
230 /// let li2: LanguageIdentifier = "en-US".parse()
231 /// .expect("Parsing failed.");
232 ///
233 /// assert_ne!(li1, li2); // "en" != "en-US"
234 /// assert_ne!(li1.to_string(), li2.to_string()); // "en" != "en-US"
235 ///
236 /// assert_eq!(li1.matches(&li2, false, false), false); // "en" != "en-US"
237 /// assert_eq!(li1.matches(&li2, true, false), true); // "en-*-*-*" == "en-US"
238 /// assert_eq!(li1.matches(&li2, false, true), false); // "en" != "en-*-US-*"
239 /// assert_eq!(li1.matches(&li2, true, true), true); // "en-*-*-*" == "en-*-US-*"
240 /// ```
241 pub fn matches<O: AsRef<Self>>(
242 &self,
243 other: &O,
244 self_as_range: bool,
245 other_as_range: bool,
246 ) -> bool {
247 let other = other.as_ref();
248 self.language
249 .matches(other.language, self_as_range, other_as_range)
250 && subtag_matches(&self.script, &other.script, self_as_range, other_as_range)
251 && subtag_matches(&self.region, &other.region, self_as_range, other_as_range)
252 && subtags_match(
253 &self.variants,
254 &other.variants,
255 self_as_range,
256 other_as_range,
257 )
258 }
259
260 /// Returns a vector of variants subtags of the `LanguageIdentifier`.
261 ///
262 /// # Examples
263 ///
264 /// ```
265 /// use unic_langid_impl::LanguageIdentifier;
266 ///
267 /// let li1: LanguageIdentifier = "ca-ES-valencia".parse()
268 /// .expect("Parsing failed.");
269 ///
270 /// assert_eq!(li1.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
271 ///
272 /// let li2: LanguageIdentifier = "de".parse()
273 /// .expect("Parsing failed.");
274 ///
275 /// assert_eq!(li2.variants().len(), 0);
276 /// ```
277 pub fn variants(&self) -> impl ExactSizeIterator<Item = &subtags::Variant> {
278 let variants: &[_] = match self.variants {
279 Some(ref v) => v,
280 None => &[],
281 };
282
283 variants.iter()
284 }
285
286 /// Sets variant subtags of the `LanguageIdentifier`.
287 ///
288 /// # Examples
289 ///
290 /// ```
291 /// use unic_langid_impl::LanguageIdentifier;
292 ///
293 /// let mut li: LanguageIdentifier = "ca-ES".parse()
294 /// .expect("Parsing failed.");
295 ///
296 /// li.set_variants(&["valencia".parse().expect("Parsing failed.")]);
297 ///
298 /// assert_eq!(li.to_string(), "ca-ES-valencia");
299 /// ```
300 pub fn set_variants(&mut self, variants: &[subtags::Variant]) {
301 let mut v = variants.to_vec();
302
303 if v.is_empty() {
304 self.variants = None;
305 } else {
306 v.sort_unstable();
307 v.dedup();
308 self.variants = Some(v.into_boxed_slice());
309 }
310 }
311
312 /// Tests if a variant subtag is present in the `LanguageIdentifier`.
313 ///
314 /// # Examples
315 ///
316 /// ```
317 /// use unic_langid_impl::LanguageIdentifier;
318 ///
319 /// let mut li: LanguageIdentifier = "ca-ES-macos".parse()
320 /// .expect("Parsing failed.");
321 ///
322 /// assert_eq!(li.has_variant("valencia".parse().unwrap()), false);
323 /// assert_eq!(li.has_variant("macos".parse().unwrap()), true);
324 /// ```
325 pub fn has_variant(&self, variant: subtags::Variant) -> bool {
326 if let Some(variants) = &self.variants {
327 variants.contains(&variant)
328 } else {
329 false
330 }
331 }
332
333 /// Clears variant subtags of the `LanguageIdentifier`.
334 ///
335 /// # Examples
336 ///
337 /// ```
338 /// use unic_langid_impl::LanguageIdentifier;
339 ///
340 /// let mut li: LanguageIdentifier = "ca-ES-valencia".parse()
341 /// .expect("Parsing failed.");
342 ///
343 /// li.clear_variants();
344 ///
345 /// assert_eq!(li.to_string(), "ca-ES");
346 /// ```
347 pub fn clear_variants(&mut self) {
348 self.variants = None;
349 }
350
351 /// Extends the `LanguageIdentifier` adding likely subtags based
352 /// on tables provided by CLDR.
353 ///
354 /// # Examples
355 ///
356 /// ```
357 /// use unic_langid_impl::LanguageIdentifier;
358 ///
359 /// let mut li: LanguageIdentifier = "en-US".parse()
360 /// .expect("Parsing failed.");
361 ///
362 /// assert_eq!(li.maximize(), true);
363 /// assert_eq!(li.to_string(), "en-Latn-US");
364 /// ```
365 #[cfg(feature = "likelysubtags")]
366 pub fn maximize(&mut self) -> bool {
367 if let Some(new_li) = likelysubtags::maximize(self.language, self.script, self.region) {
368 self.language = new_li.0;
369 self.script = new_li.1;
370 self.region = new_li.2;
371 true
372 } else {
373 false
374 }
375 }
376
377 /// Extends the `LanguageIdentifier` removing likely subtags based
378 /// on tables provided by CLDR.
379 ///
380 /// # Examples
381 ///
382 /// ```
383 /// use unic_langid_impl::LanguageIdentifier;
384 ///
385 /// let mut li: LanguageIdentifier = "en-Latn-US".parse()
386 /// .expect("Parsing failed.");
387 ///
388 /// assert_eq!(li.minimize(), true);
389 /// assert_eq!(li.to_string(), "en");
390 /// ```
391 #[cfg(feature = "likelysubtags")]
392 pub fn minimize(&mut self) -> bool {
393 if let Some(new_li) = likelysubtags::minimize(self.language, self.script, self.region) {
394 self.language = new_li.0;
395 self.script = new_li.1;
396 self.region = new_li.2;
397 true
398 } else {
399 false
400 }
401 }
402
403 /// Returns character direction of the `LanguageIdentifier`.
404 ///
405 /// # Examples
406 ///
407 /// ```
408 /// use unic_langid_impl::{LanguageIdentifier, CharacterDirection};
409 ///
410 /// let li1: LanguageIdentifier = "es-AR".parse()
411 /// .expect("Parsing failed.");
412 /// let li2: LanguageIdentifier = "fa".parse()
413 /// .expect("Parsing failed.");
414 ///
415 /// assert_eq!(li1.character_direction(), CharacterDirection::LTR);
416 /// assert_eq!(li2.character_direction(), CharacterDirection::RTL);
417 /// ```
418 pub fn character_direction(&self) -> CharacterDirection {
419 match (self.language.into(), self.script) {
420 (_, Some(script))
421 if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) =>
422 {
423 CharacterDirection::LTR
424 }
425 (_, Some(script))
426 if layout_table::SCRIPTS_CHARACTER_DIRECTION_RTL.contains(&script.into()) =>
427 {
428 CharacterDirection::RTL
429 }
430 (_, Some(script))
431 if layout_table::SCRIPTS_CHARACTER_DIRECTION_TTB.contains(&script.into()) =>
432 {
433 CharacterDirection::TTB
434 }
435 (Some(lang), _) if layout_table::LANGS_CHARACTER_DIRECTION_RTL.contains(&lang) => {
436 CharacterDirection::RTL
437 }
438 _ => CharacterDirection::LTR,
439 }
440 }
441}
442
443impl FromStr for LanguageIdentifier {
444 type Err = LanguageIdentifierError;
445
446 fn from_str(source: &str) -> Result<Self, Self::Err> {
447 Self::from_bytes(source.as_bytes())
448 }
449}
450
451impl AsRef<LanguageIdentifier> for LanguageIdentifier {
452 #[inline(always)]
453 fn as_ref(&self) -> &LanguageIdentifier {
454 self
455 }
456}
457
458impl std::fmt::Display for LanguageIdentifier {
459 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
460 self.language.fmt(f)?;
461 if let Some(ref script: &Script) = self.script {
462 f.write_char('-')?;
463 script.fmt(f)?;
464 }
465 if let Some(ref region: &Region) = self.region {
466 f.write_char('-')?;
467 region.fmt(f)?;
468 }
469 if let Some(variants: &Box<[Variant]>) = &self.variants {
470 for variant: &Variant in variants.iter() {
471 f.write_char('-')?;
472 variant.fmt(f)?;
473 }
474 }
475 Ok(())
476 }
477}
478
479impl PartialEq<&str> for LanguageIdentifier {
480 fn eq(&self, other: &&str) -> bool {
481 self.to_string().as_str() == *other
482 }
483}
484
/// Compares two optional subtags, optionally treating a missing one as a wildcard.
///
/// Returns `true` when `as_range1` is set and `subtag1` is `None`, when
/// `as_range2` is set and `subtag2` is `None`, or when both subtags are equal.
fn subtag_matches<P: PartialEq>(
    subtag1: &Option<P>,
    subtag2: &Option<P>,
    as_range1: bool,
    as_range2: bool,
) -> bool {
    // A missing subtag on a "range" side matches anything.
    if as_range1 && subtag1.is_none() {
        return true;
    }
    if as_range2 && subtag2.is_none() {
        return true;
    }
    subtag1 == subtag2
}
493
/// Returns `true` when `subtag` is `None` or holds an empty slice.
fn is_option_empty<P: PartialEq>(subtag: &Option<Box<[P]>>) -> bool {
    subtag.as_ref().map_or(true, |t| t.is_empty())
}
497
498fn subtags_match<P: PartialEq>(
499 subtag1: &Option<Box<[P]>>,
500 subtag2: &Option<Box<[P]>>,
501 as_range1: bool,
502 as_range2: bool,
503) -> bool {
504 // or is some and is empty!
505 (as_range1 && is_option_empty(subtag:subtag1))
506 || (as_range2 && is_option_empty(subtag:subtag2))
507 || subtag1 == subtag2
508}
509
510/// This is a best-effort operation that performs all available levels of canonicalization.
511///
512/// At the moment the operation will normalize casing and the separator, but in the future
513/// it may also validate and update from deprecated subtags to canonical ones.
514///
515/// # Examples
516///
517/// ```
518/// use unic_langid_impl::canonicalize;
519///
520/// assert_eq!(canonicalize("pL_latn_pl"), Ok("pl-Latn-PL".to_string()));
521/// ```
522pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, LanguageIdentifierError> {
523 let lang_id: LanguageIdentifier = LanguageIdentifier::from_bytes(input.as_ref())?;
524 Ok(lang_id.to_string())
525}
526
#[test]
fn invalid_subtag() {
    // A subtag made of non-ASCII letters must fail to parse.
    assert!(LanguageIdentifier::from_bytes("en-ÁÁÁÁ".as_bytes()).is_err());
}
531