1 | mod errors; |
2 | mod layout_table; |
3 | #[cfg (feature = "likelysubtags" )] |
4 | pub mod likelysubtags; |
5 | #[doc (hidden)] |
6 | pub mod parser; |
7 | #[cfg (feature = "serde" )] |
8 | mod serde; |
9 | pub mod subtags; |
10 | |
11 | pub use crate::errors::LanguageIdentifierError; |
12 | use std::fmt::Write; |
13 | use std::iter::Peekable; |
14 | use std::str::FromStr; |
15 | |
/// Enum representing available character direction orientations.
///
/// The enum carries no data, so besides being `Copy` it also implements
/// `Eq` and `Hash`, which lets it be used as a key in hash-based collections.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum CharacterDirection {
    /// Right To Left
    ///
    /// Used in languages such as Arabic, Hebrew, Fula, Kurdish etc.
    RTL,
    /// Left To Right
    ///
    /// Used in languages such as French, Spanish, English, German etc.
    LTR,
    /// Top To Bottom
    ///
    /// Used in Traditional Mongolian
    TTB,
}
32 | |
33 | type PartsTuple = ( |
34 | subtags::Language, |
35 | Option<subtags::Script>, |
36 | Option<subtags::Region>, |
37 | Vec<subtags::Variant>, |
38 | ); |
39 | |
40 | /// `LanguageIdentifier` is a core struct representing a Unicode Language Identifier. |
41 | /// |
42 | /// # Examples |
43 | /// |
44 | /// ``` |
45 | /// use unic_langid_impl::LanguageIdentifier; |
46 | /// |
47 | /// let li: LanguageIdentifier = "en-US" .parse() |
48 | /// .expect("Failed to parse." ); |
49 | /// |
50 | /// assert_eq!(li.language, "en" ); |
51 | /// assert_eq!(li.script, None); |
52 | /// assert_eq!(li.region.as_ref().map(Into::into), Some("US" )); |
53 | /// assert_eq!(li.variants().len(), 0); |
54 | /// ``` |
55 | /// |
56 | /// # Parsing |
57 | /// |
58 | /// Unicode recognizes three levels of standard conformance for any language identifier: |
59 | /// |
60 | /// * *well-formed* - syntactically correct |
61 | /// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types... |
62 | /// * *canonical* - valid and no deprecated codes or structure. |
63 | /// |
64 | /// At the moment parsing normalizes a well-formed language identifier converting |
65 | /// `_` separators to `-` and adjusting casing to conform to the Unicode standard. |
66 | /// |
67 | /// Any bogus subtags will cause the parsing to fail with an error. |
68 | /// No subtag validation is performed. |
69 | /// |
70 | /// # Examples: |
71 | /// |
72 | /// ``` |
73 | /// use unic_langid_impl::LanguageIdentifier; |
74 | /// |
75 | /// let li: LanguageIdentifier = "eN_latn_Us-Valencia" .parse() |
76 | /// .expect("Failed to parse." ); |
77 | /// |
78 | /// assert_eq!(li.language, "en" ); |
79 | /// assert_eq!(li.script.as_ref().map(Into::into), Some("Latn" )); |
80 | /// assert_eq!(li.region.as_ref().map(Into::into), Some("US" )); |
81 | /// assert_eq!(li.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia" ]); |
82 | /// ``` |
83 | #[derive (Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)] |
84 | pub struct LanguageIdentifier { |
85 | pub language: subtags::Language, |
86 | pub script: Option<subtags::Script>, |
87 | pub region: Option<subtags::Region>, |
88 | variants: Option<Box<[subtags::Variant]>>, |
89 | } |
90 | |
91 | impl LanguageIdentifier { |
92 | /// A constructor which takes a utf8 slice, parses it and |
93 | /// produces a well-formed `LanguageIdentifier`. |
94 | /// |
95 | /// # Examples |
96 | /// |
97 | /// ``` |
98 | /// use unic_langid_impl::LanguageIdentifier; |
99 | /// |
100 | /// let li = LanguageIdentifier::from_bytes("en-US" .as_bytes()) |
101 | /// .expect("Parsing failed." ); |
102 | /// |
103 | /// assert_eq!(li.to_string(), "en-US" ); |
104 | /// ``` |
105 | pub fn from_bytes(v: &[u8]) -> Result<Self, LanguageIdentifierError> { |
106 | Ok(parser::parse_language_identifier(v)?) |
107 | } |
108 | |
109 | /// A constructor which takes optional subtags as `AsRef<[u8]>`, parses them and |
110 | /// produces a well-formed `LanguageIdentifier`. |
111 | /// |
112 | /// # Examples |
113 | /// |
114 | /// ``` |
115 | /// use unic_langid_impl::LanguageIdentifier; |
116 | /// |
117 | /// let li = LanguageIdentifier::from_parts( |
118 | /// "fr" .parse().expect("Parsing failed." ), |
119 | /// None, |
120 | /// Some("CA" .parse().expect("Parsing failed." )), |
121 | /// &[] |
122 | /// ); |
123 | /// |
124 | /// assert_eq!(li.to_string(), "fr-CA" ); |
125 | /// ``` |
126 | pub fn from_parts( |
127 | language: subtags::Language, |
128 | script: Option<subtags::Script>, |
129 | region: Option<subtags::Region>, |
130 | variants: &[subtags::Variant], |
131 | ) -> Self { |
132 | let variants = if !variants.is_empty() { |
133 | let mut v = variants.to_vec(); |
134 | v.sort_unstable(); |
135 | v.dedup(); |
136 | Some(v.into_boxed_slice()) |
137 | } else { |
138 | None |
139 | }; |
140 | |
141 | Self { |
142 | language, |
143 | script, |
144 | region, |
145 | variants, |
146 | } |
147 | } |
148 | |
149 | /// # Unchecked |
150 | /// |
151 | /// This function accepts subtags expecting variants |
152 | /// to be deduplicated and ordered. |
153 | pub const fn from_raw_parts_unchecked( |
154 | language: subtags::Language, |
155 | script: Option<subtags::Script>, |
156 | region: Option<subtags::Region>, |
157 | variants: Option<Box<[subtags::Variant]>>, |
158 | ) -> Self { |
159 | Self { |
160 | language, |
161 | script, |
162 | region, |
163 | variants, |
164 | } |
165 | } |
166 | |
167 | #[doc (hidden)] |
168 | /// This method is used by `unic-locale` to handle partial |
169 | /// subtag iterator. |
170 | /// |
171 | /// Not stable. |
172 | pub fn try_from_iter<'a>( |
173 | iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>, |
174 | allow_extension: bool, |
175 | ) -> Result<LanguageIdentifier, LanguageIdentifierError> { |
176 | Ok(parser::parse_language_identifier_from_iter( |
177 | iter, |
178 | allow_extension, |
179 | )?) |
180 | } |
181 | |
182 | /// Consumes `LanguageIdentifier` and produces raw internal representations |
183 | /// of all subtags in form of `u64`/`u32`. |
184 | /// |
185 | /// Primarily used for storing internal representation and restoring via |
186 | /// `from_raw_parts_unchecked`. |
187 | /// |
188 | /// # Examples |
189 | /// |
190 | /// ``` |
191 | /// use unic_langid_impl::LanguageIdentifier; |
192 | /// use tinystr::{TinyStr8, TinyStr4}; |
193 | /// |
194 | /// let li: LanguageIdentifier = "en-US" .parse() |
195 | /// .expect("Parsing failed." ); |
196 | /// |
197 | /// let (lang, script, region, variants) = li.into_parts(); |
198 | /// |
199 | /// // let li2 = LanguageIdentifier::from_raw_parts_unchecked( |
200 | /// // lang.map(|l| unsafe { TinyStr8::new_unchecked(l) }), |
201 | /// // script.map(|s| unsafe { TinyStr4::new_unchecked(s) }), |
202 | /// // region.map(|r| unsafe { TinyStr4::new_unchecked(r) }), |
203 | /// // variants.map(|v| v.into_iter().map(|v| unsafe { TinyStr8::new_unchecked(*v) }).collect()), |
204 | /// //); |
205 | /// |
206 | /// //assert_eq!(li2.to_string(), "en-US"); |
207 | /// ``` |
208 | pub fn into_parts(self) -> PartsTuple { |
209 | ( |
210 | self.language, |
211 | self.script, |
212 | self.region, |
213 | self.variants.map_or_else(Vec::new, |v| v.to_vec()), |
214 | ) |
215 | } |
216 | |
217 | /// Compares a `LanguageIdentifier` to another `AsRef<LanguageIdentifier` |
218 | /// allowing for either side to use the missing fields as wildcards. |
219 | /// |
220 | /// This allows for matching between `en` (treated as `en-*-*-*`) and `en-US`. |
221 | /// |
222 | /// # Examples |
223 | /// |
224 | /// ``` |
225 | /// use unic_langid_impl::LanguageIdentifier; |
226 | /// |
227 | /// let li1: LanguageIdentifier = "en" .parse() |
228 | /// .expect("Parsing failed." ); |
229 | /// |
230 | /// let li2: LanguageIdentifier = "en-US" .parse() |
231 | /// .expect("Parsing failed." ); |
232 | /// |
233 | /// assert_ne!(li1, li2); // "en" != "en-US" |
234 | /// assert_ne!(li1.to_string(), li2.to_string()); // "en" != "en-US" |
235 | /// |
236 | /// assert_eq!(li1.matches(&li2, false, false), false); // "en" != "en-US" |
237 | /// assert_eq!(li1.matches(&li2, true, false), true); // "en-*-*-*" == "en-US" |
238 | /// assert_eq!(li1.matches(&li2, false, true), false); // "en" != "en-*-US-*" |
239 | /// assert_eq!(li1.matches(&li2, true, true), true); // "en-*-*-*" == "en-*-US-*" |
240 | /// ``` |
241 | pub fn matches<O: AsRef<Self>>( |
242 | &self, |
243 | other: &O, |
244 | self_as_range: bool, |
245 | other_as_range: bool, |
246 | ) -> bool { |
247 | let other = other.as_ref(); |
248 | self.language |
249 | .matches(other.language, self_as_range, other_as_range) |
250 | && subtag_matches(&self.script, &other.script, self_as_range, other_as_range) |
251 | && subtag_matches(&self.region, &other.region, self_as_range, other_as_range) |
252 | && subtags_match( |
253 | &self.variants, |
254 | &other.variants, |
255 | self_as_range, |
256 | other_as_range, |
257 | ) |
258 | } |
259 | |
260 | /// Returns a vector of variants subtags of the `LanguageIdentifier`. |
261 | /// |
262 | /// # Examples |
263 | /// |
264 | /// ``` |
265 | /// use unic_langid_impl::LanguageIdentifier; |
266 | /// |
267 | /// let li1: LanguageIdentifier = "ca-ES-valencia" .parse() |
268 | /// .expect("Parsing failed." ); |
269 | /// |
270 | /// assert_eq!(li1.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia" ]); |
271 | /// |
272 | /// let li2: LanguageIdentifier = "de" .parse() |
273 | /// .expect("Parsing failed." ); |
274 | /// |
275 | /// assert_eq!(li2.variants().len(), 0); |
276 | /// ``` |
277 | pub fn variants(&self) -> impl ExactSizeIterator<Item = &subtags::Variant> { |
278 | let variants: &[_] = match self.variants { |
279 | Some(ref v) => v, |
280 | None => &[], |
281 | }; |
282 | |
283 | variants.iter() |
284 | } |
285 | |
286 | /// Sets variant subtags of the `LanguageIdentifier`. |
287 | /// |
288 | /// # Examples |
289 | /// |
290 | /// ``` |
291 | /// use unic_langid_impl::LanguageIdentifier; |
292 | /// |
293 | /// let mut li: LanguageIdentifier = "ca-ES" .parse() |
294 | /// .expect("Parsing failed." ); |
295 | /// |
296 | /// li.set_variants(&["valencia" .parse().expect("Parsing failed." )]); |
297 | /// |
298 | /// assert_eq!(li.to_string(), "ca-ES-valencia" ); |
299 | /// ``` |
300 | pub fn set_variants(&mut self, variants: &[subtags::Variant]) { |
301 | let mut v = variants.to_vec(); |
302 | |
303 | if v.is_empty() { |
304 | self.variants = None; |
305 | } else { |
306 | v.sort_unstable(); |
307 | v.dedup(); |
308 | self.variants = Some(v.into_boxed_slice()); |
309 | } |
310 | } |
311 | |
312 | /// Tests if a variant subtag is present in the `LanguageIdentifier`. |
313 | /// |
314 | /// # Examples |
315 | /// |
316 | /// ``` |
317 | /// use unic_langid_impl::LanguageIdentifier; |
318 | /// |
319 | /// let mut li: LanguageIdentifier = "ca-ES-macos" .parse() |
320 | /// .expect("Parsing failed." ); |
321 | /// |
322 | /// assert_eq!(li.has_variant("valencia" .parse().unwrap()), false); |
323 | /// assert_eq!(li.has_variant("macos" .parse().unwrap()), true); |
324 | /// ``` |
325 | pub fn has_variant(&self, variant: subtags::Variant) -> bool { |
326 | if let Some(variants) = &self.variants { |
327 | variants.contains(&variant) |
328 | } else { |
329 | false |
330 | } |
331 | } |
332 | |
333 | /// Clears variant subtags of the `LanguageIdentifier`. |
334 | /// |
335 | /// # Examples |
336 | /// |
337 | /// ``` |
338 | /// use unic_langid_impl::LanguageIdentifier; |
339 | /// |
340 | /// let mut li: LanguageIdentifier = "ca-ES-valencia" .parse() |
341 | /// .expect("Parsing failed." ); |
342 | /// |
343 | /// li.clear_variants(); |
344 | /// |
345 | /// assert_eq!(li.to_string(), "ca-ES" ); |
346 | /// ``` |
347 | pub fn clear_variants(&mut self) { |
348 | self.variants = None; |
349 | } |
350 | |
351 | /// Extends the `LanguageIdentifier` adding likely subtags based |
352 | /// on tables provided by CLDR. |
353 | /// |
354 | /// # Examples |
355 | /// |
356 | /// ``` |
357 | /// use unic_langid_impl::LanguageIdentifier; |
358 | /// |
359 | /// let mut li: LanguageIdentifier = "en-US".parse() |
360 | /// .expect("Parsing failed."); |
361 | /// |
362 | /// assert_eq!(li.maximize(), true); |
363 | /// assert_eq!(li.to_string(), "en-Latn-US"); |
364 | /// ``` |
365 | #[cfg (feature = "likelysubtags" )] |
366 | pub fn maximize(&mut self) -> bool { |
367 | if let Some(new_li) = likelysubtags::maximize(self.language, self.script, self.region) { |
368 | self.language = new_li.0; |
369 | self.script = new_li.1; |
370 | self.region = new_li.2; |
371 | true |
372 | } else { |
373 | false |
374 | } |
375 | } |
376 | |
377 | /// Extends the `LanguageIdentifier` removing likely subtags based |
378 | /// on tables provided by CLDR. |
379 | /// |
380 | /// # Examples |
381 | /// |
382 | /// ``` |
383 | /// use unic_langid_impl::LanguageIdentifier; |
384 | /// |
385 | /// let mut li: LanguageIdentifier = "en-Latn-US".parse() |
386 | /// .expect("Parsing failed."); |
387 | /// |
388 | /// assert_eq!(li.minimize(), true); |
389 | /// assert_eq!(li.to_string(), "en"); |
390 | /// ``` |
391 | #[cfg (feature = "likelysubtags" )] |
392 | pub fn minimize(&mut self) -> bool { |
393 | if let Some(new_li) = likelysubtags::minimize(self.language, self.script, self.region) { |
394 | self.language = new_li.0; |
395 | self.script = new_li.1; |
396 | self.region = new_li.2; |
397 | true |
398 | } else { |
399 | false |
400 | } |
401 | } |
402 | |
403 | /// Returns character direction of the `LanguageIdentifier`. |
404 | /// |
405 | /// # Examples |
406 | /// |
407 | /// ``` |
408 | /// use unic_langid_impl::{LanguageIdentifier, CharacterDirection}; |
409 | /// |
410 | /// let li1: LanguageIdentifier = "es-AR" .parse() |
411 | /// .expect("Parsing failed." ); |
412 | /// let li2: LanguageIdentifier = "fa" .parse() |
413 | /// .expect("Parsing failed." ); |
414 | /// |
415 | /// assert_eq!(li1.character_direction(), CharacterDirection::LTR); |
416 | /// assert_eq!(li2.character_direction(), CharacterDirection::RTL); |
417 | /// ``` |
418 | pub fn character_direction(&self) -> CharacterDirection { |
419 | match (self.language.into(), self.script) { |
420 | (_, Some(script)) |
421 | if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) => |
422 | { |
423 | CharacterDirection::LTR |
424 | } |
425 | (_, Some(script)) |
426 | if layout_table::SCRIPTS_CHARACTER_DIRECTION_RTL.contains(&script.into()) => |
427 | { |
428 | CharacterDirection::RTL |
429 | } |
430 | (_, Some(script)) |
431 | if layout_table::SCRIPTS_CHARACTER_DIRECTION_TTB.contains(&script.into()) => |
432 | { |
433 | CharacterDirection::TTB |
434 | } |
435 | (Some(lang), _) if layout_table::LANGS_CHARACTER_DIRECTION_RTL.contains(&lang) => { |
436 | CharacterDirection::RTL |
437 | } |
438 | _ => CharacterDirection::LTR, |
439 | } |
440 | } |
441 | } |
442 | |
443 | impl FromStr for LanguageIdentifier { |
444 | type Err = LanguageIdentifierError; |
445 | |
446 | fn from_str(source: &str) -> Result<Self, Self::Err> { |
447 | Self::from_bytes(source.as_bytes()) |
448 | } |
449 | } |
450 | |
451 | impl AsRef<LanguageIdentifier> for LanguageIdentifier { |
452 | #[inline (always)] |
453 | fn as_ref(&self) -> &LanguageIdentifier { |
454 | self |
455 | } |
456 | } |
457 | |
458 | impl std::fmt::Display for LanguageIdentifier { |
459 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
460 | self.language.fmt(f)?; |
461 | if let Some(ref script: &Script) = self.script { |
462 | f.write_char('-' )?; |
463 | script.fmt(f)?; |
464 | } |
465 | if let Some(ref region: &Region) = self.region { |
466 | f.write_char('-' )?; |
467 | region.fmt(f)?; |
468 | } |
469 | if let Some(variants: &Box<[Variant]>) = &self.variants { |
470 | for variant: &Variant in variants.iter() { |
471 | f.write_char('-' )?; |
472 | variant.fmt(f)?; |
473 | } |
474 | } |
475 | Ok(()) |
476 | } |
477 | } |
478 | |
479 | impl PartialEq<&str> for LanguageIdentifier { |
480 | fn eq(&self, other: &&str) -> bool { |
481 | self.to_string().as_str() == *other |
482 | } |
483 | } |
484 | |
/// Compares two optional subtags, optionally treating a missing one as a wildcard.
///
/// A `None` on side 1 matches anything when `as_range1` is set, and likewise
/// for side 2 with `as_range2`; otherwise the two options must be equal.
fn subtag_matches<P: PartialEq>(
    subtag1: &Option<P>,
    subtag2: &Option<P>,
    as_range1: bool,
    as_range2: bool,
) -> bool {
    match (subtag1, subtag2) {
        (None, _) if as_range1 => true,
        (_, None) if as_range2 => true,
        (left, right) => left == right,
    }
}
493 | |
/// Returns `true` when the optional list of subtags holds no elements,
/// i.e. it is either `None` or `Some` of an empty slice.
fn is_option_empty<P: PartialEq>(subtag: &Option<Box<[P]>>) -> bool {
    match subtag {
        None => true,
        Some(items) => items.is_empty(),
    }
}
497 | |
498 | fn subtags_match<P: PartialEq>( |
499 | subtag1: &Option<Box<[P]>>, |
500 | subtag2: &Option<Box<[P]>>, |
501 | as_range1: bool, |
502 | as_range2: bool, |
503 | ) -> bool { |
504 | // or is some and is empty! |
505 | (as_range1 && is_option_empty(subtag:subtag1)) |
506 | || (as_range2 && is_option_empty(subtag:subtag2)) |
507 | || subtag1 == subtag2 |
508 | } |
509 | |
510 | /// This is a best-effort operation that performs all available levels of canonicalization. |
511 | /// |
512 | /// At the moment the operation will normalize casing and the separator, but in the future |
513 | /// it may also validate and update from deprecated subtags to canonical ones. |
514 | /// |
515 | /// # Examples |
516 | /// |
517 | /// ``` |
518 | /// use unic_langid_impl::canonicalize; |
519 | /// |
520 | /// assert_eq!(canonicalize("pL_latn_pl" ), Ok("pl-Latn-PL" .to_string())); |
521 | /// ``` |
522 | pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, LanguageIdentifierError> { |
523 | let lang_id: LanguageIdentifier = LanguageIdentifier::from_bytes(input.as_ref())?; |
524 | Ok(lang_id.to_string()) |
525 | } |
526 | |
/// Input whose second subtag consists of non-ASCII letters must be rejected.
#[test]
fn invalid_subtag() {
    let parsed = LanguageIdentifier::from_bytes("en-ÁÁÁÁ".as_bytes());
    assert!(parsed.is_err());
}
531 | |