lib.rs source code [crates/unic_langid_impl/src/lib.rs]

1	mod errors;
2	mod layout_table;
3	#[cfg(feature = "likelysubtags")]
4	pub mod likelysubtags;
5	#[doc(hidden)]
6	pub mod parser;
7	#[cfg(feature = "serde")]
8	mod serde;
9	pub mod subtags;
10
11	pub use crate::errors::LanguageIdentifierError;
12	use std::fmt::Write;
13	use std::iter::Peekable;
14	use std::str::FromStr;
15
/// Enum representing available character direction orientations.
///
/// This is the value returned by `LanguageIdentifier::character_direction`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum CharacterDirection {
    /// Right To Left
    ///
    /// Used in languages such as Arabic, Hebrew, Fula, Kurdish etc.
    RTL,
    /// Left To Right
    ///
    /// Used in languages such as French, Spanish, English, German etc.
    LTR,
    /// Top To Bottom
    ///
    /// Used in Traditional Mongolian
    TTB,
}
32
/// The decomposed form of a `LanguageIdentifier` as returned by
/// `LanguageIdentifier::into_parts`: language, optional script,
/// optional region and the list of variants, in that order.
type PartsTuple = (
    subtags::Language,
    Option<subtags::Script>,
    Option<subtags::Region>,
    Vec<subtags::Variant>,
);
39
/// `LanguageIdentifier` is a core struct representing a Unicode Language Identifier.
///
/// # Examples
///
/// ```
/// use unic_langid_impl::LanguageIdentifier;
///
/// let li: LanguageIdentifier = "en-US".parse()
///     .expect("Failed to parse.");
///
/// assert_eq!(li.language, "en");
/// assert_eq!(li.script, None);
/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
/// assert_eq!(li.variants().len(), 0);
/// ```
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for any language identifier:
///
///  * *well-formed* - syntactically correct
///  * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
///  * *canonical* - valid and no deprecated codes or structure.
///
/// At the moment parsing normalizes a well-formed language identifier converting
/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
///
/// Any bogus subtags will cause the parsing to fail with an error.
/// No subtag validation is performed.
///
/// # Examples:
///
/// ```
/// use unic_langid_impl::LanguageIdentifier;
///
/// let li: LanguageIdentifier = "eN_latn_Us-Valencia".parse()
///     .expect("Failed to parse.");
///
/// assert_eq!(li.language, "en");
/// assert_eq!(li.script.as_ref().map(Into::into), Some("Latn"));
/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
/// assert_eq!(li.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
/// ```
#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
pub struct LanguageIdentifier {
    /// The language subtag.
    pub language: subtags::Language,
    /// The optional script subtag.
    pub script: Option<subtags::Script>,
    /// The optional region subtag.
    pub region: Option<subtags::Region>,
    // Variant subtags. `from_parts` and `set_variants` store them sorted and
    // deduplicated, and store `None` when there are no variants;
    // `from_raw_parts_unchecked` trusts the caller to uphold the same shape.
    variants: Option<Box<[subtags::Variant]>>,
}
90
91	impl LanguageIdentifier {
92	/// A constructor which takes a utf8 slice, parses it and
93	/// produces a well-formed `LanguageIdentifier`.
94	///
95	/// # Examples
96	///
97	/// ```
98	/// use unic_langid_impl::LanguageIdentifier;
99	///
100	/// let li = LanguageIdentifier::from_bytes("en-US".as_bytes())
101	/// .expect("Parsing failed.");
102	///
103	/// assert_eq!(li.to_string(), "en-US");
104	/// ```
105	pub fn from_bytes(v: &[u8]) -> Result<Self, LanguageIdentifierError> {
106	Ok(parser::parse_language_identifier(v)?)
107	}
108
109	/// A constructor which takes optional subtags as `AsRef<[u8]>`, parses them and
110	/// produces a well-formed `LanguageIdentifier`.
111	///
112	/// # Examples
113	///
114	/// ```
115	/// use unic_langid_impl::LanguageIdentifier;
116	///
117	/// let li = LanguageIdentifier::from_parts(
118	/// "fr".parse().expect("Parsing failed."),
119	/// None,
120	/// Some("CA".parse().expect("Parsing failed.")),
121	/// &[]
122	/// );
123	///
124	/// assert_eq!(li.to_string(), "fr-CA");
125	/// ```
126	pub fn from_parts(
127	language: subtags::Language,
128	script: Option<subtags::Script>,
129	region: Option<subtags::Region>,
130	variants: &[subtags::Variant],
131	) -> Self {
132	let variants = if !variants.is_empty() {
133	let mut v = variants.to_vec();
134	v.sort_unstable();
135	v.dedup();
136	Some(v.into_boxed_slice())
137	} else {
138	None
139	};
140
141	Self {
142	language,
143	script,
144	region,
145	variants,
146	}
147	}
148
149	/// # Unchecked
150	///
151	/// This function accepts subtags expecting variants
152	/// to be deduplicated and ordered.
153	pub const fn from_raw_parts_unchecked(
154	language: subtags::Language,
155	script: Option<subtags::Script>,
156	region: Option<subtags::Region>,
157	variants: Option<Box<[subtags::Variant]>>,
158	) -> Self {
159	Self {
160	language,
161	script,
162	region,
163	variants,
164	}
165	}
166
167	#[doc(hidden)]
168	/// This method is used by `unic-locale` to handle partial
169	/// subtag iterator.
170	///
171	/// Not stable.
172	pub fn try_from_iter<'a>(
173	iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>,
174	allow_extension: bool,
175	) -> Result<LanguageIdentifier, LanguageIdentifierError> {
176	Ok(parser::parse_language_identifier_from_iter(
177	iter,
178	allow_extension,
179	)?)
180	}
181
182	/// Consumes `LanguageIdentifier` and produces raw internal representations
183	/// of all subtags in form of `u64`/`u32`.
184	///
185	/// Primarily used for storing internal representation and restoring via
186	/// `from_raw_parts_unchecked`.
187	///
188	/// # Examples
189	///
190	/// ```
191	/// use unic_langid_impl::LanguageIdentifier;
192	/// use tinystr::{TinyStr8, TinyStr4};
193	///
194	/// let li: LanguageIdentifier = "en-US".parse()
195	/// .expect("Parsing failed.");
196	///
197	/// let (lang, script, region, variants) = li.into_parts();
198	///
199	/// // let li2 = LanguageIdentifier::from_raw_parts_unchecked(
200	/// // lang.map(\|l\| unsafe { TinyStr8::new_unchecked(l) }),
201	/// // script.map(\|s\| unsafe { TinyStr4::new_unchecked(s) }),
202	/// // region.map(\|r\| unsafe { TinyStr4::new_unchecked(r) }),
203	/// // variants.map(\|v\| v.into_iter().map(\|v\| unsafe { TinyStr8::new_unchecked(v) }).collect()),*
204	/// //);
205	///
206	/// //assert_eq!(li2.to_string(), "en-US");
207	/// ```
208	pub fn into_parts(self) -> PartsTuple {
209	(
210	self.language,
211	self.script,
212	self.region,
213	self.variants.map_or_else(Vec::new, \|v\| v.to_vec()),
214	)
215	}
216
217	/// Compares a `LanguageIdentifier` to another `AsRef<LanguageIdentifier`
218	/// allowing for either side to use the missing fields as wildcards.
219	///
220	/// This allows for matching between `en` (treated as `en---`) and `en-US`.*
221	///
222	/// # Examples
223	///
224	/// ```
225	/// use unic_langid_impl::LanguageIdentifier;
226	///
227	/// let li1: LanguageIdentifier = "en".parse()
228	/// .expect("Parsing failed.");
229	///
230	/// let li2: LanguageIdentifier = "en-US".parse()
231	/// .expect("Parsing failed.");
232	///
233	/// assert_ne!(li1, li2); // "en" != "en-US"
234	/// assert_ne!(li1.to_string(), li2.to_string()); // "en" != "en-US"
235	///
236	/// assert_eq!(li1.matches(&li2, `false`, `false`), `false`); // "en" != "en-US"
237	/// assert_eq!(li1.matches(&li2, `true`, `false`), `true`); // "en---" == "en-US"*
238	/// assert_eq!(li1.matches(&li2, `false`, `true`), `false`); // "en" != "en--US-"
239	/// assert_eq!(li1.matches(&li2, `true`, `true`), `true`); // "en---" == "en--US-"*
240	/// ```
241	pub fn matches<O: AsRef<Self>>(
242	&self,
243	other: &O,
244	self_as_range: bool,
245	other_as_range: bool,
246	) -> bool {
247	let other = other.as_ref();
248	self.language
249	.matches(other.language, self_as_range, other_as_range)
250	&& subtag_matches(&self.script, &other.script, self_as_range, other_as_range)
251	&& subtag_matches(&self.region, &other.region, self_as_range, other_as_range)
252	&& subtags_match(
253	&self.variants,
254	&other.variants,
255	self_as_range,
256	other_as_range,
257	)
258	}
259
260	/// Returns a vector of variants subtags of the `LanguageIdentifier`.
261	///
262	/// # Examples
263	///
264	/// ```
265	/// use unic_langid_impl::LanguageIdentifier;
266	///
267	/// let li1: LanguageIdentifier = "ca-ES-valencia".parse()
268	/// .expect("Parsing failed.");
269	///
270	/// assert_eq!(li1.variants().map(\|v\| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
271	///
272	/// let li2: LanguageIdentifier = "de".parse()
273	/// .expect("Parsing failed.");
274	///
275	/// assert_eq!(li2.variants().len(), `0`);
276	/// ```
277	pub fn variants(&self) -> impl ExactSizeIterator<Item = &subtags::Variant> {
278	let variants: &[_] = match self.variants {
279	Some(ref v) => v,
280	None => &[],
281	};
282
283	variants.iter()
284	}
285
286	/// Sets variant subtags of the `LanguageIdentifier`.
287	///
288	/// # Examples
289	///
290	/// ```
291	/// use unic_langid_impl::LanguageIdentifier;
292	///
293	/// let mut li: LanguageIdentifier = "ca-ES".parse()
294	/// .expect("Parsing failed.");
295	///
296	/// li.set_variants(&["valencia".parse().expect("Parsing failed.")]);
297	///
298	/// assert_eq!(li.to_string(), "ca-ES-valencia");
299	/// ```
300	pub fn set_variants(&mut self, variants: &[subtags::Variant]) {
301	let mut v = variants.to_vec();
302
303	if v.is_empty() {
304	self.variants = None;
305	} else {
306	v.sort_unstable();
307	v.dedup();
308	self.variants = Some(v.into_boxed_slice());
309	}
310	}
311
312	/// Tests if a variant subtag is present in the `LanguageIdentifier`.
313	///
314	/// # Examples
315	///
316	/// ```
317	/// use unic_langid_impl::LanguageIdentifier;
318	///
319	/// let mut li: LanguageIdentifier = "ca-ES-macos".parse()
320	/// .expect("Parsing failed.");
321	///
322	/// assert_eq!(li.has_variant("valencia".parse().unwrap()), `false`);
323	/// assert_eq!(li.has_variant("macos".parse().unwrap()), `true`);
324	/// ```
325	pub fn has_variant(&self, variant: subtags::Variant) -> bool {
326	if let Some(variants) = &self.variants {
327	variants.contains(&variant)
328	} else {
329	`false`
330	}
331	}
332
333	/// Clears variant subtags of the `LanguageIdentifier`.
334	///
335	/// # Examples
336	///
337	/// ```
338	/// use unic_langid_impl::LanguageIdentifier;
339	///
340	/// let mut li: LanguageIdentifier = "ca-ES-valencia".parse()
341	/// .expect("Parsing failed.");
342	///
343	/// li.clear_variants();
344	///
345	/// assert_eq!(li.to_string(), "ca-ES");
346	/// ```
347	pub fn clear_variants(&mut self) {
348	self.variants = None;
349	}
350
351	/// Extends the `LanguageIdentifier` adding likely subtags based
352	/// on tables provided by CLDR.
353	///
354	/// # Examples
355	///
356	/// ```
357	/// use unic_langid_impl::LanguageIdentifier;
358	///
359	/// let mut li: LanguageIdentifier = "en-US".parse()
360	/// .expect("Parsing failed.");
361	///
362	/// assert_eq!(li.maximize(), true);
363	/// assert_eq!(li.to_string(), "en-Latn-US");
364	/// ```
365	#[cfg(feature = "likelysubtags")]
366	pub fn maximize(&mut self) -> bool {
367	if let Some(new_li) = likelysubtags::maximize(self.language, self.script, self.region) {
368	self.language = new_li.0;
369	self.script = new_li.1;
370	self.region = new_li.2;
371	`true`
372	} else {
373	`false`
374	}
375	}
376
377	/// Extends the `LanguageIdentifier` removing likely subtags based
378	/// on tables provided by CLDR.
379	///
380	/// # Examples
381	///
382	/// ```
383	/// use unic_langid_impl::LanguageIdentifier;
384	///
385	/// let mut li: LanguageIdentifier = "en-Latn-US".parse()
386	/// .expect("Parsing failed.");
387	///
388	/// assert_eq!(li.minimize(), true);
389	/// assert_eq!(li.to_string(), "en");
390	/// ```
391	#[cfg(feature = "likelysubtags")]
392	pub fn minimize(&mut self) -> bool {
393	if let Some(new_li) = likelysubtags::minimize(self.language, self.script, self.region) {
394	self.language = new_li.0;
395	self.script = new_li.1;
396	self.region = new_li.2;
397	`true`
398	} else {
399	`false`
400	}
401	}
402
403	/// Returns character direction of the `LanguageIdentifier`.
404	///
405	/// # Examples
406	///
407	/// ```
408	/// use unic_langid_impl::{LanguageIdentifier, CharacterDirection};
409	///
410	/// let li1: LanguageIdentifier = "es-AR".parse()
411	/// .expect("Parsing failed.");
412	/// let li2: LanguageIdentifier = "fa".parse()
413	/// .expect("Parsing failed.");
414	///
415	/// assert_eq!(li1.character_direction(), CharacterDirection::LTR);
416	/// assert_eq!(li2.character_direction(), CharacterDirection::RTL);
417	/// ```
418	pub fn character_direction(&self) -> CharacterDirection {
419	match (self.language.into(), self.script) {
420	(_, Some(script))
421	if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) =>
422	{
423	CharacterDirection::LTR
424	}
425	(_, Some(script))
426	if layout_table::SCRIPTS_CHARACTER_DIRECTION_RTL.contains(&script.into()) =>
427	{
428	CharacterDirection::RTL
429	}
430	(_, Some(script))
431	if layout_table::SCRIPTS_CHARACTER_DIRECTION_TTB.contains(&script.into()) =>
432	{
433	CharacterDirection::TTB
434	}
435	(Some(lang), _) if layout_table::LANGS_CHARACTER_DIRECTION_RTL.contains(&lang) => {
436	#[cfg(feature = "likelysubtags")]
437	if let Some((_, Some(script), _)) =
438	likelysubtags::maximize(self.language, None, self.region)
439	{
440	if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) {
441	return CharacterDirection::LTR;
442	}
443	}
444	CharacterDirection::RTL
445	}
446	_ => CharacterDirection::LTR,
447	}
448	}
449	}
450
451	impl FromStr for LanguageIdentifier {
452	type Err = LanguageIdentifierError;
453
454	fn from_str(source: &str) -> Result<Self, Self::Err> {
455	Self::from_bytes(source.as_bytes())
456	}
457	}
458
459	impl AsRef<LanguageIdentifier> for LanguageIdentifier {
460	#[inline(always)]
461	fn as_ref(&self) -> &LanguageIdentifier {
462	self
463	}
464	}
465
466	impl std::fmt::Display for LanguageIdentifier {
467	fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
468	self.language.fmt(f)?;
469	if let Some(ref script: &Script) = self.script {
470	f.write_char('-')?;
471	script.fmt(f)?;
472	}
473	if let Some(ref region: &Region) = self.region {
474	f.write_char('-')?;
475	region.fmt(f)?;
476	}
477	if let Some(variants: &Box<[Variant]>) = &self.variants {
478	for variant: &Variant in variants.iter() {
479	f.write_char('-')?;
480	variant.fmt(f)?;
481	}
482	}
483	Ok(())
484	}
485	}
486
487	impl PartialEq<&str> for LanguageIdentifier {
488	fn eq(&self, other: &&str) -> bool {
489	self.to_string().as_str() == *other
490	}
491	}
492
/// Compares two optional subtags, optionally treating a missing subtag
/// as a wildcard.
///
/// Returns `true` when `subtag1` is `None` and `as_range1` is set, when
/// `subtag2` is `None` and `as_range2` is set, or when both sides are equal.
fn subtag_matches<P: PartialEq>(
    subtag1: &Option<P>,
    subtag2: &Option<P>,
    as_range1: bool,
    as_range2: bool,
) -> bool {
    let wildcard1 = as_range1 && subtag1.is_none();
    let wildcard2 = as_range2 && subtag2.is_none();

    wildcard1 || wildcard2 || subtag1 == subtag2
}
501
/// Returns `true` when the optional list is either `None` or `Some` of an
/// empty slice.
fn is_option_empty<P: PartialEq>(subtag: &Option<Box<[P]>>) -> bool {
    subtag.as_ref().map_or(true, |t| t.is_empty())
}
505
506	fn subtags_match<P: PartialEq>(
507	subtag1: &Option<Box<[P]>>,
508	subtag2: &Option<Box<[P]>>,
509	as_range1: bool,
510	as_range2: bool,
511	) -> bool {
512	// or is some and is empty!
513	(as_range1 && is_option_empty(subtag:subtag1))
514	\|\| (as_range2 && is_option_empty(subtag:subtag2))
515	\|\| subtag1 == subtag2
516	}
517
518	/// This is a best-effort operation that performs all available levels of canonicalization.
519	///
520	/// At the moment the operation will normalize casing and the separator, but in the future
521	/// it may also validate and update from deprecated subtags to canonical ones.
522	///
523	/// # Examples
524	///
525	/// ```
526	/// use unic_langid_impl::canonicalize;
527	///
528	/// assert_eq!(canonicalize("pL_latn_pl"), Ok("pl-Latn-PL".to_string()));
529	/// ```
530	pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, LanguageIdentifierError> {
531	let lang_id: LanguageIdentifier = LanguageIdentifier::from_bytes(input.as_ref())?;
532	Ok(lang_id.to_string())
533	}
534
/// A subtag made of non-ASCII characters must be rejected by the parser.
#[test]
fn invalid_subtag() {
    let parsed = LanguageIdentifier::from_bytes("en-ÁÁÁÁ".as_bytes());
    assert!(parsed.is_err());
}
539