1//! Global locale instances and system inspection.
2//!
3//! This is an auxiliary crate for i18n solutions that:
4//!
5//! - Holds the appropriate default instances of locale.
6//! - Inspects the system for the initial values.
7//!
8//! You don't want to use it directly, but instead use an internationalisation crate like [locale].
9//!
10//! This crate is separate and intentionally minimal so that multiple i18n crates or multiple
11//! versions of one that get into the application still share the current locale setting.
12//!
13//! [locale]: https://crates.io/crates/locale
14
15#[macro_use]
16extern crate lazy_static;
17
18extern crate regex;
19
20#[cfg(target_os = "macos")]
21#[macro_use]
22extern crate objc;
23
24use regex::Regex;
25use std::borrow::{Borrow,Cow};
26use std::cell::RefCell;
27use std::convert::AsRef;
28use std::fmt;
29use std::sync::Mutex;
30
31// ------------------------------ LANGUAGE RANGE ---------------------------------
32
33/// Language and culture identifier.
34///
35/// This object holds a [RFC4647] extended language range.
36///
37/// The internal data may be owned or shared from object with lifetime `'a`. The lifetime can be
38/// extended using the `into_static()` method, which internally clones the data as needed.
39///
40/// # Syntax
41///
42/// The range is composed of `-`-separated alphanumeric subtags, possibly replaced by `*`s. It
43/// might be empty.
44///
45/// In agreement with [RFC4647], this object only requires that the tag matches:
46///
47/// ```ebnf
48/// language_tag = (alpha{1,8} | "*")
49/// ("-" (alphanum{1,8} | "*"))*
50/// ```
51///
/// The exact interpretation is up to the downstream localization provider, but it is expected
/// that it will be matched against a normalized [RFC5646] language tag, which has the structure:
54///
55/// ```ebnf
56/// language_tag = language
57/// ("-" script)?
58/// ("-" region)?
59/// ("-" variant)*
60/// ("-" extension)*
61/// ("-" private)?
62///
63/// language = alpha{2,3} ("-" alpha{3}){0,3}
64///
/// script = alpha{4}
66///
67/// region = alpha{2}
68/// | digit{3}
69///
70/// variant = alphanum{5,8}
71/// | digit alphanum{3}
72///
73/// extension = [0-9a-wyz] ("-" alphanum{2,8})+
74///
75/// private = "x" ("-" alphanum{1,8})+
76/// ```
77///
78/// * `language` is an [ISO639] 2-letter or, where not defined, 3-letter code. A code for
79/// macro-language might be followed by code of specific dialect.
80/// * `script` is an [ISO15924] 4-letter code.
81/// * `region` is either an [ISO3166] 2-letter code or, for areas other than countries, [UN M.49]
82/// 3-digit numeric code.
83/// * `variant` is a string indicating variant of the language.
84/// * `extension` and `private` define additional options. The private part has same structure as
85/// the Unicode [`-u-` extension][u_ext]. Available options are documented for the facets that
86/// use them.
87///
88/// The values obtained by inspecting the system are normalized according to those rules.
89///
90/// The content will be case-normalized as recommended in [RFC5646] §2.1.1, namely:
91///
92/// * `language` is written in lowercase,
93/// * `script` is written with first capital,
/// * `region` is written in uppercase and
95/// * all other subtags are written in lowercase.
96///
97/// When detecting system configuration, additional options that may be generated under the
98/// [`-u-` extension][u_ext] currently are:
99///
100/// * `cf` — Currency format (`account` for parenthesized negative values, `standard` for minus
101/// sign).
102/// * `fw` — First day of week (`mon` to `sun`).
103/// * `hc` — Hour cycle (`h12` for 1–12, `h23` for 0–23).
104/// * `ms` — Measurement system (`metric` or `ussystem`).
105/// * `nu` — Numbering system—only decimal systems are currently used.
106/// * `va` — Variant when locale is specified in Unix format and the tag after `@` does not
107/// correspond to any variant defined in [Language subtag registry].
108///
109/// And under the `-x-` extension, following options are defined:
110///
111/// * `df` — Date format:
112///
113/// * `iso`: Short date should be in ISO format of `yyyy-MM-dd`.
114///
115/// For example `-df-iso`.
116///
117/// * `dm` — Decimal separator for monetary:
118///
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-dm-002c` means to
/// use comma.
121///
122/// * `ds` — Decimal separator for numbers:
123///
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-ds-002c` means to
/// use comma.
126///
127/// * `gm` — Group (thousand) separator for monetary:
128///
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-gm-00a0` means to
/// use non-breaking space.
131///
132/// * `gs` — Group (thousand) separator for numbers:
133///
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-gs-00a0` means to
/// use non-breaking space.
136///
137/// * `ls` — List separator:
138///
/// Followed by one or more Unicode codepoints in hexadecimal. For example, `-ls-003b` means to
/// use a semicolon.
141///
142/// [RFC5646]: https://www.rfc-editor.org/rfc/rfc5646.txt
143/// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt
144/// [ISO639]: https://en.wikipedia.org/wiki/ISO_639
145/// [ISO15924]: https://en.wikipedia.org/wiki/ISO_15924
146/// [ISO3166]: https://en.wikipedia.org/wiki/ISO_3166
147/// [UN M.49]: https://en.wikipedia.org/wiki/UN_M.49
148/// [u_ext]: http://www.unicode.org/reports/tr35/#u_Extension
149/// [Language subtag registry]: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
#[derive(Clone,Debug,Eq,Hash,PartialEq)]
pub struct LanguageRange<'a> {
    // Text of the range. It is borrowed from the caller when the input already was in
    // normalized form and owned otherwise; the empty string stands for the invariant locale.
    language: Cow<'a, str>
}
154
lazy_static! {
    // All patterns use the `x` flag, so whitespace inside them is insignificant.

    // Range whose leading subtags follow the RFC5646 layout: language (or `*`), optional
    // script, optional region, and then arbitrary further subtags. The `script`, `region` and
    // `rest` groups capture their leading `-`, which the `canon_*` helpers rely on.
    static ref REGULAR_LANGUAGE_RANGE_REGEX: Regex = Regex::new(r"(?x) ^
        (?P<language> (?:
            [[:alpha:]]{2,3} (?: - [[:alpha:]]{3} ){0,3}
            | \* ))
        (?P<script> - (?: [[:alpha:]]{4} | \* ))?
        (?P<region> - (?: [[:alpha:]]{2} | [[:digit:]]{3} | \* ))?
        (?P<rest> (?: - (?: [[:alnum:]]{1,8} | \* ))*)
        $ ").unwrap();
    // Loose RFC4647 extended language range: `-`-separated subtags of 1 to 8 characters, each
    // of which may be replaced by `*`; the first subtag must be alphabetic.
    static ref LANGUAGE_RANGE_REGEX: Regex = Regex::new(r"(?x) ^
        (?: [[:alpha:]]{1,8} | \* )
        (?: - (?: [[:alnum:]]{1,8} | \* ))*
        $ ").unwrap();
    // Unix names of the invariant locale, `C` or `POSIX` (case-insensitive), optionally followed
    // by `.encoding`.
    static ref UNIX_INVARIANT_REGEX: Regex = Regex::new(r"(?ix) ^
        (?: c | posix )
        (?: \. (?: [0-9a-zA-Z-]{1,20} ))?
        $ ").unwrap();
    // Unix locale tag: language [ `_` region ] [ `.` encoding ] [ `@` variant ].
    // NOTE(review): the `i` flag makes the regex crate apply Unicode case folding by default, so
    // a few non-ASCII letters (e.g. U+017F, U+212A) may be accepted by the ASCII classes here —
    // confirm against the regex documentation.
    static ref UNIX_TAG_REGEX: Regex = Regex::new(r"(?ix) ^
        (?P<language> [[:alpha:]]{2,3} )
        (?: _ (?P<region> [[:alpha:]]{2} | [[:digit:]]{3} ))?
        (?: \. (?P<encoding> [0-9a-zA-Z-]{1,20} ))?
        (?: @ (?P<variant> [[:alnum:]]{1,20} ))?
        $ ").unwrap();
}
179
/// Tell whether the `Cow` holds owned data (`true`) or merely borrows it (`false`).
fn is_owned<'a, T: ToOwned + ?Sized>(c: &Cow<'a, T>) -> bool {
    matches!(*c, Cow::Owned(_))
}
186
/// Lowercase an optional run of subtags, copying only when something has to change.
///
/// `None` yields an empty borrowed string. The uppercase check is ASCII-only on purpose, so
/// that it agrees with the ASCII-only mapping applied by `to_ascii_lowercase`: text the mapping
/// would leave untouched is returned borrowed.
fn canon_lower<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    match o {
        None => Cow::Borrowed(""),
        Some(s) if s.bytes().any(|b| b.is_ascii_uppercase()) => {
            Cow::Owned(s.to_ascii_lowercase())
        }
        Some(s) => Cow::Borrowed(s),
    }
}
198
/// Title-case an optional script subtag (e.g. `-hans` becomes `-Hans`), copying only when
/// something has to change.
///
/// `None` yields an empty borrowed string. A present subtag must include its leading `-` and
/// at least one more byte.
///
/// The subtag counts as canonical when its first character is not an ASCII lowercase letter
/// and no later character is an ASCII uppercase letter; this way the wildcard `-*` is returned
/// borrowed instead of being needlessly copied.
///
/// # Panics
///
/// Panics if the subtag is shorter than two bytes or does not start with `-`.
fn canon_script<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    let s = match o {
        None => return Cow::Borrowed(""),
        Some(s) => s,
    };
    assert!(s.len() >= 2 && s.starts_with('-'),
            "script subtag must be passed with its leading `-`");
    let bytes = s.as_bytes();
    let canonical = !bytes[1].is_ascii_lowercase()
        && !bytes[2..].iter().any(|b| b.is_ascii_uppercase());
    if canonical {
        Cow::Borrowed(s)
    } else {
        let mut fixed = s.to_ascii_lowercase();
        // `get_mut` instead of indexing: never panics on a multi-byte first character.
        if let Some(initial) = fixed.get_mut(1..2) {
            initial.make_ascii_uppercase();
        }
        Cow::Owned(fixed)
    }
}
214
/// Uppercase an optional region subtag, copying only when something has to change.
///
/// `None` yields an empty borrowed string. A present subtag must include its leading `-` and
/// at least one more byte. The lowercase check is ASCII-only so that it agrees with the
/// ASCII-only mapping applied by `to_ascii_uppercase`.
///
/// # Panics
///
/// Panics if the subtag is shorter than two bytes or does not start with `-`.
fn canon_upper<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    match o {
        None => Cow::Borrowed(""),
        Some(s) => {
            assert!(s.len() > 1 && s.starts_with('-'),
                    "region subtag must be passed with its leading `-`");
            if s.bytes().any(|b| b.is_ascii_lowercase()) {
                Cow::Owned(s.to_ascii_uppercase())
            } else {
                Cow::Borrowed(s)
            }
        }
    }
}
227
228impl<'a> LanguageRange<'a> {
229 /// Construct LanguageRange from string, with normalization.
230 ///
231 /// LanguageRange must follow the [RFC4647] syntax.
232 /// It will be case-normalized as recommended in [RFC5646] §2.1.1, namely:
233 ///
234 /// * `language`, if recognized, is written in lowercase,
235 /// * `script`, if recognized, is written with first capital,
236 /// * `country`, if recognized, is written in uppercase and
237 /// * all other subtags are written in lowercase.
238 ///
239 /// [RFC5646]: https://www.rfc-editor.org/rfc/rfc5646.txt
240 /// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt
241 pub fn new(lt: &'a str) -> Result<LanguageRange> {
242 if lt == "" {
243 return Ok(LanguageRange {
244 language: Cow::Borrowed(lt),
245 });
246 } else if let Some(caps) = REGULAR_LANGUAGE_RANGE_REGEX.captures(lt) {
247 let language = canon_lower(caps.name("language").map(|m| m.as_str()));
248 let script = canon_script(caps.name("script").map(|m| m.as_str()));
249 let region = canon_upper(caps.name("region").map(|m| m.as_str()));
250 let rest = canon_lower(caps.name("rest").map(|m| m.as_str()));
251 if is_owned(&language) ||
252 is_owned(&script) ||
253 is_owned(&region) ||
254 is_owned(&rest)
255 {
256 return Ok(LanguageRange {
257 language: Cow::Owned(
258 language.into_owned() +
259 script.borrow() +
260 region.borrow() +
261 rest.borrow()),
262 });
263 } else {
264 return Ok(LanguageRange {
265 language: Cow::Borrowed(lt),
266 });
267 }
268 } else if LANGUAGE_RANGE_REGEX.is_match(lt) {
269 return Ok(LanguageRange {
270 language: canon_lower(Some(lt)),
271 });
272 } else {
273 return Err(Error::NotWellFormed);
274 }
275 }
276
277 /// Return LanguageRange for the invariant locale.
278 ///
279 /// Invariant language is identified simply by empty string.
280 pub fn invariant() -> LanguageRange<'static> {
281 LanguageRange { language: Cow::Borrowed("") }
282 }
283
284 /// Clone the internal data to extend lifetime.
285 pub fn into_static(self) -> LanguageRange<'static> {
286 LanguageRange {
287 language: Cow::Owned(self.language.into_owned())
288 }
289 }
290
291 /// Create new instance sharing the internal data.
292 pub fn to_shared(&'a self) -> Self {
293 LanguageRange {
294 language: Cow::Borrowed(self.language.borrow())
295 }
296 }
297
298 /// Create language tag from Unix/Linux/GNU locale tag.
299 ///
300 /// Unix locale tags have the form
301 ///
302 /// > *language* [ `_` *region* ] [ `.` *encoding* ] [ `@` *variant* ]
303 ///
304 /// The *language* and *region* have the same format as RFC5646. *Encoding* is not relevant
305 /// here, since Rust always uses Utf-8. That leaves *variant*, which is unfortunately rather
306 /// free-form. So this function will translate known variants to corresponding RFC5646 subtags
307 /// and represent anything else with Unicode POSIX variant (`-u-va-`) extension.
308 ///
309 /// Note: This function is public here for benefit of applications that may come across this
310 /// kind of tags from other sources than system configuration.
311 pub fn from_unix(s: &str) -> Result<LanguageRange<'static>> {
312 if let Some(caps) = UNIX_TAG_REGEX.captures(s) {
313 let src_variant = caps.name("variant").map(|m| m.as_str()).unwrap_or("").to_ascii_lowercase();
314 let mut res = caps.name("language").map(|m| m.as_str()).unwrap().to_ascii_lowercase();
315 let region = caps.name("region").map(|m| m.as_str()).unwrap_or("");
316 let mut script = "";
317 let mut variant = "";
318 let mut uvariant = "";
319 match src_variant.as_ref() {
320 // Variants seen in the wild in GNU LibC (via http://lh.2xlibre.net/) or in Debian
321 // GNU/Linux Stretch system. Treatment of things not found in RFC5646 subtag registry
322 // (http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry)
323 // or CLDR according to notes at https://wiki.openoffice.org/wiki/LocaleMapping.
324 // Dialects:
325 // aa_ER@saaho - NOTE: Can't be found under that name in RFC5646 subtag registry,
326 // but there is language Saho with code ssy, which is likely that thing.
327 "saaho" if res == "aa" => res = String::from("ssy"),
328 // Scripts:
329 // @arabic
330 "arabic" => script = "Arab",
331 // @cyrillic
332 "cyrl" => script = "Cyrl",
333 "cyrillic" => script = "Cyrl",
334 // @devanagari
335 "devanagari" => script = "Deva",
336 // @hebrew
337 "hebrew" => script = "Hebr",
338 // tt@iqtelif
339 // Neither RFC5646 subtag registry nor CLDR knows anything about this, but as best
340 // as I can tell it is Tatar name for Latin (default is Cyrillic).
341 "iqtelif" => script = "Latn",
342 // @Latn
343 "latn" => script = "Latn",
344 // @latin
345 "latin" => script = "Latn",
346 // en@shaw
347 "shaw" => script = "Shaw",
348 // Variants:
349 // sr@ijekavianlatin
350 "ijekavianlatin" => {
351 script = "Latn";
352 variant = "ijekavsk";
353 },
354 // sr@ije
355 "ije" => variant = "ijekavsk",
356 // sr@ijekavian
357 "ijekavian" => variant = "ijekavsk",
358 // ca@valencia
359 "valencia" => variant = "valencia",
360 // Currencies:
361 // @euro - NOTE: We follow suite of Java and Openoffice and ignore it, because it
362 // is default for all locales where it sometimes appears now, and because we use
363 // explicit currency in monetary formatting anyway.
364 "euro" => {},
365 // Collation:
366 // gez@abegede - NOTE: This is collation, but CLDR does not have any code for it,
367 // so we for the moment leave it fall through as -u-va- instead of -u-co-.
368 // Anything else:
369 // en@boldquot, en@quot, en@piglatin - just randomish stuff
370 // @cjknarrow - beware, it's gonna end up as -u-va-cjknarro due to lenght limit
371 s if s.len() <= 8 => uvariant = &*s,
372 s => uvariant = &s[0..8], // the subtags are limited to 8 chars, but some are longer
373 };
374 if script != "" {
375 res.push('-');
376 res.push_str(script);
377 }
378 if region != "" {
379 res.push('-');
380 res.push_str(&*region.to_ascii_uppercase());
381 }
382 if variant != "" {
383 res.push('-');
384 res.push_str(variant);
385 }
386 if uvariant != "" {
387 res.push_str("-u-va-");
388 res.push_str(uvariant);
389 }
390 return Ok(LanguageRange {
391 language: Cow::Owned(res)
392 });
393 } else if UNIX_INVARIANT_REGEX.is_match(s) {
394 return Ok(LanguageRange::invariant())
395 } else {
396 return Err(Error::NotWellFormed);
397 }
398 }
399}
400
401impl<'a> AsRef<str> for LanguageRange<'a> {
402 fn as_ref(&self) -> &str {
403 self.language.as_ref()
404 }
405}
406
407impl<'a> fmt::Display for LanguageRange<'a> {
408 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
409 self.language.fmt(f)
410 }
411}
412
413// -------------------------------- LOCALE -------------------------------------
414
415/// Locale configuration.
416///
417/// Users may accept several languages in some order of preference and may want to use rules from
418/// different culture for some particular aspect of the program behaviour, and operating systems
419/// allow them to specify this (to various extent).
420///
421/// The `Locale` objects represent the user configuration. They contain:
422///
423/// - The primary `LanguageRange`.
424/// - Optional category-specific overrides.
425/// - Optional fallbacks in case data (usually translations) for the primary language are not
426/// available.
427///
428/// The set of categories is open-ended. The `locale` crate uses five well-known categories
429/// `messages`, `numeric`, `time`, `collate` and `monetary`, but some systems define additional
430/// ones (GNU Linux has additionally `paper`, `name`, `address`, `telephone` and `measurement`) and
431/// these are provided in the user default `Locale` and other libraries can use them.
432///
433/// `Locale` is represented by a `,`-separated sequence of tags in `LanguageRange` syntax, where
434/// all except the first one may be preceded by category name and `=` sign.
435///
436/// The first tag indicates the default locale, the tags prefixed by category names indicate
437/// _overrides_ for those categories and the remaining tags indicate fallbacks.
438///
439/// Note that a syntactically valid value of HTTP `Accept-Language` header is a valid `Locale`. Not
440/// the other way around though due to the presence of category selectors.
441// TODO: Interning
#[derive(Clone,Debug,Eq,Hash,PartialEq)]
pub struct Locale {
    // TODO: Intern the string for performance reasons
    // XXX: Store pre-split to LanguageTags?
    // Comma-separated list: the primary tag first, followed by fallback tags and
    // `category=tag` overrides in the order in which they were added.
    inner: String,
}
448
lazy_static! {
    // One element of the `,`-separated locale list after the first one: an optional
    // `category=` prefix followed by the tag. The tag is only checked loosely here (characters
    // allowed in a range); `Locale::new` validates it properly with `LanguageRange::new`.
    static ref LOCALE_ELEMENT_REGEX: Regex = Regex::new(r"(?ix) ^
        (?: (?P<category> [[:alpha:]]{1,20} ) = )?
        (?P<tag> (?: [[:alnum:]] | - | \* )+ )
        $ ").unwrap();
}
455
456impl Locale {
457 /// Obtain the user default locale.
458 ///
459 /// This is the locale indicated by operating environment.
460 pub fn user_default() -> Locale {
461 USER_LOCALE.clone()
462 }
463
464 /// Obtain the global default locale.
465 ///
466 /// The global default for `current()` locale. Defaults to `user_default()`.
467 pub fn global_default() -> Locale {
468 GLOBAL_LOCALE.lock().unwrap().clone()
469 }
470
471 /// Change the global default locale.
472 ///
473 /// Setting this overrides the default for new threads and threads that didn't do any
474 /// locale-aware operation yet.
475 pub fn set_global_default(lb: Locale) {
476 *GLOBAL_LOCALE.lock().unwrap() = lb;
477 }
478
479 /// Obtain the current locale of current thread.
480 ///
481 /// Defaults to `global_default()` on first use in each thread.
482 pub fn current() -> Locale {
483 CURRENT_LOCALE.with(|l| l.borrow().clone())
484 }
485
486 /// Change the current locale of current thread.
487 pub fn set_current(lb: Locale) {
488 CURRENT_LOCALE.with(|l| *l.borrow_mut() = lb);
489 }
490
491 /// Construct locale from the string representation.
492 ///
493 /// `Locale` is represented by a `,`-separated sequence of tags in `LanguageRange` syntax, where
494 /// all except the first one may be preceded by category name and `=` sign.
495 ///
496 /// The first tag indicates the default locale, the tags prefixed by category names indicate
497 /// _overrides_ for those categories and the remaining tags indicate fallbacks.
498 pub fn new(s: &str) -> Result<Locale> {
499 let mut i = s.split(',');
500 let mut res = Locale::from(
501 try!(LanguageRange::new(
502 i.next().unwrap()))); // NOTE: split "" is (""), not ()
503 for t in i {
504 if let Some(caps) = LOCALE_ELEMENT_REGEX.captures(t) {
505 let tag = try!(LanguageRange::new(
506 try!(caps.name("tag").map(|m| m.as_str()).ok_or(Error::NotWellFormed))));
507 match caps.name("category").map(|m| m.as_str()) {
508 Some(cat) => res.add_category(cat.to_ascii_lowercase().as_ref(), &tag),
509 None => res.add(&tag),
510 }
511 } else {
512 return Err(Error::NotWellFormed);
513 }
514 }
515 return Ok(res);
516 }
517
518 /// Construct invariant locale.
519 ///
520 /// Invariant locale is represented simply with empty string.
521 pub fn invariant() -> Locale {
522 Locale::from(LanguageRange::invariant())
523 }
524
525 /// Append fallback language tag.
526 ///
527 /// Adds fallback to the end of the list.
528 pub fn add(&mut self, tag: &LanguageRange) {
529 for i in self.inner.split(',') {
530 if i == tag.as_ref() {
531 return; // don't add duplicates
532 }
533 }
534 self.inner.push_str(",");
535 self.inner.push_str(tag.as_ref());
536 }
537
538 /// Append category override.
539 ///
540 /// Appending new override for a category that already has one will not replace the existing
541 /// override. This might change in future.
542 pub fn add_category(&mut self, category: &str, tag: &LanguageRange) {
543 if self.inner.split(',').next().unwrap() == tag.as_ref() {
544 return; // don't add useless override equal to the primary tag
545 }
546 for i in self.inner.split(',') {
547 if i.starts_with(category) &&
548 i[category.len()..].starts_with("=") &&
549 &i[category.len() + 1..] == tag.as_ref() {
550 return; // don't add duplicates
551 }
552 }
553 self.inner.push_str(",");
554 self.inner.push_str(category);
555 self.inner.push_str("=");
556 self.inner.push_str(tag.as_ref());
557 }
558
559 /// Iterate over `LanguageRange`s in this `Locale`.
560 ///
561 /// Returns tuples of optional category (as string) and corresponding `LanguageRange`. All tags
562 /// in the list are returned, in order of preference.
563 ///
564 /// The iterator is guaranteed to return at least one value.
565 pub fn tags<'a>(&'a self) -> Tags<'a> {
566 Tags { tags: self.inner.split(","), }
567 }
568
569 /// Iterate over `LanguageRange`s in this `Locale` applicable to given category.
570 ///
571 /// Returns `LanguageRange`s in the `Locale` that are applicable to provided category. The tags
572 /// are returned in order of preference, which means the category-specific ones first and then
573 /// the generic ones.
574 ///
575 /// The iterator is guaranteed to return at least one value.
576 pub fn tags_for<'a, 'c>(&'a self, category: &'c str) -> TagsFor<'a, 'c> {
577 let mut tags = self.inner.split(",");
578 while let Some(s) = tags.clone().next() {
579 if s.starts_with(category) && s[category.len()..].starts_with("=") {
580 return TagsFor {
581 src: self.inner.as_ref(),
582 tags: tags,
583 category: Some(category),
584 };
585 }
586 tags.next();
587 }
588 return TagsFor {
589 src: self.inner.as_ref(),
590 tags: self.inner.split(","),
591 category: None,
592 };
593 }
594}
595
596/// Locale is specified by a string tag. This is the way to access it.
597// FIXME: Do we want to provide the full string representation? We would have it as single string
598// then.
599impl AsRef<str> for Locale {
600 fn as_ref(&self) -> &str {
601 self.inner.as_ref()
602 }
603}
604
605impl<'a> From<LanguageRange<'a>> for Locale {
606 fn from(t: LanguageRange<'a>) -> Locale {
607 Locale {
608 inner: t.language.into_owned(),
609 }
610 }
611}
612
613impl fmt::Display for Locale {
614 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
615 self.inner.fmt(f)
616 }
617}
618
619/// Iterator over `LanguageRange`s for all categories in a `Locale`
620///
621/// Returns tuples of optional category (as string) and corresponding `LanguageRange`. All tags
622/// in the list are returned, in order of preference.
623///
624/// The iterator is guaranteed to return at least one value.
pub struct Tags<'a> {
    // Remaining `,`-separated elements of the locale string.
    tags: std::str::Split<'a, &'static str>,
}
628
629impl<'a> Iterator for Tags<'a> {
630 type Item = (Option<&'a str>, LanguageRange<'a>);
631 fn next(&mut self) -> Option<Self::Item> {
632 if let Some(s: &str) = self.tags.next() {
633 if let Some(i: usize) = s.find('=') {
634 return Some((
635 Some(&s[..i]),
636 LanguageRange { language: Cow::Borrowed(&s[i+1..]), }));
637 } else {
638 return Some((
639 None,
640 LanguageRange { language: Cow::Borrowed(s), }));
641 }
642 } else {
643 return None;
644 }
645 }
646}
647
648/// Iterator over `LanguageRange`s for specific category in a `Locale`
649///
650/// Returns `LanguageRange`s in the `Locale` that are applicable to provided category. The tags
651/// are returned in order of preference, which means the category-specific ones first and then
652/// the generic ones.
653///
654/// The iterator is guaranteed to return at least one value.
pub struct TagsFor<'a, 'c> {
    // The whole locale string; needed to restart the scan for the generic tags.
    src: &'a str,
    // Remaining `,`-separated elements of the current scan.
    tags: std::str::Split<'a, &'static str>,
    // Category whose overrides are being returned; `None` once only generic tags remain.
    category: Option<&'c str>,
}
660
661impl<'a, 'c> Iterator for TagsFor<'a, 'c> {
662 type Item = LanguageRange<'a>;
663 fn next(&mut self) -> Option<Self::Item> {
664 if let Some(cat: &str) = self.category {
665 while let Some(s: &str) = self.tags.next() {
666 if s.starts_with(cat) && s[cat.len()..].starts_with("=") {
667 return Some(
668 LanguageRange { language: Cow::Borrowed(&s[cat.len()+1..]) });
669 }
670 }
671 self.category = None;
672 self.tags = self.src.split(",");
673 }
674 while let Some(s: &str) = self.tags.next() {
675 if s.find('=').is_none() {
676 return Some(
677 LanguageRange{ language: Cow::Borrowed(s) });
678 }
679 }
680 return None;
681 }
682}
683
684// ------------------------------- INSTANCES -----------------------------------
685
686// TODO: We only need this until either std::sync::StaticMutex or std::sync::Mutex becomes usable
687// with normal `static`.
688// FIX-THE-TODO: Do we? A mutex might be usable, but we still need to initialize the value inside
689// on first access!
lazy_static! {
    // TODO: Implement the constructor
    // Locale detected from the system by `system_locale()`, computed on first access.
    static ref USER_LOCALE: Locale = system_locale();
    // Process-wide default locale; starts out as a copy of the user default.
    static ref GLOBAL_LOCALE: Mutex<Locale> = Mutex::new(Locale::user_default());
}
695
696thread_local!(
697 static CURRENT_LOCALE: RefCell<Locale> = RefCell::new(Locale::global_default())
698);
699
700// NOTE: Cgi-style environment variable HTTP_ACCEPT_LANGUAGE is unlikely to be defined at any other
701// time than when actually executing in CGI, so we can relatively safely always interpret it.
702mod cgi;
703
704// NOTE: Unix-style environment variables are actually inspected everywhere, because many users
705// have them, because some software only uses those even on Windows and other systems.
706mod unix;
707
708// NOTE: Functions used exist from Vista on only
709#[cfg(target_family = "windows")]
710mod win32;
711
712// Emscripten support
713#[cfg(target_os = "emscripten")]
714mod emscripten;
715
716// macOS support
717#[cfg(target_os = "macos")]
718mod macos;
719
720static INITIALISERS: &'static [fn() -> Option<Locale>] = &[
721 cgi::system_locale,
722 unix::system_locale,
723 #[cfg(target_family = "windows")] win32::system_locale,
724 #[cfg(target_os = "emscripten")] emscripten::system_locale,
725 #[cfg(target_os = "macos")] macos::system_locale,
726];
727
728fn system_locale() -> Locale {
729 for f: &fn() -> Option in INITIALISERS {
730 if let Some(l: Locale) = f() {
731 return l;
732 }
733 }
734 return Locale::invariant();
735}
736
737// --------------------------------- ERRORS ------------------------------------
738
/// Errors that may be returned by `locale_config`.
#[derive(Copy,Clone,Debug,PartialEq,Eq)]
pub enum Error {
    /// Provided definition was not well formed.
    ///
    /// This is returned when provided configuration string does not match even the rather loose
    /// definition for language range from [RFC4647] or the composition format used by `Locale`.
    ///
    /// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt
    NotWellFormed,
    /// Placeholder for adding more errors in the future. **Do not match!**
    ///
    /// This variant is never returned; asking it for its description panics.
    __NonExhaustive,
}
752
753impl ::std::fmt::Display for Error {
754 fn fmt(&self, out: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
755 use ::std::error::Error;
756 out.write_str(self.description())
757 }
758}
759
760impl ::std::error::Error for Error {
761 fn description(&self) -> &str {
762 match self {
763 &Error::NotWellFormed => "Language tag is not well-formed.",
764 // this is exception: here we do want exhaustive match so we don't publish version with
765 // missing descriptions by mistake.
766 &Error::__NonExhaustive => panic!("Placeholder error must not be instantiated!"),
767 }
768 }
769}
770
/// Convenience `Result` alias with the error type fixed to this crate's `Error`.
type Result<T> = ::std::result::Result<T, Error>;
773
774// --------------------------------- TESTS -------------------------------------
775
#[cfg(test)]
mod test {
    use super::LanguageRange;
    use super::Locale;
    use super::is_owned;
    use std::iter::FromIterator;

    // Plain tags are accepted and case-normalized.
    #[test]
    fn simple_valid_lang_ranges() {
        assert_eq!("en-US", LanguageRange::new("en-US").unwrap().as_ref());
        assert_eq!("en-US", LanguageRange::new("EN-US").unwrap().as_ref());
        assert_eq!("en", LanguageRange::new("en").unwrap().as_ref());
        assert_eq!("eng-Latn-840", LanguageRange::new("eng-Latn-840").unwrap().as_ref());
        assert_eq!("english", LanguageRange::new("English").unwrap().as_ref());
    }

    // `*` may stand in for any subtag without disturbing normalization of the others.
    #[test]
    fn wildcard_lang_ranges() {
        assert_eq!("*", LanguageRange::new("*").unwrap().as_ref());
        assert_eq!("zh-*", LanguageRange::new("zh-*").unwrap().as_ref());
        assert_eq!("zh-*-CN", LanguageRange::new("zh-*-cn").unwrap().as_ref());
        assert_eq!("en-*-simple-*", LanguageRange::new("En-*-Simple-*").unwrap().as_ref());
        assert_eq!("zh-Hans-*", LanguageRange::new("zh-hans-*").unwrap().as_ref());
    }

    // Extensions and private-use subtags are lowercased.
    #[test]
    fn complex_valid_lang_ranges() {
        assert_eq!("de-DE-u-email-co-phonebk-x-linux",
                   LanguageRange::new("de-DE-u-email-co-phonebk-x-linux").unwrap().as_ref());
        assert_eq!("vi-VN-u-fw-mon-hc-h24-ms-metric",
                   LanguageRange::new("vi-vn-u-fw-mon-hc-h24-ms-metric").unwrap().as_ref());
        assert_eq!("sl-Cyrl-YU-rozaj-solba-1994-b-1234-a-foobar-x-b-1234-a-foobar",
                   LanguageRange::new("sl-Cyrl-YU-rozaj-solba-1994-b-1234-a-Foobar-x-b-1234-a-Foobar").unwrap().as_ref());
    }

    #[test]
    fn invalid_lang_range_invalid_char() {
        assert!(LanguageRange::new("not a range").is_err());
    }

    // Subtags are limited to 8 characters ("phonebook" has 9).
    #[test]
    fn invalid_lang_range_long_element() {
        assert!(LanguageRange::new("de-DE-u-email-co-phonebook-x-linux").is_err());
    }

    // The first subtag must be alphabetic.
    #[test]
    fn invalid_lang_range_leading_number() {
        assert!(LanguageRange::new("840").is_err());
    }

    // `*` replaces a whole subtag; it can't be mixed with other characters.
    #[test]
    fn invalid_lang_range_bad_asterisk() {
        assert!(LanguageRange::new("e*-US").is_err());
        assert!(LanguageRange::new("en-*s").is_err());
    }

    #[test]
    fn normal_lang_range() {
        // Check that the string is not copied if the tag is canonical
        assert!(!is_owned(&LanguageRange::new("en-US").unwrap().language));
        assert!(!is_owned(&LanguageRange::new("en").unwrap().language));
        assert!(!is_owned(&LanguageRange::new("zh-Hant-CN").unwrap().language));
        assert!(!is_owned(&LanguageRange::new("cs-CZ-x-ds-002e").unwrap().language));
        assert!(!is_owned(&LanguageRange::new("czech").unwrap().language));
    }

    // A locale with a single tag behaves like that tag; non-ASCII letters are rejected.
    #[test]
    fn locale_simple() {
        assert_eq!("en-US", Locale::new("en-US").unwrap().as_ref());
        assert_eq!("zh-Hant", Locale::new("zh-hant").unwrap().as_ref());
        assert_eq!("de-*", Locale::new("de-*").unwrap().as_ref());
        assert!(Locale::new("invalid!").is_err());
        assert!(Locale::new("hı-İN").is_err());
    }

    // Every element of the list is validated and normalized.
    #[test]
    fn locale_list() {
        assert_eq!("cs-CZ,en-GB,en,*", Locale::new("cs-cz,en-gb,en,*").unwrap().as_ref());
        assert_eq!("cs-CZ,engrish", Locale::new("cs-cz,engrish").unwrap().as_ref());
        assert!(Locale::new("cs-cz,hı-İN").is_err());
    }

    // Category names are lowercased; their tags are normalized like any other.
    #[test]
    fn locale_category() {
        assert_eq!("cs-CZ,messages=en-GB",
                   Locale::new("cs-CZ,messages=en-GB").unwrap().as_ref());
        assert_eq!("zh-Hant,time=ja-JP,measurement=en-US",
                   Locale::new("zh-hant,TIME=ja-jp,meaSURement=en-US").unwrap().as_ref());
        // the first item must be plain language tag
        assert!(Locale::new("messages=pl").is_err());
        // adding general alternate should not help
        assert!(Locale::new("numeric=de,fr-FR").is_err());
    }

    // Repeated fallbacks are dropped; comparison happens after normalization.
    #[test]
    fn locale_dups() {
        assert_eq!("cs-CZ,en,de-AT", Locale::new("cs-CZ,en,de-AT,en").unwrap().as_ref());
        assert_eq!("en-US,en", Locale::new("en-us,en-US,EN,eN-Us,en").unwrap().as_ref());
    }

    // Overrides equal to the primary tag and repeated overrides are dropped.
    #[test]
    fn locale_category_dups() {
        assert_eq!("cs-CZ",
                   Locale::new("cs-CZ,messages=cs-CZ,time=cs-cz,collate=CS-cz").unwrap().as_ref());
        assert_eq!("de-AT,en-AU",
                   Locale::new("de-AT,en-AU,messages=de-AT").unwrap().as_ref());
        // category overrides override, so don't drop if they are only equal to alternates
        assert_eq!("de-AT,en-AU,messages=en-AU",
                   Locale::new("de-AT,en-AU,messages=en-AU").unwrap().as_ref());
        assert_eq!("hi-IN,time=en-IN",
                   Locale::new("hi-IN,time=en-IN,TIME=EN-in,TiMe=En-iN").unwrap().as_ref());
    }

    // Unix tags: encoding is ignored, known variants are mapped, `C`/`POSIX` are invariant.
    #[test]
    fn unix_tags() {
        assert_eq!("cs-CZ", LanguageRange::from_unix("cs_CZ.UTF-8").unwrap().as_ref());
        assert_eq!("sr-RS-ijekavsk", LanguageRange::from_unix("sr_RS@ijekavian").unwrap().as_ref());
        assert_eq!("sr-Latn-ijekavsk", LanguageRange::from_unix("sr.UTF-8@ijekavianlatin").unwrap().as_ref());
        assert_eq!("en-Arab", LanguageRange::from_unix("en@arabic").unwrap().as_ref());
        assert_eq!("en-Arab", LanguageRange::from_unix("en.UTF-8@arabic").unwrap().as_ref());
        assert_eq!("de-DE", LanguageRange::from_unix("DE_de.UTF-8@euro").unwrap().as_ref());
        assert_eq!("ssy-ER", LanguageRange::from_unix("aa_ER@saaho").unwrap().as_ref());
        assert!(LanguageRange::from_unix("foo_BAR").is_err());
        assert!(LanguageRange::from_unix("en@arabic.UTF-8").is_err());
        assert_eq!("", LanguageRange::from_unix("C").unwrap().as_ref());
        assert_eq!("", LanguageRange::from_unix("C.UTF-8").unwrap().as_ref());
        assert_eq!("", LanguageRange::from_unix("C.ISO-8859-1").unwrap().as_ref());
        assert_eq!("", LanguageRange::from_unix("POSIX").unwrap().as_ref());
    }

    // `tags()` returns every element together with its category, in order.
    #[test]
    fn category_tag_list() {
        assert_eq!(
            Vec::from_iter(Locale::new("cs-CZ,messages=en-GB,time=de-DE,collate=en-US").unwrap().tags()),
            &[(None, LanguageRange::new("cs-CZ").unwrap()),
              (Some("messages"), LanguageRange::new("en-GB").unwrap()),
              (Some("time"), LanguageRange::new("de-DE").unwrap()),
              (Some("collate"), LanguageRange::new("en-US").unwrap()),
            ]);
    }

    // `tags_for()` returns the category override first and then the generic tags.
    #[test]
    fn tag_list_for() {
        let locale = Locale::new("cs-CZ,messages=en-GB,time=de-DE,sk-SK,pl-PL").unwrap();
        assert_eq!(
            Vec::from_iter(locale.tags_for("messages")),
            &[LanguageRange::new("en-GB").unwrap(),
              LanguageRange::new("cs-CZ").unwrap(),
              LanguageRange::new("sk-SK").unwrap(),
              LanguageRange::new("pl-PL").unwrap(),
            ]);
        assert_eq!(
            Vec::from_iter(locale.tags_for("time")),
            &[LanguageRange::new("de-DE").unwrap(),
              LanguageRange::new("cs-CZ").unwrap(),
              LanguageRange::new("sk-SK").unwrap(),
              LanguageRange::new("pl-PL").unwrap(),
            ]);
        assert_eq!(
            Vec::from_iter(locale.tags_for("measurement")),
            &[LanguageRange::new("cs-CZ").unwrap(),
              LanguageRange::new("sk-SK").unwrap(),
              LanguageRange::new("pl-PL").unwrap(),
            ]);
    }
}
942