1 | //! Global locale instances and system inspection. |
2 | //! |
3 | //! This is an auxiliary crate for i18n solutions that: |
4 | //! |
5 | //! - Holds the appropriate default instances of locale. |
6 | //! - Inspects the system for the initial values. |
7 | //! |
8 | //! You don't want to use it directly, but instead use an internationalisation crate like [locale]. |
9 | //! |
10 | //! This crate is separate and intentionally minimal so that multiple i18n crates or multiple |
11 | //! versions of one that get into the application still share the current locale setting. |
12 | //! |
13 | //! [locale]: https://crates.io/crates/locale |
14 | |
15 | #[macro_use ] |
16 | extern crate lazy_static; |
17 | |
18 | extern crate regex; |
19 | |
20 | #[cfg (target_os = "macos" )] |
21 | #[macro_use ] |
22 | extern crate objc; |
23 | |
24 | use regex::Regex; |
25 | use std::borrow::{Borrow,Cow}; |
26 | use std::cell::RefCell; |
27 | use std::convert::AsRef; |
28 | use std::fmt; |
29 | use std::sync::Mutex; |
30 | |
31 | // ------------------------------ LANGUAGE RANGE --------------------------------- |
32 | |
33 | /// Language and culture identifier. |
34 | /// |
35 | /// This object holds a [RFC4647] extended language range. |
36 | /// |
37 | /// The internal data may be owned or shared from object with lifetime `'a`. The lifetime can be |
38 | /// extended using the `into_static()` method, which internally clones the data as needed. |
39 | /// |
40 | /// # Syntax |
41 | /// |
42 | /// The range is composed of `-`-separated alphanumeric subtags, possibly replaced by `*`s. It |
43 | /// might be empty. |
44 | /// |
45 | /// In agreement with [RFC4647], this object only requires that the tag matches: |
46 | /// |
47 | /// ```ebnf |
48 | /// language_tag = (alpha{1,8} | "*") |
49 | /// ("-" (alphanum{1,8} | "*"))* |
50 | /// ``` |
51 | /// |
/// The exact interpretation is up to the downstream localization provider, but it is expected that
53 | /// it will be matched against a normalized [RFC5646] language tag, which has the structure: |
54 | /// |
55 | /// ```ebnf |
56 | /// language_tag = language |
57 | /// ("-" script)? |
58 | /// ("-" region)? |
59 | /// ("-" variant)* |
60 | /// ("-" extension)* |
61 | /// ("-" private)? |
62 | /// |
63 | /// language = alpha{2,3} ("-" alpha{3}){0,3} |
64 | /// |
/// script = alpha{4}
66 | /// |
67 | /// region = alpha{2} |
68 | /// | digit{3} |
69 | /// |
70 | /// variant = alphanum{5,8} |
71 | /// | digit alphanum{3} |
72 | /// |
73 | /// extension = [0-9a-wyz] ("-" alphanum{2,8})+ |
74 | /// |
75 | /// private = "x" ("-" alphanum{1,8})+ |
76 | /// ``` |
77 | /// |
78 | /// * `language` is an [ISO639] 2-letter or, where not defined, 3-letter code. A code for |
79 | /// macro-language might be followed by code of specific dialect. |
80 | /// * `script` is an [ISO15924] 4-letter code. |
81 | /// * `region` is either an [ISO3166] 2-letter code or, for areas other than countries, [UN M.49] |
82 | /// 3-digit numeric code. |
83 | /// * `variant` is a string indicating variant of the language. |
/// * `extension` and `private` define additional options. The private part has the same structure as
85 | /// the Unicode [`-u-` extension][u_ext]. Available options are documented for the facets that |
86 | /// use them. |
87 | /// |
88 | /// The values obtained by inspecting the system are normalized according to those rules. |
89 | /// |
90 | /// The content will be case-normalized as recommended in [RFC5646] §2.1.1, namely: |
91 | /// |
92 | /// * `language` is written in lowercase, |
93 | /// * `script` is written with first capital, |
/// * `region` is written in uppercase and
95 | /// * all other subtags are written in lowercase. |
96 | /// |
97 | /// When detecting system configuration, additional options that may be generated under the |
98 | /// [`-u-` extension][u_ext] currently are: |
99 | /// |
100 | /// * `cf` — Currency format (`account` for parenthesized negative values, `standard` for minus |
101 | /// sign). |
102 | /// * `fw` — First day of week (`mon` to `sun`). |
103 | /// * `hc` — Hour cycle (`h12` for 1–12, `h23` for 0–23). |
104 | /// * `ms` — Measurement system (`metric` or `ussystem`). |
105 | /// * `nu` — Numbering system—only decimal systems are currently used. |
106 | /// * `va` — Variant when locale is specified in Unix format and the tag after `@` does not |
107 | /// correspond to any variant defined in [Language subtag registry]. |
108 | /// |
109 | /// And under the `-x-` extension, following options are defined: |
110 | /// |
111 | /// * `df` — Date format: |
112 | /// |
113 | /// * `iso`: Short date should be in ISO format of `yyyy-MM-dd`. |
114 | /// |
115 | /// For example `-df-iso`. |
116 | /// |
117 | /// * `dm` — Decimal separator for monetary: |
118 | /// |
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-dm-002c` means to
/// use comma.
121 | /// |
122 | /// * `ds` — Decimal separator for numbers: |
123 | /// |
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-ds-002c` means to
/// use comma.
126 | /// |
127 | /// * `gm` — Group (thousand) separator for monetary: |
128 | /// |
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-gm-00a0` means to
130 | /// use non-breaking space. |
131 | /// |
132 | /// * `gs` — Group (thousand) separator for numbers: |
133 | /// |
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-gs-00a0` means to
135 | /// use non-breaking space. |
136 | /// |
137 | /// * `ls` — List separator: |
138 | /// |
/// Followed by one or more Unicode codepoints in hexadecimal. For example, `-ls-003b` means to
140 | /// use a semicolon. |
141 | /// |
142 | /// [RFC5646]: https://www.rfc-editor.org/rfc/rfc5646.txt |
143 | /// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt |
144 | /// [ISO639]: https://en.wikipedia.org/wiki/ISO_639 |
145 | /// [ISO15924]: https://en.wikipedia.org/wiki/ISO_15924 |
146 | /// [ISO3166]: https://en.wikipedia.org/wiki/ISO_3166 |
147 | /// [UN M.49]: https://en.wikipedia.org/wiki/UN_M.49 |
148 | /// [u_ext]: http://www.unicode.org/reports/tr35/#u_Extension |
149 | /// [Language subtag registry]: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct LanguageRange<'a> {
    /// Text of the range, either borrowed for the lifetime `'a` or owned by this value.
    language: Cow<'a, str>,
}
154 | |
lazy_static! {
    // All patterns use the `x` flag, so whitespace inside them is insignificant.

    // Range shaped like an RFC5646 tag, split into named groups: `language` (2-3 letters plus
    // up to three 3-letter extended subtags), optional `script` (4 letters), optional `region`
    // (2 letters or 3 digits) and `rest` (any further subtags). Every position may also be
    // `*`. The `script`, `region` and `rest` groups include their leading `-`.
    static ref REGULAR_LANGUAGE_RANGE_REGEX: Regex = Regex::new(r"(?x) ^
        (?P<language> (?:
            [[:alpha:]]{2,3} (?: - [[:alpha:]]{3} ){0,3}
            | \* ))
        (?P<script> - (?: [[:alpha:]]{4} | \* ))?
        (?P<region> - (?: [[:alpha:]]{2} | [[:digit:]]{3} | \* ))?
        (?P<rest> (?: - (?: [[:alnum:]]{1,8} | \* ))*)
    $ ").unwrap();
    // Loose range syntax: a first subtag of 1-8 letters or `*`, followed by any number of
    // `-`-separated subtags of 1-8 alphanumerics or `*`.
    static ref LANGUAGE_RANGE_REGEX: Regex = Regex::new(r"(?x) ^
        (?: [[:alpha:]]{1,8} | \* )
        (?: - (?: [[:alnum:]]{1,8} | \* ))*
    $ ").unwrap();
    // Unix name of the invariant locale: `C` or `POSIX` (case-insensitive), optionally followed
    // by `.` and an encoding name.
    static ref UNIX_INVARIANT_REGEX: Regex = Regex::new(r"(?ix) ^
        (?: c | posix )
        (?: \. (?: [0-9a-zA-Z-]{1,20} ))?
    $ ").unwrap();
    // Unix locale tag `language[_region][.encoding][@variant]`, matched case-insensitively.
    static ref UNIX_TAG_REGEX: Regex = Regex::new(r"(?ix) ^
        (?P<language> [[:alpha:]]{2,3} )
        (?: _ (?P<region> [[:alpha:]]{2} | [[:digit:]]{3} ))?
        (?: \. (?P<encoding> [0-9a-zA-Z-]{1,20} ))?
        (?: @ (?P<variant> [[:alnum:]]{1,20} ))?
    $ ").unwrap();
}
179 | |
/// Tell whether `c` holds owned data (as opposed to a borrow).
fn is_owned<'a, T: ToOwned + ?Sized>(c: &Cow<'a, T>) -> bool {
    if let Cow::Owned(_) = *c {
        true
    } else {
        false
    }
}
186 | |
/// Lowercase an optional subtag.
///
/// `None` becomes the empty string. The input is borrowed when it contains no uppercase
/// character; otherwise an ASCII-lowercased copy is returned.
fn canon_lower<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    let text = match o {
        Some(text) => text,
        None => return Cow::Borrowed(""),
    };
    let needs_copy = text.chars().any(char::is_uppercase);
    if needs_copy {
        Cow::Owned(text.to_ascii_lowercase())
    } else {
        Cow::Borrowed(text)
    }
}
198 | |
/// Canonicalize an optional script subtag to title case.
///
/// `o`, when present, must be the subtag *including* its leading `-` (e.g. `-latn`); `None`
/// becomes the empty string. The result has the character after the dash in ASCII uppercase
/// and the remainder in ASCII lowercase.
///
/// The input is borrowed whenever it already is in that form. This includes subtags without
/// letters such as the wildcard `-*`, which previously were copied for no reason.
///
/// # Panics
///
/// Panics if the subtag is shorter than two bytes or does not start with `-`.
fn canon_script<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    assert!(o.map_or(true, |s| s.len() >= 2 && &s[0..1] == "-"));
    match o {
        None => Cow::Borrowed(""),
        Some(s) => {
            // Skip the dash (one ASCII byte, guaranteed by the assertion) and split after the
            // first character on a char boundary, so odd input cannot cause a slicing panic.
            let body = &s[1..];
            let initial_len = body.chars().next().map_or(0, char::len_utf8);
            let (initial, tail) = body.split_at(initial_len);
            // Canonical means "nothing would change", not "looks like a word": characters
            // without case (`*`, digits) are fine wherever they appear.
            let canonical = !initial.chars().any(|c| c.is_ascii_lowercase())
                && !tail.chars().any(|c| c.is_ascii_uppercase());
            if canonical {
                Cow::Borrowed(s)
            } else {
                let mut fixed = String::with_capacity(s.len());
                fixed.push('-');
                fixed.push_str(&initial.to_ascii_uppercase());
                fixed.push_str(&tail.to_ascii_lowercase());
                Cow::Owned(fixed)
            }
        }
    }
}
214 | |
/// Uppercase an optional subtag that carries its leading `-`.
///
/// `None` becomes the empty string. The input is borrowed when it contains no lowercase
/// character; otherwise an ASCII-uppercased copy is returned.
///
/// # Panics
///
/// Panics if the subtag is not longer than one byte or does not start with `-`.
fn canon_upper<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    assert!(o.map_or(true, |s| s.len() > 1 && &s[0..1] == "-"));
    o.map_or(Cow::Borrowed(""), |subtag| {
        let needs_copy = subtag.chars().any(char::is_lowercase);
        if needs_copy {
            Cow::Owned(subtag.to_ascii_uppercase())
        } else {
            Cow::Borrowed(subtag)
        }
    })
}
227 | |
228 | impl<'a> LanguageRange<'a> { |
229 | /// Construct LanguageRange from string, with normalization. |
230 | /// |
231 | /// LanguageRange must follow the [RFC4647] syntax. |
232 | /// It will be case-normalized as recommended in [RFC5646] §2.1.1, namely: |
233 | /// |
234 | /// * `language`, if recognized, is written in lowercase, |
235 | /// * `script`, if recognized, is written with first capital, |
236 | /// * `country`, if recognized, is written in uppercase and |
237 | /// * all other subtags are written in lowercase. |
238 | /// |
239 | /// [RFC5646]: https://www.rfc-editor.org/rfc/rfc5646.txt |
240 | /// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt |
241 | pub fn new(lt: &'a str) -> Result<LanguageRange> { |
242 | if lt == "" { |
243 | return Ok(LanguageRange { |
244 | language: Cow::Borrowed(lt), |
245 | }); |
246 | } else if let Some(caps) = REGULAR_LANGUAGE_RANGE_REGEX.captures(lt) { |
247 | let language = canon_lower(caps.name("language" ).map(|m| m.as_str())); |
248 | let script = canon_script(caps.name("script" ).map(|m| m.as_str())); |
249 | let region = canon_upper(caps.name("region" ).map(|m| m.as_str())); |
250 | let rest = canon_lower(caps.name("rest" ).map(|m| m.as_str())); |
251 | if is_owned(&language) || |
252 | is_owned(&script) || |
253 | is_owned(®ion) || |
254 | is_owned(&rest) |
255 | { |
256 | return Ok(LanguageRange { |
257 | language: Cow::Owned( |
258 | language.into_owned() + |
259 | script.borrow() + |
260 | region.borrow() + |
261 | rest.borrow()), |
262 | }); |
263 | } else { |
264 | return Ok(LanguageRange { |
265 | language: Cow::Borrowed(lt), |
266 | }); |
267 | } |
268 | } else if LANGUAGE_RANGE_REGEX.is_match(lt) { |
269 | return Ok(LanguageRange { |
270 | language: canon_lower(Some(lt)), |
271 | }); |
272 | } else { |
273 | return Err(Error::NotWellFormed); |
274 | } |
275 | } |
276 | |
277 | /// Return LanguageRange for the invariant locale. |
278 | /// |
279 | /// Invariant language is identified simply by empty string. |
280 | pub fn invariant() -> LanguageRange<'static> { |
281 | LanguageRange { language: Cow::Borrowed("" ) } |
282 | } |
283 | |
284 | /// Clone the internal data to extend lifetime. |
285 | pub fn into_static(self) -> LanguageRange<'static> { |
286 | LanguageRange { |
287 | language: Cow::Owned(self.language.into_owned()) |
288 | } |
289 | } |
290 | |
291 | /// Create new instance sharing the internal data. |
292 | pub fn to_shared(&'a self) -> Self { |
293 | LanguageRange { |
294 | language: Cow::Borrowed(self.language.borrow()) |
295 | } |
296 | } |
297 | |
298 | /// Create language tag from Unix/Linux/GNU locale tag. |
299 | /// |
300 | /// Unix locale tags have the form |
301 | /// |
302 | /// > *language* [ `_` *region* ] [ `.` *encoding* ] [ `@` *variant* ] |
303 | /// |
304 | /// The *language* and *region* have the same format as RFC5646. *Encoding* is not relevant |
305 | /// here, since Rust always uses Utf-8. That leaves *variant*, which is unfortunately rather |
306 | /// free-form. So this function will translate known variants to corresponding RFC5646 subtags |
307 | /// and represent anything else with Unicode POSIX variant (`-u-va-`) extension. |
308 | /// |
309 | /// Note: This function is public here for benefit of applications that may come across this |
310 | /// kind of tags from other sources than system configuration. |
311 | pub fn from_unix(s: &str) -> Result<LanguageRange<'static>> { |
312 | if let Some(caps) = UNIX_TAG_REGEX.captures(s) { |
313 | let src_variant = caps.name("variant" ).map(|m| m.as_str()).unwrap_or("" ).to_ascii_lowercase(); |
314 | let mut res = caps.name("language" ).map(|m| m.as_str()).unwrap().to_ascii_lowercase(); |
315 | let region = caps.name("region" ).map(|m| m.as_str()).unwrap_or("" ); |
316 | let mut script = "" ; |
317 | let mut variant = "" ; |
318 | let mut uvariant = "" ; |
319 | match src_variant.as_ref() { |
320 | // Variants seen in the wild in GNU LibC (via http://lh.2xlibre.net/) or in Debian |
321 | // GNU/Linux Stretch system. Treatment of things not found in RFC5646 subtag registry |
322 | // (http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry) |
323 | // or CLDR according to notes at https://wiki.openoffice.org/wiki/LocaleMapping. |
324 | // Dialects: |
325 | // aa_ER@saaho - NOTE: Can't be found under that name in RFC5646 subtag registry, |
326 | // but there is language Saho with code ssy, which is likely that thing. |
327 | "saaho" if res == "aa" => res = String::from("ssy" ), |
328 | // Scripts: |
329 | // @arabic |
330 | "arabic" => script = "Arab" , |
331 | // @cyrillic |
332 | "cyrl" => script = "Cyrl" , |
333 | "cyrillic" => script = "Cyrl" , |
334 | // @devanagari |
335 | "devanagari" => script = "Deva" , |
336 | // @hebrew |
337 | "hebrew" => script = "Hebr" , |
338 | // tt@iqtelif |
339 | // Neither RFC5646 subtag registry nor CLDR knows anything about this, but as best |
340 | // as I can tell it is Tatar name for Latin (default is Cyrillic). |
341 | "iqtelif" => script = "Latn" , |
342 | // @Latn |
343 | "latn" => script = "Latn" , |
344 | // @latin |
345 | "latin" => script = "Latn" , |
346 | // en@shaw |
347 | "shaw" => script = "Shaw" , |
348 | // Variants: |
349 | // sr@ijekavianlatin |
350 | "ijekavianlatin" => { |
351 | script = "Latn" ; |
352 | variant = "ijekavsk" ; |
353 | }, |
354 | // sr@ije |
355 | "ije" => variant = "ijekavsk" , |
356 | // sr@ijekavian |
357 | "ijekavian" => variant = "ijekavsk" , |
358 | // ca@valencia |
359 | "valencia" => variant = "valencia" , |
360 | // Currencies: |
361 | // @euro - NOTE: We follow suite of Java and Openoffice and ignore it, because it |
362 | // is default for all locales where it sometimes appears now, and because we use |
363 | // explicit currency in monetary formatting anyway. |
364 | "euro" => {}, |
365 | // Collation: |
366 | // gez@abegede - NOTE: This is collation, but CLDR does not have any code for it, |
367 | // so we for the moment leave it fall through as -u-va- instead of -u-co-. |
368 | // Anything else: |
369 | // en@boldquot, en@quot, en@piglatin - just randomish stuff |
370 | // @cjknarrow - beware, it's gonna end up as -u-va-cjknarro due to lenght limit |
371 | s if s.len() <= 8 => uvariant = &*s, |
372 | s => uvariant = &s[0..8], // the subtags are limited to 8 chars, but some are longer |
373 | }; |
374 | if script != "" { |
375 | res.push('-' ); |
376 | res.push_str(script); |
377 | } |
378 | if region != "" { |
379 | res.push('-' ); |
380 | res.push_str(&*region.to_ascii_uppercase()); |
381 | } |
382 | if variant != "" { |
383 | res.push('-' ); |
384 | res.push_str(variant); |
385 | } |
386 | if uvariant != "" { |
387 | res.push_str("-u-va-" ); |
388 | res.push_str(uvariant); |
389 | } |
390 | return Ok(LanguageRange { |
391 | language: Cow::Owned(res) |
392 | }); |
393 | } else if UNIX_INVARIANT_REGEX.is_match(s) { |
394 | return Ok(LanguageRange::invariant()) |
395 | } else { |
396 | return Err(Error::NotWellFormed); |
397 | } |
398 | } |
399 | } |
400 | |
401 | impl<'a> AsRef<str> for LanguageRange<'a> { |
402 | fn as_ref(&self) -> &str { |
403 | self.language.as_ref() |
404 | } |
405 | } |
406 | |
407 | impl<'a> fmt::Display for LanguageRange<'a> { |
408 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
409 | self.language.fmt(f) |
410 | } |
411 | } |
412 | |
413 | // -------------------------------- LOCALE ------------------------------------- |
414 | |
415 | /// Locale configuration. |
416 | /// |
417 | /// Users may accept several languages in some order of preference and may want to use rules from |
418 | /// different culture for some particular aspect of the program behaviour, and operating systems |
419 | /// allow them to specify this (to various extent). |
420 | /// |
421 | /// The `Locale` objects represent the user configuration. They contain: |
422 | /// |
423 | /// - The primary `LanguageRange`. |
424 | /// - Optional category-specific overrides. |
425 | /// - Optional fallbacks in case data (usually translations) for the primary language are not |
426 | /// available. |
427 | /// |
428 | /// The set of categories is open-ended. The `locale` crate uses five well-known categories |
429 | /// `messages`, `numeric`, `time`, `collate` and `monetary`, but some systems define additional |
430 | /// ones (GNU Linux has additionally `paper`, `name`, `address`, `telephone` and `measurement`) and |
431 | /// these are provided in the user default `Locale` and other libraries can use them. |
432 | /// |
433 | /// `Locale` is represented by a `,`-separated sequence of tags in `LanguageRange` syntax, where |
434 | /// all except the first one may be preceded by category name and `=` sign. |
435 | /// |
436 | /// The first tag indicates the default locale, the tags prefixed by category names indicate |
437 | /// _overrides_ for those categories and the remaining tags indicate fallbacks. |
438 | /// |
439 | /// Note that a syntactically valid value of HTTP `Accept-Language` header is a valid `Locale`. Not |
440 | /// the other way around though due to the presence of category selectors. |
441 | // TODO: Interning |
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Locale {
    // The `,`-separated string form: the primary tag first, then `category=tag` overrides and
    // plain fallback tags in the order they were added.
    // TODO: Intern the string for performance reasons
    // XXX: Store pre-split to LanguageTags?
    inner: String,
}
448 | |
lazy_static! {
    // One element of the `,`-separated locale string: an optional `category=` prefix (1-20
    // letters) followed by a tag made of alphanumerics, `-` and `*`. Matching is
    // case-insensitive and whitespace in the pattern is insignificant (`ix` flags). The tag is
    // only checked loosely here.
    static ref LOCALE_ELEMENT_REGEX: Regex = Regex::new(r"(?ix) ^
        (?: (?P<category> [[:alpha:]]{1,20} ) = )?
        (?P<tag> (?: [[:alnum:]] | - | \* )+ )
    $ ").unwrap();
}
455 | |
456 | impl Locale { |
457 | /// Obtain the user default locale. |
458 | /// |
459 | /// This is the locale indicated by operating environment. |
460 | pub fn user_default() -> Locale { |
461 | USER_LOCALE.clone() |
462 | } |
463 | |
464 | /// Obtain the global default locale. |
465 | /// |
466 | /// The global default for `current()` locale. Defaults to `user_default()`. |
467 | pub fn global_default() -> Locale { |
468 | GLOBAL_LOCALE.lock().unwrap().clone() |
469 | } |
470 | |
471 | /// Change the global default locale. |
472 | /// |
473 | /// Setting this overrides the default for new threads and threads that didn't do any |
474 | /// locale-aware operation yet. |
475 | pub fn set_global_default(lb: Locale) { |
476 | *GLOBAL_LOCALE.lock().unwrap() = lb; |
477 | } |
478 | |
479 | /// Obtain the current locale of current thread. |
480 | /// |
481 | /// Defaults to `global_default()` on first use in each thread. |
482 | pub fn current() -> Locale { |
483 | CURRENT_LOCALE.with(|l| l.borrow().clone()) |
484 | } |
485 | |
486 | /// Change the current locale of current thread. |
487 | pub fn set_current(lb: Locale) { |
488 | CURRENT_LOCALE.with(|l| *l.borrow_mut() = lb); |
489 | } |
490 | |
491 | /// Construct locale from the string representation. |
492 | /// |
493 | /// `Locale` is represented by a `,`-separated sequence of tags in `LanguageRange` syntax, where |
494 | /// all except the first one may be preceded by category name and `=` sign. |
495 | /// |
496 | /// The first tag indicates the default locale, the tags prefixed by category names indicate |
497 | /// _overrides_ for those categories and the remaining tags indicate fallbacks. |
498 | pub fn new(s: &str) -> Result<Locale> { |
499 | let mut i = s.split(',' ); |
500 | let mut res = Locale::from( |
501 | try!(LanguageRange::new( |
502 | i.next().unwrap()))); // NOTE: split "" is (""), not () |
503 | for t in i { |
504 | if let Some(caps) = LOCALE_ELEMENT_REGEX.captures(t) { |
505 | let tag = try!(LanguageRange::new( |
506 | try!(caps.name("tag" ).map(|m| m.as_str()).ok_or(Error::NotWellFormed)))); |
507 | match caps.name("category" ).map(|m| m.as_str()) { |
508 | Some(cat) => res.add_category(cat.to_ascii_lowercase().as_ref(), &tag), |
509 | None => res.add(&tag), |
510 | } |
511 | } else { |
512 | return Err(Error::NotWellFormed); |
513 | } |
514 | } |
515 | return Ok(res); |
516 | } |
517 | |
518 | /// Construct invariant locale. |
519 | /// |
520 | /// Invariant locale is represented simply with empty string. |
521 | pub fn invariant() -> Locale { |
522 | Locale::from(LanguageRange::invariant()) |
523 | } |
524 | |
525 | /// Append fallback language tag. |
526 | /// |
527 | /// Adds fallback to the end of the list. |
528 | pub fn add(&mut self, tag: &LanguageRange) { |
529 | for i in self.inner.split(',' ) { |
530 | if i == tag.as_ref() { |
531 | return; // don't add duplicates |
532 | } |
533 | } |
534 | self.inner.push_str("," ); |
535 | self.inner.push_str(tag.as_ref()); |
536 | } |
537 | |
538 | /// Append category override. |
539 | /// |
540 | /// Appending new override for a category that already has one will not replace the existing |
541 | /// override. This might change in future. |
542 | pub fn add_category(&mut self, category: &str, tag: &LanguageRange) { |
543 | if self.inner.split(',' ).next().unwrap() == tag.as_ref() { |
544 | return; // don't add useless override equal to the primary tag |
545 | } |
546 | for i in self.inner.split(',' ) { |
547 | if i.starts_with(category) && |
548 | i[category.len()..].starts_with("=" ) && |
549 | &i[category.len() + 1..] == tag.as_ref() { |
550 | return; // don't add duplicates |
551 | } |
552 | } |
553 | self.inner.push_str("," ); |
554 | self.inner.push_str(category); |
555 | self.inner.push_str("=" ); |
556 | self.inner.push_str(tag.as_ref()); |
557 | } |
558 | |
559 | /// Iterate over `LanguageRange`s in this `Locale`. |
560 | /// |
561 | /// Returns tuples of optional category (as string) and corresponding `LanguageRange`. All tags |
562 | /// in the list are returned, in order of preference. |
563 | /// |
564 | /// The iterator is guaranteed to return at least one value. |
565 | pub fn tags<'a>(&'a self) -> Tags<'a> { |
566 | Tags { tags: self.inner.split("," ), } |
567 | } |
568 | |
569 | /// Iterate over `LanguageRange`s in this `Locale` applicable to given category. |
570 | /// |
571 | /// Returns `LanguageRange`s in the `Locale` that are applicable to provided category. The tags |
572 | /// are returned in order of preference, which means the category-specific ones first and then |
573 | /// the generic ones. |
574 | /// |
575 | /// The iterator is guaranteed to return at least one value. |
576 | pub fn tags_for<'a, 'c>(&'a self, category: &'c str) -> TagsFor<'a, 'c> { |
577 | let mut tags = self.inner.split("," ); |
578 | while let Some(s) = tags.clone().next() { |
579 | if s.starts_with(category) && s[category.len()..].starts_with("=" ) { |
580 | return TagsFor { |
581 | src: self.inner.as_ref(), |
582 | tags: tags, |
583 | category: Some(category), |
584 | }; |
585 | } |
586 | tags.next(); |
587 | } |
588 | return TagsFor { |
589 | src: self.inner.as_ref(), |
590 | tags: self.inner.split("," ), |
591 | category: None, |
592 | }; |
593 | } |
594 | } |
595 | |
596 | /// Locale is specified by a string tag. This is the way to access it. |
597 | // FIXME: Do we want to provide the full string representation? We would have it as single string |
598 | // then. |
599 | impl AsRef<str> for Locale { |
600 | fn as_ref(&self) -> &str { |
601 | self.inner.as_ref() |
602 | } |
603 | } |
604 | |
605 | impl<'a> From<LanguageRange<'a>> for Locale { |
606 | fn from(t: LanguageRange<'a>) -> Locale { |
607 | Locale { |
608 | inner: t.language.into_owned(), |
609 | } |
610 | } |
611 | } |
612 | |
613 | impl fmt::Display for Locale { |
614 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
615 | self.inner.fmt(f) |
616 | } |
617 | } |
618 | |
619 | /// Iterator over `LanguageRange`s for all categories in a `Locale` |
620 | /// |
621 | /// Returns tuples of optional category (as string) and corresponding `LanguageRange`. All tags |
622 | /// in the list are returned, in order of preference. |
623 | /// |
624 | /// The iterator is guaranteed to return at least one value. |
625 | pub struct Tags<'a> { |
626 | tags: std::str::Split<'a, &'static str>, |
627 | } |
628 | |
629 | impl<'a> Iterator for Tags<'a> { |
630 | type Item = (Option<&'a str>, LanguageRange<'a>); |
631 | fn next(&mut self) -> Option<Self::Item> { |
632 | if let Some(s: &str) = self.tags.next() { |
633 | if let Some(i: usize) = s.find('=' ) { |
634 | return Some(( |
635 | Some(&s[..i]), |
636 | LanguageRange { language: Cow::Borrowed(&s[i+1..]), })); |
637 | } else { |
638 | return Some(( |
639 | None, |
640 | LanguageRange { language: Cow::Borrowed(s), })); |
641 | } |
642 | } else { |
643 | return None; |
644 | } |
645 | } |
646 | } |
647 | |
648 | /// Iterator over `LanguageRange`s for specific category in a `Locale` |
649 | /// |
650 | /// Returns `LanguageRange`s in the `Locale` that are applicable to provided category. The tags |
651 | /// are returned in order of preference, which means the category-specific ones first and then |
652 | /// the generic ones. |
653 | /// |
654 | /// The iterator is guaranteed to return at least one value. |
655 | pub struct TagsFor<'a, 'c> { |
656 | src: &'a str, |
657 | tags: std::str::Split<'a, &'static str>, |
658 | category: Option<&'c str>, |
659 | } |
660 | |
661 | impl<'a, 'c> Iterator for TagsFor<'a, 'c> { |
662 | type Item = LanguageRange<'a>; |
663 | fn next(&mut self) -> Option<Self::Item> { |
664 | if let Some(cat: &str) = self.category { |
665 | while let Some(s: &str) = self.tags.next() { |
666 | if s.starts_with(cat) && s[cat.len()..].starts_with("=" ) { |
667 | return Some( |
668 | LanguageRange { language: Cow::Borrowed(&s[cat.len()+1..]) }); |
669 | } |
670 | } |
671 | self.category = None; |
672 | self.tags = self.src.split("," ); |
673 | } |
674 | while let Some(s: &str) = self.tags.next() { |
675 | if s.find('=' ).is_none() { |
676 | return Some( |
677 | LanguageRange{ language: Cow::Borrowed(s) }); |
678 | } |
679 | } |
680 | return None; |
681 | } |
682 | } |
683 | |
684 | // ------------------------------- INSTANCES ----------------------------------- |
685 | |
686 | // TODO: We only need this until either std::sync::StaticMutex or std::sync::Mutex becomes usable |
687 | // with normal `static`. |
688 | // FIX-THE-TODO: Do we? A mutex might be usable, but we still need to initialize the value inside |
689 | // on first access! |
lazy_static! {
    // TODO: Implement the constructor
    // Locale detected from the operating environment by `system_locale()`; returned by
    // `Locale::user_default()`.
    static ref USER_LOCALE: Locale = system_locale();
    // Process-wide default locale, initialised from the user default. It sits behind a mutex
    // because `Locale::set_global_default()` replaces it at run time.
    static ref GLOBAL_LOCALE: Mutex<Locale> = Mutex::new(Locale::user_default());
}

thread_local!(
    // Current locale of each thread, initialised from the global default; read and replaced
    // through `Locale::current()` and `Locale::set_current()`.
    static CURRENT_LOCALE: RefCell<Locale> = RefCell::new(Locale::global_default())
);
699 | |
700 | // NOTE: Cgi-style environment variable HTTP_ACCEPT_LANGUAGE is unlikely to be defined at any other |
701 | // time than when actually executing in CGI, so we can relatively safely always interpret it. |
702 | mod cgi; |
703 | |
704 | // NOTE: Unix-style environment variables are actually inspected everywhere, because many users |
705 | // have them, because some software only uses those even on Windows and other systems. |
706 | mod unix; |
707 | |
708 | // NOTE: Functions used exist from Vista on only |
709 | #[cfg (target_family = "windows" )] |
710 | mod win32; |
711 | |
712 | // Emscripten support |
713 | #[cfg (target_os = "emscripten" )] |
714 | mod emscripten; |
715 | |
716 | // macOS support |
717 | #[cfg (target_os = "macos" )] |
718 | mod macos; |
719 | |
// Locale detection functions in the order `system_locale()` tries them; the first one that
// returns `Some` determines the user locale. The platform-specific entries are only compiled
// in for their respective targets.
static INITIALISERS: &'static [fn() -> Option<Locale>] = &[
    cgi::system_locale,
    unix::system_locale,
    #[cfg(target_family = "windows")] win32::system_locale,
    #[cfg(target_os = "emscripten")] emscripten::system_locale,
    #[cfg(target_os = "macos")] macos::system_locale,
];
727 | |
728 | fn system_locale() -> Locale { |
729 | for f: &fn() -> Option in INITIALISERS { |
730 | if let Some(l: Locale) = f() { |
731 | return l; |
732 | } |
733 | } |
734 | return Locale::invariant(); |
735 | } |
736 | |
737 | // --------------------------------- ERRORS ------------------------------------ |
738 | |
739 | /// Errors that may be returned by `locale_config`. |
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Error {
    /// The provided definition was not well formed.
    ///
    /// Returned when the provided configuration string matches neither the rather loose
    /// definition of a language range from [RFC4647] nor the composition format used by
    /// `Locale`.
    ///
    /// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt
    NotWellFormed,
    /// Placeholder that allows adding more errors in future. **Do not match!**.
    __NonExhaustive,
}
752 | |
753 | impl ::std::fmt::Display for Error { |
754 | fn fmt(&self, out: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { |
755 | use ::std::error::Error; |
756 | out.write_str(self.description()) |
757 | } |
758 | } |
759 | |
760 | impl ::std::error::Error for Error { |
761 | fn description(&self) -> &str { |
762 | match self { |
763 | &Error::NotWellFormed => "Language tag is not well-formed." , |
764 | // this is exception: here we do want exhaustive match so we don't publish version with |
765 | // missing descriptions by mistake. |
766 | &Error::__NonExhaustive => panic!("Placeholder error must not be instantiated!" ), |
767 | } |
768 | } |
769 | } |
770 | |
/// Convenience alias for `std::result::Result` with the error type fixed to this crate's
/// `Error`.
type Result<T> = ::std::result::Result<T, Error>;
773 | |
774 | // --------------------------------- TESTS ------------------------------------- |
775 | |
#[cfg(test)]
mod test {
    use super::LanguageRange;
    use super::Locale;
    use super::is_owned;
    use std::iter::FromIterator;

    // ----- table-driven helpers -----
    //
    // Rows of the "check" tables are `(expected canonical text, input)` pairs; the "reject"
    // helpers take plain lists of inputs that must fail to parse.

    /// Parses every input with `LanguageRange::new` and compares its text with the expectation.
    fn check_ranges(table: &[(&str, &str)]) {
        for &(canonical, input) in table {
            assert_eq!(canonical, LanguageRange::new(input).unwrap().as_ref());
        }
    }

    /// Asserts that `LanguageRange::new` refuses every input.
    fn reject_ranges(inputs: &[&str]) {
        for &input in inputs {
            assert!(LanguageRange::new(input).is_err());
        }
    }

    /// Parses every input with `LanguageRange::from_unix` and compares its text with the
    /// expectation.
    fn check_unix(table: &[(&str, &str)]) {
        for &(canonical, input) in table {
            assert_eq!(canonical, LanguageRange::from_unix(input).unwrap().as_ref());
        }
    }

    /// Asserts that `LanguageRange::from_unix` refuses every input.
    fn reject_unix(inputs: &[&str]) {
        for &input in inputs {
            assert!(LanguageRange::from_unix(input).is_err());
        }
    }

    /// Parses every input with `Locale::new` and compares its text with the expectation.
    fn check_locales(table: &[(&str, &str)]) {
        for &(canonical, input) in table {
            assert_eq!(canonical, Locale::new(input).unwrap().as_ref());
        }
    }

    /// Asserts that `Locale::new` refuses every input.
    fn reject_locales(inputs: &[&str]) {
        for &input in inputs {
            assert!(Locale::new(input).is_err());
        }
    }

    /// Turns a list of tags into parsed language ranges, panicking on any invalid tag.
    fn ranges<'a>(tags: &[&'a str]) -> Vec<LanguageRange<'a>> {
        tags.iter().map(|&tag| LanguageRange::new(tag).unwrap()).collect()
    }

    // ----- LanguageRange::new -----

    #[test]
    fn simple_valid_lang_ranges() {
        check_ranges(&[
            ("en-US", "en-US"),
            ("en-US", "EN-US"),
            ("en", "en"),
            ("eng-Latn-840", "eng-Latn-840"),
            ("english", "English"),
        ]);
    }

    #[test]
    fn wildcard_lang_ranges() {
        check_ranges(&[
            ("*", "*"),
            ("zh-*", "zh-*"),
            ("zh-*-CN", "zh-*-cn"),
            ("en-*-simple-*", "En-*-Simple-*"),
            ("zh-Hans-*", "zh-hans-*"),
        ]);
    }

    #[test]
    fn complex_valid_lang_ranges() {
        check_ranges(&[
            ("de-DE-u-email-co-phonebk-x-linux", "de-DE-u-email-co-phonebk-x-linux"),
            ("vi-VN-u-fw-mon-hc-h24-ms-metric", "vi-vn-u-fw-mon-hc-h24-ms-metric"),
            (
                "sl-Cyrl-YU-rozaj-solba-1994-b-1234-a-foobar-x-b-1234-a-foobar",
                "sl-Cyrl-YU-rozaj-solba-1994-b-1234-a-Foobar-x-b-1234-a-Foobar",
            ),
        ]);
    }

    #[test]
    fn invalid_lang_range_invalid_char() {
        reject_ranges(&["not a range"]);
    }

    #[test]
    fn invalid_lang_range_long_element() {
        reject_ranges(&["de-DE-u-email-co-phonebook-x-linux"]);
    }

    #[test]
    fn invalid_lang_range_leading_number() {
        reject_ranges(&["840"]);
    }

    #[test]
    fn invalid_lang_range_bad_asterisk() {
        reject_ranges(&["e*-US", "en-*s"]);
    }

    #[test]
    fn normal_lang_range() {
        // Check that the string is not copied if the tag is canonical
        let canonical_tags = ["en-US", "en", "zh-Hant-CN", "cs-CZ-x-ds-002e", "czech"];
        for &tag in canonical_tags.iter() {
            assert!(!is_owned(&LanguageRange::new(tag).unwrap().language));
        }
    }

    // ----- Locale::new -----

    #[test]
    fn locale_simple() {
        check_locales(&[
            ("en-US", "en-US"),
            ("zh-Hant", "zh-hant"),
            ("de-*", "de-*"),
        ]);
        reject_locales(&["invalid!", "hı-İN"]);
    }

    #[test]
    fn locale_list() {
        check_locales(&[
            ("cs-CZ,en-GB,en,*", "cs-cz,en-gb,en,*"),
            ("cs-CZ,engrish", "cs-cz,engrish"),
        ]);
        reject_locales(&["cs-cz,hı-İN"]);
    }

    #[test]
    fn locale_category() {
        check_locales(&[
            ("cs-CZ,messages=en-GB", "cs-CZ,messages=en-GB"),
            ("zh-Hant,time=ja-JP,measurement=en-US", "zh-hant,TIME=ja-jp,meaSURement=en-US"),
        ]);
        reject_locales(&[
            // the first item must be plain language tag
            "messages=pl",
            // adding general alternate should not help
            "numeric=de,fr-FR",
        ]);
    }

    #[test]
    fn locale_dups() {
        check_locales(&[
            ("cs-CZ,en,de-AT", "cs-CZ,en,de-AT,en"),
            ("en-US,en", "en-us,en-US,EN,eN-Us,en"),
        ]);
    }

    #[test]
    fn locale_category_dups() {
        check_locales(&[
            ("cs-CZ", "cs-CZ,messages=cs-CZ,time=cs-cz,collate=CS-cz"),
            ("de-AT,en-AU", "de-AT,en-AU,messages=de-AT"),
            // category overrides override, so don't drop if they are only equal to alternates
            ("de-AT,en-AU,messages=en-AU", "de-AT,en-AU,messages=en-AU"),
            ("hi-IN,time=en-IN", "hi-IN,time=en-IN,TIME=EN-in,TiMe=En-iN"),
        ]);
    }

    // ----- LanguageRange::from_unix -----

    #[test]
    fn unix_tags() {
        check_unix(&[
            ("cs-CZ", "cs_CZ.UTF-8"),
            ("sr-RS-ijekavsk", "sr_RS@ijekavian"),
            ("sr-Latn-ijekavsk", "sr.UTF-8@ijekavianlatin"),
            ("en-Arab", "en@arabic"),
            ("en-Arab", "en.UTF-8@arabic"),
            ("de-DE", "DE_de.UTF-8@euro"),
            ("ssy-ER", "aa_ER@saaho"),
        ]);
        reject_unix(&["foo_BAR", "en@arabic.UTF-8"]);
        check_unix(&[
            ("", "C"),
            ("", "C.UTF-8"),
            ("", "C.ISO-8859-1"),
            ("", "POSIX"),
        ]);
    }

    // ----- Locale tag iteration -----

    #[test]
    fn category_tag_list() {
        let locale = Locale::new("cs-CZ,messages=en-GB,time=de-DE,collate=en-US").unwrap();
        // `(category, tag)` pairs in the order the iterator is expected to yield them.
        let expected: Vec<_> = [
            (None, "cs-CZ"),
            (Some("messages"), "en-GB"),
            (Some("time"), "de-DE"),
            (Some("collate"), "en-US"),
        ]
            .iter()
            .map(|&(category, tag)| (category, LanguageRange::new(tag).unwrap()))
            .collect();
        assert_eq!(Vec::from_iter(locale.tags()), expected);
    }

    #[test]
    fn tag_list_for() {
        let locale = Locale::new("cs-CZ,messages=en-GB,time=de-DE,sk-SK,pl-PL").unwrap();

        // A category with an override lists the override first, then the general tags.
        let messages = Vec::from_iter(locale.tags_for("messages"));
        assert_eq!(messages, ranges(&["en-GB", "cs-CZ", "sk-SK", "pl-PL"]));

        let time = Vec::from_iter(locale.tags_for("time"));
        assert_eq!(time, ranges(&["de-DE", "cs-CZ", "sk-SK", "pl-PL"]));

        // A category without an override only sees the general tags.
        let measurement = Vec::from_iter(locale.tags_for("measurement"));
        assert_eq!(measurement, ranges(&["cs-CZ", "sk-SK", "pl-PL"]));
    }
}
942 | |