1 | use core::str::FromStr; |
2 | |
3 | use smallvec::SmallVec; |
4 | |
5 | use crate::common::TagExt; |
6 | use crate::{script, tag_table, Language, Script, Tag}; |
7 | |
8 | type ThreeTags = SmallVec<[Tag; 3]>; |
9 | |
10 | trait SmallVecExt { |
11 | fn left(&self) -> usize; |
12 | fn is_full(&self) -> bool; |
13 | } |
14 | |
15 | impl<A: smallvec::Array> SmallVecExt for SmallVec<A> { |
16 | fn left(&self) -> usize { |
17 | self.inline_size() - self.len() |
18 | } |
19 | |
20 | fn is_full(&self) -> bool { |
21 | self.len() == self.inline_size() |
22 | } |
23 | } |
24 | |
25 | /// Converts an `Script` and an `Language` to script and language tags. |
26 | pub fn tags_from_script_and_language( |
27 | script: Option<Script>, |
28 | language: Option<&Language>, |
29 | ) -> (ThreeTags, ThreeTags) { |
30 | let mut needs_script = true; |
31 | let mut scripts = SmallVec::new(); |
32 | let mut languages = SmallVec::new(); |
33 | |
34 | let mut private_use_subtag = None; |
35 | let mut prefix = "" ; |
36 | if let Some(language) = language { |
37 | let language = language.as_str(); |
38 | if language.starts_with("x-" ) { |
39 | private_use_subtag = Some(language); |
40 | } else { |
41 | let bytes = language.as_bytes(); |
42 | let mut i = 1; |
43 | while i < bytes.len() { |
44 | if bytes.get(i - 1) == Some(&b'-' ) && bytes.get(i + 1) == Some(&b'-' ) { |
45 | if bytes[i] == b'x' { |
46 | private_use_subtag = Some(&language[i..]); |
47 | if prefix.is_empty() { |
48 | prefix = &language[..i - 1]; |
49 | } |
50 | |
51 | break; |
52 | } else { |
53 | prefix = &language[..i - 1]; |
54 | } |
55 | } |
56 | |
57 | i += 1; |
58 | } |
59 | |
60 | if prefix.is_empty() { |
61 | prefix = &language[..i]; |
62 | } |
63 | } |
64 | |
65 | needs_script = !parse_private_use_subtag( |
66 | private_use_subtag, |
67 | "-hbsc" , |
68 | u8::to_ascii_lowercase, |
69 | &mut scripts, |
70 | ); |
71 | |
72 | let needs_language = !parse_private_use_subtag( |
73 | private_use_subtag, |
74 | "-hbot" , |
75 | u8::to_ascii_uppercase, |
76 | &mut languages, |
77 | ); |
78 | |
79 | if needs_language { |
80 | if let Ok(prefix) = Language::from_str(prefix) { |
81 | tags_from_language(&prefix, &mut languages); |
82 | } |
83 | } |
84 | } |
85 | |
86 | if needs_script { |
87 | all_tags_from_script(script, &mut scripts); |
88 | } |
89 | |
90 | (scripts, languages) |
91 | } |
92 | |
93 | fn parse_private_use_subtag( |
94 | private_use_subtag: Option<&str>, |
95 | prefix: &str, |
96 | normalize: fn(&u8) -> u8, |
97 | tags: &mut ThreeTags, |
98 | ) -> bool { |
99 | let private_use_subtag = match private_use_subtag { |
100 | Some(v) => v, |
101 | None => return false, |
102 | }; |
103 | |
104 | let private_use_subtag = match private_use_subtag.find(prefix) { |
105 | Some(idx) => &private_use_subtag[idx + prefix.len()..], |
106 | None => return false, |
107 | }; |
108 | |
109 | let mut tag = SmallVec::<[u8; 4]>::new(); |
110 | for c in private_use_subtag.bytes().take(4) { |
111 | if c.is_ascii_alphanumeric() { |
112 | tag.push((normalize)(&c)); |
113 | } else { |
114 | break; |
115 | } |
116 | } |
117 | |
118 | if tag.is_empty() { |
119 | return false; |
120 | } |
121 | |
122 | let mut tag = Tag::from_bytes_lossy(tag.as_slice()); |
123 | |
124 | // Some bits magic from HarfBuzz... |
125 | if tag.as_u32() & 0xDFDFDFDF == Tag::default_script().as_u32() { |
126 | tag = Tag(tag.as_u32() ^ !0xDFDFDFDF); |
127 | } |
128 | |
129 | tags.push(tag); |
130 | |
131 | true |
132 | } |
133 | |
/// Compares two language strings by their first `-`-separated component.
///
/// Both strings are truncated to the length of the longer first component
/// (or to their own length, if shorter) and the truncated byte sequences are
/// compared lexicographically. This mirrors `strncmp(a, b, max(da, db))`.
///
/// The comparison is done on bytes, so a truncation point that falls inside
/// a multi-byte character cannot panic.
fn lang_cmp(s1: &str, s2: &str) -> core::cmp::Ordering {
    // Length of the first component of each string.
    let da = s1.find('-').unwrap_or(s1.len());
    let db = s2.find('-').unwrap_or(s2.len());

    let n = core::cmp::max(da, db);
    let ea = core::cmp::min(n, s1.len());
    let eb = core::cmp::min(n, s2.len());

    s1.as_bytes()[..ea].cmp(&s2.as_bytes()[..eb])
}
142 | |
143 | fn tags_from_language(language: &Language, tags: &mut ThreeTags) { |
144 | let language = language.as_str(); |
145 | |
146 | // Check for matches of multiple subtags. |
147 | if tag_table::tags_from_complex_language(language, tags) { |
148 | return; |
149 | } |
150 | |
151 | let mut sublang = language; |
152 | |
153 | // Find a language matching in the first component. |
154 | if let Some(i) = language.find('-' ) { |
155 | // If there is an extended language tag, use it. |
156 | if language.len() >= 6 { |
157 | let extlang = match language[i + 1..].find('-' ) { |
158 | Some(idx) => idx == 3, |
159 | None => language.len() - i - 1 == 3, |
160 | }; |
161 | |
162 | if extlang && language.as_bytes()[i + 1].is_ascii_alphabetic() { |
163 | sublang = &language[i + 1..]; |
164 | } |
165 | } |
166 | } |
167 | |
168 | use tag_table::OPEN_TYPE_LANGUAGES as LANGUAGES; |
169 | |
170 | if let Ok(mut idx) = LANGUAGES.binary_search_by(|v| lang_cmp(v.language, sublang)) { |
171 | while idx != 0 && LANGUAGES[idx].language == LANGUAGES[idx - 1].language { |
172 | idx -= 1; |
173 | } |
174 | |
175 | let len = core::cmp::min(tags.left(), LANGUAGES.len() - idx - 1); |
176 | for i in 0..len { |
177 | if LANGUAGES[idx + i].language != LANGUAGES[idx].language { |
178 | break; |
179 | } |
180 | |
181 | if LANGUAGES[idx + i].tag.is_null() { |
182 | break; |
183 | } |
184 | |
185 | if tags.is_full() { |
186 | break; |
187 | } |
188 | |
189 | tags.push(LANGUAGES[idx + i].tag); |
190 | } |
191 | |
192 | return; |
193 | } |
194 | |
195 | if language.len() == 3 { |
196 | tags.push(Tag::from_bytes_lossy(language.as_bytes()).to_uppercase()); |
197 | } |
198 | } |
199 | |
200 | fn all_tags_from_script(script: Option<Script>, tags: &mut ThreeTags) { |
201 | if let Some(script: Script) = script { |
202 | if let Some(tag: Tag) = new_tag_from_script(script) { |
203 | // Script::Myanmar maps to 'mym2', but there is no 'mym3'. |
204 | if tag != Tag::from_bytes(b"mym2" ) { |
205 | let mut tag3: [u8; 4] = tag.to_bytes(); |
206 | tag3[3] = b'3' ; |
207 | tags.push(Tag::from_bytes(&tag3)); |
208 | } |
209 | |
210 | if !tags.is_full() { |
211 | tags.push(tag); |
212 | } |
213 | } |
214 | |
215 | if !tags.is_full() { |
216 | tags.push(old_tag_from_script(script)); |
217 | } |
218 | } |
219 | } |
220 | |
221 | fn new_tag_from_script(script: Script) -> Option<Tag> { |
222 | match script { |
223 | script::BENGALI => Some(Tag::from_bytes(b"bng2" )), |
224 | script::DEVANAGARI => Some(Tag::from_bytes(b"dev2" )), |
225 | script::GUJARATI => Some(Tag::from_bytes(b"gjr2" )), |
226 | script::GURMUKHI => Some(Tag::from_bytes(b"gur2" )), |
227 | script::KANNADA => Some(Tag::from_bytes(b"knd2" )), |
228 | script::MALAYALAM => Some(Tag::from_bytes(b"mlm2" )), |
229 | script::ORIYA => Some(Tag::from_bytes(b"ory2" )), |
230 | script::TAMIL => Some(Tag::from_bytes(b"tml2" )), |
231 | script::TELUGU => Some(Tag::from_bytes(b"tel2" )), |
232 | script::MYANMAR => Some(Tag::from_bytes(b"mym2" )), |
233 | _ => None, |
234 | } |
235 | } |
236 | |
237 | fn old_tag_from_script(script: Script) -> Tag { |
238 | // This seems to be accurate as of end of 2012. |
239 | match script { |
240 | // Katakana and Hiragana both map to 'kana'. |
241 | script::HIRAGANA => Tag::from_bytes(b"kana" ), |
242 | |
243 | // Spaces at the end are preserved, unlike ISO 15924. |
244 | script::LAO => Tag::from_bytes(b"lao " ), |
245 | script::YI => Tag::from_bytes(b"yi " ), |
246 | // Unicode-5.0 additions. |
247 | script::NKO => Tag::from_bytes(b"nko " ), |
248 | // Unicode-5.1 additions. |
249 | script::VAI => Tag::from_bytes(b"vai " ), |
250 | |
251 | // Else, just change first char to lowercase and return. |
252 | _ => Tag(script.tag().as_u32() | 0x20000000), |
253 | } |
254 | } |
255 | |
256 | #[rustfmt::skip] |
257 | #[cfg (test)] |
258 | mod tests { |
259 | #![allow (non_snake_case)] |
260 | |
261 | use super::*; |
262 | use core::str::FromStr; |
263 | use alloc::vec::Vec; |
264 | |
265 | fn new_tag_to_script(tag: Tag) -> Option<Script> { |
266 | match &tag.to_bytes() { |
267 | b"bng2" => Some(script::BENGALI), |
268 | b"dev2" => Some(script::DEVANAGARI), |
269 | b"gjr2" => Some(script::GUJARATI), |
270 | b"gur2" => Some(script::GURMUKHI), |
271 | b"knd2" => Some(script::KANNADA), |
272 | b"mlm2" => Some(script::MALAYALAM), |
273 | b"ory2" => Some(script::ORIYA), |
274 | b"tml2" => Some(script::TAMIL), |
275 | b"tel2" => Some(script::TELUGU), |
276 | b"mym2" => Some(script::MYANMAR), |
277 | _ => Some(script::UNKNOWN), |
278 | } |
279 | } |
280 | |
281 | fn old_tag_to_script(tag: Tag) -> Option<Script> { |
282 | if tag == Tag::default_script() { |
283 | return None; |
284 | } |
285 | |
286 | let mut bytes = tag.to_bytes(); |
287 | |
288 | // This side of the conversion is fully algorithmic. |
289 | |
290 | // Any spaces at the end of the tag are replaced by repeating the last |
291 | // letter. Eg 'nko ' -> 'Nkoo' |
292 | if bytes[2] == b' ' { |
293 | bytes[2] = bytes[1]; |
294 | } |
295 | if bytes[3] == b' ' { |
296 | bytes[3] = bytes[2]; |
297 | } |
298 | |
299 | // Change first char to uppercase. |
300 | bytes[0] = bytes[0].to_ascii_uppercase(); |
301 | |
302 | Some(Script(Tag::from_bytes(&bytes))) |
303 | } |
304 | |
305 | fn tag_to_script(tag: Tag) -> Option<Script> { |
306 | let bytes = tag.to_bytes(); |
307 | if bytes[3] == b'2' || bytes[3] == b'3' { |
308 | let mut tag2 = bytes; |
309 | tag2[3] = b'2' ; |
310 | return new_tag_to_script(Tag::from_bytes(&tag2)); |
311 | } |
312 | |
313 | old_tag_to_script(tag) |
314 | } |
315 | |
316 | fn test_simple_tags(tag: &str, script: Script) { |
317 | let tag = Tag::from_bytes_lossy(tag.as_bytes()); |
318 | |
319 | let (scripts, _) = tags_from_script_and_language(Some(script), None); |
320 | if !scripts.is_empty() { |
321 | assert_eq!(tag, scripts[0]); |
322 | } else { |
323 | assert_eq!(tag, Tag::default_script()); |
324 | } |
325 | |
326 | assert_eq!(tag_to_script(tag), Some(script)); |
327 | } |
328 | |
/// `Tag::to_uppercase` upper-cases letters and keeps spaces untouched.
#[test]
fn tag_to_uppercase() {
    let cases: [(&[u8; 4], &[u8; 4]); 3] = [
        (b"abcd", b"ABCD"),
        (b"abc ", b"ABC "),
        (b"ABCD", b"ABCD"),
    ];

    for &(input, expected) in cases.iter() {
        assert_eq!(Tag::from_bytes(input).to_uppercase(), Tag::from_bytes(expected));
    }
}
335 | |
/// `Tag::to_lowercase` lower-cases letters and keeps spaces untouched.
#[test]
fn tag_to_lowercase() {
    let cases: [(&[u8; 4], &[u8; 4]); 3] = [
        (b"abcd", b"abcd"),
        (b"abc ", b"abc "),
        (b"ABCD", b"abcd"),
    ];

    for &(input, expected) in cases.iter() {
        assert_eq!(Tag::from_bytes(input).to_lowercase(), Tag::from_bytes(expected));
    }
}
342 | |
/// Degenerate script mappings: the default tag, the shared 'kana' tag and
/// space-padded tags.
#[test]
fn script_degenerate() {
    assert_eq!(Tag::from_bytes(b"DFLT"), Tag::default_script());

    // Hiragana and Katakana both map to 'kana'.
    test_simple_tags("kana", script::KATAKANA);

    let (scripts, _) = tags_from_script_and_language(Some(script::HIRAGANA), None);
    assert_eq!(scripts.as_slice(), &[Tag::from_bytes(b"kana")]);

    // Spaces are replaced by repeating the last letter. The tag literal must
    // be exactly four bytes, hence the two trailing spaces.
    assert_eq!(
        tag_to_script(Tag::from_bytes(b"be  ")),
        Script::from_iso15924_tag(Tag::from_bytes(b"Beee")),
    );
}
356 | |
/// Round-trips scripts that have a single, old-style script tag.
#[test]
fn script_simple() {
    // Arbitrary non-existent script.
    let made_up = Script::from_iso15924_tag(Tag::from_bytes(b"wWyZ")).unwrap();
    test_simple_tags("wwyz", made_up);

    let cases = [
        // These we don't really care about.
        ("zyyy", script::COMMON),
        ("zinh", script::INHERITED),
        ("zzzz", script::UNKNOWN),

        ("arab", script::ARABIC),
        ("copt", script::COPTIC),
        ("kana", script::KATAKANA),
        ("latn", script::LATIN),

        // These are trickier since their OT script tags have space.
        ("lao ", script::LAO),
        ("yi ", script::YI),
        // Unicode-5.0 additions.
        ("nko ", script::NKO),
        // Unicode-5.1 additions.
        ("vai ", script::VAI),

        // https://docs.microsoft.com/en-us/typography/opentype/spec/scripttags

        // Unicode-5.2 additions.
        ("mtei", script::MEETEI_MAYEK),
        // Unicode-6.0 additions.
        ("mand", script::MANDAIC),
    ];

    for &(tag, expected) in cases.iter() {
        test_simple_tags(tag, expected);
    }
}
387 | |
// Generates a test named `$name` that resolves `$script` and the language
// string `$lang`, and checks that the script tags are exactly `[$tag]`.
// The check is skipped when no script tag is produced.
macro_rules! test_script_from_language {
    ($name:ident, $tag:expr, $lang:expr, $script:expr) => {
        #[test]
        fn $name() {
            let expected = Tag::from_bytes_lossy($tag.as_bytes());
            let language = Language::from_str($lang).ok();
            let (scripts, _) = tags_from_script_and_language($script, language.as_ref());

            if scripts.is_empty() {
                return;
            }

            assert_eq!(scripts.as_slice(), &[expected]);
        }
    };
}
402 | |
// Arguments: test name, expected script tag, language string, script.
// An empty expected tag is only meaningful for cases that produce no script
// tag at all, because the generated test skips its check in that situation.

// Private-use string without a base language.
test_script_from_language!(script_from_language_01, "", "", None);
test_script_from_language!(script_from_language_02, "", "en", None);
test_script_from_language!(script_from_language_03, "copt", "en", Some(script::COPTIC));
test_script_from_language!(script_from_language_04, "", "x-hbsc", None);
test_script_from_language!(script_from_language_05, "copt", "x-hbsc", Some(script::COPTIC));
test_script_from_language!(script_from_language_06, "abc ", "x-hbscabc", None);
test_script_from_language!(script_from_language_07, "deva", "x-hbscdeva", None);
test_script_from_language!(script_from_language_08, "dev2", "x-hbscdev2", None);
test_script_from_language!(script_from_language_09, "dev3", "x-hbscdev3", None);
test_script_from_language!(script_from_language_10, "copt", "x-hbotpap0-hbsccopt", None);
// Same cases with an `en` base language in front of the private-use subtag.
test_script_from_language!(script_from_language_11, "", "en-x-hbsc", None);
test_script_from_language!(script_from_language_12, "copt", "en-x-hbsc", Some(script::COPTIC));
test_script_from_language!(script_from_language_13, "abc ", "en-x-hbscabc", None);
test_script_from_language!(script_from_language_14, "deva", "en-x-hbscdeva", None);
test_script_from_language!(script_from_language_15, "dev2", "en-x-hbscdev2", None);
test_script_from_language!(script_from_language_16, "dev3", "en-x-hbscdev3", None);
test_script_from_language!(script_from_language_17, "copt", "en-x-hbotpap0-hbsccopt", None);
420 | |
/// Indic scripts produce three script tags (`…3`, `…2`, old-style), and each
/// of them maps back to the same script.
#[test]
fn script_indic() {
    let cases = [
        (["bng3", "bng2", "beng"], script::BENGALI),
        (["dev3", "dev2", "deva"], script::DEVANAGARI),
        (["gjr3", "gjr2", "gujr"], script::GUJARATI),
        (["gur3", "gur2", "guru"], script::GURMUKHI),
        (["knd3", "knd2", "knda"], script::KANNADA),
        (["mlm3", "mlm2", "mlym"], script::MALAYALAM),
        (["ory3", "ory2", "orya"], script::ORIYA),
        (["tml3", "tml2", "taml"], script::TAMIL),
        (["tel3", "tel2", "telu"], script::TELUGU),
    ];

    for &(names, expected_script) in cases.iter() {
        let expected_tags: Vec<Tag> = names
            .iter()
            .map(|name| Tag::from_bytes_lossy(name.as_bytes()))
            .collect();

        let (scripts, _) = tags_from_script_and_language(Some(expected_script), None);
        assert_eq!(scripts.as_slice(), expected_tags.as_slice());

        for &tag in expected_tags.iter() {
            assert_eq!(tag_to_script(tag), Some(expected_script));
        }
    }
}
445 | |
// Generates a test named `$name` that lower-cases the language string
// `$lang`, resolves it without a script, and checks that the first language
// tag is `$tag`. The check is skipped when no language tag is produced.
//
// TODO: swap tag and lang
macro_rules! test_tag_from_language {
    ($name:ident, $tag:expr, $lang:expr) => {
        #[test]
        fn $name() {
            let expected = Tag::from_bytes_lossy($tag.as_bytes());
            let language = Language::from_str(&$lang.to_lowercase()).ok();
            let (_, languages) = tags_from_script_and_language(None, language.as_ref());

            if languages.is_empty() {
                return;
            }

            assert_eq!(languages[0], expected);
        }
    };
}
461 | |
// Arguments: test name, expected first language tag, language string.
// The language string is lower-cased before it is parsed, and the generated
// test only checks the tag when at least one language tag is produced.
test_tag_from_language!(tag_from_language_dflt, "dflt", "");
test_tag_from_language!(tag_from_language_ALT, "ALT", "alt");
test_tag_from_language!(tag_from_language_ARA, "ARA", "ar");
test_tag_from_language!(tag_from_language_AZE, "AZE", "az");
test_tag_from_language!(tag_from_language_az_ir, "AZE", "az-ir");
test_tag_from_language!(tag_from_language_az_az, "AZE", "az-az");
test_tag_from_language!(tag_from_language_ENG, "ENG", "en");
test_tag_from_language!(tag_from_language_en_US, "ENG", "en_US");
test_tag_from_language!(tag_from_language_CJA, "CJA", "cja"); /* Western Cham */
test_tag_from_language!(tag_from_language_CJM, "CJM", "cjm"); /* Eastern Cham */
test_tag_from_language!(tag_from_language_ENV, "EVN", "eve");
test_tag_from_language!(tag_from_language_HAL, "HAL", "cfm"); /* BCP47 and current ISO639-3 code for Halam/Falam Chin */
test_tag_from_language!(tag_from_language_flm, "HAL", "flm"); /* Retired ISO639-3 code for Halam/Falam Chin */
test_tag_from_language!(tag_from_language_hy, "HYE0", "hy");
test_tag_from_language!(tag_from_language_hyw, "HYE", "hyw");
test_tag_from_language!(tag_from_language_bgr, "QIN", "bgr"); /* Bawm Chin */
test_tag_from_language!(tag_from_language_cbl, "QIN", "cbl"); /* Bualkhaw Chin */
test_tag_from_language!(tag_from_language_cka, "QIN", "cka"); /* Khumi Awa Chin */
test_tag_from_language!(tag_from_language_cmr, "QIN", "cmr"); /* Mro-Khimi Chin */
test_tag_from_language!(tag_from_language_cnb, "QIN", "cnb"); /* Chinbon Chin */
test_tag_from_language!(tag_from_language_cnh, "QIN", "cnh"); /* Hakha Chin */
test_tag_from_language!(tag_from_language_cnk, "QIN", "cnk"); /* Khumi Chin */
test_tag_from_language!(tag_from_language_cnw, "QIN", "cnw"); /* Ngawn Chin */
test_tag_from_language!(tag_from_language_csh, "QIN", "csh"); /* Asho Chin */
test_tag_from_language!(tag_from_language_csy, "QIN", "csy"); /* Siyin Chin */
test_tag_from_language!(tag_from_language_ctd, "QIN", "ctd"); /* Tedim Chin */
test_tag_from_language!(tag_from_language_czt, "QIN", "czt"); /* Zotung Chin */
test_tag_from_language!(tag_from_language_dao, "QIN", "dao"); /* Daai Chin */
test_tag_from_language!(tag_from_language_htl, "QIN", "hlt"); /* Matu Chin */
test_tag_from_language!(tag_from_language_mrh, "QIN", "mrh"); /* Mara Chin */
test_tag_from_language!(tag_from_language_pck, "QIN", "pck"); /* Paite Chin */
test_tag_from_language!(tag_from_language_sez, "QIN", "sez"); /* Senthang Chin */
test_tag_from_language!(tag_from_language_tcp, "QIN", "tcp"); /* Tawr Chin */
test_tag_from_language!(tag_from_language_tcz, "QIN", "tcz"); /* Thado Chin */
test_tag_from_language!(tag_from_language_yos, "QIN", "yos"); /* Yos, deprecated by IANA in favor of Zou [zom] */
test_tag_from_language!(tag_from_language_zom, "QIN", "zom"); /* Zou */
test_tag_from_language!(tag_from_language_FAR, "FAR", "fa");
test_tag_from_language!(tag_from_language_fa_IR, "FAR", "fa_IR");
test_tag_from_language!(tag_from_language_man, "MNK", "man");
test_tag_from_language!(tag_from_language_SWA, "SWA", "aii"); /* Swadaya Aramaic */
test_tag_from_language!(tag_from_language_SYR, "SYR", "syr"); /* Syriac [macrolanguage] */
test_tag_from_language!(tag_from_language_amw, "SYR", "amw"); /* Western Neo-Aramaic */
test_tag_from_language!(tag_from_language_cld, "SYR", "cld"); /* Chaldean Neo-Aramaic */
test_tag_from_language!(tag_from_language_syc, "SYR", "syc"); /* Classical Syriac */
test_tag_from_language!(tag_from_language_TUA, "TUA", "tru"); /* Turoyo Aramaic */
test_tag_from_language!(tag_from_language_zh, "ZHS", "zh"); /* Chinese */
test_tag_from_language!(tag_from_language_zh_cn, "ZHS", "zh-cn"); /* Chinese (China) */
test_tag_from_language!(tag_from_language_zh_sg, "ZHS", "zh-sg"); /* Chinese (Singapore) */
test_tag_from_language!(tag_from_language_zh_mo, "ZHTM", "zh-mo"); /* Chinese (Macao) */
test_tag_from_language!(tag_from_language_zh_hant_mo, "ZHTM", "zh-hant-mo"); /* Chinese (Macao) */
test_tag_from_language!(tag_from_language_zh_hans_mo, "ZHS", "zh-hans-mo"); /* Chinese (Simplified, Macao) */
test_tag_from_language!(tag_from_language_ZHH, "ZHH", "zh-HK"); /* Chinese (Hong Kong) */
test_tag_from_language!(tag_from_language_zh_HanT_hK, "ZHH", "zH-HanT-hK"); /* Chinese (Hong Kong) */
test_tag_from_language!(tag_from_language_zh_HanS_hK, "ZHS", "zH-HanS-hK"); /* Chinese (Simplified, Hong Kong) */
test_tag_from_language!(tag_from_language_zh_tw, "ZHT", "zh-tw"); /* Chinese (Taiwan) */
test_tag_from_language!(tag_from_language_ZHS, "ZHS", "zh-Hans"); /* Chinese (Simplified) */
test_tag_from_language!(tag_from_language_ZHT, "ZHT", "zh-Hant"); /* Chinese (Traditional) */
test_tag_from_language!(tag_from_language_zh_xx, "ZHS", "zh-xx"); /* Chinese (Other) */
test_tag_from_language!(tag_from_language_zh_Hans_TW, "ZHS", "zh-Hans-TW");
test_tag_from_language!(tag_from_language_yue, "ZHH", "yue");
test_tag_from_language!(tag_from_language_yue_Hant, "ZHH", "yue-Hant");
test_tag_from_language!(tag_from_language_yue_Hans, "ZHS", "yue-Hans");
test_tag_from_language!(tag_from_language_ABC, "ABC", "abc");
test_tag_from_language!(tag_from_language_ABCD, "ABCD", "x-hbotabcd");
test_tag_from_language!(tag_from_language_asdf_asdf_wer_x_hbotabc_zxc, "ABC", "asdf-asdf-wer-x-hbotabc-zxc");
test_tag_from_language!(tag_from_language_asdf_asdf_wer_x_hbotabc, "ABC", "asdf-asdf-wer-x-hbotabc");
test_tag_from_language!(tag_from_language_asdf_asdf_wer_x_hbotabcd, "ABCD", "asdf-asdf-wer-x-hbotabcd");
test_tag_from_language!(tag_from_language_asdf_asdf_wer_x_hbot_zxc, "dflt", "asdf-asdf-wer-x-hbot-zxc");
test_tag_from_language!(tag_from_language_xy, "dflt", "xy");
test_tag_from_language!(tag_from_language_xyz, "XYZ", "xyz"); /* Unknown ISO 639-3 */
test_tag_from_language!(tag_from_language_xyz_qw, "XYZ", "xyz-qw"); /* Unknown ISO 639-3 */

/*
 * Invalid input. The precise answer does not matter, as long as it
 * does not crash or get into an infinite loop.
 */
test_tag_from_language!(tag_from_language__fonipa, "IPPH", "-fonipa");

/*
 * Tags that contain "-fonipa" as a substring but which do not contain
 * the subtag "fonipa".
 */
test_tag_from_language!(tag_from_language_en_fonipax, "ENG", "en-fonipax");
test_tag_from_language!(tag_from_language_en_x_fonipa, "ENG", "en-x-fonipa");
test_tag_from_language!(tag_from_language_en_a_fonipa, "ENG", "en-a-fonipa");
test_tag_from_language!(tag_from_language_en_a_qwe_b_fonipa, "ENG", "en-a-qwe-b-fonipa");

/* International Phonetic Alphabet */
test_tag_from_language!(tag_from_language_en_fonipa, "IPPH", "en-fonipa");
test_tag_from_language!(tag_from_language_en_fonipax_fonipa, "IPPH", "en-fonipax-fonipa");
test_tag_from_language!(tag_from_language_rm_ch_fonipa_sursilv_x_foobar, "IPPH", "rm-CH-fonipa-sursilv-x-foobar");
test_tag_from_language!(tag_from_language_IPPH, "IPPH", "und-fonipa");
test_tag_from_language!(tag_from_language_zh_fonipa, "IPPH", "zh-fonipa");

/* North American Phonetic Alphabet (Americanist Phonetic Notation) */
test_tag_from_language!(tag_from_language_en_fonnapa, "APPH", "en-fonnapa");
test_tag_from_language!(tag_from_language_chr_fonnapa, "APPH", "chr-fonnapa");
test_tag_from_language!(tag_from_language_APPH, "APPH", "und-fonnapa");

/* Khutsuri Georgian */
test_tag_from_language!(tag_from_language_ka_geok, "KGE", "ka-Geok");
test_tag_from_language!(tag_from_language_KGE, "KGE", "und-Geok");

/* Irish Traditional */
test_tag_from_language!(tag_from_language_IRT, "IRT", "ga-Latg");

/* Moldavian */
test_tag_from_language!(tag_from_language_MOL, "MOL", "ro-MD");

/* Polytonic Greek */
test_tag_from_language!(tag_from_language_PGR, "PGR", "el-polyton");
test_tag_from_language!(tag_from_language_el_CY_polyton, "PGR", "el-CY-polyton");

/* Estrangela Syriac */
test_tag_from_language!(tag_from_language_aii_Syre, "SYRE", "aii-Syre");
test_tag_from_language!(tag_from_language_de_Syre, "SYRE", "de-Syre");
test_tag_from_language!(tag_from_language_syr_Syre, "SYRE", "syr-Syre");
test_tag_from_language!(tag_from_language_und_Syre, "SYRE", "und-Syre");

/* Western Syriac */
test_tag_from_language!(tag_from_language_aii_Syrj, "SYRJ", "aii-Syrj");
test_tag_from_language!(tag_from_language_de_Syrj, "SYRJ", "de-Syrj");
test_tag_from_language!(tag_from_language_syr_Syrj, "SYRJ", "syr-Syrj");
test_tag_from_language!(tag_from_language_SYRJ, "SYRJ", "und-Syrj");

/* Eastern Syriac */
test_tag_from_language!(tag_from_language_aii_Syrn, "SYRN", "aii-Syrn");
test_tag_from_language!(tag_from_language_de_Syrn, "SYRN", "de-Syrn");
test_tag_from_language!(tag_from_language_syr_Syrn, "SYRN", "syr-Syrn");
test_tag_from_language!(tag_from_language_SYRN, "SYRN", "und-Syrn");

/* Test that x-hbot overrides the base language */
test_tag_from_language!(tag_from_language_fa_x_hbotabc_zxc, "ABC", "fa-x-hbotabc-zxc");
test_tag_from_language!(tag_from_language_fa_ir_x_hbotabc_zxc, "ABC", "fa-ir-x-hbotabc-zxc");
test_tag_from_language!(tag_from_language_zh_x_hbotabc_zxc, "ABC", "zh-x-hbotabc-zxc");
test_tag_from_language!(tag_from_language_zh_cn_x_hbotabc_zxc, "ABC", "zh-cn-x-hbotabc-zxc");
test_tag_from_language!(tag_from_language_zh_xy_x_hbotabc_zxc, "ABC", "zh-xy-x-hbotabc-zxc");
test_tag_from_language!(tag_from_language_xyz_xy_x_hbotabc_zxc, "ABC", "xyz-xy-x-hbotabc-zxc");

/* Unnormalized BCP 47 tags */
test_tag_from_language!(tag_from_language_ar_aao, "ARA", "ar-aao");
test_tag_from_language!(tag_from_language_art_lojban, "JBO", "art-lojban");
test_tag_from_language!(tag_from_language_kok_gom, "KOK", "kok-gom");
test_tag_from_language!(tag_from_language_i_lux, "LTZ", "i-lux");
test_tag_from_language!(tag_from_language_drh, "MNG", "drh");
test_tag_from_language!(tag_from_language_ar_ary1, "MOR", "ar-ary");
test_tag_from_language!(tag_from_language_ar_ary_DZ, "MOR", "ar-ary-DZ");
test_tag_from_language!(tag_from_language_no_bok, "NOR", "no-bok");
test_tag_from_language!(tag_from_language_no_nyn, "NYN", "no-nyn");
test_tag_from_language!(tag_from_language_i_hak, "ZHS", "i-hak");
test_tag_from_language!(tag_from_language_zh_guoyu, "ZHS", "zh-guoyu");
test_tag_from_language!(tag_from_language_zh_min, "ZHS", "zh-min");
test_tag_from_language!(tag_from_language_zh_min_nan, "ZHS", "zh-min-nan");
test_tag_from_language!(tag_from_language_zh_xiang, "ZHS", "zh-xiang");

/* BCP 47 tags that look similar to unrelated language system tags */
test_tag_from_language!(tag_from_language_als, "SQI", "als");
test_tag_from_language!(tag_from_language_far, "dflt", "far");

/* A UN M.49 region code, not an extended language subtag */
test_tag_from_language!(tag_from_language_ar_001, "ARA", "ar-001");

/* An invalid tag */
test_tag_from_language!(tag_from_language_invalid, "TRK", "tr@foo=bar");
626 | |
// Generates a test named `$name` that resolves `$script` and the language
// string `$lang`, and compares the complete lists of script and language
// tags against `$scripts` and `$langs` (slices of byte strings).
macro_rules! test_tags {
    ($name:ident, $script:expr, $lang:expr, $scripts:expr, $langs:expr) => {
        #[test]
        fn $name() {
            let expected_scripts = $scripts
                .iter()
                .map(|v| Tag::from_bytes_lossy(*v))
                .collect::<Vec<Tag>>();
            let expected_languages = $langs
                .iter()
                .map(|v| Tag::from_bytes_lossy(*v))
                .collect::<Vec<Tag>>();

            let language = Language::from_str($lang).ok();
            let (scripts, languages) = tags_from_script_and_language($script, language.as_ref());

            assert_eq!(expected_scripts, scripts.as_slice());
            assert_eq!(expected_languages, languages.as_slice());
        }
    };
}
643 | |
// Arguments: test name, script, language string, expected script tags,
// expected language tags. Both tag lists are compared in full, so an empty
// list means that no tag may be produced.
test_tags!(tag_full_en, None, "en", &[], &[b"ENG"]);
test_tags!(tag_full_en_x_hbscdflt, None, "en-x-hbscdflt", &[b"DFLT"], &[b"ENG"]);
test_tags!(tag_full_en_latin, Some(script::LATIN), "en", &[b"latn"], &[b"ENG"]);
test_tags!(tag_full_und_fonnapa, None, "und-fonnapa", &[], &[b"APPH"]);
test_tags!(tag_full_en_fonnapa, None, "en-fonnapa", &[], &[b"APPH"]);
// Both private-use subtags are honoured regardless of their order.
test_tags!(tag_full_x_hbot1234_hbsc5678, None, "x-hbot1234-hbsc5678", &[b"5678"], &[b"1234"]);
test_tags!(tag_full_x_hbsc5678_hbot1234, None, "x-hbsc5678-hbot1234", &[b"5678"], &[b"1234"]);
test_tags!(tag_full_ml, Some(script::MALAYALAM), "ml", &[b"mlm3", b"mlm2", b"mlym"], &[b"MAL", b"MLR"]);
test_tags!(tag_full_xyz, None, "xyz", &[], &[b"XYZ"]);
test_tags!(tag_full_xy, None, "xy", &[], &[]);
654 | } |
655 | |