| 1 | use core::str::FromStr; |
| 2 | |
| 3 | use smallvec::SmallVec; |
| 4 | |
| 5 | use super::common::TagExt; |
| 6 | use super::{hb_tag_t, script, tag_table, Language, Script}; |
| 7 | |
| 8 | type ThreeTags = SmallVec<[hb_tag_t; 3]>; |
| 9 | |
/// Capacity helpers for a `SmallVec`, measured against its inline storage.
trait SmallVecExt {
    /// Returns `true` when the number of stored elements equals the inline size.
    fn is_full(&self) -> bool;

    /// Returns how many more elements fit before the inline size is reached.
    fn left(&self) -> usize;
}
| 14 | |
| 15 | impl<A: smallvec::Array> SmallVecExt for SmallVec<A> { |
| 16 | fn left(&self) -> usize { |
| 17 | self.inline_size() - self.len() |
| 18 | } |
| 19 | |
| 20 | fn is_full(&self) -> bool { |
| 21 | self.len() == self.inline_size() |
| 22 | } |
| 23 | } |
| 24 | |
| 25 | /// Converts an `Script` and an `Language` to script and language tags. |
| 26 | pub fn tags_from_script_and_language( |
| 27 | script: Option<Script>, |
| 28 | language: Option<&Language>, |
| 29 | ) -> (ThreeTags, ThreeTags) { |
| 30 | let mut needs_script = true; |
| 31 | let mut scripts = SmallVec::new(); |
| 32 | let mut languages = SmallVec::new(); |
| 33 | |
| 34 | let mut private_use_subtag = None; |
| 35 | let mut prefix = "" ; |
| 36 | if let Some(language) = language { |
| 37 | let language = language.as_str(); |
| 38 | if language.starts_with("x-" ) { |
| 39 | private_use_subtag = Some(language); |
| 40 | } else { |
| 41 | let bytes = language.as_bytes(); |
| 42 | let mut i = 1; |
| 43 | while i < bytes.len() { |
| 44 | if bytes.get(i - 1) == Some(&b'-' ) && bytes.get(i + 1) == Some(&b'-' ) { |
| 45 | if bytes[i] == b'x' { |
| 46 | private_use_subtag = Some(&language[i..]); |
| 47 | if prefix.is_empty() { |
| 48 | prefix = &language[..i - 1]; |
| 49 | } |
| 50 | |
| 51 | break; |
| 52 | } else { |
| 53 | prefix = &language[..i - 1]; |
| 54 | } |
| 55 | } |
| 56 | |
| 57 | i += 1; |
| 58 | } |
| 59 | |
| 60 | if prefix.is_empty() { |
| 61 | prefix = &language[..i]; |
| 62 | } |
| 63 | } |
| 64 | |
| 65 | needs_script = !parse_private_use_subtag( |
| 66 | private_use_subtag, |
| 67 | "-hbsc" , |
| 68 | u8::to_ascii_lowercase, |
| 69 | &mut scripts, |
| 70 | ); |
| 71 | |
| 72 | let needs_language = !parse_private_use_subtag( |
| 73 | private_use_subtag, |
| 74 | "-hbot" , |
| 75 | u8::to_ascii_uppercase, |
| 76 | &mut languages, |
| 77 | ); |
| 78 | |
| 79 | if needs_language { |
| 80 | if let Ok(prefix) = Language::from_str(prefix) { |
| 81 | tags_from_language(&prefix, &mut languages); |
| 82 | } |
| 83 | } |
| 84 | } |
| 85 | |
| 86 | if needs_script { |
| 87 | all_tags_from_script(script, &mut scripts); |
| 88 | } |
| 89 | |
| 90 | (scripts, languages) |
| 91 | } |
| 92 | |
| 93 | fn parse_private_use_subtag( |
| 94 | private_use_subtag: Option<&str>, |
| 95 | prefix: &str, |
| 96 | normalize: fn(&u8) -> u8, |
| 97 | tags: &mut ThreeTags, |
| 98 | ) -> bool { |
| 99 | let private_use_subtag = match private_use_subtag { |
| 100 | Some(v) => v, |
| 101 | None => return false, |
| 102 | }; |
| 103 | |
| 104 | let private_use_subtag = match private_use_subtag.find(prefix) { |
| 105 | Some(idx) => &private_use_subtag[idx + prefix.len()..], |
| 106 | None => return false, |
| 107 | }; |
| 108 | |
| 109 | let mut tag = SmallVec::<[u8; 4]>::new(); |
| 110 | for c in private_use_subtag.bytes().take(4) { |
| 111 | if c.is_ascii_alphanumeric() { |
| 112 | tag.push((normalize)(&c)); |
| 113 | } else { |
| 114 | break; |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | if tag.is_empty() { |
| 119 | return false; |
| 120 | } |
| 121 | |
| 122 | let mut tag = hb_tag_t::from_bytes_lossy(tag.as_slice()); |
| 123 | |
| 124 | // Some bits magic from HarfBuzz... |
| 125 | if tag.as_u32() & 0xDFDFDFDF == hb_tag_t::default_script().as_u32() { |
| 126 | tag = hb_tag_t(tag.as_u32() ^ !0xDFDFDFDF); |
| 127 | } |
| 128 | |
| 129 | tags.push(tag); |
| 130 | |
| 131 | true |
| 132 | } |
| 133 | |
/// Orders two language strings by their leading subtag.
///
/// Let `n` be the longer of the two first-subtag lengths (the text before the
/// first `-`, or the whole string when there is no `-`). Each string is cut to
/// at most `n` bytes and the two prefixes are compared bytewise, so `"en"`
/// and `"en-us"` compare as equal.
///
/// The comparison works on bytes rather than on `str` slices: the ordering is
/// the same, but a cut that lands inside a multi-byte character cannot panic.
fn lang_cmp(s1: &str, s2: &str) -> core::cmp::Ordering {
    let da = s1.find('-').unwrap_or(s1.len());
    let db = s2.find('-').unwrap_or(s2.len());
    let n = da.max(db);

    let (b1, b2) = (s1.as_bytes(), s2.as_bytes());
    let ea = n.min(b1.len());
    let eb = n.min(b2.len());

    b1[..ea].cmp(&b2[..eb])
}
| 142 | |
| 143 | fn tags_from_language(language: &Language, tags: &mut ThreeTags) { |
| 144 | let language = language.as_str(); |
| 145 | |
| 146 | // Check for matches of multiple subtags. |
| 147 | if tag_table::tags_from_complex_language(language, tags) { |
| 148 | return; |
| 149 | } |
| 150 | |
| 151 | let mut sublang = language; |
| 152 | |
| 153 | // Find a language matching in the first component. |
| 154 | if let Some(i) = language.find('-' ) { |
| 155 | // If there is an extended language tag, use it. |
| 156 | if language.len() >= 6 { |
| 157 | let extlang = match language[i + 1..].find('-' ) { |
| 158 | Some(idx) => idx == 3, |
| 159 | None => language.len() - i - 1 == 3, |
| 160 | }; |
| 161 | |
| 162 | if extlang && language.as_bytes()[i + 1].is_ascii_alphabetic() { |
| 163 | sublang = &language[i + 1..]; |
| 164 | } |
| 165 | } |
| 166 | } |
| 167 | |
| 168 | use tag_table::OPEN_TYPE_LANGUAGES as LANGUAGES; |
| 169 | |
| 170 | if let Ok(mut idx) = LANGUAGES.binary_search_by(|v| lang_cmp(v.language, sublang)) { |
| 171 | while idx != 0 && LANGUAGES[idx].language == LANGUAGES[idx - 1].language { |
| 172 | idx -= 1; |
| 173 | } |
| 174 | |
| 175 | let len = core::cmp::min(tags.left(), LANGUAGES.len() - idx - 1); |
| 176 | for i in 0..len { |
| 177 | if LANGUAGES[idx + i].language != LANGUAGES[idx].language { |
| 178 | break; |
| 179 | } |
| 180 | |
| 181 | if LANGUAGES[idx + i].tag.is_null() { |
| 182 | break; |
| 183 | } |
| 184 | |
| 185 | if tags.is_full() { |
| 186 | break; |
| 187 | } |
| 188 | |
| 189 | tags.push(LANGUAGES[idx + i].tag); |
| 190 | } |
| 191 | |
| 192 | return; |
| 193 | } |
| 194 | |
| 195 | if language.len() == 3 { |
| 196 | tags.push(hb_tag_t::from_bytes_lossy(language.as_bytes()).to_uppercase()); |
| 197 | } |
| 198 | } |
| 199 | |
| 200 | fn all_tags_from_script(script: Option<Script>, tags: &mut ThreeTags) { |
| 201 | if let Some(script: Script) = script { |
| 202 | if let Some(tag: Tag) = new_tag_from_script(script) { |
| 203 | // Script::Myanmar maps to 'mym2', but there is no 'mym3'. |
| 204 | if tag != hb_tag_t::from_bytes(b"mym2" ) { |
| 205 | let mut tag3: [u8; 4] = tag.to_bytes(); |
| 206 | tag3[3] = b'3' ; |
| 207 | tags.push(hb_tag_t::from_bytes(&tag3)); |
| 208 | } |
| 209 | |
| 210 | if !tags.is_full() { |
| 211 | tags.push(tag); |
| 212 | } |
| 213 | } |
| 214 | |
| 215 | if !tags.is_full() { |
| 216 | tags.push(old_tag_from_script(script)); |
| 217 | } |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | fn new_tag_from_script(script: Script) -> Option<hb_tag_t> { |
| 222 | match script { |
| 223 | script::BENGALI => Some(hb_tag_t::from_bytes(b"bng2" )), |
| 224 | script::DEVANAGARI => Some(hb_tag_t::from_bytes(b"dev2" )), |
| 225 | script::GUJARATI => Some(hb_tag_t::from_bytes(b"gjr2" )), |
| 226 | script::GURMUKHI => Some(hb_tag_t::from_bytes(b"gur2" )), |
| 227 | script::KANNADA => Some(hb_tag_t::from_bytes(b"knd2" )), |
| 228 | script::MALAYALAM => Some(hb_tag_t::from_bytes(b"mlm2" )), |
| 229 | script::ORIYA => Some(hb_tag_t::from_bytes(b"ory2" )), |
| 230 | script::TAMIL => Some(hb_tag_t::from_bytes(b"tml2" )), |
| 231 | script::TELUGU => Some(hb_tag_t::from_bytes(b"tel2" )), |
| 232 | script::MYANMAR => Some(hb_tag_t::from_bytes(b"mym2" )), |
| 233 | _ => None, |
| 234 | } |
| 235 | } |
| 236 | |
| 237 | fn old_tag_from_script(script: Script) -> hb_tag_t { |
| 238 | // This seems to be accurate as of end of 2012. |
| 239 | match script { |
| 240 | // Katakana and Hiragana both map to 'kana'. |
| 241 | script::HIRAGANA => hb_tag_t::from_bytes(b"kana" ), |
| 242 | |
| 243 | // Spaces at the end are preserved, unlike ISO 15924. |
| 244 | script::LAO => hb_tag_t::from_bytes(b"lao " ), |
| 245 | script::YI => hb_tag_t::from_bytes(b"yi " ), |
| 246 | // Unicode-5.0 additions. |
| 247 | script::NKO => hb_tag_t::from_bytes(b"nko " ), |
| 248 | // Unicode-5.1 additions. |
| 249 | script::VAI => hb_tag_t::from_bytes(b"vai " ), |
| 250 | |
| 251 | // Else, just change first char to lowercase and return. |
| 252 | _ => hb_tag_t(script.tag().as_u32() | 0x20000000), |
| 253 | } |
| 254 | } |
| 255 | |
| 256 | #[rustfmt::skip] |
| 257 | #[cfg (test)] |
| 258 | mod tests { |
| 259 | #![allow (non_snake_case)] |
| 260 | |
| 261 | use super::*; |
| 262 | use core::str::FromStr; |
| 263 | use alloc::vec::Vec; |
| 264 | |
| 265 | fn new_tag_to_script(tag: hb_tag_t) -> Option<Script> { |
| 266 | match &tag.to_bytes() { |
| 267 | b"bng2" => Some(script::BENGALI), |
| 268 | b"dev2" => Some(script::DEVANAGARI), |
| 269 | b"gjr2" => Some(script::GUJARATI), |
| 270 | b"gur2" => Some(script::GURMUKHI), |
| 271 | b"knd2" => Some(script::KANNADA), |
| 272 | b"mlm2" => Some(script::MALAYALAM), |
| 273 | b"ory2" => Some(script::ORIYA), |
| 274 | b"tml2" => Some(script::TAMIL), |
| 275 | b"tel2" => Some(script::TELUGU), |
| 276 | b"mym2" => Some(script::MYANMAR), |
| 277 | _ => Some(script::UNKNOWN), |
| 278 | } |
| 279 | } |
| 280 | |
| 281 | fn old_tag_to_script(tag: hb_tag_t) -> Option<Script> { |
| 282 | if tag == hb_tag_t::default_script() { |
| 283 | return None; |
| 284 | } |
| 285 | |
| 286 | let mut bytes = tag.to_bytes(); |
| 287 | |
| 288 | // This side of the conversion is fully algorithmic. |
| 289 | |
| 290 | // Any spaces at the end of the tag are replaced by repeating the last |
| 291 | // letter. Eg 'nko ' -> 'Nkoo' |
| 292 | if bytes[2] == b' ' { |
| 293 | bytes[2] = bytes[1]; |
| 294 | } |
| 295 | if bytes[3] == b' ' { |
| 296 | bytes[3] = bytes[2]; |
| 297 | } |
| 298 | |
| 299 | // Change first char to uppercase. |
| 300 | bytes[0] = bytes[0].to_ascii_uppercase(); |
| 301 | |
| 302 | Some(Script(hb_tag_t::from_bytes(&bytes))) |
| 303 | } |
| 304 | |
| 305 | fn tag_to_script(tag: hb_tag_t) -> Option<Script> { |
| 306 | let bytes = tag.to_bytes(); |
| 307 | if bytes[3] == b'2' || bytes[3] == b'3' { |
| 308 | let mut tag2 = bytes; |
| 309 | tag2[3] = b'2' ; |
| 310 | return new_tag_to_script(hb_tag_t::from_bytes(&tag2)); |
| 311 | } |
| 312 | |
| 313 | old_tag_to_script(tag) |
| 314 | } |
| 315 | |
| 316 | fn test_simple_tags(tag: &str, script: Script) { |
| 317 | let tag = hb_tag_t::from_bytes_lossy(tag.as_bytes()); |
| 318 | |
| 319 | let (scripts, _) = tags_from_script_and_language(Some(script), None); |
| 320 | if !scripts.is_empty() { |
| 321 | assert_eq!(tag, scripts[0]); |
| 322 | } else { |
| 323 | assert_eq!(tag, hb_tag_t::default_script()); |
| 324 | } |
| 325 | |
| 326 | assert_eq!(tag_to_script(tag), Some(script)); |
| 327 | } |
| 328 | |
#[test]
fn tag_to_uppercase() {
    // (input, expected) pairs; spaces are left untouched.
    let cases: [(&[u8; 4], &[u8; 4]); 3] = [
        (b"abcd", b"ABCD"),
        (b"abc ", b"ABC "),
        (b"ABCD", b"ABCD"),
    ];

    for &(input, expected) in cases.iter() {
        assert_eq!(hb_tag_t::from_bytes(input).to_uppercase(), hb_tag_t::from_bytes(expected));
    }
}
| 335 | |
#[test]
fn tag_to_lowercase() {
    // (input, expected) pairs; spaces are left untouched.
    let cases: [(&[u8; 4], &[u8; 4]); 3] = [
        (b"abcd", b"abcd"),
        (b"abc ", b"abc "),
        (b"ABCD", b"abcd"),
    ];

    for &(input, expected) in cases.iter() {
        assert_eq!(hb_tag_t::from_bytes(input).to_lowercase(), hb_tag_t::from_bytes(expected));
    }
}
| 342 | |
#[test]
fn script_degenerate() {
    assert_eq!(hb_tag_t::from_bytes(b"DFLT"), hb_tag_t::default_script());

    // Hiragana and Katakana both map to 'kana'.
    test_simple_tags("kana", script::KATAKANA);

    let (scripts, _) = tags_from_script_and_language(Some(script::HIRAGANA), None);
    assert_eq!(scripts.as_slice(), &[hb_tag_t::from_bytes(b"kana")]);

    // Spaces are replaced by repeating the last letter. The tag literal must
    // be exactly four bytes long, hence the two trailing spaces.
    assert_eq!(
        tag_to_script(hb_tag_t::from_bytes(b"be  ")),
        Script::from_iso15924_tag(hb_tag_t::from_bytes(b"Beee")),
    );
}
| 356 | |
#[test]
fn script_simple() {
    // Arbitrary non-existent script.
    let made_up = Script::from_iso15924_tag(hb_tag_t::from_bytes(b"wWyZ")).unwrap();
    test_simple_tags("wwyz", made_up);

    let cases = [
        // These we don't really care about.
        ("zyyy", script::COMMON),
        ("zinh", script::INHERITED),
        ("zzzz", script::UNKNOWN),

        ("arab", script::ARABIC),
        ("copt", script::COPTIC),
        ("kana", script::KATAKANA),
        ("latn", script::LATIN),

        // These are trickier since their OT script tags have space.
        ("lao ", script::LAO),
        ("yi ", script::YI),
        // Unicode-5.0 additions.
        ("nko ", script::NKO),
        // Unicode-5.1 additions.
        ("vai ", script::VAI),

        // https://docs.microsoft.com/en-us/typography/opentype/spec/scripttags

        // Unicode-5.2 additions.
        ("mtei", script::MEETEI_MAYEK),
        // Unicode-6.0 additions.
        ("mand", script::MANDAIC),
    ];

    for &(tag, script) in cases.iter() {
        test_simple_tags(tag, script);
    }
}
| 387 | |
// Generates a test named `$name`: converts `$script` plus the language parsed
// from `$lang` and, when any script tags come back, expects exactly `[$tag]`.
// An empty result is not checked.
macro_rules! test_script_from_language {
    ($name:ident, $tag:expr, $lang:expr, $script:expr) => {
        #[test]
        fn $name() {
            let expected = hb_tag_t::from_bytes_lossy($tag.as_bytes());
            let language = Language::from_str($lang).ok();
            let (scripts, _) = tags_from_script_and_language($script, language.as_ref());

            if !scripts.is_empty() {
                assert_eq!(scripts.as_slice(), &[expected]);
            }
        }
    };
}
| 402 | |
| 403 | test_script_from_language!(script_from_language_01, "" , "" , None); |
| 404 | test_script_from_language!(script_from_language_02, "" , "en" , None); |
| 405 | test_script_from_language!(script_from_language_03, "copt" , "en" , Some(script::COPTIC)); |
| 406 | test_script_from_language!(script_from_language_04, "" , "x-hbsc" , None); |
| 407 | test_script_from_language!(script_from_language_05, "copt" , "x-hbsc" , Some(script::COPTIC)); |
| 408 | test_script_from_language!(script_from_language_06, "abc " , "x-hbscabc" , None); |
| 409 | test_script_from_language!(script_from_language_07, "deva" , "x-hbscdeva" , None); |
| 410 | test_script_from_language!(script_from_language_08, "dev2" , "x-hbscdev2" , None); |
| 411 | test_script_from_language!(script_from_language_09, "dev3" , "x-hbscdev3" , None); |
| 412 | test_script_from_language!(script_from_language_10, "copt" , "x-hbotpap0-hbsccopt" , None); |
| 413 | test_script_from_language!(script_from_language_11, "" , "en-x-hbsc" , None); |
| 414 | test_script_from_language!(script_from_language_12, "copt" , "en-x-hbsc" , Some(script::COPTIC)); |
| 415 | test_script_from_language!(script_from_language_13, "abc " , "en-x-hbscabc" , None); |
| 416 | test_script_from_language!(script_from_language_14, "deva" , "en-x-hbscdeva" , None); |
| 417 | test_script_from_language!(script_from_language_15, "dev2" , "en-x-hbscdev2" , None); |
| 418 | test_script_from_language!(script_from_language_16, "dev3" , "en-x-hbscdev3" , None); |
| 419 | test_script_from_language!(script_from_language_17, "copt" , "en-x-hbotpap0-hbsccopt" , None); |
| 420 | |
#[test]
fn script_indic() {
    // (version-3 tag, version-2 tag, old tag, script)
    let cases = [
        ("bng3", "bng2", "beng", script::BENGALI),
        ("dev3", "dev2", "deva", script::DEVANAGARI),
        ("gjr3", "gjr2", "gujr", script::GUJARATI),
        ("gur3", "gur2", "guru", script::GURMUKHI),
        ("knd3", "knd2", "knda", script::KANNADA),
        ("mlm3", "mlm2", "mlym", script::MALAYALAM),
        ("ory3", "ory2", "orya", script::ORIYA),
        ("tml3", "tml2", "taml", script::TAMIL),
        ("tel3", "tel2", "telu", script::TELUGU),
    ];

    for &(v3, v2, old, script) in cases.iter() {
        let expected = [
            hb_tag_t::from_bytes_lossy(v3.as_bytes()),
            hb_tag_t::from_bytes_lossy(v2.as_bytes()),
            hb_tag_t::from_bytes_lossy(old.as_bytes()),
        ];

        let (scripts, _) = tags_from_script_and_language(Some(script), None);
        assert_eq!(scripts.as_slice(), &expected);

        // Every tag must map back to the script it came from.
        for &tag in expected.iter() {
            assert_eq!(tag_to_script(tag), Some(script));
        }
    }
}
| 445 | |
// TODO: swap tag and lang
//
// Generates a test named `$name`: lower-cases `$lang`, parses it as a language
// and, when any language tags come back, expects the first one to be `$tag`.
// An empty result is not checked.
macro_rules! test_tag_from_language {
    ($name:ident, $tag:expr, $lang:expr) => {
        #[test]
        fn $name() {
            let expected = hb_tag_t::from_bytes_lossy($tag.as_bytes());
            let language = Language::from_str(&$lang.to_lowercase()).ok();
            let (_, languages) = tags_from_script_and_language(None, language.as_ref());

            if !languages.is_empty() {
                assert_eq!(languages[0], expected);
            }
        }
    };
}
| 461 | |
| 462 | test_tag_from_language!(tag_from_language_dflt, "dflt" , "" ); |
| 463 | test_tag_from_language!(tag_from_language_ALT, "ALT" , "alt" ); |
| 464 | test_tag_from_language!(tag_from_language_ARA, "ARA" , "ar" ); |
| 465 | test_tag_from_language!(tag_from_language_AZE, "AZE" , "az" ); |
| 466 | test_tag_from_language!(tag_from_language_az_ir, "AZE" , "az-ir" ); |
| 467 | test_tag_from_language!(tag_from_language_az_az, "AZE" , "az-az" ); |
| 468 | test_tag_from_language!(tag_from_language_ENG, "ENG" , "en" ); |
| 469 | test_tag_from_language!(tag_from_language_en_US, "ENG" , "en_US" ); |
| 470 | test_tag_from_language!(tag_from_language_CJA, "CJA" , "cja" ); /* Western Cham */ |
| 471 | test_tag_from_language!(tag_from_language_CJM, "CJM" , "cjm" ); /* Eastern Cham */ |
| 472 | test_tag_from_language!(tag_from_language_ENV, "EVN" , "eve" ); |
| 473 | test_tag_from_language!(tag_from_language_HAL, "HAL" , "cfm" ); /* BCP47 and current ISO639-3 code for Halam/Falam Chin */ |
| 474 | test_tag_from_language!(tag_from_language_flm, "HAL" , "flm" ); /* Retired ISO639-3 code for Halam/Falam Chin */ |
| 475 | test_tag_from_language!(tag_from_language_hy, "HYE0" , "hy" ); |
| 476 | test_tag_from_language!(tag_from_language_hyw, "HYE" , "hyw" ); |
| 477 | test_tag_from_language!(tag_from_language_bgr, "QIN" , "bgr" ); /* Bawm Chin */ |
| 478 | test_tag_from_language!(tag_from_language_cbl, "QIN" , "cbl" ); /* Bualkhaw Chin */ |
| 479 | test_tag_from_language!(tag_from_language_cka, "QIN" , "cka" ); /* Khumi Awa Chin */ |
| 480 | test_tag_from_language!(tag_from_language_cmr, "QIN" , "cmr" ); /* Mro-Khimi Chin */ |
| 481 | test_tag_from_language!(tag_from_language_cnb, "QIN" , "cnb" ); /* Chinbon Chin */ |
| 482 | test_tag_from_language!(tag_from_language_cnh, "QIN" , "cnh" ); /* Hakha Chin */ |
| 483 | test_tag_from_language!(tag_from_language_cnk, "QIN" , "cnk" ); /* Khumi Chin */ |
| 484 | test_tag_from_language!(tag_from_language_cnw, "QIN" , "cnw" ); /* Ngawn Chin */ |
| 485 | test_tag_from_language!(tag_from_language_csh, "QIN" , "csh" ); /* Asho Chin */ |
| 486 | test_tag_from_language!(tag_from_language_csy, "QIN" , "csy" ); /* Siyin Chin */ |
| 487 | test_tag_from_language!(tag_from_language_ctd, "QIN" , "ctd" ); /* Tedim Chin */ |
| 488 | test_tag_from_language!(tag_from_language_czt, "QIN" , "czt" ); /* Zotung Chin */ |
| 489 | test_tag_from_language!(tag_from_language_dao, "QIN" , "dao" ); /* Daai Chin */ |
| 490 | test_tag_from_language!(tag_from_language_htl, "QIN" , "hlt" ); /* Matu Chin */ |
| 491 | test_tag_from_language!(tag_from_language_mrh, "QIN" , "mrh" ); /* Mara Chin */ |
| 492 | test_tag_from_language!(tag_from_language_pck, "QIN" , "pck" ); /* Paite Chin */ |
| 493 | test_tag_from_language!(tag_from_language_sez, "QIN" , "sez" ); /* Senthang Chin */ |
| 494 | test_tag_from_language!(tag_from_language_tcp, "QIN" , "tcp" ); /* Tawr Chin */ |
| 495 | test_tag_from_language!(tag_from_language_tcz, "QIN" , "tcz" ); /* Thado Chin */ |
| 496 | test_tag_from_language!(tag_from_language_yos, "QIN" , "yos" ); /* Yos, deprecated by IANA in favor of Zou [zom] */ |
| 497 | test_tag_from_language!(tag_from_language_zom, "QIN" , "zom" ); /* Zou */ |
| 498 | test_tag_from_language!(tag_from_language_FAR, "FAR" , "fa" ); |
| 499 | test_tag_from_language!(tag_from_language_fa_IR, "FAR" , "fa_IR" ); |
| 500 | test_tag_from_language!(tag_from_language_man, "MNK" , "man" ); |
| 501 | test_tag_from_language!(tag_from_language_SWA, "SWA" , "aii" ); /* Swadaya Aramaic */ |
| 502 | test_tag_from_language!(tag_from_language_SYR, "SYR" , "syr" ); /* Syriac [macrolanguage] */ |
| 503 | test_tag_from_language!(tag_from_language_amw, "SYR" , "amw" ); /* Western Neo-Aramaic */ |
| 504 | test_tag_from_language!(tag_from_language_cld, "SYR" , "cld" ); /* Chaldean Neo-Aramaic */ |
| 505 | test_tag_from_language!(tag_from_language_syc, "SYR" , "syc" ); /* Classical Syriac */ |
| 506 | test_tag_from_language!(tag_from_language_TUA, "TUA" , "tru" ); /* Turoyo Aramaic */ |
| 507 | test_tag_from_language!(tag_from_language_zh, "ZHS" , "zh" ); /* Chinese */ |
| 508 | test_tag_from_language!(tag_from_language_zh_cn, "ZHS" , "zh-cn" ); /* Chinese (China) */ |
| 509 | test_tag_from_language!(tag_from_language_zh_sg, "ZHS" , "zh-sg" ); /* Chinese (Singapore) */ |
| 510 | test_tag_from_language!(tag_from_language_zh_mo, "ZHTM" , "zh-mo" ); /* Chinese (Macao) */ |
| 511 | test_tag_from_language!(tag_from_language_zh_hant_mo, "ZHTM" , "zh-hant-mo" ); /* Chinese (Macao) */ |
| 512 | test_tag_from_language!(tag_from_language_zh_hans_mo, "ZHS" , "zh-hans-mo" ); /* Chinese (Simplified, Macao) */ |
| 513 | test_tag_from_language!(tag_from_language_ZHH, "ZHH" , "zh-HK" ); /* Chinese (Hong Kong) */ |
| 514 | test_tag_from_language!(tag_from_language_zh_HanT_hK, "ZHH" , "zH-HanT-hK" ); /* Chinese (Hong Kong) */ |
| 515 | test_tag_from_language!(tag_from_language_zh_HanS_hK, "ZHS" , "zH-HanS-hK" ); /* Chinese (Simplified, Hong Kong) */ |
| 516 | test_tag_from_language!(tag_from_language_zh_tw, "ZHT" , "zh-tw" ); /* Chinese (Taiwan) */ |
| 517 | test_tag_from_language!(tag_from_language_ZHS, "ZHS" , "zh-Hans" ); /* Chinese (Simplified) */ |
| 518 | test_tag_from_language!(tag_from_language_ZHT, "ZHT" , "zh-Hant" ); /* Chinese (Traditional) */ |
| 519 | test_tag_from_language!(tag_from_language_zh_xx, "ZHS" , "zh-xx" ); /* Chinese (Other) */ |
| 520 | test_tag_from_language!(tag_from_language_zh_Hans_TW, "ZHS" , "zh-Hans-TW" ); |
| 521 | test_tag_from_language!(tag_from_language_yue, "ZHH" , "yue" ); |
| 522 | test_tag_from_language!(tag_from_language_yue_Hant, "ZHH" , "yue-Hant" ); |
| 523 | test_tag_from_language!(tag_from_language_yue_Hans, "ZHS" , "yue-Hans" ); |
| 524 | test_tag_from_language!(tag_from_language_ABC, "ABC" , "abc" ); |
| 525 | test_tag_from_language!(tag_from_language_ABCD, "ABCD" , "x-hbotabcd" ); |
| 526 | test_tag_from_language!(tag_from_language_asdf_asdf_wer_x_hbotabc_zxc, "ABC" , "asdf-asdf-wer-x-hbotabc-zxc" ); |
| 527 | test_tag_from_language!(tag_from_language_asdf_asdf_wer_x_hbotabc, "ABC" , "asdf-asdf-wer-x-hbotabc" ); |
| 528 | test_tag_from_language!(tag_from_language_asdf_asdf_wer_x_hbotabcd, "ABCD" , "asdf-asdf-wer-x-hbotabcd" ); |
| 529 | test_tag_from_language!(tag_from_language_asdf_asdf_wer_x_hbot_zxc, "dflt" , "asdf-asdf-wer-x-hbot-zxc" ); |
| 530 | test_tag_from_language!(tag_from_language_xy, "dflt" , "xy" ); |
| 531 | test_tag_from_language!(tag_from_language_xyz, "XYZ" , "xyz" ); /* Unknown ISO 639-3 */ |
| 532 | test_tag_from_language!(tag_from_language_xyz_qw, "XYZ" , "xyz-qw" ); /* Unknown ISO 639-3 */ |
| 533 | |
| 534 | /* |
| 535 | * Invalid input. The precise answer does not matter, as long as it |
| 536 | * does not crash or get into an infinite loop. |
| 537 | */ |
| 538 | test_tag_from_language!(tag_from_language__fonipa, "IPPH" , "-fonipa" ); |
| 539 | |
| 540 | /* |
| 541 | * Tags that contain "-fonipa" as a substring but which do not contain |
| 542 | * the subtag "fonipa". |
| 543 | */ |
| 544 | test_tag_from_language!(tag_from_language_en_fonipax, "ENG" , "en-fonipax" ); |
| 545 | test_tag_from_language!(tag_from_language_en_x_fonipa, "ENG" , "en-x-fonipa" ); |
| 546 | test_tag_from_language!(tag_from_language_en_a_fonipa, "ENG" , "en-a-fonipa" ); |
| 547 | test_tag_from_language!(tag_from_language_en_a_qwe_b_fonipa, "ENG" , "en-a-qwe-b-fonipa" ); |
| 548 | |
| 549 | /* International Phonetic Alphabet */ |
| 550 | test_tag_from_language!(tag_from_language_en_fonipa, "IPPH" , "en-fonipa" ); |
| 551 | test_tag_from_language!(tag_from_language_en_fonipax_fonipa, "IPPH" , "en-fonipax-fonipa" ); |
| 552 | test_tag_from_language!(tag_from_language_rm_ch_fonipa_sursilv_x_foobar, "IPPH" , "rm-CH-fonipa-sursilv-x-foobar" ); |
| 553 | test_tag_from_language!(tag_from_language_IPPH, "IPPH" , "und-fonipa" ); |
| 554 | test_tag_from_language!(tag_from_language_zh_fonipa, "IPPH" , "zh-fonipa" ); |
| 555 | |
| 556 | /* North American Phonetic Alphabet (Americanist Phonetic Notation) */ |
| 557 | test_tag_from_language!(tag_from_language_en_fonnapa, "APPH" , "en-fonnapa" ); |
| 558 | test_tag_from_language!(tag_from_language_chr_fonnapa, "APPH" , "chr-fonnapa" ); |
| 559 | test_tag_from_language!(tag_from_language_APPH, "APPH" , "und-fonnapa" ); |
| 560 | |
| 561 | /* Khutsuri Georgian */ |
| 562 | test_tag_from_language!(tag_from_language_ka_geok, "KGE" , "ka-Geok" ); |
| 563 | test_tag_from_language!(tag_from_language_KGE, "KGE" , "und-Geok" ); |
| 564 | |
| 565 | /* Irish Traditional */ |
| 566 | test_tag_from_language!(tag_from_language_IRT, "IRT" , "ga-Latg" ); |
| 567 | |
| 568 | /* Moldavian */ |
| 569 | test_tag_from_language!(tag_from_language_MOL, "MOL" , "ro-MD" ); |
| 570 | |
| 571 | /* Polytonic Greek */ |
| 572 | test_tag_from_language!(tag_from_language_PGR, "PGR" , "el-polyton" ); |
| 573 | test_tag_from_language!(tag_from_language_el_CY_polyton, "PGR" , "el-CY-polyton" ); |
| 574 | |
| 575 | /* Estrangela Syriac */ |
| 576 | test_tag_from_language!(tag_from_language_aii_Syre, "SYRE" , "aii-Syre" ); |
| 577 | test_tag_from_language!(tag_from_language_de_Syre, "SYRE" , "de-Syre" ); |
| 578 | test_tag_from_language!(tag_from_language_syr_Syre, "SYRE" , "syr-Syre" ); |
| 579 | test_tag_from_language!(tag_from_language_und_Syre, "SYRE" , "und-Syre" ); |
| 580 | |
| 581 | /* Western Syriac */ |
| 582 | test_tag_from_language!(tag_from_language_aii_Syrj, "SYRJ" , "aii-Syrj" ); |
| 583 | test_tag_from_language!(tag_from_language_de_Syrj, "SYRJ" , "de-Syrj" ); |
| 584 | test_tag_from_language!(tag_from_language_syr_Syrj, "SYRJ" , "syr-Syrj" ); |
| 585 | test_tag_from_language!(tag_from_language_SYRJ, "SYRJ" , "und-Syrj" ); |
| 586 | |
| 587 | /* Eastern Syriac */ |
| 588 | test_tag_from_language!(tag_from_language_aii_Syrn, "SYRN" , "aii-Syrn" ); |
| 589 | test_tag_from_language!(tag_from_language_de_Syrn, "SYRN" , "de-Syrn" ); |
| 590 | test_tag_from_language!(tag_from_language_syr_Syrn, "SYRN" , "syr-Syrn" ); |
| 591 | test_tag_from_language!(tag_from_language_SYRN, "SYRN" , "und-Syrn" ); |
| 592 | |
| 593 | /* Test that x-hbot overrides the base language */ |
| 594 | test_tag_from_language!(tag_from_language_fa_x_hbotabc_zxc, "ABC" , "fa-x-hbotabc-zxc" ); |
| 595 | test_tag_from_language!(tag_from_language_fa_ir_x_hbotabc_zxc, "ABC" , "fa-ir-x-hbotabc-zxc" ); |
| 596 | test_tag_from_language!(tag_from_language_zh_x_hbotabc_zxc, "ABC" , "zh-x-hbotabc-zxc" ); |
| 597 | test_tag_from_language!(tag_from_language_zh_cn_x_hbotabc_zxc, "ABC" , "zh-cn-x-hbotabc-zxc" ); |
| 598 | test_tag_from_language!(tag_from_language_zh_xy_x_hbotabc_zxc, "ABC" , "zh-xy-x-hbotabc-zxc" ); |
| 599 | test_tag_from_language!(tag_from_language_xyz_xy_x_hbotabc_zxc, "ABC" , "xyz-xy-x-hbotabc-zxc" ); |
| 600 | |
| 601 | /* Unnormalized BCP 47 tags */ |
| 602 | test_tag_from_language!(tag_from_language_ar_aao, "ARA" , "ar-aao" ); |
| 603 | test_tag_from_language!(tag_from_language_art_lojban, "JBO" , "art-lojban" ); |
| 604 | test_tag_from_language!(tag_from_language_kok_gom, "KOK" , "kok-gom" ); |
| 605 | test_tag_from_language!(tag_from_language_i_lux, "LTZ" , "i-lux" ); |
| 606 | test_tag_from_language!(tag_from_language_drh, "MNG" , "drh" ); |
| 607 | test_tag_from_language!(tag_from_language_ar_ary1, "MOR" , "ar-ary" ); |
| 608 | test_tag_from_language!(tag_from_language_ar_ary_DZ, "MOR" , "ar-ary-DZ" ); |
| 609 | test_tag_from_language!(tag_from_language_no_bok, "NOR" , "no-bok" ); |
| 610 | test_tag_from_language!(tag_from_language_no_nyn, "NYN" , "no-nyn" ); |
| 611 | test_tag_from_language!(tag_from_language_i_hak, "ZHS" , "i-hak" ); |
| 612 | test_tag_from_language!(tag_from_language_zh_guoyu, "ZHS" , "zh-guoyu" ); |
| 613 | test_tag_from_language!(tag_from_language_zh_min, "ZHS" , "zh-min" ); |
| 614 | test_tag_from_language!(tag_from_language_zh_min_nan, "ZHS" , "zh-min-nan" ); |
| 615 | test_tag_from_language!(tag_from_language_zh_xiang, "ZHS" , "zh-xiang" ); |
| 616 | |
| 617 | /* BCP 47 tags that look similar to unrelated language system tags */ |
| 618 | test_tag_from_language!(tag_from_language_als, "SQI" , "als" ); |
| 619 | test_tag_from_language!(tag_from_language_far, "dflt" , "far" ); |
| 620 | |
| 621 | /* A UN M.49 region code, not an extended language subtag */ |
| 622 | test_tag_from_language!(tag_from_language_ar_001, "ARA" , "ar-001" ); |
| 623 | |
| 624 | /* An invalid tag */ |
| 625 | test_tag_from_language!(tag_from_language_invalid, "TRK" , "tr@foo=bar" ); |
| 626 | |
// Generates a test named `$name`: converts `$script` plus the language parsed
// from `$lang` and expects exactly `$scripts` as script tags and `$langs` as
// language tags.
macro_rules! test_tags {
    ($name:ident, $script:expr, $lang:expr, $scripts:expr, $langs:expr) => {
        #[test]
        fn $name() {
            let language = Language::from_str($lang).ok();
            let (scripts, languages) = tags_from_script_and_language($script, language.as_ref());

            let exp_scripts: Vec<hb_tag_t> = $scripts
                .iter()
                .map(|bytes| hb_tag_t::from_bytes_lossy(*bytes))
                .collect();
            let exp_langs: Vec<hb_tag_t> = $langs
                .iter()
                .map(|bytes| hb_tag_t::from_bytes_lossy(*bytes))
                .collect();

            assert_eq!(exp_scripts, scripts.as_slice());
            assert_eq!(exp_langs, languages.as_slice());
        }
    };
}
| 643 | |
| 644 | test_tags!(tag_full_en, None, "en" , &[], &[b"ENG" ]); |
| 645 | test_tags!(tag_full_en_x_hbscdflt, None, "en-x-hbscdflt" , &[b"DFLT" ], &[b"ENG" ]); |
| 646 | test_tags!(tag_full_en_latin, Some(script::LATIN), "en" , &[b"latn" ], &[b"ENG" ]); |
| 647 | test_tags!(tag_full_und_fonnapa, None, "und-fonnapa" , &[], &[b"APPH" ]); |
| 648 | test_tags!(tag_full_en_fonnapa, None, "en-fonnapa" , &[], &[b"APPH" ]); |
| 649 | test_tags!(tag_full_x_hbot1234_hbsc5678, None, "x-hbot1234-hbsc5678" , &[b"5678" ], &[b"1234" ]); |
| 650 | test_tags!(tag_full_x_hbsc5678_hbot1234, None, "x-hbsc5678-hbot1234" , &[b"5678" ], &[b"1234" ]); |
| 651 | test_tags!(tag_full_ml, Some(script::MALAYALAM), "ml" , &[b"mlm3" , b"mlm2" , b"mlym" ], &[b"MAL" , b"MLR" ]); |
| 652 | test_tags!(tag_full_xyz, None, "xyz" , &[], &[b"XYZ" ]); |
| 653 | test_tags!(tag_full_xy, None, "xy" , &[], &[]); |
| 654 | } |
| 655 | |