1 | //! This crate exposes the Unicode `Script` and `Script_Extension` |
2 | //! properties from [UAX #24](http://www.unicode.org/reports/tr24/) |
3 | |
4 | #![cfg_attr (not(test), no_std)] |
5 | #![cfg_attr (feature = "bench" , feature(test))] |
6 | |
7 | mod tables; |
8 | |
9 | use core::convert::TryFrom; |
10 | use core::fmt; |
11 | use core::u64; |
12 | pub use tables::script_extensions; |
13 | use tables::{get_script, get_script_extension, NEXT_SCRIPT}; |
14 | pub use tables::{Script, UNICODE_VERSION}; |
15 | |
16 | impl Script { |
17 | /// Get the full name of a script. |
18 | pub fn full_name(self) -> &'static str { |
19 | self.inner_full_name() |
20 | } |
21 | |
22 | /// Attempts to parse script name from the provided string. |
23 | /// Returns `None` if the provided string does not represent a valid |
24 | /// script full name. |
25 | pub fn from_full_name(input: &str) -> Option<Self> { |
26 | Self::inner_from_full_name(input) |
27 | } |
28 | |
29 | /// Get the four-character short name of a script. |
30 | pub fn short_name(self) -> &'static str { |
31 | self.inner_short_name() |
32 | } |
33 | |
34 | /// Attempts to parse script name from the provided string. |
35 | /// Returns `None` if the provided string does not represent a valid |
36 | /// script four-character short name. |
37 | pub fn from_short_name(input: &str) -> Option<Self> { |
38 | Self::inner_from_short_name(input) |
39 | } |
40 | |
41 | /// Is this script "Recommended" according to |
42 | /// [UAX #31](www.unicode.org/reports/tr31/#Table_Recommended_Scripts)? |
43 | pub fn is_recommended(self) -> bool { |
44 | use Script::*; |
45 | match self { |
46 | Common | Inherited | Arabic | Armenian | Bengali | Bopomofo | Cyrillic | Devanagari |
47 | | Ethiopic | Georgian | Greek | Gujarati | Gurmukhi | Han | Hangul | Hebrew |
48 | | Hiragana | Kannada | Katakana | Khmer | Lao | Latin | Malayalam | Myanmar | Oriya |
49 | | Sinhala | Tamil | Telugu | Thaana | Thai | Tibetan => true, |
50 | _ => false, |
51 | } |
52 | } |
53 | } |
54 | |
55 | impl From<Script> for ScriptExtension { |
56 | fn from(script: Script) -> Self { |
57 | if script == Script::Common { |
58 | ScriptExtension::new_common() |
59 | } else if script == Script::Inherited { |
60 | ScriptExtension::new_inherited() |
61 | } else if script == Script::Unknown { |
62 | ScriptExtension::new_unknown() |
63 | } else { |
64 | let mut first = 0; |
65 | let mut second = 0; |
66 | let mut third = 0; |
67 | let bit = script as u8; |
68 | // Find out which field it's in, and set the appropriate bit there |
69 | if bit < 64 { |
70 | first = 1 << bit as u64; |
71 | } else if bit < 128 { |
72 | // offset by 64 since `bit` is an absolute number, |
73 | // not relative to the chunk |
74 | second = 1 << (bit - 64) as u64; |
75 | } else { |
76 | third = 1 << (bit - 128) as u32; |
77 | } |
78 | ScriptExtension::new(first, second, third) |
79 | } |
80 | } |
81 | } |
82 | |
83 | impl TryFrom<ScriptExtension> for Script { |
84 | type Error = (); |
85 | fn try_from(ext: ScriptExtension) -> Result<Self, ()> { |
86 | if ext.is_common_or_inherited() { |
87 | if ext.common { |
88 | Ok(Script::Common) |
89 | } else { |
90 | Ok(Script::Inherited) |
91 | } |
92 | } else if ext.is_empty() { |
93 | Ok(Script::Unknown) |
94 | } else { |
95 | // filled elements will have set ones |
96 | let fo = ext.first.count_ones(); |
97 | let so = ext.second.count_ones(); |
98 | let to = ext.third.count_ones(); |
99 | // only one bit set, in the first chunk |
100 | if fo == 1 && so == 0 && to == 0 { |
101 | // use trailing_zeroes() to figure out which bit it is |
102 | Ok(Script::for_integer(ext.first.trailing_zeros() as u8)) |
103 | // only one bit set, in the second chunk |
104 | } else if fo == 0 && so == 1 && to == 0 { |
105 | Ok(Script::for_integer(64 + ext.second.trailing_zeros() as u8)) |
106 | // only one bit set, in the third chunk |
107 | } else if fo == 0 && so == 0 && to == 1 { |
108 | Ok(Script::for_integer(128 + ext.third.trailing_zeros() as u8)) |
109 | } else { |
110 | Err(()) |
111 | } |
112 | } |
113 | } |
114 | } |
115 | |
116 | impl Default for Script { |
117 | fn default() -> Self { |
118 | Script::Common |
119 | } |
120 | } |
121 | |
122 | impl From<char> for Script { |
123 | fn from(o: char) -> Self { |
124 | o.script() |
125 | } |
126 | } |
127 | |
128 | impl fmt::Display for Script { |
129 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
130 | write!(f, " {}" , self.full_name()) |
131 | } |
132 | } |
133 | |
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
/// A value for the `Script_Extension` property
///
/// A [`ScriptExtension`] is a set of [`Script`]s; the empty set stands for
/// `Unknown`.
///
/// This is essentially an optimized version of `Vec<Script>` that uses bitfields:
/// each script owns one bit, indexed by its integer value and spread over
/// three 64-bit chunks.
pub struct ScriptExtension {
    // Bitset for scripts numbered 0..=63
    first: u64,
    // Bitset for scripts numbered 64..=127 (bit `n` stands for script `64 + n`)
    second: u64,
    // Bitset for scripts numbered 128 and above (bit `n` stands for script `128 + n`)
    third: u64,
    // Both Common and Inherited are represented by all used bits being set,
    // this flag lets us distinguish the two: `true` means Common.
    common: bool,
}
152 | |
153 | impl ScriptExtension { |
154 | // We don't use the complete u64 of `third`, so the "all" value is not just u32::MAX |
155 | // Instead, we take the number of the next (unused) script bit, subtract 128 to bring |
156 | // it in the range of `third`, create a u64 with just that bit set, and subtract 1 |
157 | // to create one with all the lower bits set. |
158 | const THIRD_MAX: u64 = ((1 << (NEXT_SCRIPT - 128)) - 1); |
159 | |
160 | pub(crate) const fn new(first: u64, second: u64, third: u64) -> Self { |
161 | ScriptExtension { |
162 | first, |
163 | second, |
164 | third, |
165 | common: false, |
166 | } |
167 | } |
168 | |
169 | pub(crate) const fn new_common() -> Self { |
170 | ScriptExtension { |
171 | first: u64::MAX, |
172 | second: u64::MAX, |
173 | third: Self::THIRD_MAX, |
174 | common: true, |
175 | } |
176 | } |
177 | |
178 | pub(crate) const fn new_inherited() -> Self { |
179 | ScriptExtension { |
180 | first: u64::MAX, |
181 | second: u64::MAX, |
182 | third: Self::THIRD_MAX, |
183 | common: false, |
184 | } |
185 | } |
186 | |
187 | pub(crate) const fn new_unknown() -> Self { |
188 | ScriptExtension { |
189 | first: 0, |
190 | second: 0, |
191 | third: 0, |
192 | common: false, |
193 | } |
194 | } |
195 | |
196 | const fn is_common_or_inherited(self) -> bool { |
197 | (self.first == u64::MAX) & (self.second == u64::MAX) & (self.third == Self::THIRD_MAX) |
198 | } |
199 | |
200 | /// Checks if the script extension is Common |
201 | pub const fn is_common(self) -> bool { |
202 | self.is_common_or_inherited() & self.common |
203 | } |
204 | |
205 | /// Checks if the script extension is Inherited |
206 | pub const fn is_inherited(self) -> bool { |
207 | self.is_common_or_inherited() & !self.common |
208 | } |
209 | |
210 | /// Checks if the script extension is empty (unknown) |
211 | pub const fn is_empty(self) -> bool { |
212 | (self.first == 0) & (self.second == 0) & (self.third == 0) |
213 | } |
214 | |
215 | /// Returns the number of scripts in the script extension |
216 | pub fn len(self) -> usize { |
217 | if self.is_common_or_inherited() { |
218 | 1 |
219 | } else { |
220 | (self.first.count_ones() + self.second.count_ones() + self.third.count_ones()) as usize |
221 | } |
222 | } |
223 | |
224 | /// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things |
225 | /// do not intersect. This is equivalent to [`ScriptExtension::intersection`] but it stores the result |
226 | /// in `self` |
227 | /// |
228 | /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting |
229 | /// everything, the intersection of `Common` and `Inherited` is `Inherited` |
230 | pub fn intersect_with(&mut self, other: Self) { |
231 | *self = self.intersection(other) |
232 | } |
233 | |
234 | /// Find the intersection between two ScriptExtensions. Returns Unknown if things |
235 | /// do not intersect. |
236 | /// |
237 | /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting |
238 | /// everything, the intersection of `Common` and `Inherited` is `Inherited` |
239 | pub const fn intersection(self, other: Self) -> Self { |
240 | let first = self.first & other.first; |
241 | let second = self.second & other.second; |
242 | let third = self.third & other.third; |
243 | let common = self.common & other.common; |
244 | ScriptExtension { |
245 | first, |
246 | second, |
247 | third, |
248 | common, |
249 | } |
250 | } |
251 | |
252 | /// Find the union between two ScriptExtensions. |
253 | /// |
254 | /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting |
255 | /// everything, the union of `Common` and `Inherited` is `Common` |
256 | pub const fn union(self, other: Self) -> Self { |
257 | let first = self.first | other.first; |
258 | let second = self.second | other.second; |
259 | let third = self.third | other.third; |
260 | let common = self.common | other.common; |
261 | ScriptExtension { |
262 | first, |
263 | second, |
264 | third, |
265 | common, |
266 | } |
267 | } |
268 | |
269 | /// Check if this ScriptExtension contains the given script |
270 | /// |
271 | /// Should be used with specific scripts only, this will |
272 | /// return `true` if `self` is not `Unknown` and `script` is |
273 | /// `Common` or `Inherited` |
274 | pub fn contains_script(self, script: Script) -> bool { |
275 | !self.intersection(script.into()).is_empty() |
276 | } |
277 | |
278 | /// Get the intersection of script extensions of all characters |
279 | /// in a string. |
280 | pub fn for_str(x: &str) -> Self { |
281 | let mut ext = ScriptExtension::default(); |
282 | for ch in x.chars() { |
283 | ext.intersect_with(ch.into()); |
284 | } |
285 | ext |
286 | } |
287 | |
288 | /// Iterate over the scripts in this script extension |
289 | /// |
290 | /// Will never yield Script::Unknown |
291 | pub fn iter(self) -> ScriptIterator { |
292 | ScriptIterator { ext: self } |
293 | } |
294 | } |
295 | |
296 | impl Default for ScriptExtension { |
297 | fn default() -> Self { |
298 | ScriptExtension::new_common() |
299 | } |
300 | } |
301 | |
302 | impl From<char> for ScriptExtension { |
303 | fn from(o: char) -> Self { |
304 | o.script_extension() |
305 | } |
306 | } |
307 | |
308 | impl From<&'_ str> for ScriptExtension { |
309 | fn from(o: &'_ str) -> Self { |
310 | Self::for_str(o) |
311 | } |
312 | } |
313 | |
314 | impl fmt::Debug for ScriptExtension { |
315 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
316 | write!(f, "ScriptExtension(" )?; |
317 | fmt::Display::fmt(self, f)?; |
318 | write!(f, ")" ) |
319 | } |
320 | } |
321 | |
322 | impl fmt::Display for ScriptExtension { |
323 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
324 | if self.is_common() { |
325 | write!(f, "Common" )?; |
326 | } else if self.is_inherited() { |
327 | write!(f, "Inherited" )?; |
328 | } else if self.is_empty() { |
329 | write!(f, "Unknown" )?; |
330 | } else { |
331 | let mut first: bool = true; |
332 | for script: Script in self.iter() { |
333 | if !first { |
334 | write!(f, " + " )?; |
335 | first = false; |
336 | } |
337 | script.full_name().fmt(f)?; |
338 | } |
339 | } |
340 | Ok(()) |
341 | } |
342 | } |
343 | |
/// Extension trait on `char` for calculating script properties
///
/// The crate implements this trait for `char`.
pub trait UnicodeScript {
    /// Get the script for a given character
    ///
    /// The `char` implementation yields `Script::Unknown` when the script
    /// lookup has no entry for the character.
    fn script(&self) -> Script;
    /// Get the Script_Extension for a given character
    ///
    /// The `char` implementation falls back to the character's own script
    /// (converted into a [`ScriptExtension`]) when the extension lookup has
    /// no entry for the character.
    fn script_extension(&self) -> ScriptExtension;
}
351 | |
352 | impl UnicodeScript for char { |
353 | fn script(&self) -> Script { |
354 | get_script(*self).unwrap_or(default:Script::Unknown) |
355 | } |
356 | |
357 | fn script_extension(&self) -> ScriptExtension { |
358 | get_script_extension(*self).unwrap_or_else(|| self.script().into()) |
359 | } |
360 | } |
361 | |
/// Iterator over scripts in a [ScriptExtension].
///
/// Can be obtained via [ScriptExtension::iter()]
pub struct ScriptIterator {
    // The scripts not yet yielded; each call to `next` removes the one it returns.
    ext: ScriptExtension,
}
368 | |
369 | impl Iterator for ScriptIterator { |
370 | type Item = Script; |
371 | |
372 | fn next(&mut self) -> Option<Script> { |
373 | if self.ext.is_common_or_inherited() { |
374 | let common = self.ext.common; |
375 | self.ext = ScriptExtension::new_unknown(); |
376 | if common { |
377 | Some(Script::Common) |
378 | } else { |
379 | Some(Script::Inherited) |
380 | } |
381 | // Are there bits left in the first chunk? |
382 | } else if self.ext.first != 0 { |
383 | // Find the next bit |
384 | let bit = self.ext.first.trailing_zeros(); |
385 | // unset just that bit |
386 | self.ext.first &= !(1 << bit); |
387 | Some(Script::for_integer(bit as u8)) |
388 | // Are there bits left in the second chunk? |
389 | } else if self.ext.second != 0 { |
390 | let bit = self.ext.second.trailing_zeros(); |
391 | self.ext.second &= !(1 << bit); |
392 | Some(Script::for_integer(64 + bit as u8)) |
393 | // Are there bits left in the third chunk? |
394 | } else if self.ext.third != 0 { |
395 | let bit = self.ext.third.trailing_zeros(); |
396 | self.ext.third &= !(1 << bit); |
397 | Some(Script::for_integer(128 + bit as u8)) |
398 | } else { |
399 | // Script::Unknown |
400 | None |
401 | } |
402 | } |
403 | } |
404 | |
#[cfg(test)]
mod tests {
    use crate::*;
    use std::collections::HashSet;
    use std::convert::TryInto;

    #[cfg(feature = "bench")]
    use test::bench::Bencher;
    #[cfg(feature = "bench")]
    extern crate test;

    // Short aliases for the long generated extension constants used below.
    use crate::script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH as DEVA_GROUP;
    #[cfg(feature = "bench")]
    use crate::script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH as INDIC_GROUP;

    // Devanagari sample text shared by the string test and the string benchmark.
    const SAMPLE: &str = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.";

    /// Every script number round-trips through `ScriptExtension` and back,
    /// and no two scripts share a value or an extension.
    #[test]
    fn test_conversion() {
        let mut seen_scripts = HashSet::new();
        let mut seen_exts = HashSet::new();
        for index in 0..NEXT_SCRIPT {
            let script = Script::for_integer(index);
            let ext: ScriptExtension = script.into();

            // `insert` returns false when the value was already present.
            assert!(seen_scripts.insert(script), "Found script {:?} twice!", script);
            assert!(seen_exts.insert(ext), "Found extension {:?} twice!", ext);

            assert_eq!(script as u8, index);

            let with_common = ScriptExtension::new_common().intersection(ext);
            let with_inherited = ScriptExtension::new_inherited().intersection(ext);
            let with_unknown = ScriptExtension::new_unknown().intersection(ext);
            assert!(!with_common.is_empty());
            assert!(!with_inherited.is_empty());
            assert!(with_unknown.is_empty());

            assert_eq!(ext.iter().collect::<Vec<_>>(), vec![script]);
            assert_eq!(Ok(script), ext.try_into());
        }
    }

    /// A Devanagari sentence resolves to the `DEVA` extension and interacts
    /// as expected with a larger multi-script extension.
    #[test]
    fn test_specific() {
        let ext = ScriptExtension::for_str(SAMPLE);
        assert_eq!(ext, script_extensions::DEVA);

        println!("{:?}", DEVA_GROUP);
        println!("{:?}", ext.intersection(DEVA_GROUP));
        assert!(!ext.intersection(DEVA_GROUP).is_empty());

        // Adding Dogra keeps the set inside the larger group.
        let u = ext.union(Script::Dogra.into());
        assert_eq!(u.intersection(DEVA_GROUP), u);
    }

    /// `contains_script` agrees with iteration for a multi-script extension,
    /// and such an extension cannot be narrowed to a single `Script`.
    #[test]
    fn test_specific_ext() {
        let ext = DEVA_GROUP;
        let all: HashSet<_> = ext.iter().collect();

        for index in 0..NEXT_SCRIPT {
            let script = Script::for_integer(index);
            let expected = all.contains(&script);
            if expected {
                assert!(ext.contains_script(script))
            } else {
                assert!(!ext.contains_script(script))
            }
        }

        let members = [
            Script::Devanagari,
            Script::Dogra,
            Script::Gujarati,
            Script::Gurmukhi,
            Script::Khojki,
            Script::Kaithi,
            Script::Mahajani,
            Script::Modi,
            Script::Khudawadi,
            Script::Takri,
            Script::Tirhuta,
        ];
        for member in members.iter() {
            assert!(ext.contains_script(*member));
        }

        let scr: Result<Script, _> = ext.try_into();
        assert!(scr.is_err());
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_intersection(b: &mut Bencher) {
        b.iter(|| {
            let script = test::black_box(Script::Devanagari);
            let ext = test::black_box(INDIC_GROUP);
            test::black_box(ext.intersection(script.into()));
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_to_script(b: &mut Bencher) {
        let ext: ScriptExtension = Script::Devanagari.into();
        b.iter(|| {
            let ext = test::black_box(ext);
            let script: Result<Script, _> = ext.try_into();
            let _ = test::black_box(script);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_to_ext(b: &mut Bencher) {
        b.iter(|| {
            let script = test::black_box(Script::Devanagari);
            let ext: ScriptExtension = script.into();
            test::black_box(ext);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_intersection(b: &mut Bencher) {
        b.iter(|| {
            let e1 = test::black_box(script_extensions::ARAB_ROHG_SYRC_THAA_YEZI);
            let e2 = test::black_box(INDIC_GROUP);
            test::black_box(e2.intersection(e1));
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_to_vec(b: &mut Bencher) {
        b.iter(|| {
            let ext = test::black_box(INDIC_GROUP);
            test::black_box(ext.iter().collect::<Vec<_>>());
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_string_ext(b: &mut Bencher) {
        b.iter(|| {
            let s = test::black_box(SAMPLE);
            test::black_box(ScriptExtension::for_str(s));
        })
    }
}
561 | |