1//! This crate exposes the Unicode `Script` and `Script_Extension`
2//! properties from [UAX #24](http://www.unicode.org/reports/tr24/)
3
4#![cfg_attr(not(test), no_std)]
5#![cfg_attr(feature = "bench", feature(test))]
6
7mod tables;
8
9use core::convert::TryFrom;
10use core::fmt;
11use core::u64;
12pub use tables::script_extensions;
13use tables::{get_script, get_script_extension, NEXT_SCRIPT};
14pub use tables::{Script, UNICODE_VERSION};
15
16impl Script {
17 /// Get the full name of a script.
18 pub fn full_name(self) -> &'static str {
19 self.inner_full_name()
20 }
21
22 /// Attempts to parse script name from the provided string.
23 /// Returns `None` if the provided string does not represent a valid
24 /// script full name.
25 pub fn from_full_name(input: &str) -> Option<Self> {
26 Self::inner_from_full_name(input)
27 }
28
29 /// Get the four-character short name of a script.
30 pub fn short_name(self) -> &'static str {
31 self.inner_short_name()
32 }
33
34 /// Attempts to parse script name from the provided string.
35 /// Returns `None` if the provided string does not represent a valid
36 /// script four-character short name.
37 pub fn from_short_name(input: &str) -> Option<Self> {
38 Self::inner_from_short_name(input)
39 }
40
41 /// Is this script "Recommended" according to
42 /// [UAX #31](www.unicode.org/reports/tr31/#Table_Recommended_Scripts)?
43 pub fn is_recommended(self) -> bool {
44 use Script::*;
45 match self {
46 Common | Inherited | Arabic | Armenian | Bengali | Bopomofo | Cyrillic | Devanagari
47 | Ethiopic | Georgian | Greek | Gujarati | Gurmukhi | Han | Hangul | Hebrew
48 | Hiragana | Kannada | Katakana | Khmer | Lao | Latin | Malayalam | Myanmar | Oriya
49 | Sinhala | Tamil | Telugu | Thaana | Thai | Tibetan => true,
50 _ => false,
51 }
52 }
53}
54
55impl From<Script> for ScriptExtension {
56 fn from(script: Script) -> Self {
57 if script == Script::Common {
58 ScriptExtension::new_common()
59 } else if script == Script::Inherited {
60 ScriptExtension::new_inherited()
61 } else if script == Script::Unknown {
62 ScriptExtension::new_unknown()
63 } else {
64 let mut first = 0;
65 let mut second = 0;
66 let mut third = 0;
67 let bit = script as u8;
68 // Find out which field it's in, and set the appropriate bit there
69 if bit < 64 {
70 first = 1 << bit as u64;
71 } else if bit < 128 {
72 // offset by 64 since `bit` is an absolute number,
73 // not relative to the chunk
74 second = 1 << (bit - 64) as u64;
75 } else {
76 third = 1 << (bit - 128) as u32;
77 }
78 ScriptExtension::new(first, second, third)
79 }
80 }
81}
82
83impl TryFrom<ScriptExtension> for Script {
84 type Error = ();
85 fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
86 if ext.is_common_or_inherited() {
87 if ext.common {
88 Ok(Script::Common)
89 } else {
90 Ok(Script::Inherited)
91 }
92 } else if ext.is_empty() {
93 Ok(Script::Unknown)
94 } else {
95 // filled elements will have set ones
96 let fo = ext.first.count_ones();
97 let so = ext.second.count_ones();
98 let to = ext.third.count_ones();
99 // only one bit set, in the first chunk
100 if fo == 1 && so == 0 && to == 0 {
101 // use trailing_zeroes() to figure out which bit it is
102 Ok(Script::for_integer(ext.first.trailing_zeros() as u8))
103 // only one bit set, in the second chunk
104 } else if fo == 0 && so == 1 && to == 0 {
105 Ok(Script::for_integer(64 + ext.second.trailing_zeros() as u8))
106 // only one bit set, in the third chunk
107 } else if fo == 0 && so == 0 && to == 1 {
108 Ok(Script::for_integer(128 + ext.third.trailing_zeros() as u8))
109 } else {
110 Err(())
111 }
112 }
113 }
114}
115
116impl Default for Script {
117 fn default() -> Self {
118 Script::Common
119 }
120}
121
122impl From<char> for Script {
123 fn from(o: char) -> Self {
124 o.script()
125 }
126}
127
128impl fmt::Display for Script {
129 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
130 write!(f, "{}", self.full_name())
131 }
132}
133
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
/// A value for the `Script_Extension` property
///
/// A [`ScriptExtension`] is a set of one or more [`Script`]s.
///
/// This is essentially an optimized version of `Vec<Script>` that stores the
/// set as three bitfields, one bit per script.
pub struct ScriptExtension {
    // Bitset for scripts whose integer value is 0..=63 (bit `n` <=> script `n`)
    first: u64,
    // Bitset for scripts whose integer value is 64..=127 (bit `n` <=> script `64 + n`)
    second: u64,
    // Bitset for scripts whose integer value is 128 or above (bit `n` <=> script `128 + n`)
    third: u64,
    // Both Common and Inherited are represented by all used bits being set,
    // this flag lets us distinguish the two: `true` means Common.
    common: bool,
}
152
153impl ScriptExtension {
154 // We don't use the complete u64 of `third`, so the "all" value is not just u32::MAX
155 // Instead, we take the number of the next (unused) script bit, subtract 128 to bring
156 // it in the range of `third`, create a u64 with just that bit set, and subtract 1
157 // to create one with all the lower bits set.
158 const THIRD_MAX: u64 = ((1 << (NEXT_SCRIPT - 128)) - 1);
159
160 pub(crate) const fn new(first: u64, second: u64, third: u64) -> Self {
161 ScriptExtension {
162 first,
163 second,
164 third,
165 common: false,
166 }
167 }
168
169 pub(crate) const fn new_common() -> Self {
170 ScriptExtension {
171 first: u64::MAX,
172 second: u64::MAX,
173 third: Self::THIRD_MAX,
174 common: true,
175 }
176 }
177
178 pub(crate) const fn new_inherited() -> Self {
179 ScriptExtension {
180 first: u64::MAX,
181 second: u64::MAX,
182 third: Self::THIRD_MAX,
183 common: false,
184 }
185 }
186
187 pub(crate) const fn new_unknown() -> Self {
188 ScriptExtension {
189 first: 0,
190 second: 0,
191 third: 0,
192 common: false,
193 }
194 }
195
196 const fn is_common_or_inherited(self) -> bool {
197 (self.first == u64::MAX) & (self.second == u64::MAX) & (self.third == Self::THIRD_MAX)
198 }
199
200 /// Checks if the script extension is Common
201 pub const fn is_common(self) -> bool {
202 self.is_common_or_inherited() & self.common
203 }
204
205 /// Checks if the script extension is Inherited
206 pub const fn is_inherited(self) -> bool {
207 self.is_common_or_inherited() & !self.common
208 }
209
210 /// Checks if the script extension is empty (unknown)
211 pub const fn is_empty(self) -> bool {
212 (self.first == 0) & (self.second == 0) & (self.third == 0)
213 }
214
215 /// Returns the number of scripts in the script extension
216 pub fn len(self) -> usize {
217 if self.is_common_or_inherited() {
218 1
219 } else {
220 (self.first.count_ones() + self.second.count_ones() + self.third.count_ones()) as usize
221 }
222 }
223
224 /// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
225 /// do not intersect. This is equivalent to [`ScriptExtension::intersection`] but it stores the result
226 /// in `self`
227 ///
228 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
229 /// everything, the intersection of `Common` and `Inherited` is `Inherited`
230 pub fn intersect_with(&mut self, other: Self) {
231 *self = self.intersection(other)
232 }
233
234 /// Find the intersection between two ScriptExtensions. Returns Unknown if things
235 /// do not intersect.
236 ///
237 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
238 /// everything, the intersection of `Common` and `Inherited` is `Inherited`
239 pub const fn intersection(self, other: Self) -> Self {
240 let first = self.first & other.first;
241 let second = self.second & other.second;
242 let third = self.third & other.third;
243 let common = self.common & other.common;
244 ScriptExtension {
245 first,
246 second,
247 third,
248 common,
249 }
250 }
251
252 /// Find the union between two ScriptExtensions.
253 ///
254 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
255 /// everything, the union of `Common` and `Inherited` is `Common`
256 pub const fn union(self, other: Self) -> Self {
257 let first = self.first | other.first;
258 let second = self.second | other.second;
259 let third = self.third | other.third;
260 let common = self.common | other.common;
261 ScriptExtension {
262 first,
263 second,
264 third,
265 common,
266 }
267 }
268
269 /// Check if this ScriptExtension contains the given script
270 ///
271 /// Should be used with specific scripts only, this will
272 /// return `true` if `self` is not `Unknown` and `script` is
273 /// `Common` or `Inherited`
274 pub fn contains_script(self, script: Script) -> bool {
275 !self.intersection(script.into()).is_empty()
276 }
277
278 /// Get the intersection of script extensions of all characters
279 /// in a string.
280 pub fn for_str(x: &str) -> Self {
281 let mut ext = ScriptExtension::default();
282 for ch in x.chars() {
283 ext.intersect_with(ch.into());
284 }
285 ext
286 }
287
288 /// Iterate over the scripts in this script extension
289 ///
290 /// Will never yield Script::Unknown
291 pub fn iter(self) -> ScriptIterator {
292 ScriptIterator { ext: self }
293 }
294}
295
296impl Default for ScriptExtension {
297 fn default() -> Self {
298 ScriptExtension::new_common()
299 }
300}
301
302impl From<char> for ScriptExtension {
303 fn from(o: char) -> Self {
304 o.script_extension()
305 }
306}
307
308impl From<&'_ str> for ScriptExtension {
309 fn from(o: &'_ str) -> Self {
310 Self::for_str(o)
311 }
312}
313
314impl fmt::Debug for ScriptExtension {
315 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
316 write!(f, "ScriptExtension(")?;
317 fmt::Display::fmt(self, f)?;
318 write!(f, ")")
319 }
320}
321
322impl fmt::Display for ScriptExtension {
323 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
324 if self.is_common() {
325 write!(f, "Common")?;
326 } else if self.is_inherited() {
327 write!(f, "Inherited")?;
328 } else if self.is_empty() {
329 write!(f, "Unknown")?;
330 } else {
331 let mut first: bool = true;
332 for script: Script in self.iter() {
333 if !first {
334 write!(f, " + ")?;
335 first = false;
336 }
337 script.full_name().fmt(f)?;
338 }
339 }
340 Ok(())
341 }
342}
343
/// Extension trait on `char` for calculating script properties
///
/// This file implements it for `char`; bring the trait into scope to call
/// `c.script()` or `c.script_extension()` on a character.
pub trait UnicodeScript {
    /// Get the [`Script`] for a given character
    fn script(&self) -> Script;
    /// Get the Script_Extension (as a [`ScriptExtension`]) for a given character
    fn script_extension(&self) -> ScriptExtension;
}
351
352impl UnicodeScript for char {
353 fn script(&self) -> Script {
354 get_script(*self).unwrap_or(default:Script::Unknown)
355 }
356
357 fn script_extension(&self) -> ScriptExtension {
358 get_script_extension(*self).unwrap_or_else(|| self.script().into())
359 }
360}
361
/// Iterator over scripts in a [ScriptExtension].
///
/// Can be obtained via [ScriptExtension::iter()]
pub struct ScriptIterator {
    // The scripts that have not been yielded yet
    ext: ScriptExtension,
}
368
369impl Iterator for ScriptIterator {
370 type Item = Script;
371
372 fn next(&mut self) -> Option<Script> {
373 if self.ext.is_common_or_inherited() {
374 let common = self.ext.common;
375 self.ext = ScriptExtension::new_unknown();
376 if common {
377 Some(Script::Common)
378 } else {
379 Some(Script::Inherited)
380 }
381 // Are there bits left in the first chunk?
382 } else if self.ext.first != 0 {
383 // Find the next bit
384 let bit = self.ext.first.trailing_zeros();
385 // unset just that bit
386 self.ext.first &= !(1 << bit);
387 Some(Script::for_integer(bit as u8))
388 // Are there bits left in the second chunk?
389 } else if self.ext.second != 0 {
390 let bit = self.ext.second.trailing_zeros();
391 self.ext.second &= !(1 << bit);
392 Some(Script::for_integer(64 + bit as u8))
393 // Are there bits left in the third chunk?
394 } else if self.ext.third != 0 {
395 let bit = self.ext.third.trailing_zeros();
396 self.ext.third &= !(1 << bit);
397 Some(Script::for_integer(128 + bit as u8))
398 } else {
399 // Script::Unknown
400 None
401 }
402 }
403}
404
#[cfg(test)]
mod tests {
    use crate::*;
    use std::collections::HashSet;
    use std::convert::TryInto;

    #[cfg(feature = "bench")]
    use test::bench::Bencher;
    #[cfg(feature = "bench")]
    extern crate test;

    // Sample text shared by `test_specific` and `bench_string_ext`.
    const DEVANAGARI_SAMPLE: &str = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.";

    /// Every script number below `NEXT_SCRIPT` must map to a distinct script and a
    /// distinct single-script extension, and convert back losslessly.
    #[test]
    fn test_conversion() {
        let mut seen_scripts = HashSet::new();
        let mut seen_exts = HashSet::new();
        for bit in 0..NEXT_SCRIPT {
            let script = Script::for_integer(bit);
            let ext = ScriptExtension::from(script);
            if !seen_scripts.insert(script) {
                panic!("Found script {:?} twice!", script)
            }
            if !seen_exts.insert(ext) {
                panic!("Found extension {:?} twice!", ext)
            }
            assert_eq!(script as u8, bit);

            let common = ScriptExtension::new_common();
            let inherited = ScriptExtension::new_inherited();
            let unknown = ScriptExtension::new_unknown();
            assert!(!common.intersection(ext).is_empty());
            assert!(!inherited.intersection(ext).is_empty());
            assert!(unknown.intersection(ext).is_empty());

            let listed: Vec<Script> = ext.iter().collect();
            assert_eq!(listed, vec![script]);
            let round_trip: Result<Script, ()> = ext.try_into();
            assert_eq!(Ok(script), round_trip);
        }
    }

    /// The sample text resolves to Devanagari and interacts as expected with a
    /// multi-script extension.
    #[test]
    fn test_specific() {
        let multi = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH;

        let ext = ScriptExtension::for_str(DEVANAGARI_SAMPLE);
        assert_eq!(ext, script_extensions::DEVA);
        println!("{:?}", multi);
        println!("{:?}", ext.intersection(multi));
        assert!(!ext.intersection(multi).is_empty());

        let u = ext.union(Script::Dogra.into());
        assert_eq!(u.intersection(multi), u);
    }

    /// `contains_script` must agree with iteration for a multi-script extension, and
    /// such an extension must not convert into a single script.
    #[test]
    fn test_specific_ext() {
        let ext = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH;

        let all: HashSet<_> = ext.iter().collect();

        for bit in 0..NEXT_SCRIPT {
            let script = Script::for_integer(bit);
            assert_eq!(ext.contains_script(script), all.contains(&script));
        }

        let members = [
            Script::Devanagari,
            Script::Dogra,
            Script::Gujarati,
            Script::Gurmukhi,
            Script::Khojki,
            Script::Kaithi,
            Script::Mahajani,
            Script::Modi,
            Script::Khudawadi,
            Script::Takri,
            Script::Tirhuta,
        ];
        for &script in members.iter() {
            assert!(ext.contains_script(script));
        }

        let scr: Result<Script, _> = ext.try_into();
        assert!(scr.is_err());
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_intersection(b: &mut Bencher) {
        b.iter(|| {
            let script = test::black_box(Script::Devanagari);
            let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            let single = ScriptExtension::from(script);
            test::black_box(ext.intersection(single));
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_to_script(b: &mut Bencher) {
        let ext = ScriptExtension::from(Script::Devanagari);
        b.iter(|| {
            let ext = test::black_box(ext);
            let script: Result<Script, _> = ext.try_into();
            let _ = test::black_box(script);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_to_ext(b: &mut Bencher) {
        b.iter(|| {
            let script = test::black_box(Script::Devanagari);
            let ext = ScriptExtension::from(script);
            test::black_box(ext);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_intersection(b: &mut Bencher) {
        b.iter(|| {
            let e1 = test::black_box(script_extensions::ARAB_ROHG_SYRC_THAA_YEZI);
            let e2 = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            test::black_box(e2.intersection(e1));
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_to_vec(b: &mut Bencher) {
        b.iter(|| {
            let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            let scripts: Vec<_> = ext.iter().collect();
            test::black_box(scripts);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_string_ext(b: &mut Bencher) {
        b.iter(|| {
            let s = test::black_box(DEVANAGARI_SAMPLE);
            test::black_box(ScriptExtension::for_str(s));
        })
    }
}
561