1 | //! [Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection) |
2 | |
3 | use core::fmt::{self, Debug}; |
4 | use unicode_script::{Script, ScriptExtension}; |
5 | |
6 | /// An Augmented script set, as defined by UTS 39 |
7 | /// |
8 | /// https://www.unicode.org/reports/tr39/#def-augmented-script-set |
9 | #[derive (Copy, Clone, PartialEq, Hash, Eq)] |
10 | pub struct AugmentedScriptSet { |
11 | /// The base ScriptExtension value |
12 | pub base: ScriptExtension, |
13 | /// Han With Bopomofo |
14 | pub hanb: bool, |
15 | /// Japanese |
16 | pub jpan: bool, |
17 | /// Korean |
18 | pub kore: bool, |
19 | } |
20 | |
21 | impl From<ScriptExtension> for AugmentedScriptSet { |
22 | fn from(ext: ScriptExtension) -> Self { |
23 | let mut hanb = false; |
24 | let mut jpan = false; |
25 | let mut kore = false; |
26 | |
27 | if ext.is_common() || ext.is_inherited() || ext.contains_script(Script::Han) { |
28 | hanb = true; |
29 | jpan = true; |
30 | kore = true; |
31 | } else { |
32 | if ext.contains_script(Script::Hiragana) || ext.contains_script(Script::Katakana) { |
33 | jpan = true; |
34 | } |
35 | |
36 | if ext.contains_script(Script::Hangul) { |
37 | kore = true; |
38 | } |
39 | |
40 | if ext.contains_script(Script::Bopomofo) { |
41 | hanb = true; |
42 | } |
43 | } |
44 | Self { |
45 | base: ext, |
46 | hanb, |
47 | jpan, |
48 | kore, |
49 | } |
50 | } |
51 | } |
52 | |
53 | impl From<char> for AugmentedScriptSet { |
54 | fn from(c: char) -> Self { |
55 | AugmentedScriptSet::for_char(c) |
56 | } |
57 | } |
58 | |
59 | impl From<&'_ str> for AugmentedScriptSet { |
60 | fn from(s: &'_ str) -> Self { |
61 | AugmentedScriptSet::for_str(s) |
62 | } |
63 | } |
64 | |
65 | impl Default for AugmentedScriptSet { |
66 | fn default() -> Self { |
67 | AugmentedScriptSet { |
68 | base: Script::Common.into(), |
69 | hanb: true, |
70 | jpan: true, |
71 | kore: true, |
72 | } |
73 | } |
74 | } |
75 | |
76 | impl Debug for AugmentedScriptSet { |
77 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
78 | if self.is_empty() { |
79 | write!(f, "AugmentedScriptSet {{∅ }}" )?; |
80 | } else if self.is_all() { |
81 | write!(f, "AugmentedScriptSet {{ALL }}" )?; |
82 | } else { |
83 | write!(f, "AugmentedScriptSet {{" )?; |
84 | let mut first_entry = true; |
85 | let hanb = if self.hanb { Some("Hanb" ) } else { None }; |
86 | let jpan = if self.jpan { Some("Jpan" ) } else { None }; |
87 | let kore = if self.kore { Some("Kore" ) } else { None }; |
88 | for writing_system in None |
89 | .into_iter() |
90 | .chain(hanb) |
91 | .chain(jpan) |
92 | .chain(kore) |
93 | .chain(self.base.iter().map(Script::short_name)) |
94 | { |
95 | if !first_entry { |
96 | write!(f, ", " )?; |
97 | } else { |
98 | first_entry = false; |
99 | } |
100 | write!(f, " {}" , writing_system)?; |
101 | } |
102 | write!(f, " }}" )?; |
103 | } |
104 | Ok(()) |
105 | } |
106 | } |
107 | |
108 | impl fmt::Display for AugmentedScriptSet { |
109 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
110 | if self.is_empty() { |
111 | write!(f, "Empty" )?; |
112 | } else if self.is_all() { |
113 | write!(f, "All" )?; |
114 | } else { |
115 | let mut first_entry = true; |
116 | let hanb = if self.hanb { |
117 | Some("Han with Bopomofo" ) |
118 | } else { |
119 | None |
120 | }; |
121 | let jpan = if self.jpan { Some("Japanese" ) } else { None }; |
122 | let kore = if self.kore { Some("Korean" ) } else { None }; |
123 | for writing_system in None |
124 | .into_iter() |
125 | .chain(hanb) |
126 | .chain(jpan) |
127 | .chain(kore) |
128 | .chain(self.base.iter().map(Script::full_name)) |
129 | { |
130 | if !first_entry { |
131 | write!(f, ", " )?; |
132 | } else { |
133 | first_entry = false; |
134 | } |
135 | write!(f, " {}" , writing_system)?; |
136 | } |
137 | } |
138 | Ok(()) |
139 | } |
140 | } |
141 | |
142 | impl AugmentedScriptSet { |
143 | /// Intersect this set with another |
144 | pub fn intersect_with(&mut self, other: Self) { |
145 | self.base.intersect_with(other.base); |
146 | self.hanb = self.hanb && other.hanb; |
147 | self.jpan = self.jpan && other.jpan; |
148 | self.kore = self.kore && other.kore; |
149 | } |
150 | |
151 | /// Check if the set is empty |
152 | pub fn is_empty(&self) -> bool { |
153 | self.base.is_empty() && !self.hanb && !self.jpan && !self.kore |
154 | } |
155 | |
156 | /// Check if the set is "All" (Common or Inherited) |
157 | pub fn is_all(&self) -> bool { |
158 | self.base.is_common() || self.base.is_inherited() |
159 | } |
160 | |
161 | /// Construct an AugmentedScriptSet for a given character |
162 | pub fn for_char(c: char) -> Self { |
163 | ScriptExtension::from(c).into() |
164 | } |
165 | |
166 | /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string |
167 | pub fn for_str(s: &str) -> Self { |
168 | let mut set = AugmentedScriptSet::default(); |
169 | for ch in s.chars() { |
170 | set.intersect_with(ch.into()) |
171 | } |
172 | set |
173 | } |
174 | } |
175 | |
/// Extension trait for [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
///
/// Both methods take `self` by value. This module implements the trait for
/// `&str`, so the methods can be called directly on string slices.
pub trait MixedScript {
    /// Check if a string is [single-script](https://www.unicode.org/reports/tr39/#def-single-script)
    ///
    /// Note that a single-script string may still contain multiple Script properties!
    ///
    /// The `&str` implementation in this module returns `true` exactly when
    /// the string's resolved script set is not empty.
    fn is_single_script(self) -> bool;

    /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
    ///
    /// The result is returned as an [`AugmentedScriptSet`].
    fn resolve_script_set(self) -> AugmentedScriptSet;
}
186 | |
187 | impl MixedScript for &'_ str { |
188 | fn is_single_script(self) -> bool { |
189 | !AugmentedScriptSet::for_str(self).is_empty() |
190 | } |
191 | |
192 | fn resolve_script_set(self) -> AugmentedScriptSet { |
193 | self.into() |
194 | } |
195 | } |
196 | |
197 | /// Check if a character is considered potential mixed script confusable. |
198 | /// |
199 | /// If the specified character is not restricted from use for identifiers, |
200 | /// this function returns whether it is considered mixed script confusable |
201 | /// with another character that is not restricted from use for identifiers. |
202 | /// |
203 | /// If the specified character is restricted from use for identifiers, |
204 | /// the return value is unspecified. |
205 | pub fn is_potential_mixed_script_confusable_char(c: char) -> bool { |
206 | use crate::tables::potential_mixed_script_confusable::potential_mixed_script_confusable; |
207 | |
208 | potential_mixed_script_confusable(c) |
209 | } |
210 | |