| 1 | //! [Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection) |
| 2 | |
| 3 | use core::fmt::{self, Debug}; |
| 4 | use unicode_script::{Script, ScriptExtension}; |
| 5 | |
| 6 | /// An Augmented script set, as defined by UTS 39 |
| 7 | /// |
| 8 | /// https://www.unicode.org/reports/tr39/#def-augmented-script-set |
| 9 | #[derive (Copy, Clone, PartialEq, Hash, Eq)] |
| 10 | pub struct AugmentedScriptSet { |
| 11 | /// The base ScriptExtension value |
| 12 | pub base: ScriptExtension, |
| 13 | /// Han With Bopomofo |
| 14 | pub hanb: bool, |
| 15 | /// Japanese |
| 16 | pub jpan: bool, |
| 17 | /// Korean |
| 18 | pub kore: bool, |
| 19 | } |
| 20 | |
| 21 | impl From<ScriptExtension> for AugmentedScriptSet { |
| 22 | fn from(ext: ScriptExtension) -> Self { |
| 23 | let mut hanb = false; |
| 24 | let mut jpan = false; |
| 25 | let mut kore = false; |
| 26 | |
| 27 | if ext.is_common() || ext.is_inherited() || ext.contains_script(Script::Han) { |
| 28 | hanb = true; |
| 29 | jpan = true; |
| 30 | kore = true; |
| 31 | } else { |
| 32 | if ext.contains_script(Script::Hiragana) || ext.contains_script(Script::Katakana) { |
| 33 | jpan = true; |
| 34 | } |
| 35 | |
| 36 | if ext.contains_script(Script::Hangul) { |
| 37 | kore = true; |
| 38 | } |
| 39 | |
| 40 | if ext.contains_script(Script::Bopomofo) { |
| 41 | hanb = true; |
| 42 | } |
| 43 | } |
| 44 | Self { |
| 45 | base: ext, |
| 46 | hanb, |
| 47 | jpan, |
| 48 | kore, |
| 49 | } |
| 50 | } |
| 51 | } |
| 52 | |
| 53 | impl From<char> for AugmentedScriptSet { |
| 54 | fn from(c: char) -> Self { |
| 55 | AugmentedScriptSet::for_char(c) |
| 56 | } |
| 57 | } |
| 58 | |
| 59 | impl From<&'_ str> for AugmentedScriptSet { |
| 60 | fn from(s: &'_ str) -> Self { |
| 61 | AugmentedScriptSet::for_str(s) |
| 62 | } |
| 63 | } |
| 64 | |
| 65 | impl Default for AugmentedScriptSet { |
| 66 | fn default() -> Self { |
| 67 | AugmentedScriptSet { |
| 68 | base: Script::Common.into(), |
| 69 | hanb: true, |
| 70 | jpan: true, |
| 71 | kore: true, |
| 72 | } |
| 73 | } |
| 74 | } |
| 75 | |
| 76 | impl Debug for AugmentedScriptSet { |
| 77 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 78 | if self.is_empty() { |
| 79 | write!(f, "AugmentedScriptSet {{∅ }}" )?; |
| 80 | } else if self.is_all() { |
| 81 | write!(f, "AugmentedScriptSet {{ALL }}" )?; |
| 82 | } else { |
| 83 | write!(f, "AugmentedScriptSet {{" )?; |
| 84 | let mut first_entry = true; |
| 85 | let hanb = if self.hanb { Some("Hanb" ) } else { None }; |
| 86 | let jpan = if self.jpan { Some("Jpan" ) } else { None }; |
| 87 | let kore = if self.kore { Some("Kore" ) } else { None }; |
| 88 | for writing_system in None |
| 89 | .into_iter() |
| 90 | .chain(hanb) |
| 91 | .chain(jpan) |
| 92 | .chain(kore) |
| 93 | .chain(self.base.iter().map(Script::short_name)) |
| 94 | { |
| 95 | if !first_entry { |
| 96 | write!(f, ", " )?; |
| 97 | } else { |
| 98 | first_entry = false; |
| 99 | } |
| 100 | write!(f, " {}" , writing_system)?; |
| 101 | } |
| 102 | write!(f, " }}" )?; |
| 103 | } |
| 104 | Ok(()) |
| 105 | } |
| 106 | } |
| 107 | |
| 108 | impl fmt::Display for AugmentedScriptSet { |
| 109 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 110 | if self.is_empty() { |
| 111 | write!(f, "Empty" )?; |
| 112 | } else if self.is_all() { |
| 113 | write!(f, "All" )?; |
| 114 | } else { |
| 115 | let mut first_entry = true; |
| 116 | let hanb = if self.hanb { |
| 117 | Some("Han with Bopomofo" ) |
| 118 | } else { |
| 119 | None |
| 120 | }; |
| 121 | let jpan = if self.jpan { Some("Japanese" ) } else { None }; |
| 122 | let kore = if self.kore { Some("Korean" ) } else { None }; |
| 123 | for writing_system in None |
| 124 | .into_iter() |
| 125 | .chain(hanb) |
| 126 | .chain(jpan) |
| 127 | .chain(kore) |
| 128 | .chain(self.base.iter().map(Script::full_name)) |
| 129 | { |
| 130 | if !first_entry { |
| 131 | write!(f, ", " )?; |
| 132 | } else { |
| 133 | first_entry = false; |
| 134 | } |
| 135 | write!(f, " {}" , writing_system)?; |
| 136 | } |
| 137 | } |
| 138 | Ok(()) |
| 139 | } |
| 140 | } |
| 141 | |
| 142 | impl AugmentedScriptSet { |
| 143 | /// Intersect this set with another |
| 144 | pub fn intersect_with(&mut self, other: Self) { |
| 145 | self.base.intersect_with(other.base); |
| 146 | self.hanb = self.hanb && other.hanb; |
| 147 | self.jpan = self.jpan && other.jpan; |
| 148 | self.kore = self.kore && other.kore; |
| 149 | } |
| 150 | |
| 151 | /// Check if the set is empty |
| 152 | pub fn is_empty(&self) -> bool { |
| 153 | self.base.is_empty() && !self.hanb && !self.jpan && !self.kore |
| 154 | } |
| 155 | |
| 156 | /// Check if the set is "All" (Common or Inherited) |
| 157 | pub fn is_all(&self) -> bool { |
| 158 | self.base.is_common() || self.base.is_inherited() |
| 159 | } |
| 160 | |
| 161 | /// Construct an AugmentedScriptSet for a given character |
| 162 | pub fn for_char(c: char) -> Self { |
| 163 | ScriptExtension::from(c).into() |
| 164 | } |
| 165 | |
| 166 | /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string |
| 167 | pub fn for_str(s: &str) -> Self { |
| 168 | let mut set = AugmentedScriptSet::default(); |
| 169 | for ch in s.chars() { |
| 170 | set.intersect_with(ch.into()) |
| 171 | } |
| 172 | set |
| 173 | } |
| 174 | } |
| 175 | |
/// Extension trait for [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
///
/// Both methods take `self` by value.
pub trait MixedScript {
    /// Check if a string is [single-script](https://www.unicode.org/reports/tr39/#def-single-script)
    ///
    /// Note that a single-script string may still contain multiple Script properties!
    fn is_single_script(self) -> bool;

    /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
    ///
    /// The result is returned as an [`AugmentedScriptSet`].
    fn resolve_script_set(self) -> AugmentedScriptSet;
}
| 186 | |
| 187 | impl MixedScript for &'_ str { |
| 188 | fn is_single_script(self) -> bool { |
| 189 | !AugmentedScriptSet::for_str(self).is_empty() |
| 190 | } |
| 191 | |
| 192 | fn resolve_script_set(self) -> AugmentedScriptSet { |
| 193 | self.into() |
| 194 | } |
| 195 | } |
| 196 | |
| 197 | /// Check if a character is considered potential mixed script confusable. |
| 198 | /// |
| 199 | /// If the specified character is not restricted from use for identifiers, |
| 200 | /// this function returns whether it is considered mixed script confusable |
| 201 | /// with another character that is not restricted from use for identifiers. |
| 202 | /// |
| 203 | /// If the specified character is restricted from use for identifiers, |
| 204 | /// the return value is unspecified. |
| 205 | pub fn is_potential_mixed_script_confusable_char(c: char) -> bool { |
| 206 | use crate::tables::potential_mixed_script_confusable::potential_mixed_script_confusable; |
| 207 | |
| 208 | potential_mixed_script_confusable(c) |
| 209 | } |
| 210 | |