1 | use crate::lookups::canonical_combining_class; |
2 | use crate::stream_safe; |
3 | use crate::tables; |
4 | use crate::UnicodeNormalization; |
5 | |
/// The verdict of a normalization QuickCheck, which may be `Maybe`.
///
/// The QuickCheck algorithm can quickly determine if a text is or isn't
/// normalized without any allocations in many cases, but it has to be able to
/// return `Maybe` when a full decomposition and recomposition is necessary.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum IsNormalized {
    /// The text is definitely normalized.
    Yes,
    /// The text is definitely not normalized.
    No,
    /// The text may be normalized; the quick check alone could not decide.
    Maybe,
}
21 | |
22 | // https://unicode.org/reports/tr15/#Detecting_Normalization_Forms |
23 | #[inline ] |
24 | fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized |
25 | where |
26 | I: Iterator<Item = char>, |
27 | F: Fn(char) -> IsNormalized, |
28 | { |
29 | let mut last_cc = 0u8; |
30 | let mut nonstarter_count = 0; |
31 | let mut result = IsNormalized::Yes; |
32 | for ch in s { |
33 | // For ASCII we know it's always allowed and a starter |
34 | if ch <= ' \x7f' { |
35 | last_cc = 0; |
36 | nonstarter_count = 0; |
37 | continue; |
38 | } |
39 | |
40 | // Otherwise, lookup the combining class and QC property |
41 | let cc = canonical_combining_class(ch); |
42 | if last_cc > cc && cc != 0 { |
43 | return IsNormalized::No; |
44 | } |
45 | match is_allowed(ch) { |
46 | IsNormalized::Yes => (), |
47 | IsNormalized::No => return IsNormalized::No, |
48 | IsNormalized::Maybe => { |
49 | result = IsNormalized::Maybe; |
50 | } |
51 | } |
52 | if stream_safe { |
53 | let decomp = stream_safe::classify_nonstarters(ch); |
54 | |
55 | // If we're above `MAX_NONSTARTERS`, we're definitely *not* |
56 | // stream-safe normalized. |
57 | if nonstarter_count + decomp.leading_nonstarters > stream_safe::MAX_NONSTARTERS { |
58 | return IsNormalized::No; |
59 | } |
60 | if decomp.leading_nonstarters == decomp.decomposition_len { |
61 | nonstarter_count += decomp.decomposition_len; |
62 | } else { |
63 | nonstarter_count = decomp.trailing_nonstarters; |
64 | } |
65 | } |
66 | last_cc = cc; |
67 | } |
68 | result |
69 | } |
70 | |
71 | /// Quickly check if a string is in NFC, potentially returning |
72 | /// `IsNormalized::Maybe` if further checks are necessary. In this case a check |
73 | /// like `s.chars().nfc().eq(s.chars())` should suffice. |
74 | #[inline ] |
75 | pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
76 | quick_check(s, is_allowed:tables::qc_nfc, stream_safe:false) |
77 | } |
78 | |
79 | /// Quickly check if a string is in NFKC. |
80 | #[inline ] |
81 | pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
82 | quick_check(s, is_allowed:tables::qc_nfkc, stream_safe:false) |
83 | } |
84 | |
85 | /// Quickly check if a string is in NFD. |
86 | #[inline ] |
87 | pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
88 | quick_check(s, is_allowed:tables::qc_nfd, stream_safe:false) |
89 | } |
90 | |
91 | /// Quickly check if a string is in NFKD. |
92 | #[inline ] |
93 | pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
94 | quick_check(s, is_allowed:tables::qc_nfkd, stream_safe:false) |
95 | } |
96 | |
97 | /// Quickly check if a string is Stream-Safe NFC. |
98 | #[inline ] |
99 | pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
100 | quick_check(s, is_allowed:tables::qc_nfc, stream_safe:true) |
101 | } |
102 | |
103 | /// Quickly check if a string is Stream-Safe NFD. |
104 | #[inline ] |
105 | pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
106 | quick_check(s, is_allowed:tables::qc_nfd, stream_safe:true) |
107 | } |
108 | |
109 | /// Authoritatively check if a string is in NFC. |
110 | #[inline ] |
111 | pub fn is_nfc(s: &str) -> bool { |
112 | match is_nfc_quick(s.chars()) { |
113 | IsNormalized::Yes => true, |
114 | IsNormalized::No => false, |
115 | IsNormalized::Maybe => s.chars().eq(s.chars().nfc()), |
116 | } |
117 | } |
118 | |
119 | /// Authoritatively check if a string is in NFKC. |
120 | #[inline ] |
121 | pub fn is_nfkc(s: &str) -> bool { |
122 | match is_nfkc_quick(s.chars()) { |
123 | IsNormalized::Yes => true, |
124 | IsNormalized::No => false, |
125 | IsNormalized::Maybe => s.chars().eq(s.chars().nfkc()), |
126 | } |
127 | } |
128 | |
129 | /// Authoritatively check if a string is in NFD. |
130 | #[inline ] |
131 | pub fn is_nfd(s: &str) -> bool { |
132 | match is_nfd_quick(s.chars()) { |
133 | IsNormalized::Yes => true, |
134 | IsNormalized::No => false, |
135 | IsNormalized::Maybe => s.chars().eq(s.chars().nfd()), |
136 | } |
137 | } |
138 | |
139 | /// Authoritatively check if a string is in NFKD. |
140 | #[inline ] |
141 | pub fn is_nfkd(s: &str) -> bool { |
142 | match is_nfkd_quick(s.chars()) { |
143 | IsNormalized::Yes => true, |
144 | IsNormalized::No => false, |
145 | IsNormalized::Maybe => s.chars().eq(s.chars().nfkd()), |
146 | } |
147 | } |
148 | |
149 | /// Authoritatively check if a string is Stream-Safe NFC. |
150 | #[inline ] |
151 | pub fn is_nfc_stream_safe(s: &str) -> bool { |
152 | match is_nfc_stream_safe_quick(s.chars()) { |
153 | IsNormalized::Yes => true, |
154 | IsNormalized::No => false, |
155 | IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfc()), |
156 | } |
157 | } |
158 | |
159 | /// Authoritatively check if a string is Stream-Safe NFD. |
160 | #[inline ] |
161 | pub fn is_nfd_stream_safe(s: &str) -> bool { |
162 | match is_nfd_stream_safe_quick(s.chars()) { |
163 | IsNormalized::Yes => true, |
164 | IsNormalized::No => false, |
165 | IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfd()), |
166 | } |
167 | } |
168 | |
#[cfg(test)]
mod tests {
    use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick, IsNormalized};

    /// The stream-safe NFD quick check accepts one long run of combining marks
    /// and rejects the same run with U+031E added.
    #[test]
    fn test_stream_safe_nfd() {
        let within_limit = "Da \
            \u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\
            \u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\
            \u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\
            \u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}\
            ngerzone";
        assert_eq!(
            is_nfd_stream_safe_quick(within_limit.chars()),
            IsNormalized::Yes
        );

        // Identical except for the extra U+031E after U+031D.
        let over_limit = "Da \
            \u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\
            \u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\
            \u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\
            \u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}\
            ngerzone";
        assert_eq!(
            is_nfd_stream_safe_quick(over_limit.chars()),
            IsNormalized::No
        );
    }

    /// The stream-safe NFC quick check answers `Maybe` for a run of combining
    /// marks after U+00E0 and `No` for the same run with U+031E added.
    #[test]
    fn test_stream_safe_nfc() {
        let within_limit = "ok \
            \u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\
            \u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\
            \u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\
            \u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}\
            y";
        assert_eq!(
            is_nfc_stream_safe_quick(within_limit.chars()),
            IsNormalized::Maybe
        );

        // Identical run except for the extra U+031E after U+031D.
        let over_limit = "not ok \
            \u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\
            \u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\
            \u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\
            \u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}\
            y";
        assert_eq!(
            is_nfc_stream_safe_quick(over_limit.chars()),
            IsNormalized::No
        );
    }
}
191 | |