1 | use crate::lookups::canonical_combining_class; |
2 | use crate::stream_safe; |
3 | use crate::tables; |
4 | use crate::UnicodeNormalization; |
5 | |
/// Outcome of the QuickCheck algorithm.
///
/// QuickCheck can often decide, without allocating, whether a text is or is
/// not normalized. When it cannot decide, it answers `Maybe`, meaning a full
/// decomposition and recomposition is needed to settle the question.
#[derive(Debug, Eq, PartialEq)]
pub enum IsNormalized {
    /// Definitely normalized; no further work is needed.
    Yes,
    /// Definitely not normalized.
    No,
    /// Inconclusive: the text may or may not be normalized.
    Maybe,
}
18 | |
19 | // https://unicode.org/reports/tr15/#Detecting_Normalization_Forms |
20 | #[inline ] |
21 | fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized |
22 | where |
23 | I: Iterator<Item = char>, |
24 | F: Fn(char) -> IsNormalized, |
25 | { |
26 | let mut last_cc = 0u8; |
27 | let mut nonstarter_count = 0; |
28 | let mut result = IsNormalized::Yes; |
29 | for ch in s { |
30 | // For ASCII we know it's always allowed and a starter |
31 | if ch <= ' \x7f' { |
32 | last_cc = 0; |
33 | nonstarter_count = 0; |
34 | continue; |
35 | } |
36 | |
37 | // Otherwise, lookup the combining class and QC property |
38 | let cc = canonical_combining_class(ch); |
39 | if last_cc > cc && cc != 0 { |
40 | return IsNormalized::No; |
41 | } |
42 | match is_allowed(ch) { |
43 | IsNormalized::Yes => (), |
44 | IsNormalized::No => return IsNormalized::No, |
45 | IsNormalized::Maybe => { |
46 | result = IsNormalized::Maybe; |
47 | } |
48 | } |
49 | if stream_safe { |
50 | let decomp = stream_safe::classify_nonstarters(ch); |
51 | |
52 | // If we're above `MAX_NONSTARTERS`, we're definitely *not* |
53 | // stream-safe normalized. |
54 | if nonstarter_count + decomp.leading_nonstarters > stream_safe::MAX_NONSTARTERS { |
55 | return IsNormalized::No; |
56 | } |
57 | if decomp.leading_nonstarters == decomp.decomposition_len { |
58 | nonstarter_count += decomp.decomposition_len; |
59 | } else { |
60 | nonstarter_count = decomp.trailing_nonstarters; |
61 | } |
62 | } |
63 | last_cc = cc; |
64 | } |
65 | result |
66 | } |
67 | |
68 | /// Quickly check if a string is in NFC, potentially returning |
69 | /// `IsNormalized::Maybe` if further checks are necessary. In this case a check |
70 | /// like `s.chars().nfc().eq(s.chars())` should suffice. |
71 | #[inline ] |
72 | pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
73 | quick_check(s, is_allowed:tables::qc_nfc, stream_safe:false) |
74 | } |
75 | |
76 | /// Quickly check if a string is in NFKC. |
77 | #[inline ] |
78 | pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
79 | quick_check(s, is_allowed:tables::qc_nfkc, stream_safe:false) |
80 | } |
81 | |
82 | /// Quickly check if a string is in NFD. |
83 | #[inline ] |
84 | pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
85 | quick_check(s, is_allowed:tables::qc_nfd, stream_safe:false) |
86 | } |
87 | |
88 | /// Quickly check if a string is in NFKD. |
89 | #[inline ] |
90 | pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
91 | quick_check(s, is_allowed:tables::qc_nfkd, stream_safe:false) |
92 | } |
93 | |
94 | /// Quickly check if a string is Stream-Safe NFC. |
95 | #[inline ] |
96 | pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
97 | quick_check(s, is_allowed:tables::qc_nfc, stream_safe:true) |
98 | } |
99 | |
100 | /// Quickly check if a string is Stream-Safe NFD. |
101 | #[inline ] |
102 | pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized { |
103 | quick_check(s, is_allowed:tables::qc_nfd, stream_safe:true) |
104 | } |
105 | |
106 | /// Authoritatively check if a string is in NFC. |
107 | #[inline ] |
108 | pub fn is_nfc(s: &str) -> bool { |
109 | match is_nfc_quick(s.chars()) { |
110 | IsNormalized::Yes => true, |
111 | IsNormalized::No => false, |
112 | IsNormalized::Maybe => s.chars().eq(s.chars().nfc()), |
113 | } |
114 | } |
115 | |
116 | /// Authoritatively check if a string is in NFKC. |
117 | #[inline ] |
118 | pub fn is_nfkc(s: &str) -> bool { |
119 | match is_nfkc_quick(s.chars()) { |
120 | IsNormalized::Yes => true, |
121 | IsNormalized::No => false, |
122 | IsNormalized::Maybe => s.chars().eq(s.chars().nfkc()), |
123 | } |
124 | } |
125 | |
126 | /// Authoritatively check if a string is in NFD. |
127 | #[inline ] |
128 | pub fn is_nfd(s: &str) -> bool { |
129 | match is_nfd_quick(s.chars()) { |
130 | IsNormalized::Yes => true, |
131 | IsNormalized::No => false, |
132 | IsNormalized::Maybe => s.chars().eq(s.chars().nfd()), |
133 | } |
134 | } |
135 | |
136 | /// Authoritatively check if a string is in NFKD. |
137 | #[inline ] |
138 | pub fn is_nfkd(s: &str) -> bool { |
139 | match is_nfkd_quick(s.chars()) { |
140 | IsNormalized::Yes => true, |
141 | IsNormalized::No => false, |
142 | IsNormalized::Maybe => s.chars().eq(s.chars().nfkd()), |
143 | } |
144 | } |
145 | |
146 | /// Authoritatively check if a string is Stream-Safe NFC. |
147 | #[inline ] |
148 | pub fn is_nfc_stream_safe(s: &str) -> bool { |
149 | match is_nfc_stream_safe_quick(s.chars()) { |
150 | IsNormalized::Yes => true, |
151 | IsNormalized::No => false, |
152 | IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfc()), |
153 | } |
154 | } |
155 | |
156 | /// Authoritatively check if a string is Stream-Safe NFD. |
157 | #[inline ] |
158 | pub fn is_nfd_stream_safe(s: &str) -> bool { |
159 | match is_nfd_stream_safe_quick(s.chars()) { |
160 | IsNormalized::Yes => true, |
161 | IsNormalized::No => false, |
162 | IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfd()), |
163 | } |
164 | } |
165 | |
#[cfg(test)]
mod tests {
    use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick, IsNormalized};

    /// Stream-Safe NFD quick check: 30 consecutive combining marks are
    /// accepted, while the same run with U+031E added (31 marks) is rejected.
    #[test]
    fn test_stream_safe_nfd() {
        let cases = [
            (
                "Da \u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone",
                IsNormalized::Yes,
            ),
            (
                "Da \u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone",
                IsNormalized::No,
            ),
        ];
        for (text, expected) in cases.iter() {
            assert_eq!(&is_nfd_stream_safe_quick(text.chars()), expected);
        }
    }

    /// Stream-Safe NFC quick check: U+00E0 followed by 29 combining marks is
    /// inconclusive (`Maybe`), while the same text with U+031E added is
    /// rejected outright.
    #[test]
    fn test_stream_safe_nfc() {
        let cases = [
            (
                "ok \u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y",
                IsNormalized::Maybe,
            ),
            (
                "not ok \u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y",
                IsNormalized::No,
            ),
        ];
        for (text, expected) in cases.iter() {
            assert_eq!(&is_nfc_stream_safe_quick(text.chars()), expected);
        }
    }
}
188 | |