use core::iter::FusedIterator;

use crate::lookups::{
    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
    stream_safe_trailing_nonstarters,
};
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
use crate::tables::stream_safe_leading_nonstarters;

pub(crate) const MAX_NONSTARTERS: usize = 30;
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';

/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
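///
/// # Example
///
/// An illustrative sketch, not taken from this module: it assumes the crate's
/// public `UnicodeNormalization` trait exposes this iterator through a
/// `stream_safe()` adapter, which is why the block is marked `ignore`.
///
/// ```ignore
/// use unicode_normalization::UnicodeNormalization;
///
/// // 40 consecutive copies of U+0300 exceed the 30-nonstarter limit,
/// // so a CGJ (U+034F) is emitted before the 31st one.
/// let out: String = "\u{0300}".repeat(40).chars().stream_safe().collect();
/// assert!(out.contains('\u{034F}'));
/// ```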
pub struct StreamSafe<I> {
    iter: I,
    nonstarter_count: usize,
    buffer: Option<char>,
}

impl<I> StreamSafe<I> {
    pub(crate) fn new(iter: I) -> Self {
        Self {
            iter,
            nonstarter_count: 0,
            buffer: None,
        }
    }
}

impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
            None => return None,
            Some(c) => c,
        };
        let d = classify_nonstarters(next_ch);
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
            // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
            // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
            // iterator (via `self.buffer`), and we'll reclassify it next iteration.
            self.nonstarter_count = 0;
            self.buffer = Some(next_ch);
            return Some(COMBINING_GRAPHEME_JOINER);
        }

        // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
        // nonstarters in NFKD.
        if d.leading_nonstarters == d.decomposition_len {
            self.nonstarter_count += d.decomposition_len;
        }
        // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
        else {
            self.nonstarter_count = d.trailing_nonstarters;
        }
        Some(next_ch)
    }
}

impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StreamSafe<I> {}

#[derive(Debug)]
pub(crate) struct Decomposition {
    pub(crate) leading_nonstarters: usize,
    pub(crate) trailing_nonstarters: usize,
    pub(crate) decomposition_len: usize,
}

#[inline]
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
    // As usual, fast path for ASCII (which is always a starter)
    if c <= '\x7f' {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: 1,
        };
    }
    // Next, special case Hangul, since it's not handled by our tables.
    if is_hangul_syllable(c) {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: hangul_decomposition_length(c),
        };
    }
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
    match decomp {
        Some(decomp) => Decomposition {
            leading_nonstarters: stream_safe_leading_nonstarters(c),
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
            decomposition_len: decomp.len(),
        },
        None => {
            let is_nonstarter = canonical_combining_class(c) != 0;
            let nonstarter = if is_nonstarter { 1 } else { 0 };
            Decomposition {
                leading_nonstarters: nonstarter,
                trailing_nonstarters: nonstarter,
                decomposition_len: 1,
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::{classify_nonstarters, StreamSafe};
    use crate::lookups::canonical_combining_class;
    use crate::normalize::decompose_compatible;

    #[cfg(not(feature = "std"))]
    use alloc::{string::String, vec::Vec};

    use core::char;

    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    #[test]
    fn test_simple() {
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        assert_eq!(stream_safe(technically_okay), technically_okay);

        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
        assert_eq!(stream_safe(too_much), fixed_it);

        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
        assert_eq!(stream_safe(woah_nelly), its_cool);
    }

    #[test]
    fn test_all_nonstarters() {
        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
        assert_eq!(stream_safe(s), expected);
    }
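
    // Added illustrative check (a sketch of the reset logic in `next()` above,
    // not part of the original tests): a starter between two runs of 30
    // nonstarters resets the counter, so no CGJ should be inserted.
    #[test]
    fn test_starter_resets_count() {
        let run = "\u{0300}".repeat(30);
        let mut s = String::new();
        s.push_str(&run);
        s.push('a');
        s.push_str(&run);
        assert_eq!(stream_safe(&s), s);
    }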

    #[test]
    fn test_classify_nonstarters() {
        // Highest character in the `compat_fully_decomp` table is 2FA1D
        for ch in 0..0x2FA1E {
            let ch = match char::from_u32(ch) {
                Some(c) => c,
                None => continue,
            };
            let c = classify_nonstarters(ch);
            let mut s = Vec::new();
            decompose_compatible(ch, |c| s.push(c));

            assert_eq!(s.len(), c.decomposition_len);

            let num_leading = s
                .iter()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();
            let num_trailing = s
                .iter()
                .rev()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();

            assert_eq!(num_leading, c.leading_nonstarters);
            assert_eq!(num_trailing, c.trailing_nonstarters);
        }
    }
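
    // Added illustrative spot checks (a sketch based on well-known Unicode
    // data, not part of the original tests): U+00E9 ("é") fully decomposes to
    // "e" followed by U+0301, and the Hangul syllable U+AC00 decomposes to
    // two jamo, neither of which is a nonstarter.
    #[test]
    fn test_classify_known_characters() {
        let e_acute = classify_nonstarters('\u{00e9}');
        assert_eq!(e_acute.decomposition_len, 2);
        assert_eq!(e_acute.leading_nonstarters, 0);
        assert_eq!(e_acute.trailing_nonstarters, 1);

        let hangul = classify_nonstarters('\u{ac00}');
        assert_eq!(hangul.decomposition_len, 2);
        assert_eq!(hangul.leading_nonstarters, 0);
        assert_eq!(hangul.trailing_nonstarters, 0);
    }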
}