1use crate::lookups::{
2 canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
3 stream_safe_trailing_nonstarters,
4};
5use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
6use crate::tables::stream_safe_leading_nonstarters;
7
/// UAX15-D3: the stream-safe text format allows at most 30 consecutive
/// non-starters (in NFKD); beyond that a CGJ must be inserted.
pub(crate) const MAX_NONSTARTERS: usize = 30;
/// U+034F COMBINING GRAPHEME JOINER — a starter (ccc=0) that is emitted to
/// break up overlong runs of non-starters without affecting text semantics.
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
10
/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
pub struct StreamSafe<I> {
    // Underlying source of characters.
    iter: I,
    // Running count of contiguous non-starters (in NFKD) emitted since the
    // last starter.
    nonstarter_count: usize,
    // One-character pushback slot: holds a character that was pulled from
    // `iter` but deferred because a CGJ had to be emitted first.
    buffer: Option<char>,
}
19
20impl<I> StreamSafe<I> {
21 pub(crate) fn new(iter: I) -> Self {
22 Self {
23 iter,
24 nonstarter_count: 0,
25 buffer: None,
26 }
27 }
28}
29
30impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
31 type Item = char;
32
33 #[inline]
34 fn next(&mut self) -> Option<char> {
35 let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
36 None => return None,
37 Some(c) => c,
38 };
39 let d = classify_nonstarters(next_ch);
40 if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
41 // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
42 // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
43 // iterator (via `self.buffer`), and we'll reclassify it next iteration.
44 self.nonstarter_count = 0;
45 self.buffer = Some(next_ch);
46 return Some(COMBINING_GRAPHEME_JOINER);
47 }
48
49 // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
50 // nonstarters in NKFD.
51 if d.leading_nonstarters == d.decomposition_len {
52 self.nonstarter_count += d.decomposition_len;
53 }
54 // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
55 else {
56 self.nonstarter_count = d.trailing_nonstarters;
57 }
58 Some(next_ch)
59 }
60}
61
/// Summary of a character's NFKD decomposition, as needed by the
/// stream-safe (UAX15-D4) counting algorithm.
#[derive(Debug)]
pub(crate) struct Decomposition {
    // Number of non-starters at the start of the NFKD decomposition.
    pub(crate) leading_nonstarters: usize,
    // Number of non-starters at the end of the NFKD decomposition.
    pub(crate) trailing_nonstarters: usize,
    // Total number of characters in the NFKD decomposition.
    pub(crate) decomposition_len: usize,
}
68
69#[inline]
70pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
71 // As usual, fast path for ASCII (which is always a starter)
72 if c <= '\x7f' {
73 return Decomposition {
74 leading_nonstarters: 0,
75 trailing_nonstarters: 0,
76 decomposition_len: 1,
77 };
78 }
79 // Next, special case Hangul, since it's not handled by our tables.
80 if is_hangul_syllable(c) {
81 return Decomposition {
82 leading_nonstarters: 0,
83 trailing_nonstarters: 0,
84 decomposition_len: hangul_decomposition_length(c),
85 };
86 }
87 let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
88 match decomp {
89 Some(decomp) => Decomposition {
90 leading_nonstarters: stream_safe_leading_nonstarters(c),
91 trailing_nonstarters: stream_safe_trailing_nonstarters(c),
92 decomposition_len: decomp.len(),
93 },
94 None => {
95 let is_nonstarter = canonical_combining_class(c) != 0;
96 let nonstarter = if is_nonstarter { 1 } else { 0 };
97 Decomposition {
98 leading_nonstarters: nonstarter,
99 trailing_nonstarters: nonstarter,
100 decomposition_len: 1,
101 }
102 }
103 }
104}
105
#[cfg(test)]
mod tests {
    use super::{classify_nonstarters, StreamSafe};
    use crate::lookups::canonical_combining_class;
    use crate::normalize::decompose_compatible;

    #[cfg(not(feature = "std"))]
    use crate::no_std_prelude::*;

    use core::char;

    // Helper: run the stream-safe adapter over `s` and collect the result.
    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    #[test]
    fn test_simple() {
        // Exactly 30 combining marks after the starter: allowed, no CGJ inserted.
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        assert_eq!(stream_safe(technically_okay), technically_okay);

        // 31 marks: a CGJ (U+034F) must be inserted before the 31st.
        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
        assert_eq!(stream_safe(too_much), fixed_it);

        // Two overlong runs back to back: each run gets its own CGJ, and the
        // counter restarts after every insertion.
        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
        assert_eq!(stream_safe(woah_nelly), its_cool);
    }

    #[test]
    fn test_all_nonstarters() {
        // 40 consecutive non-starters with no starter at all: the CGJ is
        // inserted after the 30th.
        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
        assert_eq!(stream_safe(s), expected);
    }

    #[test]
    fn test_classify_nonstarters() {
        // Exhaustively cross-check `classify_nonstarters` against a full
        // compatibility decomposition of each character.
        // Highest character in the `compat_fully_decomp` table is 2FA1D
        for ch in 0..0x2FA1E {
            let ch = match char::from_u32(ch) {
                Some(c) => c,
                None => continue,
            };
            let c = classify_nonstarters(ch);
            let mut s = Vec::new();
            decompose_compatible(ch, |c| s.push(c));

            assert_eq!(s.len(), c.decomposition_len);

            // Recompute leading/trailing non-starter counts directly from the
            // decomposition and combining classes.
            let num_leading = s
                .iter()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();
            let num_trailing = s
                .iter()
                .rev()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();

            assert_eq!(num_leading, c.leading_nonstarters);
            assert_eq!(num_trailing, c.trailing_nonstarters);
        }
    }
}
171