use core::iter::FusedIterator;

use crate::lookups::{
    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
    stream_safe_trailing_nonstarters,
};
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
use crate::tables::stream_safe_leading_nonstarters;

pub(crate) const MAX_NONSTARTERS: usize = 30;
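// U+034F COMBINING GRAPHEME JOINER has canonical combining class 0, so it is a
// starter: emitting it ends the current run of nonstarters. It is also a
// default-ignorable character, so it has no visible effect on the text.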
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';

/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
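///
/// A rough usage sketch; `StreamSafe::new` is crate-private, so the example
/// assumes the crate's public `UnicodeNormalization::stream_safe` adapter as
/// the entry point:
///
/// ```ignore
/// use unicode_normalization::UnicodeNormalization;
///
/// // 40 combining grave accents in a row: a CGJ is inserted after the 30th.
/// let long_run = "\u{0300}".repeat(40);
/// let safe: String = long_run.chars().stream_safe().collect();
/// assert!(safe.contains('\u{034F}'));
/// ```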
pub struct StreamSafe<I> {
    iter: I,
    nonstarter_count: usize,
    buffer: Option<char>,
}

impl<I> StreamSafe<I> {
    pub(crate) fn new(iter: I) -> Self {
        Self {
            iter,
            nonstarter_count: 0,
            buffer: None,
        }
    }
}

impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
            None => return None,
            Some(c) => c,
        };
        let d = classify_nonstarters(next_ch);
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
            // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
            // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
            // iterator (via `self.buffer`), and we'll reclassify it next iteration.
            self.nonstarter_count = 0;
            self.buffer = Some(next_ch);
            return Some(COMBINING_GRAPHEME_JOINER);
        }

        // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
        // nonstarters in NFKD.
        if d.leading_nonstarters == d.decomposition_len {
            self.nonstarter_count += d.decomposition_len;
        }
        // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
        else {
            self.nonstarter_count = d.trailing_nonstarters;
        }
        Some(next_ch)
    }
}

impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StreamSafe<I> {}

#[derive(Debug)]
pub(crate) struct Decomposition {
    pub(crate) leading_nonstarters: usize,
    pub(crate) trailing_nonstarters: usize,
    pub(crate) decomposition_len: usize,
}

#[inline]
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
    // As usual, fast path for ASCII (which is always a starter).
    if c <= '\x7f' {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: 1,
        };
    }
    // Next, special-case Hangul, since it's not handled by our tables.
    if is_hangul_syllable(c) {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: hangul_decomposition_length(c),
        };
    }
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
    match decomp {
        Some(decomp) => Decomposition {
            leading_nonstarters: stream_safe_leading_nonstarters(c),
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
            decomposition_len: decomp.len(),
        },
        None => {
            let is_nonstarter = canonical_combining_class(c) != 0;
            let nonstarter = if is_nonstarter { 1 } else { 0 };
            Decomposition {
                leading_nonstarters: nonstarter,
                trailing_nonstarters: nonstarter,
                decomposition_len: 1,
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::{classify_nonstarters, StreamSafe};
    use crate::lookups::canonical_combining_class;
    use crate::normalize::decompose_compatible;

    #[cfg(not(feature = "std"))]
    use alloc::{string::String, vec::Vec};

    use core::char;

    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    #[test]
    fn test_simple() {
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        assert_eq!(stream_safe(technically_okay), technically_okay);

        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
        assert_eq!(stream_safe(too_much), fixed_it);

        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
        assert_eq!(stream_safe(woah_nelly), its_cool);
    }
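
    // Illustrative supplementary test: a character that is not all nonstarters
    // resets the run counter, so interleaved starters never trigger a CGJ.
    #[test]
    fn test_starter_resets_count() {
        let run = "\u{0300}".repeat(20);
        // 20 nonstarters, a starter, 20 more nonstarters: each run stays well
        // under MAX_NONSTARTERS, so the input passes through unchanged.
        let mut input = String::from("a");
        input.push_str(&run);
        input.push('b');
        input.push_str(&run);
        assert_eq!(stream_safe(&input), input);
        assert!(!stream_safe(&input).contains('\u{034F}'));
    }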

    #[test]
    fn test_all_nonstarters() {
        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
        assert_eq!(stream_safe(s), expected);
    }

    #[test]
    fn test_classify_nonstarters() {
        // Highest character in the `compat_fully_decomp` table is 2FA1D
        for ch in 0..0x2FA1E {
            let ch = match char::from_u32(ch) {
                Some(c) => c,
                None => continue,
            };
            let c = classify_nonstarters(ch);
            let mut s = Vec::new();
            decompose_compatible(ch, |c| s.push(c));

            assert_eq!(s.len(), c.decomposition_len);

            let num_leading = s
                .iter()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();
            let num_trailing = s
                .iter()
                .rev()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();

            assert_eq!(num_leading, c.leading_nonstarters);
            assert_eq!(num_trailing, c.trailing_nonstarters);
        }
    }
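
    // Illustrative spot checks, supplementary to the exhaustive loop above,
    // covering the three branches of `classify_nonstarters`.
    #[test]
    fn test_classify_nonstarters_examples() {
        // ASCII fast path: always a starter with a trivial decomposition.
        let ascii = classify_nonstarters('a');
        assert_eq!(ascii.leading_nonstarters, 0);
        assert_eq!(ascii.trailing_nonstarters, 0);
        assert_eq!(ascii.decomposition_len, 1);

        // Hangul LV syllable U+AC00 decomposes to two jamo, both starters.
        let hangul = classify_nonstarters('\u{AC00}');
        assert_eq!(hangul.leading_nonstarters, 0);
        assert_eq!(hangul.trailing_nonstarters, 0);
        assert_eq!(hangul.decomposition_len, 2);

        // A lone combining mark (ccc != 0) with no decomposition counts as one
        // leading and one trailing nonstarter.
        let grave = classify_nonstarters('\u{0300}');
        assert_eq!(grave.leading_nonstarters, 1);
        assert_eq!(grave.trailing_nonstarters, 1);
        assert_eq!(grave.decomposition_len, 1);
    }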
}