1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | // https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations |
6 | #![cfg_attr (not(any(test, feature = "std" )), no_std)] |
7 | #![cfg_attr ( |
8 | not(test), |
9 | deny( |
10 | clippy::indexing_slicing, |
11 | clippy::unwrap_used, |
12 | clippy::expect_used, |
13 | clippy::panic, |
14 | clippy::exhaustive_structs, |
15 | clippy::exhaustive_enums, |
16 | missing_debug_implementations, |
17 | ) |
18 | )] |
19 | #![warn (missing_docs)] |
20 | |
//! Normalizing text into Unicode Normalization Forms.
//!
//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
//!
//! # Implementation notes
//!
//! The normalizer operates on a lazy iterator over Unicode scalar values (Rust `char`) internally
//! and iterating over guaranteed-valid UTF-8, potentially-invalid UTF-8, and potentially-invalid
//! UTF-16 is a step that doesn’t leak into the normalizer internals. Ill-formed byte sequences are
//! treated as U+FFFD.
//!
//! The normalizer data layout is not based on the ICU4C design at all. Instead, the normalization
//! data layout is a clean-slate design optimized for the concept of fusing the NFD decomposition
//! into the collator. That is, the decomposing normalizer is a by-product of the collator-motivated
//! data layout.
//!
//! Notably, the decomposition data structure is optimized for a starter decomposing to itself,
//! which is the most common case, and for a starter decomposing to a starter and a non-starter
//! on the Basic Multilingual Plane. Notably, in this case, the collator makes use of the
//! knowledge that the second character of such a decomposition is a non-starter. Therefore,
//! decomposition into two starters is handled by a generic fallback path that looks up the
//! decomposition from an array by offset and length instead of baking a BMP starter pair directly
//! into a trie value.
//!
//! The decompositions into non-starters are hard-coded. At present in Unicode, these appear
//! to be special cases falling into three categories:
//!
//! 1. Deprecated combining marks.
//! 2. Particular Tibetan vowel signs.
//! 3. NFKD only: half-width kana voicing marks.
//!
//! Hopefully Unicode never adds more decompositions into non-starters (other than a character
//! decomposing to itself), but if it does, a code update is needed instead of a mere data update.
//!
//! The composing normalizer builds on the decomposing normalizer by performing the canonical
//! composition post-processing per spec. As an optimization, though, the composing normalizer
//! attempts to pass through already-normalized text consisting of starters that never combine
//! backwards and that map to themselves if followed by a character whose decomposition starts
//! with a starter that never combines backwards.
//!
//! As a difference with ICU4C, the composing normalizer has only the simplest possible
//! passthrough (only one inversion list lookup per character in the best case) and the full
//! decompose-then-canonically-compose behavior, whereas ICU4C has other paths between these
//! extremes. The ICU4X collator doesn't make use of the FCD concept at all in order to avoid
//! doing the work of checking whether the FCD condition holds.
68 | extern crate alloc; |
69 | |
70 | mod error; |
71 | pub mod properties; |
72 | pub mod provider; |
73 | pub mod uts46; |
74 | |
75 | pub use crate::error::NormalizerError; |
76 | |
77 | #[doc (no_inline)] |
78 | pub use NormalizerError as Error; |
79 | |
80 | use crate::provider::CanonicalDecompositionDataV1Marker; |
81 | use crate::provider::CompatibilityDecompositionSupplementV1Marker; |
82 | use crate::provider::DecompositionDataV1; |
83 | use crate::provider::Uts46DecompositionSupplementV1Marker; |
84 | use alloc::string::String; |
85 | use alloc::vec::Vec; |
86 | use core::char::REPLACEMENT_CHARACTER; |
87 | use core::str::from_utf8_unchecked; |
88 | use icu_collections::char16trie::Char16Trie; |
89 | use icu_collections::char16trie::Char16TrieIterator; |
90 | use icu_collections::char16trie::TrieResult; |
91 | use icu_collections::codepointtrie::CodePointTrie; |
92 | use icu_properties::CanonicalCombiningClass; |
93 | use icu_provider::prelude::*; |
94 | use provider::CanonicalCompositionsV1Marker; |
95 | use provider::CanonicalDecompositionTablesV1Marker; |
96 | use provider::CompatibilityDecompositionTablesV1Marker; |
97 | use provider::DecompositionSupplementV1; |
98 | use provider::DecompositionTablesV1; |
99 | use smallvec::SmallVec; |
100 | use utf16_iter::Utf16CharsEx; |
101 | use utf8_iter::Utf8CharsEx; |
102 | use write16::Write16; |
103 | use zerofrom::ZeroFrom; |
104 | use zerovec::{zeroslice, ZeroSlice}; |
105 | |
106 | #[derive (Debug)] |
107 | enum SupplementPayloadHolder { |
108 | Compatibility(DataPayload<CompatibilityDecompositionSupplementV1Marker>), |
109 | Uts46(DataPayload<Uts46DecompositionSupplementV1Marker>), |
110 | } |
111 | |
112 | impl SupplementPayloadHolder { |
113 | fn get(&self) -> &DecompositionSupplementV1 { |
114 | match self { |
115 | SupplementPayloadHolder::Compatibility(d: &DataPayload<{unknown}>) => d.get(), |
116 | SupplementPayloadHolder::Uts46(d: &DataPayload<{unknown}>) => d.get(), |
117 | } |
118 | } |
119 | } |
120 | |
/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
#[derive(Debug, PartialEq, Eq)]
enum IgnorableBehavior {
    /// 0xFFFFFFFF in data is not supported.
    Unsupported,
    /// Ignorables are ignored.
    Ignored,
    /// Ignorables are treated as singleton decompositions
    /// to the REPLACEMENT CHARACTER.
    ReplacementCharacter,
}
132 | |
/// Number of iterations allowed on the fast path before flushing.
/// Since a typical UTF-16 iteration advances over a 2-byte BMP
/// character, this means two memory pages.
/// Intel Core i7-4770 had the best results between 2 and 4 pages
/// when testing powers of two. Apple M1 didn't seem to care
/// about 1, 2, 4, or 8 pages.
///
/// Curiously, the `str` case does not appear to benefit from
/// similar flushing, though the tested monomorphization never
/// passes an error through from `Write`.
const UTF16_FAST_PATH_FLUSH_THRESHOLD: usize = 4096;

/// Marker for UTS 46 ignorables.
const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;

/// Marker for starters that decompose to themselves but may
/// combine backwards under canonical composition.
/// (Main trie only; not used in the supplementary trie.)
const BACKWARD_COMBINING_STARTER_MARKER: u32 = 1;

/// Magic marker trie value for characters whose decomposition
/// starts with a non-starter. The actual decomposition is
/// hard-coded.
const SPECIAL_NON_STARTER_DECOMPOSITION_MARKER: u32 = 2;

/// `u16` version of the previous marker value.
const SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16: u16 = 2;

/// Marker that a complex decomposition isn't round-trippable
/// under re-composition.
const NON_ROUND_TRIP_MARKER: u16 = 1;
164 | |
/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
///
/// Such values are encoded as `0xD800 | ccc`, so the check is
/// that all bits above the low byte equal 0xD800.
fn trie_value_has_ccc(trie_value: u32) -> bool {
    (trie_value & 0xFFFFFF00) == 0xD800
}
170 | |
171 | /// Checks if the trie signifies a special non-starter decomposition. |
172 | fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool { |
173 | trie_value == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER |
174 | } |
175 | |
176 | /// Checks if a trie value signifies a character whose decomposition |
177 | /// starts with a non-starter. |
178 | fn decomposition_starts_with_non_starter(trie_value: u32) -> bool { |
179 | trie_value_has_ccc(trie_value) |
180 | || trie_value_indicates_special_non_starter_decomposition(trie_value) |
181 | } |
182 | |
183 | /// Extracts a canonical combining class (possibly zero) from a trie value. |
184 | /// |
185 | /// # Panics |
186 | /// |
187 | /// The trie value must not be one that signifies a special non-starter |
188 | /// decomposition. (Debug-only) |
189 | fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass { |
190 | if trie_value_has_ccc(trie_value) { |
191 | CanonicalCombiningClass(trie_value as u8) |
192 | } else { |
193 | debug_assert_ne!(trie_value, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER); |
194 | CanonicalCombiningClass::NotReordered |
195 | } |
196 | } |
197 | |
/// The tail (everything after the first character) of the NFKD form U+FDFA
/// as 16-bit units.
static FDFA_NFKD: [u16; 17] = [
    0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
    0x633, 0x644, 0x645,
];

/// Marker value for U+FDFA in NFKD
const FDFA_MARKER: u16 = 3;
207 | |
// These constants originate from page 143 of Unicode 14.0
/// Syllable base
const HANGUL_S_BASE: u32 = 0xAC00;
/// Lead jamo base
const HANGUL_L_BASE: u32 = 0x1100;
/// Vowel jamo base
const HANGUL_V_BASE: u32 = 0x1161;
/// Trail jamo base (deliberately off by one to account for the absence of a trail)
const HANGUL_T_BASE: u32 = 0x11A7;
/// Lead jamo count
const HANGUL_L_COUNT: u32 = 19;
/// Vowel jamo count
const HANGUL_V_COUNT: u32 = 21;
/// Trail jamo count (deliberately off by one to account for the absence of a trail)
const HANGUL_T_COUNT: u32 = 28;
/// Vowel jamo count times trail jamo count
const HANGUL_N_COUNT: u32 = 588;
/// Syllable count
const HANGUL_S_COUNT: u32 = 11172;

/// One past the conjoining jamo block
const HANGUL_JAMO_LIMIT: u32 = 0x1200;
230 | |
/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
/// are enabled and return `default` if debug assertions are not enabled.
///
/// Use this only if the only reason why `opt` could be `None` is bogus
/// data from the provider.
#[inline(always)]
fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
    if let Some(val) = opt {
        val
    } else {
        // GIGO case
        debug_assert!(false);
        default
    }
}
246 | |
247 | /// Convert a `u32` _obtained from data provider data_ to `char`. |
248 | #[inline (always)] |
249 | fn char_from_u32(u: u32) -> char { |
250 | unwrap_or_gigo(opt:core::char::from_u32(u), REPLACEMENT_CHARACTER) |
251 | } |
252 | |
253 | /// Convert a `u16` _obtained from data provider data_ to `char`. |
254 | #[inline (always)] |
255 | fn char_from_u16(u: u16) -> char { |
256 | char_from_u32(u32::from(u)) |
257 | } |
258 | |
259 | const EMPTY_U16: &ZeroSlice<u16> = zeroslice![]; |
260 | |
261 | const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![]; |
262 | |
/// Branchless check that `start <= c <= end`, implemented with a
/// wrapping subtraction so a single comparison suffices.
#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
    u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
}
267 | |
/// Branchless check that `start <= u <= end` for `u32` values;
/// the wrapping subtraction maps out-of-range inputs to large values.
#[inline(always)]
fn in_inclusive_range32(u: u32, start: u32, end: u32) -> bool {
    u.wrapping_sub(start) <= (end - start)
}
272 | |
/// Branchless check that `start <= u <= end` for `u16` values;
/// the wrapping subtraction maps out-of-range inputs to large values.
#[inline(always)]
fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
    u.wrapping_sub(start) <= (end - start)
}
277 | |
278 | /// Performs canonical composition (including Hangul) on a pair of |
279 | /// characters or returns `None` if these characters don't compose. |
280 | /// Composition exclusions are taken into account. |
281 | #[inline ] |
282 | fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> { |
283 | let v: u32 = u32::from(second).wrapping_sub(HANGUL_V_BASE); |
284 | if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE { |
285 | return compose_non_hangul(iter, starter, second); |
286 | } |
287 | if v < HANGUL_V_COUNT { |
288 | let l: u32 = u32::from(starter).wrapping_sub(HANGUL_L_BASE); |
289 | if l < HANGUL_L_COUNT { |
290 | let lv: u32 = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT; |
291 | // Safe, because the inputs are known to be in range. |
292 | return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) }); |
293 | } |
294 | return None; |
295 | } |
296 | if in_inclusive_range(c:second, start:' \u{11A8}' , end:' \u{11C2}' ) { |
297 | let lv: u32 = u32::from(starter).wrapping_sub(HANGUL_S_BASE); |
298 | if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 { |
299 | let lvt: u32 = lv + (u32::from(second) - HANGUL_T_BASE); |
300 | // Safe, because the inputs are known to be in range. |
301 | return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) }); |
302 | } |
303 | } |
304 | None |
305 | } |
306 | |
307 | /// Performs (non-Hangul) canonical composition on a pair of characters |
308 | /// or returns `None` if these characters don't compose. Composition |
309 | /// exclusions are taken into account. |
310 | fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> { |
311 | // To make the trie smaller, the pairs are stored second character first. |
312 | // Given how this method is used in ways where it's known that `second` |
313 | // is or isn't a starter. We could potentially split the trie into two |
314 | // tries depending on whether `second` is a starter. |
315 | match iter.next(second) { |
316 | TrieResult::NoMatch => None, |
317 | TrieResult::NoValue => match iter.next(starter) { |
318 | TrieResult::NoMatch => None, |
319 | TrieResult::FinalValue(i) => { |
320 | if let Some(c) = char::from_u32(i as u32) { |
321 | Some(c) |
322 | } else { |
323 | // GIGO case |
324 | debug_assert!(false); |
325 | None |
326 | } |
327 | } |
328 | TrieResult::NoValue | TrieResult::Intermediate(_) => { |
329 | // GIGO case |
330 | debug_assert!(false); |
331 | None |
332 | } |
333 | }, |
334 | TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => { |
335 | // GIGO case |
336 | debug_assert!(false); |
337 | None |
338 | } |
339 | } |
340 | } |
341 | |
/// Struct for holding together a character and the value
/// looked up for it from the NFD trie in a more explicit
/// way than an anonymous pair.
/// Also holds a flag about the supplementary-trie provenance.
#[derive(Debug, PartialEq, Eq)]
struct CharacterAndTrieValue {
    character: char,
    trie_val: u32,
    from_supplement: bool,
}
352 | |
353 | impl CharacterAndTrieValue { |
354 | #[inline (always)] |
355 | pub fn new(c: char, trie_value: u32) -> Self { |
356 | CharacterAndTrieValue { |
357 | character: c, |
358 | trie_val: trie_value, |
359 | from_supplement: false, |
360 | } |
361 | } |
362 | #[inline (always)] |
363 | pub fn new_from_supplement(c: char, trie_value: u32) -> Self { |
364 | CharacterAndTrieValue { |
365 | character: c, |
366 | trie_val: trie_value, |
367 | from_supplement: true, |
368 | } |
369 | } |
370 | #[inline (always)] |
371 | pub fn starter_and_decomposes_to_self(&self) -> bool { |
372 | if self.trie_val > BACKWARD_COMBINING_STARTER_MARKER { |
373 | return false; |
374 | } |
375 | // Hangul syllables get 0 as their trie value |
376 | u32::from(self.character).wrapping_sub(HANGUL_S_BASE) >= HANGUL_S_COUNT |
377 | } |
378 | #[inline (always)] |
379 | pub fn can_combine_backwards(&self) -> bool { |
380 | decomposition_starts_with_non_starter(self.trie_val) |
381 | || self.trie_val == BACKWARD_COMBINING_STARTER_MARKER |
382 | || in_inclusive_range32(self.trie_val, 0x1161, 0x11C2) |
383 | } |
384 | #[inline (always)] |
385 | pub fn potential_passthrough(&self) -> bool { |
386 | self.potential_passthrough_impl(BACKWARD_COMBINING_STARTER_MARKER) |
387 | } |
388 | #[inline (always)] |
389 | pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool { |
390 | self.potential_passthrough_impl(0) |
391 | } |
392 | #[inline (always)] |
393 | fn potential_passthrough_impl(&self, bound: u32) -> bool { |
394 | // This methods looks badly branchy, but most characters |
395 | // take the first return. |
396 | if self.trie_val <= bound { |
397 | return true; |
398 | } |
399 | if self.from_supplement { |
400 | return false; |
401 | } |
402 | let trail_or_complex = (self.trie_val >> 16) as u16; |
403 | if trail_or_complex == 0 { |
404 | return false; |
405 | } |
406 | let lead = self.trie_val as u16; |
407 | if lead == 0 { |
408 | return true; |
409 | } |
410 | if lead == NON_ROUND_TRIP_MARKER { |
411 | return false; |
412 | } |
413 | if (trail_or_complex & 0x7F) == 0x3C |
414 | && in_inclusive_range16(trail_or_complex, 0x0900, 0x0BFF) |
415 | { |
416 | // Nukta |
417 | return false; |
418 | } |
419 | if in_inclusive_range(self.character, ' \u{FB1D}' , ' \u{FB4E}' ) { |
420 | // Hebrew presentation forms |
421 | return false; |
422 | } |
423 | if in_inclusive_range(self.character, ' \u{1F71}' , ' \u{1FFB}' ) { |
424 | // Polytonic Greek with oxia |
425 | return false; |
426 | } |
427 | // To avoid more branchiness, 4 characters that decompose to |
428 | // a BMP starter followed by a BMP non-starter are excluded |
429 | // from being encoded directly into the trie value and are |
430 | // handled as complex decompositions instead. These are: |
431 | // U+0F76 TIBETAN VOWEL SIGN VOCALIC R |
432 | // U+0F78 TIBETAN VOWEL SIGN VOCALIC L |
433 | // U+212B ANGSTROM SIGN |
434 | // U+2ADC FORKING |
435 | true |
436 | } |
437 | } |
438 | |
/// Pack a `char` and a `CanonicalCombiningClass` in
/// 32 bits (the former in the lower 24 bits and the
/// latter in the high 8 bits). The latter can be
/// initialized to 0xFF upon creation, in which case
/// it can be actually set later by calling
/// `set_ccc_from_trie_if_not_already_set`. This is
/// a micro optimization to avoid the Canonical
/// Combining Class trie lookup when there is only
/// one combining character in a sequence. This type
/// is intentionally non-`Copy` to get compiler help
/// in making sure that the class is set on the
/// instance on which it is intended to be set
/// and not on a temporary copy.
///
/// Note that 0xFF won't be assigned to an actual
/// canonical combining class per definition D104
/// in The Unicode Standard.
//
// NOTE: The Pernosco debugger has special knowledge
// of this struct. Please do not change the bit layout
// or the crate-module-qualified name of this struct
// without coordination.
#[derive(Debug)]
struct CharacterAndClass(u32);
463 | |
464 | impl CharacterAndClass { |
465 | pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self { |
466 | CharacterAndClass(u32::from(c) | (u32::from(ccc.0) << 24)) |
467 | } |
468 | pub fn new_with_placeholder(c: char) -> Self { |
469 | CharacterAndClass(u32::from(c) | ((0xFF) << 24)) |
470 | } |
471 | pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self { |
472 | Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val)) |
473 | } |
474 | pub fn new_starter(c: char) -> Self { |
475 | CharacterAndClass(u32::from(c)) |
476 | } |
477 | pub fn character(&self) -> char { |
478 | // Safe, because the low 24 bits came from a `char` |
479 | // originally. |
480 | unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) } |
481 | } |
482 | pub fn ccc(&self) -> CanonicalCombiningClass { |
483 | CanonicalCombiningClass((self.0 >> 24) as u8) |
484 | } |
485 | pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) { |
486 | (self.character(), self.ccc()) |
487 | } |
488 | pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &CodePointTrie<u32>) { |
489 | if self.0 >> 24 != 0xFF { |
490 | return; |
491 | } |
492 | let scalar = self.0 & 0xFFFFFF; |
493 | self.0 = ((ccc_from_trie_value(trie.get32_u32(scalar)).0 as u32) << 24) | scalar; |
494 | } |
495 | } |
496 | |
497 | // This function exists as a borrow check helper. |
498 | #[inline (always)] |
499 | fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &CodePointTrie<u32>) { |
500 | // We don't look up the canonical combining class for starters |
501 | // of for single combining characters between starters. When |
502 | // there's more than one combining character between starters, |
503 | // we look up the canonical combining class for each character |
504 | // exactly once. |
505 | if slice.len() < 2 { |
506 | return; |
507 | } |
508 | sliceIterMut<'_, CharacterAndClass> |
509 | .iter_mut() |
510 | .for_each(|cc: &mut CharacterAndClass| cc.set_ccc_from_trie_if_not_already_set(trie)); |
511 | slice.sort_by_key(|cc: &CharacterAndClass| cc.ccc()); |
512 | } |
513 | |
514 | /// An iterator adaptor that turns an `Iterator` over `char` into |
515 | /// a lazily-decomposed `char` sequence. |
516 | #[derive (Debug)] |
517 | pub struct Decomposition<'data, I> |
518 | where |
519 | I: Iterator<Item = char>, |
520 | { |
521 | delegate: I, |
522 | buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA |
523 | /// The index of the next item to be read from `buffer`. |
524 | /// The purpose if this index is to avoid having to move |
525 | /// the rest upon every read. |
526 | buffer_pos: usize, |
527 | // At the start of `next()` if not `None`, this is a pending unnormalized |
528 | // starter. When `Decomposition` appears alone, this is never a non-starter. |
529 | // However, when `Decomposition` appears inside a `Composition`, this |
530 | // may become a non-starter before `decomposing_next()` is called. |
531 | pending: Option<CharacterAndTrieValue>, // None at end of stream |
532 | trie: &'data CodePointTrie<'data, u32>, |
533 | supplementary_trie: Option<&'data CodePointTrie<'data, u32>>, |
534 | scalars16: &'data ZeroSlice<u16>, |
535 | scalars24: &'data ZeroSlice<char>, |
536 | supplementary_scalars16: &'data ZeroSlice<u16>, |
537 | supplementary_scalars24: &'data ZeroSlice<char>, |
538 | half_width_voicing_marks_become_non_starters: bool, |
539 | /// The lowest character for which either of the following does |
540 | /// not hold: |
541 | /// 1. Decomposes to self. |
542 | /// 2. Decomposition starts with a non-starter |
543 | decomposition_passthrough_bound: u32, // never above 0xC0 |
544 | ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter |
545 | } |
546 | |
547 | impl<'data, I> Decomposition<'data, I> |
548 | where |
549 | I: Iterator<Item = char>, |
550 | { |
551 | /// Constructs a decomposing iterator adapter from a delegate |
552 | /// iterator and references to the necessary data, without |
553 | /// supplementary data. |
554 | /// |
555 | /// Use `DecomposingNormalizer::normalize_iter()` instead unless |
556 | /// there's a good reason to use this constructor directly. |
557 | /// |
558 | /// Public but hidden in order to be able to use this from the |
559 | /// collator. |
560 | #[doc (hidden)] |
561 | pub fn new( |
562 | delegate: I, |
563 | decompositions: &'data DecompositionDataV1, |
564 | tables: &'data DecompositionTablesV1, |
565 | ) -> Self { |
566 | Self::new_with_supplements( |
567 | delegate, |
568 | decompositions, |
569 | None, |
570 | tables, |
571 | None, |
572 | 0xC0, |
573 | IgnorableBehavior::Unsupported, |
574 | ) |
575 | } |
576 | |
577 | /// Constructs a decomposing iterator adapter from a delegate |
578 | /// iterator and references to the necessary data, including |
579 | /// supplementary data. |
580 | /// |
581 | /// Use `DecomposingNormalizer::normalize_iter()` instead unless |
582 | /// there's a good reason to use this constructor directly. |
583 | fn new_with_supplements( |
584 | delegate: I, |
585 | decompositions: &'data DecompositionDataV1, |
586 | supplementary_decompositions: Option<&'data DecompositionSupplementV1>, |
587 | tables: &'data DecompositionTablesV1, |
588 | supplementary_tables: Option<&'data DecompositionTablesV1>, |
589 | decomposition_passthrough_bound: u8, |
590 | ignorable_behavior: IgnorableBehavior, |
591 | ) -> Self { |
592 | let half_width_voicing_marks_become_non_starters = |
593 | if let Some(supplementary) = supplementary_decompositions { |
594 | supplementary.half_width_voicing_marks_become_non_starters() |
595 | } else { |
596 | false |
597 | }; |
598 | let mut ret = Decomposition::<I> { |
599 | delegate, |
600 | buffer: SmallVec::new(), // Normalized |
601 | buffer_pos: 0, |
602 | // Initialize with a placeholder starter in case |
603 | // the real stream starts with a non-starter. |
604 | pending: Some(CharacterAndTrieValue::new(' \u{FFFF}' , 0)), |
605 | trie: &decompositions.trie, |
606 | supplementary_trie: supplementary_decompositions.map(|s| &s.trie), |
607 | scalars16: &tables.scalars16, |
608 | scalars24: &tables.scalars24, |
609 | supplementary_scalars16: if let Some(supplementary) = supplementary_tables { |
610 | &supplementary.scalars16 |
611 | } else { |
612 | EMPTY_U16 |
613 | }, |
614 | supplementary_scalars24: if let Some(supplementary) = supplementary_tables { |
615 | &supplementary.scalars24 |
616 | } else { |
617 | EMPTY_CHAR |
618 | }, |
619 | half_width_voicing_marks_become_non_starters, |
620 | decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound), |
621 | ignorable_behavior, |
622 | }; |
623 | let _ = ret.next(); // Remove the U+FFFF placeholder |
624 | ret |
625 | } |
626 | |
627 | fn push_decomposition16( |
628 | &mut self, |
629 | low: u16, |
630 | offset: usize, |
631 | slice16: &ZeroSlice<u16>, |
632 | ) -> (char, usize) { |
633 | let len = usize::from(low >> 13) + 2; |
634 | let (starter, tail) = slice16 |
635 | .get_subslice(offset..offset + len) |
636 | .and_then(|slice| slice.split_first()) |
637 | .map_or_else( |
638 | || { |
639 | // GIGO case |
640 | debug_assert!(false); |
641 | (REPLACEMENT_CHARACTER, EMPTY_U16) |
642 | }, |
643 | |(first, trail)| (char_from_u16(first), trail), |
644 | ); |
645 | if low & 0x1000 != 0 { |
646 | // All the rest are combining |
647 | self.buffer.extend( |
648 | tail.iter() |
649 | .map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))), |
650 | ); |
651 | (starter, 0) |
652 | } else { |
653 | let mut i = 0; |
654 | let mut combining_start = 0; |
655 | for u in tail.iter() { |
656 | let ch = char_from_u16(u); |
657 | let trie_value = self.trie.get(ch); |
658 | self.buffer.push(CharacterAndClass::new_with_trie_value( |
659 | CharacterAndTrieValue::new(ch, trie_value), |
660 | )); |
661 | i += 1; |
662 | // Half-width kana and iota subscript don't occur in the tails |
663 | // of these multicharacter decompositions. |
664 | if !decomposition_starts_with_non_starter(trie_value) { |
665 | combining_start = i; |
666 | } |
667 | } |
668 | (starter, combining_start) |
669 | } |
670 | } |
671 | |
672 | fn push_decomposition32( |
673 | &mut self, |
674 | low: u16, |
675 | offset: usize, |
676 | slice32: &ZeroSlice<char>, |
677 | ) -> (char, usize) { |
678 | let len = usize::from(low >> 13) + 1; |
679 | let (starter, tail) = slice32 |
680 | .get_subslice(offset..offset + len) |
681 | .and_then(|slice| slice.split_first()) |
682 | .unwrap_or_else(|| { |
683 | // GIGO case |
684 | debug_assert!(false); |
685 | (REPLACEMENT_CHARACTER, EMPTY_CHAR) |
686 | }); |
687 | if low & 0x1000 != 0 { |
688 | // All the rest are combining |
689 | self.buffer |
690 | .extend(tail.iter().map(CharacterAndClass::new_with_placeholder)); |
691 | (starter, 0) |
692 | } else { |
693 | let mut i = 0; |
694 | let mut combining_start = 0; |
695 | for ch in tail.iter() { |
696 | let trie_value = self.trie.get(ch); |
697 | self.buffer.push(CharacterAndClass::new_with_trie_value( |
698 | CharacterAndTrieValue::new(ch, trie_value), |
699 | )); |
700 | i += 1; |
701 | // Half-width kana and iota subscript don't occur in the tails |
702 | // of these multicharacter decompositions. |
703 | if !decomposition_starts_with_non_starter(trie_value) { |
704 | combining_start = i; |
705 | } |
706 | } |
707 | (starter, combining_start) |
708 | } |
709 | } |
710 | |
711 | #[inline (always)] |
712 | fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue { |
713 | if let Some(supplementary) = self.supplementary_trie { |
714 | if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) { |
715 | return value; |
716 | } |
717 | } |
718 | |
719 | CharacterAndTrieValue::new(c, self.trie.get(c)) |
720 | } |
721 | |
722 | #[inline (never)] |
723 | fn attach_supplementary_trie_value( |
724 | &self, |
725 | c: char, |
726 | supplementary: &CodePointTrie<u32>, |
727 | ) -> Option<CharacterAndTrieValue> { |
728 | let voicing_mark = u32::from(c).wrapping_sub(0xFF9E); |
729 | if voicing_mark <= 1 && self.half_width_voicing_marks_become_non_starters { |
730 | return Some(CharacterAndTrieValue::new( |
731 | if voicing_mark == 0 { |
732 | ' \u{3099}' |
733 | } else { |
734 | ' \u{309A}' |
735 | }, |
736 | 0xD800 | u32::from(CanonicalCombiningClass::KanaVoicing.0), |
737 | )); |
738 | } |
739 | let trie_value = supplementary.get32(u32::from(c)); |
740 | if trie_value != 0 { |
741 | return Some(CharacterAndTrieValue::new_from_supplement(c, trie_value)); |
742 | } |
743 | None |
744 | } |
745 | |
746 | fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> { |
747 | debug_assert!(self.pending.is_none()); |
748 | loop { |
749 | let c = self.delegate.next()?; |
750 | |
751 | // TODO(#2384): Measure if this check is actually an optimization even in the |
752 | // non-supplementary case of if this should go inside the supplementary |
753 | // `if` below. |
754 | if u32::from(c) < self.decomposition_passthrough_bound { |
755 | return Some(CharacterAndTrieValue::new(c, 0)); |
756 | } |
757 | |
758 | if let Some(supplementary) = self.supplementary_trie { |
759 | if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) { |
760 | if value.trie_val == IGNORABLE_MARKER { |
761 | match self.ignorable_behavior { |
762 | IgnorableBehavior::Unsupported => { |
763 | debug_assert!(false); |
764 | } |
765 | IgnorableBehavior::ReplacementCharacter => { |
766 | return Some(CharacterAndTrieValue::new( |
767 | c, |
768 | u32::from(REPLACEMENT_CHARACTER), |
769 | )); |
770 | } |
771 | IgnorableBehavior::Ignored => { |
772 | // Else ignore this character by reading the next one from the delegate. |
773 | continue; |
774 | } |
775 | } |
776 | } |
777 | return Some(value); |
778 | } |
779 | } |
780 | let trie_val = self.trie.get(c); |
781 | debug_assert_ne!(trie_val, IGNORABLE_MARKER); |
782 | return Some(CharacterAndTrieValue::new(c, trie_val)); |
783 | } |
784 | } |
785 | |
786 | fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> { |
787 | if let Some(pending) = self.pending.take() { |
788 | // Only happens as part of `Composition` and as part of |
789 | // the contiguous-buffer methods of `DecomposingNormalizer`. |
790 | // I.e. does not happen as part of standalone iterator |
791 | // usage of `Decomposition`. |
792 | Some(pending) |
793 | } else { |
794 | self.delegate_next_no_pending() |
795 | } |
796 | } |
797 | |
798 | fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char { |
799 | let (starter, combining_start) = { |
800 | let c = c_and_trie_val.character; |
801 | let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec |
802 | if hangul_offset >= HANGUL_S_COUNT { |
803 | let decomposition = c_and_trie_val.trie_val; |
804 | if decomposition <= BACKWARD_COMBINING_STARTER_MARKER { |
805 | // The character is its own decomposition |
806 | (c, 0) |
807 | } else { |
808 | let trail_or_complex = (decomposition >> 16) as u16; |
809 | let lead = decomposition as u16; |
810 | if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 { |
811 | // Decomposition into two BMP characters: starter and non-starter |
812 | let starter = char_from_u16(lead); |
813 | let combining = char_from_u16(trail_or_complex); |
814 | self.buffer |
815 | .push(CharacterAndClass::new_with_placeholder(combining)); |
816 | (starter, 0) |
817 | } else if lead > NON_ROUND_TRIP_MARKER { |
818 | if lead != FDFA_MARKER { |
819 | debug_assert_ne!( |
820 | lead, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16, |
821 | "Should not reach this point with non-starter marker" |
822 | ); |
823 | // Decomposition into one BMP character |
824 | let starter = char_from_u16(lead); |
825 | (starter, 0) |
826 | } else { |
827 | // Special case for the NFKD form of U+FDFA. |
828 | self.buffer.extend(FDFA_NFKD.map(|u| { |
829 | // Safe, because `FDFA_NFKD` is known not to contain |
830 | // surrogates. |
831 | CharacterAndClass::new_starter(unsafe { |
832 | core::char::from_u32_unchecked(u32::from(u)) |
833 | }) |
834 | })); |
835 | (' \u{0635}' , 17) |
836 | } |
837 | } else { |
838 | // Complex decomposition |
839 | // Format for 16-bit value: |
840 | // 15..13: length minus two for 16-bit case and length minus one for |
841 | // the 32-bit case. Length 8 needs to fit in three bits in |
842 | // the 16-bit case, and this way the value is future-proofed |
843 | // up to 9 in the 16-bit case. Zero is unused and length one |
844 | // in the 16-bit case goes directly into the trie. |
845 | // 12: 1 if all trailing characters are guaranteed non-starters, |
846 | // 0 if no guarantees about non-starterness. |
847 | // Note: The bit choice is this way around to allow for |
848 | // dynamically falling back to not having this but instead |
849 | // having one more bit for length by merely choosing |
850 | // different masks. |
851 | // 11..0: Start offset in storage. The offset is to the logical |
852 | // sequence of scalars16, scalars32, supplementary_scalars16, |
853 | // supplementary_scalars32. |
854 | let offset = usize::from(trail_or_complex & 0xFFF); |
855 | if offset < self.scalars16.len() { |
856 | self.push_decomposition16(trail_or_complex, offset, self.scalars16) |
857 | } else if offset < self.scalars16.len() + self.scalars24.len() { |
858 | self.push_decomposition32( |
859 | trail_or_complex, |
860 | offset - self.scalars16.len(), |
861 | self.scalars24, |
862 | ) |
863 | } else if offset |
864 | < self.scalars16.len() |
865 | + self.scalars24.len() |
866 | + self.supplementary_scalars16.len() |
867 | { |
868 | self.push_decomposition16( |
869 | trail_or_complex, |
870 | offset - (self.scalars16.len() + self.scalars24.len()), |
871 | self.supplementary_scalars16, |
872 | ) |
873 | } else { |
874 | self.push_decomposition32( |
875 | trail_or_complex, |
876 | offset |
877 | - (self.scalars16.len() |
878 | + self.scalars24.len() |
879 | + self.supplementary_scalars16.len()), |
880 | self.supplementary_scalars24, |
881 | ) |
882 | } |
883 | } |
884 | } |
885 | } else { |
886 | // Hangul syllable |
887 | // The math here comes from page 144 of Unicode 14.0 |
888 | let l = hangul_offset / HANGUL_N_COUNT; |
889 | let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT; |
890 | let t = hangul_offset % HANGUL_T_COUNT; |
891 | |
892 | // The unsafe blocks here are OK, because the values stay |
893 | // within the Hangul jamo block and, therefore, the scalar |
894 | // value range by construction. |
895 | self.buffer.push(CharacterAndClass::new_starter(unsafe { |
896 | core::char::from_u32_unchecked(HANGUL_V_BASE + v) |
897 | })); |
898 | let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) }; |
899 | if t != 0 { |
900 | self.buffer.push(CharacterAndClass::new_starter(unsafe { |
901 | core::char::from_u32_unchecked(HANGUL_T_BASE + t) |
902 | })); |
903 | (first, 2) |
904 | } else { |
905 | (first, 1) |
906 | } |
907 | } |
908 | }; |
909 | // Either we're inside `Composition` or `self.pending.is_none()`. |
910 | |
911 | self.gather_and_sort_combining(combining_start); |
912 | starter |
913 | } |
914 | |
915 | fn gather_and_sort_combining(&mut self, combining_start: usize) { |
916 | // Not a `for` loop to avoid holding a mutable reference to `self` across |
917 | // the loop body. |
918 | while let Some(ch_and_trie_val) = self.delegate_next() { |
919 | if trie_value_has_ccc(ch_and_trie_val.trie_val) { |
920 | self.buffer |
921 | .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val)); |
922 | } else if trie_value_indicates_special_non_starter_decomposition( |
923 | ch_and_trie_val.trie_val, |
924 | ) { |
925 | // The Tibetan special cases are starters that decompose into non-starters. |
926 | let mapped = match ch_and_trie_val.character { |
927 | ' \u{0340}' => { |
928 | // COMBINING GRAVE TONE MARK |
929 | CharacterAndClass::new(' \u{0300}' , CanonicalCombiningClass::Above) |
930 | } |
931 | ' \u{0341}' => { |
932 | // COMBINING ACUTE TONE MARK |
933 | CharacterAndClass::new(' \u{0301}' , CanonicalCombiningClass::Above) |
934 | } |
935 | ' \u{0343}' => { |
936 | // COMBINING GREEK KORONIS |
937 | CharacterAndClass::new(' \u{0313}' , CanonicalCombiningClass::Above) |
938 | } |
939 | ' \u{0344}' => { |
940 | // COMBINING GREEK DIALYTIKA TONOS |
941 | self.buffer.push(CharacterAndClass::new( |
942 | ' \u{0308}' , |
943 | CanonicalCombiningClass::Above, |
944 | )); |
945 | CharacterAndClass::new(' \u{0301}' , CanonicalCombiningClass::Above) |
946 | } |
947 | ' \u{0F73}' => { |
948 | // TIBETAN VOWEL SIGN II |
949 | self.buffer.push(CharacterAndClass::new( |
950 | ' \u{0F71}' , |
951 | CanonicalCombiningClass::CCC129, |
952 | )); |
953 | CharacterAndClass::new(' \u{0F72}' , CanonicalCombiningClass::CCC130) |
954 | } |
955 | ' \u{0F75}' => { |
956 | // TIBETAN VOWEL SIGN UU |
957 | self.buffer.push(CharacterAndClass::new( |
958 | ' \u{0F71}' , |
959 | CanonicalCombiningClass::CCC129, |
960 | )); |
961 | CharacterAndClass::new(' \u{0F74}' , CanonicalCombiningClass::CCC132) |
962 | } |
963 | ' \u{0F81}' => { |
964 | // TIBETAN VOWEL SIGN REVERSED II |
965 | self.buffer.push(CharacterAndClass::new( |
966 | ' \u{0F71}' , |
967 | CanonicalCombiningClass::CCC129, |
968 | )); |
969 | CharacterAndClass::new(' \u{0F80}' , CanonicalCombiningClass::CCC130) |
970 | } |
971 | _ => { |
972 | // GIGO case |
973 | debug_assert!(false); |
974 | CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER) |
975 | } |
976 | }; |
977 | self.buffer.push(mapped); |
978 | } else { |
979 | self.pending = Some(ch_and_trie_val); |
980 | break; |
981 | } |
982 | } |
983 | // Slicing succeeds by construction; we've always ensured that `combining_start` |
984 | // is in permissible range. |
985 | #[allow (clippy::indexing_slicing)] |
986 | sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie); |
987 | } |
988 | } |
989 | |
990 | impl<'data, I> Iterator for Decomposition<'data, I> |
991 | where |
992 | I: Iterator<Item = char>, |
993 | { |
994 | type Item = char; |
995 | |
996 | fn next(&mut self) -> Option<char> { |
997 | if let Some(ret: char) = self.buffer.get(self.buffer_pos).map(|c: &CharacterAndClass| c.character()) { |
998 | self.buffer_pos += 1; |
999 | if self.buffer_pos == self.buffer.len() { |
1000 | self.buffer.clear(); |
1001 | self.buffer_pos = 0; |
1002 | } |
1003 | return Some(ret); |
1004 | } |
1005 | debug_assert_eq!(self.buffer_pos, 0); |
1006 | let c_and_trie_val: CharacterAndTrieValue = self.pending.take()?; |
1007 | Some(self.decomposing_next(c_and_trie_val)) |
1008 | } |
1009 | } |
1010 | |
/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed and then canonically composed `char` sequence.
#[derive (Debug)]
pub struct Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    /// The decomposing part of the normalizer that operates before
    /// the canonical composition is performed on its output.
    decomposition: Decomposition<'data, I>,
    /// Non-Hangul canonical composition data.
    canonical_compositions: Char16Trie<'data>,
    /// To make `next()` yield in cases where there's a non-composing
    /// starter in the decomposition buffer, we put it here to let it
    /// wait for the next `next()` call (or a jump forward within the
    /// `next()` call).
    unprocessed_starter: Option<char>,
    /// The lowest character for which any one of the following does
    /// not hold:
    /// 1. Roundtrips via decomposition and recomposition.
    /// 2. Decomposition starts with a non-starter.
    /// 3. Is not a backward-combining starter.
    composition_passthrough_bound: u32,
}
1035 | |
1036 | impl<'data, I> Composition<'data, I> |
1037 | where |
1038 | I: Iterator<Item = char>, |
1039 | { |
1040 | fn new( |
1041 | decomposition: Decomposition<'data, I>, |
1042 | canonical_compositions: Char16Trie<'data>, |
1043 | composition_passthrough_bound: u16, |
1044 | ) -> Self { |
1045 | Self { |
1046 | decomposition, |
1047 | canonical_compositions, |
1048 | unprocessed_starter: None, |
1049 | composition_passthrough_bound: u32::from(composition_passthrough_bound), |
1050 | } |
1051 | } |
1052 | |
1053 | /// Performs canonical composition (including Hangul) on a pair of |
1054 | /// characters or returns `None` if these characters don't compose. |
1055 | /// Composition exclusions are taken into account. |
1056 | #[inline (always)] |
1057 | pub fn compose(&self, starter: char, second: char) -> Option<char> { |
1058 | compose(self.canonical_compositions.iter(), starter, second) |
1059 | } |
1060 | |
1061 | /// Performs (non-Hangul) canonical composition on a pair of characters |
1062 | /// or returns `None` if these characters don't compose. Composition |
1063 | /// exclusions are taken into account. |
1064 | #[inline (always)] |
1065 | fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> { |
1066 | compose_non_hangul(self.canonical_compositions.iter(), starter, second) |
1067 | } |
1068 | } |
1069 | |
1070 | impl<'data, I> Iterator for Composition<'data, I> |
1071 | where |
1072 | I: Iterator<Item = char>, |
1073 | { |
1074 | type Item = char; |
1075 | |
1076 | #[inline ] |
1077 | fn next(&mut self) -> Option<char> { |
1078 | let mut undecomposed_starter = CharacterAndTrieValue::new(' \u{0}' , 0); // The compiler can't figure out that this gets overwritten before use. |
1079 | if self.unprocessed_starter.is_none() { |
1080 | // The loop is only broken out of as goto forward |
1081 | #[allow (clippy::never_loop)] |
1082 | loop { |
1083 | if let Some((character, ccc)) = self |
1084 | .decomposition |
1085 | .buffer |
1086 | .get(self.decomposition.buffer_pos) |
1087 | .map(|c| c.character_and_ccc()) |
1088 | { |
1089 | self.decomposition.buffer_pos += 1; |
1090 | if self.decomposition.buffer_pos == self.decomposition.buffer.len() { |
1091 | self.decomposition.buffer.clear(); |
1092 | self.decomposition.buffer_pos = 0; |
1093 | } |
1094 | if ccc == CanonicalCombiningClass::NotReordered { |
1095 | // Previous decomposition contains a starter. This must |
1096 | // now become the `unprocessed_starter` for it to have |
1097 | // a chance to compose with the upcoming characters. |
1098 | // |
1099 | // E.g. parenthesized Hangul in NFKC comes through here, |
1100 | // but suitable composition exclusion could exercise this |
1101 | // in NFC. |
1102 | self.unprocessed_starter = Some(character); |
1103 | break; // We already have a starter, so skip taking one from `pending`. |
1104 | } |
1105 | return Some(character); |
1106 | } |
1107 | debug_assert_eq!(self.decomposition.buffer_pos, 0); |
1108 | undecomposed_starter = self.decomposition.pending.take()?; |
1109 | if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound |
1110 | || undecomposed_starter.potential_passthrough() |
1111 | { |
1112 | // TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming |
1113 | // character is not below `decomposition_passthrough_bound` but is |
1114 | // below `composition_passthrough_bound`, we read from the trie |
1115 | // unnecessarily. |
1116 | if let Some(upcoming) = self.decomposition.delegate_next_no_pending() { |
1117 | let cannot_combine_backwards = u32::from(upcoming.character) |
1118 | < self.composition_passthrough_bound |
1119 | || !upcoming.can_combine_backwards(); |
1120 | self.decomposition.pending = Some(upcoming); |
1121 | if cannot_combine_backwards { |
1122 | // Fast-track succeeded! |
1123 | return Some(undecomposed_starter.character); |
1124 | } |
1125 | } else { |
1126 | // End of stream |
1127 | return Some(undecomposed_starter.character); |
1128 | } |
1129 | } |
1130 | break; // Not actually looping |
1131 | } |
1132 | } |
1133 | let mut starter = ' \u{0}' ; // The compiler can't figure out this gets overwritten before use. |
1134 | |
1135 | // The point of having this boolean is to have only one call site to |
1136 | // `self.decomposition.decomposing_next`, which is hopefully beneficial for |
1137 | // code size under inlining. |
1138 | let mut attempt_composition = false; |
1139 | loop { |
1140 | if let Some(unprocessed) = self.unprocessed_starter.take() { |
1141 | debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new(' \u{0}' , 0)); |
1142 | debug_assert_eq!(starter, ' \u{0}' ); |
1143 | starter = unprocessed; |
1144 | } else { |
1145 | debug_assert_eq!(self.decomposition.buffer_pos, 0); |
1146 | let next_starter = self.decomposition.decomposing_next(undecomposed_starter); |
1147 | if !attempt_composition { |
1148 | starter = next_starter; |
1149 | } else if let Some(composed) = self.compose(starter, next_starter) { |
1150 | starter = composed; |
1151 | } else { |
1152 | // This is our yield point. We'll pick this up above in the |
1153 | // next call to `next()`. |
1154 | self.unprocessed_starter = Some(next_starter); |
1155 | return Some(starter); |
1156 | } |
1157 | } |
1158 | // We first loop by index to avoid moving the contents of `buffer`, but |
1159 | // if there's a discontiguous match, we'll start modifying `buffer` instead. |
1160 | loop { |
1161 | let (character, ccc) = if let Some((character, ccc)) = self |
1162 | .decomposition |
1163 | .buffer |
1164 | .get(self.decomposition.buffer_pos) |
1165 | .map(|c| c.character_and_ccc()) |
1166 | { |
1167 | (character, ccc) |
1168 | } else { |
1169 | self.decomposition.buffer.clear(); |
1170 | self.decomposition.buffer_pos = 0; |
1171 | break; |
1172 | }; |
1173 | if let Some(composed) = self.compose(starter, character) { |
1174 | starter = composed; |
1175 | self.decomposition.buffer_pos += 1; |
1176 | continue; |
1177 | } |
1178 | let mut most_recent_skipped_ccc = ccc; |
1179 | { |
1180 | let _ = self |
1181 | .decomposition |
1182 | .buffer |
1183 | .drain(0..self.decomposition.buffer_pos); |
1184 | } |
1185 | self.decomposition.buffer_pos = 0; |
1186 | if most_recent_skipped_ccc == CanonicalCombiningClass::NotReordered { |
1187 | // We failed to compose a starter. Discontiguous match not allowed. |
1188 | // We leave the starter in `buffer` for `next()` to find. |
1189 | return Some(starter); |
1190 | } |
1191 | let mut i = 1; // We have skipped one non-starter. |
1192 | while let Some((character, ccc)) = self |
1193 | .decomposition |
1194 | .buffer |
1195 | .get(i) |
1196 | .map(|c| c.character_and_ccc()) |
1197 | { |
1198 | if ccc == CanonicalCombiningClass::NotReordered { |
1199 | // Discontiguous match not allowed. |
1200 | return Some(starter); |
1201 | } |
1202 | debug_assert!(ccc >= most_recent_skipped_ccc); |
1203 | if ccc != most_recent_skipped_ccc { |
1204 | // Using the non-Hangul version as a micro-optimization, since |
1205 | // we already rejected the case where `second` is a starter |
1206 | // above, and conjoining jamo are starters. |
1207 | if let Some(composed) = self.compose_non_hangul(starter, character) { |
1208 | self.decomposition.buffer.remove(i); |
1209 | starter = composed; |
1210 | continue; |
1211 | } |
1212 | } |
1213 | most_recent_skipped_ccc = ccc; |
1214 | i += 1; |
1215 | } |
1216 | break; |
1217 | } |
1218 | |
1219 | debug_assert_eq!(self.decomposition.buffer_pos, 0); |
1220 | |
1221 | if !self.decomposition.buffer.is_empty() { |
1222 | return Some(starter); |
1223 | } |
1224 | // Now we need to check if composition with an upcoming starter is possible. |
1225 | #[allow (clippy::unwrap_used)] |
1226 | if self.decomposition.pending.is_some() { |
1227 | // We know that `pending_starter` decomposes to start with a starter. |
1228 | // Otherwise, it would have been moved to `self.decomposition.buffer` |
1229 | // by `self.decomposing_next()`. We do this set lookup here in order |
1230 | // to get an opportunity to go back to the fast track. |
1231 | // Note that this check has to happen _after_ checking that `pending` |
1232 | // holds a character, because this flag isn't defined to be meaningful |
1233 | // when `pending` isn't holding a character. |
1234 | let pending = self.decomposition.pending.as_ref().unwrap(); |
1235 | if u32::from(pending.character) < self.composition_passthrough_bound |
1236 | || !pending.can_combine_backwards() |
1237 | { |
1238 | // Won't combine backwards anyway. |
1239 | return Some(starter); |
1240 | } |
1241 | // Consume what we peeked. `unwrap` OK, because we checked `is_some()` |
1242 | // above. |
1243 | undecomposed_starter = self.decomposition.pending.take().unwrap(); |
1244 | // The following line is OK, because we're about to loop back |
1245 | // to `self.decomposition.decomposing_next(c);`, which will |
1246 | // restore the between-`next()`-calls invariant of `pending` |
1247 | // before this function returns. |
1248 | attempt_composition = true; |
1249 | continue; |
1250 | } |
1251 | // End of input |
1252 | return Some(starter); |
1253 | } |
1254 | } |
1255 | } |
1256 | |
// Expands to a composing `normalize_to`-style method for one input encoding.
// `$write` is the sink trait, `$slice` the input slice type, and `$fast` an
// encoding-specific fast-path block supplied at the invocation site (not
// visible here); its contract is spelled out in the comment right above
// where it is expanded. The remaining identifier parameters name variables
// shared between this skeleton and the `$fast` block.
macro_rules! composing_normalize_to {
    ($(#[$meta:meta])*,
    $normalize_to:ident,
    $write:path,
    $slice:ty,
    $prolog:block,
    $always_valid_utf:literal,
    $as_slice:ident,
    $fast:block,
    $text:ident,
    $sink:ident,
    $composition:ident,
    $composition_passthrough_bound:ident,
    $undecomposed_starter:ident,
    $pending_slice:ident,
    $len_utf:ident,
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog
            let mut $composition = self.normalize_iter($text.chars());
            debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
            // Flush anything already sitting in the decomposition buffer.
            for cc in $composition.decomposition.buffer.drain(..) {
                $sink.write_char(cc.character())?;
            }

            // Try to get the compiler to hoist the bound to a register.
            let $composition_passthrough_bound = $composition.composition_passthrough_bound;
            'outer: loop {
                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                let mut $undecomposed_starter =
                    if let Some(pending) = $composition.decomposition.pending.take() {
                        pending
                    } else {
                        return Ok(());
                    };
                // Allowing indexed slicing, because a failure would be a code bug and
                // not a data issue.
                #[allow(clippy::indexing_slicing)]
                if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
                    $undecomposed_starter.potential_passthrough()
                {
                    // We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
                    // was returned in response to an error by the iterator. Assume the
                    // latter for correctness even though it pessimizes the former.
                    if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
                        let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
                        // The `$fast` block must either:
                        // 1. Return due to reaching EOF
                        // 2. Leave a starter with its trie value in `$undecomposed_starter`
                        //    and, if there is still more input, leave the next character
                        //    and its trie value in `$composition.decomposition.pending`.
                        $fast
                    }
                }
                // Fast track above, full algorithm below
                let mut starter = $composition
                    .decomposition
                    .decomposing_next($undecomposed_starter);
                'bufferloop: loop {
                    // We first loop by index to avoid moving the contents of `buffer`, but
                    // if there's a discontiguous match, we'll start modifying `buffer` instead.
                    loop {
                        let (character, ccc) = if let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get($composition.decomposition.buffer_pos)
                            .map(|c| c.character_and_ccc())
                        {
                            (character, ccc)
                        } else {
                            $composition.decomposition.buffer.clear();
                            $composition.decomposition.buffer_pos = 0;
                            break;
                        };
                        if let Some(composed) = $composition.compose(starter, character) {
                            starter = composed;
                            $composition.decomposition.buffer_pos += 1;
                            continue;
                        }
                        let mut most_recent_skipped_ccc = ccc;
                        if most_recent_skipped_ccc == CanonicalCombiningClass::NotReordered {
                            // We failed to compose a starter. Discontiguous match not allowed.
                            // Write the current `starter` we've been composing, make the unmatched
                            // starter in the buffer the new `starter` (we know it's been decomposed)
                            // and process the rest of the buffer with that as the starter.
                            $sink.write_char(starter)?;
                            starter = character;
                            $composition.decomposition.buffer_pos += 1;
                            continue 'bufferloop;
                        } else {
                            {
                                let _ = $composition
                                    .decomposition
                                    .buffer
                                    .drain(0..$composition.decomposition.buffer_pos);
                            }
                            $composition.decomposition.buffer_pos = 0;
                        }
                        let mut i = 1; // We have skipped one non-starter.
                        while let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get(i)
                            .map(|c| c.character_and_ccc())
                        {
                            if ccc == CanonicalCombiningClass::NotReordered {
                                // Discontiguous match not allowed.
                                $sink.write_char(starter)?;
                                for cc in $composition.decomposition.buffer.drain(..i) {
                                    $sink.write_char(cc.character())?;
                                }
                                starter = character;
                                {
                                    let removed = $composition.decomposition.buffer.remove(0);
                                    debug_assert_eq!(starter, removed.character());
                                }
                                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                                continue 'bufferloop;
                            }
                            debug_assert!(ccc >= most_recent_skipped_ccc);
                            if ccc != most_recent_skipped_ccc {
                                // Using the non-Hangul version as a micro-optimization, since
                                // we already rejected the case where `second` is a starter
                                // above, and conjoining jamo are starters.
                                if let Some(composed) =
                                    $composition.compose_non_hangul(starter, character)
                                {
                                    $composition.decomposition.buffer.remove(i);
                                    starter = composed;
                                    continue;
                                }
                            }
                            most_recent_skipped_ccc = ccc;
                            i += 1;
                        }
                        break;
                    }
                    debug_assert_eq!($composition.decomposition.buffer_pos, 0);

                    if !$composition.decomposition.buffer.is_empty() {
                        $sink.write_char(starter)?;
                        for cc in $composition.decomposition.buffer.drain(..) {
                            $sink.write_char(cc.character())?;
                        }
                        // We had non-empty buffer, so can't compose with upcoming.
                        continue 'outer;
                    }
                    // Now we need to check if composition with an upcoming starter is possible.
                    if $composition.decomposition.pending.is_some() {
                        // We know that `pending_starter` decomposes to start with a starter.
                        // Otherwise, it would have been moved to `composition.decomposition.buffer`
                        // by `composition.decomposing_next()`. We do this set lookup here in order
                        // to get an opportunity to go back to the fast track.
                        // Note that this check has to happen _after_ checking that `pending`
                        // holds a character, because this flag isn't defined to be meaningful
                        // when `pending` isn't holding a character.
                        let pending = $composition.decomposition.pending.as_ref().unwrap();
                        if u32::from(pending.character) < $composition.composition_passthrough_bound
                            || !pending.can_combine_backwards()
                        {
                            // Won't combine backwards anyway.
                            $sink.write_char(starter)?;
                            continue 'outer;
                        }
                        let pending_starter = $composition.decomposition.pending.take().unwrap();
                        let decomposed = $composition.decomposition.decomposing_next(pending_starter);
                        if let Some(composed) = $composition.compose(starter, decomposed) {
                            starter = composed;
                        } else {
                            $sink.write_char(starter)?;
                            starter = decomposed;
                        }
                        continue 'bufferloop;
                    }
                    // End of input
                    $sink.write_char(starter)?;
                    return Ok(());
                } // 'bufferloop
            }
        }
    };
}
1444 | |
// Expands to a decomposing `normalize_to`-style method for one input
// encoding. `$fast` is an encoding-specific fast-path block supplied at the
// invocation site (not visible here) that operates on `$pending_slice`;
// `$outer` is the label of the main loop, passed as a lifetime token so the
// `$fast` block can `continue` it.
macro_rules! decomposing_normalize_to {
    ($(#[$meta:meta])*,
    $normalize_to:ident,
    $write:path,
    $slice:ty,
    $prolog:block,
    $as_slice:ident,
    $fast:block,
    $text:ident,
    $sink:ident,
    $decomposition:ident,
    $decomposition_passthrough_bound:ident,
    $undecomposed_starter:ident,
    $pending_slice:ident,
    $outer:lifetime, // loop labels use lifetime tokens
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog

            let mut $decomposition = self.normalize_iter($text.chars());
            debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);

            // Try to get the compiler to hoist the bound to a register.
            let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
            $outer: loop {
                // Flush the buffered tail of the previous decomposition.
                for cc in $decomposition.buffer.drain(..) {
                    $sink.write_char(cc.character())?;
                }
                debug_assert_eq!($decomposition.buffer_pos, 0);
                let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
                    pending
                } else {
                    // All input consumed and the buffer flushed: done.
                    return Ok(());
                };
                // Allowing indexed slicing, because a failure would be a code bug and
                // not a data issue.
                #[allow(clippy::indexing_slicing)]
                if $undecomposed_starter.starter_and_decomposes_to_self() {
                    // Don't bother including `undecomposed_starter` in a contiguous buffer
                    // write: Just write it right away:
                    $sink.write_char($undecomposed_starter.character)?;

                    let $pending_slice = $decomposition.delegate.$as_slice();
                    $fast
                }
                let starter = $decomposition.decomposing_next($undecomposed_starter);
                $sink.write_char(starter)?;
            }
        }
    };
}
1501 | |
// Expands to the public convenience methods shared by the normalizer types,
// implemented in terms of the `*_to` sink-based methods.
//
// Fix: replaced the `String::new()` + `reserve(n)` pairs with the idiomatic
// single-allocation `String::with_capacity(n)`.
macro_rules! normalizer_methods {
    () => {
        /// Normalize a string slice into a `String`.
        pub fn normalize(&self, text: &str) -> String {
            // Use the input length as the initial capacity.
            let mut ret = String::with_capacity(text.len());
            let _ = self.normalize_to(text, &mut ret);
            ret
        }

        /// Check whether a string slice is normalized.
        pub fn is_normalized(&self, text: &str) -> bool {
            let mut sink = IsNormalizedSinkStr::new(text);
            // A write error from the sink is treated as "not normalized".
            if self.normalize_to(text, &mut sink).is_err() {
                return false;
            }
            sink.finished()
        }

        /// Normalize a slice of potentially-invalid UTF-16 into a `Vec`.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        pub fn normalize_utf16(&self, text: &[u16]) -> Vec<u16> {
            let mut ret = Vec::new();
            let _ = self.normalize_utf16_to(text, &mut ret);
            ret
        }

        /// Checks whether a slice of potentially-invalid UTF-16 is normalized.
        ///
        /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
        pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
            let mut sink = IsNormalizedSinkUtf16::new(text);
            // A write error from the sink is treated as "not normalized".
            if self.normalize_utf16_to(text, &mut sink).is_err() {
                return false;
            }
            sink.finished()
        }

        /// Normalize a slice of potentially-invalid UTF-8 into a `String`.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        pub fn normalize_utf8(&self, text: &[u8]) -> String {
            // Use the input length as the initial capacity.
            let mut ret = String::with_capacity(text.len());
            let _ = self.normalize_utf8_to(text, &mut ret);
            ret
        }

        /// Check if a slice of potentially-invalid UTF-8 is normalized.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard before checking.
        pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
            let mut sink = IsNormalizedSinkUtf8::new(text);
            // A write error from the sink is treated as "not normalized".
            if self.normalize_utf8_to(text, &mut sink).is_err() {
                return false;
            }
            sink.finished()
        }
    };
}
1566 | |
/// A normalizer for performing decomposing normalization.
#[derive (Debug)]
pub struct DecomposingNormalizer {
    /// Canonical decomposition data.
    decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
    /// Supplementary (e.g. compatibility) decomposition data; `None` for
    /// plain NFD (see `new_nfd`).
    supplementary_decompositions: Option<SupplementPayloadHolder>,
    /// Storage tables for complex canonical decompositions.
    tables: DataPayload<CanonicalDecompositionTablesV1Marker>,
    /// Storage tables for complex compatibility decompositions; `None` for
    /// plain NFD.
    supplementary_tables: Option<DataPayload<CompatibilityDecompositionTablesV1Marker>>,
    decomposition_passthrough_bound: u8, // never above 0xC0
    composition_passthrough_bound: u16, // never above 0x0300
}
1577 | |
1578 | impl DecomposingNormalizer { |
    /// NFD constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg (feature = "compiled_data" )]
    pub const fn new_nfd() -> Self {
        // Compile-time counterpart of the validation that the runtime
        // constructors report as `NormalizerError::FutureExtension`: the
        // combined scalar tables must fit within the 12-bit (0xFFF) offset
        // space used by the complex-decomposition format.
        const _: () = assert!(
            crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
                .scalars16
                .const_len()
                + crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
                    .scalars24
                    .const_len()
                <= 0xFFF,
            "NormalizerError::FutureExtension"
        );

        DecomposingNormalizer {
            decompositions: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
            ),
            // Plain NFD: no supplementary (compatibility) data.
            supplementary_decompositions: None,
            tables: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
            ),
            supplementary_tables: None,
            // Maximum permissible bounds (see the struct field comments).
            decomposition_passthrough_bound: 0xC0,
            composition_passthrough_bound: 0x0300,
        }
    }
1610 | |
    // Expands to the provider-constructor boilerplate for the NFD flavor:
    // `try_new_nfd_with_any_provider` / `try_new_nfd_with_buffer_provider`,
    // wired to `new_nfd` (compiled data) and `try_new_nfd_unstable`.
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: NormalizerError,
        #[cfg (skip)]
        functions: [
            new_nfd,
            try_new_nfd_with_any_provider,
            try_new_nfd_with_buffer_provider,
            try_new_nfd_unstable,
            Self,
        ]
    );
1624 | |
1625 | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)] |
1626 | pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, NormalizerError> |
1627 | where |
1628 | D: DataProvider<CanonicalDecompositionDataV1Marker> |
1629 | + DataProvider<CanonicalDecompositionTablesV1Marker> |
1630 | + ?Sized, |
1631 | { |
1632 | let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> = |
1633 | provider.load(Default::default())?.take_payload()?; |
1634 | let tables: DataPayload<CanonicalDecompositionTablesV1Marker> = |
1635 | provider.load(Default::default())?.take_payload()?; |
1636 | |
1637 | if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF { |
1638 | // The data is from a future where there exists a normalization flavor whose |
1639 | // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points |
1640 | // of space. If a good use case from such a decomposition flavor arises, we can |
1641 | // dynamically change the bit masks so that the length mask becomes 0x1FFF instead |
1642 | // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However, |
1643 | // since for now the masks are hard-coded, error out. |
1644 | return Err(NormalizerError::FutureExtension); |
1645 | } |
1646 | |
1647 | Ok(DecomposingNormalizer { |
1648 | decompositions, |
1649 | supplementary_decompositions: None, |
1650 | tables, |
1651 | supplementary_tables: None, |
1652 | decomposition_passthrough_bound: 0xC0, |
1653 | composition_passthrough_bound: 0x0300, |
1654 | }) |
1655 | } |
1656 | |
    /// NFKD constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg (feature = "compiled_data" )]
    pub const fn new_nfkd() -> Self {
        // Compile-time counterpart of the `FutureExtension` check in
        // `try_new_nfkd_unstable`: canonical and compatibility complex
        // decompositions together must fit in the 0xFFF space addressable
        // by the hard-coded bit masks.
        const _: () = assert!(
            crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
                .scalars16
                .const_len()
                + crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
                    .scalars24
                    .const_len()
                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
                    .scalars16
                    .const_len()
                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
                    .scalars24
                    .const_len()
                <= 0xFFF,
            "NormalizerError::FutureExtension"
        );

        // Compile-time counterpart of the `ValidationError` check in
        // `try_new_nfkd_unstable`: the passthrough cap must not exceed U+0300.
        const _: () = assert!(
            crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap <= 0x0300,
            "NormalizerError::ValidationError"
        );

        // Explicit comparisons instead of `min`, which is not callable in
        // const context; these mirror `cap.min(0xC0)` / `cap.min(0x0300)`
        // in `try_new_nfkd_unstable`.
        let decomposition_capped =
            if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap < 0xC0 {
                crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap
            } else {
                0xC0
            };
        let composition_capped =
            if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap < 0x0300 {
                crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap
            } else {
                0x0300
            };

        DecomposingNormalizer {
            decompositions: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
            ),
            supplementary_decompositions: Some(SupplementPayloadHolder::Compatibility(
                DataPayload::from_static_ref(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1),
            )),
            tables: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
            ),
            supplementary_tables: Some(DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1,
            )),
            decomposition_passthrough_bound: decomposition_capped as u8,
            composition_passthrough_bound: composition_capped,
        }
    }
1716 | |
    // Expands to the provider-constructor boilerplate for the NFKD flavor:
    // `try_new_nfkd_with_any_provider` / `try_new_nfkd_with_buffer_provider`,
    // wired to `new_nfkd` (compiled data) and `try_new_nfkd_unstable`.
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: NormalizerError,
        #[cfg (skip)]
        functions: [
            new_nfkd,
            try_new_nfkd_with_any_provider,
            try_new_nfkd_with_buffer_provider,
            try_new_nfkd_unstable,
            Self,
        ]
    );
1730 | |
1731 | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)] |
1732 | pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, NormalizerError> |
1733 | where |
1734 | D: DataProvider<CanonicalDecompositionDataV1Marker> |
1735 | + DataProvider<CompatibilityDecompositionSupplementV1Marker> |
1736 | + DataProvider<CanonicalDecompositionTablesV1Marker> |
1737 | + DataProvider<CompatibilityDecompositionTablesV1Marker> |
1738 | + ?Sized, |
1739 | { |
1740 | let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> = |
1741 | provider.load(Default::default())?.take_payload()?; |
1742 | let supplementary_decompositions: DataPayload< |
1743 | CompatibilityDecompositionSupplementV1Marker, |
1744 | > = provider.load(Default::default())?.take_payload()?; |
1745 | let tables: DataPayload<CanonicalDecompositionTablesV1Marker> = |
1746 | provider.load(Default::default())?.take_payload()?; |
1747 | let supplementary_tables: DataPayload<CompatibilityDecompositionTablesV1Marker> = |
1748 | provider.load(Default::default())?.take_payload()?; |
1749 | |
1750 | if tables.get().scalars16.len() |
1751 | + tables.get().scalars24.len() |
1752 | + supplementary_tables.get().scalars16.len() |
1753 | + supplementary_tables.get().scalars24.len() |
1754 | > 0xFFF |
1755 | { |
1756 | // The data is from a future where there exists a normalization flavor whose |
1757 | // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points |
1758 | // of space. If a good use case from such a decomposition flavor arises, we can |
1759 | // dynamically change the bit masks so that the length mask becomes 0x1FFF instead |
1760 | // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However, |
1761 | // since for now the masks are hard-coded, error out. |
1762 | return Err(NormalizerError::FutureExtension); |
1763 | } |
1764 | |
1765 | let cap = supplementary_decompositions.get().passthrough_cap; |
1766 | if cap > 0x0300 { |
1767 | return Err(NormalizerError::ValidationError); |
1768 | } |
1769 | let decomposition_capped = cap.min(0xC0); |
1770 | let composition_capped = cap.min(0x0300); |
1771 | |
1772 | Ok(DecomposingNormalizer { |
1773 | decompositions, |
1774 | supplementary_decompositions: Some(SupplementPayloadHolder::Compatibility( |
1775 | supplementary_decompositions, |
1776 | )), |
1777 | tables, |
1778 | supplementary_tables: Some(supplementary_tables), |
1779 | decomposition_passthrough_bound: decomposition_capped as u8, |
1780 | composition_passthrough_bound: composition_capped, |
1781 | }) |
1782 | } |
1783 | |
    // UTS 46 decomposed constructor using compiled data; see
    // `try_new_uts46_decomposed_unstable` for the full contract.
    #[doc (hidden)]
    #[cfg (feature = "compiled_data" )]
    pub(crate) const fn new_uts46_decomposed() -> Self {
        // Compile-time counterpart of the `FutureExtension` check in
        // `try_new_uts46_decomposed_unstable`. The UTS 46 flavor reuses the
        // NFKD expansion tables, so the same combined-length bound applies.
        const _: () = assert!(
            crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
                .scalars16
                .const_len()
                + crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
                    .scalars24
                    .const_len()
                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
                    .scalars16
                    .const_len()
                + crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
                    .scalars24
                    .const_len()
                <= 0xFFF,
            "NormalizerError::FutureExtension"
        );

        // Compile-time counterpart of the `ValidationError` check: the
        // passthrough cap must not exceed U+0300.
        const _: () = assert!(
            crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap <= 0x0300,
            "NormalizerError::ValidationError"
        );

        // Explicit comparisons instead of `min`, which is not callable in
        // const context.
        let decomposition_capped =
            if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap < 0xC0 {
                crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap
            } else {
                0xC0
            };
        let composition_capped =
            if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap < 0x0300 {
                crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap
            } else {
                0x0300
            };

        DecomposingNormalizer {
            decompositions: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
            ),
            supplementary_decompositions: Some(SupplementPayloadHolder::Uts46(
                DataPayload::from_static_ref(
                    crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1,
                ),
            )),
            tables: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
            ),
            supplementary_tables: Some(DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1,
            )),
            decomposition_passthrough_bound: decomposition_capped as u8,
            composition_passthrough_bound: composition_capped,
        }
    }
1841 | |
1842 | /// UTS 46 decomposed constructor (testing only) |
1843 | /// |
1844 | /// This is a special building block normalization for IDNA. It is the decomposed counterpart of |
1845 | /// ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows and |
1846 | /// ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as in |
1847 | /// NFD in this normalization. In both cases, the previous UTS 46 processing before using |
1848 | /// normalization is expected to deal with these characters. Making the disallowed characters |
1849 | /// behave like this is beneficial to data size, and this normalizer implementation cannot |
1850 | /// deal with a character normalizing to the empty string, which doesn't happen in NFD or |
1851 | /// NFKD as of Unicode 14. |
1852 | /// |
1853 | /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior |
1854 | /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns |
1855 | /// U+0345 from a reordered character into a non-reordered character before reordering happens. |
1856 | /// Therefore, the output of this normalization may differ for different inputs that are |
1857 | /// canonically equivalent with each other if they differ by how U+0345 is ordered relative |
1858 | /// to other reorderable characters. |
1859 | /// |
1860 | /// Public for testing only. |
1861 | #[doc (hidden)] |
1862 | pub(crate) fn try_new_uts46_decomposed_unstable<D>( |
1863 | provider: &D, |
1864 | ) -> Result<Self, NormalizerError> |
1865 | where |
1866 | D: DataProvider<CanonicalDecompositionDataV1Marker> |
1867 | + DataProvider<Uts46DecompositionSupplementV1Marker> |
1868 | + DataProvider<CanonicalDecompositionTablesV1Marker> |
1869 | + DataProvider<CompatibilityDecompositionTablesV1Marker> |
1870 | // UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker |
1871 | + ?Sized, |
1872 | { |
1873 | let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> = |
1874 | provider.load(Default::default())?.take_payload()?; |
1875 | let supplementary_decompositions: DataPayload<Uts46DecompositionSupplementV1Marker> = |
1876 | provider.load(Default::default())?.take_payload()?; |
1877 | let tables: DataPayload<CanonicalDecompositionTablesV1Marker> = |
1878 | provider.load(Default::default())?.take_payload()?; |
1879 | let supplementary_tables: DataPayload<CompatibilityDecompositionTablesV1Marker> = |
1880 | provider.load(Default::default())?.take_payload()?; |
1881 | |
1882 | if tables.get().scalars16.len() |
1883 | + tables.get().scalars24.len() |
1884 | + supplementary_tables.get().scalars16.len() |
1885 | + supplementary_tables.get().scalars24.len() |
1886 | > 0xFFF |
1887 | { |
1888 | // The data is from a future where there exists a normalization flavor whose |
1889 | // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points |
1890 | // of space. If a good use case from such a decomposition flavor arises, we can |
1891 | // dynamically change the bit masks so that the length mask becomes 0x1FFF instead |
1892 | // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However, |
1893 | // since for now the masks are hard-coded, error out. |
1894 | return Err(NormalizerError::FutureExtension); |
1895 | } |
1896 | |
1897 | let cap = supplementary_decompositions.get().passthrough_cap; |
1898 | if cap > 0x0300 { |
1899 | return Err(NormalizerError::ValidationError); |
1900 | } |
1901 | let decomposition_capped = cap.min(0xC0); |
1902 | let composition_capped = cap.min(0x0300); |
1903 | |
1904 | Ok(DecomposingNormalizer { |
1905 | decompositions, |
1906 | supplementary_decompositions: Some(SupplementPayloadHolder::Uts46( |
1907 | supplementary_decompositions, |
1908 | )), |
1909 | tables, |
1910 | supplementary_tables: Some(supplementary_tables), |
1911 | decomposition_passthrough_bound: decomposition_capped as u8, |
1912 | composition_passthrough_bound: composition_capped, |
1913 | }) |
1914 | } |
1915 | |
1916 | /// Wraps a delegate iterator into a decomposing iterator |
1917 | /// adapter by using the data already held by this normalizer. |
1918 | pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<I> { |
1919 | Decomposition::new_with_supplements( |
1920 | iter, |
1921 | self.decompositions.get(), |
1922 | self.supplementary_decompositions.as_ref().map(|s| s.get()), |
1923 | self.tables.get(), |
1924 | self.supplementary_tables.as_ref().map(|s| s.get()), |
1925 | self.decomposition_passthrough_bound, |
1926 | IgnorableBehavior::Unsupported, |
1927 | ) |
1928 | } |
1929 | |
    // Expands to the format-convenience methods (owned-string/UTF-8/UTF-16
    // normalization and `is_normalized*` checks) shared with
    // `ComposingNormalizer`.
    normalizer_methods!();
1931 | |
    // Expands to `normalize_to`: decomposing normalization from `&str` into a
    // `core::fmt::Write` sink. The braced block is the `&str`-specific fast
    // path spliced into the shared driver loop by the macro; the trailing
    // arguments name the variables/labels shared between the macro-generated
    // code and this block.
    decomposing_normalize_to!(
        /// Normalize a string slice into a `Write` sink.
        ,
        normalize_to,
        core::fmt::Write,
        &str,
        {
        },
        as_str,
        {
            let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
                0xC3u8
            } else {
                decomposition_passthrough_bound.min(0x80) as u8
            };
            // The attribute belongs on an inner statement, but Rust doesn't allow it there.
            #[allow (clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < decomposition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            continue 'fastest;
                        }
                        // Sync the char iterator back to just before the byte
                        // that fell off the byte-wise fast track.
                        decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
                        break 'fastest;
                    }
                    // End of stream
                    sink.write_str(pending_slice)?;
                    return Ok(());
                }

                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                let upcoming = decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.starter_and_decomposes_to_self() {
                    continue 'fast;
                }
                let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                    - decomposition.delegate.as_str().len()
                    - upcoming.len_utf8()];
                sink.write_str(consumed_so_far_slice)?;

                // Now let's figure out if we got a starter or a non-starter.
                if decomposition_starts_with_non_starter(
                    upcoming_with_trie_value.trie_val,
                ) {
                    // Let this trie value to be reprocessed in case it is
                    // one of the rare decomposing ones.
                    decomposition.pending = Some(upcoming_with_trie_value);
                    decomposition.gather_and_sort_combining(0);
                    continue 'outer;
                }
                undecomposed_starter = upcoming_with_trie_value;
                debug_assert!(decomposition.pending.is_none());
                break 'fast;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
2000 | |
    // Expands to `normalize_utf8_to`: decomposing normalization from
    // potentially ill-formed UTF-8 into a `core::fmt::Write` sink, with
    // ill-formed sequences mapped to U+FFFD. The braced block is the
    // `&[u8]`-specific fast path spliced into the shared driver loop.
    decomposing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ,
        normalize_utf8_to,
        core::fmt::Write,
        &[u8],
        {
        },
        as_slice,
        {
            let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8;
            // The attribute belongs on an inner statement, but Rust doesn't allow it there.
            #[allow (clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = decomposition.delegate.as_slice().iter();
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < decomposition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            continue 'fastest;
                        }
                        break 'fastest;
                    }
                    // End of stream
                    sink.write_str(unsafe { from_utf8_unchecked(pending_slice) })?;
                    return Ok(());
                }
                // Sync the char iterator back to just before the byte that
                // fell off the byte-wise fast track.
                decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();

                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                let upcoming = decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.starter_and_decomposes_to_self() {
                    if upcoming != REPLACEMENT_CHARACTER {
                        continue 'fast;
                    }
                    // We might have an error, so fall out of the fast path.

                    // Since the U+FFFD might signify an error, we can't
                    // assume `upcoming.len_utf8()` for the backoff length.
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
                    let back = consumed_so_far.next_back();
                    debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
                    let consumed_so_far_slice = consumed_so_far.as_slice();
                    sink.write_str(unsafe{from_utf8_unchecked(consumed_so_far_slice)})?;

                    // We could call `gather_and_sort_combining` here and
                    // `continue 'outer`, but this should be better for code
                    // size.
                    undecomposed_starter = upcoming_with_trie_value;
                    debug_assert!(decomposition.pending.is_none());
                    break 'fast;
                }
                let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                    - decomposition.delegate.as_slice().len()
                    - upcoming.len_utf8()];
                sink.write_str(unsafe{from_utf8_unchecked(consumed_so_far_slice)})?;

                // Now let's figure out if we got a starter or a non-starter.
                if decomposition_starts_with_non_starter(
                    upcoming_with_trie_value.trie_val,
                ) {
                    // Let this trie value to be reprocessed in case it is
                    // one of the rare decomposing ones.
                    decomposition.pending = Some(upcoming_with_trie_value);
                    decomposition.gather_and_sort_combining(0);
                    continue 'outer;
                }
                undecomposed_starter = upcoming_with_trie_value;
                debug_assert!(decomposition.pending.is_none());
                break 'fast;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
2086 | |
    // Expands to `normalize_utf16_to`: decomposing normalization from
    // potentially ill-formed UTF-16 into a `Write16` sink, with unpaired
    // surrogates mapped to U+FFFD. The braced block is the `&[u16]`-specific
    // fast path spliced into the shared driver loop.
    decomposing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ,
        normalize_utf16_to,
        write16::Write16,
        &[u16],
        {
            sink.size_hint(text.len())?;
        },
        as_slice,
        {
            let mut code_unit_iter = decomposition.delegate.as_slice().iter();
            // The purpose of the counter is to flush once in a while. If we flush
            // too much, there is too much flushing overhead. If we flush too rarely,
            // the flush starts reading from too far behind compared to the hot
            // recently-read memory.
            let mut counter = UTF16_FAST_PATH_FLUSH_THRESHOLD;
            'fast: loop {
                counter -= 1;
                if let Some(&upcoming_code_unit) = code_unit_iter.next() {
                    let mut upcoming32 = u32::from(upcoming_code_unit);
                    if upcoming32 < decomposition_passthrough_bound && counter != 0 {
                        continue 'fast;
                    }
                    // The loop is only broken out of as goto forward
                    #[allow (clippy::never_loop)]
                    'surrogateloop: loop {
                        let surrogate_base = upcoming32.wrapping_sub(0xD800);
                        if surrogate_base > (0xDFFF - 0xD800) {
                            // Not surrogate
                            break 'surrogateloop;
                        }
                        if surrogate_base <= (0xDBFF - 0xD800) {
                            // High surrogate: try to pair it with the next unit.
                            let iter_backup = code_unit_iter.clone();
                            if let Some(&low) = code_unit_iter.next() {
                                if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
                                    upcoming32 = (upcoming32 << 10) + u32::from(low)
                                        - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                                    break 'surrogateloop;
                                } else {
                                    code_unit_iter = iter_backup;
                                }
                            }
                        }
                        // unpaired surrogate
                        let slice_to_write = &pending_slice
                            [..pending_slice.len() - code_unit_iter.as_slice().len() - 1];
                        sink.write_slice(slice_to_write)?;
                        undecomposed_starter =
                            CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
                        debug_assert!(decomposition.pending.is_none());
                        // We could instead call `gather_and_sort_combining` and `continue 'outer`, but
                        // assuming this is better for code size.
                        break 'fast;
                    }
                    // Not unpaired surrogate
                    let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                    let upcoming_with_trie_value =
                        decomposition.attach_trie_value(upcoming);
                    if upcoming_with_trie_value.starter_and_decomposes_to_self() && counter != 0 {
                        continue 'fast;
                    }
                    let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                        - code_unit_iter.as_slice().len()
                        - upcoming.len_utf16()];
                    sink.write_slice(consumed_so_far_slice)?;

                    // Now let's figure out if we got a starter or a non-starter.
                    if decomposition_starts_with_non_starter(
                        upcoming_with_trie_value.trie_val,
                    ) {
                        // Sync with main iterator
                        decomposition.delegate = code_unit_iter.as_slice().chars();
                        // Let this trie value to be reprocessed in case it is
                        // one of the rare decomposing ones.
                        decomposition.pending = Some(upcoming_with_trie_value);
                        decomposition.gather_and_sort_combining(0);
                        continue 'outer;
                    }
                    undecomposed_starter = upcoming_with_trie_value;
                    debug_assert!(decomposition.pending.is_none());
                    break 'fast;
                }
                // End of stream
                sink.write_slice(pending_slice)?;
                return Ok(());
            }
            // Sync the main iterator
            decomposition.delegate = code_unit_iter.as_slice().chars();
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
2188 | } |
2189 | |
/// A normalizer for performing composing normalization.
#[derive (Debug)]
pub struct ComposingNormalizer {
    /// The underlying decomposing normalizer (NFD, NFKD, or UTS 46 flavored).
    decomposing_normalizer: DecomposingNormalizer,
    /// Canonical composition data applied after decomposition.
    canonical_compositions: DataPayload<CanonicalCompositionsV1Marker>,
}
2196 | |
2197 | impl ComposingNormalizer { |
    /// NFC constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg (feature = "compiled_data" )]
    pub const fn new_nfc() -> Self {
        ComposingNormalizer {
            // NFC builds on the NFD decomposer plus canonical composition data.
            decomposing_normalizer: DecomposingNormalizer::new_nfd(),
            canonical_compositions: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
            ),
        }
    }
2212 | |
    // Expands to the provider-constructor boilerplate for the NFC flavor:
    // `try_new_nfc_with_any_provider` / `try_new_nfc_with_buffer_provider`,
    // wired to `new_nfc` (compiled data) and `try_new_nfc_unstable`.
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: NormalizerError,
        #[cfg (skip)]
        functions: [
            new_nfc,
            try_new_nfc_with_any_provider,
            try_new_nfc_with_buffer_provider,
            try_new_nfc_unstable,
            Self,
        ]
    );
2226 | |
2227 | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)] |
2228 | pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, NormalizerError> |
2229 | where |
2230 | D: DataProvider<CanonicalDecompositionDataV1Marker> |
2231 | + DataProvider<CanonicalDecompositionTablesV1Marker> |
2232 | + DataProvider<CanonicalCompositionsV1Marker> |
2233 | + ?Sized, |
2234 | { |
2235 | let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?; |
2236 | |
2237 | let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> = |
2238 | provider.load(Default::default())?.take_payload()?; |
2239 | |
2240 | Ok(ComposingNormalizer { |
2241 | decomposing_normalizer, |
2242 | canonical_compositions, |
2243 | }) |
2244 | } |
2245 | |
    /// NFKC constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg (feature = "compiled_data" )]
    pub const fn new_nfkc() -> Self {
        ComposingNormalizer {
            // NFKC builds on the NFKD decomposer; the canonical composition
            // data is the same as for NFC.
            decomposing_normalizer: DecomposingNormalizer::new_nfkd(),
            canonical_compositions: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
            ),
        }
    }
2260 | |
    // Expands to the provider-constructor boilerplate for the NFKC flavor:
    // `try_new_nfkc_with_any_provider` / `try_new_nfkc_with_buffer_provider`,
    // wired to `new_nfkc` (compiled data) and `try_new_nfkc_unstable`.
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: NormalizerError,
        #[cfg (skip)]
        functions: [
            new_nfkc,
            try_new_nfkc_with_any_provider,
            try_new_nfkc_with_buffer_provider,
            try_new_nfkc_unstable,
            Self,
        ]
    );
2274 | |
2275 | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)] |
2276 | pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, NormalizerError> |
2277 | where |
2278 | D: DataProvider<CanonicalDecompositionDataV1Marker> |
2279 | + DataProvider<CompatibilityDecompositionSupplementV1Marker> |
2280 | + DataProvider<CanonicalDecompositionTablesV1Marker> |
2281 | + DataProvider<CompatibilityDecompositionTablesV1Marker> |
2282 | + DataProvider<CanonicalCompositionsV1Marker> |
2283 | + ?Sized, |
2284 | { |
2285 | let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?; |
2286 | |
2287 | let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> = |
2288 | provider.load(Default::default())?.take_payload()?; |
2289 | |
2290 | Ok(ComposingNormalizer { |
2291 | decomposing_normalizer, |
2292 | canonical_compositions, |
2293 | }) |
2294 | } |
2295 | |
    /// This is a special building block normalization for IDNA that implements parts of the Map
    /// step and the following Normalize step.
    ///
    /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
    /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
    /// U+0345 from a reordered character into a non-reordered character before reordering happens.
    /// Therefore, the output of this normalization may differ for different inputs that are
    /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
    /// to other reorderable characters.
    #[cfg (feature = "compiled_data" )]
    pub(crate) const fn new_uts46() -> Self {
        ComposingNormalizer {
            // Builds on the decomposed UTS 46 flavor; the canonical
            // composition data is the same as for NFC/NFKC.
            decomposing_normalizer: DecomposingNormalizer::new_uts46_decomposed(),
            canonical_compositions: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
            ),
        }
    }
2314 | |
2315 | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)] |
2316 | pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, NormalizerError> |
2317 | where |
2318 | D: DataProvider<CanonicalDecompositionDataV1Marker> |
2319 | + DataProvider<Uts46DecompositionSupplementV1Marker> |
2320 | + DataProvider<CanonicalDecompositionTablesV1Marker> |
2321 | + DataProvider<CompatibilityDecompositionTablesV1Marker> |
2322 | // UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker |
2323 | + DataProvider<CanonicalCompositionsV1Marker> |
2324 | + ?Sized, |
2325 | { |
2326 | let decomposing_normalizer = |
2327 | DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?; |
2328 | |
2329 | let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> = |
2330 | provider.load(Default::default())?.take_payload()?; |
2331 | |
2332 | Ok(ComposingNormalizer { |
2333 | decomposing_normalizer, |
2334 | canonical_compositions, |
2335 | }) |
2336 | } |
2337 | |
2338 | /// Wraps a delegate iterator into a composing iterator |
2339 | /// adapter by using the data already held by this normalizer. |
2340 | pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<I> { |
2341 | self.normalize_iter_private(iter, IgnorableBehavior::Unsupported) |
2342 | } |
2343 | |
2344 | fn normalize_iter_private<I: Iterator<Item = char>>( |
2345 | &self, |
2346 | iter: I, |
2347 | ignorable_behavior: IgnorableBehavior, |
2348 | ) -> Composition<I> { |
2349 | Composition::new( |
2350 | Decomposition::new_with_supplements( |
2351 | iter, |
2352 | self.decomposing_normalizer.decompositions.get(), |
2353 | self.decomposing_normalizer |
2354 | .supplementary_decompositions |
2355 | .as_ref() |
2356 | .map(|s| s.get()), |
2357 | self.decomposing_normalizer.tables.get(), |
2358 | self.decomposing_normalizer |
2359 | .supplementary_tables |
2360 | .as_ref() |
2361 | .map(|s| s.get()), |
2362 | self.decomposing_normalizer.decomposition_passthrough_bound, |
2363 | ignorable_behavior, |
2364 | ), |
2365 | ZeroFrom::zero_from(&self.canonical_compositions.get().canonical_compositions), |
2366 | self.decomposing_normalizer.composition_passthrough_bound, |
2367 | ) |
2368 | } |
2369 | |
    // Expands to the format-convenience methods (owned-string/UTF-8/UTF-16
    // normalization and `is_normalized*` checks) shared with
    // `DecomposingNormalizer`.
    normalizer_methods!();
2371 | |
    composing_normalize_to!(
        /// Normalize a string slice into a `Write` sink.
        ,
        normalize_to,
        core::fmt::Write,
        &str,
        {},
        true,
        as_str,
        {
            // Let's hope LICM hoists this outside `'outer`.
            let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
                // 0xCC is the UTF-8 lead byte of U+0300, so every byte below
                // it starts a scalar value below the passthrough bound.
                0xCCu8
            } else {
                // We can make this fancy if a normalization other than NFC where looking at
                // non-ASCII lead bytes is worthwhile is ever introduced.
                composition_passthrough_bound.min(0x80) as u8
            };
            // This is basically an `Option` discriminant for `undecomposed_starter`,
            // but making it a boolean so that writes in the tightest loop are as
            // simple as possible (and potentially as peel-hoistable as possible).
            // Furthermore, this reduces `unwrap()` later.
            let mut undecomposed_starter_valid = true;
            // Annotation belongs really on inner statements, but Rust doesn't
            // allow it there.
            #[allow (clippy::unwrap_used)]
            'fast: loop {
                // Byte-wise inner fast path: skip bytes that are guaranteed
                // passthrough without decoding whole scalar values.
                let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < composition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            undecomposed_starter_valid = false;
                            continue 'fastest;
                        }
                        // Back up by one byte so that the `chars()` iterator
                        // below re-decodes the scalar value this byte starts.
                        composition.decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
                        break 'fastest;
                    }
                    // End of stream
                    sink.write_str(pending_slice)?;
                    return Ok(());
                }
                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                let upcoming = composition.decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
                    // Can't combine backwards, hence a plain (non-backwards-combining)
                    // starter albeit past `composition_passthrough_bound`

                    // Fast-track succeeded!
                    undecomposed_starter = upcoming_with_trie_value;
                    undecomposed_starter_valid = true;
                    continue 'fast;
                }
                // We need to fall off the fast path.
                composition.decomposition.pending = Some(upcoming_with_trie_value);
                let consumed_so_far_slice = if undecomposed_starter_valid {
                    &pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8() - undecomposed_starter.character.len_utf8()]
                } else {
                    // slicing and unwrap OK, because we've just evidently read enough previously.
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
                    // `unwrap` OK, because we've previously managed to read the previous character
                    undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                    undecomposed_starter_valid = true;
                    consumed_so_far.as_str()
                };
                sink.write_str(consumed_so_far_slice)?;
                break 'fast;
            }
            debug_assert!(undecomposed_starter_valid);
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf8,
    );
2452 | |
    composing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ,
        normalize_utf8_to,
        core::fmt::Write,
        &[u8],
        {},
        false,
        as_slice,
        {
            // This is basically an `Option` discriminant for `undecomposed_starter`,
            // but making it a boolean so that writes in the tightest loop are as
            // simple as possible (and potentially as peel-hoistable as possible).
            // Furthermore, this reduces `unwrap()` later.
            let mut undecomposed_starter_valid = true;
            'fast: loop {
                if let Some(upcoming) = composition.decomposition.delegate.next() {
                    if u32::from(upcoming) < composition_passthrough_bound {
                        // Fast-track succeeded!
                        undecomposed_starter_valid = false;
                        continue 'fast;
                    }
                    // TODO(#2006): Annotate as unlikely
                    if upcoming == REPLACEMENT_CHARACTER {
                        // Can't tell if this is an error or a literal U+FFFD in
                        // the input. Assuming the former to be sure.

                        // Since the U+FFFD might signify an error, we can't
                        // assume `upcoming.len_utf8()` for the backoff length.
                        let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
                        let back = consumed_so_far.next_back();
                        debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
                        let consumed_so_far_slice = consumed_so_far.as_slice();
                        // SAFETY: any earlier ill-formed sequence would have
                        // surfaced as U+FFFD and been handled here first, so the
                        // prefix before this U+FFFD is well-formed UTF-8.
                        sink.write_str(unsafe{ from_utf8_unchecked(consumed_so_far_slice)})?;
                        undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
                        undecomposed_starter_valid = true;
                        composition.decomposition.pending = None;
                        break 'fast;
                    }
                    let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                    if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
                        // Can't combine backwards, hence a plain (non-backwards-combining)
                        // starter albeit past `composition_passthrough_bound`

                        // Fast-track succeeded!
                        undecomposed_starter = upcoming_with_trie_value;
                        undecomposed_starter_valid = true;
                        continue 'fast;
                    }
                    // We need to fall off the fast path.
                    composition.decomposition.pending = Some(upcoming_with_trie_value);
                    // Annotation belongs really on inner statement, but Rust doesn't
                    // allow it there.
                    #[allow (clippy::unwrap_used)]
                    let consumed_so_far_slice = if undecomposed_starter_valid {
                        &pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8() - undecomposed_starter.character.len_utf8()]
                    } else {
                        // slicing and unwrap OK, because we've just evidently read enough previously.
                        let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
                        // `unwrap` OK, because we've previously managed to read the previous character
                        undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                        undecomposed_starter_valid = true;
                        consumed_so_far.as_slice()
                    };
                    // SAFETY: no U+FFFD was seen in this prefix (handled above),
                    // so no decoding error occurred and the prefix is valid UTF-8.
                    sink.write_str(unsafe { from_utf8_unchecked(consumed_so_far_slice)})?;
                    break 'fast;
                }
                // End of stream
                // SAFETY: reaching the end of the stream on the fast path means
                // no U+FFFD (hence no ill-formed sequence) was encountered in
                // `pending_slice`, so it is valid UTF-8.
                sink.write_str(unsafe {from_utf8_unchecked(pending_slice) })?;
                return Ok(());
            }
            debug_assert!(undecomposed_starter_valid);
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf8,
    );
2537 | |
    composing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ,
        normalize_utf16_to,
        write16::Write16,
        &[u16],
        {
            sink.size_hint(text.len())?;
        },
        false,
        as_slice,
        {
            let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
            let mut upcoming32;
            // This is basically an `Option` discriminant for `undecomposed_starter`,
            // but making it a boolean so that writes to it are as
            // simple as possible.
            // Furthermore, this removes the need for `unwrap()` later.
            let mut undecomposed_starter_valid;
            // The purpose of the counter is to flush once in a while. If we flush
            // too much, there is too much flushing overhead. If we flush too rarely,
            // the flush starts reading from too far behind compared to the hot
            // recently-read memory.
            let mut counter = UTF16_FAST_PATH_FLUSH_THRESHOLD;
            // The purpose of this trickiness is to avoid writing to
            // `undecomposed_starter_valid` from the tightest loop. Writing to it
            // from there destroys performance.
            let mut counter_reference = counter - 1;
            'fast: loop {
                counter -= 1;
                if let Some(&upcoming_code_unit) = code_unit_iter.next() {
                    upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
                    if upcoming32 < composition_passthrough_bound && counter != 0 {
                        // No need for surrogate or U+FFFD check, because
                        // `composition_passthrough_bound` cannot be higher than
                        // U+0300.
                        // Fast-track succeeded!
                        continue 'fast;
                    }
                    // if `counter` equals `counter_reference`, the `continue 'fast`
                    // line above has not executed and `undecomposed_starter` is still
                    // valid.
                    undecomposed_starter_valid = counter == counter_reference;
                    // The loop is only broken out of as goto forward
                    #[allow (clippy::never_loop)]
                    'surrogateloop: loop {
                        let surrogate_base = upcoming32.wrapping_sub(0xD800);
                        if surrogate_base > (0xDFFF - 0xD800) {
                            // Not surrogate
                            break 'surrogateloop;
                        }
                        if surrogate_base <= (0xDBFF - 0xD800) {
                            // High surrogate: try to pair it with the next unit.
                            let iter_backup = code_unit_iter.clone();
                            if let Some(&low) = code_unit_iter.next() {
                                if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
                                    // Combine the surrogate pair into a
                                    // supplementary-plane scalar value.
                                    upcoming32 = (upcoming32 << 10) + u32::from(low)
                                        - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                                    break 'surrogateloop;
                                } else {
                                    // Next unit is not a low surrogate; rewind.
                                    code_unit_iter = iter_backup;
                                }
                            }
                        }
                        // unpaired surrogate
                        let slice_to_write = &pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - 1];
                        sink.write_slice(slice_to_write)?;
                        undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
                        undecomposed_starter_valid = true;
                        composition.decomposition.pending = None;
                        break 'fast;
                    }
                    // Not unpaired surrogate
                    // SAFETY: at this point `upcoming32` is either a non-surrogate
                    // BMP code unit or was assembled from a valid surrogate pair
                    // above, so it is a valid Unicode scalar value.
                    let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                    let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                    if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() && counter != 0 {
                        // Can't combine backwards, hence a plain (non-backwards-combining)
                        // starter albeit past `composition_passthrough_bound`

                        // Fast-track succeeded!
                        undecomposed_starter = upcoming_with_trie_value;
                        // Cause `undecomposed_starter_valid` to be set to true.
                        // This regresses English performance on Haswell by 11%
                        // compared to commenting out this assignment to
                        // `counter_reference`.
                        counter_reference = counter - 1;
                        continue 'fast;
                    }
                    // We need to fall off the fast path.
                    composition.decomposition.pending = Some(upcoming_with_trie_value);
                    // Annotation belongs really on inner statement, but Rust doesn't
                    // allow it there.
                    #[allow (clippy::unwrap_used)]
                    let consumed_so_far_slice = if undecomposed_starter_valid {
                        &pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - upcoming.len_utf16() - undecomposed_starter.character.len_utf16()]
                    } else {
                        // slicing and unwrap OK, because we've just evidently read enough previously.
                        let mut consumed_so_far = pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - upcoming.len_utf16()].chars();
                        // `unwrap` OK, because we've previously managed to read the previous character
                        undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                        undecomposed_starter_valid = true;
                        consumed_so_far.as_slice()
                    };
                    sink.write_slice(consumed_so_far_slice)?;
                    break 'fast;
                }
                // End of stream
                sink.write_slice(pending_slice)?;
                return Ok(());
            }
            debug_assert!(undecomposed_starter_valid);
            // Sync the main iterator
            composition.decomposition.delegate = code_unit_iter.as_slice().chars();
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf16,
    );
2662 | } |
2663 | |
2664 | struct IsNormalizedSinkUtf16<'a> { |
2665 | expect: &'a [u16], |
2666 | } |
2667 | |
2668 | impl<'a> IsNormalizedSinkUtf16<'a> { |
2669 | pub fn new(slice: &'a [u16]) -> Self { |
2670 | IsNormalizedSinkUtf16 { expect: slice } |
2671 | } |
2672 | pub fn finished(&self) -> bool { |
2673 | self.expect.is_empty() |
2674 | } |
2675 | } |
2676 | |
2677 | impl<'a> Write16 for IsNormalizedSinkUtf16<'a> { |
2678 | fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result { |
2679 | // We know that if we get a slice, it's a pass-through, |
2680 | // so we can compare addresses. Indexing is OK, because |
2681 | // an indexing failure would be a code bug rather than |
2682 | // an input or data issue. |
2683 | #[allow (clippy::indexing_slicing)] |
2684 | if s.as_ptr() == self.expect.as_ptr() { |
2685 | self.expect = &self.expect[s.len()..]; |
2686 | Ok(()) |
2687 | } else { |
2688 | Err(core::fmt::Error {}) |
2689 | } |
2690 | } |
2691 | |
2692 | fn write_char(&mut self, c: char) -> core::fmt::Result { |
2693 | let mut iter = self.expect.chars(); |
2694 | if iter.next() == Some(c) { |
2695 | self.expect = iter.as_slice(); |
2696 | Ok(()) |
2697 | } else { |
2698 | Err(core::fmt::Error {}) |
2699 | } |
2700 | } |
2701 | } |
2702 | |
2703 | struct IsNormalizedSinkUtf8<'a> { |
2704 | expect: &'a [u8], |
2705 | } |
2706 | |
2707 | impl<'a> IsNormalizedSinkUtf8<'a> { |
2708 | pub fn new(slice: &'a [u8]) -> Self { |
2709 | IsNormalizedSinkUtf8 { expect: slice } |
2710 | } |
2711 | pub fn finished(&self) -> bool { |
2712 | self.expect.is_empty() |
2713 | } |
2714 | } |
2715 | |
2716 | impl<'a> core::fmt::Write for IsNormalizedSinkUtf8<'a> { |
2717 | fn write_str(&mut self, s: &str) -> core::fmt::Result { |
2718 | // We know that if we get a slice, it's a pass-through, |
2719 | // so we can compare addresses. Indexing is OK, because |
2720 | // an indexing failure would be a code bug rather than |
2721 | // an input or data issue. |
2722 | #[allow (clippy::indexing_slicing)] |
2723 | if s.as_ptr() == self.expect.as_ptr() { |
2724 | self.expect = &self.expect[s.len()..]; |
2725 | Ok(()) |
2726 | } else { |
2727 | Err(core::fmt::Error {}) |
2728 | } |
2729 | } |
2730 | |
2731 | fn write_char(&mut self, c: char) -> core::fmt::Result { |
2732 | let mut iter = self.expect.chars(); |
2733 | if iter.next() == Some(c) { |
2734 | self.expect = iter.as_slice(); |
2735 | Ok(()) |
2736 | } else { |
2737 | Err(core::fmt::Error {}) |
2738 | } |
2739 | } |
2740 | } |
2741 | |
/// A `core::fmt::Write` sink that checks the written output against an
/// expected string slice without allocating: each write must replay the
/// front of `expect` exactly or the sink reports `fmt::Error`.
struct IsNormalizedSinkStr<'a> {
    // The not-yet-matched tail of the expected output.
    expect: &'a str,
}

impl<'a> IsNormalizedSinkStr<'a> {
    /// Creates a sink that expects exactly `slice` to be written.
    pub fn new(slice: &'a str) -> Self {
        IsNormalizedSinkStr { expect: slice }
    }
    /// Returns `true` once the entire expected string has been matched.
    pub fn finished(&self) -> bool {
        self.expect.is_empty()
    }
}

impl<'a> core::fmt::Write for IsNormalizedSinkStr<'a> {
    fn write_str(&mut self, s: &str) -> core::fmt::Result {
        // We know that if we get a slice, it's a pass-through, so it must
        // alias the front of `expect`; comparing the start addresses is
        // sufficient.
        if s.as_ptr() != self.expect.as_ptr() {
            return Err(core::fmt::Error {});
        }
        // Indexing is OK, because an indexing failure would be a code bug
        // rather than an input or data issue.
        #[allow (clippy::indexing_slicing)]
        {
            self.expect = &self.expect[s.len()..];
        }
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        let mut remaining = self.expect.chars();
        match remaining.next() {
            Some(first) if first == c => {
                self.expect = remaining.as_str();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}
2780 | |