lib.rs source code [crates/icu_normalizer/src/lib.rs]

1	// This file is part of ICU4X. For terms of use, please see the file
2	// called LICENSE at the top level of the ICU4X source tree
3	// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5	// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
6	#![cfg_attr(not(any(test, feature = "std")), no_std)]
7	#![cfg_attr(
8	not(test),
9	deny(
10	clippy::indexing_slicing,
11	clippy::unwrap_used,
12	clippy::expect_used,
13	clippy::panic,
14	clippy::exhaustive_structs,
15	clippy::exhaustive_enums,
16	missing_debug_implementations,
17	)
18	)]
19	#![warn(missing_docs)]
20
21	//! Normalizing text into Unicode Normalization Forms.
22	//!
23	//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
24	//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
25	//!
26	//! # Implementation notes
27	//!
28	//! The normalizer operates on a lazy iterator over Unicode scalar values (Rust `char`) internally
29	//! and iterating over guaranteed-valid UTF-8, potentially-invalid UTF-8, and potentially-invalid
30	//! UTF-16 is a step that doesn’t leak into the normalizer internals. Ill-formed byte sequences are
31	//! treated as U+FFFD.
32	//!
33	//! The normalizer data layout is not based on the ICU4C design at all. Instead, the normalization
34	//! data layout is a clean-slate design optimized for the concept of fusing the NFD decomposition
35	//! into the collator. That is, the decomposing normalizer is a by-product of the collator-motivated
36	//! data layout.
37	//!
38	//! Notably, the decomposition data structure is optimized for a starter decomposing to itself,
39	//! which is the most common case, and for a starter decomposing to a starter and a non-starter
40	//! on the Basic Multilingual Plane. Notably, in this case, the collator makes use of the
41	//! knowledge that the second character of such a decomposition is a non-starter. Therefore,
42	//! decomposition into two starters is handled by generic fallback path that looks the
43	//! decomposition from an array by offset and length instead of baking a BMP starter pair directly
44	//! into a trie value.
45	//!
46	//! The decompositions into non-starters are hard-coded. At present in Unicode, these appear
47	//! to be special cases falling into three categories:
48	//!
49	//! 1. Deprecated combining marks.
50	//! 2. Particular Tibetan vowel sings.
51	//! 3. NFKD only: half-width kana voicing marks.
52	//!
53	//! Hopefully Unicode never adds more decompositions into non-starters (other than a character
54	//! decomposing to itself), but if it does, a code update is needed instead of a mere data update.
55	//!
56	//! The composing normalizer builds on the decomposing normalizer by performing the canonical
57	//! composition post-processing per spec. As an optimization, though, the composing normalizer
58	//! attempts to pass through already-normalized text consisting of starters that never combine
59	//! backwards and that map to themselves if followed by a character whose decomposition starts
60	//! with a starter that never combines backwards.
61	//!
62	//! As a difference with ICU4C, the composing normalizer has only the simplest possible
63	//! passthrough (only one inversion list lookup per character in the best case) and the full
64	//! decompose-then-canonically-compose behavior, whereas ICU4C has other paths between these
65	//! extremes. The ICU4X collator doesn't make use of the FCD concept at all in order to avoid
66	//! doing the work of checking whether the FCD condition holds.
67
68	extern crate alloc;
69
70	mod error;
71	pub mod properties;
72	pub mod provider;
73	pub mod uts46;
74
75	pub use crate::error::NormalizerError;
76
77	#[doc(no_inline)]
78	pub use NormalizerError as Error;
79
80	use crate::provider::CanonicalDecompositionDataV1Marker;
81	use crate::provider::CompatibilityDecompositionSupplementV1Marker;
82	use crate::provider::DecompositionDataV1;
83	use crate::provider::Uts46DecompositionSupplementV1Marker;
84	use alloc::string::String;
85	use alloc::vec::Vec;
86	use core::char::REPLACEMENT_CHARACTER;
87	use core::str::from_utf8_unchecked;
88	use icu_collections::char16trie::Char16Trie;
89	use icu_collections::char16trie::Char16TrieIterator;
90	use icu_collections::char16trie::TrieResult;
91	use icu_collections::codepointtrie::CodePointTrie;
92	use icu_properties::CanonicalCombiningClass;
93	use icu_provider::prelude::*;
94	use provider::CanonicalCompositionsV1Marker;
95	use provider::CanonicalDecompositionTablesV1Marker;
96	use provider::CompatibilityDecompositionTablesV1Marker;
97	use provider::DecompositionSupplementV1;
98	use provider::DecompositionTablesV1;
99	use smallvec::SmallVec;
100	use utf16_iter::Utf16CharsEx;
101	use utf8_iter::Utf8CharsEx;
102	use write16::Write16;
103	use zerofrom::ZeroFrom;
104	use zerovec::{zeroslice, ZeroSlice};
105
106	#[derive(Debug)]
107	enum SupplementPayloadHolder {
108	Compatibility(DataPayload<CompatibilityDecompositionSupplementV1Marker>),
109	Uts46(DataPayload<Uts46DecompositionSupplementV1Marker>),
110	}
111
112	impl SupplementPayloadHolder {
113	fn get(&self) -> &DecompositionSupplementV1 {
114	match self {
115	SupplementPayloadHolder::Compatibility(d: &DataPayload<{unknown}>) => d.get(),
116	SupplementPayloadHolder::Uts46(d: &DataPayload<{unknown}>) => d.get(),
117	}
118	}
119	}
120
121	/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
122	#[derive(Debug, PartialEq, Eq)]
123	enum IgnorableBehavior {
124	/// 0xFFFFFFFF in data is not supported.
125	Unsupported,
126	/// Ignorables are ignored.
127	Ignored,
128	/// Ignorables are treated as singleton decompositions
129	/// to the REPLACEMENT CHARACTER.
130	ReplacementCharacter,
131	}
132
133	/// Number of iterations allowed on the fast path before flushing.
134	/// Since a typical UTF-16 iteration advances over a 2-byte BMP
135	/// character, this means two memory pages.
136	/// Intel Core i7-4770 had the best results between 2 and 4 pages
137	/// when testing powers of two. Apple M1 didn't seem to care
138	/// about 1, 2, 4, or 8 pages.
139	///
140	/// Curiously, the `str` case does not appear to benefit from
141	/// similar flushing, though the tested monomorphization never
142	/// passes an error through from `Write`.
143	const UTF16_FAST_PATH_FLUSH_THRESHOLD: usize = `4096`;
144
145	/// Marker for UTS 46 ignorables.
146	const IGNORABLE_MARKER: u32 = `0xFFFFFFFF`;
147
148	/// Marker for starters that decompose to themselves but may
149	/// combine backwards under canonical composition.
150	/// (Main trie only; not used in the supplementary trie.)
151	const BACKWARD_COMBINING_STARTER_MARKER: u32 = `1`;
152
153	/// Magic marker trie value for characters whose decomposition
154	/// starts with a non-starter. The actual decomposition is
155	/// hard-coded.
156	const SPECIAL_NON_STARTER_DECOMPOSITION_MARKER: u32 = `2`;
157
158	/// `u16` version of the previous marker value.
159	const SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16: u16 = `2`;
160
161	/// Marker that a complex decomposition isn't round-trippable
162	/// under re-composition.
163	const NON_ROUND_TRIP_MARKER: u16 = `1`;
164
165	/// Checks if a trie value carries a (non-zero) canonical
166	/// combining class.
167	fn trie_value_has_ccc(trie_value: u32) -> bool {
168	(trie_value & `0xFFFFFF00`) == `0xD800`
169	}
170
171	/// Checks if the trie signifies a special non-starter decomposition.
172	fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
173	trie_value == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER
174	}
175
176	/// Checks if a trie value signifies a character whose decomposition
177	/// starts with a non-starter.
178	fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
179	trie_value_has_ccc(trie_value)
180	\|\| trie_value_indicates_special_non_starter_decomposition(trie_value)
181	}
182
183	/// Extracts a canonical combining class (possibly zero) from a trie value.
184	///
185	/// # Panics
186	///
187	/// The trie value must not be one that signifies a special non-starter
188	/// decomposition. (Debug-only)
189	fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
190	if trie_value_has_ccc(trie_value) {
191	CanonicalCombiningClass(trie_value as u8)
192	} else {
193	debug_assert_ne!(trie_value, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER);
194	CanonicalCombiningClass::NotReordered
195	}
196	}
197
198	/// The tail (everything after the first character) of the NFKD form U+FDFA
199	/// as 16-bit units.
200	static FDFA_NFKD: [u16; `17`] = [
201	`0x644`, `0x649`, `0x20`, `0x627`, `0x644`, `0x644`, `0x647`, `0x20`, `0x639`, `0x644`, `0x64A`, `0x647`, `0x20`, `0x648`,
202	`0x633`, `0x644`, `0x645`,
203	];
204
205	/// Marker value for U+FDFA in NFKD
206	const FDFA_MARKER: u16 = `3`;
207
208	// These constants originate from page 143 of Unicode 14.0
209	/// Syllable base
210	const HANGUL_S_BASE: u32 = `0xAC00`;
211	/// Lead jamo base
212	const HANGUL_L_BASE: u32 = `0x1100`;
213	/// Vowel jamo base
214	const HANGUL_V_BASE: u32 = `0x1161`;
215	/// Trail jamo base (deliberately off by one to account for the absence of a trail)
216	const HANGUL_T_BASE: u32 = `0x11A7`;
217	/// Lead jamo count
218	const HANGUL_L_COUNT: u32 = `19`;
219	/// Vowel jamo count
220	const HANGUL_V_COUNT: u32 = `21`;
221	/// Trail jamo count (deliberately off by one to account for the absence of a trail)
222	const HANGUL_T_COUNT: u32 = `28`;
223	/// Vowel jamo count times trail jamo count
224	const HANGUL_N_COUNT: u32 = `588`;
225	/// Syllable count
226	const HANGUL_S_COUNT: u32 = `11172`;
227
228	/// One past the conjoining jamo block
229	const HANGUL_JAMO_LIMIT: u32 = `0x1200`;
230
231	/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
232	/// are enabled and return `default` if debug assertions are not enabled.
233	///
234	/// Use this only if the only reason why `opt` could be `None` is bogus
235	/// data from the provider.
236	#[inline(always)]
237	fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
238	if let Some(val: T) = opt {
239	val
240	} else {
241	// GIGO case
242	debug_assert!(`false`);
243	default
244	}
245	}
246
247	/// Convert a `u32` _obtained from data provider data_ to `char`.
248	#[inline(always)]
249	fn char_from_u32(u: u32) -> char {
250	unwrap_or_gigo(opt:core::char::from_u32(u), REPLACEMENT_CHARACTER)
251	}
252
253	/// Convert a `u16` _obtained from data provider data_ to `char`.
254	#[inline(always)]
255	fn char_from_u16(u: u16) -> char {
256	char_from_u32(u32::from(u))
257	}
258
259	const EMPTY_U16: &ZeroSlice<u16> = zeroslice![];
260
261	const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![];
262
263	#[inline(always)]
264	fn in_inclusive_range(c: char, start: char, end: char) -> bool {
265	u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
266	}
267
268	#[inline(always)]
269	fn in_inclusive_range32(u: u32, start: u32, end: u32) -> bool {
270	u.wrapping_sub(start) <= (end - start)
271	}
272
273	#[inline(always)]
274	fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
275	u.wrapping_sub(start) <= (end - start)
276	}
277
278	/// Performs canonical composition (including Hangul) on a pair of
279	/// characters or returns `None` if these characters don't compose.
280	/// Composition exclusions are taken into account.
281	#[inline]
282	fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
283	let v: u32 = u32::from(second).wrapping_sub(HANGUL_V_BASE);
284	if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
285	return compose_non_hangul(iter, starter, second);
286	}
287	if v < HANGUL_V_COUNT {
288	let l: u32 = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
289	if l < HANGUL_L_COUNT {
290	let lv: u32 = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
291	// Safe, because the inputs are known to be in range.
292	return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
293	}
294	return None;
295	}
296	if in_inclusive_range(c:second, start:'`\u{11A8}`', end:'`\u{11C2}`') {
297	let lv: u32 = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
298	if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == `0` {
299	let lvt: u32 = lv + (u32::from(second) - HANGUL_T_BASE);
300	// Safe, because the inputs are known to be in range.
301	return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
302	}
303	}
304	None
305	}
306
307	/// Performs (non-Hangul) canonical composition on a pair of characters
308	/// or returns `None` if these characters don't compose. Composition
309	/// exclusions are taken into account.
310	fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
311	// To make the trie smaller, the pairs are stored second character first.
312	// Given how this method is used in ways where it's known that `second`
313	// is or isn't a starter. We could potentially split the trie into two
314	// tries depending on whether `second` is a starter.
315	match iter.next(second) {
316	TrieResult::NoMatch => None,
317	TrieResult::NoValue => match iter.next(starter) {
318	TrieResult::NoMatch => None,
319	TrieResult::FinalValue(i) => {
320	if let Some(c) = char::from_u32(i as u32) {
321	Some(c)
322	} else {
323	// GIGO case
324	debug_assert!(`false`);
325	None
326	}
327	}
328	TrieResult::NoValue \| TrieResult::Intermediate(_) => {
329	// GIGO case
330	debug_assert!(`false`);
331	None
332	}
333	},
334	TrieResult::FinalValue(_) \| TrieResult::Intermediate(_) => {
335	// GIGO case
336	debug_assert!(`false`);
337	None
338	}
339	}
340	}
341
342	/// Struct for holding together a character and the value
343	/// looked up for it from the NFD trie in a more explicit
344	/// way than an anonymous pair.
345	/// Also holds a flag about the supplementary-trie provenance.
346	#[derive(Debug, PartialEq, Eq)]
347	struct CharacterAndTrieValue {
348	character: char,
349	trie_val: u32,
350	from_supplement: bool,
351	}
352
353	impl CharacterAndTrieValue {
354	#[inline(always)]
355	pub fn new(c: char, trie_value: u32) -> Self {
356	CharacterAndTrieValue {
357	character: c,
358	trie_val: trie_value,
359	from_supplement: `false`,
360	}
361	}
362	#[inline(always)]
363	pub fn new_from_supplement(c: char, trie_value: u32) -> Self {
364	CharacterAndTrieValue {
365	character: c,
366	trie_val: trie_value,
367	from_supplement: `true`,
368	}
369	}
370	#[inline(always)]
371	pub fn starter_and_decomposes_to_self(&self) -> bool {
372	if self.trie_val > BACKWARD_COMBINING_STARTER_MARKER {
373	return `false`;
374	}
375	// Hangul syllables get 0 as their trie value
376	u32::from(self.character).wrapping_sub(HANGUL_S_BASE) >= HANGUL_S_COUNT
377	}
378	#[inline(always)]
379	pub fn can_combine_backwards(&self) -> bool {
380	decomposition_starts_with_non_starter(self.trie_val)
381	\|\| self.trie_val == BACKWARD_COMBINING_STARTER_MARKER
382	\|\| in_inclusive_range32(self.trie_val, `0x1161`, `0x11C2`)
383	}
384	#[inline(always)]
385	pub fn potential_passthrough(&self) -> bool {
386	self.potential_passthrough_impl(BACKWARD_COMBINING_STARTER_MARKER)
387	}
388	#[inline(always)]
389	pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
390	self.potential_passthrough_impl(`0`)
391	}
392	#[inline(always)]
393	fn potential_passthrough_impl(&self, bound: u32) -> bool {
394	// This methods looks badly branchy, but most characters
395	// take the first return.
396	if self.trie_val <= bound {
397	return `true`;
398	}
399	if self.from_supplement {
400	return `false`;
401	}
402	let trail_or_complex = (self.trie_val >> `16`) as u16;
403	if trail_or_complex == `0` {
404	return `false`;
405	}
406	let lead = self.trie_val as u16;
407	if lead == `0` {
408	return `true`;
409	}
410	if lead == NON_ROUND_TRIP_MARKER {
411	return `false`;
412	}
413	if (trail_or_complex & `0x7F`) == `0x3C`
414	&& in_inclusive_range16(trail_or_complex, `0x0900`, `0x0BFF`)
415	{
416	// Nukta
417	return `false`;
418	}
419	if in_inclusive_range(self.character, '`\u{FB1D}`', '`\u{FB4E}`') {
420	// Hebrew presentation forms
421	return `false`;
422	}
423	if in_inclusive_range(self.character, '`\u{1F71}`', '`\u{1FFB}`') {
424	// Polytonic Greek with oxia
425	return `false`;
426	}
427	// To avoid more branchiness, 4 characters that decompose to
428	// a BMP starter followed by a BMP non-starter are excluded
429	// from being encoded directly into the trie value and are
430	// handled as complex decompositions instead. These are:
431	// U+0F76 TIBETAN VOWEL SIGN VOCALIC R
432	// U+0F78 TIBETAN VOWEL SIGN VOCALIC L
433	// U+212B ANGSTROM SIGN
434	// U+2ADC FORKING
435	`true`
436	}
437	}
438
439	/// Pack a `char` and a `CanonicalCombiningClass` in
440	/// 32 bits (the former in the lower 24 bits and the
441	/// latter in the high 8 bits). The latter can be
442	/// initialized to 0xFF upon creation, in which case
443	/// it can be actually set later by calling
444	/// `set_ccc_from_trie_if_not_already_set`. This is
445	/// a micro optimization to avoid the Canonical
446	/// Combining Class trie lookup when there is only
447	/// one combining character in a sequence. This type
448	/// is intentionally non-`Copy` to get compiler help
449	/// in making sure that the class is set on the
450	/// instance on which it is intended to be set
451	/// and not on a temporary copy.
452	///
453	/// Note that 0xFF is won't be assigned to an actual
454	/// canonical combining class per definition D104
455	/// in The Unicode Standard.
456	//
457	// NOTE: The Pernosco debugger has special knowledge
458	// of this struct. Please do not change the bit layout
459	// or the crate-module-qualified name of this struct
460	// without coordination.
461	#[derive(Debug)]
462	struct CharacterAndClass(u32);
463
464	impl CharacterAndClass {
465	pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
466	CharacterAndClass(u32::from(c) \| (u32::from(ccc.0) << `24`))
467	}
468	pub fn new_with_placeholder(c: char) -> Self {
469	CharacterAndClass(u32::from(c) \| ((`0xFF`) << `24`))
470	}
471	pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
472	Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
473	}
474	pub fn new_starter(c: char) -> Self {
475	CharacterAndClass(u32::from(c))
476	}
477	pub fn character(&self) -> char {
478	// Safe, because the low 24 bits came from a `char`
479	// originally.
480	unsafe { char::from_u32_unchecked(self.0 & `0xFFFFFF`) }
481	}
482	pub fn ccc(&self) -> CanonicalCombiningClass {
483	CanonicalCombiningClass((self.0 >> `24`) as u8)
484	}
485	pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
486	(self.character(), self.ccc())
487	}
488	pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &CodePointTrie<u32>) {
489	if self.0 >> `24` != `0xFF` {
490	return;
491	}
492	let scalar = self.0 & `0xFFFFFF`;
493	self.0 = ((ccc_from_trie_value(trie.get32_u32(scalar)).0 as u32) << `24`) \| scalar;
494	}
495	}
496
497	// This function exists as a borrow check helper.
498	#[inline(always)]
499	fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &CodePointTrie<u32>) {
500	// We don't look up the canonical combining class for starters
501	// of for single combining characters between starters. When
502	// there's more than one combining character between starters,
503	// we look up the canonical combining class for each character
504	// exactly once.
505	if slice.len() < `2` {
506	return;
507	}
508	sliceIterMut<'_, CharacterAndClass>
509	.iter_mut()
510	.for_each(\|cc: &mut CharacterAndClass\| cc.set_ccc_from_trie_if_not_already_set(trie));
511	slice.sort_by_key(\|cc: &CharacterAndClass\| cc.ccc());
512	}
513
514	/// An iterator adaptor that turns an `Iterator` over `char` into
515	/// a lazily-decomposed `char` sequence.
516	#[derive(Debug)]
517	pub struct Decomposition<'data, I>
518	where
519	I: Iterator<Item = char>,
520	{
521	delegate: I,
522	buffer: SmallVec<[CharacterAndClass; `17`]>, // Enough to hold NFKD for U+FDFA
523	/// The index of the next item to be read from `buffer`.
524	/// The purpose if this index is to avoid having to move
525	/// the rest upon every read.
526	buffer_pos: usize,
527	// At the start of `next()` if not `None`, this is a pending unnormalized
528	// starter. When `Decomposition` appears alone, this is never a non-starter.
529	// However, when `Decomposition` appears inside a `Composition`, this
530	// may become a non-starter before `decomposing_next()` is called.
531	pending: Option<CharacterAndTrieValue>, // None at end of stream
532	trie: &'data CodePointTrie<'data, u32>,
533	supplementary_trie: Option<&'data CodePointTrie<'data, u32>>,
534	scalars16: &'data ZeroSlice<u16>,
535	scalars24: &'data ZeroSlice<char>,
536	supplementary_scalars16: &'data ZeroSlice<u16>,
537	supplementary_scalars24: &'data ZeroSlice<char>,
538	half_width_voicing_marks_become_non_starters: bool,
539	/// The lowest character for which either of the following does
540	/// not hold:
541	/// 1. Decomposes to self.
542	/// 2. Decomposition starts with a non-starter
543	decomposition_passthrough_bound: u32, // never above 0xC0
544	ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
545	}
546
547	impl<'data, I> Decomposition<'data, I>
548	where
549	I: Iterator<Item = char>,
550	{
551	/// Constructs a decomposing iterator adapter from a delegate
552	/// iterator and references to the necessary data, without
553	/// supplementary data.
554	///
555	/// Use `DecomposingNormalizer::normalize_iter()` instead unless
556	/// there's a good reason to use this constructor directly.
557	///
558	/// Public but hidden in order to be able to use this from the
559	/// collator.
560	#[doc(hidden)]
561	pub fn new(
562	delegate: I,
563	decompositions: &'data DecompositionDataV1,
564	tables: &'data DecompositionTablesV1,
565	) -> Self {
566	Self::new_with_supplements(
567	delegate,
568	decompositions,
569	None,
570	tables,
571	None,
572	`0xC0`,
573	IgnorableBehavior::Unsupported,
574	)
575	}
576
577	/// Constructs a decomposing iterator adapter from a delegate
578	/// iterator and references to the necessary data, including
579	/// supplementary data.
580	///
581	/// Use `DecomposingNormalizer::normalize_iter()` instead unless
582	/// there's a good reason to use this constructor directly.
583	fn new_with_supplements(
584	delegate: I,
585	decompositions: &'data DecompositionDataV1,
586	supplementary_decompositions: Option<&'data DecompositionSupplementV1>,
587	tables: &'data DecompositionTablesV1,
588	supplementary_tables: Option<&'data DecompositionTablesV1>,
589	decomposition_passthrough_bound: u8,
590	ignorable_behavior: IgnorableBehavior,
591	) -> Self {
592	let half_width_voicing_marks_become_non_starters =
593	if let Some(supplementary) = supplementary_decompositions {
594	supplementary.half_width_voicing_marks_become_non_starters()
595	} else {
596	`false`
597	};
598	let mut ret = Decomposition::<I> {
599	delegate,
600	buffer: SmallVec::new(), // Normalized
601	buffer_pos: `0`,
602	// Initialize with a placeholder starter in case
603	// the real stream starts with a non-starter.
604	pending: Some(CharacterAndTrieValue::new('`\u{FFFF}`', `0`)),
605	trie: &decompositions.trie,
606	supplementary_trie: supplementary_decompositions.map(\|s\| &s.trie),
607	scalars16: &tables.scalars16,
608	scalars24: &tables.scalars24,
609	supplementary_scalars16: if let Some(supplementary) = supplementary_tables {
610	&supplementary.scalars16
611	} else {
612	EMPTY_U16
613	},
614	supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
615	&supplementary.scalars24
616	} else {
617	EMPTY_CHAR
618	},
619	half_width_voicing_marks_become_non_starters,
620	decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
621	ignorable_behavior,
622	};
623	let _ = ret.next(); // Remove the U+FFFF placeholder
624	ret
625	}
626
627	fn push_decomposition16(
628	&mut self,
629	low: u16,
630	offset: usize,
631	slice16: &ZeroSlice<u16>,
632	) -> (char, usize) {
633	let len = usize::from(low >> `13`) + `2`;
634	let (starter, tail) = slice16
635	.get_subslice(offset..offset + len)
636	.and_then(\|slice\| slice.split_first())
637	.map_or_else(
638	\|\| {
639	// GIGO case
640	debug_assert!(`false`);
641	(REPLACEMENT_CHARACTER, EMPTY_U16)
642	},
643	\|(first, trail)\| (char_from_u16(first), trail),
644	);
645	if low & `0x1000` != `0` {
646	// All the rest are combining
647	self.buffer.extend(
648	tail.iter()
649	.map(\|u\| CharacterAndClass::new_with_placeholder(char_from_u16(u))),
650	);
651	(starter, `0`)
652	} else {
653	let mut i = `0`;
654	let mut combining_start = `0`;
655	for u in tail.iter() {
656	let ch = char_from_u16(u);
657	let trie_value = self.trie.get(ch);
658	self.buffer.push(CharacterAndClass::new_with_trie_value(
659	CharacterAndTrieValue::new(ch, trie_value),
660	));
661	i += `1`;
662	// Half-width kana and iota subscript don't occur in the tails
663	// of these multicharacter decompositions.
664	if !decomposition_starts_with_non_starter(trie_value) {
665	combining_start = i;
666	}
667	}
668	(starter, combining_start)
669	}
670	}
671
672	fn push_decomposition32(
673	&mut self,
674	low: u16,
675	offset: usize,
676	slice32: &ZeroSlice<char>,
677	) -> (char, usize) {
678	let len = usize::from(low >> `13`) + `1`;
679	let (starter, tail) = slice32
680	.get_subslice(offset..offset + len)
681	.and_then(\|slice\| slice.split_first())
682	.unwrap_or_else(\|\| {
683	// GIGO case
684	debug_assert!(`false`);
685	(REPLACEMENT_CHARACTER, EMPTY_CHAR)
686	});
687	if low & `0x1000` != `0` {
688	// All the rest are combining
689	self.buffer
690	.extend(tail.iter().map(CharacterAndClass::new_with_placeholder));
691	(starter, `0`)
692	} else {
693	let mut i = `0`;
694	let mut combining_start = `0`;
695	for ch in tail.iter() {
696	let trie_value = self.trie.get(ch);
697	self.buffer.push(CharacterAndClass::new_with_trie_value(
698	CharacterAndTrieValue::new(ch, trie_value),
699	));
700	i += `1`;
701	// Half-width kana and iota subscript don't occur in the tails
702	// of these multicharacter decompositions.
703	if !decomposition_starts_with_non_starter(trie_value) {
704	combining_start = i;
705	}
706	}
707	(starter, combining_start)
708	}
709	}
710
711	#[inline(always)]
712	fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
713	if let Some(supplementary) = self.supplementary_trie {
714	if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) {
715	return value;
716	}
717	}
718
719	CharacterAndTrieValue::new(c, self.trie.get(c))
720	}
721
722	#[inline(never)]
723	fn attach_supplementary_trie_value(
724	&self,
725	c: char,
726	supplementary: &CodePointTrie<u32>,
727	) -> Option<CharacterAndTrieValue> {
728	let voicing_mark = u32::from(c).wrapping_sub(`0xFF9E`);
729	if voicing_mark <= `1` && self.half_width_voicing_marks_become_non_starters {
730	return Some(CharacterAndTrieValue::new(
731	if voicing_mark == `0` {
732	'`\u{3099}`'
733	} else {
734	'`\u{309A}`'
735	},
736	`0xD800` \| u32::from(CanonicalCombiningClass::KanaVoicing.0),
737	));
738	}
739	let trie_value = supplementary.get32(u32::from(c));
740	if trie_value != `0` {
741	return Some(CharacterAndTrieValue::new_from_supplement(c, trie_value));
742	}
743	None
744	}
745
746	fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
747	debug_assert!(self.pending.is_none());
748	loop {
749	let c = self.delegate.next()?;
750
751	// TODO(#2384): Measure if this check is actually an optimization even in the
752	// non-supplementary case of if this should go inside the supplementary
753	// `if` below.
754	if u32::from(c) < self.decomposition_passthrough_bound {
755	return Some(CharacterAndTrieValue::new(c, `0`));
756	}
757
758	if let Some(supplementary) = self.supplementary_trie {
759	if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) {
760	if value.trie_val == IGNORABLE_MARKER {
761	match self.ignorable_behavior {
762	IgnorableBehavior::Unsupported => {
763	debug_assert!(`false`);
764	}
765	IgnorableBehavior::ReplacementCharacter => {
766	return Some(CharacterAndTrieValue::new(
767	c,
768	u32::from(REPLACEMENT_CHARACTER),
769	));
770	}
771	IgnorableBehavior::Ignored => {
772	// Else ignore this character by reading the next one from the delegate.
773	continue;
774	}
775	}
776	}
777	return Some(value);
778	}
779	}
780	let trie_val = self.trie.get(c);
781	debug_assert_ne!(trie_val, IGNORABLE_MARKER);
782	return Some(CharacterAndTrieValue::new(c, trie_val));
783	}
784	}
785
786	fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
787	if let Some(pending) = self.pending.take() {
788	// Only happens as part of `Composition` and as part of
789	// the contiguous-buffer methods of `DecomposingNormalizer`.
790	// I.e. does not happen as part of standalone iterator
791	// usage of `Decomposition`.
792	Some(pending)
793	} else {
794	self.delegate_next_no_pending()
795	}
796	}
797
798	fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
799	let (starter, combining_start) = {
800	let c = c_and_trie_val.character;
801	let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
802	if hangul_offset >= HANGUL_S_COUNT {
803	let decomposition = c_and_trie_val.trie_val;
804	if decomposition <= BACKWARD_COMBINING_STARTER_MARKER {
805	// The character is its own decomposition
806	(c, `0`)
807	} else {
808	let trail_or_complex = (decomposition >> `16`) as u16;
809	let lead = decomposition as u16;
810	if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != `0` {
811	// Decomposition into two BMP characters: starter and non-starter
812	let starter = char_from_u16(lead);
813	let combining = char_from_u16(trail_or_complex);
814	self.buffer
815	.push(CharacterAndClass::new_with_placeholder(combining));
816	(starter, `0`)
817	} else if lead > NON_ROUND_TRIP_MARKER {
818	if lead != FDFA_MARKER {
819	debug_assert_ne!(
820	lead, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16,
821	"Should not reach this point with non-starter marker"
822	);
823	// Decomposition into one BMP character
824	let starter = char_from_u16(lead);
825	(starter, `0`)
826	} else {
827	// Special case for the NFKD form of U+FDFA.
828	self.buffer.extend(FDFA_NFKD.map(\|u\| {
829	// Safe, because `FDFA_NFKD` is known not to contain
830	// surrogates.
831	CharacterAndClass::new_starter(unsafe {
832	core::char::from_u32_unchecked(u32::from(u))
833	})
834	}));
835	('`\u{0635}`', `17`)
836	}
837	} else {
838	// Complex decomposition
839	// Format for 16-bit value:
840	// 15..13: length minus two for 16-bit case and length minus one for
841	// the 32-bit case. Length 8 needs to fit in three bits in
842	// the 16-bit case, and this way the value is future-proofed
843	// up to 9 in the 16-bit case. Zero is unused and length one
844	// in the 16-bit case goes directly into the trie.
845	// 12: 1 if all trailing characters are guaranteed non-starters,
846	// 0 if no guarantees about non-starterness.
847	// Note: The bit choice is this way around to allow for
848	// dynamically falling back to not having this but instead
849	// having one more bit for length by merely choosing
850	// different masks.
851	// 11..0: Start offset in storage. The offset is to the logical
852	// sequence of scalars16, scalars32, supplementary_scalars16,
853	// supplementary_scalars32.
854	let offset = usize::from(trail_or_complex & `0xFFF`);
855	if offset < self.scalars16.len() {
856	self.push_decomposition16(trail_or_complex, offset, self.scalars16)
857	} else if offset < self.scalars16.len() + self.scalars24.len() {
858	self.push_decomposition32(
859	trail_or_complex,
860	offset - self.scalars16.len(),
861	self.scalars24,
862	)
863	} else if offset
864	< self.scalars16.len()
865	+ self.scalars24.len()
866	+ self.supplementary_scalars16.len()
867	{
868	self.push_decomposition16(
869	trail_or_complex,
870	offset - (self.scalars16.len() + self.scalars24.len()),
871	self.supplementary_scalars16,
872	)
873	} else {
874	self.push_decomposition32(
875	trail_or_complex,
876	offset
877	- (self.scalars16.len()
878	+ self.scalars24.len()
879	+ self.supplementary_scalars16.len()),
880	self.supplementary_scalars24,
881	)
882	}
883	}
884	}
885	} else {
886	// Hangul syllable
887	// The math here comes from page 144 of Unicode 14.0
888	let l = hangul_offset / HANGUL_N_COUNT;
889	let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
890	let t = hangul_offset % HANGUL_T_COUNT;
891
892	// The unsafe blocks here are OK, because the values stay
893	// within the Hangul jamo block and, therefore, the scalar
894	// value range by construction.
895	self.buffer.push(CharacterAndClass::new_starter(unsafe {
896	core::char::from_u32_unchecked(HANGUL_V_BASE + v)
897	}));
898	let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
899	if t != `0` {
900	self.buffer.push(CharacterAndClass::new_starter(unsafe {
901	core::char::from_u32_unchecked(HANGUL_T_BASE + t)
902	}));
903	(first, `2`)
904	} else {
905	(first, `1`)
906	}
907	}
908	};
909	// Either we're inside `Composition` or `self.pending.is_none()`.
910
911	self.gather_and_sort_combining(combining_start);
912	starter
913	}
914
915	fn gather_and_sort_combining(&mut self, combining_start: usize) {
916	// Not a `for` loop to avoid holding a mutable reference to `self` across
917	// the loop body.
918	while let Some(ch_and_trie_val) = self.delegate_next() {
919	if trie_value_has_ccc(ch_and_trie_val.trie_val) {
920	self.buffer
921	.push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
922	} else if trie_value_indicates_special_non_starter_decomposition(
923	ch_and_trie_val.trie_val,
924	) {
925	// The Tibetan special cases are starters that decompose into non-starters.
926	let mapped = match ch_and_trie_val.character {
927	'`\u{0340}`' => {
928	// COMBINING GRAVE TONE MARK
929	CharacterAndClass::new('`\u{0300}`', CanonicalCombiningClass::Above)
930	}
931	'`\u{0341}`' => {
932	// COMBINING ACUTE TONE MARK
933	CharacterAndClass::new('`\u{0301}`', CanonicalCombiningClass::Above)
934	}
935	'`\u{0343}`' => {
936	// COMBINING GREEK KORONIS
937	CharacterAndClass::new('`\u{0313}`', CanonicalCombiningClass::Above)
938	}
939	'`\u{0344}`' => {
940	// COMBINING GREEK DIALYTIKA TONOS
941	self.buffer.push(CharacterAndClass::new(
942	'`\u{0308}`',
943	CanonicalCombiningClass::Above,
944	));
945	CharacterAndClass::new('`\u{0301}`', CanonicalCombiningClass::Above)
946	}
947	'`\u{0F73}`' => {
948	// TIBETAN VOWEL SIGN II
949	self.buffer.push(CharacterAndClass::new(
950	'`\u{0F71}`',
951	CanonicalCombiningClass::CCC129,
952	));
953	CharacterAndClass::new('`\u{0F72}`', CanonicalCombiningClass::CCC130)
954	}
955	'`\u{0F75}`' => {
956	// TIBETAN VOWEL SIGN UU
957	self.buffer.push(CharacterAndClass::new(
958	'`\u{0F71}`',
959	CanonicalCombiningClass::CCC129,
960	));
961	CharacterAndClass::new('`\u{0F74}`', CanonicalCombiningClass::CCC132)
962	}
963	'`\u{0F81}`' => {
964	// TIBETAN VOWEL SIGN REVERSED II
965	self.buffer.push(CharacterAndClass::new(
966	'`\u{0F71}`',
967	CanonicalCombiningClass::CCC129,
968	));
969	CharacterAndClass::new('`\u{0F80}`', CanonicalCombiningClass::CCC130)
970	}
971	_ => {
972	// GIGO case
973	debug_assert!(`false`);
974	CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
975	}
976	};
977	self.buffer.push(mapped);
978	} else {
979	self.pending = Some(ch_and_trie_val);
980	break;
981	}
982	}
983	// Slicing succeeds by construction; we've always ensured that `combining_start`
984	// is in permissible range.
985	#[allow(clippy::indexing_slicing)]
986	sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
987	}
988	}
989
990	impl<'data, I> Iterator for Decomposition<'data, I>
991	where
992	I: Iterator<Item = char>,
993	{
994	type Item = char;
995
996	fn next(&mut self) -> Option<char> {
997	if let Some(ret: char) = self.buffer.get(self.buffer_pos).map(\|c: &CharacterAndClass\| c.character()) {
998	self.buffer_pos += `1`;
999	if self.buffer_pos == self.buffer.len() {
1000	self.buffer.clear();
1001	self.buffer_pos = `0`;
1002	}
1003	return Some(ret);
1004	}
1005	debug_assert_eq!(self.buffer_pos, `0`);
1006	let c_and_trie_val: CharacterAndTrieValue = self.pending.take()?;
1007	Some(self.decomposing_next(c_and_trie_val))
1008	}
1009	}
1010
1011	/// An iterator adaptor that turns an `Iterator` over `char` into
1012	/// a lazily-decomposed and then canonically composed `char` sequence.
1013	#[derive(Debug)]
1014	pub struct Composition<'data, I>
1015	where
1016	I: Iterator<Item = char>,
1017	{
1018	/// The decomposing part of the normalizer than operates before
1019	/// the canonical composition is performed on its output.
1020	decomposition: Decomposition<'data, I>,
1021	/// Non-Hangul canonical composition data.
1022	canonical_compositions: Char16Trie<'data>,
1023	/// To make `next()` yield in cases where there's a non-composing
1024	/// starter in the decomposition buffer, we put it here to let it
1025	/// wait for the next `next()` call (or a jump forward within the
1026	/// `next()` call).
1027	unprocessed_starter: Option<char>,
1028	/// The lowest character for which any one of the following does
1029	/// not hold:
1030	/// 1. Roundtrips via decomposition and recomposition.
1031	/// 2. Decomposition starts with a non-starter
1032	/// 3. Is not a backward-combining starter
1033	composition_passthrough_bound: u32,
1034	}
1035
1036	impl<'data, I> Composition<'data, I>
1037	where
1038	I: Iterator<Item = char>,
1039	{
1040	fn new(
1041	decomposition: Decomposition<'data, I>,
1042	canonical_compositions: Char16Trie<'data>,
1043	composition_passthrough_bound: u16,
1044	) -> Self {
1045	Self {
1046	decomposition,
1047	canonical_compositions,
1048	unprocessed_starter: None,
1049	composition_passthrough_bound: u32::from(composition_passthrough_bound),
1050	}
1051	}
1052
1053	/// Performs canonical composition (including Hangul) on a pair of
1054	/// characters or returns `None` if these characters don't compose.
1055	/// Composition exclusions are taken into account.
1056	#[inline(always)]
1057	pub fn compose(&self, starter: char, second: char) -> Option<char> {
1058	compose(self.canonical_compositions.iter(), starter, second)
1059	}
1060
1061	/// Performs (non-Hangul) canonical composition on a pair of characters
1062	/// or returns `None` if these characters don't compose. Composition
1063	/// exclusions are taken into account.
1064	#[inline(always)]
1065	fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
1066	compose_non_hangul(self.canonical_compositions.iter(), starter, second)
1067	}
1068	}
1069
1070	impl<'data, I> Iterator for Composition<'data, I>
1071	where
1072	I: Iterator<Item = char>,
1073	{
1074	type Item = char;
1075
1076	#[inline]
1077	fn next(&mut self) -> Option<char> {
1078	let mut undecomposed_starter = CharacterAndTrieValue::new('`\u{0}`', `0`); // The compiler can't figure out that this gets overwritten before use.
1079	if self.unprocessed_starter.is_none() {
1080	// The loop is only broken out of as goto forward
1081	#[allow(clippy::never_loop)]
1082	loop {
1083	if let Some((character, ccc)) = self
1084	.decomposition
1085	.buffer
1086	.get(self.decomposition.buffer_pos)
1087	.map(\|c\| c.character_and_ccc())
1088	{
1089	self.decomposition.buffer_pos += `1`;
1090	if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
1091	self.decomposition.buffer.clear();
1092	self.decomposition.buffer_pos = `0`;
1093	}
1094	if ccc == CanonicalCombiningClass::NotReordered {
1095	// Previous decomposition contains a starter. This must
1096	// now become the `unprocessed_starter` for it to have
1097	// a chance to compose with the upcoming characters.
1098	//
1099	// E.g. parenthesized Hangul in NFKC comes through here,
1100	// but suitable composition exclusion could exercise this
1101	// in NFC.
1102	self.unprocessed_starter = Some(character);
1103	break; // We already have a starter, so skip taking one from `pending`.
1104	}
1105	return Some(character);
1106	}
1107	debug_assert_eq!(self.decomposition.buffer_pos, `0`);
1108	undecomposed_starter = self.decomposition.pending.take()?;
1109	if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
1110	\|\| undecomposed_starter.potential_passthrough()
1111	{
1112	// TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming
1113	// character is not below `decomposition_passthrough_bound` but is
1114	// below `composition_passthrough_bound`, we read from the trie
1115	// unnecessarily.
1116	if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
1117	let cannot_combine_backwards = u32::from(upcoming.character)
1118	< self.composition_passthrough_bound
1119	\|\| !upcoming.can_combine_backwards();
1120	self.decomposition.pending = Some(upcoming);
1121	if cannot_combine_backwards {
1122	// Fast-track succeeded!
1123	return Some(undecomposed_starter.character);
1124	}
1125	} else {
1126	// End of stream
1127	return Some(undecomposed_starter.character);
1128	}
1129	}
1130	break; // Not actually looping
1131	}
1132	}
1133	let mut starter = '`\u{0}`'; // The compiler can't figure out this gets overwritten before use.
1134
1135	// The point of having this boolean is to have only one call site to
1136	// `self.decomposition.decomposing_next`, which is hopefully beneficial for
1137	// code size under inlining.
1138	let mut attempt_composition = `false`;
1139	loop {
1140	if let Some(unprocessed) = self.unprocessed_starter.take() {
1141	debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('`\u{0}`', `0`));
1142	debug_assert_eq!(starter, '`\u{0}`');
1143	starter = unprocessed;
1144	} else {
1145	debug_assert_eq!(self.decomposition.buffer_pos, `0`);
1146	let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
1147	if !attempt_composition {
1148	starter = next_starter;
1149	} else if let Some(composed) = self.compose(starter, next_starter) {
1150	starter = composed;
1151	} else {
1152	// This is our yield point. We'll pick this up above in the
1153	// next call to `next()`.
1154	self.unprocessed_starter = Some(next_starter);
1155	return Some(starter);
1156	}
1157	}
1158	// We first loop by index to avoid moving the contents of `buffer`, but
1159	// if there's a discontiguous match, we'll start modifying `buffer` instead.
1160	loop {
1161	let (character, ccc) = if let Some((character, ccc)) = self
1162	.decomposition
1163	.buffer
1164	.get(self.decomposition.buffer_pos)
1165	.map(\|c\| c.character_and_ccc())
1166	{
1167	(character, ccc)
1168	} else {
1169	self.decomposition.buffer.clear();
1170	self.decomposition.buffer_pos = `0`;
1171	break;
1172	};
1173	if let Some(composed) = self.compose(starter, character) {
1174	starter = composed;
1175	self.decomposition.buffer_pos += `1`;
1176	continue;
1177	}
1178	let mut most_recent_skipped_ccc = ccc;
1179	{
1180	let _ = self
1181	.decomposition
1182	.buffer
1183	.drain(`0`..self.decomposition.buffer_pos);
1184	}
1185	self.decomposition.buffer_pos = `0`;
1186	if most_recent_skipped_ccc == CanonicalCombiningClass::NotReordered {
1187	// We failed to compose a starter. Discontiguous match not allowed.
1188	// We leave the starter in `buffer` for `next()` to find.
1189	return Some(starter);
1190	}
1191	let mut i = `1`; // We have skipped one non-starter.
1192	while let Some((character, ccc)) = self
1193	.decomposition
1194	.buffer
1195	.get(i)
1196	.map(\|c\| c.character_and_ccc())
1197	{
1198	if ccc == CanonicalCombiningClass::NotReordered {
1199	// Discontiguous match not allowed.
1200	return Some(starter);
1201	}
1202	debug_assert!(ccc >= most_recent_skipped_ccc);
1203	if ccc != most_recent_skipped_ccc {
1204	// Using the non-Hangul version as a micro-optimization, since
1205	// we already rejected the case where `second` is a starter
1206	// above, and conjoining jamo are starters.
1207	if let Some(composed) = self.compose_non_hangul(starter, character) {
1208	self.decomposition.buffer.remove(i);
1209	starter = composed;
1210	continue;
1211	}
1212	}
1213	most_recent_skipped_ccc = ccc;
1214	i += `1`;
1215	}
1216	break;
1217	}
1218
1219	debug_assert_eq!(self.decomposition.buffer_pos, `0`);
1220
1221	if !self.decomposition.buffer.is_empty() {
1222	return Some(starter);
1223	}
1224	// Now we need to check if composition with an upcoming starter is possible.
1225	#[allow(clippy::unwrap_used)]
1226	if self.decomposition.pending.is_some() {
1227	// We know that `pending_starter` decomposes to start with a starter.
1228	// Otherwise, it would have been moved to `self.decomposition.buffer`
1229	// by `self.decomposing_next()`. We do this set lookup here in order
1230	// to get an opportunity to go back to the fast track.
1231	// Note that this check has to happen _after_ checking that `pending`
1232	// holds a character, because this flag isn't defined to be meaningful
1233	// when `pending` isn't holding a character.
1234	let pending = self.decomposition.pending.as_ref().unwrap();
1235	if u32::from(pending.character) < self.composition_passthrough_bound
1236	\|\| !pending.can_combine_backwards()
1237	{
1238	// Won't combine backwards anyway.
1239	return Some(starter);
1240	}
1241	// Consume what we peeked. `unwrap` OK, because we checked `is_some()`
1242	// above.
1243	undecomposed_starter = self.decomposition.pending.take().unwrap();
1244	// The following line is OK, because we're about to loop back
1245	// to `self.decomposition.decomposing_next(c);`, which will
1246	// restore the between-`next()`-calls invariant of `pending`
1247	// before this function returns.
1248	attempt_composition = `true`;
1249	continue;
1250	}
1251	// End of input
1252	return Some(starter);
1253	}
1254	}
1255	}
1256
1257	macro_rules! composing_normalize_to {
1258	($(#[$meta:meta])*,
1259	$normalize_to:ident,
1260	$write:path,
1261	$slice:ty,
1262	$prolog:block,
1263	$always_valid_utf:literal,
1264	$as_slice:ident,
1265	$fast:block,
1266	$text:ident,
1267	$sink:ident,
1268	$composition:ident,
1269	$composition_passthrough_bound:ident,
1270	$undecomposed_starter:ident,
1271	$pending_slice:ident,
1272	$len_utf:ident,
1273	) => {
1274	$(#[$meta])*
1275	pub fn $normalize_to<W: $write + ?Sized>(
1276	&self,
1277	$text: $slice,
1278	$sink: &mut W,
1279	) -> core::fmt::Result {
1280	$prolog
1281	let mut $composition = self.normalize_iter($text.chars());
1282	debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
1283	for cc in $composition.decomposition.buffer.drain(..) {
1284	$sink.write_char(cc.character())?;
1285	}
1286
1287	// Try to get the compiler to hoist the bound to a register.
1288	let $composition_passthrough_bound = $composition.composition_passthrough_bound;
1289	'outer: loop {
1290	debug_assert_eq!($composition.decomposition.buffer_pos, `0`);
1291	let mut $undecomposed_starter =
1292	if let Some(pending) = $composition.decomposition.pending.take() {
1293	pending
1294	} else {
1295	return Ok(());
1296	};
1297	// Allowing indexed slicing, because a failure would be a code bug and
1298	// not a data issue.
1299	#[allow(clippy::indexing_slicing)]
1300	if u32::from($undecomposed_starter.character) < $composition_passthrough_bound \|\|
1301	$undecomposed_starter.potential_passthrough()
1302	{
1303	// We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
1304	// was returned in response to an error by the iterator. Assume the
1305	// latter for correctness even though it pessimizes the former.
1306	if $always_valid_utf \|\| $undecomposed_starter.character != REPLACEMENT_CHARACTER {
1307	let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
1308	// The `$fast` block must either:
1309	// 1. Return due to reaching EOF
1310	// 2. Leave a starter with its trie value in `$undecomposed_starter`
1311	// and, if there is still more input, leave the next character
1312	// and its trie value in `$composition.decomposition.pending`.
1313	$fast
1314	}
1315	}
1316	// Fast track above, full algorithm below
1317	let mut starter = $composition
1318	.decomposition
1319	.decomposing_next($undecomposed_starter);
1320	'bufferloop: loop {
1321	// We first loop by index to avoid moving the contents of `buffer`, but
1322	// if there's a discontiguous match, we'll start modifying `buffer` instead.
1323	loop {
1324	let (character, ccc) = if let Some((character, ccc)) = $composition
1325	.decomposition
1326	.buffer
1327	.get($composition.decomposition.buffer_pos)
1328	.map(\|c\| c.character_and_ccc())
1329	{
1330	(character, ccc)
1331	} else {
1332	$composition.decomposition.buffer.clear();
1333	$composition.decomposition.buffer_pos = `0`;
1334	break;
1335	};
1336	if let Some(composed) = $composition.compose(starter, character) {
1337	starter = composed;
1338	$composition.decomposition.buffer_pos += `1`;
1339	continue;
1340	}
1341	let mut most_recent_skipped_ccc = ccc;
1342	if most_recent_skipped_ccc == CanonicalCombiningClass::NotReordered {
1343	// We failed to compose a starter. Discontiguous match not allowed.
1344	// Write the current `starter` we've been composing, make the unmatched
1345	// starter in the buffer the new `starter` (we know it's been decomposed)
1346	// and process the rest of the buffer with that as the starter.
1347	$sink.write_char(starter)?;
1348	starter = character;
1349	$composition.decomposition.buffer_pos += `1`;
1350	continue 'bufferloop;
1351	} else {
1352	{
1353	let _ = $composition
1354	.decomposition
1355	.buffer
1356	.drain(`0`..$composition.decomposition.buffer_pos);
1357	}
1358	$composition.decomposition.buffer_pos = `0`;
1359	}
1360	let mut i = `1`; // We have skipped one non-starter.
1361	while let Some((character, ccc)) = $composition
1362	.decomposition
1363	.buffer
1364	.get(i)
1365	.map(\|c\| c.character_and_ccc())
1366	{
1367	if ccc == CanonicalCombiningClass::NotReordered {
1368	// Discontiguous match not allowed.
1369	$sink.write_char(starter)?;
1370	for cc in $composition.decomposition.buffer.drain(..i) {
1371	$sink.write_char(cc.character())?;
1372	}
1373	starter = character;
1374	{
1375	let removed = $composition.decomposition.buffer.remove(`0`);
1376	debug_assert_eq!(starter, removed.character());
1377	}
1378	debug_assert_eq!($composition.decomposition.buffer_pos, `0`);
1379	continue 'bufferloop;
1380	}
1381	debug_assert!(ccc >= most_recent_skipped_ccc);
1382	if ccc != most_recent_skipped_ccc {
1383	// Using the non-Hangul version as a micro-optimization, since
1384	// we already rejected the case where `second` is a starter
1385	// above, and conjoining jamo are starters.
1386	if let Some(composed) =
1387	$composition.compose_non_hangul(starter, character)
1388	{
1389	$composition.decomposition.buffer.remove(i);
1390	starter = composed;
1391	continue;
1392	}
1393	}
1394	most_recent_skipped_ccc = ccc;
1395	i += `1`;
1396	}
1397	break;
1398	}
1399	debug_assert_eq!($composition.decomposition.buffer_pos, `0`);
1400
1401	if !$composition.decomposition.buffer.is_empty() {
1402	$sink.write_char(starter)?;
1403	for cc in $composition.decomposition.buffer.drain(..) {
1404	$sink.write_char(cc.character())?;
1405	}
1406	// We had non-empty buffer, so can't compose with upcoming.
1407	continue 'outer;
1408	}
1409	// Now we need to check if composition with an upcoming starter is possible.
1410	if $composition.decomposition.pending.is_some() {
1411	// We know that `pending_starter` decomposes to start with a starter.
1412	// Otherwise, it would have been moved to `composition.decomposition.buffer`
1413	// by `composition.decomposing_next()`. We do this set lookup here in order
1414	// to get an opportunity to go back to the fast track.
1415	// Note that this check has to happen _after_ checking that `pending`
1416	// holds a character, because this flag isn't defined to be meaningful
1417	// when `pending` isn't holding a character.
1418	let pending = $composition.decomposition.pending.as_ref().unwrap();
1419	if u32::from(pending.character) < $composition.composition_passthrough_bound
1420	\|\| !pending.can_combine_backwards()
1421	{
1422	// Won't combine backwards anyway.
1423	$sink.write_char(starter)?;
1424	continue 'outer;
1425	}
1426	let pending_starter = $composition.decomposition.pending.take().unwrap();
1427	let decomposed = $composition.decomposition.decomposing_next(pending_starter);
1428	if let Some(composed) = $composition.compose(starter, decomposed) {
1429	starter = composed;
1430	} else {
1431	$sink.write_char(starter)?;
1432	starter = decomposed;
1433	}
1434	continue 'bufferloop;
1435	}
1436	// End of input
1437	$sink.write_char(starter)?;
1438	return Ok(());
1439	} // 'bufferloop
1440	}
1441	}
1442	};
1443	}
1444
1445	macro_rules! decomposing_normalize_to {
1446	($(#[$meta:meta])*,
1447	$normalize_to:ident,
1448	$write:path,
1449	$slice:ty,
1450	$prolog:block,
1451	$as_slice:ident,
1452	$fast:block,
1453	$text:ident,
1454	$sink:ident,
1455	$decomposition:ident,
1456	$decomposition_passthrough_bound:ident,
1457	$undecomposed_starter:ident,
1458	$pending_slice:ident,
1459	$outer:lifetime, // loop labels use lifetime tokens
1460	) => {
1461	$(#[$meta])*
1462	pub fn $normalize_to<W: $write + ?Sized>(
1463	&self,
1464	$text: $slice,
1465	$sink: &mut W,
1466	) -> core::fmt::Result {
1467	$prolog
1468
1469	let mut $decomposition = self.normalize_iter($text.chars());
1470	debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
1471
1472	// Try to get the compiler to hoist the bound to a register.
1473	let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
1474	$outer: loop {
1475	for cc in $decomposition.buffer.drain(..) {
1476	$sink.write_char(cc.character())?;
1477	}
1478	debug_assert_eq!($decomposition.buffer_pos, `0`);
1479	let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
1480	pending
1481	} else {
1482	return Ok(());
1483	};
1484	// Allowing indexed slicing, because a failure would be a code bug and
1485	// not a data issue.
1486	#[allow(clippy::indexing_slicing)]
1487	if $undecomposed_starter.starter_and_decomposes_to_self() {
1488	// Don't bother including `undecomposed_starter` in a contiguous buffer
1489	// write: Just write it right away:
1490	$sink.write_char($undecomposed_starter.character)?;
1491
1492	let $pending_slice = $decomposition.delegate.$as_slice();
1493	$fast
1494	}
1495	let starter = $decomposition.decomposing_next($undecomposed_starter);
1496	$sink.write_char(starter)?;
1497	}
1498	}
1499	};
1500	}
1501
1502	macro_rules! normalizer_methods {
1503	() => {
1504	/// Normalize a string slice into a `String`.
1505	pub fn normalize(&self, text: &str) -> String {
1506	let mut ret = String::new();
1507	ret.reserve(text.len());
1508	let _ = self.normalize_to(text, &mut ret);
1509	ret
1510	}
1511
1512	/// Check whether a string slice is normalized.
1513	pub fn is_normalized(&self, text: &str) -> bool {
1514	let mut sink = IsNormalizedSinkStr::new(text);
1515	if self.normalize_to(text, &mut sink).is_err() {
1516	return `false`;
1517	}
1518	sink.finished()
1519	}
1520
1521	/// Normalize a slice of potentially-invalid UTF-16 into a `Vec`.
1522	///
1523	/// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
1524	/// before normalizing.
1525	pub fn normalize_utf16(&self, text: &[u16]) -> Vec<u16> {
1526	let mut ret = Vec::new();
1527	let _ = self.normalize_utf16_to(text, &mut ret);
1528	ret
1529	}
1530
1531	/// Checks whether a slice of potentially-invalid UTF-16 is normalized.
1532	///
1533	/// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
1534	pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
1535	let mut sink = IsNormalizedSinkUtf16::new(text);
1536	if self.normalize_utf16_to(text, &mut sink).is_err() {
1537	return `false`;
1538	}
1539	sink.finished()
1540	}
1541
1542	/// Normalize a slice of potentially-invalid UTF-8 into a `String`.
1543	///
1544	/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1545	/// according to the WHATWG Encoding Standard.
1546	pub fn normalize_utf8(&self, text: &[u8]) -> String {
1547	let mut ret = String::new();
1548	ret.reserve(text.len());
1549	let _ = self.normalize_utf8_to(text, &mut ret);
1550	ret
1551	}
1552
1553	/// Check if a slice of potentially-invalid UTF-8 is normalized.
1554	///
1555	/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1556	/// according to the WHATWG Encoding Standard before checking.
1557	pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
1558	let mut sink = IsNormalizedSinkUtf8::new(text);
1559	if self.normalize_utf8_to(text, &mut sink).is_err() {
1560	return `false`;
1561	}
1562	sink.finished()
1563	}
1564	};
1565	}
1566
1567	/// A normalizer for performing decomposing normalization.
1568	#[derive(Debug)]
1569	pub struct DecomposingNormalizer {
1570	decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
1571	supplementary_decompositions: Option<SupplementPayloadHolder>,
1572	tables: DataPayload<CanonicalDecompositionTablesV1Marker>,
1573	supplementary_tables: Option<DataPayload<CompatibilityDecompositionTablesV1Marker>>,
1574	decomposition_passthrough_bound: u8, // never above 0xC0
1575	composition_passthrough_bound: u16, // never above 0x0300
1576	}
1577
1578	impl DecomposingNormalizer {
1579	/// NFD constructor using compiled data.
1580	///
1581	/// ✨ Enabled with the `compiled_data` Cargo feature.
1582	///
1583	/// [📚 Help choosing a constructor](icu_provider::constructors)
1584	#[cfg(feature = "compiled_data")]
1585	pub const fn new_nfd() -> Self {
1586	const _: () = assert!(
1587	crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1588	.scalars16
1589	.const_len()
1590	+ crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1591	.scalars24
1592	.const_len()
1593	<= `0xFFF`,
1594	"NormalizerError::FutureExtension"
1595	);
1596
1597	DecomposingNormalizer {
1598	decompositions: DataPayload::from_static_ref(
1599	crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
1600	),
1601	supplementary_decompositions: None,
1602	tables: DataPayload::from_static_ref(
1603	crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
1604	),
1605	supplementary_tables: None,
1606	decomposition_passthrough_bound: `0xC0`,
1607	composition_passthrough_bound: `0x0300`,
1608	}
1609	}
1610
1611	icu_provider::gen_any_buffer_data_constructors!(
1612	locale: skip,
1613	options: skip,
1614	error: NormalizerError,
1615	#[cfg(skip)]
1616	functions: [
1617	new_nfd,
1618	try_new_nfd_with_any_provider,
1619	try_new_nfd_with_buffer_provider,
1620	try_new_nfd_unstable,
1621	Self,
1622	]
1623	);
1624
1625	#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
1626	pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
1627	where
1628	D: DataProvider<CanonicalDecompositionDataV1Marker>
1629	+ DataProvider<CanonicalDecompositionTablesV1Marker>
1630	+ ?Sized,
1631	{
1632	let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
1633	provider.load(Default::default())?.take_payload()?;
1634	let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
1635	provider.load(Default::default())?.take_payload()?;
1636
1637	if tables.get().scalars16.len() + tables.get().scalars24.len() > `0xFFF` {
1638	// The data is from a future where there exists a normalization flavor whose
1639	// complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
1640	// of space. If a good use case from such a decomposition flavor arises, we can
1641	// dynamically change the bit masks so that the length mask becomes 0x1FFF instead
1642	// of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
1643	// since for now the masks are hard-coded, error out.
1644	return Err(NormalizerError::FutureExtension);
1645	}
1646
1647	Ok(DecomposingNormalizer {
1648	decompositions,
1649	supplementary_decompositions: None,
1650	tables,
1651	supplementary_tables: None,
1652	decomposition_passthrough_bound: `0xC0`,
1653	composition_passthrough_bound: `0x0300`,
1654	})
1655	}
1656
1657	/// NFKD constructor using compiled data.
1658	///
1659	/// ✨ Enabled with the `compiled_data` Cargo feature.
1660	///
1661	/// [📚 Help choosing a constructor](icu_provider::constructors)
1662	#[cfg(feature = "compiled_data")]
1663	pub const fn new_nfkd() -> Self {
1664	const _: () = assert!(
1665	crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1666	.scalars16
1667	.const_len()
1668	+ crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1669	.scalars24
1670	.const_len()
1671	+ crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
1672	.scalars16
1673	.const_len()
1674	+ crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
1675	.scalars24
1676	.const_len()
1677	<= `0xFFF`,
1678	"NormalizerError::FutureExtension"
1679	);
1680
1681	const _: () = assert!(
1682	crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap <= `0x0300`,
1683	"NormalizerError::ValidationError"
1684	);
1685
1686	let decomposition_capped =
1687	if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap < `0xC0` {
1688	crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap
1689	} else {
1690	`0xC0`
1691	};
1692	let composition_capped =
1693	if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap < `0x0300` {
1694	crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap
1695	} else {
1696	`0x0300`
1697	};
1698
1699	DecomposingNormalizer {
1700	decompositions: DataPayload::from_static_ref(
1701	crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
1702	),
1703	supplementary_decompositions: Some(SupplementPayloadHolder::Compatibility(
1704	DataPayload::from_static_ref(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1),
1705	)),
1706	tables: DataPayload::from_static_ref(
1707	crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
1708	),
1709	supplementary_tables: Some(DataPayload::from_static_ref(
1710	crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1,
1711	)),
1712	decomposition_passthrough_bound: decomposition_capped as u8,
1713	composition_passthrough_bound: composition_capped,
1714	}
1715	}
1716
1717	icu_provider::gen_any_buffer_data_constructors!(
1718	locale: skip,
1719	options: skip,
1720	error: NormalizerError,
1721	#[cfg(skip)]
1722	functions: [
1723	new_nfkd,
1724	try_new_nfkd_with_any_provider,
1725	try_new_nfkd_with_buffer_provider,
1726	try_new_nfkd_unstable,
1727	Self,
1728	]
1729	);
1730
1731	#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
1732	pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
1733	where
1734	D: DataProvider<CanonicalDecompositionDataV1Marker>
1735	+ DataProvider<CompatibilityDecompositionSupplementV1Marker>
1736	+ DataProvider<CanonicalDecompositionTablesV1Marker>
1737	+ DataProvider<CompatibilityDecompositionTablesV1Marker>
1738	+ ?Sized,
1739	{
1740	let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
1741	provider.load(Default::default())?.take_payload()?;
1742	let supplementary_decompositions: DataPayload<
1743	CompatibilityDecompositionSupplementV1Marker,
1744	> = provider.load(Default::default())?.take_payload()?;
1745	let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
1746	provider.load(Default::default())?.take_payload()?;
1747	let supplementary_tables: DataPayload<CompatibilityDecompositionTablesV1Marker> =
1748	provider.load(Default::default())?.take_payload()?;
1749
1750	if tables.get().scalars16.len()
1751	+ tables.get().scalars24.len()
1752	+ supplementary_tables.get().scalars16.len()
1753	+ supplementary_tables.get().scalars24.len()
1754	> `0xFFF`
1755	{
1756	// The data is from a future where there exists a normalization flavor whose
1757	// complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
1758	// of space. If a good use case from such a decomposition flavor arises, we can
1759	// dynamically change the bit masks so that the length mask becomes 0x1FFF instead
1760	// of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
1761	// since for now the masks are hard-coded, error out.
1762	return Err(NormalizerError::FutureExtension);
1763	}
1764
1765	let cap = supplementary_decompositions.get().passthrough_cap;
1766	if cap > `0x0300` {
1767	return Err(NormalizerError::ValidationError);
1768	}
1769	let decomposition_capped = cap.min(`0xC0`);
1770	let composition_capped = cap.min(`0x0300`);
1771
1772	Ok(DecomposingNormalizer {
1773	decompositions,
1774	supplementary_decompositions: Some(SupplementPayloadHolder::Compatibility(
1775	supplementary_decompositions,
1776	)),
1777	tables,
1778	supplementary_tables: Some(supplementary_tables),
1779	decomposition_passthrough_bound: decomposition_capped as u8,
1780	composition_passthrough_bound: composition_capped,
1781	})
1782	}
1783
1784	#[doc(hidden)]
1785	#[cfg(feature = "compiled_data")]
1786	pub(crate) const fn new_uts46_decomposed() -> Self {
1787	const _: () = assert!(
1788	crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1789	.scalars16
1790	.const_len()
1791	+ crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
1792	.scalars24
1793	.const_len()
1794	+ crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
1795	.scalars16
1796	.const_len()
1797	+ crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
1798	.scalars24
1799	.const_len()
1800	<= `0xFFF`,
1801	"NormalizerError::FutureExtension"
1802	);
1803
1804	const _: () = assert!(
1805	crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap <= `0x0300`,
1806	"NormalizerError::ValidationError"
1807	);
1808
1809	let decomposition_capped =
1810	if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap < `0xC0` {
1811	crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap
1812	} else {
1813	`0xC0`
1814	};
1815	let composition_capped =
1816	if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap < `0x0300` {
1817	crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap
1818	} else {
1819	`0x0300`
1820	};
1821
1822	DecomposingNormalizer {
1823	decompositions: DataPayload::from_static_ref(
1824	crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
1825	),
1826	supplementary_decompositions: Some(SupplementPayloadHolder::Uts46(
1827	DataPayload::from_static_ref(
1828	crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1,
1829	),
1830	)),
1831	tables: DataPayload::from_static_ref(
1832	crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
1833	),
1834	supplementary_tables: Some(DataPayload::from_static_ref(
1835	crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1,
1836	)),
1837	decomposition_passthrough_bound: decomposition_capped as u8,
1838	composition_passthrough_bound: composition_capped,
1839	}
1840	}
1841
1842	/// UTS 46 decomposed constructor (testing only)
1843	///
1844	/// This is a special building block normalization for IDNA. It is the decomposed counterpart of
1845	/// ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows and
1846	/// ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as in
1847	/// NFD in this normalization. In both cases, the previous UTS 46 processing before using
1848	/// normalization is expected to deal with these characters. Making the disallowed characters
1849	/// behave like this is beneficial to data size, and this normalizer implementation cannot
1850	/// deal with a character normalizing to the empty string, which doesn't happen in NFD or
1851	/// NFKD as of Unicode 14.
1852	///
1853	/// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
1854	/// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
1855	/// U+0345 from a reordered character into a non-reordered character before reordering happens.
1856	/// Therefore, the output of this normalization may differ for different inputs that are
1857	/// canonically equivalent with each other if they differ by how U+0345 is ordered relative
1858	/// to other reorderable characters.
1859	///
1860	/// Public for testing only.
1861	#[doc(hidden)]
1862	pub(crate) fn try_new_uts46_decomposed_unstable<D>(
1863	provider: &D,
1864	) -> Result<Self, NormalizerError>
1865	where
1866	D: DataProvider<CanonicalDecompositionDataV1Marker>
1867	+ DataProvider<Uts46DecompositionSupplementV1Marker>
1868	+ DataProvider<CanonicalDecompositionTablesV1Marker>
1869	+ DataProvider<CompatibilityDecompositionTablesV1Marker>
1870	// UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker
1871	+ ?Sized,
1872	{
1873	let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
1874	provider.load(Default::default())?.take_payload()?;
1875	let supplementary_decompositions: DataPayload<Uts46DecompositionSupplementV1Marker> =
1876	provider.load(Default::default())?.take_payload()?;
1877	let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
1878	provider.load(Default::default())?.take_payload()?;
1879	let supplementary_tables: DataPayload<CompatibilityDecompositionTablesV1Marker> =
1880	provider.load(Default::default())?.take_payload()?;
1881
1882	if tables.get().scalars16.len()
1883	+ tables.get().scalars24.len()
1884	+ supplementary_tables.get().scalars16.len()
1885	+ supplementary_tables.get().scalars24.len()
1886	> `0xFFF`
1887	{
1888	// The data is from a future where there exists a normalization flavor whose
1889	// complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
1890	// of space. If a good use case from such a decomposition flavor arises, we can
1891	// dynamically change the bit masks so that the length mask becomes 0x1FFF instead
1892	// of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
1893	// since for now the masks are hard-coded, error out.
1894	return Err(NormalizerError::FutureExtension);
1895	}
1896
1897	let cap = supplementary_decompositions.get().passthrough_cap;
1898	if cap > `0x0300` {
1899	return Err(NormalizerError::ValidationError);
1900	}
1901	let decomposition_capped = cap.min(`0xC0`);
1902	let composition_capped = cap.min(`0x0300`);
1903
1904	Ok(DecomposingNormalizer {
1905	decompositions,
1906	supplementary_decompositions: Some(SupplementPayloadHolder::Uts46(
1907	supplementary_decompositions,
1908	)),
1909	tables,
1910	supplementary_tables: Some(supplementary_tables),
1911	decomposition_passthrough_bound: decomposition_capped as u8,
1912	composition_passthrough_bound: composition_capped,
1913	})
1914	}
1915
1916	/// Wraps a delegate iterator into a decomposing iterator
1917	/// adapter by using the data already held by this normalizer.
1918	pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<I> {
1919	Decomposition::new_with_supplements(
1920	iter,
1921	self.decompositions.get(),
1922	self.supplementary_decompositions.as_ref().map(\|s\| s.get()),
1923	self.tables.get(),
1924	self.supplementary_tables.as_ref().map(\|s\| s.get()),
1925	self.decomposition_passthrough_bound,
1926	IgnorableBehavior::Unsupported,
1927	)
1928	}
1929
1930	normalizer_methods!();
1931
1932	decomposing_normalize_to!(
1933	/// Normalize a string slice into a `Write` sink.
1934	,
1935	normalize_to,
1936	core::fmt::Write,
1937	&str,
1938	{
1939	},
1940	as_str,
1941	{
1942	let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == `0xC0` {
1943	`0xC3u8`
1944	} else {
1945	decomposition_passthrough_bound.min(`0x80`) as u8
1946	};
1947	// The attribute belongs on an inner statement, but Rust doesn't allow it there.
1948	#[allow(clippy::unwrap_used)]
1949	'fast: loop {
1950	let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
1951	'fastest: loop {
1952	if let Some(&upcoming_byte) = code_unit_iter.next() {
1953	if upcoming_byte < decomposition_passthrough_byte_bound {
1954	// Fast-track succeeded!
1955	continue 'fastest;
1956	}
1957	decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - `1`..].chars();
1958	break 'fastest;
1959	}
1960	// End of stream
1961	sink.write_str(pending_slice)?;
1962	return Ok(());
1963	}
1964
1965	// `unwrap()` OK, because the slice is valid UTF-8 and we know there
1966	// is an upcoming byte.
1967	let upcoming = decomposition.delegate.next().unwrap();
1968	let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
1969	if upcoming_with_trie_value.starter_and_decomposes_to_self() {
1970	continue 'fast;
1971	}
1972	let consumed_so_far_slice = &pending_slice[..pending_slice.len()
1973	- decomposition.delegate.as_str().len()
1974	- upcoming.len_utf8()];
1975	sink.write_str(consumed_so_far_slice)?;
1976
1977	// Now let's figure out if we got a starter or a non-starter.
1978	if decomposition_starts_with_non_starter(
1979	upcoming_with_trie_value.trie_val,
1980	) {
1981	// Let this trie value to be reprocessed in case it is
1982	// one of the rare decomposing ones.
1983	decomposition.pending = Some(upcoming_with_trie_value);
1984	decomposition.gather_and_sort_combining(`0`);
1985	continue 'outer;
1986	}
1987	undecomposed_starter = upcoming_with_trie_value;
1988	debug_assert!(decomposition.pending.is_none());
1989	break 'fast;
1990	}
1991	},
1992	text,
1993	sink,
1994	decomposition,
1995	decomposition_passthrough_bound,
1996	undecomposed_starter,
1997	pending_slice,
1998	'outer,
1999	);
2000
2001	decomposing_normalize_to!(
2002	/// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
2003	///
2004	/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
2005	/// according to the WHATWG Encoding Standard.
2006	,
2007	normalize_utf8_to,
2008	core::fmt::Write,
2009	&[u8],
2010	{
2011	},
2012	as_slice,
2013	{
2014	let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(`0x80`) as u8;
2015	// The attribute belongs on an inner statement, but Rust doesn't allow it there.
2016	#[allow(clippy::unwrap_used)]
2017	'fast: loop {
2018	let mut code_unit_iter = decomposition.delegate.as_slice().iter();
2019	'fastest: loop {
2020	if let Some(&upcoming_byte) = code_unit_iter.next() {
2021	if upcoming_byte < decomposition_passthrough_byte_bound {
2022	// Fast-track succeeded!
2023	continue 'fastest;
2024	}
2025	break 'fastest;
2026	}
2027	// End of stream
2028	sink.write_str(unsafe { from_utf8_unchecked(pending_slice) })?;
2029	return Ok(());
2030	}
2031	decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - `1`..].chars();
2032
2033	// `unwrap()` OK, because the slice is valid UTF-8 and we know there
2034	// is an upcoming byte.
2035	let upcoming = decomposition.delegate.next().unwrap();
2036	let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
2037	if upcoming_with_trie_value.starter_and_decomposes_to_self() {
2038	if upcoming != REPLACEMENT_CHARACTER {
2039	continue 'fast;
2040	}
2041	// We might have an error, so fall out of the fast path.
2042
2043	// Since the U+FFFD might signify an error, we can't
2044	// assume `upcoming.len_utf8()` for the backoff length.
2045	let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
2046	let back = consumed_so_far.next_back();
2047	debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
2048	let consumed_so_far_slice = consumed_so_far.as_slice();
2049	sink.write_str(unsafe{from_utf8_unchecked(consumed_so_far_slice)})?;
2050
2051	// We could call `gather_and_sort_combining` here and
2052	// `continue 'outer`, but this should be better for code
2053	// size.
2054	undecomposed_starter = upcoming_with_trie_value;
2055	debug_assert!(decomposition.pending.is_none());
2056	break 'fast;
2057	}
2058	let consumed_so_far_slice = &pending_slice[..pending_slice.len()
2059	- decomposition.delegate.as_slice().len()
2060	- upcoming.len_utf8()];
2061	sink.write_str(unsafe{from_utf8_unchecked(consumed_so_far_slice)})?;
2062
2063	// Now let's figure out if we got a starter or a non-starter.
2064	if decomposition_starts_with_non_starter(
2065	upcoming_with_trie_value.trie_val,
2066	) {
2067	// Let this trie value to be reprocessed in case it is
2068	// one of the rare decomposing ones.
2069	decomposition.pending = Some(upcoming_with_trie_value);
2070	decomposition.gather_and_sort_combining(`0`);
2071	continue 'outer;
2072	}
2073	undecomposed_starter = upcoming_with_trie_value;
2074	debug_assert!(decomposition.pending.is_none());
2075	break 'fast;
2076	}
2077	},
2078	text,
2079	sink,
2080	decomposition,
2081	decomposition_passthrough_bound,
2082	undecomposed_starter,
2083	pending_slice,
2084	'outer,
2085	);
2086
2087	decomposing_normalize_to!(
2088	/// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
2089	///
2090	/// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
2091	/// before normalizing.
2092	,
2093	normalize_utf16_to,
2094	write16::Write16,
2095	&[u16],
2096	{
2097	sink.size_hint(text.len())?;
2098	},
2099	as_slice,
2100	{
2101	let mut code_unit_iter = decomposition.delegate.as_slice().iter();
2102	// The purpose of the counter is to flush once in a while. If we flush
2103	// too much, there is too much flushing overhead. If we flush too rarely,
2104	// the flush starts reading from too far behind compared to the hot
2105	// recently-read memory.
2106	let mut counter = UTF16_FAST_PATH_FLUSH_THRESHOLD;
2107	'fast: loop {
2108	counter -= `1`;
2109	if let Some(&upcoming_code_unit) = code_unit_iter.next() {
2110	let mut upcoming32 = u32::from(upcoming_code_unit);
2111	if upcoming32 < decomposition_passthrough_bound && counter != `0` {
2112	continue 'fast;
2113	}
2114	// The loop is only broken out of as goto forward
2115	#[allow(clippy::never_loop)]
2116	'surrogateloop: loop {
2117	let surrogate_base = upcoming32.wrapping_sub(`0xD800`);
2118	if surrogate_base > (`0xDFFF` - `0xD800`) {
2119	// Not surrogate
2120	break 'surrogateloop;
2121	}
2122	if surrogate_base <= (`0xDBFF` - `0xD800`) {
2123	let iter_backup = code_unit_iter.clone();
2124	if let Some(&low) = code_unit_iter.next() {
2125	if in_inclusive_range16(low, `0xDC00`, `0xDFFF`) {
2126	upcoming32 = (upcoming32 << `10`) + u32::from(low)
2127	- (((`0xD800u32` << `10`) - `0x10000u32`) + `0xDC00u32`);
2128	break 'surrogateloop;
2129	} else {
2130	code_unit_iter = iter_backup;
2131	}
2132	}
2133	}
2134	// unpaired surrogate
2135	let slice_to_write = &pending_slice
2136	[..pending_slice.len() - code_unit_iter.as_slice().len() - `1`];
2137	sink.write_slice(slice_to_write)?;
2138	undecomposed_starter =
2139	CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, `0`);
2140	debug_assert!(decomposition.pending.is_none());
2141	// We could instead call `gather_and_sort_combining` and `continue 'outer`, but
2142	// assuming this is better for code size.
2143	break 'fast;
2144	}
2145	// Not unpaired surrogate
2146	let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
2147	let upcoming_with_trie_value =
2148	decomposition.attach_trie_value(upcoming);
2149	if upcoming_with_trie_value.starter_and_decomposes_to_self() && counter != `0` {
2150	continue 'fast;
2151	}
2152	let consumed_so_far_slice = &pending_slice[..pending_slice.len()
2153	- code_unit_iter.as_slice().len()
2154	- upcoming.len_utf16()];
2155	sink.write_slice(consumed_so_far_slice)?;
2156
2157	// Now let's figure out if we got a starter or a non-starter.
2158	if decomposition_starts_with_non_starter(
2159	upcoming_with_trie_value.trie_val,
2160	) {
2161	// Sync with main iterator
2162	decomposition.delegate = code_unit_iter.as_slice().chars();
2163	// Let this trie value to be reprocessed in case it is
2164	// one of the rare decomposing ones.
2165	decomposition.pending = Some(upcoming_with_trie_value);
2166	decomposition.gather_and_sort_combining(`0`);
2167	continue 'outer;
2168	}
2169	undecomposed_starter = upcoming_with_trie_value;
2170	debug_assert!(decomposition.pending.is_none());
2171	break 'fast;
2172	}
2173	// End of stream
2174	sink.write_slice(pending_slice)?;
2175	return Ok(());
2176	}
2177	// Sync the main iterator
2178	decomposition.delegate = code_unit_iter.as_slice().chars();
2179	},
2180	text,
2181	sink,
2182	decomposition,
2183	decomposition_passthrough_bound,
2184	undecomposed_starter,
2185	pending_slice,
2186	'outer,
2187	);
2188	}
2189
2190	/// A normalizer for performing composing normalization.
2191	#[derive(Debug)]
2192	pub struct ComposingNormalizer {
2193	decomposing_normalizer: DecomposingNormalizer,
2194	canonical_compositions: DataPayload<CanonicalCompositionsV1Marker>,
2195	}
2196
2197	impl ComposingNormalizer {
2198	/// NFC constructor using compiled data.
2199	///
2200	/// ✨ Enabled with the `compiled_data` Cargo feature.
2201	///
2202	/// [📚 Help choosing a constructor](icu_provider::constructors)
2203	#[cfg(feature = "compiled_data")]
2204	pub const fn new_nfc() -> Self {
2205	ComposingNormalizer {
2206	decomposing_normalizer: DecomposingNormalizer::new_nfd(),
2207	canonical_compositions: DataPayload::from_static_ref(
2208	crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
2209	),
2210	}
2211	}
2212
2213	icu_provider::gen_any_buffer_data_constructors!(
2214	locale: skip,
2215	options: skip,
2216	error: NormalizerError,
2217	#[cfg(skip)]
2218	functions: [
2219	new_nfc,
2220	try_new_nfc_with_any_provider,
2221	try_new_nfc_with_buffer_provider,
2222	try_new_nfc_unstable,
2223	Self,
2224	]
2225	);
2226
2227	#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
2228	pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
2229	where
2230	D: DataProvider<CanonicalDecompositionDataV1Marker>
2231	+ DataProvider<CanonicalDecompositionTablesV1Marker>
2232	+ DataProvider<CanonicalCompositionsV1Marker>
2233	+ ?Sized,
2234	{
2235	let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?;
2236
2237	let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
2238	provider.load(Default::default())?.take_payload()?;
2239
2240	Ok(ComposingNormalizer {
2241	decomposing_normalizer,
2242	canonical_compositions,
2243	})
2244	}
2245
2246	/// NFKC constructor using compiled data.
2247	///
2248	/// ✨ Enabled with the `compiled_data` Cargo feature.
2249	///
2250	/// [📚 Help choosing a constructor](icu_provider::constructors)
2251	#[cfg(feature = "compiled_data")]
2252	pub const fn new_nfkc() -> Self {
2253	ComposingNormalizer {
2254	decomposing_normalizer: DecomposingNormalizer::new_nfkd(),
2255	canonical_compositions: DataPayload::from_static_ref(
2256	crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
2257	),
2258	}
2259	}
2260
2261	icu_provider::gen_any_buffer_data_constructors!(
2262	locale: skip,
2263	options: skip,
2264	error: NormalizerError,
2265	#[cfg(skip)]
2266	functions: [
2267	new_nfkc,
2268	try_new_nfkc_with_any_provider,
2269	try_new_nfkc_with_buffer_provider,
2270	try_new_nfkc_unstable,
2271	Self,
2272	]
2273	);
2274
2275	#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
2276	pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
2277	where
2278	D: DataProvider<CanonicalDecompositionDataV1Marker>
2279	+ DataProvider<CompatibilityDecompositionSupplementV1Marker>
2280	+ DataProvider<CanonicalDecompositionTablesV1Marker>
2281	+ DataProvider<CompatibilityDecompositionTablesV1Marker>
2282	+ DataProvider<CanonicalCompositionsV1Marker>
2283	+ ?Sized,
2284	{
2285	let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?;
2286
2287	let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
2288	provider.load(Default::default())?.take_payload()?;
2289
2290	Ok(ComposingNormalizer {
2291	decomposing_normalizer,
2292	canonical_compositions,
2293	})
2294	}
2295
2296	/// This is a special building block normalization for IDNA that implements parts of the Map
2297	/// step and the following Normalize step.
2298	///
2299	/// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2300	/// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
2301	/// U+0345 from a reordered character into a non-reordered character before reordering happens.
2302	/// Therefore, the output of this normalization may differ for different inputs that are
2303	/// canonically equivalents with each other if they differ by how U+0345 is ordered relative
2304	/// to other reorderable characters.
2305	#[cfg(feature = "compiled_data")]
2306	pub(crate) const fn new_uts46() -> Self {
2307	ComposingNormalizer {
2308	decomposing_normalizer: DecomposingNormalizer::new_uts46_decomposed(),
2309	canonical_compositions: DataPayload::from_static_ref(
2310	crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
2311	),
2312	}
2313	}
2314
2315	#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
2316	pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
2317	where
2318	D: DataProvider<CanonicalDecompositionDataV1Marker>
2319	+ DataProvider<Uts46DecompositionSupplementV1Marker>
2320	+ DataProvider<CanonicalDecompositionTablesV1Marker>
2321	+ DataProvider<CompatibilityDecompositionTablesV1Marker>
2322	// UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker
2323	+ DataProvider<CanonicalCompositionsV1Marker>
2324	+ ?Sized,
2325	{
2326	let decomposing_normalizer =
2327	DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
2328
2329	let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
2330	provider.load(Default::default())?.take_payload()?;
2331
2332	Ok(ComposingNormalizer {
2333	decomposing_normalizer,
2334	canonical_compositions,
2335	})
2336	}
2337
2338	/// Wraps a delegate iterator into a composing iterator
2339	/// adapter by using the data already held by this normalizer.
2340	pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<I> {
2341	self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
2342	}
2343
2344	fn normalize_iter_private<I: Iterator<Item = char>>(
2345	&self,
2346	iter: I,
2347	ignorable_behavior: IgnorableBehavior,
2348	) -> Composition<I> {
2349	Composition::new(
2350	Decomposition::new_with_supplements(
2351	iter,
2352	self.decomposing_normalizer.decompositions.get(),
2353	self.decomposing_normalizer
2354	.supplementary_decompositions
2355	.as_ref()
2356	.map(\|s\| s.get()),
2357	self.decomposing_normalizer.tables.get(),
2358	self.decomposing_normalizer
2359	.supplementary_tables
2360	.as_ref()
2361	.map(\|s\| s.get()),
2362	self.decomposing_normalizer.decomposition_passthrough_bound,
2363	ignorable_behavior,
2364	),
2365	ZeroFrom::zero_from(&self.canonical_compositions.get().canonical_compositions),
2366	self.decomposing_normalizer.composition_passthrough_bound,
2367	)
2368	}
2369
2370	normalizer_methods!();
2371
2372	composing_normalize_to!(
2373	/// Normalize a string slice into a `Write` sink.
2374	,
2375	normalize_to,
2376	core::fmt::Write,
2377	&str,
2378	{},
2379	`true`,
2380	as_str,
2381	{
2382	// Let's hope LICM hoists this outside `'outer`.
2383	let composition_passthrough_byte_bound = if composition_passthrough_bound == `0x300` {
2384	`0xCCu8`
2385	} else {
2386	// We can make this fancy if a normalization other than NFC where looking at
2387	// non-ASCII lead bytes is worthwhile is ever introduced.
2388	composition_passthrough_bound.min(`0x80`) as u8
2389	};
2390	// This is basically an `Option` discriminant for `undecomposed_starter`,
2391	// but making it a boolean so that writes in the tightest loop are as
2392	// simple as possible (and potentially as peel-hoistable as possible).
2393	// Furthermore, this reduces `unwrap()` later.
2394	let mut undecomposed_starter_valid = `true`;
2395	// Annotation belongs really on inner statements, but Rust doesn't
2396	// allow it there.
2397	#[allow(clippy::unwrap_used)]
2398	'fast: loop {
2399	let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
2400	'fastest: loop {
2401	if let Some(&upcoming_byte) = code_unit_iter.next() {
2402	if upcoming_byte < composition_passthrough_byte_bound {
2403	// Fast-track succeeded!
2404	undecomposed_starter_valid = `false`;
2405	continue 'fastest;
2406	}
2407	composition.decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - `1`..].chars();
2408	break 'fastest;
2409	}
2410	// End of stream
2411	sink.write_str(pending_slice)?;
2412	return Ok(());
2413	}
2414	// `unwrap()` OK, because the slice is valid UTF-8 and we know there
2415	// is an upcoming byte.
2416	let upcoming = composition.decomposition.delegate.next().unwrap();
2417	let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2418	if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2419	// Can't combine backwards, hence a plain (non-backwards-combining)
2420	// starter albeit past `composition_passthrough_bound`
2421
2422	// Fast-track succeeded!
2423	undecomposed_starter = upcoming_with_trie_value;
2424	undecomposed_starter_valid = `true`;
2425	continue 'fast;
2426	}
2427	// We need to fall off the fast path.
2428	composition.decomposition.pending = Some(upcoming_with_trie_value);
2429	let consumed_so_far_slice = if undecomposed_starter_valid {
2430	&pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8() - undecomposed_starter.character.len_utf8()]
2431	} else {
2432	// slicing and unwrap OK, because we've just evidently read enough previously.
2433	let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
2434	// `unwrap` OK, because we've previously manage to read the previous character
2435	undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2436	undecomposed_starter_valid = `true`;
2437	consumed_so_far.as_str()
2438	};
2439	sink.write_str(consumed_so_far_slice)?;
2440	break 'fast;
2441	}
2442	debug_assert!(undecomposed_starter_valid);
2443	},
2444	text,
2445	sink,
2446	composition,
2447	composition_passthrough_bound,
2448	undecomposed_starter,
2449	pending_slice,
2450	len_utf8,
2451	);
2452
2453	composing_normalize_to!(
2454	/// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
2455	///
2456	/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
2457	/// according to the WHATWG Encoding Standard.
2458	,
2459	normalize_utf8_to,
2460	core::fmt::Write,
2461	&[u8],
2462	{},
2463	`false`,
2464	as_slice,
2465	{
2466	// This is basically an `Option` discriminant for `undecomposed_starter`,
2467	// but making it a boolean so that writes in the tightest loop are as
2468	// simple as possible (and potentially as peel-hoistable as possible).
2469	// Furthermore, this reduces `unwrap()` later.
2470	let mut undecomposed_starter_valid = `true`;
2471	'fast: loop {
2472	if let Some(upcoming) = composition.decomposition.delegate.next() {
2473	if u32::from(upcoming) < composition_passthrough_bound {
2474	// Fast-track succeeded!
2475	undecomposed_starter_valid = `false`;
2476	continue 'fast;
2477	}
2478	// TODO(#2006): Annotate as unlikely
2479	if upcoming == REPLACEMENT_CHARACTER {
2480	// Can't tell if this is an error or a literal U+FFFD in
2481	// the input. Assuming the former to be sure.
2482
2483	// Since the U+FFFD might signify an error, we can't
2484	// assume `upcoming.len_utf8()` for the backoff length.
2485	let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
2486	let back = consumed_so_far.next_back();
2487	debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
2488	let consumed_so_far_slice = consumed_so_far.as_slice();
2489	sink.write_str(unsafe{ from_utf8_unchecked(consumed_so_far_slice)})?;
2490	undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, `0`);
2491	undecomposed_starter_valid = `true`;
2492	composition.decomposition.pending = None;
2493	break 'fast;
2494	}
2495	let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2496	if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2497	// Can't combine backwards, hence a plain (non-backwards-combining)
2498	// starter albeit past `composition_passthrough_bound`
2499
2500	// Fast-track succeeded!
2501	undecomposed_starter = upcoming_with_trie_value;
2502	undecomposed_starter_valid = `true`;
2503	continue 'fast;
2504	}
2505	// We need to fall off the fast path.
2506	composition.decomposition.pending = Some(upcoming_with_trie_value);
2507	// Annotation belongs really on inner statement, but Rust doesn't
2508	// allow it there.
2509	#[allow(clippy::unwrap_used)]
2510	let consumed_so_far_slice = if undecomposed_starter_valid {
2511	&pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8() - undecomposed_starter.character.len_utf8()]
2512	} else {
2513	// slicing and unwrap OK, because we've just evidently read enough previously.
2514	let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
2515	// `unwrap` OK, because we've previously manage to read the previous character
2516	undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2517	undecomposed_starter_valid = `true`;
2518	consumed_so_far.as_slice()
2519	};
2520	sink.write_str(unsafe { from_utf8_unchecked(consumed_so_far_slice)})?;
2521	break 'fast;
2522	}
2523	// End of stream
2524	sink.write_str(unsafe {from_utf8_unchecked(pending_slice) })?;
2525	return Ok(());
2526	}
2527	debug_assert!(undecomposed_starter_valid);
2528	},
2529	text,
2530	sink,
2531	composition,
2532	composition_passthrough_bound,
2533	undecomposed_starter,
2534	pending_slice,
2535	len_utf8,
2536	);
2537
2538	composing_normalize_to!(
2539	/// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
2540	///
2541	/// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
2542	/// before normalizing.
2543	,
2544	normalize_utf16_to,
2545	write16::Write16,
2546	&[u16],
2547	{
2548	sink.size_hint(text.len())?;
2549	},
2550	`false`,
2551	as_slice,
2552	{
2553	let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
2554	let mut upcoming32;
2555	// This is basically an `Option` discriminant for `undecomposed_starter`,
2556	// but making it a boolean so that writes to it are are as
2557	// simple as possible.
2558	// Furthermore, this removes the need for `unwrap()` later.
2559	let mut undecomposed_starter_valid;
2560	// The purpose of the counter is to flush once in a while. If we flush
2561	// too much, there is too much flushing overhead. If we flush too rarely,
2562	// the flush starts reading from too far behind compared to the hot
2563	// recently-read memory.
2564	let mut counter = UTF16_FAST_PATH_FLUSH_THRESHOLD;
2565	// The purpose of this trickiness is to avoid writing to
2566	// `undecomposed_starter_valid` from the tightest loop. Writing to it
2567	// from there destroys performance.
2568	let mut counter_reference = counter - `1`;
2569	'fast: loop {
2570	counter -= `1`;
2571	if let Some(&upcoming_code_unit) = code_unit_iter.next() {
2572	upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
2573	if upcoming32 < composition_passthrough_bound && counter != `0` {
2574	// No need for surrogate or U+FFFD check, because
2575	// `composition_passthrough_bound` cannot be higher than
2576	// U+0300.
2577	// Fast-track succeeded!
2578	continue 'fast;
2579	}
2580	// if `counter` equals `counter_reference`, the `continue 'fast`
2581	// line above has not executed and `undecomposed_starter` is still
2582	// valid.
2583	undecomposed_starter_valid = counter == counter_reference;
2584	// The loop is only broken out of as goto forward
2585	#[allow(clippy::never_loop)]
2586	'surrogateloop: loop {
2587	let surrogate_base = upcoming32.wrapping_sub(`0xD800`);
2588	if surrogate_base > (`0xDFFF` - `0xD800`) {
2589	// Not surrogate
2590	break 'surrogateloop;
2591	}
2592	if surrogate_base <= (`0xDBFF` - `0xD800`) {
2593	let iter_backup = code_unit_iter.clone();
2594	if let Some(&low) = code_unit_iter.next() {
2595	if in_inclusive_range16(low, `0xDC00`, `0xDFFF`) {
2596	upcoming32 = (upcoming32 << `10`) + u32::from(low)
2597	- (((`0xD800u32` << `10`) - `0x10000u32`) + `0xDC00u32`);
2598	break 'surrogateloop;
2599	} else {
2600	code_unit_iter = iter_backup;
2601	}
2602	}
2603	}
2604	// unpaired surrogate
2605	let slice_to_write = &pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - `1`];
2606	sink.write_slice(slice_to_write)?;
2607	undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, `0`);
2608	undecomposed_starter_valid = `true`;
2609	composition.decomposition.pending = None;
2610	break 'fast;
2611	}
2612	// Not unpaired surrogate
2613	let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
2614	let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2615	if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() && counter != `0` {
2616	// Can't combine backwards, hence a plain (non-backwards-combining)
2617	// starter albeit past `composition_passthrough_bound`
2618
2619	// Fast-track succeeded!
2620	undecomposed_starter = upcoming_with_trie_value;
2621	// Cause `undecomposed_starter_valid` to be set to true.
2622	// This regresses English performance on Haswell by 11%
2623	// compared to commenting out this assignment to
2624	// `counter_reference`.
2625	counter_reference = counter - `1`;
2626	continue 'fast;
2627	}
2628	// We need to fall off the fast path.
2629	composition.decomposition.pending = Some(upcoming_with_trie_value);
2630	// Annotation belongs really on inner statement, but Rust doesn't
2631	// allow it there.
2632	#[allow(clippy::unwrap_used)]
2633	let consumed_so_far_slice = if undecomposed_starter_valid {
2634	&pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - upcoming.len_utf16() - undecomposed_starter.character.len_utf16()]
2635	} else {
2636	// slicing and unwrap OK, because we've just evidently read enough previously.
2637	let mut consumed_so_far = pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - upcoming.len_utf16()].chars();
2638	// `unwrap` OK, because we've previously manage to read the previous character
2639	undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2640	undecomposed_starter_valid = `true`;
2641	consumed_so_far.as_slice()
2642	};
2643	sink.write_slice(consumed_so_far_slice)?;
2644	break 'fast;
2645	}
2646	// End of stream
2647	sink.write_slice(pending_slice)?;
2648	return Ok(());
2649	}
2650	debug_assert!(undecomposed_starter_valid);
2651	// Sync the main iterator
2652	composition.decomposition.delegate = code_unit_iter.as_slice().chars();
2653	},
2654	text,
2655	sink,
2656	composition,
2657	composition_passthrough_bound,
2658	undecomposed_starter,
2659	pending_slice,
2660	len_utf16,
2661	);
2662	}
2663
2664	struct IsNormalizedSinkUtf16<'a> {
2665	expect: &'a [u16],
2666	}
2667
2668	impl<'a> IsNormalizedSinkUtf16<'a> {
2669	pub fn new(slice: &'a [u16]) -> Self {
2670	IsNormalizedSinkUtf16 { expect: slice }
2671	}
2672	pub fn finished(&self) -> bool {
2673	self.expect.is_empty()
2674	}
2675	}
2676
2677	impl<'a> Write16 for IsNormalizedSinkUtf16<'a> {
2678	fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
2679	// We know that if we get a slice, it's a pass-through,
2680	// so we can compare addresses. Indexing is OK, because
2681	// an indexing failure would be a code bug rather than
2682	// an input or data issue.
2683	#[allow(clippy::indexing_slicing)]
2684	if s.as_ptr() == self.expect.as_ptr() {
2685	self.expect = &self.expect[s.len()..];
2686	Ok(())
2687	} else {
2688	Err(core::fmt::Error {})
2689	}
2690	}
2691
2692	fn write_char(&mut self, c: char) -> core::fmt::Result {
2693	let mut iter = self.expect.chars();
2694	if iter.next() == Some(c) {
2695	self.expect = iter.as_slice();
2696	Ok(())
2697	} else {
2698	Err(core::fmt::Error {})
2699	}
2700	}
2701	}
2702
2703	struct IsNormalizedSinkUtf8<'a> {
2704	expect: &'a [u8],
2705	}
2706
2707	impl<'a> IsNormalizedSinkUtf8<'a> {
2708	pub fn new(slice: &'a [u8]) -> Self {
2709	IsNormalizedSinkUtf8 { expect: slice }
2710	}
2711	pub fn finished(&self) -> bool {
2712	self.expect.is_empty()
2713	}
2714	}
2715
2716	impl<'a> core::fmt::Write for IsNormalizedSinkUtf8<'a> {
2717	fn write_str(&mut self, s: &str) -> core::fmt::Result {
2718	// We know that if we get a slice, it's a pass-through,
2719	// so we can compare addresses. Indexing is OK, because
2720	// an indexing failure would be a code bug rather than
2721	// an input or data issue.
2722	#[allow(clippy::indexing_slicing)]
2723	if s.as_ptr() == self.expect.as_ptr() {
2724	self.expect = &self.expect[s.len()..];
2725	Ok(())
2726	} else {
2727	Err(core::fmt::Error {})
2728	}
2729	}
2730
2731	fn write_char(&mut self, c: char) -> core::fmt::Result {
2732	let mut iter = self.expect.chars();
2733	if iter.next() == Some(c) {
2734	self.expect = iter.as_slice();
2735	Ok(())
2736	} else {
2737	Err(core::fmt::Error {})
2738	}
2739	}
2740	}
2741
2742	struct IsNormalizedSinkStr<'a> {
2743	expect: &'a str,
2744	}
2745
2746	impl<'a> IsNormalizedSinkStr<'a> {
2747	pub fn new(slice: &'a str) -> Self {
2748	IsNormalizedSinkStr { expect: slice }
2749	}
2750	pub fn finished(&self) -> bool {
2751	self.expect.is_empty()
2752	}
2753	}
2754
2755	impl<'a> core::fmt::Write for IsNormalizedSinkStr<'a> {
2756	fn write_str(&mut self, s: &str) -> core::fmt::Result {
2757	// We know that if we get a slice, it's a pass-through,
2758	// so we can compare addresses. Indexing is OK, because
2759	// an indexing failure would be a code bug rather than
2760	// an input or data issue.
2761	#[allow(clippy::indexing_slicing)]
2762	if s.as_ptr() == self.expect.as_ptr() {
2763	self.expect = &self.expect[s.len()..];
2764	Ok(())
2765	} else {
2766	Err(core::fmt::Error {})
2767	}
2768	}
2769
2770	fn write_char(&mut self, c: char) -> core::fmt::Result {
2771	let mut iter = self.expect.chars();
2772	if iter.next() == Some(c) {
2773	self.expect = iter.as_str();
2774	Ok(())
2775	} else {
2776	Err(core::fmt::Error {})
2777	}
2778	}
2779	}
2780