uts46.rs source code [crates/idna/src/uts46.rs]

1	// Copyright The rust-url developers.
2	//
3	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6	// option. This file may not be copied, modified, or distributed
7	// except according to those terms.
8
9	//! This module provides the lower-level API for UTS 46.
10	//!
11	//! [`Uts46::process`] is the core that the other convenience
12	//! methods build on.
13	//!
14	//! UTS 46 flags map to this API as follows:
15	//!
//! * _CheckHyphens_ - _true_: [`Hyphens::Check`], _false_: [`Hyphens::Allow`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents.
//! * _CheckBidi_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_.
//! * _CheckJoiners_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_.
//! * _UseSTD3ASCIIRules_ - _true_: [`AsciiDenyList::STD3`], _false_: [`AsciiDenyList::EMPTY`]; however, the check the WHATWG URL Standard performs right after the UTS 46 invocation corresponds to [`AsciiDenyList::URL`].
//! * _Transitional_Processing_ - Always _false_ but could be implemented as a preprocessing step. This flag is deprecated and for Web purposes the transition is over in the sense that all of Firefox, Safari, and Chrome set this flag to _false_.
//! * _VerifyDnsLength_ - _true_: [`DnsLength::Verify`], _false_: [`DnsLength::Ignore`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents.
//! * _IgnoreInvalidPunycode_ - Always _false_; cannot be configured. (Not yet covered by the WHATWG URL Standard, but 2 out of 3 major browsers clearly behave as if this was _false_.)
23
24	use crate::punycode::Decoder;
25	use crate::punycode::InternalCaller;
26	use alloc::borrow::Cow;
27	use alloc::string::String;
28	use core::fmt::Write;
29	use idna_adapter::*;
30	use smallvec::SmallVec;
31	use utf8_iter::Utf8CharsEx;
32
/// ICU4C-compatible cap on the length of Punycode decoder input.
///
/// See <https://unicode-org.atlassian.net/browse/ICU-13727>.
const PUNYCODE_DECODE_MAX_INPUT_LENGTH: usize = 2_000;
36
/// ICU4C-compatible cap on the length of Punycode encoder input.
///
/// ICU4C measures UTF-16 whereas this implementation measures UTF-32,
/// so longer non-BMP inputs are allowed here than in ICU4C. That is fine
/// for this implementation, because its denial-of-service scaling depends
/// only on the number of scalar values, not on BMP vs. non-BMP.
///
/// See <https://unicode-org.atlassian.net/browse/ICU-13727>.
const PUNYCODE_ENCODE_MAX_INPUT_LENGTH: usize = 1_000;
45
/// Records which kind of numerals, if any, have been seen so far
/// in an RTL label.
#[derive(Debug, Eq, PartialEq)]
enum RtlNumeralState {
    /// No kind of numeral has been settled on yet.
    Undecided,
    /// European numerals have been seen.
    European,
    /// Arabic numerals have been seen.
    Arabic,
}
54
/// Builds the bit set that has exactly the bits for `A`..=`Z` set
/// (bit `n` corresponds to ASCII code point `n`).
const fn upper_case_mask() -> u128 {
    let mut mask = 0u128;
    // `for` is not available in `const fn`, hence the manual loop.
    let mut letter = b'A';
    while letter <= b'Z' {
        mask |= 1u128 << letter;
        letter += 1;
    }
    mask
}

/// Bit set for upper-case ASCII.
const UPPER_CASE_MASK: u128 = upper_case_mask();
70
/// Builds the bit set for glyphless ASCII: every code point up to and
/// including U+0020 SPACE, plus U+007F DELETE.
const fn glyphless_mask() -> u128 {
    let mut mask = 0u128;
    // `for` is not available in `const fn`, hence the manual loop.
    let mut code = 0u8;
    while code <= b' ' {
        mask |= 1u128 << code;
        code += 1;
    }
    mask | (1u128 << 0x7Fu8)
}

/// Bit set for glyphless ASCII.
const GLYPHLESS_MASK: u128 = glyphless_mask();
86
/// Bit set containing only the bit for the ASCII dot (`.`).
const DOT_MASK: u128 = 1u128 << b'.';
89
/// Computes the ASCII deny list for STD3 ASCII rules.
///
/// Every ASCII code point gets its bit set except lower-case letters,
/// digits, the hyphen, and the dot.
const fn ldh_mask() -> u128 {
    let mut denied = 0u128;
    let mut code = 0u8;
    while code < 0x80 {
        let permitted = matches!(code, b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.');
        if !permitted {
            denied |= 1u128 << code;
        }
        code += 1;
    }
    denied
}
102
/// The bytes `X`, `N`, `-`, `-` packed into a `u32` with the first byte
/// in the lowest 8 bits.
const PUNYCODE_PREFIX: u32 = u32::from_le_bytes([b'X', b'N', b'-', b'-']);

/// Mask that clears bit `0x20` (the ASCII case bit) in the two lowest bytes
/// and keeps the two highest bytes intact.
const PUNYCODE_PREFIX_MASK: u32 = u32::from_le_bytes([0xDF, 0xDF, 0xFF, 0xFF]);
107
108	fn write_punycode_label<W: Write + ?Sized>(
109	label: &[char],
110	sink: &mut W,
111	) -> Result<(), ProcessingError> {
112	sink.write_str("xn--")?;
113	crate::punycode::encode_into::<_, _, InternalCaller>(input:label.iter().copied(), output:sink)?;
114	Ok(())
115	}
116
117	#[inline(always)]
118	fn has_punycode_prefix(slice: &[u8]) -> bool {
119	if slice.len() < `4` {
120	return `false`;
121	}
122	// Sadly, the optimizer doesn't figure out that more idiomatic code
123	// should compile to masking on 32-bit value.
124	let a: u8 = slice[`0`];
125	let b: u8 = slice[`1`];
126	let c: u8 = slice[`2`];
127	let d: u8 = slice[`3`];
128	let u: u32 = (u32::from(d) << `24`) \| (u32::from(c) << `16`) \| (u32::from(b) << `8`) \| u32::from(a);
129	(u & PUNYCODE_PREFIX_MASK) == PUNYCODE_PREFIX
130	}
131
/// Returns `true` iff `start <= u <= end`, using a single comparison.
///
/// Callers must pass `start <= end`.
#[inline(always)]
fn in_inclusive_range8(u: u8, start: u8, end: u8) -> bool {
    // Values below `start` wrap around to something larger than the span.
    let offset = u.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
136
/// Returns `true` iff `start <= c <= end`, using a single comparison
/// on the scalar values.
///
/// Callers must pass `start <= end`.
#[inline(always)]
fn in_inclusive_range_char(c: char, start: char, end: char) -> bool {
    let low = u32::from(start);
    // Values below `start` wrap around to something larger than the span.
    let offset = u32::from(c).wrapping_sub(low);
    let span = u32::from(end) - low;
    offset <= span
}
141
/// Decides whether an ASCII label can be passed through as-is.
///
/// The empty label qualifies. A non-empty label qualifies when it starts
/// with a lower-case ASCII letter, continues with lower-case ASCII letters,
/// ASCII digits, or hyphens only, does not end with a hyphen, and does not
/// have hyphens in both the third and the fourth position.
#[inline(always)]
fn is_passthrough_ascii_label(label: &[u8]) -> bool {
    match label {
        // empty
        [] => true,
        // XXX if we aren't performing _CheckHyphens_, this could
        // check for "xn--" and pass through YouTube CDN node names.
        [_, _, b'-', b'-', ..] => false,
        // We need to check the first and last character
        // more strictly in case this turns out to be a
        // label in a bidi domain name. This has the side
        // effect that this function only accepts labels
        // that also conform to the STD3 rules.
        //
        // XXX: If we are in the fail-fast mode (i.e. we don't need
        // to be able to overwrite anything with U+FFFD), we could
        // merely record that we've seen a digit here and error out
        // if we later discover that the domain name is a bidi
        // domain name.
        [first, tail @ ..] => {
            first.is_ascii_lowercase()
                // If we used LDH_MASK, we'd have to check
                // the bytes for the ASCII range anyhow.
                && tail
                    .iter()
                    .all(|&b| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'-')
                && label.last() != Some(&b'-')
        }
    }
}
184
/// Splits `label` into an ASCII prefix and the remainder.
///
/// If `label` is all ASCII, the prefix is the whole label and the remainder
/// is empty. Otherwise the remainder starts one byte before the first
/// non-ASCII byte (or at that byte if it is the very first one).
#[inline(always)]
fn split_ascii_fast_path_prefix(label: &[u8]) -> (&[u8], &[u8]) {
    const EMPTY: &[u8] = &[];
    match label.iter().position(|b| !b.is_ascii()) {
        // All ASCII
        None => (label, EMPTY),
        // First is non-ASCII
        Some(0) => (EMPTY, label),
        // Leave one ASCII character in the suffix
        // in case it's a letter that a combining
        // character combines with.
        Some(first_non_ascii) => label.split_at(first_non_ascii - 1),
    }
}
203
/// Maps `c` to U+FFFD if its bit is set in `deny_list`; otherwise returns
/// `c` unchanged. Characters whose scalar value is 128 or above never
/// match, because the shift does not fit in the 128-bit set.
// Input known to be lower-case, but may contain non-ASCII.
#[inline(always)]
fn apply_ascii_deny_list_to_lower_cased_unicode(c: char, deny_list: u128) -> char {
    match 1u128.checked_shl(u32::from(c)) {
        Some(bit) if (deny_list & bit) != 0 => '\u{FFFD}',
        _ => c,
    }
}
217
/// Applies `deny_list` to the ASCII byte `b`.
///
/// A byte whose bit is clear is returned as-is. A byte whose bit is set is
/// lower-cased if it is an upper-case ASCII letter and replaced with U+FFFD
/// otherwise.
// Input known to be ASCII, but may contain upper case ASCII.
#[inline(always)]
fn apply_ascii_deny_list_to_potentially_upper_case_ascii(b: u8, deny_list: u128) -> char {
    let denied = (deny_list & (1u128 << b)) != 0;
    if !denied {
        char::from(b)
    } else if b.is_ascii_uppercase() {
        char::from(b + 0x20)
    } else {
        '\u{FFFD}'
    }
}
229
/// Returns `true` iff every `char` in `label` is ASCII (vacuously `true`
/// for the empty label).
#[inline(always)]
fn is_ascii(label: &[char]) -> bool {
    label.iter().all(char::is_ascii)
}
239
/// Result of inspecting a label ahead of Punycode handling.
#[derive(Clone, Copy, Eq, PartialEq)]
enum PunycodeClassification {
    /// The label consists of ASCII only.
    Ascii,
    /// The label contains non-ASCII but no U+FFFD.
    Unicode,
    /// The label contains U+FFFD.
    Error,
}
246
247	#[inline(always)]
248	fn classify_for_punycode(label: &[char]) -> PunycodeClassification {
249	let mut iter: impl Iterator = label.iter().copied();
250	loop {
251	if let Some(c: char) = iter.next() {
252	if c.is_ascii() {
253	continue;
254	}
255	if c == '`\u{FFFD}`' {
256	return PunycodeClassification::Error;
257	}
258	for c: char in iter {
259	if c == '`\u{FFFD}`' {
260	return PunycodeClassification::Error;
261	}
262	}
263	return PunycodeClassification::Unicode;
264	}
265	return PunycodeClassification::Ascii;
266	}
267	}
268
269	/// The ASCII deny list to be applied.
270	#[derive(PartialEq, Eq, Copy, Clone)]
271	#[repr(transparent)]
272	pub struct AsciiDenyList {
273	bits: u128,
274	}
275
276	impl AsciiDenyList {
277	/// Computes (preferably at compile time) an ASCII deny list.
278	///
279	/// Setting `deny_glyphless` to `true` denies U+0020 SPACE and below
280	/// as well as U+007F DELETE for convenience without having to list
281	/// these characters in the `deny_list` string.
282	///
283	/// `deny_list` is the list of ASCII characters to deny. This
284	/// list must not contain any of:
285	/// Letters*
286	/// Digits*
287	/// Hyphen*
288	/// Dot (period / full-stop)*
289	/// Non-ASCII*
290	///
291	/// # Panics
292	///
293	/// If the deny list contains characters listed as prohibited above.
294	pub const fn new(deny_glyphless: bool, deny_list: &str) -> Self {
295	let mut bits = UPPER_CASE_MASK;
296	if deny_glyphless {
297	bits \|= GLYPHLESS_MASK;
298	}
299	let mut i = `0`;
300	let bytes = deny_list.as_bytes();
301	while i < bytes.len() {
302	let b = bytes[i];
303	assert!(b < `0x80`, "ASCII deny list must be ASCII.");
304	// assert_ne not yet available in const context.
305	assert!(b != b'.', "ASCII deny list must not contain the dot.");
306	assert!(b != b'-', "ASCII deny list must not contain the hyphen.");
307	assert!(
308	!((b >= b'0') && (b <= b'9')),
309	"ASCII deny list must not contain digits."
310	);
311	assert!(
312	!((b >= b'a') && (b <= b'z')),
313	"ASCII deny list must not contain letters."
314	);
315	assert!(
316	!((b >= b'A') && (b <= b'Z')),
317	"ASCII deny list must not contain letters."
318	);
319	bits \|= `1u128` << b;
320	i += `1`;
321	}
322	AsciiDenyList { bits }
323	}
324
325	/// No ASCII deny list. This corresponds to _UseSTD3ASCIIRules=false_.
326	///
327	/// Equivalent to `AsciiDenyList::new(false, "")`.
328	///
329	/// Note: Not denying the space and control characters can result in
330	/// strange behavior. Without a deny list provided to the UTS 46
331	/// operation, the caller is expected perform filtering afterwards,
332	/// but it's more efficient to use `AsciiDenyList` than post-processing,
333	/// because the internals of this crate can optimize away checks in
334	/// certain cases.
335	pub const EMPTY: AsciiDenyList = AsciiDenyList::new(`false`, "");
336
337	/// The STD3 deny list. This corresponds to _UseSTD3ASCIIRules=true_.
338	///
339	/// Note that this deny list rejects the underscore, which occurs in
340	/// pseudo-hosts used by various TXT record-based protocols, and also
341	/// characters that may occurs in non-DNS naming, such as NetBIOS.
342	pub const STD3: AsciiDenyList = AsciiDenyList { bits: ldh_mask() };
343
344	/// [Forbidden domain code point](https://url.spec.whatwg.org/#forbidden-domain-code-point) from the WHATWG URL Standard.
345	///
346	/// Equivalent to `AsciiDenyList::new(true, "%#/:<>?@[\\]^\|")`.
347	///
348	/// Note that this deny list rejects IPv6 addresses, so (as in URL
349	/// parsing) you need to check for IPv6 addresses first and not
350	/// put them through UTS 46 processing.
351	pub const URL: AsciiDenyList = AsciiDenyList::new(`true`, "%#/:<>?@[`\\`]^\|");
352	}
353
/// The _CheckHyphens_ mode.
// `non_exhaustive` so that further modes can be added later without a
// breaking change.
#[derive(Clone, Copy, Eq, PartialEq)]
#[non_exhaustive]
pub enum Hyphens {
    /// _CheckHyphens=false_: Hyphens may appear in any position.
    ///
    /// The WHATWG URL Standard uses this mode for normal User Agent
    /// processing (i.e. not conformance checking).
    Allow,

    /// Hyphens are prohibited in the first and last position of a label
    /// but allowed in the third and fourth position.
    ///
    /// Note that this mode rejects real-world names, including some GitHub
    /// user pages.
    CheckFirstLast,

    /// _CheckHyphens=true_: Hyphens are prohibited in the first, third,
    /// fourth, and last position of a label.
    ///
    /// Note that this mode rejects real-world names, including YouTube CDN
    /// nodes and some GitHub user pages.
    Check,
}
377
/// The UTS 46 _VerifyDNSLength_ flag.
#[derive(Clone, Copy, Eq, PartialEq)]
#[non_exhaustive]
pub enum DnsLength {
    /// _VerifyDNSLength=false_. (Possibly relevant for allowing non-DNS
    /// naming systems.)
    Ignore,
    /// _VerifyDNSLength=true_, except that the trailing root label dot
    /// is allowed.
    VerifyAllowRootDot,
    /// _VerifyDNSLength=true_. (The trailing root label dot is not allowed.)
    Verify,
}
390
/// Policy for customizing behavior in case of an error.
#[derive(Clone, Copy, Eq, PartialEq)]
#[non_exhaustive]
pub enum ErrorPolicy {
    /// Return as early as possible without producing output in case of error.
    FailFast,
    /// In case of error, mark errors with the REPLACEMENT CHARACTER. (The
    /// output containing REPLACEMENT CHARACTERs may be shown to the user to
    /// illustrate what was wrong but must not be used for naming in a
    /// network protocol.)
    MarkErrors,
}
402
/// The success outcome of [`Uts46::process`]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProcessingSuccess {
    /// There were no errors, and the input itself is the output.
    ///
    /// This asserts that the input can be safely passed to
    /// [`core::str::from_utf8_unchecked`].
    ///
    /// (Kept distinct from `WroteToSink` so that `Cow` behavior can be
    /// implemented on top of [`Uts46::process`].)
    Passthrough,

    /// There were no errors, and what was written to the sink is the output.
    ///
    /// (Kept distinct from `Passthrough` so that `Cow` behavior can be
    /// implemented on top of [`Uts46::process`].)
    WroteToSink,
}
420
/// The failure outcome of [`Uts46::process`]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProcessingError {
    // NOTE(review): `Operation::ToAscii` is not defined in the visible part
    // of this file; presumably this now corresponds to
    // `ErrorPolicy::FailFast` - confirm and update the wording below.
    /// There was a validity error according to the chosen options.
    ///
    /// In case of `Operation::ToAscii`, there is no output. Otherwise,
    /// output was written to the sink and the output contains at least one
    /// U+FFFD REPLACEMENT CHARACTER to denote an error.
    ValidityError,

    /// The sink emitted [`core::fmt::Error`]. The partial output written to
    /// the sink must not be used.
    SinkError,
}
434
435	impl From<core::fmt::Error> for ProcessingError {
436	fn from(_: core::fmt::Error) -> Self {
437	ProcessingError::SinkError
438	}
439	}
440
441	impl From<crate::punycode::PunycodeEncodeError> for ProcessingError {
442	fn from(_: crate::punycode::PunycodeEncodeError) -> Self {
443	unreachable!(
444	"Punycode overflows should not be possible due to PUNYCODE_ENCODE_MAX_INPUT_LENGTH"
445	);
446	}
447	}
448
/// Per-label record of how an input label that was already ASCII looked.
#[derive(Clone, Copy, Debug)]
enum AlreadyAsciiLabel<'a> {
    /// ASCII label that may contain upper-case letters; carries the label's
    /// bytes so that they can be lower-cased on output.
    MixedCaseAscii(&'a [u8]),
    /// Carries label bytes as well.
    // NOTE(review): presumably a Punycode label whose input form had mixed
    // case; its consumer is outside the visible part of the file - confirm.
    MixedCasePunycode(&'a [u8]),
    /// Any other kind of label; carries no bytes.
    Other,
}
455
/// Performs the _VerifyDNSLength_ check on the output of the _ToASCII_ operation.
///
/// If `allow_trailing_dot` is `true`, one trailing root label dot is allowed
/// and is not counted towards the length limits. If it is `false`, a domain
/// name ending in a dot is rejected.
///
/// Returns `true` iff the domain name (without the permitted trailing dot)
/// is at most 253 bytes long and every label is between 1 and 63 bytes long.
/// In particular, the empty string is rejected, because it consists of a
/// single empty label.
///
/// # Panics
///
/// Panics in debug mode if the argument isn't ASCII.
pub fn verify_dns_length(domain_name: &str, allow_trailing_dot: bool) -> bool {
    let bytes = domain_name.as_bytes();
    debug_assert!(bytes.is_ascii());
    let without_root_dot = match bytes.strip_suffix(b".") {
        Some(_) if !allow_trailing_dot => return false,
        Some(stripped) => stripped,
        None => bytes,
    };
    without_root_dot.len() <= 253
        && without_root_dot
            .split(|&b| b == b'.')
            .all(|label| (1..=63).contains(&label.len()))
}
487
488	/// An implementation of UTS #46.
489	pub struct Uts46 {
490	data: idna_adapter::Adapter,
491	}
492
/// `Default` is the same as [`Uts46::new`] and therefore likewise available
/// only with the `compiled_data` feature.
#[cfg(feature = "compiled_data")]
impl Default for Uts46 {
    fn default() -> Self {
        Uts46::new()
    }
}
499
500	impl Uts46 {
/// Constructor using data compiled into the binary.
///
/// Available only with the `compiled_data` feature.
#[cfg(feature = "compiled_data")]
pub const fn new() -> Self {
    let data = idna_adapter::Adapter::new();
    Self { data }
}
508
509	// XXX Should there be an `icu_provider` feature for enabling
510	// a constructor for run-time data loading?
511
512	/// Performs the [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) operation
513	/// from UTS #46 with the options indicated.
514	///
515	/// # Arguments
516	///
517	/// `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by*
518	/// this method and input that is not well-formed UTF-8 is treated as an error. If you
519	/// already have a `&str`, call `.as_bytes()` on it.)
520	/// `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46*
521	/// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
522	/// processing is handled via this argument. Most callers are probably the best off
523	/// by using [`AsciiDenyList::URL`] here.
524	/// `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best*
525	/// off by using [`Hyphens::Allow`] here.
526	/// `dns_length` - The UTS 46 _VerifyDNSLength_ flag.*
527	pub fn to_ascii<'a>(
528	&self,
529	domain_name: &'a [u8],
530	ascii_deny_list: AsciiDenyList,
531	hyphens: Hyphens,
532	dns_length: DnsLength,
533	) -> Result<Cow<'a, str>, crate::Errors> {
534	let mut s = String::new();
535	match self.process(
536	domain_name,
537	ascii_deny_list,
538	hyphens,
539	ErrorPolicy::FailFast,
540	\|_, _, _\| `false`,
541	&mut s,
542	None,
543	) {
544	// SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII.
545	Ok(ProcessingSuccess::Passthrough) => {
546	let cow = Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(domain_name) });
547	if dns_length != DnsLength::Ignore
548	&& !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot)
549	{
550	Err(crate::Errors::default())
551	} else {
552	Ok(cow)
553	}
554	}
555	Ok(ProcessingSuccess::WroteToSink) => {
556	let cow: Cow<'_, str> = Cow::Owned(s);
557	if dns_length != DnsLength::Ignore
558	&& !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot)
559	{
560	Err(crate::Errors::default())
561	} else {
562	Ok(cow)
563	}
564	}
565	Err(ProcessingError::ValidityError) => Err(crate::Errors::default()),
566	Err(ProcessingError::SinkError) => unreachable!(),
567	}
568	}
569
570	/// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation
571	/// from UTS #46 according to the options given. When there
572	/// are errors, there is still output, which may be rendered user, even through
573	/// the output must not be used in networking protocols. Errors are denoted
574	/// by U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item of the
575	/// return tuple is `Err`, the first item of the return tuple is guaranteed to contain
576	/// at least one U+FFFD.)
577	///
578	/// Most applications probably shouldn't use this method and should be using
579	/// [`Uts46::to_user_interface`] instead.
580	///
581	/// # Arguments
582	///
583	/// `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by*
584	/// this method and input that is not well-formed UTF-8 is treated as an error. If you
585	/// already have a `&str`, call `.as_bytes()` on it.)
586	/// `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46*
587	/// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
588	/// processing is handled via this argument. Most callers are probably the best off
589	/// by using [`AsciiDenyList::URL`] here.
590	/// `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best*
591	/// off by using [`Hyphens::Allow`] here.
592	pub fn to_unicode<'a>(
593	&self,
594	domain_name: &'a [u8],
595	ascii_deny_list: AsciiDenyList,
596	hyphens: Hyphens,
597	) -> (Cow<'a, str>, Result<(), crate::Errors>) {
598	self.to_user_interface(domain_name, ascii_deny_list, hyphens, \|_, _, _\| `true`)
599	}
600
601	/// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation
602	/// from UTS #46 according to options given with some
603	/// error-free Unicode labels output according to
604	/// [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) instead as decided by
605	/// application policy implemented via the `output_as_unicode` closure. The purpose
606	/// is to convert user-visible domains to the Unicode form in general but to render
607	/// potentially misleading labels as Punycode.
608	///
609	/// This is an imperfect security mechanism, because [the Punycode form itself may be
610	/// resemble a user-recognizable name](https://www.unicode.org/reports/tr36/#TablePunycodeSpoofing).
611	/// However, since this mechanism is common practice, this API provides support for The
612	/// the mechanism.
613	///
614	/// ASCII labels always pass through as ASCII and labels with errors always pass through
615	/// as Unicode. For non-erroneous labels that contain at least one non-ASCII character
616	/// (implies non-empty), `output_as_unicode` is called with the Unicode form of the label,
617	/// the TLD (potentially empty), and a flag indicating whether the domain name as a whole
618	/// is a bidi domain name. If the return value is `true`, the label passes through as
619	/// Unicode. If the return value is `false`, the label is converted to Punycode.
620	///
621	/// When there are errors, there is still output, which may be rendered user, even through
622	/// the output must not be used in networking protocols. Errors are denoted by
623	/// U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item
624	/// of the return tuple is `Err`, the first item of the return tuple is guaranteed to contain
625	/// at least one U+FFFD.) Labels that contain errors are not converted to Punycode.
626	///
627	/// # Arguments
628	///
629	/// `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by*
630	/// this method and input that is not well-formed UTF-8 is treated as an error. If you
631	/// already have a `&str`, call `.as_bytes()` on it.)
632	/// `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46*
633	/// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
634	/// processing is handled via this argument. Most callers are probably the best off
635	/// by using [`AsciiDenyList::URL`] here.
636	/// `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best*
637	/// off by using [`Hyphens::Allow`] here.
638	/// `output_as_unicode` - A closure for deciding if a label should be output as Unicode*
639	/// (as opposed to Punycode). The first argument is the label for which a decision is
640	/// needed (always non-empty slice). The second argument is the TLD (potentially empty).
641	/// The third argument is `true` iff the domain name as a whole is a bidi domain name.
642	/// Only non-erroneous labels that contain at least one non-ASCII character are passed
643	/// to the closure as the first argument. The second and third argument values are
644	/// guaranteed to remain the same during a single call to `process`, and the closure
645	/// may cache computations derived from the second and third argument (hence the
646	/// `FnMut` type).
647	pub fn to_user_interface<'a, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>(
648	&self,
649	domain_name: &'a [u8],
650	ascii_deny_list: AsciiDenyList,
651	hyphens: Hyphens,
652	output_as_unicode: OutputUnicode,
653	) -> (Cow<'a, str>, Result<(), crate::Errors>) {
654	let mut s = String::new();
655	match self.process(
656	domain_name,
657	ascii_deny_list,
658	hyphens,
659	ErrorPolicy::MarkErrors,
660	output_as_unicode,
661	&mut s,
662	None,
663	) {
664	// SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII.
665	Ok(ProcessingSuccess::Passthrough) => (
666	Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(domain_name) }),
667	Ok(()),
668	),
669	Ok(ProcessingSuccess::WroteToSink) => (Cow::Owned(s), Ok(())),
670	Err(ProcessingError::ValidityError) => (Cow::Owned(s), Err(crate::Errors::default())),
671	Err(ProcessingError::SinkError) => unreachable!(),
672	}
673	}
674
675	/// The lower-level function that [`Uts46::to_ascii`], [`Uts46::to_unicode`], and
676	/// [`Uts46::to_user_interface`] are built on to allow support for output types other
677	/// than `Cow<'a, str>` (e.g. string types in a non-Rust programming language).
678	///
679	/// # Arguments
680	///
/// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
///   this method and input that is not well-formed UTF-8 is treated as an error. If you
///   already have a `&str`, call `.as_bytes()` on it.)
/// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
///   _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
///   processing is handled via this argument. Most callers are probably the best off
///   by using [`AsciiDenyList::URL`] here.
/// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
///   off by using [`Hyphens::Allow`] here.
/// * `error_policy` - Whether to fail fast or to produce output that may be rendered
///   for the user to examine in case of errors.
/// * `output_as_unicode` - A closure for deciding if a label should be output as Unicode
///   (as opposed to Punycode). The first argument is the label for which a decision is
///   needed (always non-empty slice). The second argument is the TLD (potentially empty).
///   The third argument is `true` iff the domain name as a whole is a bidi domain name.
///   Only non-erroneous labels that contain at least one non-ASCII character are passed
///   to the closure as the first argument. The second and third argument values are
///   guaranteed to remain the same during a single call to `process`, and the closure
///   may cache computations derived from the second and third argument (hence the
///   `FnMut` type). To perform the _ToASCII_ operation, `|_, _, _| false` must be
///   passed as the closure. To perform the _ToUnicode_ operation, `|_, _, _| true` must
///   be passed as the closure. A more complex closure may be used to prepare a domain
///   name for display in a user interface so that labels are converted to the Unicode
///   form in general but potentially misleading labels are converted to the Punycode
///   form.
/// * `sink` - The object that receives the output (in the non-passthrough case).
/// * `ascii_sink` - A second sink that receives the _ToASCII_ form only if there
///   were no errors and `sink` received at least one character of non-ASCII output.
///   The purpose of this argument is to enable a user interface display form of the
///   domain and the _ToASCII_ form of the domain to be computed efficiently together.
///   This argument is useless when `output_as_unicode` always returns `false`, in
///   which case the _ToASCII_ form ends up in `sink` already. If `ascii_sink` receives
///   no output and the return value is `Ok(ProcessingSuccess::WroteToSink)`, use the
///   output received by `sink` also as the _ToASCII_ result.
715	///
716	/// # Return value
717	///
/// * `Ok(ProcessingSuccess::Passthrough)` - The caller must treat
///   `unsafe { core::str::from_utf8_unchecked(domain_name) }` as the output. (This
///   return value asserts that calling `core::str::from_utf8_unchecked(domain_name)`
///   is safe.)
/// * `Ok(ProcessingSuccess::WroteToSink)` - The caller must treat what was written
///   to `sink` as the output. If another sink was passed as `ascii_sink` but it did
///   not receive output, the caller must treat what was written to `sink` also as
///   the _ToASCII_ output. Otherwise, if `ascii_sink` received output, the caller
///   must treat what was written to `ascii_sink` as the _ToASCII_ output.
/// * `Err(ProcessingError::ValidityError)` - The input was in error and must
///   not be used for DNS lookup or otherwise in a network protocol. If `error_policy`
///   was `ErrorPolicy::MarkErrors`, the output written to `sink` may be displayed
///   to the user as an illustration of where the error was or the errors were.
/// * `Err(ProcessingError::SinkError)` - Either `sink` or `ascii_sink` returned
///   [`core::fmt::Error`]. The partial output written to `sink` or `ascii_sink` must not
///   be used. If `W` never returns [`core::fmt::Error`], this method never returns
///   `Err(ProcessingError::SinkError)`.
735	///
736	/// # Safety-usable invariant
737	///
738	/// If the return value is `Ok(ProcessingSuccess::Passthrough)`, `domain_name` is
739	/// ASCII and `core::str::from_utf8_unchecked(domain_name)` is safe. (Note:
740	/// Other return values do _not_ imply that `domain_name` wasn't ASCII!)
741	///
742	/// # Security considerations
743	///
/// Showing labels whose Unicode form might mislead the user as Punycode instead is
/// an imperfect security mechanism, because [the Punycode form itself may resemble
/// a user-recognizable name](https://www.unicode.org/reports/tr36/#TablePunycodeSpoofing).
/// However, since this mechanism is common practice, this API provides support for
/// the mechanism.
749	///
750	/// Punycode processing is quadratic, so to avoid denial of service, this method imposes
751	/// length limits on Punycode treating especially long inputs as being in error. These
752	/// limits are well higher than the DNS length limits and are not more restrictive than
753	/// the limits imposed by ICU4C.
754	#[allow(clippy::too_many_arguments)]
755	pub fn process<W: Write + ?Sized, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>(
756	&self,
757	domain_name: &[u8],
758	ascii_deny_list: AsciiDenyList,
759	hyphens: Hyphens,
760	error_policy: ErrorPolicy,
761	mut output_as_unicode: OutputUnicode,
762	sink: &mut W,
763	ascii_sink: Option<&mut W>,
764	) -> Result<ProcessingSuccess, ProcessingError> {
765	let fail_fast = error_policy == ErrorPolicy::FailFast;
766	let mut domain_buffer = SmallVec::<[char; `253`]>::new();
767	let mut already_punycode = SmallVec::<[AlreadyAsciiLabel; `8`]>::new();
768	// `process_inner` could be pasted inline here, but it's out of line in order
769	// to avoid duplicating that code when monomorphizing over `W` and `OutputUnicode`.
770	let (passthrough_up_to, is_bidi, had_errors) = self.process_inner(
771	domain_name,
772	ascii_deny_list,
773	hyphens,
774	fail_fast,
775	&mut domain_buffer,
776	&mut already_punycode,
777	);
778	if passthrough_up_to == domain_name.len() {
779	debug_assert!(!had_errors);
780	return Ok(ProcessingSuccess::Passthrough);
781	}
782	// Checked only after passthrough as a micro optimization.
783	if fail_fast && had_errors {
784	return Err(ProcessingError::ValidityError);
785	}
786	debug_assert_eq!(had_errors, domain_buffer.contains(&'`\u{FFFD}`'));
787	let without_dot = if let Some(without_dot) = domain_buffer.strip_suffix(&['.']) {
788	without_dot
789	} else {
790	&domain_buffer[..]
791	};
792	// unwrap is OK, because we always have at least one label
793	let tld = without_dot.rsplit(\|c\| *c == '.').next().unwrap();
794	let mut had_unicode_output = `false`;
795	let mut seen_label = `false`;
796	let mut already_punycode_iter = already_punycode.iter();
797	let mut passthrough_up_to_extended = passthrough_up_to;
798	let mut flushed_prefix = `false`;
799	for label in domain_buffer.split(\|c\| *c == '.') {
800	// Unwrap is OK, because there are supposed to be as many items in
801	// `already_punycode` as there are labels.
802	let input_punycode = *already_punycode_iter.next().unwrap();
803	if seen_label {
804	if flushed_prefix {
805	sink.write_char('.')?;
806	} else {
807	debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.');
808	passthrough_up_to_extended += `1`;
809	if passthrough_up_to_extended == domain_name.len() {
810	debug_assert!(!had_errors);
811	return Ok(ProcessingSuccess::Passthrough);
812	}
813	}
814	}
815	seen_label = `true`;
816
817	if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode {
818	if let Some(first_upper_case) =
819	mixed_case.iter().position(\|c\| c.is_ascii_uppercase())
820	{
821	let (head, tail) = mixed_case.split_at(first_upper_case);
822	let slice_to_write = if flushed_prefix {
823	head
824	} else {
825	flushed_prefix = `true`;
826	passthrough_up_to_extended += head.len();
827	debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
828	&domain_name[..passthrough_up_to_extended]
829	};
830	// SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
831	sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?;
832	for c in tail.iter() {
833	sink.write_char(char::from(c.to_ascii_lowercase()))?;
834	}
835	} else if flushed_prefix {
836	// SAFETY: `mixed_case` is known to be ASCII.
837	sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
838	} else {
839	passthrough_up_to_extended += mixed_case.len();
840	if passthrough_up_to_extended == domain_name.len() {
841	debug_assert!(!had_errors);
842	return Ok(ProcessingSuccess::Passthrough);
843	}
844	}
845	continue;
846	}
847
848	let potentially_punycode = if fail_fast {
849	debug_assert!(classify_for_punycode(label) != PunycodeClassification::Error);
850	!is_ascii(label)
851	} else {
852	classify_for_punycode(label) == PunycodeClassification::Unicode
853	};
854	let passthrough = if potentially_punycode {
855	let unicode = output_as_unicode(label, tld, is_bidi);
856	had_unicode_output \|= unicode;
857	unicode
858	} else {
859	`true`
860	};
861	if passthrough {
862	if !flushed_prefix {
863	flushed_prefix = `true`;
864	// SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
865	sink.write_str(unsafe {
866	core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
867	})?;
868	}
869	for c in label.iter().copied() {
870	sink.write_char(c)?;
871	}
872	} else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode {
873	if let Some(first_upper_case) =
874	mixed_case.iter().position(\|c\| c.is_ascii_uppercase())
875	{
876	let (head, tail) = mixed_case.split_at(first_upper_case);
877	let slice_to_write = if flushed_prefix {
878	head
879	} else {
880	flushed_prefix = `true`;
881	passthrough_up_to_extended += head.len();
882	debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
883	&domain_name[..passthrough_up_to_extended]
884	};
885	// SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
886	sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?;
887	for c in tail.iter() {
888	sink.write_char(char::from(c.to_ascii_lowercase()))?;
889	}
890	} else if flushed_prefix {
891	// SAFETY: `mixed_case` is known to be ASCII.
892	sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
893	} else {
894	passthrough_up_to_extended += mixed_case.len();
895	if passthrough_up_to_extended == domain_name.len() {
896	debug_assert!(!had_errors);
897	return Ok(ProcessingSuccess::Passthrough);
898	}
899	}
900	} else {
901	if !flushed_prefix {
902	flushed_prefix = `true`;
903	// SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
904	sink.write_str(unsafe {
905	core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
906	})?;
907	}
908	write_punycode_label(label, sink)?;
909	}
910	}
911
912	if had_errors {
913	return Err(ProcessingError::ValidityError);
914	}
915
916	if had_unicode_output {
917	if let Some(sink) = ascii_sink {
918	let mut seen_label = `false`;
919	let mut already_punycode_iter = already_punycode.iter();
920	let mut passthrough_up_to_extended = passthrough_up_to;
921	let mut flushed_prefix = `false`;
922	for label in domain_buffer.split(\|c\| *c == '.') {
923	// Unwrap is OK, because there are supposed to be as many items in
924	// `already_punycode` as there are labels.
925	let input_punycode = *already_punycode_iter.next().unwrap();
926	if seen_label {
927	if flushed_prefix {
928	sink.write_char('.')?;
929	} else {
930	debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.');
931	passthrough_up_to_extended += `1`;
932	}
933	}
934	seen_label = `true`;
935
936	if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode {
937	if let Some(first_upper_case) =
938	mixed_case.iter().position(\|c\| c.is_ascii_uppercase())
939	{
940	let (head, tail) = mixed_case.split_at(first_upper_case);
941	let slice_to_write = if flushed_prefix {
942	head
943	} else {
944	flushed_prefix = `true`;
945	passthrough_up_to_extended += head.len();
946	debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
947	&domain_name[..passthrough_up_to_extended]
948	};
949	// SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
950	sink.write_str(unsafe {
951	core::str::from_utf8_unchecked(slice_to_write)
952	})?;
953	for c in tail.iter() {
954	sink.write_char(char::from(c.to_ascii_lowercase()))?;
955	}
956	} else if flushed_prefix {
957	// SAFETY: `mixed_case` is known to be ASCII.
958	sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
959	} else {
960	passthrough_up_to_extended += mixed_case.len();
961	}
962	continue;
963	}
964
965	if is_ascii(label) {
966	if !flushed_prefix {
967	flushed_prefix = `true`;
968	// SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
969	sink.write_str(unsafe {
970	core::str::from_utf8_unchecked(
971	&domain_name[..passthrough_up_to_extended],
972	)
973	})?;
974	}
975	for c in label.iter().copied() {
976	sink.write_char(c)?;
977	}
978	} else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode
979	{
980	if let Some(first_upper_case) =
981	mixed_case.iter().position(\|c\| c.is_ascii_uppercase())
982	{
983	let (head, tail) = mixed_case.split_at(first_upper_case);
984	let slice_to_write = if flushed_prefix {
985	head
986	} else {
987	flushed_prefix = `true`;
988	passthrough_up_to_extended += head.len();
989	debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
990	&domain_name[..passthrough_up_to_extended]
991	};
992	// SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
993	sink.write_str(unsafe {
994	core::str::from_utf8_unchecked(slice_to_write)
995	})?;
996	for c in tail.iter() {
997	sink.write_char(char::from(c.to_ascii_lowercase()))?;
998	}
999	} else if flushed_prefix {
1000	// SAFETY: `mixed_case` is known to be ASCII.
1001	sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
1002	} else {
1003	passthrough_up_to_extended += mixed_case.len();
1004	}
1005	} else {
1006	if !flushed_prefix {
1007	flushed_prefix = `true`;
1008	// SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
1009	sink.write_str(unsafe {
1010	core::str::from_utf8_unchecked(
1011	&domain_name[..passthrough_up_to_extended],
1012	)
1013	})?;
1014	}
1015	write_punycode_label(label, sink)?;
1016	}
1017	}
1018	if !flushed_prefix {
1019	// SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
1020	sink.write_str(unsafe {
1021	core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
1022	})?;
1023	}
1024	}
1025	}
1026	Ok(ProcessingSuccess::WroteToSink)
1027	}
1028
1029	/// The part of `process` that doesn't need to be generic over the sink.
1030	#[inline(always)]
1031	fn process_inner<'a>(
1032	&self,
1033	domain_name: &'a [u8],
1034	ascii_deny_list: AsciiDenyList,
1035	hyphens: Hyphens,
1036	fail_fast: bool,
1037	domain_buffer: &mut SmallVec<[char; `253`]>,
1038	already_punycode: &mut SmallVec<[AlreadyAsciiLabel<'a>; `8`]>,
1039	) -> (usize, bool, bool) {
1040	// Sadly, this even faster-path ASCII tier is needed to avoid regressing
1041	// performance.
1042	let mut iter = domain_name.iter();
1043	let mut most_recent_label_start = iter.clone();
1044	loop {
1045	if let Some(&b) = iter.next() {
1046	if in_inclusive_range8(b, b'a', b'z') {
1047	continue;
1048	}
1049	if b == b'.' {
1050	most_recent_label_start = iter.clone();
1051	continue;
1052	}
1053	return self.process_innermost(
1054	domain_name,
1055	ascii_deny_list,
1056	hyphens,
1057	fail_fast,
1058	domain_buffer,
1059	already_punycode,
1060	most_recent_label_start.as_slice(),
1061	);
1062	} else {
1063	// Success! The whole input passes through on the fastest path!
1064	return (domain_name.len(), `false`, `false`);
1065	}
1066	}
1067	}
1068
1069	/// The part of `process` that doesn't need to be generic over the sink and
1070	/// can avoid monomorphizing in the interest of code size.
1071	/// Separating this into a different stack frame compared to `process_inner`
1072	/// improves performance in the ICU4X case.
1073	#[allow(clippy::too_many_arguments)]
1074	#[inline(never)]
1075	fn process_innermost<'a>(
1076	&self,
1077	domain_name: &'a [u8],
1078	ascii_deny_list: AsciiDenyList,
1079	hyphens: Hyphens,
1080	fail_fast: bool,
1081	domain_buffer: &mut SmallVec<[char; `253`]>,
1082	already_punycode: &mut SmallVec<[AlreadyAsciiLabel<'a>; `8`]>,
1083	tail: &'a [u8],
1084	) -> (usize, bool, bool) {
1085	let deny_list = ascii_deny_list.bits;
1086	let deny_list_deny_dot = deny_list \| DOT_MASK;
1087
1088	let mut had_errors = `false`;
1089
1090	let mut passthrough_up_to = domain_name.len() - tail.len(); // Index into `domain_name`
1091	// 253 ASCII characters is the max length for a valid domain name
1092	// (excluding the root dot).
1093	let mut current_label_start; // Index into `domain_buffer`
1094	let mut seen_label = `false`;
1095	let mut in_prefix = `true`;
1096	for label in tail.split(\|b\| *b == b'.') {
1097	// We check for passthrough only for the prefix. That is, if we
1098	// haven't moved on and started filling `domain_buffer`. Keeping
1099	// this stuff in one loop where the first items keep being skipped
1100	// once they have been skipped at least once instead of working
1101	// this into a fancier loop structure in order to make sure that
1102	// no item from the iterator is lost or processed twice.
1103	// Furthermore, after the passthrough fails, restarting the
1104	// normalization process after each pre-existing ASCII dot also
1105	// provides an opportunity for the processing to get back onto
1106	// an ASCII fast path that bypasses the normalizer for ASCII
1107	// after a pre-existing ASCII dot (pre-existing in the sense
1108	// of not coming from e.g. normalizing an ideographic dot).
1109	if in_prefix && is_passthrough_ascii_label(label) {
1110	if seen_label {
1111	debug_assert_eq!(domain_name[passthrough_up_to], b'.');
1112	passthrough_up_to += `1`;
1113	}
1114	seen_label = `true`;
1115
1116	passthrough_up_to += label.len();
1117	continue;
1118	}
1119	if seen_label {
1120	if in_prefix {
1121	debug_assert_eq!(domain_name[passthrough_up_to], b'.');
1122	passthrough_up_to += `1`;
1123	} else {
1124	domain_buffer.push('.');
1125	}
1126	}
1127	seen_label = `true`;
1128	in_prefix = `false`;
1129	current_label_start = domain_buffer.len();
1130	if !label.is_empty() {
1131	let (ascii, non_ascii) = split_ascii_fast_path_prefix(label);
1132	let non_punycode_ascii_label = if non_ascii.is_empty() {
1133	if has_punycode_prefix(ascii) {
1134	if (ascii.last() != Some(&b'-'))
1135	&& (ascii.len() - `4` <= PUNYCODE_DECODE_MAX_INPUT_LENGTH)
1136	{
1137	if let Ok(decode) =
1138	Decoder::default().decode::<u8, InternalCaller>(&ascii[`4`..])
1139	{
1140	// 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4
1141	// characters.
1142	let mut label_buffer = SmallVec::<[char; `59`]>::new();
1143	label_buffer.extend(decode);
1144
1145	if self.after_punycode_decode(
1146	domain_buffer,
1147	current_label_start,
1148	&label_buffer,
1149	deny_list_deny_dot,
1150	fail_fast,
1151	&mut had_errors,
1152	) {
1153	return (`0`, `false`, `true`);
1154	}
1155
1156	if self.check_label(
1157	hyphens,
1158	&mut domain_buffer[current_label_start..],
1159	fail_fast,
1160	&mut had_errors,
1161	`true`,
1162	`true`,
1163	) {
1164	return (`0`, `false`, `true`);
1165	}
1166	} else {
1167	// Punycode failed
1168	if fail_fast {
1169	return (`0`, `false`, `true`);
1170	}
1171	had_errors = `true`;
1172	domain_buffer.push('`\u{FFFD}`');
1173	let mut iter = ascii.iter();
1174	// Discard the first character that we replaced.
1175	let _ = iter.next();
1176	domain_buffer.extend(iter.map(\|c\| {
1177	// Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does
1178	// not matter.
1179	apply_ascii_deny_list_to_potentially_upper_case_ascii(
1180	*c, deny_list,
1181	)
1182	}));
1183	};
1184	// If there were errors, we won't be trying to use this
1185	// anyway later, so it's fine to put it here unconditionally.
1186	already_punycode.push(AlreadyAsciiLabel::MixedCasePunycode(label));
1187	continue;
1188	} else if fail_fast {
1189	return (`0`, `false`, `true`);
1190	}
1191	// Else fall through to the complex path and rediscover error
1192	// there.
1193	`false`
1194	} else {
1195	`true`
1196	}
1197	} else {
1198	`false`
1199	};
1200	for c in ascii.iter().map(\|c\| {
1201	// Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does
1202	// not matter.
1203	apply_ascii_deny_list_to_potentially_upper_case_ascii(*c, deny_list)
1204	}) {
1205	if c == '`\u{FFFD}`' {
1206	if fail_fast {
1207	return (`0`, `false`, `true`);
1208	}
1209	had_errors = `true`;
1210	}
1211	domain_buffer.push(c);
1212	}
1213	if non_punycode_ascii_label {
1214	if hyphens != Hyphens::Allow
1215	&& check_hyphens(
1216	&mut domain_buffer[current_label_start..],
1217	hyphens == Hyphens::CheckFirstLast,
1218	fail_fast,
1219	&mut had_errors,
1220	)
1221	{
1222	return (`0`, `false`, `true`);
1223	}
1224	already_punycode.push(if had_errors {
1225	AlreadyAsciiLabel::Other
1226	} else {
1227	AlreadyAsciiLabel::MixedCaseAscii(label)
1228	});
1229	continue;
1230	}
1231	already_punycode.push(AlreadyAsciiLabel::Other);
1232	let mut first_needs_combining_mark_check = ascii.is_empty();
1233	let mut needs_contextj_check = !non_ascii.is_empty();
1234	let mut mapping = self
1235	.data
1236	.map_normalize(non_ascii.chars())
1237	.map(\|c\| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list));
1238	loop {
1239	let n = mapping.next();
1240	match n {
1241	None \| Some('.') => {
1242	if domain_buffer[current_label_start..]
1243	.starts_with(&['x', 'n', '-', '-'])
1244	{
1245	let mut punycode_precondition_failed = `false`;
1246	for c in domain_buffer[current_label_start + `4`..].iter_mut() {
1247	if !c.is_ascii() {
1248	if fail_fast {
1249	return (`0`, `false`, `true`);
1250	}
1251	had_errors = `true`;
1252	*c = '`\u{FFFD}`';
1253	punycode_precondition_failed = `true`;
1254	}
1255	}
1256
1257	if let Some(last) = domain_buffer.last_mut() {
1258	if *last == '-' {
1259	// Either there's nothing after the "xn--" prefix
1260	// and we got the last hyphen of "xn--", or there
1261	// are no Punycode digits after the last delimiter
1262	// which would result in Punycode decode outputting
1263	// ASCII only.
1264	if fail_fast {
1265	return (`0`, `false`, `true`);
1266	}
1267	had_errors = `true`;
1268	*last = '`\u{FFFD}`';
1269	punycode_precondition_failed = `true`;
1270	}
1271	} else {
1272	unreachable!();
1273	}
1274
1275	// Reject excessively long input
1276	// https://github.com/whatwg/url/issues/824
1277	// https://unicode-org.atlassian.net/browse/ICU-13727
1278	if domain_buffer.len() - current_label_start - `4`
1279	> PUNYCODE_DECODE_MAX_INPUT_LENGTH
1280	{
1281	if fail_fast {
1282	return (`0`, `false`, `true`);
1283	}
1284	had_errors = `true`;
1285	domain_buffer[current_label_start
1286	+ `4`
1287	+ PUNYCODE_DECODE_MAX_INPUT_LENGTH] = '`\u{FFFD}`';
1288	punycode_precondition_failed = `true`;
1289	}
1290
1291	if !punycode_precondition_failed {
1292	if let Ok(decode) = Decoder::default()
1293	.decode::<char, InternalCaller>(
1294	&domain_buffer[current_label_start + `4`..],
1295	)
1296	{
1297	first_needs_combining_mark_check = `true`;
1298	needs_contextj_check = `true`;
1299	// 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4
1300	// characters.
1301	let mut label_buffer = SmallVec::<[char; `59`]>::new();
1302	label_buffer.extend(decode);
1303
1304	domain_buffer.truncate(current_label_start);
1305	if self.after_punycode_decode(
1306	domain_buffer,
1307	current_label_start,
1308	&label_buffer,
1309	deny_list_deny_dot,
1310	fail_fast,
1311	&mut had_errors,
1312	) {
1313	return (`0`, `false`, `true`);
1314	}
1315	} else {
1316	// Punycode failed
1317	if fail_fast {
1318	return (`0`, `false`, `true`);
1319	}
1320	had_errors = `true`;
1321	domain_buffer[current_label_start] = '`\u{FFFD}`';
1322	needs_contextj_check = `false`; // ASCII label
1323	first_needs_combining_mark_check = `false`;
1324	};
1325	} else {
1326	first_needs_combining_mark_check = `false`;
1327	needs_contextj_check = `false`; // Non-ASCII already turned to U+FFFD.
1328	}
1329	}
1330	if self.check_label(
1331	hyphens,
1332	&mut domain_buffer[current_label_start..],
1333	fail_fast,
1334	&mut had_errors,
1335	first_needs_combining_mark_check,
1336	needs_contextj_check,
1337	) {
1338	return (`0`, `false`, `true`);
1339	}
1340
1341	if n.is_none() {
1342	break;
1343	}
1344	domain_buffer.push('.');
1345	current_label_start = domain_buffer.len();
1346	first_needs_combining_mark_check = `true`;
1347	needs_contextj_check = `true`;
1348	already_punycode.push(AlreadyAsciiLabel::Other);
1349	}
1350	Some(c) => {
1351	if c == '`\u{FFFD}`' {
1352	if fail_fast {
1353	return (`0`, `false`, `true`);
1354	}
1355	had_errors = `true`;
1356	}
1357	domain_buffer.push(c);
1358	}
1359	}
1360	}
1361	} else {
1362	// Empty label
1363	already_punycode.push(AlreadyAsciiLabel::MixedCaseAscii(label));
1364	}
1365	}
1366
1367	let is_bidi = self.is_bidi(domain_buffer);
1368	if is_bidi {
1369	for label in domain_buffer.split_mut(\|c\| *c == '.') {
1370	if let Some((first, tail)) = label.split_first_mut() {
1371	let first_bc = self.data.bidi_class(*first);
1372	if !FIRST_BC_MASK.intersects(first_bc.to_mask()) {
1373	// Neither RTL label nor LTR label
1374	if fail_fast {
1375	return (`0`, `false`, `true`);
1376	}
1377	had_errors = `true`;
1378	*first = '`\u{FFFD}`';
1379	continue;
1380	}
1381	let is_ltr = first_bc.is_ltr();
1382	// Trim NSM
1383	let mut middle = tail;
1384	#[allow(clippy::while_let_loop)]
1385	loop {
1386	if let Some((last, prior)) = middle.split_last_mut() {
1387	let last_bc = self.data.bidi_class(*last);
1388	if last_bc.is_nonspacing_mark() {
1389	middle = prior;
1390	continue;
1391	}
1392	let last_mask = if is_ltr { LAST_LTR_MASK } else { LAST_RTL_MASK };
1393	if !last_mask.intersects(last_bc.to_mask()) {
1394	if fail_fast {
1395	return (`0`, `false`, `true`);
1396	}
1397	had_errors = `true`;
1398	*last = '`\u{FFFD}`';
1399	}
1400	if is_ltr {
1401	for c in prior.iter_mut() {
1402	let bc = self.data.bidi_class(*c);
1403	if !MIDDLE_LTR_MASK.intersects(bc.to_mask()) {
1404	if fail_fast {
1405	return (`0`, `false`, `true`);
1406	}
1407	had_errors = `true`;
1408	*c = '`\u{FFFD}`';
1409	}
1410	}
1411	} else {
1412	let mut numeral_state = RtlNumeralState::Undecided;
1413	for c in prior.iter_mut() {
1414	let bc = self.data.bidi_class(*c);
1415	if !MIDDLE_RTL_MASK.intersects(bc.to_mask()) {
1416	if fail_fast {
1417	return (`0`, `false`, `true`);
1418	}
1419	had_errors = `true`;
1420	*c = '`\u{FFFD}`';
1421	} else {
1422	match numeral_state {
1423	RtlNumeralState::Undecided => {
1424	if bc.is_european_number() {
1425	numeral_state = RtlNumeralState::European;
1426	} else if bc.is_arabic_number() {
1427	numeral_state = RtlNumeralState::Arabic;
1428	}
1429	}
1430	RtlNumeralState::European => {
1431	if bc.is_arabic_number() {
1432	if fail_fast {
1433	return (`0`, `false`, `true`);
1434	}
1435	had_errors = `true`;
1436	*c = '`\u{FFFD}`';
1437	}
1438	}
1439	RtlNumeralState::Arabic => {
1440	if bc.is_european_number() {
1441	if fail_fast {
1442	return (`0`, `false`, `true`);
1443	}
1444	had_errors = `true`;
1445	*c = '`\u{FFFD}`';
1446	}
1447	}
1448	}
1449	}
1450	}
1451	if (numeral_state == RtlNumeralState::European
1452	&& last_bc.is_arabic_number())
1453	\|\| (numeral_state == RtlNumeralState::Arabic
1454	&& last_bc.is_european_number())
1455	{
1456	if fail_fast {
1457	return (`0`, `false`, `true`);
1458	}
1459	had_errors = `true`;
1460	*last = '`\u{FFFD}`';
1461	}
1462	}
1463	break;
1464	} else {
1465	// One-character label or label where
1466	// everything after the first character
1467	// is just non-spacing marks.
1468	break;
1469	}
1470	}
1471	}
1472	}
1473	}
1474
1475	(passthrough_up_to, is_bidi, had_errors)
1476	}
1477
1478	#[inline(never)]
1479	fn after_punycode_decode(
1480	&self,
1481	domain_buffer: &mut SmallVec<[char; `253`]>,
1482	current_label_start: usize,
1483	label_buffer: &[char],
1484	deny_list_deny_dot: u128,
1485	fail_fast: bool,
1486	had_errors: &mut bool,
1487	) -> bool {
1488	for c in self
1489	.data
1490	.normalize_validate(label_buffer.iter().copied())
1491	.map(\|c\| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list_deny_dot))
1492	{
1493	if c == '`\u{FFFD}`' {
1494	if fail_fast {
1495	return `true`;
1496	}
1497	*had_errors = `true`;
1498	}
1499	domain_buffer.push(c);
1500	}
1501	let normalized = &mut domain_buffer[current_label_start..];
1502	if let Err(()) =
1503	normalized
1504	.iter_mut()
1505	.zip(label_buffer.iter())
1506	.try_for_each(\|(norm_c, decoded_c)\| {
1507	if norm_c == decoded_c {
1508	Ok(())
1509	} else {
1510	// Mark the first difference
1511	*norm_c = '`\u{FFFD}`';
1512	Err(())
1513	}
1514	})
1515	{
1516	if fail_fast {
1517	return `true`;
1518	}
1519	*had_errors = `true`;
1520	}
1521	`false`
1522	}
1523
1524	#[inline(never)]
1525	fn check_label(
1526	&self,
1527	hyphens: Hyphens,
1528	mut_label: &mut [char],
1529	fail_fast: bool,
1530	had_errors: &mut bool,
1531	first_needs_combining_mark_check: bool,
1532	needs_contextj_check: bool,
1533	) -> bool {
1534	if hyphens != Hyphens::Allow
1535	&& check_hyphens(
1536	mut_label,
1537	hyphens == Hyphens::CheckFirstLast,
1538	fail_fast,
1539	had_errors,
1540	)
1541	{
1542	return `true`;
1543	}
1544	if first_needs_combining_mark_check {
1545	if let Some(first) = mut_label.first_mut() {
1546	if self.data.is_mark(*first) {
1547	if fail_fast {
1548	return `true`;
1549	}
1550	*had_errors = `true`;
1551	*first = '`\u{FFFD}`';
1552	}
1553	}
1554	}
1555	if needs_contextj_check {
1556	// ContextJ
1557	for i in `0`..mut_label.len() {
1558	let c = mut_label[i];
1559	if !in_inclusive_range_char(c, '`\u{200C}`', '`\u{200D}`') {
1560	continue;
1561	}
1562	let (head, joiner_and_tail) = mut_label.split_at_mut(i);
1563
1564	if let Some((joiner, tail)) = joiner_and_tail.split_first_mut() {
1565	if let Some(previous) = head.last() {
1566	if self.data.is_virama(*previous) {
1567	continue;
1568	}
1569	} else {
1570	// No preceding character
1571	if fail_fast {
1572	return `true`;
1573	}
1574	*had_errors = `true`;
1575	*joiner = '`\u{FFFD}`';
1576	continue;
1577	}
1578	if c == '`\u{200D}`' {
1579	// ZWJ only has the virama rule
1580	if fail_fast {
1581	return `true`;
1582	}
1583	*had_errors = `true`;
1584	*joiner = '`\u{FFFD}`';
1585	continue;
1586	}
1587	debug_assert_eq!(c, '`\u{200C}`');
1588	if !self.has_appropriately_joining_char(
1589	head.iter().rev().copied(),
1590	LEFT_OR_DUAL_JOINING_MASK,
1591	) \|\| !self.has_appropriately_joining_char(
1592	tail.iter().copied(),
1593	RIGHT_OR_DUAL_JOINING_MASK,
1594	) {
1595	if fail_fast {
1596	return `true`;
1597	}
1598	*had_errors = `true`;
1599	*joiner = '`\u{FFFD}`';
1600	}
1601	} else {
1602	debug_assert!(`false`);
1603	}
1604	}
1605	}
1606
1607	if !is_ascii(mut_label) && mut_label.len() > PUNYCODE_ENCODE_MAX_INPUT_LENGTH {
1608	// Limit quadratic behavior
1609	// https://github.com/whatwg/url/issues/824
1610	// https://unicode-org.atlassian.net/browse/ICU-13727
1611	if fail_fast {
1612	return `true`;
1613	}
1614	*had_errors = `true`;
1615	mut_label[PUNYCODE_ENCODE_MAX_INPUT_LENGTH] = '`\u{FFFD}`';
1616	}
1617	`false`
1618	}
1619
1620	#[inline(always)]
1621	fn has_appropriately_joining_char<I: Iterator<Item = char>>(
1622	&self,
1623	iter: I,
1624	required_mask: JoiningTypeMask,
1625	) -> bool {
1626	for c in iter {
1627	let jt = self.data.joining_type(c);
1628	if jt.to_mask().intersects(required_mask) {
1629	return `true`;
1630	}
1631	if jt.is_transparent() {
1632	continue;
1633	}
1634	return `false`;
1635	}
1636	`false`
1637	}
1638
1639	#[inline(always)]
1640	fn is_bidi(&self, buffer: &[char]) -> bool {
1641	for &c in buffer {
1642	if c < '`\u{0590}`' {
1643	// Below Hebrew
1644	continue;
1645	}
1646	if in_inclusive_range_char(c, '`\u{0900}`', '`\u{FB1C}`') {
1647	debug_assert_ne!(c, '`\u{200F}`'); // disallowed
1648	continue;
1649	}
1650	if in_inclusive_range_char(c, '`\u{1F000}`', '`\u{3FFFF}`') {
1651	continue;
1652	}
1653	if in_inclusive_range_char(c, '`\u{FF00}`', '`\u{107FF}`') {
1654	continue;
1655	}
1656	if in_inclusive_range_char(c, '`\u{11000}`', '`\u{1E7FF}`') {
1657	continue;
1658	}
1659	if RTL_MASK.intersects(self.data.bidi_class(c).to_mask()) {
1660	return `true`;
1661	}
1662	}
1663	`false`
1664	}
1665	}
1666
/// Checks the hyphen restrictions on `mut_label`, overwriting each offending
/// hyphen with U+FFFD.
///
/// A hyphen in the first or in the last position is always an error. Unless
/// `allow_third_fourth` is set, hyphens in both the third and the fourth
/// position are an error as well.
///
/// Returns `true` if `fail_fast` is set and an error was found; in that case
/// neither the label nor `had_errors` is modified. Otherwise sets
/// `had_errors` for each error found and returns `false`.
fn check_hyphens(
    mut_label: &mut [char],
    allow_third_fourth: bool,
    fail_fast: bool,
    had_errors: &mut bool,
) -> bool {
    if mut_label.first() == Some(&'-') {
        if fail_fast {
            return true;
        }
        *had_errors = true;
        mut_label[0] = '\u{FFFD}';
    }
    // Evaluated after the first-position fix-up, so a lone "-" is reported
    // once: by now it has already been replaced.
    if mut_label.last() == Some(&'-') {
        if fail_fast {
            return true;
        }
        *had_errors = true;
        let last_index = mut_label.len() - 1;
        mut_label[last_index] = '\u{FFFD}';
    }
    if !allow_third_fourth && mut_label.get(2..4) == Some(&['-', '-'][..]) {
        if fail_fast {
            return true;
        }
        *had_errors = true;
        for c in &mut mut_label[2..4] {
            *c = '\u{FFFD}';
        }
    }
    false
}
1704