1 | // Copyright The rust-url developers. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
4 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
5 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
6 | // option. This file may not be copied, modified, or distributed |
7 | // except according to those terms. |
8 | |
9 | //! This module provides the lower-level API for UTS 46. |
10 | //! |
11 | //! [`Uts46::process`] is the core that the other convenience |
12 | //! methods build on. |
13 | //! |
14 | //! UTS 46 flags map to this API as follows: |
15 | //! |
16 | //! * _CheckHyphens_ - _true_: [`Hyphens::Check`], _false_: [`Hyphens::Allow`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents. |
17 | //! * _CheckBidi_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_. |
18 | //! * _CheckJoiners_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_. |
19 | //! * _UseSTD3ASCIIRules_ - _true_: [`AsciiDenyList::STD3`], _false_: [`AsciiDenyList::EMPTY`]; however, the check the WHATWG URL Standard performs right after the UTS 46 invocation corresponds to [`AsciiDenyList::URL`]. |
//! * _Transitional_Processing_ - Always _false_ but could be implemented as a preprocessing step. This flag is deprecated and for Web purposes the transition is over in the sense that all of Firefox, Safari, and Chrome set this flag to _false_.
21 | //! * _VerifyDnsLength_ - _true_: [`DnsLength::Verify`], _false_: [`DnsLength::Ignore`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents. |
//! * _IgnoreInvalidPunycode_ - Always _false_; cannot be configured. (Not yet covered by the WHATWG URL Standard, but 2 out of 3 major browsers clearly behave as if this was _false_).
23 | |
24 | use crate::punycode::Decoder; |
25 | use crate::punycode::InternalCaller; |
26 | use alloc::borrow::Cow; |
27 | use alloc::string::String; |
28 | use core::fmt::Write; |
29 | use idna_adapter::*; |
30 | use smallvec::SmallVec; |
31 | use utf8_iter::Utf8CharsEx; |
32 | |
/// Upper bound on the length of Punycode input that is decoded.
///
/// ICU4C-compatible constraint; see
/// <https://unicode-org.atlassian.net/browse/ICU-13727>.
const PUNYCODE_DECODE_MAX_INPUT_LENGTH: usize = 2_000;

/// Upper bound on the length of a label that is encoded as Punycode.
///
/// ICU4C-compatible constraint. (Note: ICU4C measures UTF-16 and we
/// measure UTF-32, so longer non-BMP inputs are allowed here. For this
/// implementation, the denial-of-service scaling does not depend on
/// BMP vs. non-BMP: only the scalar values matter.)
///
/// See <https://unicode-org.atlassian.net/browse/ICU-13727>.
const PUNYCODE_ENCODE_MAX_INPUT_LENGTH: usize = 1_000;
45 | |
/// Tracks which kind of numerals have been seen so far in an RTL label.
#[derive(Debug, PartialEq, Eq)]
enum RtlNumeralState {
    /// Initial state (presumably: no numeral kind has been recorded yet).
    Undecided,
    /// European numerals have been seen.
    European,
    /// Arabic numerals have been seen.
    Arabic,
}
54 | |
/// Builds the bit set in which bit `n` is set exactly when `n` is the
/// code point of an upper-case ASCII letter (`A..=Z`).
const fn upper_case_mask() -> u128 {
    let mut mask = 0u128;
    // `for` loops are unavailable in `const fn`, hence the manual walk.
    let mut letter = b'A';
    while letter <= b'Z' {
        mask |= 1u128 << letter;
        letter += 1;
    }
    mask
}

/// Bit set for upper-case ASCII.
const UPPER_CASE_MASK: u128 = upper_case_mask();
70 | |
/// Builds the bit set of glyphless ASCII: every code point up to and
/// including U+0020 SPACE, plus U+007F DELETE.
const fn glyphless_mask() -> u128 {
    // Start with DELETE, then add the range 0x00..=0x20.
    let mut mask = 1u128 << 0x7F;
    let mut code = 0u8;
    while code <= b' ' {
        mask |= 1u128 << code;
        code += 1;
    }
    mask
}

/// Bit set for glyphless ASCII.
const GLYPHLESS_MASK: u128 = glyphless_mask();
86 | |
/// Single-bit set containing only the ASCII dot (`.`).
const DOT_MASK: u128 = 1u128 << b'.';
89 | |
/// Computes the ASCII deny list for STD3 ASCII rules.
///
/// Every ASCII code point is denied except lower-case letters, digits,
/// the hyphen and the dot; the result is the complement of that allowed set.
const fn ldh_mask() -> u128 {
    let mut allowed = (1u128 << b'-') | (1u128 << b'.');
    let mut digit = b'0';
    while digit <= b'9' {
        allowed |= 1u128 << digit;
        digit += 1;
    }
    let mut letter = b'a';
    while letter <= b'z' {
        allowed |= 1u128 << letter;
        letter += 1;
    }
    // All 128 bits correspond to ASCII code points, so a plain complement
    // denies exactly the code points that are not allowed.
    !allowed
}
102 | |
/// The bytes `X`, `N`, `-`, `-` packed little-endian (first byte in the
/// least-significant position), i.e. the case-folded form of `xn--`.
const PUNYCODE_PREFIX: u32 = u32::from_le_bytes([b'X', b'N', b'-', b'-']);

/// Mask applied before comparing against [`PUNYCODE_PREFIX`]: `0xDF` clears
/// the ASCII case bit of the two letter positions, `0xFF` keeps the hyphen
/// positions intact.
const PUNYCODE_PREFIX_MASK: u32 = u32::from_le_bytes([0xDF, 0xDF, 0xFF, 0xFF]);
107 | |
108 | fn write_punycode_label<W: Write + ?Sized>( |
109 | label: &[char], |
110 | sink: &mut W, |
111 | ) -> Result<(), ProcessingError> { |
112 | sink.write_str("xn--" )?; |
113 | crate::punycode::encode_into::<_, _, InternalCaller>(input:label.iter().copied(), output:sink)?; |
114 | Ok(()) |
115 | } |
116 | |
117 | #[inline (always)] |
118 | fn has_punycode_prefix(slice: &[u8]) -> bool { |
119 | if slice.len() < 4 { |
120 | return false; |
121 | } |
122 | // Sadly, the optimizer doesn't figure out that more idiomatic code |
123 | // should compile to masking on 32-bit value. |
124 | let a: u8 = slice[0]; |
125 | let b: u8 = slice[1]; |
126 | let c: u8 = slice[2]; |
127 | let d: u8 = slice[3]; |
128 | let u: u32 = (u32::from(d) << 24) | (u32::from(c) << 16) | (u32::from(b) << 8) | u32::from(a); |
129 | (u & PUNYCODE_PREFIX_MASK) == PUNYCODE_PREFIX |
130 | } |
131 | |
/// Tests `start <= u && u <= end` with a single comparison.
///
/// Values below `start` wrap around to a large offset and therefore fail
/// the test. `end` is expected to be at least `start`.
#[inline(always)]
fn in_inclusive_range8(u: u8, start: u8, end: u8) -> bool {
    let span = end - start;
    let offset = u.wrapping_sub(start);
    offset <= span
}
136 | |
/// Tests `start <= c && c <= end` on scalar values with a single comparison.
///
/// Scalars below `start` wrap around to a large offset and therefore fail
/// the test. `end` is expected to be at least `start`.
#[inline(always)]
fn in_inclusive_range_char(c: char, start: char, end: char) -> bool {
    let low = u32::from(start);
    let span = u32::from(end) - low;
    let offset = u32::from(c).wrapping_sub(low);
    offset <= span
}
141 | |
142 | #[inline (always)] |
143 | fn is_passthrough_ascii_label(label: &[u8]) -> bool { |
144 | // XXX if we aren't performing _CheckHyphens_, this could |
145 | // check for "xn--" and pass through YouTube CDN node names. |
146 | if label.len() >= 4 && label[2] == b'-' && label[3] == b'-' { |
147 | return false; |
148 | } |
149 | if let Some((&first, tail)) = label.split_first() { |
150 | // We need to check the first and last character |
151 | // more strictly in case this turns out to be a |
152 | // label in a bidi domain name. This has the side |
153 | // effect that this function only accepts labels |
154 | // that also conform to the STD3 rules. |
155 | // |
156 | // XXX: If we are in the fail-fast mode (i.e. we don't need |
157 | // to be able to overwrite anything with U+FFFD), we could |
158 | // merely record that we've seen a digit here and error out |
159 | // if we later discover that the domain name is a bidi |
160 | // domain name. |
161 | if !in_inclusive_range8(first, b'a' , b'z' ) { |
162 | return false; |
163 | } |
164 | for &b in tail { |
165 | // If we used LDH_MASK, we'd have to check |
166 | // the bytes for the ASCII range anyhow. |
167 | if in_inclusive_range8(b, b'a' , b'z' ) { |
168 | continue; |
169 | } |
170 | if in_inclusive_range8(b, b'0' , b'9' ) { |
171 | continue; |
172 | } |
173 | if b == b'-' { |
174 | continue; |
175 | } |
176 | return false; |
177 | } |
178 | label.last() != Some(&b'-' ) |
179 | } else { |
180 | // empty |
181 | true |
182 | } |
183 | } |
184 | |
/// Splits `label` into a leading all-ASCII part and the remainder.
///
/// * All-ASCII input: returns `(label, [])`.
/// * Input starting with a non-ASCII byte: returns `([], label)`.
/// * Otherwise the split point is one byte *before* the first non-ASCII
///   byte, so the remainder begins with the last ASCII byte of the run.
#[inline(always)]
fn split_ascii_fast_path_prefix(label: &[u8]) -> (&[u8], &[u8]) {
    let split = label
        .iter()
        .position(|b| !b.is_ascii())
        // Leave one ASCII character in the suffix in case it's a letter
        // that a combining character combines with. When the very first
        // byte is non-ASCII there is nothing to leave, hence the saturation.
        .map_or(label.len(), |first_non_ascii| first_non_ascii.saturating_sub(1));
    label.split_at(split)
}
203 | |
/// Maps `c` to U+FFFD if its bit is set in `deny_list`; otherwise returns
/// `c` unchanged.
///
/// Input known to be lower-case, but may contain non-ASCII. Non-ASCII
/// scalars are always returned unchanged, because `deny_list` has no bit
/// for them (`checked_shl` yields `None` once the shift reaches 128).
#[inline(always)]
fn apply_ascii_deny_list_to_lower_cased_unicode(c: char, deny_list: u128) -> char {
    match 1u128.checked_shl(u32::from(c)) {
        Some(bit) if (deny_list & bit) != 0 => '\u{FFFD}',
        _ => c,
    }
}
217 | |
218 | // Input known to be ASCII, but may contain upper case ASCII. |
219 | #[inline (always)] |
220 | fn apply_ascii_deny_list_to_potentially_upper_case_ascii(b: u8, deny_list: u128) -> char { |
221 | if (deny_list & (1u128 << b)) == 0 { |
222 | return char::from(b); |
223 | } |
224 | if in_inclusive_range8(u:b, start:b'A' , end:b'Z' ) { |
225 | return char::from(b + 0x20); |
226 | } |
227 | ' \u{FFFD}' |
228 | } |
229 | |
/// Returns `true` when every scalar value in `label` is ASCII
/// (vacuously `true` for an empty label).
#[inline(always)]
fn is_ascii(label: &[char]) -> bool {
    label.iter().all(|c| c.is_ascii())
}
239 | |
/// Result of `classify_for_punycode` for a label.
#[derive(Clone, Copy, PartialEq, Eq)]
enum PunycodeClassification {
    /// The label contains only ASCII.
    Ascii,
    /// The label contains non-ASCII and no U+FFFD.
    Unicode,
    /// The label contains U+FFFD.
    Error,
}
246 | |
247 | #[inline (always)] |
248 | fn classify_for_punycode(label: &[char]) -> PunycodeClassification { |
249 | let mut iter: impl Iterator = label.iter().copied(); |
250 | loop { |
251 | if let Some(c: char) = iter.next() { |
252 | if c.is_ascii() { |
253 | continue; |
254 | } |
255 | if c == ' \u{FFFD}' { |
256 | return PunycodeClassification::Error; |
257 | } |
258 | for c: char in iter { |
259 | if c == ' \u{FFFD}' { |
260 | return PunycodeClassification::Error; |
261 | } |
262 | } |
263 | return PunycodeClassification::Unicode; |
264 | } |
265 | return PunycodeClassification::Ascii; |
266 | } |
267 | } |
268 | |
269 | /// The ASCII deny list to be applied. |
270 | #[derive (PartialEq, Eq, Copy, Clone)] |
271 | #[repr (transparent)] |
272 | pub struct AsciiDenyList { |
273 | bits: u128, |
274 | } |
275 | |
276 | impl AsciiDenyList { |
277 | /// Computes (preferably at compile time) an ASCII deny list. |
278 | /// |
279 | /// Setting `deny_glyphless` to `true` denies U+0020 SPACE and below |
280 | /// as well as U+007F DELETE for convenience without having to list |
281 | /// these characters in the `deny_list` string. |
282 | /// |
283 | /// `deny_list` is the list of ASCII characters to deny. This |
284 | /// list must not contain any of: |
285 | /// * Letters |
286 | /// * Digits |
287 | /// * Hyphen |
288 | /// * Dot (period / full-stop) |
289 | /// * Non-ASCII |
290 | /// |
291 | /// # Panics |
292 | /// |
293 | /// If the deny list contains characters listed as prohibited above. |
294 | pub const fn new(deny_glyphless: bool, deny_list: &str) -> Self { |
295 | let mut bits = UPPER_CASE_MASK; |
296 | if deny_glyphless { |
297 | bits |= GLYPHLESS_MASK; |
298 | } |
299 | let mut i = 0; |
300 | let bytes = deny_list.as_bytes(); |
301 | while i < bytes.len() { |
302 | let b = bytes[i]; |
303 | assert!(b < 0x80, "ASCII deny list must be ASCII." ); |
304 | // assert_ne not yet available in const context. |
305 | assert!(b != b'.' , "ASCII deny list must not contain the dot." ); |
306 | assert!(b != b'-' , "ASCII deny list must not contain the hyphen." ); |
307 | assert!( |
308 | !((b >= b'0' ) && (b <= b'9' )), |
309 | "ASCII deny list must not contain digits." |
310 | ); |
311 | assert!( |
312 | !((b >= b'a' ) && (b <= b'z' )), |
313 | "ASCII deny list must not contain letters." |
314 | ); |
315 | assert!( |
316 | !((b >= b'A' ) && (b <= b'Z' )), |
317 | "ASCII deny list must not contain letters." |
318 | ); |
319 | bits |= 1u128 << b; |
320 | i += 1; |
321 | } |
322 | AsciiDenyList { bits } |
323 | } |
324 | |
325 | /// No ASCII deny list. This corresponds to _UseSTD3ASCIIRules=false_. |
326 | /// |
327 | /// Equivalent to `AsciiDenyList::new(false, "")`. |
328 | /// |
329 | /// Note: Not denying the space and control characters can result in |
330 | /// strange behavior. Without a deny list provided to the UTS 46 |
331 | /// operation, the caller is expected perform filtering afterwards, |
332 | /// but it's more efficient to use `AsciiDenyList` than post-processing, |
333 | /// because the internals of this crate can optimize away checks in |
334 | /// certain cases. |
335 | pub const EMPTY: AsciiDenyList = AsciiDenyList::new(false, "" ); |
336 | |
337 | /// The STD3 deny list. This corresponds to _UseSTD3ASCIIRules=true_. |
338 | /// |
339 | /// Note that this deny list rejects the underscore, which occurs in |
340 | /// pseudo-hosts used by various TXT record-based protocols, and also |
341 | /// characters that may occurs in non-DNS naming, such as NetBIOS. |
342 | pub const STD3: AsciiDenyList = AsciiDenyList { bits: ldh_mask() }; |
343 | |
344 | /// [Forbidden domain code point](https://url.spec.whatwg.org/#forbidden-domain-code-point) from the WHATWG URL Standard. |
345 | /// |
346 | /// Equivalent to `AsciiDenyList::new(true, "%#/:<>?@[\\]^|")`. |
347 | /// |
348 | /// Note that this deny list rejects IPv6 addresses, so (as in URL |
349 | /// parsing) you need to check for IPv6 addresses first and not |
350 | /// put them through UTS 46 processing. |
351 | pub const URL: AsciiDenyList = AsciiDenyList::new(true, "%#/:<>?@[ \\]^|" ); |
352 | } |
353 | |
/// The _CheckHyphens_ mode.
// Kept `non_exhaustive` so that further hyphen modes can be added later.
#[derive(Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Hyphens {
    /// _CheckHyphens=false_: Do not place positional restrictions on hyphens.
    ///
    /// The WHATWG URL Standard uses this mode for normal User Agent
    /// processing (i.e. not conformance checking).
    Allow,

    /// Prohibit hyphens in the first and last position in the label but
    /// allow them in the third and fourth position.
    ///
    /// Note that this mode rejects real-world names, including some GitHub
    /// user pages.
    CheckFirstLast,

    /// _CheckHyphens=true_: Prohibit hyphens in the first, third, fourth,
    /// and last position in the label.
    ///
    /// Note that this mode rejects real-world names, including YouTube CDN
    /// nodes and some GitHub user pages.
    Check,
}
377 | |
/// The UTS 46 _VerifyDNSLength_ flag.
#[derive(Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DnsLength {
    /// _VerifyDNSLength=false_. (Possibly relevant for allowing non-DNS
    /// naming systems.)
    Ignore,
    /// _VerifyDNSLength=true_, except that the trailing root label dot is
    /// allowed.
    VerifyAllowRootDot,
    /// _VerifyDNSLength=true_. (The trailing root label dot is not allowed.)
    Verify,
}
390 | |
/// Policy for customizing behavior in case of an error.
#[derive(Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ErrorPolicy {
    /// Return as early as possible without producing output in case of error.
    FailFast,
    /// In case of error, mark errors with the REPLACEMENT CHARACTER. (The
    /// output containing REPLACEMENT CHARACTERs may be shown to the user to
    /// illustrate what was wrong but must not be used for naming in a
    /// network protocol.)
    MarkErrors,
}
402 | |
/// The success outcome of [`Uts46::process`]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProcessingSuccess {
    /// There were no errors. The caller must consider the input to be the
    /// output.
    ///
    /// This asserts that the input can be safely passed to
    /// [`core::str::from_utf8_unchecked`].
    ///
    /// (Kept distinct from `WroteToSink` so that `Cow` behavior can be
    /// implemented on top of [`Uts46::process`].)
    Passthrough,

    /// There were no errors. The caller must consider what was written to
    /// the sink to be the output.
    ///
    /// (Kept distinct from `Passthrough` so that `Cow` behavior can be
    /// implemented on top of [`Uts46::process`].)
    WroteToSink,
}
420 | |
/// The failure outcome of [`Uts46::process`]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProcessingError {
    /// There was a validity error according to the chosen options.
    ///
    /// In case of `Operation::ToAscii`, there is no output. Otherwise,
    /// output was written to the sink and the output contains at least one
    /// U+FFFD REPLACEMENT CHARACTER to denote an error.
    ValidityError,

    /// The sink emitted [`core::fmt::Error`]. The partial output written to
    /// the sink must not be used.
    SinkError,
}

// Lets `?` turn a failed sink write into `ProcessingError::SinkError`.
impl From<core::fmt::Error> for ProcessingError {
    fn from(_sink_failure: core::fmt::Error) -> Self {
        Self::SinkError
    }
}
440 | |
441 | impl From<crate::punycode::PunycodeEncodeError> for ProcessingError { |
442 | fn from(_: crate::punycode::PunycodeEncodeError) -> Self { |
443 | unreachable!( |
444 | "Punycode overflows should not be possible due to PUNYCODE_ENCODE_MAX_INPUT_LENGTH" |
445 | ); |
446 | } |
447 | } |
448 | |
/// Bookkeeping for a label that is already ASCII.
// NOTE(review): the variant meanings below are inferred from the names;
// confirm against the code that constructs them.
#[derive(Clone, Copy, Debug)]
enum AlreadyAsciiLabel<'a> {
    /// Carries the bytes of a (presumably mixed-case) plain ASCII label.
    MixedCaseAscii(&'a [u8]),
    /// Carries the bytes of a (presumably mixed-case) Punycode label.
    MixedCasePunycode(&'a [u8]),
    /// Neither of the above; carries no data.
    Other,
}
455 | |
/// Performs the _VerifyDNSLength_ check on the output of the _ToASCII_ operation.
///
/// If `allow_trailing_dot` is `true`, a single trailing root label dot is
/// allowed (and is not counted towards the length limit); if it is `false`,
/// any name ending in a dot is rejected.
///
/// Returns `true` only when the name (without an allowed trailing dot) is at
/// most 253 bytes long and every dot-separated label is 1 to 63 bytes long.
/// In particular, the empty string is rejected.
///
/// # Panics
///
/// Panics in debug mode if the argument isn't ASCII.
pub fn verify_dns_length(domain_name: &str, allow_trailing_dot: bool) -> bool {
    let bytes = domain_name.as_bytes();
    debug_assert!(bytes.is_ascii());
    let without_root_dot = match bytes.strip_suffix(b".") {
        Some(_) if !allow_trailing_dot => return false,
        Some(stripped) => stripped,
        None => bytes,
    };
    without_root_dot.len() <= 253
        && without_root_dot
            .split(|&b| b == b'.')
            .all(|label| !label.is_empty() && label.len() <= 63)
}
487 | |
488 | /// An implementation of UTS #46. |
489 | pub struct Uts46 { |
490 | data: idna_adapter::Adapter, |
491 | } |
492 | |
493 | #[cfg (feature = "compiled_data" )] |
494 | impl Default for Uts46 { |
495 | fn default() -> Self { |
496 | Self::new() |
497 | } |
498 | } |
499 | |
500 | impl Uts46 { |
501 | /// Constructor using data compiled into the binary. |
502 | #[cfg (feature = "compiled_data" )] |
503 | pub const fn new() -> Self { |
504 | Self { |
505 | data: idna_adapter::Adapter::new(), |
506 | } |
507 | } |
508 | |
509 | // XXX Should there be an `icu_provider` feature for enabling |
510 | // a constructor for run-time data loading? |
511 | |
512 | /// Performs the [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) operation |
513 | /// from UTS #46 with the options indicated. |
514 | /// |
515 | /// # Arguments |
516 | /// |
517 | /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by |
518 | /// this method and input that is not well-formed UTF-8 is treated as an error. If you |
519 | /// already have a `&str`, call `.as_bytes()` on it.) |
520 | /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46 |
521 | /// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point |
522 | /// processing is handled via this argument. Most callers are probably the best off |
523 | /// by using [`AsciiDenyList::URL`] here. |
524 | /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best |
525 | /// off by using [`Hyphens::Allow`] here. |
526 | /// * `dns_length` - The UTS 46 _VerifyDNSLength_ flag. |
527 | pub fn to_ascii<'a>( |
528 | &self, |
529 | domain_name: &'a [u8], |
530 | ascii_deny_list: AsciiDenyList, |
531 | hyphens: Hyphens, |
532 | dns_length: DnsLength, |
533 | ) -> Result<Cow<'a, str>, crate::Errors> { |
534 | let mut s = String::new(); |
535 | match self.process( |
536 | domain_name, |
537 | ascii_deny_list, |
538 | hyphens, |
539 | ErrorPolicy::FailFast, |
540 | |_, _, _| false, |
541 | &mut s, |
542 | None, |
543 | ) { |
544 | // SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII. |
545 | Ok(ProcessingSuccess::Passthrough) => { |
546 | let cow = Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(domain_name) }); |
547 | if dns_length != DnsLength::Ignore |
548 | && !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot) |
549 | { |
550 | Err(crate::Errors::default()) |
551 | } else { |
552 | Ok(cow) |
553 | } |
554 | } |
555 | Ok(ProcessingSuccess::WroteToSink) => { |
556 | let cow: Cow<'_, str> = Cow::Owned(s); |
557 | if dns_length != DnsLength::Ignore |
558 | && !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot) |
559 | { |
560 | Err(crate::Errors::default()) |
561 | } else { |
562 | Ok(cow) |
563 | } |
564 | } |
565 | Err(ProcessingError::ValidityError) => Err(crate::Errors::default()), |
566 | Err(ProcessingError::SinkError) => unreachable!(), |
567 | } |
568 | } |
569 | |
570 | /// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation |
571 | /// from UTS #46 according to the options given. When there |
/// are errors, there is still output, which may be rendered to the user, even though
573 | /// the output must not be used in networking protocols. Errors are denoted |
574 | /// by U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item of the |
575 | /// return tuple is `Err`, the first item of the return tuple is guaranteed to contain |
576 | /// at least one U+FFFD.) |
577 | /// |
578 | /// Most applications probably shouldn't use this method and should be using |
579 | /// [`Uts46::to_user_interface`] instead. |
580 | /// |
581 | /// # Arguments |
582 | /// |
583 | /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by |
584 | /// this method and input that is not well-formed UTF-8 is treated as an error. If you |
585 | /// already have a `&str`, call `.as_bytes()` on it.) |
586 | /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46 |
587 | /// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point |
588 | /// processing is handled via this argument. Most callers are probably the best off |
589 | /// by using [`AsciiDenyList::URL`] here. |
590 | /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best |
591 | /// off by using [`Hyphens::Allow`] here. |
592 | pub fn to_unicode<'a>( |
593 | &self, |
594 | domain_name: &'a [u8], |
595 | ascii_deny_list: AsciiDenyList, |
596 | hyphens: Hyphens, |
597 | ) -> (Cow<'a, str>, Result<(), crate::Errors>) { |
598 | self.to_user_interface(domain_name, ascii_deny_list, hyphens, |_, _, _| true) |
599 | } |
600 | |
601 | /// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation |
602 | /// from UTS #46 according to options given with some |
603 | /// error-free Unicode labels output according to |
604 | /// [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) instead as decided by |
605 | /// application policy implemented via the `output_as_unicode` closure. The purpose |
606 | /// is to convert user-visible domains to the Unicode form in general but to render |
607 | /// potentially misleading labels as Punycode. |
608 | /// |
/// This is an imperfect security mechanism, because [the Punycode form itself may
/// resemble a user-recognizable name](https://www.unicode.org/reports/tr36/#TablePunycodeSpoofing).
/// However, since this mechanism is common practice, this API provides support for
/// the mechanism.
613 | /// |
614 | /// ASCII labels always pass through as ASCII and labels with errors always pass through |
615 | /// as Unicode. For non-erroneous labels that contain at least one non-ASCII character |
616 | /// (implies non-empty), `output_as_unicode` is called with the Unicode form of the label, |
617 | /// the TLD (potentially empty), and a flag indicating whether the domain name as a whole |
618 | /// is a bidi domain name. If the return value is `true`, the label passes through as |
619 | /// Unicode. If the return value is `false`, the label is converted to Punycode. |
620 | /// |
/// When there are errors, there is still output, which may be rendered to the user, even though
622 | /// the output must not be used in networking protocols. Errors are denoted by |
623 | /// U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item |
624 | /// of the return tuple is `Err`, the first item of the return tuple is guaranteed to contain |
625 | /// at least one U+FFFD.) Labels that contain errors are not converted to Punycode. |
626 | /// |
627 | /// # Arguments |
628 | /// |
629 | /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by |
630 | /// this method and input that is not well-formed UTF-8 is treated as an error. If you |
631 | /// already have a `&str`, call `.as_bytes()` on it.) |
632 | /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46 |
633 | /// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point |
634 | /// processing is handled via this argument. Most callers are probably the best off |
635 | /// by using [`AsciiDenyList::URL`] here. |
636 | /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best |
637 | /// off by using [`Hyphens::Allow`] here. |
638 | /// * `output_as_unicode` - A closure for deciding if a label should be output as Unicode |
639 | /// (as opposed to Punycode). The first argument is the label for which a decision is |
640 | /// needed (always non-empty slice). The second argument is the TLD (potentially empty). |
641 | /// The third argument is `true` iff the domain name as a whole is a bidi domain name. |
642 | /// Only non-erroneous labels that contain at least one non-ASCII character are passed |
643 | /// to the closure as the first argument. The second and third argument values are |
644 | /// guaranteed to remain the same during a single call to `process`, and the closure |
645 | /// may cache computations derived from the second and third argument (hence the |
646 | /// `FnMut` type). |
647 | pub fn to_user_interface<'a, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>( |
648 | &self, |
649 | domain_name: &'a [u8], |
650 | ascii_deny_list: AsciiDenyList, |
651 | hyphens: Hyphens, |
652 | output_as_unicode: OutputUnicode, |
653 | ) -> (Cow<'a, str>, Result<(), crate::Errors>) { |
654 | let mut s = String::new(); |
655 | match self.process( |
656 | domain_name, |
657 | ascii_deny_list, |
658 | hyphens, |
659 | ErrorPolicy::MarkErrors, |
660 | output_as_unicode, |
661 | &mut s, |
662 | None, |
663 | ) { |
664 | // SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII. |
665 | Ok(ProcessingSuccess::Passthrough) => ( |
666 | Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(domain_name) }), |
667 | Ok(()), |
668 | ), |
669 | Ok(ProcessingSuccess::WroteToSink) => (Cow::Owned(s), Ok(())), |
670 | Err(ProcessingError::ValidityError) => (Cow::Owned(s), Err(crate::Errors::default())), |
671 | Err(ProcessingError::SinkError) => unreachable!(), |
672 | } |
673 | } |
674 | |
675 | /// The lower-level function that [`Uts46::to_ascii`], [`Uts46::to_unicode`], and |
676 | /// [`Uts46::to_user_interface`] are built on to allow support for output types other |
677 | /// than `Cow<'a, str>` (e.g. string types in a non-Rust programming language). |
678 | /// |
679 | /// # Arguments |
680 | /// |
681 | /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by |
682 | /// this method and input that is not well-formed UTF-8 is treated as an error. If you |
683 | /// already have a `&str`, call `.as_bytes()` on it.) |
684 | /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46 |
685 | /// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point |
686 | /// processing is handled via this argument. Most callers are probably the best off |
687 | /// by using [`AsciiDenyList::URL`] here. |
688 | /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best |
689 | /// off by using [`Hyphens::Allow`] here. |
690 | /// * `error_policy` - Whether to fail fast or to produce output that may be rendered |
691 | /// for the user to examine in case of errors. |
692 | /// * `output_as_unicode` - A closure for deciding if a label should be output as Unicode |
693 | /// (as opposed to Punycode). The first argument is the label for which a decision is |
694 | /// needed (always non-empty slice). The second argument is the TLD (potentially empty). |
695 | /// The third argument is `true` iff the domain name as a whole is a bidi domain name. |
696 | /// Only non-erroneous labels that contain at least one non-ASCII character are passed |
697 | /// to the closure as the first argument. The second and third argument values are |
698 | /// guaranteed to remain the same during a single call to `process`, and the closure |
699 | /// may cache computations derived from the second and third argument (hence the |
700 | /// `FnMut` type). To perform the _ToASCII_ operation, `|_, _, _| false` must be |
701 | /// passed as the closure. To perform the _ToUnicode_ operation, `|_, _, _| true` must |
702 | /// be passed as the closure. A more complex closure may be used to prepare a domain |
703 | /// name for display in a user interface so that labels are converted to the Unicode |
704 | /// form in general but potentially misleading labels are converted to the Punycode |
705 | /// form. |
706 | /// * `sink` - The object that receives the output (in the non-passthrough case). |
707 | /// * `ascii_sink` - A second sink that receives the _ToASCII_ form only if there |
708 | /// were no errors and `sink` received at least one character of non-ASCII output. |
709 | /// The purpose of this argument is to enable a user interface display form of the |
710 | /// domain and the _ToASCII_ form of the domain to be computed efficiently together. |
711 | /// This argument is useless when `output_as_unicode` always returns `false`, in |
712 | /// which case the _ToASCII_ form ends up in `sink` already. If `ascii_sink` receives |
713 | /// no output and the return value is `Ok(ProcessingSuccess::WroteToSink)`, use the |
714 | /// output received by `sink` also as the _ToASCII_ result. |
715 | /// |
716 | /// # Return value |
717 | /// |
718 | /// * `Ok(ProcessingSuccess::Passthrough)` - The caller must treat |
719 | /// `unsafe { core::str::from_utf8_unchecked(domain_name) }` as the output. (This |
720 | /// return value asserts that calling `core::str::from_utf8_unchecked(domain_name)` |
721 | /// is safe.) |
/// * `Ok(ProcessingSuccess::WroteToSink)` - The caller must treat what was written
723 | /// to `sink` as the output. If another sink was passed as `ascii_sink` but it did |
724 | /// not receive output, the caller must treat what was written to `sink` also as |
725 | /// the _ToASCII_ output. Otherwise, if `ascii_sink` received output, the caller |
726 | /// must treat what was written to `ascii_sink` as the _ToASCII_ output. |
727 | /// * `Err(ProcessingError::ValidityError)` - The input was in error and must |
728 | /// not be used for DNS lookup or otherwise in a network protocol. If `error_policy` |
729 | /// was `ErrorPolicy::MarkErrors`, the output written to `sink` may be displayed |
730 | /// to the user as an illustration of where the error was or the errors were. |
731 | /// * `Err(ProcessingError::SinkError)` - Either `sink` or `ascii_sink` returned |
732 | /// [`core::fmt::Error`]. The partial output written to `sink` or `ascii_sink` must not |
733 | /// be used. If `W` never returns [`core::fmt::Error`], this method never returns |
734 | /// `Err(ProcessingError::SinkError)`. |
735 | /// |
736 | /// # Safety-usable invariant |
737 | /// |
738 | /// If the return value is `Ok(ProcessingSuccess::Passthrough)`, `domain_name` is |
739 | /// ASCII and `core::str::from_utf8_unchecked(domain_name)` is safe. (Note: |
740 | /// Other return values do _not_ imply that `domain_name` wasn't ASCII!) |
741 | /// |
742 | /// # Security considerations |
743 | /// |
744 | /// Showing labels whose Unicode form might mislead the user as Punycode instead is |
745 | /// an imperfect security mechanism, because [the Punycode form itself may resemble |
746 | /// a user-recognizable name](https://www.unicode.org/reports/tr36/#TablePunycodeSpoofing). |
747 | /// However, since this mechanism is common practice, this API provides support for the |
748 | /// mechanism. |
749 | /// |
750 | /// Punycode processing is quadratic, so to avoid denial of service, this method imposes |
751 | /// length limits on Punycode treating especially long inputs as being in error. These |
752 | /// limits are well higher than the DNS length limits and are not more restrictive than |
753 | /// the limits imposed by ICU4C. |
754 | #[allow (clippy::too_many_arguments)] |
755 | pub fn process<W: Write + ?Sized, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>( |
756 | &self, |
757 | domain_name: &[u8], |
758 | ascii_deny_list: AsciiDenyList, |
759 | hyphens: Hyphens, |
760 | error_policy: ErrorPolicy, |
761 | mut output_as_unicode: OutputUnicode, |
762 | sink: &mut W, |
763 | ascii_sink: Option<&mut W>, |
764 | ) -> Result<ProcessingSuccess, ProcessingError> { |
765 | let fail_fast = error_policy == ErrorPolicy::FailFast; |
766 | let mut domain_buffer = SmallVec::<[char; 253]>::new(); |
767 | let mut already_punycode = SmallVec::<[AlreadyAsciiLabel; 8]>::new(); |
768 | // `process_inner` could be pasted inline here, but it's out of line in order |
769 | // to avoid duplicating that code when monomorphizing over `W` and `OutputUnicode`. |
770 | let (passthrough_up_to, is_bidi, had_errors) = self.process_inner( |
771 | domain_name, |
772 | ascii_deny_list, |
773 | hyphens, |
774 | fail_fast, |
775 | &mut domain_buffer, |
776 | &mut already_punycode, |
777 | ); |
778 | if passthrough_up_to == domain_name.len() { |
779 | debug_assert!(!had_errors); |
780 | return Ok(ProcessingSuccess::Passthrough); |
781 | } |
782 | // Checked only after passthrough as a micro optimization. |
783 | if fail_fast && had_errors { |
784 | return Err(ProcessingError::ValidityError); |
785 | } |
786 | debug_assert_eq!(had_errors, domain_buffer.contains(&' \u{FFFD}' )); |
787 | let without_dot = if let Some(without_dot) = domain_buffer.strip_suffix(&['.' ]) { |
788 | without_dot |
789 | } else { |
790 | &domain_buffer[..] |
791 | }; |
792 | // unwrap is OK, because we always have at least one label |
793 | let tld = without_dot.rsplit(|c| *c == '.' ).next().unwrap(); |
794 | let mut had_unicode_output = false; |
795 | let mut seen_label = false; |
796 | let mut already_punycode_iter = already_punycode.iter(); |
797 | let mut passthrough_up_to_extended = passthrough_up_to; |
798 | let mut flushed_prefix = false; |
799 | for label in domain_buffer.split(|c| *c == '.' ) { |
800 | // Unwrap is OK, because there are supposed to be as many items in |
801 | // `already_punycode` as there are labels. |
802 | let input_punycode = *already_punycode_iter.next().unwrap(); |
803 | if seen_label { |
804 | if flushed_prefix { |
805 | sink.write_char('.' )?; |
806 | } else { |
807 | debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.' ); |
808 | passthrough_up_to_extended += 1; |
809 | if passthrough_up_to_extended == domain_name.len() { |
810 | debug_assert!(!had_errors); |
811 | return Ok(ProcessingSuccess::Passthrough); |
812 | } |
813 | } |
814 | } |
815 | seen_label = true; |
816 | |
817 | if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode { |
818 | if let Some(first_upper_case) = |
819 | mixed_case.iter().position(|c| c.is_ascii_uppercase()) |
820 | { |
821 | let (head, tail) = mixed_case.split_at(first_upper_case); |
822 | let slice_to_write = if flushed_prefix { |
823 | head |
824 | } else { |
825 | flushed_prefix = true; |
826 | passthrough_up_to_extended += head.len(); |
827 | debug_assert_ne!(passthrough_up_to_extended, domain_name.len()); |
828 | &domain_name[..passthrough_up_to_extended] |
829 | }; |
830 | // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII. |
831 | sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?; |
832 | for c in tail.iter() { |
833 | sink.write_char(char::from(c.to_ascii_lowercase()))?; |
834 | } |
835 | } else if flushed_prefix { |
836 | // SAFETY: `mixed_case` is known to be ASCII. |
837 | sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?; |
838 | } else { |
839 | passthrough_up_to_extended += mixed_case.len(); |
840 | if passthrough_up_to_extended == domain_name.len() { |
841 | debug_assert!(!had_errors); |
842 | return Ok(ProcessingSuccess::Passthrough); |
843 | } |
844 | } |
845 | continue; |
846 | } |
847 | |
848 | let potentially_punycode = if fail_fast { |
849 | debug_assert!(classify_for_punycode(label) != PunycodeClassification::Error); |
850 | !is_ascii(label) |
851 | } else { |
852 | classify_for_punycode(label) == PunycodeClassification::Unicode |
853 | }; |
854 | let passthrough = if potentially_punycode { |
855 | let unicode = output_as_unicode(label, tld, is_bidi); |
856 | had_unicode_output |= unicode; |
857 | unicode |
858 | } else { |
859 | true |
860 | }; |
861 | if passthrough { |
862 | if !flushed_prefix { |
863 | flushed_prefix = true; |
864 | // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII. |
865 | sink.write_str(unsafe { |
866 | core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended]) |
867 | })?; |
868 | } |
869 | for c in label.iter().copied() { |
870 | sink.write_char(c)?; |
871 | } |
872 | } else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode { |
873 | if let Some(first_upper_case) = |
874 | mixed_case.iter().position(|c| c.is_ascii_uppercase()) |
875 | { |
876 | let (head, tail) = mixed_case.split_at(first_upper_case); |
877 | let slice_to_write = if flushed_prefix { |
878 | head |
879 | } else { |
880 | flushed_prefix = true; |
881 | passthrough_up_to_extended += head.len(); |
882 | debug_assert_ne!(passthrough_up_to_extended, domain_name.len()); |
883 | &domain_name[..passthrough_up_to_extended] |
884 | }; |
885 | // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII. |
886 | sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?; |
887 | for c in tail.iter() { |
888 | sink.write_char(char::from(c.to_ascii_lowercase()))?; |
889 | } |
890 | } else if flushed_prefix { |
891 | // SAFETY: `mixed_case` is known to be ASCII. |
892 | sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?; |
893 | } else { |
894 | passthrough_up_to_extended += mixed_case.len(); |
895 | if passthrough_up_to_extended == domain_name.len() { |
896 | debug_assert!(!had_errors); |
897 | return Ok(ProcessingSuccess::Passthrough); |
898 | } |
899 | } |
900 | } else { |
901 | if !flushed_prefix { |
902 | flushed_prefix = true; |
903 | // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII. |
904 | sink.write_str(unsafe { |
905 | core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended]) |
906 | })?; |
907 | } |
908 | write_punycode_label(label, sink)?; |
909 | } |
910 | } |
911 | |
912 | if had_errors { |
913 | return Err(ProcessingError::ValidityError); |
914 | } |
915 | |
916 | if had_unicode_output { |
917 | if let Some(sink) = ascii_sink { |
918 | let mut seen_label = false; |
919 | let mut already_punycode_iter = already_punycode.iter(); |
920 | let mut passthrough_up_to_extended = passthrough_up_to; |
921 | let mut flushed_prefix = false; |
922 | for label in domain_buffer.split(|c| *c == '.' ) { |
923 | // Unwrap is OK, because there are supposed to be as many items in |
924 | // `already_punycode` as there are labels. |
925 | let input_punycode = *already_punycode_iter.next().unwrap(); |
926 | if seen_label { |
927 | if flushed_prefix { |
928 | sink.write_char('.' )?; |
929 | } else { |
930 | debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.' ); |
931 | passthrough_up_to_extended += 1; |
932 | } |
933 | } |
934 | seen_label = true; |
935 | |
936 | if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode { |
937 | if let Some(first_upper_case) = |
938 | mixed_case.iter().position(|c| c.is_ascii_uppercase()) |
939 | { |
940 | let (head, tail) = mixed_case.split_at(first_upper_case); |
941 | let slice_to_write = if flushed_prefix { |
942 | head |
943 | } else { |
944 | flushed_prefix = true; |
945 | passthrough_up_to_extended += head.len(); |
946 | debug_assert_ne!(passthrough_up_to_extended, domain_name.len()); |
947 | &domain_name[..passthrough_up_to_extended] |
948 | }; |
949 | // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII. |
950 | sink.write_str(unsafe { |
951 | core::str::from_utf8_unchecked(slice_to_write) |
952 | })?; |
953 | for c in tail.iter() { |
954 | sink.write_char(char::from(c.to_ascii_lowercase()))?; |
955 | } |
956 | } else if flushed_prefix { |
957 | // SAFETY: `mixed_case` is known to be ASCII. |
958 | sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?; |
959 | } else { |
960 | passthrough_up_to_extended += mixed_case.len(); |
961 | } |
962 | continue; |
963 | } |
964 | |
965 | if is_ascii(label) { |
966 | if !flushed_prefix { |
967 | flushed_prefix = true; |
968 | // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII. |
969 | sink.write_str(unsafe { |
970 | core::str::from_utf8_unchecked( |
971 | &domain_name[..passthrough_up_to_extended], |
972 | ) |
973 | })?; |
974 | } |
975 | for c in label.iter().copied() { |
976 | sink.write_char(c)?; |
977 | } |
978 | } else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode |
979 | { |
980 | if let Some(first_upper_case) = |
981 | mixed_case.iter().position(|c| c.is_ascii_uppercase()) |
982 | { |
983 | let (head, tail) = mixed_case.split_at(first_upper_case); |
984 | let slice_to_write = if flushed_prefix { |
985 | head |
986 | } else { |
987 | flushed_prefix = true; |
988 | passthrough_up_to_extended += head.len(); |
989 | debug_assert_ne!(passthrough_up_to_extended, domain_name.len()); |
990 | &domain_name[..passthrough_up_to_extended] |
991 | }; |
992 | // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII. |
993 | sink.write_str(unsafe { |
994 | core::str::from_utf8_unchecked(slice_to_write) |
995 | })?; |
996 | for c in tail.iter() { |
997 | sink.write_char(char::from(c.to_ascii_lowercase()))?; |
998 | } |
999 | } else if flushed_prefix { |
1000 | // SAFETY: `mixed_case` is known to be ASCII. |
1001 | sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?; |
1002 | } else { |
1003 | passthrough_up_to_extended += mixed_case.len(); |
1004 | } |
1005 | } else { |
1006 | if !flushed_prefix { |
1007 | flushed_prefix = true; |
1008 | // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII. |
1009 | sink.write_str(unsafe { |
1010 | core::str::from_utf8_unchecked( |
1011 | &domain_name[..passthrough_up_to_extended], |
1012 | ) |
1013 | })?; |
1014 | } |
1015 | write_punycode_label(label, sink)?; |
1016 | } |
1017 | } |
1018 | if !flushed_prefix { |
1019 | // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII. |
1020 | sink.write_str(unsafe { |
1021 | core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended]) |
1022 | })?; |
1023 | } |
1024 | } |
1025 | } |
1026 | Ok(ProcessingSuccess::WroteToSink) |
1027 | } |
1028 | |
1029 | /// The part of `process` that doesn't need to be generic over the sink. |
1030 | #[inline (always)] |
1031 | fn process_inner<'a>( |
1032 | &self, |
1033 | domain_name: &'a [u8], |
1034 | ascii_deny_list: AsciiDenyList, |
1035 | hyphens: Hyphens, |
1036 | fail_fast: bool, |
1037 | domain_buffer: &mut SmallVec<[char; 253]>, |
1038 | already_punycode: &mut SmallVec<[AlreadyAsciiLabel<'a>; 8]>, |
1039 | ) -> (usize, bool, bool) { |
1040 | // Sadly, this even faster-path ASCII tier is needed to avoid regressing |
1041 | // performance. |
1042 | let mut iter = domain_name.iter(); |
1043 | let mut most_recent_label_start = iter.clone(); |
1044 | loop { |
1045 | if let Some(&b) = iter.next() { |
1046 | if in_inclusive_range8(b, b'a' , b'z' ) { |
1047 | continue; |
1048 | } |
1049 | if b == b'.' { |
1050 | most_recent_label_start = iter.clone(); |
1051 | continue; |
1052 | } |
1053 | return self.process_innermost( |
1054 | domain_name, |
1055 | ascii_deny_list, |
1056 | hyphens, |
1057 | fail_fast, |
1058 | domain_buffer, |
1059 | already_punycode, |
1060 | most_recent_label_start.as_slice(), |
1061 | ); |
1062 | } else { |
1063 | // Success! The whole input passes through on the fastest path! |
1064 | return (domain_name.len(), false, false); |
1065 | } |
1066 | } |
1067 | } |
1068 | |
1069 | /// The part of `process` that doesn't need to be generic over the sink and |
1070 | /// can avoid monomorphizing in the interest of code size. |
1071 | /// Separating this into a different stack frame compared to `process_inner` |
1072 | /// improves performance in the ICU4X case. |
1073 | #[allow (clippy::too_many_arguments)] |
1074 | #[inline (never)] |
1075 | fn process_innermost<'a>( |
1076 | &self, |
1077 | domain_name: &'a [u8], |
1078 | ascii_deny_list: AsciiDenyList, |
1079 | hyphens: Hyphens, |
1080 | fail_fast: bool, |
1081 | domain_buffer: &mut SmallVec<[char; 253]>, |
1082 | already_punycode: &mut SmallVec<[AlreadyAsciiLabel<'a>; 8]>, |
1083 | tail: &'a [u8], |
1084 | ) -> (usize, bool, bool) { |
1085 | let deny_list = ascii_deny_list.bits; |
1086 | let deny_list_deny_dot = deny_list | DOT_MASK; |
1087 | |
1088 | let mut had_errors = false; |
1089 | |
1090 | let mut passthrough_up_to = domain_name.len() - tail.len(); // Index into `domain_name` |
1091 | // 253 ASCII characters is the max length for a valid domain name |
1092 | // (excluding the root dot). |
1093 | let mut current_label_start; // Index into `domain_buffer` |
1094 | let mut seen_label = false; |
1095 | let mut in_prefix = true; |
1096 | for label in tail.split(|b| *b == b'.' ) { |
1097 | // We check for passthrough only for the prefix. That is, if we |
1098 | // haven't moved on and started filling `domain_buffer`. Keeping |
1099 | // this stuff in one loop where the first items keep being skipped |
1100 | // once they have been skipped at least once instead of working |
1101 | // this into a fancier loop structure in order to make sure that |
1102 | // no item from the iterator is lost or processed twice. |
1103 | // Furthermore, after the passthrough fails, restarting the |
1104 | // normalization process after each pre-existing ASCII dot also |
1105 | // provides an opportunity for the processing to get back onto |
1106 | // an ASCII fast path that bypasses the normalizer for ASCII |
1107 | // after a pre-existing ASCII dot (pre-existing in the sense |
1108 | // of not coming from e.g. normalizing an ideographic dot). |
1109 | if in_prefix && is_passthrough_ascii_label(label) { |
1110 | if seen_label { |
1111 | debug_assert_eq!(domain_name[passthrough_up_to], b'.' ); |
1112 | passthrough_up_to += 1; |
1113 | } |
1114 | seen_label = true; |
1115 | |
1116 | passthrough_up_to += label.len(); |
1117 | continue; |
1118 | } |
1119 | if seen_label { |
1120 | if in_prefix { |
1121 | debug_assert_eq!(domain_name[passthrough_up_to], b'.' ); |
1122 | passthrough_up_to += 1; |
1123 | } else { |
1124 | domain_buffer.push('.' ); |
1125 | } |
1126 | } |
1127 | seen_label = true; |
1128 | in_prefix = false; |
1129 | current_label_start = domain_buffer.len(); |
1130 | if !label.is_empty() { |
1131 | let (ascii, non_ascii) = split_ascii_fast_path_prefix(label); |
1132 | let non_punycode_ascii_label = if non_ascii.is_empty() { |
1133 | if has_punycode_prefix(ascii) { |
1134 | if (ascii.last() != Some(&b'-' )) |
1135 | && (ascii.len() - 4 <= PUNYCODE_DECODE_MAX_INPUT_LENGTH) |
1136 | { |
1137 | if let Ok(decode) = |
1138 | Decoder::default().decode::<u8, InternalCaller>(&ascii[4..]) |
1139 | { |
1140 | // 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4 |
1141 | // characters. |
1142 | let mut label_buffer = SmallVec::<[char; 59]>::new(); |
1143 | label_buffer.extend(decode); |
1144 | |
1145 | if self.after_punycode_decode( |
1146 | domain_buffer, |
1147 | current_label_start, |
1148 | &label_buffer, |
1149 | deny_list_deny_dot, |
1150 | fail_fast, |
1151 | &mut had_errors, |
1152 | ) { |
1153 | return (0, false, true); |
1154 | } |
1155 | |
1156 | if self.check_label( |
1157 | hyphens, |
1158 | &mut domain_buffer[current_label_start..], |
1159 | fail_fast, |
1160 | &mut had_errors, |
1161 | true, |
1162 | true, |
1163 | ) { |
1164 | return (0, false, true); |
1165 | } |
1166 | } else { |
1167 | // Punycode failed |
1168 | if fail_fast { |
1169 | return (0, false, true); |
1170 | } |
1171 | had_errors = true; |
1172 | domain_buffer.push(' \u{FFFD}' ); |
1173 | let mut iter = ascii.iter(); |
1174 | // Discard the first character that we replaced. |
1175 | let _ = iter.next(); |
1176 | domain_buffer.extend(iter.map(|c| { |
1177 | // Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does |
1178 | // not matter. |
1179 | apply_ascii_deny_list_to_potentially_upper_case_ascii( |
1180 | *c, deny_list, |
1181 | ) |
1182 | })); |
1183 | }; |
1184 | // If there were errors, we won't be trying to use this |
1185 | // anyway later, so it's fine to put it here unconditionally. |
1186 | already_punycode.push(AlreadyAsciiLabel::MixedCasePunycode(label)); |
1187 | continue; |
1188 | } else if fail_fast { |
1189 | return (0, false, true); |
1190 | } |
1191 | // Else fall through to the complex path and rediscover error |
1192 | // there. |
1193 | false |
1194 | } else { |
1195 | true |
1196 | } |
1197 | } else { |
1198 | false |
1199 | }; |
1200 | for c in ascii.iter().map(|c| { |
1201 | // Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does |
1202 | // not matter. |
1203 | apply_ascii_deny_list_to_potentially_upper_case_ascii(*c, deny_list) |
1204 | }) { |
1205 | if c == ' \u{FFFD}' { |
1206 | if fail_fast { |
1207 | return (0, false, true); |
1208 | } |
1209 | had_errors = true; |
1210 | } |
1211 | domain_buffer.push(c); |
1212 | } |
1213 | if non_punycode_ascii_label { |
1214 | if hyphens != Hyphens::Allow |
1215 | && check_hyphens( |
1216 | &mut domain_buffer[current_label_start..], |
1217 | hyphens == Hyphens::CheckFirstLast, |
1218 | fail_fast, |
1219 | &mut had_errors, |
1220 | ) |
1221 | { |
1222 | return (0, false, true); |
1223 | } |
1224 | already_punycode.push(if had_errors { |
1225 | AlreadyAsciiLabel::Other |
1226 | } else { |
1227 | AlreadyAsciiLabel::MixedCaseAscii(label) |
1228 | }); |
1229 | continue; |
1230 | } |
1231 | already_punycode.push(AlreadyAsciiLabel::Other); |
1232 | let mut first_needs_combining_mark_check = ascii.is_empty(); |
1233 | let mut needs_contextj_check = !non_ascii.is_empty(); |
1234 | let mut mapping = self |
1235 | .data |
1236 | .map_normalize(non_ascii.chars()) |
1237 | .map(|c| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list)); |
1238 | loop { |
1239 | let n = mapping.next(); |
1240 | match n { |
1241 | None | Some('.' ) => { |
1242 | if domain_buffer[current_label_start..] |
1243 | .starts_with(&['x' , 'n' , '-' , '-' ]) |
1244 | { |
1245 | let mut punycode_precondition_failed = false; |
1246 | for c in domain_buffer[current_label_start + 4..].iter_mut() { |
1247 | if !c.is_ascii() { |
1248 | if fail_fast { |
1249 | return (0, false, true); |
1250 | } |
1251 | had_errors = true; |
1252 | *c = ' \u{FFFD}' ; |
1253 | punycode_precondition_failed = true; |
1254 | } |
1255 | } |
1256 | |
1257 | if let Some(last) = domain_buffer.last_mut() { |
1258 | if *last == '-' { |
1259 | // Either there's nothing after the "xn--" prefix |
1260 | // and we got the last hyphen of "xn--", or there |
1261 | // are no Punycode digits after the last delimiter |
1262 | // which would result in Punycode decode outputting |
1263 | // ASCII only. |
1264 | if fail_fast { |
1265 | return (0, false, true); |
1266 | } |
1267 | had_errors = true; |
1268 | *last = ' \u{FFFD}' ; |
1269 | punycode_precondition_failed = true; |
1270 | } |
1271 | } else { |
1272 | unreachable!(); |
1273 | } |
1274 | |
1275 | // Reject excessively long input |
1276 | // https://github.com/whatwg/url/issues/824 |
1277 | // https://unicode-org.atlassian.net/browse/ICU-13727 |
1278 | if domain_buffer.len() - current_label_start - 4 |
1279 | > PUNYCODE_DECODE_MAX_INPUT_LENGTH |
1280 | { |
1281 | if fail_fast { |
1282 | return (0, false, true); |
1283 | } |
1284 | had_errors = true; |
1285 | domain_buffer[current_label_start |
1286 | + 4 |
1287 | + PUNYCODE_DECODE_MAX_INPUT_LENGTH] = ' \u{FFFD}' ; |
1288 | punycode_precondition_failed = true; |
1289 | } |
1290 | |
1291 | if !punycode_precondition_failed { |
1292 | if let Ok(decode) = Decoder::default() |
1293 | .decode::<char, InternalCaller>( |
1294 | &domain_buffer[current_label_start + 4..], |
1295 | ) |
1296 | { |
1297 | first_needs_combining_mark_check = true; |
1298 | needs_contextj_check = true; |
1299 | // 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4 |
1300 | // characters. |
1301 | let mut label_buffer = SmallVec::<[char; 59]>::new(); |
1302 | label_buffer.extend(decode); |
1303 | |
1304 | domain_buffer.truncate(current_label_start); |
1305 | if self.after_punycode_decode( |
1306 | domain_buffer, |
1307 | current_label_start, |
1308 | &label_buffer, |
1309 | deny_list_deny_dot, |
1310 | fail_fast, |
1311 | &mut had_errors, |
1312 | ) { |
1313 | return (0, false, true); |
1314 | } |
1315 | } else { |
1316 | // Punycode failed |
1317 | if fail_fast { |
1318 | return (0, false, true); |
1319 | } |
1320 | had_errors = true; |
1321 | domain_buffer[current_label_start] = ' \u{FFFD}' ; |
1322 | needs_contextj_check = false; // ASCII label |
1323 | first_needs_combining_mark_check = false; |
1324 | }; |
1325 | } else { |
1326 | first_needs_combining_mark_check = false; |
1327 | needs_contextj_check = false; // Non-ASCII already turned to U+FFFD. |
1328 | } |
1329 | } |
1330 | if self.check_label( |
1331 | hyphens, |
1332 | &mut domain_buffer[current_label_start..], |
1333 | fail_fast, |
1334 | &mut had_errors, |
1335 | first_needs_combining_mark_check, |
1336 | needs_contextj_check, |
1337 | ) { |
1338 | return (0, false, true); |
1339 | } |
1340 | |
1341 | if n.is_none() { |
1342 | break; |
1343 | } |
1344 | domain_buffer.push('.' ); |
1345 | current_label_start = domain_buffer.len(); |
1346 | first_needs_combining_mark_check = true; |
1347 | needs_contextj_check = true; |
1348 | already_punycode.push(AlreadyAsciiLabel::Other); |
1349 | } |
1350 | Some(c) => { |
1351 | if c == ' \u{FFFD}' { |
1352 | if fail_fast { |
1353 | return (0, false, true); |
1354 | } |
1355 | had_errors = true; |
1356 | } |
1357 | domain_buffer.push(c); |
1358 | } |
1359 | } |
1360 | } |
1361 | } else { |
1362 | // Empty label |
1363 | already_punycode.push(AlreadyAsciiLabel::MixedCaseAscii(label)); |
1364 | } |
1365 | } |
1366 | |
1367 | let is_bidi = self.is_bidi(domain_buffer); |
1368 | if is_bidi { |
1369 | for label in domain_buffer.split_mut(|c| *c == '.' ) { |
1370 | if let Some((first, tail)) = label.split_first_mut() { |
1371 | let first_bc = self.data.bidi_class(*first); |
1372 | if !FIRST_BC_MASK.intersects(first_bc.to_mask()) { |
1373 | // Neither RTL label nor LTR label |
1374 | if fail_fast { |
1375 | return (0, false, true); |
1376 | } |
1377 | had_errors = true; |
1378 | *first = ' \u{FFFD}' ; |
1379 | continue; |
1380 | } |
1381 | let is_ltr = first_bc.is_ltr(); |
1382 | // Trim NSM |
1383 | let mut middle = tail; |
1384 | #[allow (clippy::while_let_loop)] |
1385 | loop { |
1386 | if let Some((last, prior)) = middle.split_last_mut() { |
1387 | let last_bc = self.data.bidi_class(*last); |
1388 | if last_bc.is_nonspacing_mark() { |
1389 | middle = prior; |
1390 | continue; |
1391 | } |
1392 | let last_mask = if is_ltr { LAST_LTR_MASK } else { LAST_RTL_MASK }; |
1393 | if !last_mask.intersects(last_bc.to_mask()) { |
1394 | if fail_fast { |
1395 | return (0, false, true); |
1396 | } |
1397 | had_errors = true; |
1398 | *last = ' \u{FFFD}' ; |
1399 | } |
1400 | if is_ltr { |
1401 | for c in prior.iter_mut() { |
1402 | let bc = self.data.bidi_class(*c); |
1403 | if !MIDDLE_LTR_MASK.intersects(bc.to_mask()) { |
1404 | if fail_fast { |
1405 | return (0, false, true); |
1406 | } |
1407 | had_errors = true; |
1408 | *c = ' \u{FFFD}' ; |
1409 | } |
1410 | } |
1411 | } else { |
1412 | let mut numeral_state = RtlNumeralState::Undecided; |
1413 | for c in prior.iter_mut() { |
1414 | let bc = self.data.bidi_class(*c); |
1415 | if !MIDDLE_RTL_MASK.intersects(bc.to_mask()) { |
1416 | if fail_fast { |
1417 | return (0, false, true); |
1418 | } |
1419 | had_errors = true; |
1420 | *c = ' \u{FFFD}' ; |
1421 | } else { |
1422 | match numeral_state { |
1423 | RtlNumeralState::Undecided => { |
1424 | if bc.is_european_number() { |
1425 | numeral_state = RtlNumeralState::European; |
1426 | } else if bc.is_arabic_number() { |
1427 | numeral_state = RtlNumeralState::Arabic; |
1428 | } |
1429 | } |
1430 | RtlNumeralState::European => { |
1431 | if bc.is_arabic_number() { |
1432 | if fail_fast { |
1433 | return (0, false, true); |
1434 | } |
1435 | had_errors = true; |
1436 | *c = ' \u{FFFD}' ; |
1437 | } |
1438 | } |
1439 | RtlNumeralState::Arabic => { |
1440 | if bc.is_european_number() { |
1441 | if fail_fast { |
1442 | return (0, false, true); |
1443 | } |
1444 | had_errors = true; |
1445 | *c = ' \u{FFFD}' ; |
1446 | } |
1447 | } |
1448 | } |
1449 | } |
1450 | } |
1451 | if (numeral_state == RtlNumeralState::European |
1452 | && last_bc.is_arabic_number()) |
1453 | || (numeral_state == RtlNumeralState::Arabic |
1454 | && last_bc.is_european_number()) |
1455 | { |
1456 | if fail_fast { |
1457 | return (0, false, true); |
1458 | } |
1459 | had_errors = true; |
1460 | *last = ' \u{FFFD}' ; |
1461 | } |
1462 | } |
1463 | break; |
1464 | } else { |
1465 | // One-character label or label where |
1466 | // everything after the first character |
1467 | // is just non-spacing marks. |
1468 | break; |
1469 | } |
1470 | } |
1471 | } |
1472 | } |
1473 | } |
1474 | |
1475 | (passthrough_up_to, is_bidi, had_errors) |
1476 | } |
1477 | |
1478 | #[inline (never)] |
1479 | fn after_punycode_decode( |
1480 | &self, |
1481 | domain_buffer: &mut SmallVec<[char; 253]>, |
1482 | current_label_start: usize, |
1483 | label_buffer: &[char], |
1484 | deny_list_deny_dot: u128, |
1485 | fail_fast: bool, |
1486 | had_errors: &mut bool, |
1487 | ) -> bool { |
1488 | for c in self |
1489 | .data |
1490 | .normalize_validate(label_buffer.iter().copied()) |
1491 | .map(|c| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list_deny_dot)) |
1492 | { |
1493 | if c == ' \u{FFFD}' { |
1494 | if fail_fast { |
1495 | return true; |
1496 | } |
1497 | *had_errors = true; |
1498 | } |
1499 | domain_buffer.push(c); |
1500 | } |
1501 | let normalized = &mut domain_buffer[current_label_start..]; |
1502 | if let Err(()) = |
1503 | normalized |
1504 | .iter_mut() |
1505 | .zip(label_buffer.iter()) |
1506 | .try_for_each(|(norm_c, decoded_c)| { |
1507 | if *norm_c == *decoded_c { |
1508 | Ok(()) |
1509 | } else { |
1510 | // Mark the first difference |
1511 | *norm_c = ' \u{FFFD}' ; |
1512 | Err(()) |
1513 | } |
1514 | }) |
1515 | { |
1516 | if fail_fast { |
1517 | return true; |
1518 | } |
1519 | *had_errors = true; |
1520 | } |
1521 | false |
1522 | } |
1523 | |
1524 | #[inline (never)] |
1525 | fn check_label( |
1526 | &self, |
1527 | hyphens: Hyphens, |
1528 | mut_label: &mut [char], |
1529 | fail_fast: bool, |
1530 | had_errors: &mut bool, |
1531 | first_needs_combining_mark_check: bool, |
1532 | needs_contextj_check: bool, |
1533 | ) -> bool { |
1534 | if hyphens != Hyphens::Allow |
1535 | && check_hyphens( |
1536 | mut_label, |
1537 | hyphens == Hyphens::CheckFirstLast, |
1538 | fail_fast, |
1539 | had_errors, |
1540 | ) |
1541 | { |
1542 | return true; |
1543 | } |
1544 | if first_needs_combining_mark_check { |
1545 | if let Some(first) = mut_label.first_mut() { |
1546 | if self.data.is_mark(*first) { |
1547 | if fail_fast { |
1548 | return true; |
1549 | } |
1550 | *had_errors = true; |
1551 | *first = ' \u{FFFD}' ; |
1552 | } |
1553 | } |
1554 | } |
1555 | if needs_contextj_check { |
1556 | // ContextJ |
1557 | for i in 0..mut_label.len() { |
1558 | let c = mut_label[i]; |
1559 | if !in_inclusive_range_char(c, ' \u{200C}' , ' \u{200D}' ) { |
1560 | continue; |
1561 | } |
1562 | let (head, joiner_and_tail) = mut_label.split_at_mut(i); |
1563 | |
1564 | if let Some((joiner, tail)) = joiner_and_tail.split_first_mut() { |
1565 | if let Some(previous) = head.last() { |
1566 | if self.data.is_virama(*previous) { |
1567 | continue; |
1568 | } |
1569 | } else { |
1570 | // No preceding character |
1571 | if fail_fast { |
1572 | return true; |
1573 | } |
1574 | *had_errors = true; |
1575 | *joiner = ' \u{FFFD}' ; |
1576 | continue; |
1577 | } |
1578 | if c == ' \u{200D}' { |
1579 | // ZWJ only has the virama rule |
1580 | if fail_fast { |
1581 | return true; |
1582 | } |
1583 | *had_errors = true; |
1584 | *joiner = ' \u{FFFD}' ; |
1585 | continue; |
1586 | } |
1587 | debug_assert_eq!(c, ' \u{200C}' ); |
1588 | if !self.has_appropriately_joining_char( |
1589 | head.iter().rev().copied(), |
1590 | LEFT_OR_DUAL_JOINING_MASK, |
1591 | ) || !self.has_appropriately_joining_char( |
1592 | tail.iter().copied(), |
1593 | RIGHT_OR_DUAL_JOINING_MASK, |
1594 | ) { |
1595 | if fail_fast { |
1596 | return true; |
1597 | } |
1598 | *had_errors = true; |
1599 | *joiner = ' \u{FFFD}' ; |
1600 | } |
1601 | } else { |
1602 | debug_assert!(false); |
1603 | } |
1604 | } |
1605 | } |
1606 | |
1607 | if !is_ascii(mut_label) && mut_label.len() > PUNYCODE_ENCODE_MAX_INPUT_LENGTH { |
1608 | // Limit quadratic behavior |
1609 | // https://github.com/whatwg/url/issues/824 |
1610 | // https://unicode-org.atlassian.net/browse/ICU-13727 |
1611 | if fail_fast { |
1612 | return true; |
1613 | } |
1614 | *had_errors = true; |
1615 | mut_label[PUNYCODE_ENCODE_MAX_INPUT_LENGTH] = ' \u{FFFD}' ; |
1616 | } |
1617 | false |
1618 | } |
1619 | |
1620 | #[inline (always)] |
1621 | fn has_appropriately_joining_char<I: Iterator<Item = char>>( |
1622 | &self, |
1623 | iter: I, |
1624 | required_mask: JoiningTypeMask, |
1625 | ) -> bool { |
1626 | for c in iter { |
1627 | let jt = self.data.joining_type(c); |
1628 | if jt.to_mask().intersects(required_mask) { |
1629 | return true; |
1630 | } |
1631 | if jt.is_transparent() { |
1632 | continue; |
1633 | } |
1634 | return false; |
1635 | } |
1636 | false |
1637 | } |
1638 | |
1639 | #[inline (always)] |
1640 | fn is_bidi(&self, buffer: &[char]) -> bool { |
1641 | for &c in buffer { |
1642 | if c < ' \u{0590}' { |
1643 | // Below Hebrew |
1644 | continue; |
1645 | } |
1646 | if in_inclusive_range_char(c, ' \u{0900}' , ' \u{FB1C}' ) { |
1647 | debug_assert_ne!(c, ' \u{200F}' ); // disallowed |
1648 | continue; |
1649 | } |
1650 | if in_inclusive_range_char(c, ' \u{1F000}' , ' \u{3FFFF}' ) { |
1651 | continue; |
1652 | } |
1653 | if in_inclusive_range_char(c, ' \u{FF00}' , ' \u{107FF}' ) { |
1654 | continue; |
1655 | } |
1656 | if in_inclusive_range_char(c, ' \u{11000}' , ' \u{1E7FF}' ) { |
1657 | continue; |
1658 | } |
1659 | if RTL_MASK.intersects(self.data.bidi_class(c).to_mask()) { |
1660 | return true; |
1661 | } |
1662 | } |
1663 | false |
1664 | } |
1665 | } |
1666 | |
/// Checks hyphen placement in a label, overwriting offending hyphens with U+FFFD.
///
/// A hyphen in the first or last position is always an error. Hyphens in both
/// the third and fourth positions are an error unless `allow_third_fourth` is
/// `true`.
///
/// Returns `true` only if `fail_fast` is `true` and an error was found; in that
/// case the offending character is left untouched and `*had_errors` is not
/// set. Otherwise errors are recorded by setting `*had_errors` and `false` is
/// returned.
fn check_hyphens(
    mut_label: &mut [char],
    allow_third_fourth: bool,
    fail_fast: bool,
    had_errors: &mut bool,
) -> bool {
    if let Some(first) = mut_label.first_mut() {
        if *first == '-' {
            if fail_fast {
                return true;
            }
            *had_errors = true;
            *first = '\u{FFFD}';
        }
    }
    // For a one-character label, the first check has already replaced the
    // hyphen, so it is not reported twice here.
    if let Some(last) = mut_label.last_mut() {
        if *last == '-' {
            if fail_fast {
                return true;
            }
            *had_errors = true;
            *last = '\u{FFFD}';
        }
    }
    if allow_third_fourth {
        return false;
    }
    if mut_label.len() >= 4 && mut_label[2] == '-' && mut_label[3] == '-' {
        if fail_fast {
            return true;
        }
        *had_errors = true;
        mut_label[2] = '\u{FFFD}';
        mut_label[3] = '\u{FFFD}';
    }
    false
}
1704 | |