1// Copyright 2013-2014 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! [*Unicode IDNA Compatibility Processing*
10//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
11
12use self::Mapping::*;
13use crate::punycode;
14
15use alloc::string::String;
16use core::fmt;
17use unicode_bidi::{bidi_class, BidiClass};
18use unicode_normalization::char::is_combining_mark;
19use unicode_normalization::{is_nfc, UnicodeNormalization};
20
21include!("uts46_mapping_table.rs");
22
23const PUNYCODE_PREFIX: &str = "xn--";
24
/// Compact reference to a substring of `STRING_TABLE`; `decode_slice` turns it
/// back into a `&'static str`.
#[derive(Debug)]
struct StringTableSlice {
    // Store these as separate fields so the structure will have an
    // alignment of 1 and thus pack better into the Mapping enum, below.
    /// Low 8 bits of the starting byte offset into `STRING_TABLE`.
    byte_start_lo: u8,
    /// High 8 bits of the starting byte offset into `STRING_TABLE`.
    byte_start_hi: u8,
    /// Length of the referenced substring, in bytes.
    byte_len: u8,
}
33
34fn decode_slice(slice: &StringTableSlice) -> &'static str {
35 let lo: usize = slice.byte_start_lo as usize;
36 let hi: usize = slice.byte_start_hi as usize;
37 let start: usize = (hi << 8) | lo;
38 let len: usize = slice.byte_len as usize;
39 &STRING_TABLE[start..(start + len)]
40}
41
/// Status of a code point in the mapping table, as returned by `find_char`.
///
/// The per-variant notes describe how `Mapper::next` treats each status.
#[repr(u8)]
#[derive(Debug)]
enum Mapping {
    /// The code point is emitted unchanged.
    Valid,
    /// The code point is dropped from the output.
    Ignored,
    /// The code point is replaced by the referenced string-table slice.
    Mapped(StringTableSlice),
    /// Replaced by the slice under transitional processing; otherwise emitted unchanged.
    Deviation(StringTableSlice),
    /// Emitted unchanged, and the `disallowed_character` error is recorded.
    Disallowed,
    /// Emitted unchanged; an error is recorded only when `use_std3_ascii_rules` is set.
    DisallowedStd3Valid,
    /// Replaced by the slice; an error is recorded only when `use_std3_ascii_rules` is set.
    DisallowedStd3Mapped(StringTableSlice),
    /// Emitted unchanged; an error is recorded only when `use_idna_2008_rules` is set.
    DisallowedIdna2008,
}
54
55fn find_char(codepoint: char) -> &'static Mapping {
56 let idx: usize = match TABLE.binary_search_by_key(&codepoint, |&val: (char, u16)| val.0) {
57 Ok(idx: usize) => idx,
58 Err(idx: usize) => idx - 1,
59 };
60
61 const SINGLE_MARKER: u16 = 1 << 15;
62
63 let (base: char, x: u16) = TABLE[idx];
64 let single: bool = (x & SINGLE_MARKER) != 0;
65 let offset: u16 = !SINGLE_MARKER & x;
66
67 if single {
68 &MAPPING_TABLE[offset as usize]
69 } else {
70 &MAPPING_TABLE[(offset + (codepoint as u16 - base as u16)) as usize]
71 }
72}
73
/// Iterator adapter that applies the mapping table to each input character,
/// recording any problems it encounters in `errors`.
struct Mapper<'a> {
    /// Remaining input characters.
    chars: core::str::Chars<'a>,
    /// Processing options consulted for deviation, STD3 and IDNA 2008 handling.
    config: Config,
    /// Error flags set while mapping.
    errors: &'a mut Errors,
    /// Characters of a replacement string that still have to be yielded
    /// before the next input character is read.
    slice: Option<core::str::Chars<'static>>,
}
80
81impl<'a> Iterator for Mapper<'a> {
82 type Item = char;
83
84 fn next(&mut self) -> Option<Self::Item> {
85 loop {
86 if let Some(s) = &mut self.slice {
87 match s.next() {
88 Some(c) => return Some(c),
89 None => {
90 self.slice = None;
91 }
92 }
93 }
94
95 let codepoint = self.chars.next()?;
96 if let '.' | '-' | 'a'..='z' | '0'..='9' = codepoint {
97 return Some(codepoint);
98 }
99
100 return Some(match *find_char(codepoint) {
101 Mapping::Valid => codepoint,
102 Mapping::Ignored => continue,
103 Mapping::Mapped(ref slice) => {
104 self.slice = Some(decode_slice(slice).chars());
105 continue;
106 }
107 Mapping::Deviation(ref slice) => {
108 if self.config.transitional_processing {
109 self.slice = Some(decode_slice(slice).chars());
110 continue;
111 } else {
112 codepoint
113 }
114 }
115 Mapping::Disallowed => {
116 self.errors.disallowed_character = true;
117 codepoint
118 }
119 Mapping::DisallowedStd3Valid => {
120 if self.config.use_std3_ascii_rules {
121 self.errors.disallowed_by_std3_ascii_rules = true;
122 };
123 codepoint
124 }
125 Mapping::DisallowedStd3Mapped(ref slice) => {
126 if self.config.use_std3_ascii_rules {
127 self.errors.disallowed_mapped_in_std3 = true;
128 };
129 self.slice = Some(decode_slice(slice).chars());
130 continue;
131 }
132 Mapping::DisallowedIdna2008 => {
133 if self.config.use_idna_2008_rules {
134 self.errors.disallowed_in_idna_2008 = true;
135 }
136 codepoint
137 }
138 });
139 }
140 }
141}
142
143// http://tools.ietf.org/html/rfc5893#section-2
144fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
145 // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label
146 // is RTL if it contains at least one character of bidi class R, AL or AN.
147 if !is_bidi_domain {
148 return true;
149 }
150
151 let mut chars = label.chars();
152 let first_char_class = match chars.next() {
153 Some(c) => bidi_class(c),
154 None => return true, // empty string
155 };
156
157 match first_char_class {
158 // LTR label
159 BidiClass::L => {
160 // Rule 5
161 for c in chars.by_ref() {
162 if !matches!(
163 bidi_class(c),
164 BidiClass::L
165 | BidiClass::EN
166 | BidiClass::ES
167 | BidiClass::CS
168 | BidiClass::ET
169 | BidiClass::ON
170 | BidiClass::BN
171 | BidiClass::NSM
172 ) {
173 return false;
174 }
175 }
176
177 // Rule 6
178 // must end in L or EN followed by 0 or more NSM
179 let mut rev_chars = label.chars().rev();
180 let mut last_non_nsm = rev_chars.next();
181 loop {
182 match last_non_nsm {
183 Some(c) if bidi_class(c) == BidiClass::NSM => {
184 last_non_nsm = rev_chars.next();
185 continue;
186 }
187 _ => {
188 break;
189 }
190 }
191 }
192 match last_non_nsm {
193 Some(c) if bidi_class(c) == BidiClass::L || bidi_class(c) == BidiClass::EN => {}
194 Some(_) => {
195 return false;
196 }
197 _ => {}
198 }
199 }
200
201 // RTL label
202 BidiClass::R | BidiClass::AL => {
203 let mut found_en = false;
204 let mut found_an = false;
205
206 // Rule 2
207 for c in chars {
208 let char_class = bidi_class(c);
209 if char_class == BidiClass::EN {
210 found_en = true;
211 } else if char_class == BidiClass::AN {
212 found_an = true;
213 }
214
215 if !matches!(
216 char_class,
217 BidiClass::R
218 | BidiClass::AL
219 | BidiClass::AN
220 | BidiClass::EN
221 | BidiClass::ES
222 | BidiClass::CS
223 | BidiClass::ET
224 | BidiClass::ON
225 | BidiClass::BN
226 | BidiClass::NSM
227 ) {
228 return false;
229 }
230 }
231 // Rule 3
232 let mut rev_chars = label.chars().rev();
233 let mut last = rev_chars.next();
234 loop {
235 // must end in L or EN followed by 0 or more NSM
236 match last {
237 Some(c) if bidi_class(c) == BidiClass::NSM => {
238 last = rev_chars.next();
239 continue;
240 }
241 _ => {
242 break;
243 }
244 }
245 }
246 match last {
247 Some(c)
248 if matches!(
249 bidi_class(c),
250 BidiClass::R | BidiClass::AL | BidiClass::EN | BidiClass::AN
251 ) => {}
252 _ => {
253 return false;
254 }
255 }
256
257 // Rule 4
258 if found_an && found_en {
259 return false;
260 }
261 }
262
263 // Rule 1: Should start with L or R/AL
264 _ => {
265 return false;
266 }
267 }
268
269 true
270}
271
272/// Check the validity criteria for the given label
273///
274/// V1 (NFC) and V8 (Bidi) are checked inside `processing()` to prevent doing duplicate work.
275///
276/// http://www.unicode.org/reports/tr46/#Validity_Criteria
277fn check_validity(label: &str, config: Config, errors: &mut Errors) {
278 let first_char = label.chars().next();
279 if first_char.is_none() {
280 // Empty string, pass
281 return;
282 }
283
284 // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
285 //
286 // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
287 // third and fourth positions. But nobody follows this criteria. See the spec issue below:
288 // https://github.com/whatwg/url/issues/53
289
290 // V3: neither begin nor end with a U+002D HYPHEN-MINUS
291 if config.check_hyphens && (label.starts_with('-') || label.ends_with('-')) {
292 errors.check_hyphens = true;
293 return;
294 }
295
296 // V4: not contain a U+002E FULL STOP
297 //
298 // Here, label can't contain '.' since the input is from .split('.')
299
300 // V5: not begin with a GC=Mark
301 if is_combining_mark(first_char.unwrap()) {
302 errors.start_combining_mark = true;
303 return;
304 }
305
306 // V6: Check against Mapping Table
307 if label.chars().any(|c| match *find_char(c) {
308 Mapping::Valid | Mapping::DisallowedIdna2008 => false,
309 Mapping::Deviation(_) => config.transitional_processing,
310 Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
311 _ => true,
312 }) {
313 errors.invalid_mapping = true;
314 }
315
316 // V7: ContextJ rules
317 //
318 // TODO: Implement rules and add *CheckJoiners* flag.
319
320 // V8: Bidi rules are checked inside `processing()`
321}
322
/// Detects domains that need no IDNA processing at all: a non-empty string
/// made up solely of lowercase ASCII letters, ASCII digits and `.` separators.
///
/// Any `-` disqualifies the domain. That is deliberately conservative: it
/// means a label can never start with `PUNYCODE_PREFIX` and can never begin or
/// end with a hyphen, so neither condition needs separate tracking. Returning
/// `false` is always safe; the caller then runs the full processing path.
fn is_simple(domain: &str) -> bool {
    !domain.is_empty()
        && domain
            .chars()
            .all(|c| matches!(c, '.' | 'a'..='z' | '0'..='9'))
}
357
358/// http://www.unicode.org/reports/tr46/#Processing
359fn processing(
360 domain: &str,
361 config: Config,
362 normalized: &mut String,
363 output: &mut String,
364) -> Errors {
365 normalized.clear();
366 let mut errors = Errors::default();
367 let offset = output.len();
368
369 let iter = Mapper {
370 chars: domain.chars(),
371 config,
372 errors: &mut errors,
373 slice: None,
374 };
375
376 normalized.extend(iter.nfc());
377
378 let mut decoder = punycode::Decoder::default();
379 let non_transitional = config.transitional_processing(false);
380 let (mut first, mut has_bidi_labels) = (true, false);
381 for label in normalized.split('.') {
382 if !first {
383 output.push('.');
384 }
385 first = false;
386 if let Some(remainder) = label.strip_prefix(PUNYCODE_PREFIX) {
387 match decoder.decode(remainder) {
388 Ok(decode) => {
389 let start = output.len();
390 output.extend(decode);
391 let decoded_label = &output[start..];
392
393 if !has_bidi_labels {
394 has_bidi_labels |= is_bidi_domain(decoded_label);
395 }
396
397 if !errors.is_err() {
398 if !is_nfc(decoded_label) {
399 errors.nfc = true;
400 } else {
401 check_validity(decoded_label, non_transitional, &mut errors);
402 }
403 }
404 }
405 Err(()) => {
406 has_bidi_labels = true;
407 errors.punycode = true;
408 }
409 }
410 } else {
411 if !has_bidi_labels {
412 has_bidi_labels |= is_bidi_domain(label);
413 }
414
415 // `normalized` is already `NFC` so we can skip that check
416 check_validity(label, config, &mut errors);
417 output.push_str(label)
418 }
419 }
420
421 for label in output[offset..].split('.') {
422 // V8: Bidi rules
423 //
424 // TODO: Add *CheckBidi* flag
425 if !passes_bidi(label, has_bidi_labels) {
426 errors.check_bidi = true;
427 break;
428 }
429 }
430
431 errors
432}
433
/// Reusable IDNA converter: a `Config` plus scratch buffers that are kept
/// between calls.
#[derive(Default)]
pub struct Idna {
    /// Options applied to every conversion.
    config: Config,
    /// Scratch buffer handed to `processing()`, which clears it and fills it
    /// with the mapped, NFC-normalized input.
    normalized: String,
    /// Holds the processed (Unicode) form in `to_ascii_inner` while the ASCII
    /// result is being written to the caller's buffer.
    output: String,
}
440
441impl Idna {
442 pub fn new(config: Config) -> Self {
443 Self {
444 config,
445 normalized: String::new(),
446 output: String::new(),
447 }
448 }
449
450 pub fn to_ascii_inner(&mut self, domain: &str, out: &mut String) -> Errors {
451 if is_simple(domain) {
452 out.push_str(domain);
453 return Errors::default();
454 }
455 let mut errors = processing(domain, self.config, &mut self.normalized, out);
456 self.output = core::mem::replace(out, String::with_capacity(out.len()));
457 let mut first = true;
458 for label in self.output.split('.') {
459 if !first {
460 out.push('.');
461 }
462 first = false;
463
464 if label.is_ascii() {
465 out.push_str(label);
466 } else {
467 let offset = out.len();
468 out.push_str(PUNYCODE_PREFIX);
469 if let Err(()) = punycode::encode_into(label.chars(), out) {
470 errors.punycode = true;
471 out.truncate(offset);
472 }
473 }
474 }
475 errors
476 }
477
478 /// http://www.unicode.org/reports/tr46/#ToASCII
479 #[allow(clippy::wrong_self_convention)]
480 pub fn to_ascii(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
481 let mut errors = self.to_ascii_inner(domain, out);
482
483 if self.config.verify_dns_length {
484 let domain = if out.ends_with('.') {
485 &out[..out.len() - 1]
486 } else {
487 &*out
488 };
489 if domain.is_empty() || domain.split('.').any(|label| label.is_empty()) {
490 errors.too_short_for_dns = true;
491 }
492 if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
493 errors.too_long_for_dns = true;
494 }
495 }
496
497 errors.into()
498 }
499
500 /// http://www.unicode.org/reports/tr46/#ToUnicode
501 #[allow(clippy::wrong_self_convention)]
502 pub fn to_unicode(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
503 if is_simple(domain) {
504 out.push_str(domain);
505 return Errors::default().into();
506 }
507 processing(domain, self.config, &mut self.normalized, out).into()
508 }
509}
510
/// Options controlling UTS #46 processing; built with the chained setter
/// methods of the same names.
#[derive(Clone, Copy)]
#[must_use]
pub struct Config {
    /// When set, code points that are disallowed only under STD3 rules are
    /// reported as errors.
    use_std3_ascii_rules: bool,
    /// When set, deviation code points are replaced by their mapping instead
    /// of being kept.
    transitional_processing: bool,
    /// When set, `to_ascii` also checks the overall and per-label lengths.
    verify_dns_length: bool,
    /// When set, labels that begin or end with `-` are reported as errors.
    check_hyphens: bool,
    /// When set, code points disallowed by IDNA 2008 are reported as errors.
    use_idna_2008_rules: bool,
}
520
521/// The defaults are that of https://url.spec.whatwg.org/#idna
522impl Default for Config {
523 fn default() -> Self {
524 Config {
525 use_std3_ascii_rules: false,
526 transitional_processing: false,
527 check_hyphens: false,
528 // check_bidi: true,
529 // check_joiners: true,
530
531 // Only use for to_ascii, not to_unicode
532 verify_dns_length: false,
533 use_idna_2008_rules: false,
534 }
535 }
536}
537
538impl Config {
539 #[inline]
540 pub fn use_std3_ascii_rules(mut self, value: bool) -> Self {
541 self.use_std3_ascii_rules = value;
542 self
543 }
544
545 #[inline]
546 pub fn transitional_processing(mut self, value: bool) -> Self {
547 self.transitional_processing = value;
548 self
549 }
550
551 #[inline]
552 pub fn verify_dns_length(mut self, value: bool) -> Self {
553 self.verify_dns_length = value;
554 self
555 }
556
557 #[inline]
558 pub fn check_hyphens(mut self, value: bool) -> Self {
559 self.check_hyphens = value;
560 self
561 }
562
563 #[inline]
564 pub fn use_idna_2008_rules(mut self, value: bool) -> Self {
565 self.use_idna_2008_rules = value;
566 self
567 }
568
569 /// http://www.unicode.org/reports/tr46/#ToASCII
570 pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
571 let mut result = String::with_capacity(domain.len());
572 let mut codec = Idna::new(self);
573 codec.to_ascii(domain, &mut result).map(|()| result)
574 }
575
576 /// http://www.unicode.org/reports/tr46/#ToUnicode
577 pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
578 let mut codec = Idna::new(self);
579 let mut out = String::with_capacity(domain.len());
580 let result = codec.to_unicode(domain, &mut out);
581 (out, result)
582 }
583}
584
585fn is_bidi_domain(s: &str) -> bool {
586 for c: char in s.chars() {
587 if c.is_ascii_graphic() {
588 continue;
589 }
590 match bidi_class(c) {
591 BidiClass::R | BidiClass::AL | BidiClass::AN => return true,
592 _ => {}
593 }
594 }
595 false
596}
597
/// Errors recorded during UTS #46 processing.
///
/// This is opaque for now, indicating what types of errors have been encountered at least once.
/// More details may be exposed in the future.
#[derive(Default)]
pub struct Errors {
    /// Punycode decoding or encoding of a label failed.
    punycode: bool,
    /// A label begins or ends with `-` while `check_hyphens` is enabled.
    check_hyphens: bool,
    /// A label failed the Bidi rules.
    check_bidi: bool,
    /// A label begins with a combining mark.
    start_combining_mark: bool,
    /// A label contains a code point the mapping table does not allow there.
    invalid_mapping: bool,
    /// A decoded Punycode label is not in NFC.
    nfc: bool,
    /// A `DisallowedStd3Valid` code point was seen with `use_std3_ascii_rules` enabled.
    disallowed_by_std3_ascii_rules: bool,
    /// A `DisallowedStd3Mapped` code point was seen with `use_std3_ascii_rules` enabled.
    disallowed_mapped_in_std3: bool,
    /// A `Disallowed` code point was seen.
    disallowed_character: bool,
    /// The domain or one of its labels exceeds the DNS length limits.
    too_long_for_dns: bool,
    /// The domain or one of its labels is empty.
    too_short_for_dns: bool,
    /// A `DisallowedIdna2008` code point was seen with `use_idna_2008_rules` enabled.
    disallowed_in_idna_2008: bool,
}
617
618impl Errors {
619 fn is_err(&self) -> bool {
620 let Errors {
621 punycode,
622 check_hyphens,
623 check_bidi,
624 start_combining_mark,
625 invalid_mapping,
626 nfc,
627 disallowed_by_std3_ascii_rules,
628 disallowed_mapped_in_std3,
629 disallowed_character,
630 too_long_for_dns,
631 too_short_for_dns,
632 disallowed_in_idna_2008,
633 } = *self;
634 punycode
635 || check_hyphens
636 || check_bidi
637 || start_combining_mark
638 || invalid_mapping
639 || nfc
640 || disallowed_by_std3_ascii_rules
641 || disallowed_mapped_in_std3
642 || disallowed_character
643 || too_long_for_dns
644 || too_short_for_dns
645 || disallowed_in_idna_2008
646 }
647}
648
649impl fmt::Debug for Errors {
650 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
651 let Errors {
652 punycode,
653 check_hyphens,
654 check_bidi,
655 start_combining_mark,
656 invalid_mapping,
657 nfc,
658 disallowed_by_std3_ascii_rules,
659 disallowed_mapped_in_std3,
660 disallowed_character,
661 too_long_for_dns,
662 too_short_for_dns,
663 disallowed_in_idna_2008,
664 } = *self;
665
666 let fields = [
667 ("punycode", punycode),
668 ("check_hyphens", check_hyphens),
669 ("check_bidi", check_bidi),
670 ("start_combining_mark", start_combining_mark),
671 ("invalid_mapping", invalid_mapping),
672 ("nfc", nfc),
673 (
674 "disallowed_by_std3_ascii_rules",
675 disallowed_by_std3_ascii_rules,
676 ),
677 ("disallowed_mapped_in_std3", disallowed_mapped_in_std3),
678 ("disallowed_character", disallowed_character),
679 ("too_long_for_dns", too_long_for_dns),
680 ("too_short_for_dns", too_short_for_dns),
681 ("disallowed_in_idna_2008", disallowed_in_idna_2008),
682 ];
683
684 let mut empty = true;
685 f.write_str("Errors { ")?;
686 for (name, val) in &fields {
687 if *val {
688 if !empty {
689 f.write_str(", ")?;
690 }
691 f.write_str(name)?;
692 empty = false;
693 }
694 }
695
696 if !empty {
697 f.write_str(" }")
698 } else {
699 f.write_str("}")
700 }
701 }
702}
703
704impl From<Errors> for Result<(), Errors> {
705 fn from(e: Errors) -> Result<(), Errors> {
706 if !e.is_err() {
707 Ok(())
708 } else {
709 Err(e)
710 }
711 }
712}
713
// Only available with the `std` feature enabled.
#[cfg(feature = "std")]
impl std::error::Error for Errors {}
716
717impl fmt::Display for Errors {
718 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
719 fmt::Debug::fmt(self, f)
720 }
721}
722
#[cfg(test)]
mod tests {
    use super::{find_char, Mapping};

    /// Every character that `Mapper` passes through without a table lookup
    /// must be `Valid` in the table as well.
    #[test]
    fn mapping_fast_path() {
        let fast_path_chars = "-.".chars().chain('0'..='9').chain('a'..='z');
        for c in fast_path_chars {
            assert_matches!(find_char(c), &Mapping::Valid);
        }
    }
}
742