1// Copyright (C) 2016 The Qt Company Ltd.
2// Copyright (C) 2016 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#include "qurl_p.h"
6
7#include <QtCore/qstringlist.h>
8#include <QtCore/private/qnumeric_p.h>
9#include <QtCore/private/qoffsetstringarray_p.h>
10#include <QtCore/private/qstringiterator_p.h>
11#include <QtCore/private/qunicodetables_p.h>
12
13#include <algorithm>
14
15QT_BEGIN_NAMESPACE
16
17using namespace Qt::StringLiterals;
18
19// needed by the punycode encoder/decoder
20static const uint base = 36;
21static const uint tmin = 1;
22static const uint tmax = 26;
23static const uint skew = 38;
24static const uint damp = 700;
25static const uint initial_bias = 72;
26static const uint initial_n = 128;
27
28static constexpr qsizetype MaxDomainLabelLength = 63;
29
// Converts a Punycode digit value into the code unit that represents it:
// values below 26 map to 'a'..'z' (digit + 97), all other values are shifted
// by 22 so that 26..35 map to '0'..'9'.
static inline uint encodeDigit(uint digit)
{
    const uint letterShift = (digit < 26) ? 75u : 0u;
    return digit + 22 + letterShift;
}
34
35static inline uint adapt(uint delta, uint numpoints, bool firsttime)
36{
37 delta /= (firsttime ? damp : 2);
38 delta += (delta / numpoints);
39
40 uint k = 0;
41 for (; delta > ((base - tmin) * tmax) / 2; k += base)
42 delta /= (base - tmin);
43
44 return k + (((base - tmin + 1) * delta) / (delta + skew));
45}
46
47static inline void appendEncode(QString *output, uint delta, uint bias)
48{
49 uint qq;
50 uint k;
51 uint t;
52
53 // insert the variable length delta integer.
54 for (qq = delta, k = base;; k += base) {
55 // stop generating digits when the threshold is
56 // detected.
57 t = (k <= bias) ? tmin : (k >= bias + tmax) ? tmax : k - bias;
58 if (qq < t) break;
59
60 *output += QChar(encodeDigit(digit: t + (qq - t) % (base - t)));
61 qq = (qq - t) / (base - t);
62 }
63
64 *output += QChar(encodeDigit(digit: qq));
65}
66
67Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output)
68{
69 uint n = initial_n;
70 uint delta = 0;
71 uint bias = initial_bias;
72
73 // Do not try to encode strings that certainly will result in output
74 // that is longer than allowable domain name label length. Note that
75 // non-BMP codepoints are encoded as two QChars.
76 if (in.size() > MaxDomainLabelLength * 2)
77 return;
78
79 int outLen = output->size();
80 output->resize(size: outLen + in.size());
81
82 QChar *d = output->data() + outLen;
83 bool skipped = false;
84 // copy all basic code points verbatim to output.
85 for (QChar c : in) {
86 if (c.unicode() < 0x80)
87 *d++ = c;
88 else
89 skipped = true;
90 }
91
92 // if there were only basic code points, just return them
93 // directly; don't do any encoding.
94 if (!skipped)
95 return;
96
97 output->truncate(pos: d - output->constData());
98 int copied = output->size() - outLen;
99
100 // h and b now contain the number of basic code points in input.
101 uint b = copied;
102 uint h = copied;
103
104 // if basic code points were copied, add the delimiter character.
105 if (h > 0)
106 *output += u'-';
107
108 // compute the input length in Unicode code points.
109 uint inputLength = 0;
110 for (QStringIterator iter(in); iter.hasNext();) {
111 inputLength++;
112
113 if (iter.next(invalidAs: char32_t(-1)) == char32_t(-1)) {
114 output->truncate(pos: outLen);
115 return; // invalid surrogate pair
116 }
117 }
118
119 // while there are still unprocessed non-basic code points left in
120 // the input string...
121 while (h < inputLength) {
122 // find the character in the input string with the lowest unprocessed value.
123 uint m = std::numeric_limits<uint>::max();
124 for (QStringIterator iter(in); iter.hasNext();) {
125 auto c = iter.nextUnchecked();
126 static_assert(std::numeric_limits<decltype(m)>::max()
127 >= std::numeric_limits<decltype(c)>::max(),
128 "Punycode uint should be able to cover all codepoints");
129 if (c >= n && c < m)
130 m = c;
131 }
132
133 // delta = delta + (m - n) * (h + 1), fail on overflow
134 uint tmp;
135 if (qMulOverflow<uint>(v1: m - n, v2: h + 1, r: &tmp) || qAddOverflow<uint>(v1: delta, v2: tmp, r: &delta)) {
136 output->truncate(pos: outLen);
137 return; // punycode_overflow
138 }
139 n = m;
140
141 for (QStringIterator iter(in); iter.hasNext();) {
142 auto c = iter.nextUnchecked();
143
144 // increase delta until we reach the character processed in this iteration;
145 // fail if delta overflows.
146 if (c < n) {
147 if (qAddOverflow<uint>(v1: delta, v2: 1, r: &delta)) {
148 output->truncate(pos: outLen);
149 return; // punycode_overflow
150 }
151 }
152
153 if (c == n) {
154 appendEncode(output, delta, bias);
155
156 bias = adapt(delta, numpoints: h + 1, firsttime: h == b);
157 delta = 0;
158 ++h;
159 }
160 }
161
162 ++delta;
163 ++n;
164 }
165
166 // prepend ACE prefix
167 output->insert(i: outLen, s: "xn--"_L1);
168 return;
169}
170
171Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc)
172{
173 uint n = initial_n;
174 uint i = 0;
175 uint bias = initial_bias;
176
177 // Do not try to decode strings longer than allowable for a domain label.
178 // Non-ASCII strings are not allowed here anyway, so there is no need
179 // to account for surrogates.
180 if (pc.size() > MaxDomainLabelLength)
181 return QString();
182
183 // strip any ACE prefix
184 int start = pc.startsWith(s: "xn--"_L1) ? 4 : 0;
185 if (!start)
186 return pc;
187
188 // find the last delimiter character '-' in the input array. copy
189 // all data before this delimiter directly to the output array.
190 int delimiterPos = pc.lastIndexOf(c: u'-');
191 auto output = delimiterPos < 4 ? std::u32string()
192 : pc.mid(position: start, n: delimiterPos - start).toStdU32String();
193
194 // if a delimiter was found, skip to the position after it;
195 // otherwise start at the front of the input string. everything
196 // before the delimiter is assumed to be basic code points.
197 uint cnt = delimiterPos + 1;
198
199 // loop through the rest of the input string, inserting non-basic
200 // characters into output as we go.
201 while (cnt < (uint) pc.size()) {
202 uint oldi = i;
203 uint w = 1;
204
205 // find the next index for inserting a non-basic character.
206 for (uint k = base; cnt < (uint) pc.size(); k += base) {
207 // grab a character from the punycode input and find its
208 // delta digit (each digit code is part of the
209 // variable-length integer delta)
210 uint digit = pc.at(i: cnt++).unicode();
211 if (digit - 48 < 10) digit -= 22;
212 else if (digit - 65 < 26) digit -= 65;
213 else if (digit - 97 < 26) digit -= 97;
214 else digit = base;
215
216 // Fail if the code point has no digit value
217 if (digit >= base)
218 return QString();
219
220 // i = i + digit * w, fail on overflow
221 uint tmp;
222 if (qMulOverflow<uint>(v1: digit, v2: w, r: &tmp) || qAddOverflow<uint>(v1: i, v2: tmp, r: &i))
223 return QString();
224
225 // detect threshold to stop reading delta digits
226 uint t;
227 if (k <= bias) t = tmin;
228 else if (k >= bias + tmax) t = tmax;
229 else t = k - bias;
230
231 if (digit < t) break;
232
233 // w = w * (base - t), fail on overflow
234 if (qMulOverflow<uint>(v1: w, v2: base - t, r: &w))
235 return QString();
236 }
237
238 // find new bias and calculate the next non-basic code
239 // character.
240 uint outputLength = static_cast<uint>(output.length());
241 bias = adapt(delta: i - oldi, numpoints: outputLength + 1, firsttime: oldi == 0);
242
243 // n = n + i div (length(output) + 1), fail on overflow
244 if (qAddOverflow<uint>(v1: n, v2: i / (outputLength + 1), r: &n))
245 return QString();
246
247 // allow the deltas to wrap around
248 i %= (outputLength + 1);
249
250 // if n is a basic code point then fail; this should not happen with
251 // correct implementation of Punycode, but check just n case.
252 if (n < initial_n) {
253 // Don't use Q_ASSERT() to avoid possibility of DoS
254 qWarning(msg: "Attempt to insert a basic codepoint. Unhandled overflow?");
255 return QString();
256 }
257
258 // Surrogates should normally be rejected later by other IDNA code.
259 // But because of Qt's use of UTF-16 to represent strings the
260 // IDNA code is not able to distinguish characters represented as pairs
261 // of surrogates from normal code points. This is why surrogates are
262 // not allowed here.
263 //
264 // Allowing surrogates would lead to non-unique (after normalization)
265 // encoding of strings with non-BMP characters.
266 //
267 // Punycode that encodes characters outside the Unicode range is also
268 // invalid and is rejected here.
269 if (QChar::isSurrogate(ucs4: n) || n > QChar::LastValidCodePoint)
270 return QString();
271
272 // insert the character n at position i
273 output.insert(pos: i, n: 1, c: static_cast<char32_t>(n));
274 ++i;
275 }
276
277 return QString::fromStdU32String(s: output);
278}
279
280static constexpr auto idn_whitelist = qOffsetStringArray(
281 strings: "ac", strings: "ar", strings: "asia", strings: "at",
282 strings: "biz", strings: "br",
283 strings: "cat", strings: "ch", strings: "cl", strings: "cn", strings: "com",
284 strings: "de", strings: "dk",
285 strings: "es",
286 strings: "fi",
287 strings: "gr",
288 strings: "hu",
289 strings: "il", strings: "info", strings: "io", strings: "is", strings: "ir",
290 strings: "jp",
291 strings: "kr",
292 strings: "li", strings: "lt", strings: "lu", strings: "lv",
293 strings: "museum",
294 strings: "name", strings: "net", strings: "no", strings: "nu", strings: "nz",
295 strings: "org",
296 strings: "pl", strings: "pr",
297 strings: "se", strings: "sh",
298 strings: "tel", strings: "th", strings: "tm", strings: "tw",
299 strings: "ua",
300 strings: "vn",
301 strings: "xn--fiqs8s", // China
302 strings: "xn--fiqz9s", // China
303 strings: "xn--fzc2c9e2c", // Sri Lanka
304 strings: "xn--j6w193g", // Hong Kong
305 strings: "xn--kprw13d", // Taiwan
306 strings: "xn--kpry57d", // Taiwan
307 strings: "xn--mgba3a4f16a", // Iran
308 strings: "xn--mgba3a4fra", // Iran
309 strings: "xn--mgbaam7a8h", // UAE
310 strings: "xn--mgbayh7gpa", // Jordan
311 strings: "xn--mgberp4a5d4ar", // Saudi Arabia
312 strings: "xn--ogbpf8fl", // Syria
313 strings: "xn--p1ai", // Russian Federation
314 strings: "xn--wgbh1c", // Egypt
315 strings: "xn--wgbl6a", // Qatar
316 strings: "xn--xkc2al3hye2a" // Sri Lanka
317);
318
319Q_CONSTINIT static QStringList *user_idn_whitelist = nullptr;
320
321static bool lessThan(const QChar *a, int l, const char *c)
322{
323 const auto *uc = reinterpret_cast<const char16_t *>(a);
324 const char16_t *e = uc + l;
325
326 if (!c || *c == 0)
327 return false;
328
329 while (*c) {
330 if (uc == e || *uc != static_cast<unsigned char>(*c))
331 break;
332 ++uc;
333 ++c;
334 }
335 return uc == e ? *c : (*uc < static_cast<unsigned char>(*c));
336}
337
338static bool equal(const QChar *a, int l, const char *b)
339{
340 while (l && a->unicode() && *b) {
341 if (*a != QLatin1Char(*b))
342 return false;
343 ++a;
344 ++b;
345 --l;
346 }
347 return l == 0;
348}
349
350static bool qt_is_idn_enabled(QStringView aceDomain)
351{
352 auto idx = aceDomain.lastIndexOf(c: u'.');
353 if (idx == -1)
354 return false;
355
356 auto tldString = aceDomain.mid(pos: idx + 1);
357 const auto len = tldString.size();
358
359 const QChar *tld = tldString.constData();
360
361 if (user_idn_whitelist)
362 return user_idn_whitelist->contains(str: tldString);
363
364 int l = 0;
365 int r = idn_whitelist.count() - 1;
366 int i = (l + r + 1) / 2;
367
368 while (r != l) {
369 if (lessThan(a: tld, l: len, c: idn_whitelist.at(index: i)))
370 r = i - 1;
371 else
372 l = i;
373 i = (l + r + 1) / 2;
374 }
375 return equal(a: tld, l: len, b: idn_whitelist.at(index: i));
376}
377
// True for the characters accepted inside a normalized ASCII label:
// lower-case letters, digits, '-' and '_'.
template<typename C>
static inline bool isValidInNormalizedAsciiLabel(C c)
{
    if (c >= u'a' && c <= u'z')
        return true;
    if (c >= u'0' && c <= u'9')
        return true;
    return c == u'-' || c == u'_';
}
383
// True for the characters accepted inside a normalized ASCII domain name:
// the label separator '.' plus everything that is valid inside a label.
template<typename C>
static inline bool isValidInNormalizedAsciiName(C c)
{
    if (c == u'.')
        return true;
    return isValidInNormalizedAsciiLabel(c);
}
389
390/*
391 Map domain name according to algorithm in UTS #46, 4.1
392
393 Returns empty string if there are disallowed characters in the input.
394
395 Sets resultIsAscii if the result is known for sure to be all ASCII.
396*/
397static QString mapDomainName(const QString &in, QUrl::AceProcessingOptions options,
398 bool *resultIsAscii)
399{
400 *resultIsAscii = true;
401
402 // Check if the input is already normalized ASCII first and can be returned as is.
403 int i = 0;
404 for (auto c : in) {
405 if (c.unicode() >= 0x80 || !isValidInNormalizedAsciiName(c))
406 break;
407 i++;
408 }
409
410 if (i == in.size())
411 return in;
412
413 QString result;
414 result.reserve(asize: in.size());
415 result.append(uc: in.constData(), len: i);
416 bool allAscii = true;
417
418 for (QStringIterator iter(QStringView(in).sliced(pos: i)); iter.hasNext();) {
419 char32_t uc = iter.next();
420
421 // Fast path for ASCII-only inputs
422 if (Q_LIKELY(uc < 0x80)) {
423 if (uc >= U'A' && uc <= U'Z')
424 uc |= 0x20; // lower-case it
425
426 if (isValidInNormalizedAsciiName(c: uc)) {
427 result.append(c: static_cast<char16_t>(uc));
428 continue;
429 }
430 }
431
432 allAscii = false;
433
434 // Capital sharp S is a special case since UTR #46 revision 31 (Unicode 15.1)
435 if (uc == 0x1E9E && options.testFlag(flag: QUrl::AceTransitionalProcessing)) {
436 result.append(s: u"ss"_s);
437 continue;
438 }
439
440 QUnicodeTables::IdnaStatus status = QUnicodeTables::idnaStatus(ucs4: uc);
441
442 if (status == QUnicodeTables::IdnaStatus::Deviation)
443 status = options.testFlag(flag: QUrl::AceTransitionalProcessing)
444 ? QUnicodeTables::IdnaStatus::Mapped
445 : QUnicodeTables::IdnaStatus::Valid;
446
447 switch (status) {
448 case QUnicodeTables::IdnaStatus::Ignored:
449 continue;
450 case QUnicodeTables::IdnaStatus::Valid:
451 case QUnicodeTables::IdnaStatus::Disallowed:
452 for (auto c : QChar::fromUcs4(c: uc))
453 result.append(c);
454 break;
455 case QUnicodeTables::IdnaStatus::Mapped:
456 result.append(v: QUnicodeTables::idnaMapping(usc4: uc));
457 break;
458 default:
459 Q_UNREACHABLE();
460 }
461 }
462
463 *resultIsAscii = allAscii;
464 return result;
465}
466
467/*
468 Check the rules for an ASCII label.
469
470 Check the size restriction and that the label does not start or end with dashes.
471
472 The label should be nonempty.
473*/
474static bool validateAsciiLabel(QStringView label)
475{
476 if (label.size() > MaxDomainLabelLength)
477 return false;
478
479 if (label.first() == u'-' || label.last() == u'-')
480 return false;
481
482 return std::all_of(first: label.begin(), last: label.end(), pred: isValidInNormalizedAsciiLabel<QChar>);
483}
484
485namespace {
486
487class DomainValidityChecker
488{
489 bool domainNameIsBidi = false;
490 bool hadBidiErrors = false;
491 bool ignoreBidiErrors;
492
493 static constexpr char32_t ZWNJ = U'\u200C';
494 static constexpr char32_t ZWJ = U'\u200D';
495
496public:
497 DomainValidityChecker(bool ignoreBidiErrors = false) : ignoreBidiErrors(ignoreBidiErrors) { }
498 bool checkLabel(const QString &label, QUrl::AceProcessingOptions options);
499
500private:
501 static bool checkContextJRules(QStringView label);
502 static bool checkBidiRules(QStringView label);
503};
504
505} // anonymous namespace
506
507/*
508 Check CONTEXTJ rules according to RFC 5892, appendix A.1 & A.2.
509
510 Rule Set for U+200C (ZWNJ):
511
512 False;
513
514 If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
515
516 If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
517
518 (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
519
520 Rule Set for U+200D (ZWJ):
521
522 False;
523
524 If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
525
526*/
527bool DomainValidityChecker::checkContextJRules(QStringView label)
528{
529 constexpr unsigned char CombiningClassVirama = 9;
530
531 enum class State {
532 Initial,
533 LD_T, // L,D with possible following T*
534 ZWNJ_T, // ZWNJ with possible following T*
535 };
536 State regexpState = State::Initial;
537 bool previousIsVirama = false;
538
539 for (QStringIterator iter(label); iter.hasNext();) {
540 auto ch = iter.next();
541
542 if (ch == ZWJ) {
543 if (!previousIsVirama)
544 return false;
545 regexpState = State::Initial;
546 } else if (ch == ZWNJ) {
547 if (!previousIsVirama && regexpState != State::LD_T)
548 return false;
549 regexpState = previousIsVirama ? State::Initial : State::ZWNJ_T;
550 } else {
551 switch (QChar::joiningType(ucs4: ch)) {
552 case QChar::Joining_Left:
553 if (regexpState == State::ZWNJ_T)
554 return false;
555 regexpState = State::LD_T;
556 break;
557 case QChar::Joining_Right:
558 regexpState = State::Initial;
559 break;
560 case QChar::Joining_Dual:
561 regexpState = State::LD_T;
562 break;
563 case QChar::Joining_Transparent:
564 break;
565 default:
566 regexpState = State::Initial;
567 break;
568 }
569 }
570
571 previousIsVirama = QChar::combiningClass(ucs4: ch) == CombiningClassVirama;
572 }
573
574 return regexpState != State::ZWNJ_T;
575}
576
577/*
578 Check if the label conforms to BiDi rule of RFC 5893.
579
580 1. The first character must be a character with Bidi property L, R,
581 or AL. If it has the R or AL property, it is an RTL label; if it
582 has the L property, it is an LTR label.
583
584 2. In an RTL label, only characters with the Bidi properties R, AL,
585 AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
586
587 3. In an RTL label, the end of the label must be a character with
588 Bidi property R, AL, EN, or AN, followed by zero or more
589 characters with Bidi property NSM.
590
591 4. In an RTL label, if an EN is present, no AN may be present, and
592 vice versa.
593
594 5. In an LTR label, only characters with the Bidi properties L, EN,
595 ES, CS, ET, ON, BN, or NSM are allowed.
596
597 6. In an LTR label, the end of the label must be a character with
598 Bidi property L or EN, followed by zero or more characters with
599 Bidi property NSM.
600*/
601bool DomainValidityChecker::checkBidiRules(QStringView label)
602{
603 if (label.isEmpty())
604 return true;
605
606 QStringIterator iter(label);
607 Q_ASSERT(iter.hasNext());
608
609 char32_t ch = iter.next();
610 bool labelIsRTL = false;
611
612 switch (QChar::direction(ucs4: ch)) {
613 case QChar::DirL:
614 break;
615 case QChar::DirR:
616 case QChar::DirAL:
617 labelIsRTL = true;
618 break;
619 default:
620 return false;
621 }
622
623 bool tailOk = true;
624 bool labelHasEN = false;
625 bool labelHasAN = false;
626
627 while (iter.hasNext()) {
628 ch = iter.next();
629
630 switch (QChar::direction(ucs4: ch)) {
631 case QChar::DirR:
632 case QChar::DirAL:
633 if (!labelIsRTL)
634 return false;
635 tailOk = true;
636 break;
637
638 case QChar::DirL:
639 if (labelIsRTL)
640 return false;
641 tailOk = true;
642 break;
643
644 case QChar::DirES:
645 case QChar::DirCS:
646 case QChar::DirET:
647 case QChar::DirON:
648 case QChar::DirBN:
649 tailOk = false;
650 break;
651
652 case QChar::DirNSM:
653 break;
654
655 case QChar::DirAN:
656 if (labelIsRTL) {
657 if (labelHasEN)
658 return false;
659 labelHasAN = true;
660 tailOk = true;
661 } else {
662 return false;
663 }
664 break;
665
666 case QChar::DirEN:
667 if (labelIsRTL) {
668 if (labelHasAN)
669 return false;
670 labelHasEN = true;
671 }
672 tailOk = true;
673 break;
674
675 default:
676 return false;
677 }
678 }
679
680 return tailOk;
681}
682
683/*
684 Check if the given label is valid according to UTS #46 validity criteria.
685
686 NFC check can be skipped if the label was transformed to NFC before calling
687 this function (as optimization).
688
689 The domain name is considered invalid if this function returns false at least
690 once.
691
692 1. The label must be in Unicode Normalization Form NFC.
693 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character
694 in both the third and fourth positions.
695 3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character.
696 4. The label must not contain a U+002E ( . ) FULL STOP.
697 5. The label must not begin with a combining mark, that is: General_Category=Mark.
698 6. Each code point in the label must only have certain status values according to Section 5,
699 IDNA Mapping Table:
700 1. For Transitional Processing, each value must be valid.
701 2. For Nontransitional Processing, each value must be either valid or deviation.
702 7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in The Unicode
703 Code Points and Internationalized Domain Names for Applications (IDNA).
704 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must satisfy
705 all six of the numbered conditions in RFC 5893, Section 2.
706
707 NOTE: Don't use QStringView for label, so that call to QString::normalized() can avoid
708 memory allocation when there is nothing to normalize.
709*/
710bool DomainValidityChecker::checkLabel(const QString &label, QUrl::AceProcessingOptions options)
711{
712 if (label.isEmpty())
713 return true;
714
715 if (label != label.normalized(mode: QString::NormalizationForm_C))
716 return false;
717
718 if (label.size() >= 4) {
719 // This assumes that the first two characters are in BMP, but that's ok
720 // because non-BMP characters are unlikely to be used for specifying
721 // future extensions.
722 if (label[2] == u'-' && label[3] == u'-')
723 return ignoreBidiErrors && label.startsWith(s: u"xn") && validateAsciiLabel(label);
724 }
725
726 if (label.startsWith(c: u'-') || label.endsWith(c: u'-'))
727 return false;
728
729 if (label.contains(c: u'.'))
730 return false;
731
732 QStringIterator iter(label);
733 auto c = iter.next();
734
735 if (QChar::isMark(ucs4: c))
736 return false;
737
738 // As optimization, CONTEXTJ rules check can be skipped if no
739 // ZWJ/ZWNJ characters were found during the first pass.
740 bool hasJoiners = false;
741
742 for (;;) {
743 hasJoiners = hasJoiners || c == ZWNJ || c == ZWJ;
744
745 if (!ignoreBidiErrors && !domainNameIsBidi) {
746 switch (QChar::direction(ucs4: c)) {
747 case QChar::DirR:
748 case QChar::DirAL:
749 case QChar::DirAN:
750 domainNameIsBidi = true;
751 if (hadBidiErrors)
752 return false;
753 break;
754 default:
755 break;
756 }
757 }
758
759 switch (QUnicodeTables::idnaStatus(ucs4: c)) {
760 case QUnicodeTables::IdnaStatus::Valid:
761 break;
762 case QUnicodeTables::IdnaStatus::Deviation:
763 if (options.testFlag(flag: QUrl::AceTransitionalProcessing))
764 return false;
765 break;
766 default:
767 return false;
768 }
769
770 if (!iter.hasNext())
771 break;
772 c = iter.next();
773 }
774
775 if (hasJoiners && !checkContextJRules(label))
776 return false;
777
778 hadBidiErrors = hadBidiErrors || !checkBidiRules(label);
779
780 if (domainNameIsBidi && hadBidiErrors)
781 return false;
782
783 return true;
784}
785
786static QString convertToAscii(QStringView normalizedDomain, AceLeadingDot dot)
787{
788 qsizetype lastIdx = 0;
789 QString aceForm; // this variable is here for caching
790 QString aceResult;
791
792 while (true) {
793 qsizetype idx = normalizedDomain.indexOf(c: u'.', from: lastIdx);
794 if (idx == -1)
795 idx = normalizedDomain.size();
796
797 const qsizetype labelLength = idx - lastIdx;
798 if (labelLength) {
799 const auto label = normalizedDomain.sliced(pos: lastIdx, n: labelLength);
800 aceForm.clear();
801 qt_punycodeEncoder(in: label, output: &aceForm);
802 if (aceForm.isEmpty())
803 return {};
804
805 aceResult.append(s: aceForm);
806 }
807
808 if (idx == normalizedDomain.size())
809 break;
810
811 if (labelLength == 0 && (dot == ForbidLeadingDot || idx > 0))
812 return {}; // two delimiters in a row -- empty label not allowed
813
814 lastIdx = idx + 1;
815 aceResult += u'.';
816 }
817
818 return aceResult;
819}
820
821static bool checkAsciiDomainName(QStringView normalizedDomain, AceLeadingDot dot,
822 bool *usesPunycode)
823{
824 qsizetype lastIdx = 0;
825 bool hasPunycode = false;
826 *usesPunycode = false;
827
828 while (lastIdx < normalizedDomain.size()) {
829 auto idx = normalizedDomain.indexOf(c: u'.', from: lastIdx);
830 if (idx == -1)
831 idx = normalizedDomain.size();
832
833 const auto labelLength = idx - lastIdx;
834 if (labelLength == 0) {
835 if (idx == normalizedDomain.size())
836 break;
837 if (dot == ForbidLeadingDot || idx > 0)
838 return false; // two delimiters in a row -- empty label not allowed
839 } else {
840 const auto label = normalizedDomain.sliced(pos: lastIdx, n: labelLength);
841 if (!validateAsciiLabel(label))
842 return false;
843
844 hasPunycode = hasPunycode || label.startsWith(s: "xn--"_L1);
845 }
846
847 lastIdx = idx + 1;
848 }
849
850 *usesPunycode = hasPunycode;
851 return true;
852}
853
854static QString convertToUnicode(const QString &asciiDomain, QUrl::AceProcessingOptions options)
855{
856 QString result;
857 result.reserve(asize: asciiDomain.size());
858 qsizetype lastIdx = 0;
859
860 DomainValidityChecker checker;
861
862 while (true) {
863 auto idx = asciiDomain.indexOf(c: u'.', from: lastIdx);
864 if (idx == -1)
865 idx = asciiDomain.size();
866
867 const auto labelLength = idx - lastIdx;
868 if (labelLength == 0) {
869 if (idx == asciiDomain.size())
870 break;
871 } else {
872 const auto label = asciiDomain.sliced(pos: lastIdx, n: labelLength);
873 const auto unicodeLabel = qt_punycodeDecoder(pc: label);
874
875 if (unicodeLabel.isEmpty())
876 return asciiDomain;
877
878 if (!checker.checkLabel(label: unicodeLabel, options))
879 return asciiDomain;
880
881 result.append(s: unicodeLabel);
882 }
883
884 if (idx == asciiDomain.size())
885 break;
886
887 lastIdx = idx + 1;
888 result += u'.';
889 }
890 return result;
891}
892
893static bool checkUnicodeName(const QString &domainName, QUrl::AceProcessingOptions options)
894{
895 qsizetype lastIdx = 0;
896
897 DomainValidityChecker checker(true);
898
899 while (true) {
900 qsizetype idx = domainName.indexOf(c: u'.', from: lastIdx);
901 if (idx == -1)
902 idx = domainName.size();
903
904 const qsizetype labelLength = idx - lastIdx;
905 if (labelLength) {
906 const auto label = domainName.sliced(pos: lastIdx, n: labelLength);
907
908 if (!checker.checkLabel(label, options))
909 return false;
910 }
911
912 if (idx == domainName.size())
913 break;
914
915 lastIdx = idx + 1;
916 }
917 return true;
918}
919
920QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot,
921 QUrl::AceProcessingOptions options)
922{
923 if (domain.isEmpty())
924 return {};
925
926 bool mappedToAscii;
927 const QString mapped = mapDomainName(in: domain, options, resultIsAscii: &mappedToAscii);
928 const QString normalized =
929 mappedToAscii ? mapped : mapped.normalized(mode: QString::NormalizationForm_C);
930
931 if (normalized.isEmpty())
932 return {};
933
934 if (!mappedToAscii && !checkUnicodeName(domainName: normalized, options))
935 return {};
936
937 bool needsConversionToUnicode;
938 const QString aceResult = mappedToAscii ? normalized : convertToAscii(normalizedDomain: normalized, dot);
939 if (aceResult.isEmpty() || !checkAsciiDomainName(normalizedDomain: aceResult, dot, usesPunycode: &needsConversionToUnicode))
940 return {};
941
942 if (op == ToAceOnly || !needsConversionToUnicode
943 || (!options.testFlag(flag: QUrl::IgnoreIDNWhitelist) && !qt_is_idn_enabled(aceDomain: aceResult))) {
944 return aceResult;
945 }
946
947 return convertToUnicode(asciiDomain: aceResult, options);
948}
949
950/*!
951 \since 4.2
952
953 Returns the current whitelist of top-level domains that are allowed
954 to have non-ASCII characters in their compositions.
955
956 See setIdnWhitelist() for the rationale of this list.
957
958 \sa AceProcessingOption
959*/
960QStringList QUrl::idnWhitelist()
961{
962 if (user_idn_whitelist)
963 return *user_idn_whitelist;
964 static const QStringList list = [] {
965 QStringList list;
966 list.reserve(asize: idn_whitelist.count());
967 int i = 0;
968 while (i < idn_whitelist.count()) {
969 list << QLatin1StringView(idn_whitelist.at(index: i));
970 ++i;
971 }
972 return list;
973 }();
974 return list;
975}
976
977/*!
978 \since 4.2
979
980 Sets the whitelist of Top-Level Domains (TLDs) that are allowed to have
981 non-ASCII characters in domains to the value of \a list.
982
983 Note that if you call this function, you need to do so \e before
984 you start any threads that might access idnWhitelist().
985
986 Qt comes with a default list that contains the Internet top-level domains
987 that have published support for Internationalized Domain Names (IDNs)
988 and rules to guarantee that no deception can happen between similarly-looking
989 characters (such as the Latin lowercase letter \c 'a' and the Cyrillic
990 equivalent, which in most fonts are visually identical).
991
992 This list is periodically maintained, as registrars publish new rules.
993
994 This function is provided for those who need to manipulate the list, in
995 order to add or remove a TLD. It is not recommended to change its value
996 for purposes other than testing, as it may expose users to security risks.
997*/
998void QUrl::setIdnWhitelist(const QStringList &list)
999{
1000 if (!user_idn_whitelist)
1001 user_idn_whitelist = new QStringList;
1002 *user_idn_whitelist = list;
1003}
1004
1005QT_END_NAMESPACE
1006

source code of qtbase/src/corelib/io/qurlidna.cpp