1 | // Copyright (C) 2016 The Qt Company Ltd. |
2 | // Copyright (C) 2016 Intel Corporation. |
3 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
4 | |
5 | #include "qurl_p.h" |
6 | |
7 | #include <QtCore/qstringlist.h> |
8 | #include <QtCore/private/qnumeric_p.h> |
9 | #include <QtCore/private/qoffsetstringarray_p.h> |
10 | #include <QtCore/private/qstringiterator_p.h> |
11 | #include <QtCore/private/qunicodetables_p.h> |
12 | |
13 | #include <algorithm> |
14 | |
15 | QT_BEGIN_NAMESPACE |
16 | |
17 | using namespace Qt::StringLiterals; |
18 | |
19 | // needed by the punycode encoder/decoder |
20 | static const uint base = 36; |
21 | static const uint tmin = 1; |
22 | static const uint tmax = 26; |
23 | static const uint skew = 38; |
24 | static const uint damp = 700; |
25 | static const uint initial_bias = 72; |
26 | static const uint initial_n = 128; |
27 | |
28 | static constexpr qsizetype MaxDomainLabelLength = 63; |
29 | |
// Converts a Punycode digit value into its character code:
// 0..25 map to 'a'..'z' and 26..35 map to '0'..'9'.
static inline uint encodeDigit(uint digit)
{
    if (digit < 26)
        return digit + 97; // 'a' + digit
    return digit + 22;     // '0' + (digit - 26)
}
34 | |
35 | static inline uint adapt(uint delta, uint numpoints, bool firsttime) |
36 | { |
37 | delta /= (firsttime ? damp : 2); |
38 | delta += (delta / numpoints); |
39 | |
40 | uint k = 0; |
41 | for (; delta > ((base - tmin) * tmax) / 2; k += base) |
42 | delta /= (base - tmin); |
43 | |
44 | return k + (((base - tmin + 1) * delta) / (delta + skew)); |
45 | } |
46 | |
47 | static inline void appendEncode(QString *output, uint delta, uint bias) |
48 | { |
49 | uint qq; |
50 | uint k; |
51 | uint t; |
52 | |
53 | // insert the variable length delta integer. |
54 | for (qq = delta, k = base;; k += base) { |
55 | // stop generating digits when the threshold is |
56 | // detected. |
57 | t = (k <= bias) ? tmin : (k >= bias + tmax) ? tmax : k - bias; |
58 | if (qq < t) break; |
59 | |
60 | *output += QChar(encodeDigit(digit: t + (qq - t) % (base - t))); |
61 | qq = (qq - t) / (base - t); |
62 | } |
63 | |
64 | *output += QChar(encodeDigit(digit: qq)); |
65 | } |
66 | |
67 | Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output) |
68 | { |
69 | uint n = initial_n; |
70 | uint delta = 0; |
71 | uint bias = initial_bias; |
72 | |
73 | // Do not try to encode strings that certainly will result in output |
74 | // that is longer than allowable domain name label length. Note that |
75 | // non-BMP codepoints are encoded as two QChars. |
76 | if (in.size() > MaxDomainLabelLength * 2) |
77 | return; |
78 | |
79 | int outLen = output->size(); |
80 | output->resize(size: outLen + in.size()); |
81 | |
82 | QChar *d = output->data() + outLen; |
83 | bool skipped = false; |
84 | // copy all basic code points verbatim to output. |
85 | for (QChar c : in) { |
86 | if (c.unicode() < 0x80) |
87 | *d++ = c; |
88 | else |
89 | skipped = true; |
90 | } |
91 | |
92 | // if there were only basic code points, just return them |
93 | // directly; don't do any encoding. |
94 | if (!skipped) |
95 | return; |
96 | |
97 | output->truncate(pos: d - output->constData()); |
98 | int copied = output->size() - outLen; |
99 | |
100 | // h and b now contain the number of basic code points in input. |
101 | uint b = copied; |
102 | uint h = copied; |
103 | |
104 | // if basic code points were copied, add the delimiter character. |
105 | if (h > 0) |
106 | *output += u'-'; |
107 | |
108 | // compute the input length in Unicode code points. |
109 | uint inputLength = 0; |
110 | for (QStringIterator iter(in); iter.hasNext();) { |
111 | inputLength++; |
112 | |
113 | if (iter.next(invalidAs: char32_t(-1)) == char32_t(-1)) { |
114 | output->truncate(pos: outLen); |
115 | return; // invalid surrogate pair |
116 | } |
117 | } |
118 | |
119 | // while there are still unprocessed non-basic code points left in |
120 | // the input string... |
121 | while (h < inputLength) { |
122 | // find the character in the input string with the lowest unprocessed value. |
123 | uint m = std::numeric_limits<uint>::max(); |
124 | for (QStringIterator iter(in); iter.hasNext();) { |
125 | auto c = iter.nextUnchecked(); |
126 | static_assert(std::numeric_limits<decltype(m)>::max() |
127 | >= std::numeric_limits<decltype(c)>::max(), |
128 | "Punycode uint should be able to cover all codepoints" ); |
129 | if (c >= n && c < m) |
130 | m = c; |
131 | } |
132 | |
133 | // delta = delta + (m - n) * (h + 1), fail on overflow |
134 | uint tmp; |
135 | if (qMulOverflow<uint>(v1: m - n, v2: h + 1, r: &tmp) || qAddOverflow<uint>(v1: delta, v2: tmp, r: &delta)) { |
136 | output->truncate(pos: outLen); |
137 | return; // punycode_overflow |
138 | } |
139 | n = m; |
140 | |
141 | for (QStringIterator iter(in); iter.hasNext();) { |
142 | auto c = iter.nextUnchecked(); |
143 | |
144 | // increase delta until we reach the character processed in this iteration; |
145 | // fail if delta overflows. |
146 | if (c < n) { |
147 | if (qAddOverflow<uint>(v1: delta, v2: 1, r: &delta)) { |
148 | output->truncate(pos: outLen); |
149 | return; // punycode_overflow |
150 | } |
151 | } |
152 | |
153 | if (c == n) { |
154 | appendEncode(output, delta, bias); |
155 | |
156 | bias = adapt(delta, numpoints: h + 1, firsttime: h == b); |
157 | delta = 0; |
158 | ++h; |
159 | } |
160 | } |
161 | |
162 | ++delta; |
163 | ++n; |
164 | } |
165 | |
166 | // prepend ACE prefix |
167 | output->insert(i: outLen, s: "xn--"_L1 ); |
168 | return; |
169 | } |
170 | |
171 | Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc) |
172 | { |
173 | uint n = initial_n; |
174 | uint i = 0; |
175 | uint bias = initial_bias; |
176 | |
177 | // Do not try to decode strings longer than allowable for a domain label. |
178 | // Non-ASCII strings are not allowed here anyway, so there is no need |
179 | // to account for surrogates. |
180 | if (pc.size() > MaxDomainLabelLength) |
181 | return QString(); |
182 | |
183 | // strip any ACE prefix |
184 | int start = pc.startsWith(s: "xn--"_L1 ) ? 4 : 0; |
185 | if (!start) |
186 | return pc; |
187 | |
188 | // find the last delimiter character '-' in the input array. copy |
189 | // all data before this delimiter directly to the output array. |
190 | int delimiterPos = pc.lastIndexOf(c: u'-'); |
191 | auto output = delimiterPos < 4 ? std::u32string() |
192 | : pc.mid(position: start, n: delimiterPos - start).toStdU32String(); |
193 | |
194 | // if a delimiter was found, skip to the position after it; |
195 | // otherwise start at the front of the input string. everything |
196 | // before the delimiter is assumed to be basic code points. |
197 | uint cnt = delimiterPos + 1; |
198 | |
199 | // loop through the rest of the input string, inserting non-basic |
200 | // characters into output as we go. |
201 | while (cnt < (uint) pc.size()) { |
202 | uint oldi = i; |
203 | uint w = 1; |
204 | |
205 | // find the next index for inserting a non-basic character. |
206 | for (uint k = base; cnt < (uint) pc.size(); k += base) { |
207 | // grab a character from the punycode input and find its |
208 | // delta digit (each digit code is part of the |
209 | // variable-length integer delta) |
210 | uint digit = pc.at(i: cnt++).unicode(); |
211 | if (digit - 48 < 10) digit -= 22; |
212 | else if (digit - 65 < 26) digit -= 65; |
213 | else if (digit - 97 < 26) digit -= 97; |
214 | else digit = base; |
215 | |
216 | // Fail if the code point has no digit value |
217 | if (digit >= base) |
218 | return QString(); |
219 | |
220 | // i = i + digit * w, fail on overflow |
221 | uint tmp; |
222 | if (qMulOverflow<uint>(v1: digit, v2: w, r: &tmp) || qAddOverflow<uint>(v1: i, v2: tmp, r: &i)) |
223 | return QString(); |
224 | |
225 | // detect threshold to stop reading delta digits |
226 | uint t; |
227 | if (k <= bias) t = tmin; |
228 | else if (k >= bias + tmax) t = tmax; |
229 | else t = k - bias; |
230 | |
231 | if (digit < t) break; |
232 | |
233 | // w = w * (base - t), fail on overflow |
234 | if (qMulOverflow<uint>(v1: w, v2: base - t, r: &w)) |
235 | return QString(); |
236 | } |
237 | |
238 | // find new bias and calculate the next non-basic code |
239 | // character. |
240 | uint outputLength = static_cast<uint>(output.length()); |
241 | bias = adapt(delta: i - oldi, numpoints: outputLength + 1, firsttime: oldi == 0); |
242 | |
243 | // n = n + i div (length(output) + 1), fail on overflow |
244 | if (qAddOverflow<uint>(v1: n, v2: i / (outputLength + 1), r: &n)) |
245 | return QString(); |
246 | |
247 | // allow the deltas to wrap around |
248 | i %= (outputLength + 1); |
249 | |
250 | // if n is a basic code point then fail; this should not happen with |
251 | // correct implementation of Punycode, but check just n case. |
252 | if (n < initial_n) { |
253 | // Don't use Q_ASSERT() to avoid possibility of DoS |
254 | qWarning(msg: "Attempt to insert a basic codepoint. Unhandled overflow?" ); |
255 | return QString(); |
256 | } |
257 | |
258 | // Surrogates should normally be rejected later by other IDNA code. |
259 | // But because of Qt's use of UTF-16 to represent strings the |
260 | // IDNA code is not able to distinguish characters represented as pairs |
261 | // of surrogates from normal code points. This is why surrogates are |
262 | // not allowed here. |
263 | // |
264 | // Allowing surrogates would lead to non-unique (after normalization) |
265 | // encoding of strings with non-BMP characters. |
266 | // |
267 | // Punycode that encodes characters outside the Unicode range is also |
268 | // invalid and is rejected here. |
269 | if (QChar::isSurrogate(ucs4: n) || n > QChar::LastValidCodePoint) |
270 | return QString(); |
271 | |
272 | // insert the character n at position i |
273 | output.insert(pos: i, n: 1, c: static_cast<char32_t>(n)); |
274 | ++i; |
275 | } |
276 | |
277 | return QString::fromStdU32String(s: output); |
278 | } |
279 | |
280 | static constexpr auto idn_whitelist = qOffsetStringArray( |
281 | strings: "ac" , strings: "ar" , strings: "asia" , strings: "at" , |
282 | strings: "biz" , strings: "br" , |
283 | strings: "cat" , strings: "ch" , strings: "cl" , strings: "cn" , strings: "com" , |
284 | strings: "de" , strings: "dk" , |
285 | strings: "es" , |
286 | strings: "fi" , |
287 | strings: "gr" , |
288 | strings: "hu" , |
289 | strings: "il" , strings: "info" , strings: "io" , strings: "is" , strings: "ir" , |
290 | strings: "jp" , |
291 | strings: "kr" , |
292 | strings: "li" , strings: "lt" , strings: "lu" , strings: "lv" , |
293 | strings: "museum" , |
294 | strings: "name" , strings: "net" , strings: "no" , strings: "nu" , strings: "nz" , |
295 | strings: "org" , |
296 | strings: "pl" , strings: "pr" , |
297 | strings: "se" , strings: "sh" , |
298 | strings: "tel" , strings: "th" , strings: "tm" , strings: "tw" , |
299 | strings: "ua" , |
300 | strings: "vn" , |
301 | strings: "xn--fiqs8s" , // China |
302 | strings: "xn--fiqz9s" , // China |
303 | strings: "xn--fzc2c9e2c" , // Sri Lanka |
304 | strings: "xn--j6w193g" , // Hong Kong |
305 | strings: "xn--kprw13d" , // Taiwan |
306 | strings: "xn--kpry57d" , // Taiwan |
307 | strings: "xn--mgba3a4f16a" , // Iran |
308 | strings: "xn--mgba3a4fra" , // Iran |
309 | strings: "xn--mgbaam7a8h" , // UAE |
310 | strings: "xn--mgbayh7gpa" , // Jordan |
311 | strings: "xn--mgberp4a5d4ar" , // Saudi Arabia |
312 | strings: "xn--ogbpf8fl" , // Syria |
313 | strings: "xn--p1ai" , // Russian Federation |
314 | strings: "xn--wgbh1c" , // Egypt |
315 | strings: "xn--wgbl6a" , // Qatar |
316 | strings: "xn--xkc2al3hye2a" // Sri Lanka |
317 | ); |
318 | |
319 | Q_CONSTINIT static QStringList *user_idn_whitelist = nullptr; |
320 | |
321 | static bool lessThan(const QChar *a, int l, const char *c) |
322 | { |
323 | const auto *uc = reinterpret_cast<const char16_t *>(a); |
324 | const char16_t *e = uc + l; |
325 | |
326 | if (!c || *c == 0) |
327 | return false; |
328 | |
329 | while (*c) { |
330 | if (uc == e || *uc != static_cast<unsigned char>(*c)) |
331 | break; |
332 | ++uc; |
333 | ++c; |
334 | } |
335 | return uc == e ? *c : (*uc < static_cast<unsigned char>(*c)); |
336 | } |
337 | |
338 | static bool equal(const QChar *a, int l, const char *b) |
339 | { |
340 | while (l && a->unicode() && *b) { |
341 | if (*a != QLatin1Char(*b)) |
342 | return false; |
343 | ++a; |
344 | ++b; |
345 | --l; |
346 | } |
347 | return l == 0; |
348 | } |
349 | |
350 | static bool qt_is_idn_enabled(QStringView aceDomain) |
351 | { |
352 | auto idx = aceDomain.lastIndexOf(c: u'.'); |
353 | if (idx == -1) |
354 | return false; |
355 | |
356 | auto tldString = aceDomain.mid(pos: idx + 1); |
357 | const auto len = tldString.size(); |
358 | |
359 | const QChar *tld = tldString.constData(); |
360 | |
361 | if (user_idn_whitelist) |
362 | return user_idn_whitelist->contains(str: tldString); |
363 | |
364 | int l = 0; |
365 | int r = idn_whitelist.count() - 1; |
366 | int i = (l + r + 1) / 2; |
367 | |
368 | while (r != l) { |
369 | if (lessThan(a: tld, l: len, c: idn_whitelist.at(index: i))) |
370 | r = i - 1; |
371 | else |
372 | l = i; |
373 | i = (l + r + 1) / 2; |
374 | } |
375 | return equal(a: tld, l: len, b: idn_whitelist.at(index: i)); |
376 | } |
377 | |
// Returns true if \a c may appear in a normalized ASCII label:
// a lowercase letter, a digit, '-' or '_'.
template<typename C>
static inline bool isValidInNormalizedAsciiLabel(C c)
{
    if (c >= u'a' && c <= u'z')
        return true;
    if (c >= u'0' && c <= u'9')
        return true;
    return c == u'-' || c == u'_';
}
383 | |
// Returns true if \a c may appear in a normalized ASCII domain name:
// any character valid in a label, or the '.' label separator.
template<typename C>
static inline bool isValidInNormalizedAsciiName(C c)
{
    if (isValidInNormalizedAsciiLabel(c))
        return true;
    return c == u'.';
}
389 | |
390 | /* |
391 | Map domain name according to algorithm in UTS #46, 4.1 |
392 | |
393 | Returns empty string if there are disallowed characters in the input. |
394 | |
395 | Sets resultIsAscii if the result is known for sure to be all ASCII. |
396 | */ |
397 | static QString mapDomainName(const QString &in, QUrl::AceProcessingOptions options, |
398 | bool *resultIsAscii) |
399 | { |
400 | *resultIsAscii = true; |
401 | |
402 | // Check if the input is already normalized ASCII first and can be returned as is. |
403 | int i = 0; |
404 | for (auto c : in) { |
405 | if (c.unicode() >= 0x80 || !isValidInNormalizedAsciiName(c)) |
406 | break; |
407 | i++; |
408 | } |
409 | |
410 | if (i == in.size()) |
411 | return in; |
412 | |
413 | QString result; |
414 | result.reserve(asize: in.size()); |
415 | result.append(uc: in.constData(), len: i); |
416 | bool allAscii = true; |
417 | |
418 | for (QStringIterator iter(QStringView(in).sliced(pos: i)); iter.hasNext();) { |
419 | char32_t uc = iter.next(); |
420 | |
421 | // Fast path for ASCII-only inputs |
422 | if (Q_LIKELY(uc < 0x80)) { |
423 | if (uc >= U'A' && uc <= U'Z') |
424 | uc |= 0x20; // lower-case it |
425 | |
426 | if (isValidInNormalizedAsciiName(c: uc)) { |
427 | result.append(c: static_cast<char16_t>(uc)); |
428 | continue; |
429 | } |
430 | } |
431 | |
432 | allAscii = false; |
433 | |
434 | // Capital sharp S is a special case since UTR #46 revision 31 (Unicode 15.1) |
435 | if (uc == 0x1E9E && options.testFlag(flag: QUrl::AceTransitionalProcessing)) { |
436 | result.append(s: u"ss"_s ); |
437 | continue; |
438 | } |
439 | |
440 | QUnicodeTables::IdnaStatus status = QUnicodeTables::idnaStatus(ucs4: uc); |
441 | |
442 | if (status == QUnicodeTables::IdnaStatus::Deviation) |
443 | status = options.testFlag(flag: QUrl::AceTransitionalProcessing) |
444 | ? QUnicodeTables::IdnaStatus::Mapped |
445 | : QUnicodeTables::IdnaStatus::Valid; |
446 | |
447 | switch (status) { |
448 | case QUnicodeTables::IdnaStatus::Ignored: |
449 | continue; |
450 | case QUnicodeTables::IdnaStatus::Valid: |
451 | case QUnicodeTables::IdnaStatus::Disallowed: |
452 | for (auto c : QChar::fromUcs4(c: uc)) |
453 | result.append(c); |
454 | break; |
455 | case QUnicodeTables::IdnaStatus::Mapped: |
456 | result.append(v: QUnicodeTables::idnaMapping(usc4: uc)); |
457 | break; |
458 | default: |
459 | Q_UNREACHABLE(); |
460 | } |
461 | } |
462 | |
463 | *resultIsAscii = allAscii; |
464 | return result; |
465 | } |
466 | |
467 | /* |
468 | Check the rules for an ASCII label. |
469 | |
470 | Check the size restriction and that the label does not start or end with dashes. |
471 | |
472 | The label should be nonempty. |
473 | */ |
474 | static bool validateAsciiLabel(QStringView label) |
475 | { |
476 | if (label.size() > MaxDomainLabelLength) |
477 | return false; |
478 | |
479 | if (label.first() == u'-' || label.last() == u'-') |
480 | return false; |
481 | |
482 | return std::all_of(first: label.begin(), last: label.end(), pred: isValidInNormalizedAsciiLabel<QChar>); |
483 | } |
484 | |
485 | namespace { |
486 | |
487 | class DomainValidityChecker |
488 | { |
489 | bool domainNameIsBidi = false; |
490 | bool hadBidiErrors = false; |
491 | bool ignoreBidiErrors; |
492 | |
493 | static constexpr char32_t ZWNJ = U'\u200C'; |
494 | static constexpr char32_t ZWJ = U'\u200D'; |
495 | |
496 | public: |
497 | DomainValidityChecker(bool ignoreBidiErrors = false) : ignoreBidiErrors(ignoreBidiErrors) { } |
498 | bool checkLabel(const QString &label, QUrl::AceProcessingOptions options); |
499 | |
500 | private: |
501 | static bool checkContextJRules(QStringView label); |
502 | static bool checkBidiRules(QStringView label); |
503 | }; |
504 | |
505 | } // anonymous namespace |
506 | |
507 | /* |
508 | Check CONTEXTJ rules according to RFC 5892, appendix A.1 & A.2. |
509 | |
510 | Rule Set for U+200C (ZWNJ): |
511 | |
512 | False; |
513 | |
514 | If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
515 | |
516 | If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C |
517 | |
518 | (Joining_Type:T)*(Joining_Type:{R,D})) Then True; |
519 | |
520 | Rule Set for U+200D (ZWJ): |
521 | |
522 | False; |
523 | |
524 | If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
525 | |
526 | */ |
527 | bool DomainValidityChecker::checkContextJRules(QStringView label) |
528 | { |
529 | constexpr unsigned char CombiningClassVirama = 9; |
530 | |
531 | enum class State { |
532 | Initial, |
533 | LD_T, // L,D with possible following T* |
534 | ZWNJ_T, // ZWNJ with possible following T* |
535 | }; |
536 | State regexpState = State::Initial; |
537 | bool previousIsVirama = false; |
538 | |
539 | for (QStringIterator iter(label); iter.hasNext();) { |
540 | auto ch = iter.next(); |
541 | |
542 | if (ch == ZWJ) { |
543 | if (!previousIsVirama) |
544 | return false; |
545 | regexpState = State::Initial; |
546 | } else if (ch == ZWNJ) { |
547 | if (!previousIsVirama && regexpState != State::LD_T) |
548 | return false; |
549 | regexpState = previousIsVirama ? State::Initial : State::ZWNJ_T; |
550 | } else { |
551 | switch (QChar::joiningType(ucs4: ch)) { |
552 | case QChar::Joining_Left: |
553 | if (regexpState == State::ZWNJ_T) |
554 | return false; |
555 | regexpState = State::LD_T; |
556 | break; |
557 | case QChar::Joining_Right: |
558 | regexpState = State::Initial; |
559 | break; |
560 | case QChar::Joining_Dual: |
561 | regexpState = State::LD_T; |
562 | break; |
563 | case QChar::Joining_Transparent: |
564 | break; |
565 | default: |
566 | regexpState = State::Initial; |
567 | break; |
568 | } |
569 | } |
570 | |
571 | previousIsVirama = QChar::combiningClass(ucs4: ch) == CombiningClassVirama; |
572 | } |
573 | |
574 | return regexpState != State::ZWNJ_T; |
575 | } |
576 | |
577 | /* |
578 | Check if the label conforms to BiDi rule of RFC 5893. |
579 | |
580 | 1. The first character must be a character with Bidi property L, R, |
581 | or AL. If it has the R or AL property, it is an RTL label; if it |
582 | has the L property, it is an LTR label. |
583 | |
584 | 2. In an RTL label, only characters with the Bidi properties R, AL, |
585 | AN, EN, ES, CS, ET, ON, BN, or NSM are allowed. |
586 | |
587 | 3. In an RTL label, the end of the label must be a character with |
588 | Bidi property R, AL, EN, or AN, followed by zero or more |
589 | characters with Bidi property NSM. |
590 | |
591 | 4. In an RTL label, if an EN is present, no AN may be present, and |
592 | vice versa. |
593 | |
594 | 5. In an LTR label, only characters with the Bidi properties L, EN, |
595 | ES, CS, ET, ON, BN, or NSM are allowed. |
596 | |
597 | 6. In an LTR label, the end of the label must be a character with |
598 | Bidi property L or EN, followed by zero or more characters with |
599 | Bidi property NSM. |
600 | */ |
601 | bool DomainValidityChecker::checkBidiRules(QStringView label) |
602 | { |
603 | if (label.isEmpty()) |
604 | return true; |
605 | |
606 | QStringIterator iter(label); |
607 | Q_ASSERT(iter.hasNext()); |
608 | |
609 | char32_t ch = iter.next(); |
610 | bool labelIsRTL = false; |
611 | |
612 | switch (QChar::direction(ucs4: ch)) { |
613 | case QChar::DirL: |
614 | break; |
615 | case QChar::DirR: |
616 | case QChar::DirAL: |
617 | labelIsRTL = true; |
618 | break; |
619 | default: |
620 | return false; |
621 | } |
622 | |
623 | bool tailOk = true; |
624 | bool labelHasEN = false; |
625 | bool labelHasAN = false; |
626 | |
627 | while (iter.hasNext()) { |
628 | ch = iter.next(); |
629 | |
630 | switch (QChar::direction(ucs4: ch)) { |
631 | case QChar::DirR: |
632 | case QChar::DirAL: |
633 | if (!labelIsRTL) |
634 | return false; |
635 | tailOk = true; |
636 | break; |
637 | |
638 | case QChar::DirL: |
639 | if (labelIsRTL) |
640 | return false; |
641 | tailOk = true; |
642 | break; |
643 | |
644 | case QChar::DirES: |
645 | case QChar::DirCS: |
646 | case QChar::DirET: |
647 | case QChar::DirON: |
648 | case QChar::DirBN: |
649 | tailOk = false; |
650 | break; |
651 | |
652 | case QChar::DirNSM: |
653 | break; |
654 | |
655 | case QChar::DirAN: |
656 | if (labelIsRTL) { |
657 | if (labelHasEN) |
658 | return false; |
659 | labelHasAN = true; |
660 | tailOk = true; |
661 | } else { |
662 | return false; |
663 | } |
664 | break; |
665 | |
666 | case QChar::DirEN: |
667 | if (labelIsRTL) { |
668 | if (labelHasAN) |
669 | return false; |
670 | labelHasEN = true; |
671 | } |
672 | tailOk = true; |
673 | break; |
674 | |
675 | default: |
676 | return false; |
677 | } |
678 | } |
679 | |
680 | return tailOk; |
681 | } |
682 | |
683 | /* |
684 | Check if the given label is valid according to UTS #46 validity criteria. |
685 | |
686 | NFC check can be skipped if the label was transformed to NFC before calling |
687 | this function (as optimization). |
688 | |
689 | The domain name is considered invalid if this function returns false at least |
690 | once. |
691 | |
692 | 1. The label must be in Unicode Normalization Form NFC. |
693 | 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character |
694 | in both the third and fourth positions. |
695 | 3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character. |
696 | 4. The label must not contain a U+002E ( . ) FULL STOP. |
697 | 5. The label must not begin with a combining mark, that is: General_Category=Mark. |
698 | 6. Each code point in the label must only have certain status values according to Section 5, |
699 | IDNA Mapping Table: |
700 | 1. For Transitional Processing, each value must be valid. |
701 | 2. For Nontransitional Processing, each value must be either valid or deviation. |
702 | 7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in The Unicode |
703 | Code Points and Internationalized Domain Names for Applications (IDNA). |
704 | 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must satisfy |
705 | all six of the numbered conditions in RFC 5893, Section 2. |
706 | |
707 | NOTE: Don't use QStringView for label, so that call to QString::normalized() can avoid |
708 | memory allocation when there is nothing to normalize. |
709 | */ |
710 | bool DomainValidityChecker::checkLabel(const QString &label, QUrl::AceProcessingOptions options) |
711 | { |
712 | if (label.isEmpty()) |
713 | return true; |
714 | |
715 | if (label != label.normalized(mode: QString::NormalizationForm_C)) |
716 | return false; |
717 | |
718 | if (label.size() >= 4) { |
719 | // This assumes that the first two characters are in BMP, but that's ok |
720 | // because non-BMP characters are unlikely to be used for specifying |
721 | // future extensions. |
722 | if (label[2] == u'-' && label[3] == u'-') |
723 | return ignoreBidiErrors && label.startsWith(s: u"xn" ) && validateAsciiLabel(label); |
724 | } |
725 | |
726 | if (label.startsWith(c: u'-') || label.endsWith(c: u'-')) |
727 | return false; |
728 | |
729 | if (label.contains(c: u'.')) |
730 | return false; |
731 | |
732 | QStringIterator iter(label); |
733 | auto c = iter.next(); |
734 | |
735 | if (QChar::isMark(ucs4: c)) |
736 | return false; |
737 | |
738 | // As optimization, CONTEXTJ rules check can be skipped if no |
739 | // ZWJ/ZWNJ characters were found during the first pass. |
740 | bool hasJoiners = false; |
741 | |
742 | for (;;) { |
743 | hasJoiners = hasJoiners || c == ZWNJ || c == ZWJ; |
744 | |
745 | if (!ignoreBidiErrors && !domainNameIsBidi) { |
746 | switch (QChar::direction(ucs4: c)) { |
747 | case QChar::DirR: |
748 | case QChar::DirAL: |
749 | case QChar::DirAN: |
750 | domainNameIsBidi = true; |
751 | if (hadBidiErrors) |
752 | return false; |
753 | break; |
754 | default: |
755 | break; |
756 | } |
757 | } |
758 | |
759 | switch (QUnicodeTables::idnaStatus(ucs4: c)) { |
760 | case QUnicodeTables::IdnaStatus::Valid: |
761 | break; |
762 | case QUnicodeTables::IdnaStatus::Deviation: |
763 | if (options.testFlag(flag: QUrl::AceTransitionalProcessing)) |
764 | return false; |
765 | break; |
766 | default: |
767 | return false; |
768 | } |
769 | |
770 | if (!iter.hasNext()) |
771 | break; |
772 | c = iter.next(); |
773 | } |
774 | |
775 | if (hasJoiners && !checkContextJRules(label)) |
776 | return false; |
777 | |
778 | hadBidiErrors = hadBidiErrors || !checkBidiRules(label); |
779 | |
780 | if (domainNameIsBidi && hadBidiErrors) |
781 | return false; |
782 | |
783 | return true; |
784 | } |
785 | |
786 | static QString convertToAscii(QStringView normalizedDomain, AceLeadingDot dot) |
787 | { |
788 | qsizetype lastIdx = 0; |
789 | QString aceForm; // this variable is here for caching |
790 | QString aceResult; |
791 | |
792 | while (true) { |
793 | qsizetype idx = normalizedDomain.indexOf(c: u'.', from: lastIdx); |
794 | if (idx == -1) |
795 | idx = normalizedDomain.size(); |
796 | |
797 | const qsizetype labelLength = idx - lastIdx; |
798 | if (labelLength) { |
799 | const auto label = normalizedDomain.sliced(pos: lastIdx, n: labelLength); |
800 | aceForm.clear(); |
801 | qt_punycodeEncoder(in: label, output: &aceForm); |
802 | if (aceForm.isEmpty()) |
803 | return {}; |
804 | |
805 | aceResult.append(s: aceForm); |
806 | } |
807 | |
808 | if (idx == normalizedDomain.size()) |
809 | break; |
810 | |
811 | if (labelLength == 0 && (dot == ForbidLeadingDot || idx > 0)) |
812 | return {}; // two delimiters in a row -- empty label not allowed |
813 | |
814 | lastIdx = idx + 1; |
815 | aceResult += u'.'; |
816 | } |
817 | |
818 | return aceResult; |
819 | } |
820 | |
821 | static bool checkAsciiDomainName(QStringView normalizedDomain, AceLeadingDot dot, |
822 | bool *usesPunycode) |
823 | { |
824 | qsizetype lastIdx = 0; |
825 | bool hasPunycode = false; |
826 | *usesPunycode = false; |
827 | |
828 | while (lastIdx < normalizedDomain.size()) { |
829 | auto idx = normalizedDomain.indexOf(c: u'.', from: lastIdx); |
830 | if (idx == -1) |
831 | idx = normalizedDomain.size(); |
832 | |
833 | const auto labelLength = idx - lastIdx; |
834 | if (labelLength == 0) { |
835 | if (idx == normalizedDomain.size()) |
836 | break; |
837 | if (dot == ForbidLeadingDot || idx > 0) |
838 | return false; // two delimiters in a row -- empty label not allowed |
839 | } else { |
840 | const auto label = normalizedDomain.sliced(pos: lastIdx, n: labelLength); |
841 | if (!validateAsciiLabel(label)) |
842 | return false; |
843 | |
844 | hasPunycode = hasPunycode || label.startsWith(s: "xn--"_L1 ); |
845 | } |
846 | |
847 | lastIdx = idx + 1; |
848 | } |
849 | |
850 | *usesPunycode = hasPunycode; |
851 | return true; |
852 | } |
853 | |
854 | static QString convertToUnicode(const QString &asciiDomain, QUrl::AceProcessingOptions options) |
855 | { |
856 | QString result; |
857 | result.reserve(asize: asciiDomain.size()); |
858 | qsizetype lastIdx = 0; |
859 | |
860 | DomainValidityChecker checker; |
861 | |
862 | while (true) { |
863 | auto idx = asciiDomain.indexOf(c: u'.', from: lastIdx); |
864 | if (idx == -1) |
865 | idx = asciiDomain.size(); |
866 | |
867 | const auto labelLength = idx - lastIdx; |
868 | if (labelLength == 0) { |
869 | if (idx == asciiDomain.size()) |
870 | break; |
871 | } else { |
872 | const auto label = asciiDomain.sliced(pos: lastIdx, n: labelLength); |
873 | const auto unicodeLabel = qt_punycodeDecoder(pc: label); |
874 | |
875 | if (unicodeLabel.isEmpty()) |
876 | return asciiDomain; |
877 | |
878 | if (!checker.checkLabel(label: unicodeLabel, options)) |
879 | return asciiDomain; |
880 | |
881 | result.append(s: unicodeLabel); |
882 | } |
883 | |
884 | if (idx == asciiDomain.size()) |
885 | break; |
886 | |
887 | lastIdx = idx + 1; |
888 | result += u'.'; |
889 | } |
890 | return result; |
891 | } |
892 | |
893 | static bool checkUnicodeName(const QString &domainName, QUrl::AceProcessingOptions options) |
894 | { |
895 | qsizetype lastIdx = 0; |
896 | |
897 | DomainValidityChecker checker(true); |
898 | |
899 | while (true) { |
900 | qsizetype idx = domainName.indexOf(c: u'.', from: lastIdx); |
901 | if (idx == -1) |
902 | idx = domainName.size(); |
903 | |
904 | const qsizetype labelLength = idx - lastIdx; |
905 | if (labelLength) { |
906 | const auto label = domainName.sliced(pos: lastIdx, n: labelLength); |
907 | |
908 | if (!checker.checkLabel(label, options)) |
909 | return false; |
910 | } |
911 | |
912 | if (idx == domainName.size()) |
913 | break; |
914 | |
915 | lastIdx = idx + 1; |
916 | } |
917 | return true; |
918 | } |
919 | |
920 | QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot, |
921 | QUrl::AceProcessingOptions options) |
922 | { |
923 | if (domain.isEmpty()) |
924 | return {}; |
925 | |
926 | bool mappedToAscii; |
927 | const QString mapped = mapDomainName(in: domain, options, resultIsAscii: &mappedToAscii); |
928 | const QString normalized = |
929 | mappedToAscii ? mapped : mapped.normalized(mode: QString::NormalizationForm_C); |
930 | |
931 | if (normalized.isEmpty()) |
932 | return {}; |
933 | |
934 | if (!mappedToAscii && !checkUnicodeName(domainName: normalized, options)) |
935 | return {}; |
936 | |
937 | bool needsConversionToUnicode; |
938 | const QString aceResult = mappedToAscii ? normalized : convertToAscii(normalizedDomain: normalized, dot); |
939 | if (aceResult.isEmpty() || !checkAsciiDomainName(normalizedDomain: aceResult, dot, usesPunycode: &needsConversionToUnicode)) |
940 | return {}; |
941 | |
942 | if (op == ToAceOnly || !needsConversionToUnicode |
943 | || (!options.testFlag(flag: QUrl::IgnoreIDNWhitelist) && !qt_is_idn_enabled(aceDomain: aceResult))) { |
944 | return aceResult; |
945 | } |
946 | |
947 | return convertToUnicode(asciiDomain: aceResult, options); |
948 | } |
949 | |
950 | /*! |
951 | \since 4.2 |
952 | |
953 | Returns the current whitelist of top-level domains that are allowed |
954 | to have non-ASCII characters in their compositions. |
955 | |
956 | See setIdnWhitelist() for the rationale of this list. |
957 | |
958 | \sa AceProcessingOption |
959 | */ |
960 | QStringList QUrl::idnWhitelist() |
961 | { |
962 | if (user_idn_whitelist) |
963 | return *user_idn_whitelist; |
964 | static const QStringList list = [] { |
965 | QStringList list; |
966 | list.reserve(asize: idn_whitelist.count()); |
967 | int i = 0; |
968 | while (i < idn_whitelist.count()) { |
969 | list << QLatin1StringView(idn_whitelist.at(index: i)); |
970 | ++i; |
971 | } |
972 | return list; |
973 | }(); |
974 | return list; |
975 | } |
976 | |
977 | /*! |
978 | \since 4.2 |
979 | |
980 | Sets the whitelist of Top-Level Domains (TLDs) that are allowed to have |
981 | non-ASCII characters in domains to the value of \a list. |
982 | |
983 | Note that if you call this function, you need to do so \e before |
984 | you start any threads that might access idnWhitelist(). |
985 | |
986 | Qt comes with a default list that contains the Internet top-level domains |
987 | that have published support for Internationalized Domain Names (IDNs) |
988 | and rules to guarantee that no deception can happen between similarly-looking |
989 | characters (such as the Latin lowercase letter \c 'a' and the Cyrillic |
990 | equivalent, which in most fonts are visually identical). |
991 | |
992 | This list is periodically maintained, as registrars publish new rules. |
993 | |
994 | This function is provided for those who need to manipulate the list, in |
995 | order to add or remove a TLD. It is not recommended to change its value |
996 | for purposes other than testing, as it may expose users to security risks. |
997 | */ |
998 | void QUrl::setIdnWhitelist(const QStringList &list) |
999 | { |
1000 | if (!user_idn_whitelist) |
1001 | user_idn_whitelist = new QStringList; |
1002 | *user_idn_whitelist = list; |
1003 | } |
1004 | |
1005 | QT_END_NAMESPACE |
1006 | |