1 | // Copyright (C) 2016 The Qt Company Ltd. |
2 | // Copyright (C) 2016 Intel Corporation. |
3 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
4 | |
5 | #include "qurl_p.h" |
6 | |
7 | #include <QtCore/qstringlist.h> |
8 | #include <QtCore/private/qnumeric_p.h> |
9 | #include <QtCore/private/qoffsetstringarray_p.h> |
10 | #include <QtCore/private/qstringiterator_p.h> |
11 | #include <QtCore/private/qunicodetables_p.h> |
12 | |
13 | #include <algorithm> |
14 | |
15 | QT_BEGIN_NAMESPACE |
16 | |
17 | using namespace Qt::StringLiterals; |
18 | |
// Bootstring parameter values used by Punycode (RFC 3492, section 5);
// shared by the punycode encoder and decoder below.
static constexpr uint base = 36;
static constexpr uint tmin = 1;
static constexpr uint tmax = 26;
static constexpr uint skew = 38;
static constexpr uint damp = 700;
static constexpr uint initial_bias = 72;
static constexpr uint initial_n = 128;
27 | |
// Maximum length of a single domain name label. The punycode encoder and
// decoder and the ASCII label validation use it to reject over-long labels.
static constexpr qsizetype MaxDomainLabelLength = 63;
29 | |
// Maps a Punycode digit value to the code point that represents it:
// 0..25 become 'a'..'z' and 26..35 become '0'..'9'.
static inline uint encodeDigit(uint digit)
{
    const uint offset = digit < 26 ? uint('a') : uint('0') - 26;
    return digit + offset;
}
34 | |
35 | static inline uint adapt(uint delta, uint numpoints, bool firsttime) |
36 | { |
37 | delta /= (firsttime ? damp : 2); |
38 | delta += (delta / numpoints); |
39 | |
40 | uint k = 0; |
41 | for (; delta > ((base - tmin) * tmax) / 2; k += base) |
42 | delta /= (base - tmin); |
43 | |
44 | return k + (((base - tmin + 1) * delta) / (delta + skew)); |
45 | } |
46 | |
47 | static inline void appendEncode(QString *output, uint delta, uint bias) |
48 | { |
49 | uint qq; |
50 | uint k; |
51 | uint t; |
52 | |
53 | // insert the variable length delta integer. |
54 | for (qq = delta, k = base;; k += base) { |
55 | // stop generating digits when the threshold is |
56 | // detected. |
57 | t = (k <= bias) ? tmin : (k >= bias + tmax) ? tmax : k - bias; |
58 | if (qq < t) break; |
59 | |
60 | *output += QChar(encodeDigit(digit: t + (qq - t) % (base - t))); |
61 | qq = (qq - t) / (base - t); |
62 | } |
63 | |
64 | *output += QChar(encodeDigit(digit: qq)); |
65 | } |
66 | |
67 | Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output) |
68 | { |
69 | uint n = initial_n; |
70 | uint delta = 0; |
71 | uint bias = initial_bias; |
72 | |
73 | // Do not try to encode strings that certainly will result in output |
74 | // that is longer than allowable domain name label length. Note that |
75 | // non-BMP codepoints are encoded as two QChars. |
76 | if (in.size() > MaxDomainLabelLength * 2) |
77 | return; |
78 | |
79 | int outLen = output->size(); |
80 | output->resize(size: outLen + in.size()); |
81 | |
82 | QChar *d = output->data() + outLen; |
83 | bool skipped = false; |
84 | // copy all basic code points verbatim to output. |
85 | for (QChar c : in) { |
86 | if (c.unicode() < 0x80) |
87 | *d++ = c; |
88 | else |
89 | skipped = true; |
90 | } |
91 | |
92 | // if there were only basic code points, just return them |
93 | // directly; don't do any encoding. |
94 | if (!skipped) |
95 | return; |
96 | |
97 | output->truncate(pos: d - output->constData()); |
98 | int copied = output->size() - outLen; |
99 | |
100 | // h and b now contain the number of basic code points in input. |
101 | uint b = copied; |
102 | uint h = copied; |
103 | |
104 | // if basic code points were copied, add the delimiter character. |
105 | if (h > 0) |
106 | *output += u'-'; |
107 | |
108 | // compute the input length in Unicode code points. |
109 | uint inputLength = 0; |
110 | for (QStringIterator iter(in); iter.hasNext();) { |
111 | inputLength++; |
112 | |
113 | if (iter.next(invalidAs: char32_t(-1)) == char32_t(-1)) { |
114 | output->truncate(pos: outLen); |
115 | return; // invalid surrogate pair |
116 | } |
117 | } |
118 | |
119 | // while there are still unprocessed non-basic code points left in |
120 | // the input string... |
121 | while (h < inputLength) { |
122 | // find the character in the input string with the lowest unprocessed value. |
123 | uint m = std::numeric_limits<uint>::max(); |
124 | for (QStringIterator iter(in); iter.hasNext();) { |
125 | auto c = iter.nextUnchecked(); |
126 | static_assert(std::numeric_limits<decltype(m)>::max() |
127 | >= std::numeric_limits<decltype(c)>::max(), |
128 | "Punycode uint should be able to cover all codepoints" ); |
129 | if (c >= n && c < m) |
130 | m = c; |
131 | } |
132 | |
133 | // delta = delta + (m - n) * (h + 1), fail on overflow |
134 | uint tmp; |
135 | if (qMulOverflow<uint>(v1: m - n, v2: h + 1, r: &tmp) || qAddOverflow<uint>(v1: delta, v2: tmp, r: &delta)) { |
136 | output->truncate(pos: outLen); |
137 | return; // punycode_overflow |
138 | } |
139 | n = m; |
140 | |
141 | for (QStringIterator iter(in); iter.hasNext();) { |
142 | auto c = iter.nextUnchecked(); |
143 | |
144 | // increase delta until we reach the character processed in this iteration; |
145 | // fail if delta overflows. |
146 | if (c < n) { |
147 | if (qAddOverflow<uint>(v1: delta, v2: 1, r: &delta)) { |
148 | output->truncate(pos: outLen); |
149 | return; // punycode_overflow |
150 | } |
151 | } |
152 | |
153 | if (c == n) { |
154 | appendEncode(output, delta, bias); |
155 | |
156 | bias = adapt(delta, numpoints: h + 1, firsttime: h == b); |
157 | delta = 0; |
158 | ++h; |
159 | } |
160 | } |
161 | |
162 | ++delta; |
163 | ++n; |
164 | } |
165 | |
166 | // prepend ACE prefix |
167 | output->insert(i: outLen, s: "xn--"_L1 ); |
168 | return; |
169 | } |
170 | |
171 | Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc) |
172 | { |
173 | uint n = initial_n; |
174 | uint i = 0; |
175 | uint bias = initial_bias; |
176 | |
177 | // Do not try to decode strings longer than allowable for a domain label. |
178 | // Non-ASCII strings are not allowed here anyway, so there is no need |
179 | // to account for surrogates. |
180 | if (pc.size() > MaxDomainLabelLength) |
181 | return QString(); |
182 | |
183 | // strip any ACE prefix |
184 | int start = pc.startsWith(s: "xn--"_L1 ) ? 4 : 0; |
185 | if (!start) |
186 | return pc; |
187 | |
188 | // find the last delimiter character '-' in the input array. copy |
189 | // all data before this delimiter directly to the output array. |
190 | int delimiterPos = pc.lastIndexOf(c: u'-'); |
191 | auto output = delimiterPos < 4 ? std::u32string() |
192 | : pc.mid(position: start, n: delimiterPos - start).toStdU32String(); |
193 | |
194 | // if a delimiter was found, skip to the position after it; |
195 | // otherwise start at the front of the input string. everything |
196 | // before the delimiter is assumed to be basic code points. |
197 | uint cnt = delimiterPos + 1; |
198 | |
199 | // loop through the rest of the input string, inserting non-basic |
200 | // characters into output as we go. |
201 | while (cnt < (uint) pc.size()) { |
202 | uint oldi = i; |
203 | uint w = 1; |
204 | |
205 | // find the next index for inserting a non-basic character. |
206 | for (uint k = base; cnt < (uint) pc.size(); k += base) { |
207 | // grab a character from the punycode input and find its |
208 | // delta digit (each digit code is part of the |
209 | // variable-length integer delta) |
210 | uint digit = pc.at(i: cnt++).unicode(); |
211 | if (digit - 48 < 10) digit -= 22; |
212 | else if (digit - 65 < 26) digit -= 65; |
213 | else if (digit - 97 < 26) digit -= 97; |
214 | else digit = base; |
215 | |
216 | // Fail if the code point has no digit value |
217 | if (digit >= base) |
218 | return QString(); |
219 | |
220 | // i = i + digit * w, fail on overflow |
221 | uint tmp; |
222 | if (qMulOverflow<uint>(v1: digit, v2: w, r: &tmp) || qAddOverflow<uint>(v1: i, v2: tmp, r: &i)) |
223 | return QString(); |
224 | |
225 | // detect threshold to stop reading delta digits |
226 | uint t; |
227 | if (k <= bias) t = tmin; |
228 | else if (k >= bias + tmax) t = tmax; |
229 | else t = k - bias; |
230 | |
231 | if (digit < t) break; |
232 | |
233 | // w = w * (base - t), fail on overflow |
234 | if (qMulOverflow<uint>(v1: w, v2: base - t, r: &w)) |
235 | return QString(); |
236 | } |
237 | |
238 | // find new bias and calculate the next non-basic code |
239 | // character. |
240 | uint outputLength = static_cast<uint>(output.length()); |
241 | bias = adapt(delta: i - oldi, numpoints: outputLength + 1, firsttime: oldi == 0); |
242 | |
243 | // n = n + i div (length(output) + 1), fail on overflow |
244 | if (qAddOverflow<uint>(v1: n, v2: i / (outputLength + 1), r: &n)) |
245 | return QString(); |
246 | |
247 | // allow the deltas to wrap around |
248 | i %= (outputLength + 1); |
249 | |
250 | // if n is a basic code point then fail; this should not happen with |
251 | // correct implementation of Punycode, but check just n case. |
252 | if (n < initial_n) { |
253 | // Don't use Q_ASSERT() to avoid possibility of DoS |
254 | qWarning(msg: "Attempt to insert a basic codepoint. Unhandled overflow?" ); |
255 | return QString(); |
256 | } |
257 | |
258 | // Surrogates should normally be rejected later by other IDNA code. |
259 | // But because of Qt's use of UTF-16 to represent strings the |
260 | // IDNA code is not able to distinguish characters represented as pairs |
261 | // of surrogates from normal code points. This is why surrogates are |
262 | // not allowed here. |
263 | // |
264 | // Allowing surrogates would lead to non-unique (after normalization) |
265 | // encoding of strings with non-BMP characters. |
266 | // |
267 | // Punycode that encodes characters outside the Unicode range is also |
268 | // invalid and is rejected here. |
269 | if (QChar::isSurrogate(ucs4: n) || n > QChar::LastValidCodePoint) |
270 | return QString(); |
271 | |
272 | // insert the character n at position i |
273 | output.insert(pos: i, n: 1, c: static_cast<char32_t>(n)); |
274 | ++i; |
275 | } |
276 | |
277 | return QString::fromStdU32String(s: output); |
278 | } |
279 | |
280 | static constexpr auto idn_whitelist = qOffsetStringArray( |
281 | strings: "ac" , strings: "ar" , strings: "asia" , strings: "at" , |
282 | strings: "biz" , strings: "br" , |
283 | strings: "cat" , strings: "ch" , strings: "cl" , strings: "cn" , strings: "com" , |
284 | strings: "de" , strings: "dk" , |
285 | strings: "es" , |
286 | strings: "fi" , |
287 | strings: "gr" , |
288 | strings: "hu" , |
289 | strings: "il" , strings: "info" , strings: "io" , strings: "is" , strings: "ir" , |
290 | strings: "jp" , |
291 | strings: "kr" , |
292 | strings: "li" , strings: "lt" , strings: "lu" , strings: "lv" , |
293 | strings: "museum" , |
294 | strings: "name" , strings: "net" , strings: "no" , strings: "nu" , strings: "nz" , |
295 | strings: "org" , |
296 | strings: "pl" , strings: "pr" , |
297 | strings: "se" , strings: "sh" , |
298 | strings: "tel" , strings: "th" , strings: "tm" , strings: "tw" , |
299 | strings: "ua" , |
300 | strings: "vn" , |
301 | strings: "xn--fiqs8s" , // China |
302 | strings: "xn--fiqz9s" , // China |
303 | strings: "xn--fzc2c9e2c" , // Sri Lanka |
304 | strings: "xn--j6w193g" , // Hong Kong |
305 | strings: "xn--kprw13d" , // Taiwan |
306 | strings: "xn--kpry57d" , // Taiwan |
307 | strings: "xn--mgba3a4f16a" , // Iran |
308 | strings: "xn--mgba3a4fra" , // Iran |
309 | strings: "xn--mgbaam7a8h" , // UAE |
310 | strings: "xn--mgbayh7gpa" , // Jordan |
311 | strings: "xn--mgberp4a5d4ar" , // Saudi Arabia |
312 | strings: "xn--ogbpf8fl" , // Syria |
313 | strings: "xn--p1ai" , // Russian Federation |
314 | strings: "xn--wgbh1c" , // Egypt |
315 | strings: "xn--wgbl6a" , // Qatar |
316 | strings: "xn--xkc2al3hye2a" // Sri Lanka |
317 | ); |
318 | |
319 | Q_CONSTINIT static QStringList *user_idn_whitelist = nullptr; |
320 | |
321 | static bool lessThan(const QChar *a, int l, const char *c) |
322 | { |
323 | const auto *uc = reinterpret_cast<const char16_t *>(a); |
324 | const char16_t *e = uc + l; |
325 | |
326 | if (!c || *c == 0) |
327 | return false; |
328 | |
329 | while (*c) { |
330 | if (uc == e || *uc != static_cast<unsigned char>(*c)) |
331 | break; |
332 | ++uc; |
333 | ++c; |
334 | } |
335 | return uc == e ? *c : (*uc < static_cast<unsigned char>(*c)); |
336 | } |
337 | |
338 | static bool equal(const QChar *a, int l, const char *b) |
339 | { |
340 | while (l && a->unicode() && *b) { |
341 | if (*a != QLatin1Char(*b)) |
342 | return false; |
343 | ++a; |
344 | ++b; |
345 | --l; |
346 | } |
347 | return l == 0; |
348 | } |
349 | |
350 | static bool qt_is_idn_enabled(QStringView aceDomain) |
351 | { |
352 | auto idx = aceDomain.lastIndexOf(c: u'.'); |
353 | if (idx == -1) |
354 | return false; |
355 | |
356 | auto tldString = aceDomain.mid(pos: idx + 1); |
357 | const auto len = tldString.size(); |
358 | |
359 | const QChar *tld = tldString.constData(); |
360 | |
361 | if (user_idn_whitelist) |
362 | return user_idn_whitelist->contains(str: tldString); |
363 | |
364 | int l = 0; |
365 | int r = idn_whitelist.count() - 1; |
366 | int i = (l + r + 1) / 2; |
367 | |
368 | while (r != l) { |
369 | if (lessThan(a: tld, l: len, c: idn_whitelist.at(index: i))) |
370 | r = i - 1; |
371 | else |
372 | l = i; |
373 | i = (l + r + 1) / 2; |
374 | } |
375 | return equal(a: tld, l: len, b: idn_whitelist.at(index: i)); |
376 | } |
377 | |
// True for the characters accepted inside a normalized ASCII label:
// lower-case letters, digits, '-' and '_'.
template<typename C>
static inline bool isValidInNormalizedAsciiLabel(C c)
{
    if (c >= u'a' && c <= u'z')
        return true;
    if (c >= u'0' && c <= u'9')
        return true;
    return c == u'-' || c == u'_';
}
383 | |
// True for the characters accepted inside a normalized ASCII domain name:
// the label characters plus the '.' label separator.
template<typename C>
static inline bool isValidInNormalizedAsciiName(C c)
{
    return c == u'.' || isValidInNormalizedAsciiLabel(c);
}
389 | |
390 | /* |
391 | Map domain name according to algorithm in UTS #46, 4.1 |
392 | |
393 | Returns empty string if there are disallowed characters in the input. |
394 | |
395 | Sets resultIsAscii if the result is known for sure to be all ASCII. |
396 | */ |
397 | static QString mapDomainName(const QString &in, QUrl::AceProcessingOptions options, |
398 | bool *resultIsAscii) |
399 | { |
400 | *resultIsAscii = true; |
401 | |
402 | // Check if the input is already normalized ASCII first and can be returned as is. |
403 | int i = 0; |
404 | for (auto c : in) { |
405 | if (c.unicode() >= 0x80 || !isValidInNormalizedAsciiName(c)) |
406 | break; |
407 | i++; |
408 | } |
409 | |
410 | if (i == in.size()) |
411 | return in; |
412 | |
413 | QString result; |
414 | result.reserve(asize: in.size()); |
415 | result.append(uc: in.constData(), len: i); |
416 | bool allAscii = true; |
417 | |
418 | for (QStringIterator iter(QStringView(in).sliced(pos: i)); iter.hasNext();) { |
419 | char32_t uc = iter.next(); |
420 | |
421 | // Fast path for ASCII-only inputs |
422 | if (Q_LIKELY(uc < 0x80)) { |
423 | if (uc >= U'A' && uc <= U'Z') |
424 | uc |= 0x20; // lower-case it |
425 | |
426 | if (!isValidInNormalizedAsciiName(c: uc)) |
427 | return {}; |
428 | |
429 | result.append(c: static_cast<char16_t>(uc)); |
430 | continue; |
431 | } |
432 | allAscii = false; |
433 | |
434 | QUnicodeTables::IdnaStatus status = QUnicodeTables::idnaStatus(ucs4: uc); |
435 | |
436 | if (status == QUnicodeTables::IdnaStatus::Deviation) |
437 | status = options.testFlag(flag: QUrl::AceTransitionalProcessing) |
438 | ? QUnicodeTables::IdnaStatus::Mapped |
439 | : QUnicodeTables::IdnaStatus::Valid; |
440 | |
441 | switch (status) { |
442 | case QUnicodeTables::IdnaStatus::Ignored: |
443 | continue; |
444 | case QUnicodeTables::IdnaStatus::Valid: |
445 | for (auto c : QChar::fromUcs4(c: uc)) |
446 | result.append(c); |
447 | break; |
448 | case QUnicodeTables::IdnaStatus::Mapped: |
449 | result.append(v: QUnicodeTables::idnaMapping(usc4: uc)); |
450 | break; |
451 | case QUnicodeTables::IdnaStatus::Disallowed: |
452 | return {}; |
453 | default: |
454 | Q_UNREACHABLE(); |
455 | } |
456 | } |
457 | |
458 | *resultIsAscii = allAscii; |
459 | return result; |
460 | } |
461 | |
462 | /* |
463 | Check the rules for an ASCII label. |
464 | |
465 | Check the size restriction and that the label does not start or end with dashes. |
466 | |
467 | The label should be nonempty. |
468 | */ |
469 | static bool validateAsciiLabel(QStringView label) |
470 | { |
471 | if (label.size() > MaxDomainLabelLength) |
472 | return false; |
473 | |
474 | if (label.first() == u'-' || label.last() == u'-') |
475 | return false; |
476 | |
477 | return std::all_of(first: label.begin(), last: label.end(), pred: isValidInNormalizedAsciiLabel<QChar>); |
478 | } |
479 | |
480 | namespace { |
481 | |
482 | class DomainValidityChecker |
483 | { |
484 | bool domainNameIsBidi = false; |
485 | bool hadBidiErrors = false; |
486 | |
487 | static constexpr char32_t ZWNJ = U'\u200C'; |
488 | static constexpr char32_t ZWJ = U'\u200D'; |
489 | |
490 | public: |
491 | DomainValidityChecker() { } |
492 | bool checkLabel(const QString &label, QUrl::AceProcessingOptions options); |
493 | |
494 | private: |
495 | static bool checkContextJRules(QStringView label); |
496 | static bool checkBidiRules(QStringView label); |
497 | }; |
498 | |
499 | } // anonymous namespace |
500 | |
501 | /* |
502 | Check CONTEXTJ rules according to RFC 5892, appendix A.1 & A.2. |
503 | |
504 | Rule Set for U+200C (ZWNJ): |
505 | |
506 | False; |
507 | |
508 | If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
509 | |
510 | If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C |
511 | |
512 | (Joining_Type:T)*(Joining_Type:{R,D})) Then True; |
513 | |
514 | Rule Set for U+200D (ZWJ): |
515 | |
516 | False; |
517 | |
518 | If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
519 | |
520 | */ |
521 | bool DomainValidityChecker::checkContextJRules(QStringView label) |
522 | { |
523 | constexpr unsigned char CombiningClassVirama = 9; |
524 | |
525 | enum class State { |
526 | Initial, |
527 | LD_T, // L,D with possible following T* |
528 | ZWNJ_T, // ZWNJ with possible following T* |
529 | }; |
530 | State regexpState = State::Initial; |
531 | bool previousIsVirama = false; |
532 | |
533 | for (QStringIterator iter(label); iter.hasNext();) { |
534 | auto ch = iter.next(); |
535 | |
536 | if (ch == ZWJ) { |
537 | if (!previousIsVirama) |
538 | return false; |
539 | regexpState = State::Initial; |
540 | } else if (ch == ZWNJ) { |
541 | if (!previousIsVirama && regexpState != State::LD_T) |
542 | return false; |
543 | regexpState = previousIsVirama ? State::Initial : State::ZWNJ_T; |
544 | } else { |
545 | switch (QChar::joiningType(ucs4: ch)) { |
546 | case QChar::Joining_Left: |
547 | if (regexpState == State::ZWNJ_T) |
548 | return false; |
549 | regexpState = State::LD_T; |
550 | break; |
551 | case QChar::Joining_Right: |
552 | regexpState = State::Initial; |
553 | break; |
554 | case QChar::Joining_Dual: |
555 | regexpState = State::LD_T; |
556 | break; |
557 | case QChar::Joining_Transparent: |
558 | break; |
559 | default: |
560 | regexpState = State::Initial; |
561 | break; |
562 | } |
563 | } |
564 | |
565 | previousIsVirama = QChar::combiningClass(ucs4: ch) == CombiningClassVirama; |
566 | } |
567 | |
568 | return regexpState != State::ZWNJ_T; |
569 | } |
570 | |
571 | /* |
572 | Check if the label conforms to BiDi rule of RFC 5893. |
573 | |
574 | 1. The first character must be a character with Bidi property L, R, |
575 | or AL. If it has the R or AL property, it is an RTL label; if it |
576 | has the L property, it is an LTR label. |
577 | |
578 | 2. In an RTL label, only characters with the Bidi properties R, AL, |
579 | AN, EN, ES, CS, ET, ON, BN, or NSM are allowed. |
580 | |
581 | 3. In an RTL label, the end of the label must be a character with |
582 | Bidi property R, AL, EN, or AN, followed by zero or more |
583 | characters with Bidi property NSM. |
584 | |
585 | 4. In an RTL label, if an EN is present, no AN may be present, and |
586 | vice versa. |
587 | |
588 | 5. In an LTR label, only characters with the Bidi properties L, EN, |
589 | ES, CS, ET, ON, BN, or NSM are allowed. |
590 | |
591 | 6. In an LTR label, the end of the label must be a character with |
592 | Bidi property L or EN, followed by zero or more characters with |
593 | Bidi property NSM. |
594 | */ |
595 | bool DomainValidityChecker::checkBidiRules(QStringView label) |
596 | { |
597 | if (label.isEmpty()) |
598 | return true; |
599 | |
600 | QStringIterator iter(label); |
601 | Q_ASSERT(iter.hasNext()); |
602 | |
603 | char32_t ch = iter.next(); |
604 | bool labelIsRTL = false; |
605 | |
606 | switch (QChar::direction(ucs4: ch)) { |
607 | case QChar::DirL: |
608 | break; |
609 | case QChar::DirR: |
610 | case QChar::DirAL: |
611 | labelIsRTL = true; |
612 | break; |
613 | default: |
614 | return false; |
615 | } |
616 | |
617 | bool tailOk = true; |
618 | bool labelHasEN = false; |
619 | bool labelHasAN = false; |
620 | |
621 | while (iter.hasNext()) { |
622 | ch = iter.next(); |
623 | |
624 | switch (QChar::direction(ucs4: ch)) { |
625 | case QChar::DirR: |
626 | case QChar::DirAL: |
627 | if (!labelIsRTL) |
628 | return false; |
629 | tailOk = true; |
630 | break; |
631 | |
632 | case QChar::DirL: |
633 | if (labelIsRTL) |
634 | return false; |
635 | tailOk = true; |
636 | break; |
637 | |
638 | case QChar::DirES: |
639 | case QChar::DirCS: |
640 | case QChar::DirET: |
641 | case QChar::DirON: |
642 | case QChar::DirBN: |
643 | tailOk = false; |
644 | break; |
645 | |
646 | case QChar::DirNSM: |
647 | break; |
648 | |
649 | case QChar::DirAN: |
650 | if (labelIsRTL) { |
651 | if (labelHasEN) |
652 | return false; |
653 | labelHasAN = true; |
654 | tailOk = true; |
655 | } else { |
656 | return false; |
657 | } |
658 | break; |
659 | |
660 | case QChar::DirEN: |
661 | if (labelIsRTL) { |
662 | if (labelHasAN) |
663 | return false; |
664 | labelHasEN = true; |
665 | } |
666 | tailOk = true; |
667 | break; |
668 | |
669 | default: |
670 | return false; |
671 | } |
672 | } |
673 | |
674 | return tailOk; |
675 | } |
676 | |
677 | /* |
678 | Check if the given label is valid according to UTS #46 validity criteria. |
679 | |
680 | NFC check can be skipped if the label was transformed to NFC before calling |
681 | this function (as optimization). |
682 | |
683 | The domain name is considered invalid if this function returns false at least |
684 | once. |
685 | |
686 | 1. The label must be in Unicode Normalization Form NFC. |
687 | 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character |
688 | in both the third and fourth positions. |
689 | 3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character. |
690 | 4. The label must not contain a U+002E ( . ) FULL STOP. |
691 | 5. The label must not begin with a combining mark, that is: General_Category=Mark. |
692 | 6. Each code point in the label must only have certain status values according to Section 5, |
693 | IDNA Mapping Table: |
694 | 1. For Transitional Processing, each value must be valid. |
695 | 2. For Nontransitional Processing, each value must be either valid or deviation. |
696 | 7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in The Unicode |
697 | Code Points and Internationalized Domain Names for Applications (IDNA). |
698 | 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must satisfy |
699 | all six of the numbered conditions in RFC 5893, Section 2. |
700 | |
701 | NOTE: Don't use QStringView for label, so that call to QString::normalized() can avoid |
702 | memory allocation when there is nothing to normalize. |
703 | */ |
704 | bool DomainValidityChecker::checkLabel(const QString &label, QUrl::AceProcessingOptions options) |
705 | { |
706 | if (label.isEmpty()) |
707 | return true; |
708 | |
709 | if (label != label.normalized(mode: QString::NormalizationForm_C)) |
710 | return false; |
711 | |
712 | if (label.size() >= 4) { |
713 | // This assumes that the first two characters are in BMP, but that's ok |
714 | // because non-BMP characters are unlikely to be used for specifying |
715 | // future extensions. |
716 | if (label[2] == u'-' && label[3] == u'-') |
717 | return false; |
718 | } |
719 | |
720 | if (label.startsWith(c: u'-') || label.endsWith(c: u'-')) |
721 | return false; |
722 | |
723 | if (label.contains(c: u'.')) |
724 | return false; |
725 | |
726 | QStringIterator iter(label); |
727 | auto c = iter.next(); |
728 | |
729 | if (QChar::isMark(ucs4: c)) |
730 | return false; |
731 | |
732 | // As optimization, CONTEXTJ rules check can be skipped if no |
733 | // ZWJ/ZWNJ characters were found during the first pass. |
734 | bool hasJoiners = false; |
735 | |
736 | for (;;) { |
737 | hasJoiners = hasJoiners || c == ZWNJ || c == ZWJ; |
738 | |
739 | if (!domainNameIsBidi) { |
740 | switch (QChar::direction(ucs4: c)) { |
741 | case QChar::DirR: |
742 | case QChar::DirAL: |
743 | case QChar::DirAN: |
744 | domainNameIsBidi = true; |
745 | if (hadBidiErrors) |
746 | return false; |
747 | break; |
748 | default: |
749 | break; |
750 | } |
751 | } |
752 | |
753 | switch (QUnicodeTables::idnaStatus(ucs4: c)) { |
754 | case QUnicodeTables::IdnaStatus::Valid: |
755 | break; |
756 | case QUnicodeTables::IdnaStatus::Deviation: |
757 | if (options.testFlag(flag: QUrl::AceTransitionalProcessing)) |
758 | return false; |
759 | break; |
760 | default: |
761 | return false; |
762 | } |
763 | |
764 | if (!iter.hasNext()) |
765 | break; |
766 | c = iter.next(); |
767 | } |
768 | |
769 | if (hasJoiners && !checkContextJRules(label)) |
770 | return false; |
771 | |
772 | hadBidiErrors = hadBidiErrors || !checkBidiRules(label); |
773 | |
774 | if (domainNameIsBidi && hadBidiErrors) |
775 | return false; |
776 | |
777 | return true; |
778 | } |
779 | |
780 | static QString convertToAscii(const QString &normalizedDomain, AceLeadingDot dot) |
781 | { |
782 | qsizetype lastIdx = 0; |
783 | QString aceForm; // this variable is here for caching |
784 | QString aceResult; |
785 | |
786 | while (true) { |
787 | auto idx = normalizedDomain.indexOf(c: u'.', from: lastIdx); |
788 | if (idx == -1) |
789 | idx = normalizedDomain.size(); |
790 | |
791 | const auto labelLength = idx - lastIdx; |
792 | if (labelLength == 0) { |
793 | if (idx == normalizedDomain.size()) |
794 | break; |
795 | if (dot == ForbidLeadingDot || idx > 0) |
796 | return {}; // two delimiters in a row -- empty label not allowed |
797 | } else { |
798 | const auto label = QStringView(normalizedDomain).sliced(pos: lastIdx, n: labelLength); |
799 | aceForm.clear(); |
800 | qt_punycodeEncoder(in: label, output: &aceForm); |
801 | if (aceForm.isEmpty()) |
802 | return {}; |
803 | |
804 | aceResult.append(s: aceForm); |
805 | } |
806 | |
807 | if (idx == normalizedDomain.size()) |
808 | break; |
809 | |
810 | lastIdx = idx + 1; |
811 | aceResult += u'.'; |
812 | } |
813 | |
814 | return aceResult; |
815 | } |
816 | |
817 | static bool checkAsciiDomainName(const QString &normalizedDomain, AceLeadingDot dot, |
818 | bool *usesPunycode) |
819 | { |
820 | qsizetype lastIdx = 0; |
821 | bool hasPunycode = false; |
822 | *usesPunycode = false; |
823 | |
824 | while (lastIdx < normalizedDomain.size()) { |
825 | auto idx = normalizedDomain.indexOf(c: u'.', from: lastIdx); |
826 | if (idx == -1) |
827 | idx = normalizedDomain.size(); |
828 | |
829 | const auto labelLength = idx - lastIdx; |
830 | if (labelLength == 0) { |
831 | if (idx == normalizedDomain.size()) |
832 | break; |
833 | if (dot == ForbidLeadingDot || idx > 0) |
834 | return false; // two delimiters in a row -- empty label not allowed |
835 | } else { |
836 | const auto label = QStringView(normalizedDomain).sliced(pos: lastIdx, n: labelLength); |
837 | if (!validateAsciiLabel(label)) |
838 | return false; |
839 | |
840 | hasPunycode = hasPunycode || label.startsWith(s: "xn--"_L1 ); |
841 | } |
842 | |
843 | lastIdx = idx + 1; |
844 | } |
845 | |
846 | *usesPunycode = hasPunycode; |
847 | return true; |
848 | } |
849 | |
850 | static QString convertToUnicode(const QString &asciiDomain, QUrl::AceProcessingOptions options) |
851 | { |
852 | QString result; |
853 | result.reserve(asize: asciiDomain.size()); |
854 | qsizetype lastIdx = 0; |
855 | |
856 | DomainValidityChecker checker; |
857 | |
858 | while (true) { |
859 | auto idx = asciiDomain.indexOf(c: u'.', from: lastIdx); |
860 | if (idx == -1) |
861 | idx = asciiDomain.size(); |
862 | |
863 | const auto labelLength = idx - lastIdx; |
864 | if (labelLength == 0) { |
865 | if (idx == asciiDomain.size()) |
866 | break; |
867 | } else { |
868 | const auto label = asciiDomain.sliced(pos: lastIdx, n: labelLength); |
869 | const auto unicodeLabel = qt_punycodeDecoder(pc: label); |
870 | |
871 | if (unicodeLabel.isEmpty()) |
872 | return asciiDomain; |
873 | |
874 | if (!checker.checkLabel(label: unicodeLabel, options)) |
875 | return asciiDomain; |
876 | |
877 | result.append(s: unicodeLabel); |
878 | } |
879 | |
880 | if (idx == asciiDomain.size()) |
881 | break; |
882 | |
883 | lastIdx = idx + 1; |
884 | result += u'.'; |
885 | } |
886 | return result; |
887 | } |
888 | |
889 | QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot, |
890 | QUrl::AceProcessingOptions options) |
891 | { |
892 | if (domain.isEmpty()) |
893 | return {}; |
894 | |
895 | bool mappedToAscii; |
896 | const QString mapped = mapDomainName(in: domain, options, resultIsAscii: &mappedToAscii); |
897 | const QString normalized = |
898 | mappedToAscii ? mapped : mapped.normalized(mode: QString::NormalizationForm_C); |
899 | |
900 | if (normalized.isEmpty()) |
901 | return {}; |
902 | |
903 | bool needsCoversionToUnicode; |
904 | const QString aceResult = mappedToAscii ? normalized : convertToAscii(normalizedDomain: normalized, dot); |
905 | if (aceResult.isEmpty() || !checkAsciiDomainName(normalizedDomain: aceResult, dot, usesPunycode: &needsCoversionToUnicode)) |
906 | return {}; |
907 | |
908 | if (op == ToAceOnly || !needsCoversionToUnicode |
909 | || (!options.testFlag(flag: QUrl::IgnoreIDNWhitelist) && !qt_is_idn_enabled(aceDomain: aceResult))) { |
910 | return aceResult; |
911 | } |
912 | |
913 | return convertToUnicode(asciiDomain: aceResult, options); |
914 | } |
915 | |
916 | /*! |
917 | \since 4.2 |
918 | |
919 | Returns the current whitelist of top-level domains that are allowed |
920 | to have non-ASCII characters in their compositions. |
921 | |
922 | See setIdnWhitelist() for the rationale of this list. |
923 | |
924 | \sa AceProcessingOption |
925 | */ |
926 | QStringList QUrl::idnWhitelist() |
927 | { |
928 | if (user_idn_whitelist) |
929 | return *user_idn_whitelist; |
930 | static const QStringList list = [] { |
931 | QStringList list; |
932 | list.reserve(asize: idn_whitelist.count()); |
933 | int i = 0; |
934 | while (i < idn_whitelist.count()) { |
935 | list << QLatin1StringView(idn_whitelist.at(index: i)); |
936 | ++i; |
937 | } |
938 | return list; |
939 | }(); |
940 | return list; |
941 | } |
942 | |
943 | /*! |
944 | \since 4.2 |
945 | |
946 | Sets the whitelist of Top-Level Domains (TLDs) that are allowed to have |
947 | non-ASCII characters in domains to the value of \a list. |
948 | |
949 | Note that if you call this function, you need to do so \e before |
950 | you start any threads that might access idnWhitelist(). |
951 | |
952 | Qt comes with a default list that contains the Internet top-level domains |
953 | that have published support for Internationalized Domain Names (IDNs) |
954 | and rules to guarantee that no deception can happen between similarly-looking |
955 | characters (such as the Latin lowercase letter \c 'a' and the Cyrillic |
956 | equivalent, which in most fonts are visually identical). |
957 | |
958 | This list is periodically maintained, as registrars publish new rules. |
959 | |
960 | This function is provided for those who need to manipulate the list, in |
961 | order to add or remove a TLD. It is not recommended to change its value |
962 | for purposes other than testing, as it may expose users to security risks. |
963 | */ |
964 | void QUrl::setIdnWhitelist(const QStringList &list) |
965 | { |
966 | if (!user_idn_whitelist) |
967 | user_idn_whitelist = new QStringList; |
968 | *user_idn_whitelist = list; |
969 | } |
970 | |
971 | QT_END_NAMESPACE |
972 | |