| 1 | // Copyright (C) 2016 The Qt Company Ltd. | 
| 2 | // Copyright (C) 2016 Intel Corporation. | 
| 3 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only | 
| 4 |  | 
| 5 | #include "qurl_p.h" | 
| 6 |  | 
| 7 | #include <QtCore/qstringlist.h> | 
| 8 | #include <QtCore/private/qnumeric_p.h> | 
| 9 | #include <QtCore/private/qoffsetstringarray_p.h> | 
| 10 | #include <QtCore/private/qstringiterator_p.h> | 
| 11 | #include <QtCore/private/qunicodetables_p.h> | 
| 12 |  | 
| 13 | #include <algorithm> | 
| 14 |  | 
| 15 | QT_BEGIN_NAMESPACE | 
| 16 |  | 
| 17 | using namespace Qt::StringLiterals; | 
| 18 |  | 
// Punycode parameters shared by the encoder and the decoder below
// (these are the values given for Punycode in RFC 3492).
static constexpr uint base = 36;
static constexpr uint tmin = 1;
static constexpr uint tmax = 26;
static constexpr uint skew = 38;
static constexpr uint damp = 700;
static constexpr uint initial_bias = 72;
static constexpr uint initial_n = 128;
| 27 |  | 
| 28 | static constexpr qsizetype MaxDomainLabelLength = 63; | 
| 29 |  | 
// Converts a Punycode digit value into the code point of its ASCII digit:
// 0..25 yield 'a'..'z' (97..122), 26..35 yield '0'..'9' (48..57).
static inline uint encodeDigit(uint digit)
{
    // 22 is '0' - 26; letters need a further shift of 75 to reach 'a'.
    const uint letterShift = (digit < 26) ? 75 : 0;
    return digit + 22 + letterShift;
}
| 34 |  | 
| 35 | static inline uint adapt(uint delta, uint numpoints, bool firsttime) | 
| 36 | { | 
| 37 |     delta /= (firsttime ? damp : 2); | 
| 38 |     delta += (delta / numpoints); | 
| 39 |  | 
| 40 |     uint k = 0; | 
| 41 |     for (; delta > ((base - tmin) * tmax) / 2; k += base) | 
| 42 |         delta /= (base - tmin); | 
| 43 |  | 
| 44 |     return k + (((base - tmin + 1) * delta) / (delta + skew)); | 
| 45 | } | 
| 46 |  | 
| 47 | static inline void appendEncode(QString *output, uint delta, uint bias) | 
| 48 | { | 
| 49 |     uint qq; | 
| 50 |     uint k; | 
| 51 |     uint t; | 
| 52 |  | 
| 53 |     // insert the variable length delta integer. | 
| 54 |     for (qq = delta, k = base;; k += base) { | 
| 55 |         // stop generating digits when the threshold is | 
| 56 |         // detected. | 
| 57 |         t = (k <= bias) ? tmin : (k >= bias + tmax) ? tmax : k - bias; | 
| 58 |         if (qq < t) break; | 
| 59 |  | 
| 60 |         *output += QChar(encodeDigit(digit: t + (qq - t) % (base - t))); | 
| 61 |         qq = (qq - t) / (base - t); | 
| 62 |     } | 
| 63 |  | 
| 64 |     *output += QChar(encodeDigit(digit: qq)); | 
| 65 | } | 
| 66 |  | 
| 67 | Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output) | 
| 68 | { | 
| 69 |     uint n = initial_n; | 
| 70 |     uint delta = 0; | 
| 71 |     uint bias = initial_bias; | 
| 72 |  | 
| 73 |     // Do not try to encode strings that certainly will result in output | 
| 74 |     // that is longer than allowable domain name label length. Note that | 
| 75 |     // non-BMP codepoints are encoded as two QChars. | 
| 76 |     if (in.size() > MaxDomainLabelLength * 2) | 
| 77 |         return; | 
| 78 |  | 
| 79 |     int outLen = output->size(); | 
| 80 |     output->resize(size: outLen + in.size()); | 
| 81 |  | 
| 82 |     QChar *d = output->data() + outLen; | 
| 83 |     bool skipped = false; | 
| 84 |     // copy all basic code points verbatim to output. | 
| 85 |     for (QChar c : in) { | 
| 86 |         if (c.unicode() < 0x80) | 
| 87 |             *d++ = c; | 
| 88 |         else | 
| 89 |             skipped = true; | 
| 90 |     } | 
| 91 |  | 
| 92 |     // if there were only basic code points, just return them | 
| 93 |     // directly; don't do any encoding. | 
| 94 |     if (!skipped) | 
| 95 |         return; | 
| 96 |  | 
| 97 |     output->truncate(pos: d - output->constData()); | 
| 98 |     int copied = output->size() - outLen; | 
| 99 |  | 
| 100 |     // h and b now contain the number of basic code points in input. | 
| 101 |     uint b = copied; | 
| 102 |     uint h = copied; | 
| 103 |  | 
| 104 |     // if basic code points were copied, add the delimiter character. | 
| 105 |     if (h > 0) | 
| 106 |         *output += u'-'; | 
| 107 |  | 
| 108 |     // compute the input length in Unicode code points. | 
| 109 |     uint inputLength = 0; | 
| 110 |     for (QStringIterator iter(in); iter.hasNext();) { | 
| 111 |         inputLength++; | 
| 112 |  | 
| 113 |         if (iter.next(invalidAs: char32_t(-1)) == char32_t(-1)) { | 
| 114 |             output->truncate(pos: outLen); | 
| 115 |             return; // invalid surrogate pair | 
| 116 |         } | 
| 117 |     } | 
| 118 |  | 
| 119 |     // while there are still unprocessed non-basic code points left in | 
| 120 |     // the input string... | 
| 121 |     while (h < inputLength) { | 
| 122 |         // find the character in the input string with the lowest unprocessed value. | 
| 123 |         uint m = std::numeric_limits<uint>::max(); | 
| 124 |         for (QStringIterator iter(in); iter.hasNext();) { | 
| 125 |             auto c = iter.nextUnchecked(); | 
| 126 |             static_assert(std::numeric_limits<decltype(m)>::max() | 
| 127 |                                   >= std::numeric_limits<decltype(c)>::max(), | 
| 128 |                           "Punycode uint should be able to cover all codepoints" ); | 
| 129 |             if (c >= n && c < m) | 
| 130 |                 m = c; | 
| 131 |         } | 
| 132 |  | 
| 133 |         // delta = delta + (m - n) * (h + 1), fail on overflow | 
| 134 |         uint tmp; | 
| 135 |         if (qMulOverflow<uint>(v1: m - n, v2: h + 1, r: &tmp) || qAddOverflow<uint>(v1: delta, v2: tmp, r: &delta)) { | 
| 136 |             output->truncate(pos: outLen); | 
| 137 |             return; // punycode_overflow | 
| 138 |         } | 
| 139 |         n = m; | 
| 140 |  | 
| 141 |         for (QStringIterator iter(in); iter.hasNext();) { | 
| 142 |             auto c = iter.nextUnchecked(); | 
| 143 |  | 
| 144 |             // increase delta until we reach the character processed in this iteration; | 
| 145 |             // fail if delta overflows. | 
| 146 |             if (c < n) { | 
| 147 |                 if (qAddOverflow<uint>(v1: delta, v2: 1, r: &delta)) { | 
| 148 |                     output->truncate(pos: outLen); | 
| 149 |                     return; // punycode_overflow | 
| 150 |                 } | 
| 151 |             } | 
| 152 |  | 
| 153 |             if (c == n) { | 
| 154 |                 appendEncode(output, delta, bias); | 
| 155 |  | 
| 156 |                 bias = adapt(delta, numpoints: h + 1, firsttime: h == b); | 
| 157 |                 delta = 0; | 
| 158 |                 ++h; | 
| 159 |             } | 
| 160 |         } | 
| 161 |  | 
| 162 |         ++delta; | 
| 163 |         ++n; | 
| 164 |     } | 
| 165 |  | 
| 166 |     // prepend ACE prefix | 
| 167 |     output->insert(i: outLen, s: "xn--"_L1 ); | 
| 168 |     return; | 
| 169 | } | 
| 170 |  | 
| 171 | Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc) | 
| 172 | { | 
| 173 |     uint n = initial_n; | 
| 174 |     uint i = 0; | 
| 175 |     uint bias = initial_bias; | 
| 176 |  | 
| 177 |     // Do not try to decode strings longer than allowable for a domain label. | 
| 178 |     // Non-ASCII strings are not allowed here anyway, so there is no need | 
| 179 |     // to account for surrogates. | 
| 180 |     if (pc.size() > MaxDomainLabelLength) | 
| 181 |         return QString(); | 
| 182 |  | 
| 183 |     // strip any ACE prefix | 
| 184 |     int start = pc.startsWith(s: "xn--"_L1 ) ? 4 : 0; | 
| 185 |     if (!start) | 
| 186 |         return pc; | 
| 187 |  | 
| 188 |     // find the last delimiter character '-' in the input array. copy | 
| 189 |     // all data before this delimiter directly to the output array. | 
| 190 |     int delimiterPos = pc.lastIndexOf(c: u'-'); | 
| 191 |     auto output = delimiterPos < 4 ? std::u32string() | 
| 192 |                                    : pc.mid(position: start, n: delimiterPos - start).toStdU32String(); | 
| 193 |  | 
| 194 |     // if a delimiter was found, skip to the position after it; | 
| 195 |     // otherwise start at the front of the input string. everything | 
| 196 |     // before the delimiter is assumed to be basic code points. | 
| 197 |     uint cnt = delimiterPos + 1; | 
| 198 |  | 
| 199 |     // loop through the rest of the input string, inserting non-basic | 
| 200 |     // characters into output as we go. | 
| 201 |     while (cnt < (uint) pc.size()) { | 
| 202 |         uint oldi = i; | 
| 203 |         uint w = 1; | 
| 204 |  | 
| 205 |         // find the next index for inserting a non-basic character. | 
| 206 |         for (uint k = base; cnt < (uint) pc.size(); k += base) { | 
| 207 |             // grab a character from the punycode input and find its | 
| 208 |             // delta digit (each digit code is part of the | 
| 209 |             // variable-length integer delta) | 
| 210 |             uint digit = pc.at(i: cnt++).unicode(); | 
| 211 |             if (digit - 48 < 10) digit -= 22; | 
| 212 |             else if (digit - 65 < 26) digit -= 65; | 
| 213 |             else if (digit - 97 < 26) digit -= 97; | 
| 214 |             else digit = base; | 
| 215 |  | 
| 216 |             // Fail if the code point has no digit value | 
| 217 |             if (digit >= base) | 
| 218 |                 return QString(); | 
| 219 |  | 
| 220 |             // i = i + digit * w, fail on overflow | 
| 221 |             uint tmp; | 
| 222 |             if (qMulOverflow<uint>(v1: digit, v2: w, r: &tmp) || qAddOverflow<uint>(v1: i, v2: tmp, r: &i)) | 
| 223 |                 return QString(); | 
| 224 |  | 
| 225 |             // detect threshold to stop reading delta digits | 
| 226 |             uint t; | 
| 227 |             if (k <= bias) t = tmin; | 
| 228 |             else if (k >= bias + tmax) t = tmax; | 
| 229 |             else t = k - bias; | 
| 230 |  | 
| 231 |             if (digit < t) break; | 
| 232 |  | 
| 233 |             // w = w * (base - t), fail on overflow | 
| 234 |             if (qMulOverflow<uint>(v1: w, v2: base - t, r: &w)) | 
| 235 |                 return QString(); | 
| 236 |         } | 
| 237 |  | 
| 238 |         // find new bias and calculate the next non-basic code | 
| 239 |         // character. | 
| 240 |         uint outputLength = static_cast<uint>(output.length()); | 
| 241 |         bias = adapt(delta: i - oldi, numpoints: outputLength + 1, firsttime: oldi == 0); | 
| 242 |  | 
| 243 |         // n = n + i div (length(output) + 1), fail on overflow | 
| 244 |         if (qAddOverflow<uint>(v1: n, v2: i / (outputLength + 1), r: &n)) | 
| 245 |             return QString(); | 
| 246 |  | 
| 247 |         // allow the deltas to wrap around | 
| 248 |         i %= (outputLength + 1); | 
| 249 |  | 
| 250 |         // if n is a basic code point then fail; this should not happen with | 
| 251 |         // correct implementation of Punycode, but check just n case. | 
| 252 |         if (n < initial_n) { | 
| 253 |             // Don't use Q_ASSERT() to avoid possibility of DoS | 
| 254 |             qWarning(msg: "Attempt to insert a basic codepoint. Unhandled overflow?" ); | 
| 255 |             return QString(); | 
| 256 |         } | 
| 257 |  | 
| 258 |         // Surrogates should normally be rejected later by other IDNA code. | 
| 259 |         // But because of Qt's use of UTF-16 to represent strings the | 
| 260 |         // IDNA code is not able to distinguish characters represented as pairs | 
| 261 |         // of surrogates from normal code points. This is why surrogates are | 
| 262 |         // not allowed here. | 
| 263 |         // | 
| 264 |         // Allowing surrogates would lead to non-unique (after normalization) | 
| 265 |         // encoding of strings with non-BMP characters. | 
| 266 |         // | 
| 267 |         // Punycode that encodes characters outside the Unicode range is also | 
| 268 |         // invalid and is rejected here. | 
| 269 |         if (QChar::isSurrogate(ucs4: n) || n > QChar::LastValidCodePoint) | 
| 270 |             return QString(); | 
| 271 |  | 
| 272 |         // insert the character n at position i | 
| 273 |         output.insert(pos: i, n: 1, c: static_cast<char32_t>(n)); | 
| 274 |         ++i; | 
| 275 |     } | 
| 276 |  | 
| 277 |     return QString::fromStdU32String(s: output); | 
| 278 | } | 
| 279 |  | 
| 280 | static constexpr auto idn_whitelist = qOffsetStringArray( | 
| 281 |     strings: "ac" , strings: "ar" , strings: "asia" , strings: "at" , | 
| 282 |     strings: "biz" , strings: "br" , | 
| 283 |     strings: "cat" , strings: "ch" , strings: "cl" , strings: "cn" , strings: "com" , | 
| 284 |     strings: "de" , strings: "dk" , | 
| 285 |     strings: "es" , | 
| 286 |     strings: "fi" , | 
| 287 |     strings: "gr" , | 
| 288 |     strings: "hu" , | 
| 289 |     strings: "il" , strings: "info" , strings: "io" , strings: "ir" , strings: "is" , | 
| 290 |     strings: "jp" , | 
| 291 |     strings: "kr" , | 
| 292 |     strings: "li" , strings: "lt" , strings: "lu" , strings: "lv" , | 
| 293 |     strings: "museum" , | 
| 294 |     strings: "name" , strings: "net" , strings: "no" , strings: "nu" , strings: "nz" , | 
| 295 |     strings: "org" , | 
| 296 |     strings: "pl" , strings: "pr" , | 
| 297 |     strings: "se" , strings: "sh" , | 
| 298 |     strings: "tel" , strings: "th" , strings: "tm" , strings: "tw" , | 
| 299 |     strings: "ua" , | 
| 300 |     strings: "vn" , | 
| 301 |     strings: "xn--fiqs8s" ,               // China | 
| 302 |     strings: "xn--fiqz9s" ,               // China | 
| 303 |     strings: "xn--fzc2c9e2c" ,            // Sri Lanka | 
| 304 |     strings: "xn--j6w193g" ,              // Hong Kong | 
| 305 |     strings: "xn--kprw13d" ,              // Taiwan | 
| 306 |     strings: "xn--kpry57d" ,              // Taiwan | 
| 307 |     strings: "xn--mgba3a4f16a" ,          // Iran | 
| 308 |     strings: "xn--mgba3a4fra" ,           // Iran | 
| 309 |     strings: "xn--mgbaam7a8h" ,           // UAE | 
| 310 |     strings: "xn--mgbayh7gpa" ,           // Jordan | 
| 311 |     strings: "xn--mgberp4a5d4ar" ,        // Saudi Arabia | 
| 312 |     strings: "xn--ogbpf8fl" ,             // Syria | 
| 313 |     strings: "xn--p1ai" ,                 // Russian Federation | 
| 314 |     strings: "xn--wgbh1c" ,               // Egypt | 
| 315 |     strings: "xn--wgbl6a" ,               // Qatar | 
| 316 |     strings: "xn--xkc2al3hye2a"           // Sri Lanka | 
| 317 | ); | 
| 318 |  | 
| 319 | Q_CONSTINIT static QStringList *user_idn_whitelist = nullptr; | 
| 320 |  | 
| 321 | static bool lessThan(const QChar *a, int l, const char *c) | 
| 322 | { | 
| 323 |     const auto *uc = reinterpret_cast<const char16_t *>(a); | 
| 324 |     const char16_t *e = uc + l; | 
| 325 |  | 
| 326 |     if (!c || *c == 0) | 
| 327 |         return false; | 
| 328 |  | 
| 329 |     while (*c) { | 
| 330 |         if (uc == e || *uc != static_cast<unsigned char>(*c)) | 
| 331 |             break; | 
| 332 |         ++uc; | 
| 333 |         ++c; | 
| 334 |     } | 
| 335 |     return uc == e ? *c : (*uc < static_cast<unsigned char>(*c)); | 
| 336 | } | 
| 337 |  | 
| 338 | static bool equal(const QChar *a, int l, const char *b) | 
| 339 | { | 
| 340 |     while (l && a->unicode() && *b) { | 
| 341 |         if (*a != QLatin1Char(*b)) | 
| 342 |             return false; | 
| 343 |         ++a; | 
| 344 |         ++b; | 
| 345 |         --l; | 
| 346 |     } | 
| 347 |     return l == 0; | 
| 348 | } | 
| 349 |  | 
| 350 | static bool qt_is_idn_enabled(QStringView aceDomain) | 
| 351 | { | 
| 352 |     auto idx = aceDomain.lastIndexOf(c: u'.'); | 
| 353 |     if (idx == -1) | 
| 354 |         return false; | 
| 355 |  | 
| 356 |     auto tldString = aceDomain.mid(pos: idx + 1); | 
| 357 |     const auto len = tldString.size(); | 
| 358 |  | 
| 359 |     const QChar *tld = tldString.constData(); | 
| 360 |  | 
| 361 |     if (user_idn_whitelist) | 
| 362 |         return user_idn_whitelist->contains(str: tldString); | 
| 363 |  | 
| 364 |     int l = 0; | 
| 365 |     int r = idn_whitelist.count() - 1; | 
| 366 |     int i = (l + r + 1) / 2; | 
| 367 |  | 
| 368 |     while (r != l) { | 
| 369 |         if (lessThan(a: tld, l: len, c: idn_whitelist.at(index: i))) | 
| 370 |             r = i - 1; | 
| 371 |         else | 
| 372 |             l = i; | 
| 373 |         i = (l + r + 1) / 2; | 
| 374 |     } | 
| 375 |     return equal(a: tld, l: len, b: idn_whitelist.at(index: i)); | 
| 376 | } | 
| 377 |  | 
// Returns true if c may appear in a normalized ASCII label: a lower-case
// letter, a decimal digit, '-' or '_'.
template<typename C>
static inline bool isValidInNormalizedAsciiLabel(C c)
{
    if (c >= u'a' && c <= u'z')
        return true;
    if (c >= u'0' && c <= u'9')
        return true;
    return c == u'-' || c == u'_';
}
| 383 |  | 
// Returns true if c may appear in a normalized ASCII domain name: the label
// separator '.' or any character valid in a normalized ASCII label.
template<typename C>
static inline bool isValidInNormalizedAsciiName(C c)
{
    return c == u'.' || isValidInNormalizedAsciiLabel(c);
}
| 389 |  | 
| 390 | /* | 
| 391 |     Map domain name according to algorithm in UTS #46, 4.1 | 
| 392 |  | 
| 393 |     Returns empty string if there are disallowed characters in the input. | 
| 394 |  | 
| 395 |     Sets resultIsAscii if the result is known for sure to be all ASCII. | 
| 396 | */ | 
| 397 | static QString mapDomainName(const QString &in, QUrl::AceProcessingOptions options, | 
| 398 |                              bool *resultIsAscii) | 
| 399 | { | 
| 400 |     *resultIsAscii = true; | 
| 401 |  | 
| 402 |     // Check if the input is already normalized ASCII first and can be returned as is. | 
| 403 |     int i = 0; | 
| 404 |     for (auto c : in) { | 
| 405 |         if (c.unicode() >= 0x80 || !isValidInNormalizedAsciiName(c)) | 
| 406 |             break; | 
| 407 |         i++; | 
| 408 |     } | 
| 409 |  | 
| 410 |     if (i == in.size()) | 
| 411 |         return in; | 
| 412 |  | 
| 413 |     QString result; | 
| 414 |     result.reserve(asize: in.size()); | 
| 415 |     result.append(uc: in.constData(), len: i); | 
| 416 |     bool allAscii = true; | 
| 417 |  | 
| 418 |     for (QStringIterator iter(QStringView(in).sliced(pos: i)); iter.hasNext();) { | 
| 419 |         char32_t uc = iter.next(); | 
| 420 |  | 
| 421 |         // Fast path for ASCII-only inputs | 
| 422 |         if (Q_LIKELY(uc < 0x80)) { | 
| 423 |             if (uc >= U'A' && uc <= U'Z') | 
| 424 |                 uc |= 0x20; // lower-case it | 
| 425 |  | 
| 426 |             if (isValidInNormalizedAsciiName(c: uc)) { | 
| 427 |                 result.append(c: static_cast<char16_t>(uc)); | 
| 428 |                 continue; | 
| 429 |             } | 
| 430 |         } | 
| 431 |  | 
| 432 |         allAscii = false; | 
| 433 |  | 
| 434 |         // Capital sharp S is a special case since UTR #46 revision 31 (Unicode 15.1) | 
| 435 |         if (uc == 0x1E9E && options.testFlag(flag: QUrl::AceTransitionalProcessing)) { | 
| 436 |             result.append(s: u"ss"_s ); | 
| 437 |             continue; | 
| 438 |         } | 
| 439 |  | 
| 440 |         QUnicodeTables::IdnaStatus status = QUnicodeTables::idnaStatus(ucs4: uc); | 
| 441 |  | 
| 442 |         if (status == QUnicodeTables::IdnaStatus::Deviation) | 
| 443 |             status = options.testFlag(flag: QUrl::AceTransitionalProcessing) | 
| 444 |                     ? QUnicodeTables::IdnaStatus::Mapped | 
| 445 |                     : QUnicodeTables::IdnaStatus::Valid; | 
| 446 |  | 
| 447 |         switch (status) { | 
| 448 |         case QUnicodeTables::IdnaStatus::Ignored: | 
| 449 |             continue; | 
| 450 |         case QUnicodeTables::IdnaStatus::Valid: | 
| 451 |         case QUnicodeTables::IdnaStatus::Disallowed: | 
| 452 |             for (auto c : QChar::fromUcs4(c: uc)) | 
| 453 |                 result.append(c); | 
| 454 |             break; | 
| 455 |         case QUnicodeTables::IdnaStatus::Mapped: | 
| 456 |             result.append(v: QUnicodeTables::idnaMapping(usc4: uc)); | 
| 457 |             break; | 
| 458 |         default: | 
| 459 |             Q_UNREACHABLE(); | 
| 460 |         } | 
| 461 |     } | 
| 462 |  | 
| 463 |     *resultIsAscii = allAscii; | 
| 464 |     return result; | 
| 465 | } | 
| 466 |  | 
| 467 | /* | 
| 468 |     Check the rules for an ASCII label. | 
| 469 |  | 
| 470 |     Check the size restriction and that the label does not start or end with dashes. | 
| 471 |  | 
| 472 |     The label should be nonempty. | 
| 473 | */ | 
| 474 | static bool validateAsciiLabel(QStringView label) | 
| 475 | { | 
| 476 |     if (label.size() > MaxDomainLabelLength) | 
| 477 |         return false; | 
| 478 |  | 
| 479 |     if (label.first() == u'-' || label.last() == u'-') | 
| 480 |         return false; | 
| 481 |  | 
| 482 |     return std::all_of(first: label.begin(), last: label.end(), pred: isValidInNormalizedAsciiLabel<QChar>); | 
| 483 | } | 
| 484 |  | 
| 485 | namespace { | 
| 486 |  | 
| 487 | class DomainValidityChecker | 
| 488 | { | 
| 489 |     bool domainNameIsBidi = false; | 
| 490 |     bool hadBidiErrors = false; | 
| 491 |     bool ignoreBidiErrors; | 
| 492 |  | 
| 493 |     static constexpr char32_t ZWNJ = U'\u200C'; | 
| 494 |     static constexpr char32_t ZWJ = U'\u200D'; | 
| 495 |  | 
| 496 | public: | 
| 497 |     DomainValidityChecker(bool ignoreBidiErrors = false) : ignoreBidiErrors(ignoreBidiErrors) { } | 
| 498 |     bool checkLabel(const QString &label, QUrl::AceProcessingOptions options); | 
| 499 |  | 
| 500 | private: | 
| 501 |     static bool checkContextJRules(QStringView label); | 
| 502 |     static bool checkBidiRules(QStringView label); | 
| 503 | }; | 
| 504 |  | 
| 505 | } // anonymous namespace | 
| 506 |  | 
| 507 | /* | 
| 508 |     Check CONTEXTJ rules according to RFC 5892, appendix A.1 & A.2. | 
| 509 |  | 
| 510 |     Rule Set for U+200C (ZWNJ): | 
| 511 |  | 
| 512 |       False; | 
| 513 |  | 
| 514 |       If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True; | 
| 515 |  | 
| 516 |       If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C | 
| 517 |  | 
| 518 |          (Joining_Type:T)*(Joining_Type:{R,D})) Then True; | 
| 519 |  | 
| 520 |     Rule Set for U+200D (ZWJ): | 
| 521 |  | 
| 522 |       False; | 
| 523 |  | 
| 524 |       If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True; | 
| 525 |  | 
| 526 | */ | 
| 527 | bool DomainValidityChecker::checkContextJRules(QStringView label) | 
| 528 | { | 
| 529 |     constexpr unsigned char CombiningClassVirama = 9; | 
| 530 |  | 
| 531 |     enum class State { | 
| 532 |         Initial, | 
| 533 |         LD_T, // L,D with possible following T* | 
| 534 |         ZWNJ_T, // ZWNJ with possible following T* | 
| 535 |     }; | 
| 536 |     State regexpState = State::Initial; | 
| 537 |     bool previousIsVirama = false; | 
| 538 |  | 
| 539 |     for (QStringIterator iter(label); iter.hasNext();) { | 
| 540 |         auto ch = iter.next(); | 
| 541 |  | 
| 542 |         if (ch == ZWJ) { | 
| 543 |             if (!previousIsVirama) | 
| 544 |                 return false; | 
| 545 |             regexpState = State::Initial; | 
| 546 |         } else if (ch == ZWNJ) { | 
| 547 |             if (!previousIsVirama && regexpState != State::LD_T) | 
| 548 |                 return false; | 
| 549 |             regexpState = previousIsVirama ? State::Initial : State::ZWNJ_T; | 
| 550 |         } else { | 
| 551 |             switch (QChar::joiningType(ucs4: ch)) { | 
| 552 |             case QChar::Joining_Left: | 
| 553 |                 if (regexpState == State::ZWNJ_T) | 
| 554 |                     return false; | 
| 555 |                 regexpState = State::LD_T; | 
| 556 |                 break; | 
| 557 |             case QChar::Joining_Right: | 
| 558 |                 regexpState = State::Initial; | 
| 559 |                 break; | 
| 560 |             case QChar::Joining_Dual: | 
| 561 |                 regexpState = State::LD_T; | 
| 562 |                 break; | 
| 563 |             case QChar::Joining_Transparent: | 
| 564 |                 break; | 
| 565 |             default: | 
| 566 |                 regexpState = State::Initial; | 
| 567 |                 break; | 
| 568 |             } | 
| 569 |         } | 
| 570 |  | 
| 571 |         previousIsVirama = QChar::combiningClass(ucs4: ch) == CombiningClassVirama; | 
| 572 |     } | 
| 573 |  | 
| 574 |     return regexpState != State::ZWNJ_T; | 
| 575 | } | 
| 576 |  | 
| 577 | /* | 
| 578 |     Check if the label conforms to BiDi rule of RFC 5893. | 
| 579 |  | 
| 580 |     1.  The first character must be a character with Bidi property L, R, | 
| 581 |         or AL.  If it has the R or AL property, it is an RTL label; if it | 
| 582 |         has the L property, it is an LTR label. | 
| 583 |  | 
| 584 |     2.  In an RTL label, only characters with the Bidi properties R, AL, | 
| 585 |         AN, EN, ES, CS, ET, ON, BN, or NSM are allowed. | 
| 586 |  | 
| 587 |     3.  In an RTL label, the end of the label must be a character with | 
| 588 |         Bidi property R, AL, EN, or AN, followed by zero or more | 
| 589 |         characters with Bidi property NSM. | 
| 590 |  | 
| 591 |     4.  In an RTL label, if an EN is present, no AN may be present, and | 
| 592 |         vice versa. | 
| 593 |  | 
| 594 |     5.  In an LTR label, only characters with the Bidi properties L, EN, | 
| 595 |         ES, CS, ET, ON, BN, or NSM are allowed. | 
| 596 |  | 
| 597 |     6.  In an LTR label, the end of the label must be a character with | 
| 598 |         Bidi property L or EN, followed by zero or more characters with | 
| 599 |         Bidi property NSM. | 
| 600 | */ | 
| 601 | bool DomainValidityChecker::checkBidiRules(QStringView label) | 
| 602 | { | 
| 603 |     if (label.isEmpty()) | 
| 604 |         return true; | 
| 605 |  | 
| 606 |     QStringIterator iter(label); | 
| 607 |     Q_ASSERT(iter.hasNext()); | 
| 608 |  | 
| 609 |     char32_t ch = iter.next(); | 
| 610 |     bool labelIsRTL = false; | 
| 611 |  | 
| 612 |     switch (QChar::direction(ucs4: ch)) { | 
| 613 |     case QChar::DirL: | 
| 614 |         break; | 
| 615 |     case QChar::DirR: | 
| 616 |     case QChar::DirAL: | 
| 617 |         labelIsRTL = true; | 
| 618 |         break; | 
| 619 |     default: | 
| 620 |         return false; | 
| 621 |     } | 
| 622 |  | 
| 623 |     bool tailOk = true; | 
| 624 |     bool labelHasEN = false; | 
| 625 |     bool labelHasAN = false; | 
| 626 |  | 
| 627 |     while (iter.hasNext()) { | 
| 628 |         ch = iter.next(); | 
| 629 |  | 
| 630 |         switch (QChar::direction(ucs4: ch)) { | 
| 631 |         case QChar::DirR: | 
| 632 |         case QChar::DirAL: | 
| 633 |             if (!labelIsRTL) | 
| 634 |                 return false; | 
| 635 |             tailOk = true; | 
| 636 |             break; | 
| 637 |  | 
| 638 |         case QChar::DirL: | 
| 639 |             if (labelIsRTL) | 
| 640 |                 return false; | 
| 641 |             tailOk = true; | 
| 642 |             break; | 
| 643 |  | 
| 644 |         case QChar::DirES: | 
| 645 |         case QChar::DirCS: | 
| 646 |         case QChar::DirET: | 
| 647 |         case QChar::DirON: | 
| 648 |         case QChar::DirBN: | 
| 649 |             tailOk = false; | 
| 650 |             break; | 
| 651 |  | 
| 652 |         case QChar::DirNSM: | 
| 653 |             break; | 
| 654 |  | 
| 655 |         case QChar::DirAN: | 
| 656 |             if (labelIsRTL) { | 
| 657 |                 if (labelHasEN) | 
| 658 |                     return false; | 
| 659 |                 labelHasAN = true; | 
| 660 |                 tailOk = true; | 
| 661 |             } else { | 
| 662 |                 return false; | 
| 663 |             } | 
| 664 |             break; | 
| 665 |  | 
| 666 |         case QChar::DirEN: | 
| 667 |             if (labelIsRTL) { | 
| 668 |                 if (labelHasAN) | 
| 669 |                     return false; | 
| 670 |                 labelHasEN = true; | 
| 671 |             } | 
| 672 |             tailOk = true; | 
| 673 |             break; | 
| 674 |  | 
| 675 |         default: | 
| 676 |             return false; | 
| 677 |         } | 
| 678 |     } | 
| 679 |  | 
| 680 |     return tailOk; | 
| 681 | } | 
| 682 |  | 
| 683 | /* | 
| 684 |     Check if the given label is valid according to UTS #46 validity criteria. | 
| 685 |  | 
| 686 |     NFC check can be skipped if the label was transformed to NFC before calling | 
| 687 |     this function (as optimization). | 
| 688 |  | 
| 689 |     The domain name is considered invalid if this function returns false at least | 
| 690 |     once. | 
| 691 |  | 
| 692 |     1. The label must be in Unicode Normalization Form NFC. | 
| 693 |     2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character | 
| 694 |        in both the third and fourth positions. | 
| 695 |     3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character. | 
| 696 |     4. The label must not contain a U+002E ( . ) FULL STOP. | 
| 697 |     5. The label must not begin with a combining mark, that is: General_Category=Mark. | 
| 698 |     6. Each code point in the label must only have certain status values according to Section 5, | 
| 699 |        IDNA Mapping Table: | 
| 700 |         1. For Transitional Processing, each value must be valid. | 
| 701 |         2. For Nontransitional Processing, each value must be either valid or deviation. | 
| 702 |     7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in The Unicode | 
| 703 |        Code Points and Internationalized Domain Names for Applications (IDNA). | 
| 704 |     8. If CheckBidi, and if the domain name is a  Bidi domain name, then the label must satisfy | 
| 705 |        all six of the numbered conditions in RFC 5893, Section 2. | 
| 706 |  | 
| 707 |     NOTE: Don't use QStringView for label, so that call to QString::normalized() can avoid | 
| 708 |           memory allocation when there is nothing to normalize. | 
| 709 | */ | 
| 710 | bool DomainValidityChecker::checkLabel(const QString &label, QUrl::AceProcessingOptions options) | 
| 711 | { | 
| 712 |     if (label.isEmpty()) | 
| 713 |         return true; | 
| 714 |  | 
| 715 |     if (label != label.normalized(mode: QString::NormalizationForm_C)) | 
| 716 |         return false; | 
| 717 |  | 
| 718 |     if (label.size() >= 4) { | 
| 719 |         // This assumes that the first two characters are in BMP, but that's ok | 
| 720 |         // because non-BMP characters are unlikely to be used for specifying | 
| 721 |         // future extensions. | 
| 722 |         if (label[2] == u'-' && label[3] == u'-') | 
| 723 |             return ignoreBidiErrors && label.startsWith(s: u"xn" ) && validateAsciiLabel(label); | 
| 724 |     } | 
| 725 |  | 
| 726 |     if (label.startsWith(c: u'-') || label.endsWith(c: u'-')) | 
| 727 |         return false; | 
| 728 |  | 
| 729 |     if (label.contains(c: u'.')) | 
| 730 |         return false; | 
| 731 |  | 
| 732 |     QStringIterator iter(label); | 
| 733 |     auto c = iter.next(); | 
| 734 |  | 
| 735 |     if (QChar::isMark(ucs4: c)) | 
| 736 |         return false; | 
| 737 |  | 
| 738 |     // As optimization, CONTEXTJ rules check can be skipped if no | 
| 739 |     // ZWJ/ZWNJ characters were found during the first pass. | 
| 740 |     bool hasJoiners = false; | 
| 741 |  | 
| 742 |     for (;;) { | 
| 743 |         hasJoiners = hasJoiners || c == ZWNJ || c == ZWJ; | 
| 744 |  | 
| 745 |         if (!ignoreBidiErrors && !domainNameIsBidi) { | 
| 746 |             switch (QChar::direction(ucs4: c)) { | 
| 747 |             case QChar::DirR: | 
| 748 |             case QChar::DirAL: | 
| 749 |             case QChar::DirAN: | 
| 750 |                 domainNameIsBidi = true; | 
| 751 |                 if (hadBidiErrors) | 
| 752 |                     return false; | 
| 753 |                 break; | 
| 754 |             default: | 
| 755 |                 break; | 
| 756 |             } | 
| 757 |         } | 
| 758 |  | 
| 759 |         switch (QUnicodeTables::idnaStatus(ucs4: c)) { | 
| 760 |         case QUnicodeTables::IdnaStatus::Valid: | 
| 761 |             break; | 
| 762 |         case QUnicodeTables::IdnaStatus::Deviation: | 
| 763 |             if (options.testFlag(flag: QUrl::AceTransitionalProcessing)) | 
| 764 |                 return false; | 
| 765 |             break; | 
| 766 |         default: | 
| 767 |             return false; | 
| 768 |         } | 
| 769 |  | 
| 770 |         if (!iter.hasNext()) | 
| 771 |             break; | 
| 772 |         c = iter.next(); | 
| 773 |     } | 
| 774 |  | 
| 775 |     if (hasJoiners && !checkContextJRules(label)) | 
| 776 |         return false; | 
| 777 |  | 
| 778 |     hadBidiErrors = hadBidiErrors || !checkBidiRules(label); | 
| 779 |  | 
| 780 |     if (domainNameIsBidi && hadBidiErrors) | 
| 781 |         return false; | 
| 782 |  | 
| 783 |     return true; | 
| 784 | } | 
| 785 |  | 
| 786 | static QString convertToAscii(QStringView normalizedDomain, AceLeadingDot dot) | 
| 787 | { | 
| 788 |     qsizetype lastIdx = 0; | 
| 789 |     QString aceForm; // this variable is here for caching | 
| 790 |     QString aceResult; | 
| 791 |  | 
| 792 |     while (true) { | 
| 793 |         qsizetype idx = normalizedDomain.indexOf(c: u'.', from: lastIdx); | 
| 794 |         if (idx == -1) | 
| 795 |             idx = normalizedDomain.size(); | 
| 796 |  | 
| 797 |         const qsizetype labelLength = idx - lastIdx; | 
| 798 |         if (labelLength) { | 
| 799 |             const auto label = normalizedDomain.sliced(pos: lastIdx, n: labelLength); | 
| 800 |             aceForm.clear(); | 
| 801 |             qt_punycodeEncoder(in: label, output: &aceForm); | 
| 802 |             if (aceForm.isEmpty()) | 
| 803 |                 return {}; | 
| 804 |  | 
| 805 |             aceResult.append(s: aceForm); | 
| 806 |         } | 
| 807 |  | 
| 808 |         if (idx == normalizedDomain.size()) | 
| 809 |             break; | 
| 810 |  | 
| 811 |         if (labelLength == 0 && (dot == ForbidLeadingDot || idx > 0)) | 
| 812 |             return {}; // two delimiters in a row -- empty label not allowed | 
| 813 |  | 
| 814 |         lastIdx = idx + 1; | 
| 815 |         aceResult += u'.'; | 
| 816 |     } | 
| 817 |  | 
| 818 |     return aceResult; | 
| 819 | } | 
| 820 |  | 
| 821 | static bool checkAsciiDomainName(QStringView normalizedDomain, AceLeadingDot dot, | 
| 822 |                                  bool *usesPunycode) | 
| 823 | { | 
| 824 |     qsizetype lastIdx = 0; | 
| 825 |     bool hasPunycode = false; | 
| 826 |     *usesPunycode = false; | 
| 827 |  | 
| 828 |     while (lastIdx < normalizedDomain.size()) { | 
| 829 |         auto idx = normalizedDomain.indexOf(c: u'.', from: lastIdx); | 
| 830 |         if (idx == -1) | 
| 831 |             idx = normalizedDomain.size(); | 
| 832 |  | 
| 833 |         const auto labelLength = idx - lastIdx; | 
| 834 |         if (labelLength == 0) { | 
| 835 |             if (idx == normalizedDomain.size()) | 
| 836 |                 break; | 
| 837 |             if (dot == ForbidLeadingDot || idx > 0) | 
| 838 |                 return false; // two delimiters in a row -- empty label not allowed | 
| 839 |         } else { | 
| 840 |             const auto label = normalizedDomain.sliced(pos: lastIdx, n: labelLength); | 
| 841 |             if (!validateAsciiLabel(label)) | 
| 842 |                 return false; | 
| 843 |  | 
| 844 |             hasPunycode = hasPunycode || label.startsWith(s: "xn--"_L1 ); | 
| 845 |         } | 
| 846 |  | 
| 847 |         lastIdx = idx + 1; | 
| 848 |     } | 
| 849 |  | 
| 850 |     *usesPunycode = hasPunycode; | 
| 851 |     return true; | 
| 852 | } | 
| 853 |  | 
| 854 | static QString convertToUnicode(const QString &asciiDomain, QUrl::AceProcessingOptions options) | 
| 855 | { | 
| 856 |     QString result; | 
| 857 |     result.reserve(asize: asciiDomain.size()); | 
| 858 |     qsizetype lastIdx = 0; | 
| 859 |  | 
| 860 |     DomainValidityChecker checker; | 
| 861 |  | 
| 862 |     while (true) { | 
| 863 |         auto idx = asciiDomain.indexOf(c: u'.', from: lastIdx); | 
| 864 |         if (idx == -1) | 
| 865 |             idx = asciiDomain.size(); | 
| 866 |  | 
| 867 |         const auto labelLength = idx - lastIdx; | 
| 868 |         if (labelLength == 0) { | 
| 869 |             if (idx == asciiDomain.size()) | 
| 870 |                 break; | 
| 871 |         } else { | 
| 872 |             const auto label = asciiDomain.sliced(pos: lastIdx, n: labelLength); | 
| 873 |             const auto unicodeLabel = qt_punycodeDecoder(pc: label); | 
| 874 |  | 
| 875 |             if (unicodeLabel.isEmpty()) | 
| 876 |                 return asciiDomain; | 
| 877 |  | 
| 878 |             if (!checker.checkLabel(label: unicodeLabel, options)) | 
| 879 |                 return asciiDomain; | 
| 880 |  | 
| 881 |             result.append(s: unicodeLabel); | 
| 882 |         } | 
| 883 |  | 
| 884 |         if (idx == asciiDomain.size()) | 
| 885 |             break; | 
| 886 |  | 
| 887 |         lastIdx = idx + 1; | 
| 888 |         result += u'.'; | 
| 889 |     } | 
| 890 |     return result; | 
| 891 | } | 
| 892 |  | 
| 893 | static bool checkUnicodeName(const QString &domainName, QUrl::AceProcessingOptions options) | 
| 894 | { | 
| 895 |     qsizetype lastIdx = 0; | 
| 896 |  | 
| 897 |     DomainValidityChecker checker(true); | 
| 898 |  | 
| 899 |     while (true) { | 
| 900 |         qsizetype idx = domainName.indexOf(c: u'.', from: lastIdx); | 
| 901 |         if (idx == -1) | 
| 902 |             idx = domainName.size(); | 
| 903 |  | 
| 904 |         const qsizetype labelLength = idx - lastIdx; | 
| 905 |         if (labelLength) { | 
| 906 |             const auto label = domainName.sliced(pos: lastIdx, n: labelLength); | 
| 907 |  | 
| 908 |             if (!checker.checkLabel(label, options)) | 
| 909 |                 return false; | 
| 910 |         } | 
| 911 |  | 
| 912 |         if (idx == domainName.size()) | 
| 913 |             break; | 
| 914 |  | 
| 915 |         lastIdx = idx + 1; | 
| 916 |     } | 
| 917 |     return true; | 
| 918 | } | 
| 919 |  | 
| 920 | QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot, | 
| 921 |                   QUrl::AceProcessingOptions options) | 
| 922 | { | 
| 923 |     if (domain.isEmpty()) | 
| 924 |         return {}; | 
| 925 |  | 
| 926 |     bool mappedToAscii; | 
| 927 |     const QString mapped = mapDomainName(in: domain, options, resultIsAscii: &mappedToAscii); | 
| 928 |     const QString normalized = | 
| 929 |             mappedToAscii ? mapped : mapped.normalized(mode: QString::NormalizationForm_C); | 
| 930 |  | 
| 931 |     if (normalized.isEmpty()) | 
| 932 |         return {}; | 
| 933 |  | 
| 934 |     if (!mappedToAscii && !checkUnicodeName(domainName: normalized, options)) | 
| 935 |         return {}; | 
| 936 |  | 
| 937 |     bool needsConversionToUnicode; | 
| 938 |     const QString aceResult = mappedToAscii ? normalized : convertToAscii(normalizedDomain: normalized, dot); | 
| 939 |     if (aceResult.isEmpty() || !checkAsciiDomainName(normalizedDomain: aceResult, dot, usesPunycode: &needsConversionToUnicode)) | 
| 940 |         return {}; | 
| 941 |  | 
| 942 |     if (op == ToAceOnly || !needsConversionToUnicode | 
| 943 |         || (!options.testFlag(flag: QUrl::IgnoreIDNWhitelist) && !qt_is_idn_enabled(aceDomain: aceResult))) { | 
| 944 |         return aceResult; | 
| 945 |     } | 
| 946 |  | 
| 947 |     return convertToUnicode(asciiDomain: aceResult, options); | 
| 948 | } | 
| 949 |  | 
| 950 | /*! | 
| 951 |     \since 4.2 | 
| 952 |  | 
| 953 |     Returns the current whitelist of top-level domains that are allowed | 
| 954 |     to have non-ASCII characters in their compositions. | 
| 955 |  | 
| 956 |     See setIdnWhitelist() for the rationale of this list. | 
| 957 |  | 
| 958 |     \sa AceProcessingOption | 
| 959 | */ | 
| 960 | QStringList QUrl::idnWhitelist() | 
| 961 | { | 
| 962 |     if (user_idn_whitelist) | 
| 963 |         return *user_idn_whitelist; | 
| 964 |     static const QStringList list = [] { | 
| 965 |         QStringList list; | 
| 966 |         list.reserve(asize: idn_whitelist.count()); | 
| 967 |         int i = 0; | 
| 968 |         while (i < idn_whitelist.count()) { | 
| 969 |             list << QLatin1StringView(idn_whitelist.at(index: i)); | 
| 970 |             ++i; | 
| 971 |         } | 
| 972 |         return list; | 
| 973 |     }(); | 
| 974 |     return list; | 
| 975 | } | 
| 976 |  | 
| 977 | /*! | 
| 978 |     \since 4.2 | 
| 979 |  | 
| 980 |     Sets the whitelist of Top-Level Domains (TLDs) that are allowed to have | 
| 981 |     non-ASCII characters in domains to the value of \a list. | 
| 982 |  | 
| 983 |     Note that if you call this function, you need to do so \e before | 
| 984 |     you start any threads that might access idnWhitelist(). | 
| 985 |  | 
| 986 |     Qt comes with a default list that contains the Internet top-level domains | 
| 987 |     that have published support for Internationalized Domain Names (IDNs) | 
| 988 |     and rules to guarantee that no deception can happen between similarly-looking | 
| 989 |     characters (such as the Latin lowercase letter \c 'a' and the Cyrillic | 
| 990 |     equivalent, which in most fonts are visually identical). | 
| 991 |  | 
| 992 |     This list is periodically maintained, as registrars publish new rules. | 
| 993 |  | 
| 994 |     This function is provided for those who need to manipulate the list, in | 
| 995 |     order to add or remove a TLD. It is not recommended to change its value | 
| 996 |     for purposes other than testing, as it may expose users to security risks. | 
| 997 | */ | 
| 998 | void QUrl::setIdnWhitelist(const QStringList &list) | 
| 999 | { | 
| 1000 |     if (!user_idn_whitelist) | 
| 1001 |         user_idn_whitelist = new QStringList; | 
| 1002 |     *user_idn_whitelist = list; | 
| 1003 | } | 
| 1004 |  | 
| 1005 | QT_END_NAMESPACE | 
| 1006 |  |