| 1 | // Copyright (C) 2016 Intel Corporation. |
| 2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
| 3 | // Qt-Security score:critical reason:data-parser |
| 4 | |
| 5 | #include "qurl.h" |
| 6 | #include "private/qstringconverter_p.h" |
| 7 | #include "private/qtools_p.h" |
| 8 | #include "private/qsimd_p.h" |
| 9 | |
| 10 | QT_BEGIN_NAMESPACE |
| 11 | |
| 12 | // ### move to qurl_p.h |
// Action stored per character in the action tables used by recode().
// The numeric values are written literally in those tables and are combined
// with bit masks by maskTable(), so they must stay exactly as they are.
enum EncodingAction {
    // a percent-encoded form is decoded; a literal character is left alone
    DecodeCharacter = 0,
    // the character is kept in whichever form it already has
    LeaveCharacter = 1,
    // a literal character is percent-encoded; an encoded form is left alone
    EncodeCharacter = 2
};
| 18 | |
| 19 | // From RFC 3896, Appendix A Collected ABNF for URI |
| 20 | // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" |
| 21 | // reserved = gen-delims / sub-delims |
| 22 | // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" |
| 23 | // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" |
| 24 | // / "*" / "+" / "," / ";" / "=" |
| 25 | static const uchar defaultActionTable[96] = { |
| 26 | 0, // space |
| 27 | 1, // '!' (sub-delim) |
| 28 | 2, // '"' |
| 29 | 1, // '#' (gen-delim) |
| 30 | 1, // '$' (gen-delim) |
| 31 | 2, // '%' (percent) |
| 32 | 1, // '&' (gen-delim) |
| 33 | 1, // "'" (sub-delim) |
| 34 | 1, // '(' (sub-delim) |
| 35 | 1, // ')' (sub-delim) |
| 36 | 1, // '*' (sub-delim) |
| 37 | 1, // '+' (sub-delim) |
| 38 | 1, // ',' (sub-delim) |
| 39 | 0, // '-' (unreserved) |
| 40 | 0, // '.' (unreserved) |
| 41 | 1, // '/' (gen-delim) |
| 42 | |
| 43 | 0, 0, 0, 0, 0, // '0' to '4' (unreserved) |
| 44 | 0, 0, 0, 0, 0, // '5' to '9' (unreserved) |
| 45 | 1, // ':' (gen-delim) |
| 46 | 1, // ';' (sub-delim) |
| 47 | 2, // '<' |
| 48 | 1, // '=' (sub-delim) |
| 49 | 2, // '>' |
| 50 | 1, // '?' (gen-delim) |
| 51 | |
| 52 | 1, // '@' (gen-delim) |
| 53 | 0, 0, 0, 0, 0, // 'A' to 'E' (unreserved) |
| 54 | 0, 0, 0, 0, 0, // 'F' to 'J' (unreserved) |
| 55 | 0, 0, 0, 0, 0, // 'K' to 'O' (unreserved) |
| 56 | 0, 0, 0, 0, 0, // 'P' to 'T' (unreserved) |
| 57 | 0, 0, 0, 0, 0, 0, // 'U' to 'Z' (unreserved) |
| 58 | 1, // '[' (gen-delim) |
| 59 | 2, // '\' |
| 60 | 1, // ']' (gen-delim) |
| 61 | 2, // '^' |
| 62 | 0, // '_' (unreserved) |
| 63 | |
| 64 | 2, // '`' |
| 65 | 0, 0, 0, 0, 0, // 'a' to 'e' (unreserved) |
| 66 | 0, 0, 0, 0, 0, // 'f' to 'j' (unreserved) |
| 67 | 0, 0, 0, 0, 0, // 'k' to 'o' (unreserved) |
| 68 | 0, 0, 0, 0, 0, // 'p' to 't' (unreserved) |
| 69 | 0, 0, 0, 0, 0, 0, // 'u' to 'z' (unreserved) |
| 70 | 2, // '{' |
| 71 | 2, // '|' |
| 72 | 2, // '}' |
| 73 | 0, // '~' (unreserved) |
| 74 | |
| 75 | 2 // BSKP |
| 76 | }; |
| 77 | |
| 78 | // mask tables, in negative polarity |
| 79 | // 0x00 if it belongs to this category |
| 80 | // 0xff if it doesn't |
| 81 | |
| 82 | static const uchar reservedMask[96] = { |
| 83 | 0xff, // space |
| 84 | 0xff, // '!' (sub-delim) |
| 85 | 0x00, // '"' |
| 86 | 0xff, // '#' (gen-delim) |
| 87 | 0xff, // '$' (gen-delim) |
| 88 | 0xff, // '%' (percent) |
| 89 | 0xff, // '&' (gen-delim) |
| 90 | 0xff, // "'" (sub-delim) |
| 91 | 0xff, // '(' (sub-delim) |
| 92 | 0xff, // ')' (sub-delim) |
| 93 | 0xff, // '*' (sub-delim) |
| 94 | 0xff, // '+' (sub-delim) |
| 95 | 0xff, // ',' (sub-delim) |
| 96 | 0xff, // '-' (unreserved) |
| 97 | 0xff, // '.' (unreserved) |
| 98 | 0xff, // '/' (gen-delim) |
| 99 | |
| 100 | 0xff, 0xff, 0xff, 0xff, 0xff, // '0' to '4' (unreserved) |
| 101 | 0xff, 0xff, 0xff, 0xff, 0xff, // '5' to '9' (unreserved) |
| 102 | 0xff, // ':' (gen-delim) |
| 103 | 0xff, // ';' (sub-delim) |
| 104 | 0x00, // '<' |
| 105 | 0xff, // '=' (sub-delim) |
| 106 | 0x00, // '>' |
| 107 | 0xff, // '?' (gen-delim) |
| 108 | |
| 109 | 0xff, // '@' (gen-delim) |
| 110 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'A' to 'E' (unreserved) |
| 111 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'F' to 'J' (unreserved) |
| 112 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'K' to 'O' (unreserved) |
| 113 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'P' to 'T' (unreserved) |
| 114 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'U' to 'Z' (unreserved) |
| 115 | 0xff, // '[' (gen-delim) |
| 116 | 0x00, // '\' |
| 117 | 0xff, // ']' (gen-delim) |
| 118 | 0x00, // '^' |
| 119 | 0xff, // '_' (unreserved) |
| 120 | |
| 121 | 0x00, // '`' |
| 122 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'a' to 'e' (unreserved) |
| 123 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'f' to 'j' (unreserved) |
| 124 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'k' to 'o' (unreserved) |
| 125 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'p' to 't' (unreserved) |
| 126 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'u' to 'z' (unreserved) |
| 127 | 0x00, // '{' |
| 128 | 0x00, // '|' |
| 129 | 0x00, // '}' |
| 130 | 0xff, // '~' (unreserved) |
| 131 | |
| 132 | 0xff // BSKP |
| 133 | }; |
| 134 | |
// Returns true if c is an ASCII hexadecimal digit: 0-9, A-F or a-f.
static inline bool isHex(char16_t c)
{
    if (c >= u'0' && c <= u'9')
        return true;
    if (c >= u'A' && c <= u'F')
        return true;
    return c >= u'a' && c <= u'f';
}
| 139 | |
// Returns true if the hex digit c needs no uppercasing, that is, it is a
// decimal digit or one of 'A' to 'F'.
// The answer is only meaningful when c is a hex digit; callers must have
// checked that already.
static inline bool isUpperHex(char16_t c)
{
    // among the hex digits, only 'a' to 'f' sit at or above 0x60
    constexpr char16_t firstLowercaseColumn = 0x60;
    return !(c >= firstLowercaseColumn);
}
| 145 | |
| 146 | static inline char16_t toUpperHex(char16_t c) |
| 147 | { |
| 148 | return isUpperHex(c) ? c : c - 0x20; |
| 149 | } |
| 150 | |
// Converts the hex digit c to its numeric value (0 to 15).
// No validation is done: c must already be known to be a hex digit.
static inline ushort decodeNibble(char16_t c)
{
    if (c >= u'a')
        return c - u'a' + 0xA;
    if (c >= u'A')
        return c - u'A' + 0xA;
    return c - u'0';
}
| 155 | |
| 156 | // if the sequence at input is 2*HEXDIG, returns its decoding |
| 157 | // returns -1 if it isn't. |
| 158 | // assumes that the range has been checked already |
| 159 | static inline char16_t decodePercentEncoding(const char16_t *input) |
| 160 | { |
| 161 | char16_t c1 = input[1]; |
| 162 | char16_t c2 = input[2]; |
| 163 | if (!isHex(c: c1) || !isHex(c: c2)) |
| 164 | return char16_t(-1); |
| 165 | return decodeNibble(c: c1) << 4 | decodeNibble(c: c2); |
| 166 | } |
| 167 | |
| 168 | static inline char16_t encodeNibble(ushort c) |
| 169 | { |
| 170 | return QtMiscUtils::toHexUpper(value: c); |
| 171 | } |
| 172 | |
| 173 | static void ensureDetached(QString &result, char16_t *&output, const char16_t *begin, const char16_t *input, const char16_t *end, |
| 174 | int add = 0) |
| 175 | { |
| 176 | if (!output) { |
| 177 | // now detach |
| 178 | // create enough space if the rest of the string needed to be percent-encoded |
| 179 | int charsProcessed = input - begin; |
| 180 | int charsRemaining = end - input; |
| 181 | int spaceNeeded = end - begin + 2 * charsRemaining + add; |
| 182 | int origSize = result.size(); |
| 183 | result.resize(size: origSize + spaceNeeded); |
| 184 | |
| 185 | // we know that resize() above detached, so we bypass the reference count check |
| 186 | output = const_cast<char16_t *>(reinterpret_cast<const char16_t *>(result.constData())) |
| 187 | + origSize; |
| 188 | |
| 189 | // copy the chars we've already processed |
| 190 | int i; |
| 191 | for (i = 0; i < charsProcessed; ++i) |
| 192 | output[i] = begin[i]; |
| 193 | output += i; |
| 194 | } |
| 195 | } |
| 196 | |
| 197 | namespace { |
| 198 | struct QUrlUtf8Traits : public QUtf8BaseTraitsNoAscii |
| 199 | { |
| 200 | // From RFC 3987: |
| 201 | // iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar |
| 202 | // |
| 203 | // ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF |
| 204 | // / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD |
| 205 | // / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD |
| 206 | // / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD |
| 207 | // / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD |
| 208 | // / %xD0000-DFFFD / %xE1000-EFFFD |
| 209 | // |
| 210 | // iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD |
| 211 | // |
| 212 | // That RFC allows iprivate only as part of iquery, but we don't know here |
| 213 | // whether we're looking at a query or another part of an URI, so we accept |
| 214 | // them too. The definition above excludes U+FFF0 to U+FFFD from appearing |
| 215 | // unencoded, but we see no reason for its exclusion, so we allow them to |
| 216 | // be decoded (and we need U+FFFD the replacement character to indicate |
| 217 | // failure to decode). |
| 218 | // |
| 219 | // That means we must disallow: |
| 220 | // * unpaired surrogates (QUtf8Functions takes care of that for us) |
| 221 | // * non-characters |
| 222 | static const bool allowNonCharacters = false; |
| 223 | |
| 224 | // override: our "bytes" are three percent-encoded UTF-16 characters |
| 225 | static void appendByte(char16_t *&ptr, uchar b) |
| 226 | { |
| 227 | // b >= 0x80, by construction, so percent-encode |
| 228 | *ptr++ = '%'; |
| 229 | *ptr++ = encodeNibble(c: b >> 4); |
| 230 | *ptr++ = encodeNibble(c: b & 0xf); |
| 231 | } |
| 232 | |
| 233 | static uchar peekByte(const char16_t *ptr, qsizetype n = 0) |
| 234 | { |
| 235 | // decodePercentEncoding returns char16_t(-1) if it can't decode, |
| 236 | // which means we return 0xff, which is not a valid continuation byte. |
| 237 | // If ptr[i * 3] is not '%', we'll multiply by zero and return 0, |
| 238 | // also not a valid continuation byte (if it's '%', we multiply by 1). |
| 239 | return uchar(decodePercentEncoding(input: ptr + n * 3)) |
| 240 | * uchar(ptr[n * 3] == '%'); |
| 241 | } |
| 242 | |
| 243 | static qptrdiff availableBytes(const char16_t *ptr, const char16_t *end) |
| 244 | { |
| 245 | return (end - ptr) / 3; |
| 246 | } |
| 247 | |
| 248 | static void advanceByte(const char16_t *&ptr, int n = 1) |
| 249 | { |
| 250 | ptr += n * 3; |
| 251 | } |
| 252 | }; |
| 253 | } |
| 254 | |
| 255 | // returns true if we performed an UTF-8 decoding |
| 256 | static bool encodedUtf8ToUtf16(QString &result, char16_t *&output, const char16_t *begin, |
| 257 | const char16_t *&input, const char16_t *end, char16_t decoded) |
| 258 | { |
| 259 | char32_t buffer[1]; |
| 260 | char32_t &ucs4 = buffer[0]; |
| 261 | char32_t *dst = buffer; |
| 262 | const char16_t *src = input + 3;// skip the %XX that yielded \a decoded |
| 263 | int charsNeeded = QUtf8Functions::fromUtf8<QUrlUtf8Traits>(b: decoded, dst, src, end); |
| 264 | if (charsNeeded < 0) |
| 265 | return false; |
| 266 | |
| 267 | if (!QChar::requiresSurrogates(ucs4)) { |
| 268 | // UTF-8 decoded and no surrogates are required |
| 269 | // detach if necessary |
| 270 | // possibilities are: 6 chars (%XX%XX) -> one char; 9 chars (%XX%XX%XX) -> one char |
| 271 | ensureDetached(result, output, begin, input, end, add: -3 * charsNeeded + 1); |
| 272 | *output++ = ucs4; |
| 273 | } else { |
| 274 | // UTF-8 decoded to something that requires a surrogate pair |
| 275 | // compressing from %XX%XX%XX%XX (12 chars) to two |
| 276 | ensureDetached(result, output, begin, input, end, add: -10); |
| 277 | *output++ = QChar::highSurrogate(ucs4); |
| 278 | *output++ = QChar::lowSurrogate(ucs4); |
| 279 | } |
| 280 | |
| 281 | input = src - 1; |
| 282 | return true; |
| 283 | } |
| 284 | |
| 285 | static void unicodeToEncodedUtf8(QString &result, char16_t *&output, const char16_t *begin, |
| 286 | const char16_t *&input, const char16_t *end, char16_t decoded) |
| 287 | { |
| 288 | // calculate the utf8 length and ensure enough space is available |
| 289 | int utf8len = QChar::isHighSurrogate(ucs4: decoded) ? 4 : decoded >= 0x800 ? 3 : 2; |
| 290 | |
| 291 | // detach |
| 292 | if (!output) { |
| 293 | // we need 3 * utf8len for the encoded UTF-8 sequence |
| 294 | // but ensureDetached already adds 3 for the char we're processing |
| 295 | ensureDetached(result, output, begin, input, end, add: 3*utf8len - 3); |
| 296 | } else { |
| 297 | // verify that there's enough space or expand |
| 298 | int charsRemaining = end - input - 1; // not including this one |
| 299 | int pos = output - reinterpret_cast<const char16_t *>(result.constData()); |
| 300 | int spaceRemaining = result.size() - pos; |
| 301 | if (spaceRemaining < 3*charsRemaining + 3*utf8len) { |
| 302 | // must resize |
| 303 | result.resize(size: result.size() + 3*utf8len); |
| 304 | |
| 305 | // we know that resize() above detached, so we bypass the reference count check |
| 306 | output = const_cast<char16_t *>(reinterpret_cast<const char16_t *>(result.constData())); |
| 307 | output += pos; |
| 308 | } |
| 309 | } |
| 310 | |
| 311 | ++input; |
| 312 | int res = QUtf8Functions::toUtf8<QUrlUtf8Traits>(u: decoded, dst&: output, src&: input, end); |
| 313 | --input; |
| 314 | if (res < 0) { |
| 315 | // bad surrogate pair sequence |
| 316 | // we will encode bad UTF-16 to UTF-8 |
| 317 | // but they don't get decoded back |
| 318 | |
| 319 | // first of three bytes |
| 320 | uchar c = 0xe0 | uchar(decoded >> 12); |
| 321 | *output++ = '%'; |
| 322 | *output++ = 'E'; |
| 323 | *output++ = encodeNibble(c: c & 0xf); |
| 324 | |
| 325 | // second byte |
| 326 | c = 0x80 | (uchar(decoded >> 6) & 0x3f); |
| 327 | *output++ = '%'; |
| 328 | *output++ = encodeNibble(c: c >> 4); |
| 329 | *output++ = encodeNibble(c: c & 0xf); |
| 330 | |
| 331 | // third byte |
| 332 | c = 0x80 | (decoded & 0x3f); |
| 333 | *output++ = '%'; |
| 334 | *output++ = encodeNibble(c: c >> 4); |
| 335 | *output++ = encodeNibble(c: c & 0xf); |
| 336 | } |
| 337 | } |
| 338 | |
| 339 | static int recode(QString &result, const char16_t *begin, const char16_t *end, |
| 340 | QUrl::ComponentFormattingOptions encoding, const uchar *actionTable, |
| 341 | bool retryBadEncoding) |
| 342 | { |
| 343 | const int origSize = result.size(); |
| 344 | const char16_t *input = begin; |
| 345 | char16_t *output = nullptr; |
| 346 | |
| 347 | EncodingAction action = EncodeCharacter; |
| 348 | for ( ; input != end; ++input) { |
| 349 | char16_t c; |
| 350 | // try a run where no change is necessary |
| 351 | for ( ; input != end; ++input) { |
| 352 | c = *input; |
| 353 | if (c < 0x20U) |
| 354 | action = EncodeCharacter; |
| 355 | if (c < 0x20U || c >= 0x80U) // also: (c - 0x20 < 0x60U) |
| 356 | goto non_trivial; |
| 357 | action = EncodingAction(actionTable[c - ' ']); |
| 358 | if (action == EncodeCharacter) |
| 359 | goto non_trivial; |
| 360 | if (output) |
| 361 | *output++ = c; |
| 362 | } |
| 363 | break; |
| 364 | |
| 365 | non_trivial: |
| 366 | char16_t decoded; |
| 367 | if (c == '%' && retryBadEncoding) { |
| 368 | // always write "%25" |
| 369 | ensureDetached(result, output, begin, input, end); |
| 370 | *output++ = '%'; |
| 371 | *output++ = '2'; |
| 372 | *output++ = '5'; |
| 373 | continue; |
| 374 | } else if (c == '%') { |
| 375 | // check if the input is valid |
| 376 | if (input + 2 >= end || (decoded = decodePercentEncoding(input)) == char16_t(-1)) { |
| 377 | // not valid, retry |
| 378 | result.resize(size: origSize); |
| 379 | return recode(result, begin, end, encoding, actionTable, retryBadEncoding: true); |
| 380 | } |
| 381 | |
| 382 | if (decoded >= 0x80) { |
| 383 | // decode the UTF-8 sequence |
| 384 | if (!(encoding & QUrl::EncodeUnicode) && |
| 385 | encodedUtf8ToUtf16(result, output, begin, input, end, decoded)) |
| 386 | continue; |
| 387 | |
| 388 | // decoding the encoded UTF-8 failed |
| 389 | action = LeaveCharacter; |
| 390 | } else if (decoded >= 0x20) { |
| 391 | action = EncodingAction(actionTable[decoded - ' ']); |
| 392 | } |
| 393 | } else { |
| 394 | decoded = c; |
| 395 | if (decoded >= 0x80 && encoding & QUrl::EncodeUnicode) { |
| 396 | // encode the UTF-8 sequence |
| 397 | unicodeToEncodedUtf8(result, output, begin, input, end, decoded); |
| 398 | continue; |
| 399 | } else if (decoded >= 0x80) { |
| 400 | if (output) |
| 401 | *output++ = c; |
| 402 | continue; |
| 403 | } |
| 404 | } |
| 405 | |
| 406 | // there are six possibilities: |
| 407 | // current \ action | DecodeCharacter | LeaveCharacter | EncodeCharacter |
| 408 | // decoded | 1:leave | 2:leave | 3:encode |
| 409 | // encoded | 4:decode | 5:leave | 6:leave |
| 410 | // cases 1 and 2 were handled before this section |
| 411 | |
| 412 | if (c == '%' && action != DecodeCharacter) { |
| 413 | // cases 5 and 6: it's encoded and we're leaving it as it is |
| 414 | // except we're pedantic and we'll uppercase the hex |
| 415 | if (output || !isUpperHex(c: input[1]) || !isUpperHex(c: input[2])) { |
| 416 | ensureDetached(result, output, begin, input, end); |
| 417 | *output++ = '%'; |
| 418 | *output++ = toUpperHex(c: *++input); |
| 419 | *output++ = toUpperHex(c: *++input); |
| 420 | } |
| 421 | } else if (c == '%' && action == DecodeCharacter) { |
| 422 | // case 4: we need to decode |
| 423 | ensureDetached(result, output, begin, input, end); |
| 424 | *output++ = decoded; |
| 425 | input += 2; |
| 426 | } else { |
| 427 | // must be case 3: we need to encode |
| 428 | ensureDetached(result, output, begin, input, end); |
| 429 | *output++ = '%'; |
| 430 | *output++ = encodeNibble(c: c >> 4); |
| 431 | *output++ = encodeNibble(c: c & 0xf); |
| 432 | } |
| 433 | } |
| 434 | |
| 435 | if (output) { |
| 436 | int len = output - reinterpret_cast<const char16_t *>(result.constData()); |
| 437 | result.truncate(pos: len); |
| 438 | return len - origSize; |
| 439 | } |
| 440 | return 0; |
| 441 | } |
| 442 | |
| 443 | /* |
| 444 | * Returns true if the input it checked (if it checked anything) is not |
| 445 | * encoded. A return of false indicates there's a percent at \a input that |
| 446 | * needs to be decoded. |
| 447 | */ |
| 448 | #ifdef __SSE2__ |
| 449 | static bool simdCheckNonEncoded(QChar *&output, const char16_t *&input, const char16_t *end) |
| 450 | { |
| 451 | # ifdef __AVX2__ |
| 452 | const __m256i percents256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128('%')); |
| 453 | const __m128i percents = _mm256_castsi256_si128(percents256); |
| 454 | # else |
| 455 | const __m128i percents = _mm_set1_epi16(w: '%'); |
| 456 | # endif |
| 457 | |
| 458 | uint idx = 0; |
| 459 | quint32 mask = 0; |
| 460 | if (input + 16 <= end) { |
| 461 | qptrdiff offset = 0; |
| 462 | for ( ; input + offset + 16 <= end; offset += 16) { |
| 463 | # ifdef __AVX2__ |
| 464 | // do 32 bytes at a time using AVX2 |
| 465 | __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(input + offset)); |
| 466 | __m256i comparison = _mm256_cmpeq_epi16(data, percents256); |
| 467 | mask = _mm256_movemask_epi8(comparison); |
| 468 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + offset), data); |
| 469 | # else |
| 470 | // do 32 bytes at a time using unrolled SSE2 |
| 471 | __m128i data1 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset)); |
| 472 | __m128i data2 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset + 8)); |
| 473 | __m128i comparison1 = _mm_cmpeq_epi16(a: data1, b: percents); |
| 474 | __m128i comparison2 = _mm_cmpeq_epi16(a: data2, b: percents); |
| 475 | uint mask1 = _mm_movemask_epi8(a: comparison1); |
| 476 | uint mask2 = _mm_movemask_epi8(a: comparison2); |
| 477 | |
| 478 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset), b: data1); |
| 479 | if (!mask1) |
| 480 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset + 8), b: data2); |
| 481 | mask = mask1 | (mask2 << 16); |
| 482 | # endif |
| 483 | |
| 484 | if (mask) { |
| 485 | idx = qCountTrailingZeroBits(v: mask) / 2; |
| 486 | break; |
| 487 | } |
| 488 | } |
| 489 | |
| 490 | input += offset; |
| 491 | if (output) |
| 492 | output += offset; |
| 493 | } else if (input + 8 <= end) { |
| 494 | // do 16 bytes at a time |
| 495 | __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input)); |
| 496 | __m128i comparison = _mm_cmpeq_epi16(a: data, b: percents); |
| 497 | mask = _mm_movemask_epi8(a: comparison); |
| 498 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output), b: data); |
| 499 | idx = qCountTrailingZeroBits(v: quint16(mask)) / 2; |
| 500 | } else if (input + 4 <= end) { |
| 501 | // do 8 bytes only |
| 502 | __m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(input)); |
| 503 | __m128i comparison = _mm_cmpeq_epi16(a: data, b: percents); |
| 504 | mask = _mm_movemask_epi8(a: comparison) & 0xffu; |
| 505 | _mm_storel_epi64(p: reinterpret_cast<__m128i *>(output), a: data); |
| 506 | idx = qCountTrailingZeroBits(v: quint8(mask)) / 2; |
| 507 | } else { |
| 508 | // no percents found (because we didn't check) |
| 509 | return true; |
| 510 | } |
| 511 | |
| 512 | // advance to the next non-encoded |
| 513 | input += idx; |
| 514 | output += idx; |
| 515 | |
| 516 | return !mask; |
| 517 | } |
| 518 | #else |
| 519 | static bool simdCheckNonEncoded(...) |
| 520 | { |
| 521 | return true; |
| 522 | } |
| 523 | #endif |
| 524 | |
| 525 | /*! |
| 526 | \since 5.0 |
| 527 | \internal |
| 528 | |
| 529 | This function decodes a percent-encoded string located in \a in |
| 530 | by appending each character to \a appendTo. It returns the number of |
| 531 | characters appended. Each percent-encoded sequence is decoded as follows: |
| 532 | |
| 533 | \list |
| 534 | \li from %00 to %7F: the exact decoded value is appended; |
| 535 | \li from %80 to %FF: QChar::ReplacementCharacter is appended; |
| 536 | \li bad encoding: original input is copied to the output, undecoded. |
| 537 | \endlist |
| 538 | |
| 539 | Given the above, it's important for the input to already have all UTF-8 |
| 540 | percent sequences decoded by qt_urlRecode (that is, the input should not |
| 541 | have been processed with QUrl::EncodeUnicode). |
| 542 | |
| 543 | The input should also be a valid percent-encoded sequence (the output of |
| 544 | qt_urlRecode is always valid). |
| 545 | */ |
| 546 | static qsizetype decode(QString &appendTo, QStringView in) |
| 547 | { |
| 548 | const char16_t *begin = in.utf16(); |
| 549 | const char16_t *end = begin + in.size(); |
| 550 | |
| 551 | // fast check whether there's anything to be decoded in the first place |
| 552 | const char16_t *input = QtPrivate::qustrchr(str: in, ch: '%'); |
| 553 | |
| 554 | if (Q_LIKELY(input == end)) |
| 555 | return 0; // nothing to do, it was already decoded! |
| 556 | |
| 557 | // detach |
| 558 | const int origSize = appendTo.size(); |
| 559 | appendTo.resize(size: origSize + (end - begin)); |
| 560 | QChar *output = appendTo.data() + origSize; |
| 561 | memcpy(dest: static_cast<void *>(output), src: static_cast<const void *>(begin), n: (input - begin) * sizeof(QChar)); |
| 562 | output += input - begin; |
| 563 | |
| 564 | while (input != end) { |
| 565 | // something was encoded |
| 566 | Q_ASSERT(*input == '%'); |
| 567 | |
| 568 | if (Q_UNLIKELY(end - input < 3 || !isHex(input[1]) || !isHex(input[2]))) { |
| 569 | // badly-encoded data |
| 570 | appendTo.resize(size: origSize + (end - begin)); |
| 571 | memcpy(dest: static_cast<void *>(appendTo.begin() + origSize), |
| 572 | src: static_cast<const void *>(begin), n: (end - begin) * sizeof(*end)); |
| 573 | return end - begin; |
| 574 | } |
| 575 | |
| 576 | ++input; |
| 577 | *output++ = QChar::fromUcs2(c: decodeNibble(c: input[0]) << 4 | decodeNibble(c: input[1])); |
| 578 | if (output[-1].unicode() >= 0x80) |
| 579 | output[-1] = QChar::ReplacementCharacter; |
| 580 | input += 2; |
| 581 | |
| 582 | // search for the next percent, copying from input to output |
| 583 | if (simdCheckNonEncoded(output, input, end)) { |
| 584 | while (input != end) { |
| 585 | const char16_t uc = *input; |
| 586 | if (uc == '%') |
| 587 | break; |
| 588 | *output++ = uc; |
| 589 | ++input; |
| 590 | } |
| 591 | } |
| 592 | } |
| 593 | |
| 594 | const qsizetype len = output - appendTo.begin(); |
| 595 | appendTo.truncate(pos: len); |
| 596 | return len - origSize; |
| 597 | } |
| 598 | |
| 599 | template <size_t N> |
| 600 | static void maskTable(uchar (&table)[N], const uchar (&mask)[N]) |
| 601 | { |
| 602 | for (size_t i = 0; i < N; ++i) |
| 603 | table[i] &= mask[i]; |
| 604 | } |
| 605 | |
| 606 | /*! |
| 607 | \internal |
| 608 | |
| 609 | Recodes the string from \a begin to \a end. If any transformations are |
| 610 | done, append them to \a appendTo and return the number of characters added. |
| 611 | If no transformations were required, return 0. |
| 612 | |
| 613 | The \a encoding option modifies the default behaviour: |
| 614 | \list |
| 615 | \li QUrl::DecodeReserved: if set, reserved characters will be decoded; |
| 616 | if unset, reserved characters will be encoded |
| 617 | \li QUrl::EncodeSpaces: if set, spaces will be encoded to "%20"; if unset, they will be " " |
| 618 | \li QUrl::EncodeUnicode: if set, characters above U+0080 will be encoded to their UTF-8 |
| 619 | percent-encoded form; if unset, they will be decoded to UTF-16 |
| 620 | \li QUrl::FullyDecoded: if set, this function will decode all percent-encoded sequences, |
| 621 | including that of the percent character. The resulting string |
| 622 | will not be percent-encoded anymore. Use with caution! |
| 623 | In this mode, the behaviour is undefined if the input string |
| 624 | contains any percent-encoding sequences above %80. |
| 625 | Also, the function will not correct bad % sequences. |
| 626 | \endlist |
| 627 | |
| 628 | Other flags are ignored (including QUrl::EncodeReserved). |
| 629 | |
| 630 | The \a tableModifications argument can be used to supply extra |
| 631 | modifications to the tables, to be applied after the flags above are |
| 632 | handled. It consists of a sequence of 16-bit values, where the low 8 bits |
| 633 | indicate the character in question and the high 8 bits are either \c |
| 634 | EncodeCharacter, \c LeaveCharacter or \c DecodeCharacter. |
| 635 | |
| 636 | This function corrects percent-encoded errors by interpreting every '%' as |
| 637 | meaning "%25" (all percents in the same content). |
| 638 | */ |
| 639 | |
| 640 | Q_AUTOTEST_EXPORT qsizetype |
| 641 | qt_urlRecode(QString &appendTo, QStringView in, |
| 642 | QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications) |
| 643 | { |
| 644 | uchar actionTable[sizeof defaultActionTable]; |
| 645 | if ((encoding & QUrl::FullyDecoded) == QUrl::FullyDecoded) { |
| 646 | return decode(appendTo, in); |
| 647 | } |
| 648 | |
| 649 | memcpy(dest: actionTable, src: defaultActionTable, n: sizeof actionTable); |
| 650 | if (encoding & QUrl::DecodeReserved) |
| 651 | maskTable(table&: actionTable, mask: reservedMask); |
| 652 | if (encoding & QUrl::EncodeSpaces) |
| 653 | actionTable[0] = EncodeCharacter; |
| 654 | |
| 655 | if (tableModifications) { |
| 656 | for (const ushort *p = tableModifications; *p; ++p) |
| 657 | actionTable[uchar(*p) - ' '] = *p >> 8; |
| 658 | } |
| 659 | |
| 660 | return recode(result&: appendTo, begin: reinterpret_cast<const char16_t *>(in.begin()), |
| 661 | end: reinterpret_cast<const char16_t *>(in.end()), encoding, actionTable, retryBadEncoding: false); |
| 662 | } |
| 663 | |
| 664 | qsizetype qt_encodeFromUser(QString &appendTo, const QString &in, const ushort *tableModifications) |
| 665 | { |
| 666 | uchar actionTable[sizeof defaultActionTable]; |
| 667 | memcpy(dest: actionTable, src: defaultActionTable, n: sizeof actionTable); |
| 668 | |
| 669 | // Different defaults to the regular encoded-to-encoded recoding |
| 670 | actionTable['[' - ' '] = EncodeCharacter; |
| 671 | actionTable[']' - ' '] = EncodeCharacter; |
| 672 | |
| 673 | if (tableModifications) { |
| 674 | for (const ushort *p = tableModifications; *p; ++p) |
| 675 | actionTable[uchar(*p) - ' '] = *p >> 8; |
| 676 | } |
| 677 | |
| 678 | return recode(result&: appendTo, begin: reinterpret_cast<const char16_t *>(in.begin()), |
| 679 | end: reinterpret_cast<const char16_t *>(in.end()), encoding: {}, actionTable, retryBadEncoding: true); |
| 680 | } |
| 681 | |
| 682 | QT_END_NAMESPACE |
| 683 | |