/****************************************************************************
**
** Copyright (C) 2016 Intel Corporation.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtCore module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/
| 39 | |
#include "qurl.h"
#include "private/qutfcodec_p.h"
#include "private/qtools_p.h"
#include "private/qsimd_p.h"

#include <string.h>
| 44 | |
| 45 | QT_BEGIN_NAMESPACE |
| 46 | |
// ### move to qurl_p.h
// What recode() does with a character, as stored in the action tables below.
// The numeric values are significant: they are stored in uchar tables and
// AND-ed with the mask tables, so DecodeCharacter must stay 0.
enum EncodingAction {
    DecodeCharacter = 0,    // emit the character itself, even if it came in as %XX
    LeaveCharacter = 1,     // keep whichever form (literal or %XX) the input used
    EncodeCharacter = 2     // emit the character as %XX
};
| 53 | |
| 54 | // From RFC 3896, Appendix A Collected ABNF for URI |
| 55 | // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" |
| 56 | // reserved = gen-delims / sub-delims |
| 57 | // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" |
| 58 | // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" |
| 59 | // / "*" / "+" / "," / ";" / "=" |
| 60 | static const uchar defaultActionTable[96] = { |
| 61 | 2, // space |
| 62 | 1, // '!' (sub-delim) |
| 63 | 2, // '"' |
| 64 | 1, // '#' (gen-delim) |
| 65 | 1, // '$' (gen-delim) |
| 66 | 2, // '%' (percent) |
| 67 | 1, // '&' (gen-delim) |
| 68 | 1, // "'" (sub-delim) |
| 69 | 1, // '(' (sub-delim) |
| 70 | 1, // ')' (sub-delim) |
| 71 | 1, // '*' (sub-delim) |
| 72 | 1, // '+' (sub-delim) |
| 73 | 1, // ',' (sub-delim) |
| 74 | 0, // '-' (unreserved) |
| 75 | 0, // '.' (unreserved) |
| 76 | 1, // '/' (gen-delim) |
| 77 | |
| 78 | 0, 0, 0, 0, 0, // '0' to '4' (unreserved) |
| 79 | 0, 0, 0, 0, 0, // '5' to '9' (unreserved) |
| 80 | 1, // ':' (gen-delim) |
| 81 | 1, // ';' (sub-delim) |
| 82 | 2, // '<' |
| 83 | 1, // '=' (sub-delim) |
| 84 | 2, // '>' |
| 85 | 1, // '?' (gen-delim) |
| 86 | |
| 87 | 1, // '@' (gen-delim) |
| 88 | 0, 0, 0, 0, 0, // 'A' to 'E' (unreserved) |
| 89 | 0, 0, 0, 0, 0, // 'F' to 'J' (unreserved) |
| 90 | 0, 0, 0, 0, 0, // 'K' to 'O' (unreserved) |
| 91 | 0, 0, 0, 0, 0, // 'P' to 'T' (unreserved) |
| 92 | 0, 0, 0, 0, 0, 0, // 'U' to 'Z' (unreserved) |
| 93 | 1, // '[' (gen-delim) |
| 94 | 2, // '\' |
| 95 | 1, // ']' (gen-delim) |
| 96 | 2, // '^' |
| 97 | 0, // '_' (unreserved) |
| 98 | |
| 99 | 2, // '`' |
| 100 | 0, 0, 0, 0, 0, // 'a' to 'e' (unreserved) |
| 101 | 0, 0, 0, 0, 0, // 'f' to 'j' (unreserved) |
| 102 | 0, 0, 0, 0, 0, // 'k' to 'o' (unreserved) |
| 103 | 0, 0, 0, 0, 0, // 'p' to 't' (unreserved) |
| 104 | 0, 0, 0, 0, 0, 0, // 'u' to 'z' (unreserved) |
| 105 | 2, // '{' |
| 106 | 2, // '|' |
| 107 | 2, // '}' |
| 108 | 0, // '~' (unreserved) |
| 109 | |
| 110 | 2 // BSKP |
| 111 | }; |
| 112 | |
| 113 | // mask tables, in negative polarity |
| 114 | // 0x00 if it belongs to this category |
| 115 | // 0xff if it doesn't |
| 116 | |
| 117 | static const uchar reservedMask[96] = { |
| 118 | 0xff, // space |
| 119 | 0xff, // '!' (sub-delim) |
| 120 | 0x00, // '"' |
| 121 | 0xff, // '#' (gen-delim) |
| 122 | 0xff, // '$' (gen-delim) |
| 123 | 0xff, // '%' (percent) |
| 124 | 0xff, // '&' (gen-delim) |
| 125 | 0xff, // "'" (sub-delim) |
| 126 | 0xff, // '(' (sub-delim) |
| 127 | 0xff, // ')' (sub-delim) |
| 128 | 0xff, // '*' (sub-delim) |
| 129 | 0xff, // '+' (sub-delim) |
| 130 | 0xff, // ',' (sub-delim) |
| 131 | 0xff, // '-' (unreserved) |
| 132 | 0xff, // '.' (unreserved) |
| 133 | 0xff, // '/' (gen-delim) |
| 134 | |
| 135 | 0xff, 0xff, 0xff, 0xff, 0xff, // '0' to '4' (unreserved) |
| 136 | 0xff, 0xff, 0xff, 0xff, 0xff, // '5' to '9' (unreserved) |
| 137 | 0xff, // ':' (gen-delim) |
| 138 | 0xff, // ';' (sub-delim) |
| 139 | 0x00, // '<' |
| 140 | 0xff, // '=' (sub-delim) |
| 141 | 0x00, // '>' |
| 142 | 0xff, // '?' (gen-delim) |
| 143 | |
| 144 | 0xff, // '@' (gen-delim) |
| 145 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'A' to 'E' (unreserved) |
| 146 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'F' to 'J' (unreserved) |
| 147 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'K' to 'O' (unreserved) |
| 148 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'P' to 'T' (unreserved) |
| 149 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'U' to 'Z' (unreserved) |
| 150 | 0xff, // '[' (gen-delim) |
| 151 | 0x00, // '\' |
| 152 | 0xff, // ']' (gen-delim) |
| 153 | 0x00, // '^' |
| 154 | 0xff, // '_' (unreserved) |
| 155 | |
| 156 | 0x00, // '`' |
| 157 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'a' to 'e' (unreserved) |
| 158 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'f' to 'j' (unreserved) |
| 159 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'k' to 'o' (unreserved) |
| 160 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'p' to 't' (unreserved) |
| 161 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'u' to 'z' (unreserved) |
| 162 | 0x00, // '{' |
| 163 | 0x00, // '|' |
| 164 | 0x00, // '}' |
| 165 | 0xff, // '~' (unreserved) |
| 166 | |
| 167 | 0xff // BSKP |
| 168 | }; |
| 169 | |
// Returns true if \a c is an ASCII hexadecimal digit: 0-9, A-F or a-f.
static inline bool isHex(ushort c)
{
    return (c >= 'a' && c <= 'f') ||
            (c >= 'A' && c <= 'F') ||
            (c >= '0' && c <= '9');
}
| 176 | |
// Returns true if the hex digit \a c needs no upper-casing, that is, if it is
// a decimal digit or one of A-F.
// Precondition: \a c is a hex digit (see isHex()); the result is meaningless
// for any other character.
static inline bool isUpperHex(ushort c)
{
    // '0'-'9' and 'A'-'F' all sort below 0x60, 'a'-'f' all sort above it
    return c < 0x60;
}
| 182 | |
| 183 | static inline ushort toUpperHex(ushort c) |
| 184 | { |
| 185 | return isUpperHex(c) ? c : c - 0x20; |
| 186 | } |
| 187 | |
// Returns the numeric value (0 to 15) of the hex digit \a c.
// Precondition: \a c is a hex digit; no validation is done here.
static inline ushort decodeNibble(ushort c)
{
    return c >= 'a' ? c - 'a' + 0xA :
           c >= 'A' ? c - 'A' + 0xA : c - '0';
}
| 193 | |
| 194 | // if the sequence at input is 2*HEXDIG, returns its decoding |
| 195 | // returns -1 if it isn't. |
| 196 | // assumes that the range has been checked already |
| 197 | static inline ushort decodePercentEncoding(const ushort *input) |
| 198 | { |
| 199 | ushort c1 = input[1]; |
| 200 | ushort c2 = input[2]; |
| 201 | if (!isHex(c: c1) || !isHex(c: c2)) |
| 202 | return ushort(-1); |
| 203 | return decodeNibble(c: c1) << 4 | decodeNibble(c: c2); |
| 204 | } |
| 205 | |
| 206 | static inline ushort encodeNibble(ushort c) |
| 207 | { |
| 208 | return ushort(QtMiscUtils::toHexUpper(value: c)); |
| 209 | } |
| 210 | |
| 211 | static void ensureDetached(QString &result, ushort *&output, const ushort *begin, const ushort *input, const ushort *end, |
| 212 | int add = 0) |
| 213 | { |
| 214 | if (!output) { |
| 215 | // now detach |
| 216 | // create enough space if the rest of the string needed to be percent-encoded |
| 217 | int charsProcessed = input - begin; |
| 218 | int charsRemaining = end - input; |
| 219 | int spaceNeeded = end - begin + 2 * charsRemaining + add; |
| 220 | int origSize = result.size(); |
| 221 | result.resize(size: origSize + spaceNeeded); |
| 222 | |
| 223 | // we know that resize() above detached, so we bypass the reference count check |
| 224 | output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData())) |
| 225 | + origSize; |
| 226 | |
| 227 | // copy the chars we've already processed |
| 228 | int i; |
| 229 | for (i = 0; i < charsProcessed; ++i) |
| 230 | output[i] = begin[i]; |
| 231 | output += i; |
| 232 | } |
| 233 | } |
| 234 | |
| 235 | namespace { |
| 236 | struct QUrlUtf8Traits : public QUtf8BaseTraitsNoAscii |
| 237 | { |
| 238 | // From RFC 3987: |
| 239 | // iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar |
| 240 | // |
| 241 | // ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF |
| 242 | // / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD |
| 243 | // / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD |
| 244 | // / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD |
| 245 | // / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD |
| 246 | // / %xD0000-DFFFD / %xE1000-EFFFD |
| 247 | // |
| 248 | // iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD |
| 249 | // |
| 250 | // That RFC allows iprivate only as part of iquery, but we don't know here |
| 251 | // whether we're looking at a query or another part of an URI, so we accept |
| 252 | // them too. The definition above excludes U+FFF0 to U+FFFD from appearing |
| 253 | // unencoded, but we see no reason for its exclusion, so we allow them to |
| 254 | // be decoded (and we need U+FFFD the replacement character to indicate |
| 255 | // failure to decode). |
| 256 | // |
| 257 | // That means we must disallow: |
| 258 | // * unpaired surrogates (QUtf8Functions takes care of that for us) |
| 259 | // * non-characters |
| 260 | static const bool allowNonCharacters = false; |
| 261 | |
| 262 | // override: our "bytes" are three percent-encoded UTF-16 characters |
| 263 | static void appendByte(ushort *&ptr, uchar b) |
| 264 | { |
| 265 | // b >= 0x80, by construction, so percent-encode |
| 266 | *ptr++ = '%'; |
| 267 | *ptr++ = encodeNibble(c: b >> 4); |
| 268 | *ptr++ = encodeNibble(c: b & 0xf); |
| 269 | } |
| 270 | |
| 271 | static uchar peekByte(const ushort *ptr, int n = 0) |
| 272 | { |
| 273 | // decodePercentEncoding returns ushort(-1) if it can't decode, |
| 274 | // which means we return 0xff, which is not a valid continuation byte. |
| 275 | // If ptr[i * 3] is not '%', we'll multiply by zero and return 0, |
| 276 | // also not a valid continuation byte (if it's '%', we multiply by 1). |
| 277 | return uchar(decodePercentEncoding(input: ptr + n * 3)) |
| 278 | * uchar(ptr[n * 3] == '%'); |
| 279 | } |
| 280 | |
| 281 | static qptrdiff availableBytes(const ushort *ptr, const ushort *end) |
| 282 | { |
| 283 | return (end - ptr) / 3; |
| 284 | } |
| 285 | |
| 286 | static void advanceByte(const ushort *&ptr, int n = 1) |
| 287 | { |
| 288 | ptr += n * 3; |
| 289 | } |
| 290 | }; |
| 291 | } |
| 292 | |
| 293 | // returns true if we performed an UTF-8 decoding |
| 294 | static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input, |
| 295 | const ushort *end, ushort decoded) |
| 296 | { |
| 297 | uint ucs4, *dst = &ucs4; |
| 298 | const ushort *src = input + 3;// skip the %XX that yielded \a decoded |
| 299 | int charsNeeded = QUtf8Functions::fromUtf8<QUrlUtf8Traits>(b: decoded, dst, src, end); |
| 300 | if (charsNeeded < 0) |
| 301 | return false; |
| 302 | |
| 303 | if (!QChar::requiresSurrogates(ucs4)) { |
| 304 | // UTF-8 decoded and no surrogates are required |
| 305 | // detach if necessary |
| 306 | // possibilities are: 6 chars (%XX%XX) -> one char; 9 chars (%XX%XX%XX) -> one char |
| 307 | ensureDetached(result, output, begin, input, end, add: -3 * charsNeeded + 1); |
| 308 | *output++ = ucs4; |
| 309 | } else { |
| 310 | // UTF-8 decoded to something that requires a surrogate pair |
| 311 | // compressing from %XX%XX%XX%XX (12 chars) to two |
| 312 | ensureDetached(result, output, begin, input, end, add: -10); |
| 313 | *output++ = QChar::highSurrogate(ucs4); |
| 314 | *output++ = QChar::lowSurrogate(ucs4); |
| 315 | } |
| 316 | |
| 317 | input = src - 1; |
| 318 | return true; |
| 319 | } |
| 320 | |
| 321 | static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *begin, |
| 322 | const ushort *&input, const ushort *end, ushort decoded) |
| 323 | { |
| 324 | // calculate the utf8 length and ensure enough space is available |
| 325 | int utf8len = QChar::isHighSurrogate(ucs4: decoded) ? 4 : decoded >= 0x800 ? 3 : 2; |
| 326 | |
| 327 | // detach |
| 328 | if (!output) { |
| 329 | // we need 3 * utf8len for the encoded UTF-8 sequence |
| 330 | // but ensureDetached already adds 3 for the char we're processing |
| 331 | ensureDetached(result, output, begin, input, end, add: 3*utf8len - 3); |
| 332 | } else { |
| 333 | // verify that there's enough space or expand |
| 334 | int charsRemaining = end - input - 1; // not including this one |
| 335 | int pos = output - reinterpret_cast<const ushort *>(result.constData()); |
| 336 | int spaceRemaining = result.size() - pos; |
| 337 | if (spaceRemaining < 3*charsRemaining + 3*utf8len) { |
| 338 | // must resize |
| 339 | result.resize(size: result.size() + 3*utf8len); |
| 340 | |
| 341 | // we know that resize() above detached, so we bypass the reference count check |
| 342 | output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData())); |
| 343 | output += pos; |
| 344 | } |
| 345 | } |
| 346 | |
| 347 | ++input; |
| 348 | int res = QUtf8Functions::toUtf8<QUrlUtf8Traits>(u: decoded, dst&: output, src&: input, end); |
| 349 | --input; |
| 350 | if (res < 0) { |
| 351 | // bad surrogate pair sequence |
| 352 | // we will encode bad UTF-16 to UTF-8 |
| 353 | // but they don't get decoded back |
| 354 | |
| 355 | // first of three bytes |
| 356 | uchar c = 0xe0 | uchar(decoded >> 12); |
| 357 | *output++ = '%'; |
| 358 | *output++ = 'E'; |
| 359 | *output++ = encodeNibble(c: c & 0xf); |
| 360 | |
| 361 | // second byte |
| 362 | c = 0x80 | (uchar(decoded >> 6) & 0x3f); |
| 363 | *output++ = '%'; |
| 364 | *output++ = encodeNibble(c: c >> 4); |
| 365 | *output++ = encodeNibble(c: c & 0xf); |
| 366 | |
| 367 | // third byte |
| 368 | c = 0x80 | (decoded & 0x3f); |
| 369 | *output++ = '%'; |
| 370 | *output++ = encodeNibble(c: c >> 4); |
| 371 | *output++ = encodeNibble(c: c & 0xf); |
| 372 | } |
| 373 | } |
| 374 | |
| 375 | static int recode(QString &result, const ushort *begin, const ushort *end, QUrl::ComponentFormattingOptions encoding, |
| 376 | const uchar *actionTable, bool retryBadEncoding) |
| 377 | { |
| 378 | const int origSize = result.size(); |
| 379 | const ushort *input = begin; |
| 380 | ushort *output = nullptr; |
| 381 | |
| 382 | EncodingAction action = EncodeCharacter; |
| 383 | for ( ; input != end; ++input) { |
| 384 | ushort c; |
| 385 | // try a run where no change is necessary |
| 386 | for ( ; input != end; ++input) { |
| 387 | c = *input; |
| 388 | if (c < 0x20U) |
| 389 | action = EncodeCharacter; |
| 390 | if (c < 0x20U || c >= 0x80U) // also: (c - 0x20 < 0x60U) |
| 391 | goto non_trivial; |
| 392 | action = EncodingAction(actionTable[c - ' ']); |
| 393 | if (action == EncodeCharacter) |
| 394 | goto non_trivial; |
| 395 | if (output) |
| 396 | *output++ = c; |
| 397 | } |
| 398 | break; |
| 399 | |
| 400 | non_trivial: |
| 401 | uint decoded; |
| 402 | if (c == '%' && retryBadEncoding) { |
| 403 | // always write "%25" |
| 404 | ensureDetached(result, output, begin, input, end); |
| 405 | *output++ = '%'; |
| 406 | *output++ = '2'; |
| 407 | *output++ = '5'; |
| 408 | continue; |
| 409 | } else if (c == '%') { |
| 410 | // check if the input is valid |
| 411 | if (input + 2 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) { |
| 412 | // not valid, retry |
| 413 | result.resize(size: origSize); |
| 414 | return recode(result, begin, end, encoding, actionTable, retryBadEncoding: true); |
| 415 | } |
| 416 | |
| 417 | if (decoded >= 0x80) { |
| 418 | // decode the UTF-8 sequence |
| 419 | if (!(encoding & QUrl::EncodeUnicode) && |
| 420 | encodedUtf8ToUtf16(result, output, begin, input, end, decoded)) |
| 421 | continue; |
| 422 | |
| 423 | // decoding the encoded UTF-8 failed |
| 424 | action = LeaveCharacter; |
| 425 | } else if (decoded >= 0x20) { |
| 426 | action = EncodingAction(actionTable[decoded - ' ']); |
| 427 | } |
| 428 | } else { |
| 429 | decoded = c; |
| 430 | if (decoded >= 0x80 && encoding & QUrl::EncodeUnicode) { |
| 431 | // encode the UTF-8 sequence |
| 432 | unicodeToEncodedUtf8(result, output, begin, input, end, decoded); |
| 433 | continue; |
| 434 | } else if (decoded >= 0x80) { |
| 435 | if (output) |
| 436 | *output++ = c; |
| 437 | continue; |
| 438 | } |
| 439 | } |
| 440 | |
| 441 | // there are six possibilities: |
| 442 | // current \ action | DecodeCharacter | LeaveCharacter | EncodeCharacter |
| 443 | // decoded | 1:leave | 2:leave | 3:encode |
| 444 | // encoded | 4:decode | 5:leave | 6:leave |
| 445 | // cases 1 and 2 were handled before this section |
| 446 | |
| 447 | if (c == '%' && action != DecodeCharacter) { |
| 448 | // cases 5 and 6: it's encoded and we're leaving it as it is |
| 449 | // except we're pedantic and we'll uppercase the hex |
| 450 | if (output || !isUpperHex(c: input[1]) || !isUpperHex(c: input[2])) { |
| 451 | ensureDetached(result, output, begin, input, end); |
| 452 | *output++ = '%'; |
| 453 | *output++ = toUpperHex(c: *++input); |
| 454 | *output++ = toUpperHex(c: *++input); |
| 455 | } |
| 456 | } else if (c == '%' && action == DecodeCharacter) { |
| 457 | // case 4: we need to decode |
| 458 | ensureDetached(result, output, begin, input, end); |
| 459 | *output++ = decoded; |
| 460 | input += 2; |
| 461 | } else { |
| 462 | // must be case 3: we need to encode |
| 463 | ensureDetached(result, output, begin, input, end); |
| 464 | *output++ = '%'; |
| 465 | *output++ = encodeNibble(c: c >> 4); |
| 466 | *output++ = encodeNibble(c: c & 0xf); |
| 467 | } |
| 468 | } |
| 469 | |
| 470 | if (output) { |
| 471 | int len = output - reinterpret_cast<const ushort *>(result.constData()); |
| 472 | result.truncate(pos: len); |
| 473 | return len - origSize; |
| 474 | } |
| 475 | return 0; |
| 476 | } |
| 477 | |
| 478 | /* |
| 479 | * Returns true if the input it checked (if it checked anything) is not |
| 480 | * encoded. A return of false indicates there's a percent at \a input that |
| 481 | * needs to be decoded. |
| 482 | */ |
| 483 | #ifdef __SSE2__ |
| 484 | static bool simdCheckNonEncoded(ushort *&output, const ushort *&input, const ushort *end) |
| 485 | { |
| 486 | # ifdef __AVX2__ |
| 487 | const __m256i percents256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128('%')); |
| 488 | const __m128i percents = _mm256_castsi256_si128(percents256); |
| 489 | # else |
| 490 | const __m128i percents = _mm_set1_epi16(w: '%'); |
| 491 | # endif |
| 492 | |
| 493 | uint idx = 0; |
| 494 | quint32 mask = 0; |
| 495 | if (input + 16 <= end) { |
| 496 | qptrdiff offset = 0; |
| 497 | for ( ; input + offset + 16 <= end; offset += 16) { |
| 498 | # ifdef __AVX2__ |
| 499 | // do 32 bytes at a time using AVX2 |
| 500 | __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(input + offset)); |
| 501 | __m256i comparison = _mm256_cmpeq_epi16(data, percents256); |
| 502 | mask = _mm256_movemask_epi8(comparison); |
| 503 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + offset), data); |
| 504 | # else |
| 505 | // do 32 bytes at a time using unrolled SSE2 |
| 506 | __m128i data1 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset)); |
| 507 | __m128i data2 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset + 8)); |
| 508 | __m128i comparison1 = _mm_cmpeq_epi16(a: data1, b: percents); |
| 509 | __m128i comparison2 = _mm_cmpeq_epi16(a: data2, b: percents); |
| 510 | uint mask1 = _mm_movemask_epi8(a: comparison1); |
| 511 | uint mask2 = _mm_movemask_epi8(a: comparison2); |
| 512 | |
| 513 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset), b: data1); |
| 514 | if (!mask1) |
| 515 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset + 8), b: data2); |
| 516 | mask = mask1 | (mask2 << 16); |
| 517 | # endif |
| 518 | |
| 519 | if (mask) { |
| 520 | idx = qCountTrailingZeroBits(v: mask) / 2; |
| 521 | break; |
| 522 | } |
| 523 | } |
| 524 | |
| 525 | input += offset; |
| 526 | if (output) |
| 527 | output += offset; |
| 528 | } else if (input + 8 <= end) { |
| 529 | // do 16 bytes at a time |
| 530 | __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input)); |
| 531 | __m128i comparison = _mm_cmpeq_epi16(a: data, b: percents); |
| 532 | mask = _mm_movemask_epi8(a: comparison); |
| 533 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output), b: data); |
| 534 | idx = qCountTrailingZeroBits(v: quint16(mask)) / 2; |
| 535 | } else if (input + 4 <= end) { |
| 536 | // do 8 bytes only |
| 537 | __m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(input)); |
| 538 | __m128i comparison = _mm_cmpeq_epi16(a: data, b: percents); |
| 539 | mask = _mm_movemask_epi8(a: comparison) & 0xffu; |
| 540 | _mm_storel_epi64(p: reinterpret_cast<__m128i *>(output), a: data); |
| 541 | idx = qCountTrailingZeroBits(v: quint8(mask)) / 2; |
| 542 | } else { |
| 543 | // no percents found (because we didn't check) |
| 544 | return true; |
| 545 | } |
| 546 | |
| 547 | // advance to the next non-encoded |
| 548 | input += idx; |
| 549 | output += idx; |
| 550 | |
| 551 | return !mask; |
| 552 | } |
| 553 | #else |
| 554 | static bool simdCheckNonEncoded(...) |
| 555 | { |
| 556 | return true; |
| 557 | } |
| 558 | #endif |
| 559 | |
| 560 | /*! |
| 561 | \since 5.0 |
| 562 | \internal |
| 563 | |
| 564 | This function decodes a percent-encoded string located from \a begin to \a |
| 565 | end, by appending each character to \a appendTo. It returns the number of |
| 566 | characters appended. Each percent-encoded sequence is decoded as follows: |
| 567 | |
| 568 | \list |
| 569 | \li from %00 to %7F: the exact decoded value is appended; |
| 570 | \li from %80 to %FF: QChar::ReplacementCharacter is appended; |
| 571 | \li bad encoding: original input is copied to the output, undecoded. |
| 572 | \endlist |
| 573 | |
| 574 | Given the above, it's important for the input to already have all UTF-8 |
| 575 | percent sequences decoded by qt_urlRecode (that is, the input should not |
| 576 | have been processed with QUrl::EncodeUnicode). |
| 577 | |
| 578 | The input should also be a valid percent-encoded sequence (the output of |
| 579 | qt_urlRecode is always valid). |
| 580 | */ |
| 581 | static int decode(QString &appendTo, const ushort *begin, const ushort *end) |
| 582 | { |
| 583 | // fast check whether there's anything to be decoded in the first place |
| 584 | const ushort *input = QtPrivate::qustrchr(str: QStringView(begin, end), ch: '%'); |
| 585 | if (Q_LIKELY(input == end)) |
| 586 | return 0; // nothing to do, it was already decoded! |
| 587 | |
| 588 | // detach |
| 589 | const int origSize = appendTo.size(); |
| 590 | appendTo.resize(size: origSize + (end - begin)); |
| 591 | ushort *output = reinterpret_cast<ushort *>(appendTo.begin()) + origSize; |
| 592 | memcpy(dest: static_cast<void *>(output), src: static_cast<const void *>(begin), n: (input - begin) * sizeof(ushort)); |
| 593 | output += input - begin; |
| 594 | |
| 595 | while (input != end) { |
| 596 | // something was encoded |
| 597 | Q_ASSERT(*input == '%'); |
| 598 | |
| 599 | if (Q_UNLIKELY(end - input < 3 || !isHex(input[1]) || !isHex(input[2]))) { |
| 600 | // badly-encoded data |
| 601 | appendTo.resize(size: origSize + (end - begin)); |
| 602 | memcpy(dest: static_cast<void *>(appendTo.begin() + origSize), src: static_cast<const void *>(begin), n: (end - begin) * sizeof(ushort)); |
| 603 | return end - begin; |
| 604 | } |
| 605 | |
| 606 | ++input; |
| 607 | *output++ = decodeNibble(c: input[0]) << 4 | decodeNibble(c: input[1]); |
| 608 | if (output[-1] >= 0x80) |
| 609 | output[-1] = QChar::ReplacementCharacter; |
| 610 | input += 2; |
| 611 | |
| 612 | // search for the next percent, copying from input to output |
| 613 | if (simdCheckNonEncoded(output, input, end)) { |
| 614 | while (input != end) { |
| 615 | ushort uc = *input; |
| 616 | if (uc == '%') |
| 617 | break; |
| 618 | *output++ = uc; |
| 619 | ++input; |
| 620 | } |
| 621 | } |
| 622 | } |
| 623 | |
| 624 | int len = output - reinterpret_cast<ushort *>(appendTo.begin()); |
| 625 | appendTo.truncate(pos: len); |
| 626 | return len - origSize; |
| 627 | } |
| 628 | |
| 629 | template <size_t N> |
| 630 | static void maskTable(uchar (&table)[N], const uchar (&mask)[N]) |
| 631 | { |
| 632 | for (size_t i = 0; i < N; ++i) |
| 633 | table[i] &= mask[i]; |
| 634 | } |
| 635 | |
| 636 | /*! |
| 637 | \internal |
| 638 | |
| 639 | Recodes the string from \a begin to \a end. If any transformations are |
| 640 | done, append them to \a appendTo and return the number of characters added. |
| 641 | If no transformations were required, return 0. |
| 642 | |
| 643 | The \a encoding option modifies the default behaviour: |
| 644 | \list |
| 645 | \li QUrl::DecodeReserved: if set, reserved characters will be decoded; |
| 646 | if unset, reserved characters will be encoded |
| 647 | \li QUrl::EncodeSpaces: if set, spaces will be encoded to "%20"; if unset, they will be " " |
| 648 | \li QUrl::EncodeUnicode: if set, characters above U+0080 will be encoded to their UTF-8 |
| 649 | percent-encoded form; if unset, they will be decoded to UTF-16 |
| 650 | \li QUrl::FullyDecoded: if set, this function will decode all percent-encoded sequences, |
| 651 | including that of the percent character. The resulting string |
| 652 | will not be percent-encoded anymore. Use with caution! |
| 653 | In this mode, the behaviour is undefined if the input string |
| 654 | contains any percent-encoding sequences above %80. |
| 655 | Also, the function will not correct bad % sequences. |
| 656 | \endlist |
| 657 | |
| 658 | Other flags are ignored (including QUrl::EncodeReserved). |
| 659 | |
| 660 | The \a tableModifications argument can be used to supply extra |
| 661 | modifications to the tables, to be applied after the flags above are |
| 662 | handled. It consists of a sequence of 16-bit values, where the low 8 bits |
| 663 | indicate the character in question and the high 8 bits are either \c |
| 664 | EncodeCharacter, \c LeaveCharacter or \c DecodeCharacter. |
| 665 | |
| 666 | This function corrects percent-encoded errors by interpreting every '%' as |
| 667 | meaning "%25" (all percents in the same content). |
| 668 | */ |
| 669 | |
| 670 | Q_AUTOTEST_EXPORT int |
| 671 | qt_urlRecode(QString &appendTo, const QChar *begin, const QChar *end, |
| 672 | QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications) |
| 673 | { |
| 674 | uchar actionTable[sizeof defaultActionTable]; |
| 675 | if ((encoding & QUrl::FullyDecoded) == QUrl::FullyDecoded) { |
| 676 | return decode(appendTo, begin: reinterpret_cast<const ushort *>(begin), end: reinterpret_cast<const ushort *>(end)); |
| 677 | } |
| 678 | |
| 679 | memcpy(dest: actionTable, src: defaultActionTable, n: sizeof actionTable); |
| 680 | if (encoding & QUrl::DecodeReserved) |
| 681 | maskTable(table&: actionTable, mask: reservedMask); |
| 682 | if (!(encoding & QUrl::EncodeSpaces)) |
| 683 | actionTable[0] = DecodeCharacter; // decode |
| 684 | |
| 685 | if (tableModifications) { |
| 686 | for (const ushort *p = tableModifications; *p; ++p) |
| 687 | actionTable[uchar(*p) - ' '] = *p >> 8; |
| 688 | } |
| 689 | |
| 690 | return recode(result&: appendTo, begin: reinterpret_cast<const ushort *>(begin), end: reinterpret_cast<const ushort *>(end), |
| 691 | encoding, actionTable, retryBadEncoding: false); |
| 692 | } |
| 693 | |
// qstring.cpp
// Declared here because this file calls it directly. From its use in this
// file: returns true if [ptr, end) contains no byte >= 0x80; otherwise \a ptr
// is left at a position from which the caller resumes looking for such bytes.
bool qt_is_ascii(const char *&ptr, const char *end) noexcept;
| 696 | |
| 697 | /*! |
| 698 | \internal |
| 699 | \since 5.0 |
| 700 | |
| 701 | \a ba contains an 8-bit form of the component and it might be |
| 702 | percent-encoded already. We can't use QString::fromUtf8 because it might |
| 703 | contain non-UTF8 sequences. We can't use QByteArray::toPercentEncoding |
| 704 | because it might already contain percent-encoded sequences. We can't use |
| 705 | qt_urlRecode because it needs UTF-16 input. |
| 706 | */ |
| 707 | Q_AUTOTEST_EXPORT |
| 708 | QString qt_urlRecodeByteArray(const QByteArray &ba) |
| 709 | { |
| 710 | if (ba.isNull()) |
| 711 | return QString(); |
| 712 | |
| 713 | // scan ba for anything above or equal to 0x80 |
| 714 | // control points below 0x20 are fine in QString |
| 715 | const char *in = ba.constData(); |
| 716 | const char *const end = ba.constEnd(); |
| 717 | if (qt_is_ascii(ptr&: in, end)) { |
| 718 | // no non-ASCII found, we're safe to convert to QString |
| 719 | return QString::fromLatin1(str: ba, size: ba.size()); |
| 720 | } |
| 721 | |
| 722 | // we found something that we need to encode |
| 723 | QByteArray intermediate = ba; |
| 724 | intermediate.resize(size: ba.size() * 3 - (in - ba.constData())); |
| 725 | uchar *out = reinterpret_cast<uchar *>(intermediate.data() + (in - ba.constData())); |
| 726 | for ( ; in < end; ++in) { |
| 727 | if (*in & 0x80) { |
| 728 | // encode |
| 729 | *out++ = '%'; |
| 730 | *out++ = encodeNibble(c: uchar(*in) >> 4); |
| 731 | *out++ = encodeNibble(c: uchar(*in) & 0xf); |
| 732 | } else { |
| 733 | // keep |
| 734 | *out++ = uchar(*in); |
| 735 | } |
| 736 | } |
| 737 | |
| 738 | // now it's safe to call fromLatin1 |
| 739 | return QString::fromLatin1(str: intermediate, size: out - reinterpret_cast<uchar *>(intermediate.data())); |
| 740 | } |
| 741 | |
| 742 | QT_END_NAMESPACE |
| 743 | |