1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2016 Intel Corporation. |
4 | ** Contact: https://www.qt.io/licensing/ |
5 | ** |
6 | ** This file is part of the QtCore module of the Qt Toolkit. |
7 | ** |
8 | ** $QT_BEGIN_LICENSE:LGPL$ |
9 | ** Commercial License Usage |
10 | ** Licensees holding valid commercial Qt licenses may use this file in |
11 | ** accordance with the commercial license agreement provided with the |
12 | ** Software or, alternatively, in accordance with the terms contained in |
13 | ** a written agreement between you and The Qt Company. For licensing terms |
14 | ** and conditions see https://www.qt.io/terms-conditions. For further |
15 | ** information use the contact form at https://www.qt.io/contact-us. |
16 | ** |
17 | ** GNU Lesser General Public License Usage |
18 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
19 | ** General Public License version 3 as published by the Free Software |
20 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
21 | ** packaging of this file. Please review the following information to |
22 | ** ensure the GNU Lesser General Public License version 3 requirements |
23 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
24 | ** |
25 | ** GNU General Public License Usage |
26 | ** Alternatively, this file may be used under the terms of the GNU |
27 | ** General Public License version 2.0 or (at your option) the GNU General |
28 | ** Public license version 3 or any later version approved by the KDE Free |
29 | ** Qt Foundation. The licenses are as published by the Free Software |
30 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
31 | ** included in the packaging of this file. Please review the following |
32 | ** information to ensure the GNU General Public License requirements will |
33 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
34 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
35 | ** |
36 | ** $QT_END_LICENSE$ |
37 | ** |
38 | ****************************************************************************/ |
39 | |
40 | #include "qurl.h" |
41 | #include "private/qutfcodec_p.h" |
42 | #include "private/qtools_p.h" |
43 | #include "private/qsimd_p.h" |
44 | |
45 | QT_BEGIN_NAMESPACE |
46 | |
47 | // ### move to qurl_p.h |
// What the recoder should do with a given character.
// The numeric values matter: they are stored as uchar entries in the action
// tables, and DecodeCharacter has to be 0 because masking a table entry with
// 0x00 (see reservedMask / maskTable) must produce it.
enum EncodingAction {
    DecodeCharacter = 0,    // emit the character itself, even if it came in as %XX
    LeaveCharacter  = 1,    // keep whichever form (literal or %XX) the input used
    EncodeCharacter = 2     // emit the character as %XX
};
53 | |
// From RFC 3986, Appendix A Collected ABNF for URI
55 | // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" |
56 | // reserved = gen-delims / sub-delims |
57 | // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" |
58 | // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" |
59 | // / "*" / "+" / "," / ";" / "=" |
// Default action for every character from U+0020 to U+007F.
// The table is indexed by (c - ' ') and each entry is an EncodingAction value:
//   0 = DecodeCharacter, 1 = LeaveCharacter, 2 = EncodeCharacter
// qt_urlRecode() copies this table and then adjusts the copy according to the
// formatting options before handing it to recode().
static const uchar defaultActionTable[96] = {
    2, // space
    1, // '!' (sub-delim)
    2, // '"'
    1, // '#' (gen-delim)
    1, // '$' (sub-delim)
    2, // '%' (percent)
    1, // '&' (sub-delim)
    1, // "'" (sub-delim)
    1, // '(' (sub-delim)
    1, // ')' (sub-delim)
    1, // '*' (sub-delim)
    1, // '+' (sub-delim)
    1, // ',' (sub-delim)
    0, // '-' (unreserved)
    0, // '.' (unreserved)
    1, // '/' (gen-delim)

    0, 0, 0, 0, 0, // '0' to '4' (unreserved)
    0, 0, 0, 0, 0, // '5' to '9' (unreserved)
    1, // ':' (gen-delim)
    1, // ';' (sub-delim)
    2, // '<'
    1, // '=' (sub-delim)
    2, // '>'
    1, // '?' (gen-delim)

    1, // '@' (gen-delim)
    0, 0, 0, 0, 0, // 'A' to 'E' (unreserved)
    0, 0, 0, 0, 0, // 'F' to 'J' (unreserved)
    0, 0, 0, 0, 0, // 'K' to 'O' (unreserved)
    0, 0, 0, 0, 0, // 'P' to 'T' (unreserved)
    0, 0, 0, 0, 0, 0, // 'U' to 'Z' (unreserved)
    1, // '[' (gen-delim)
    2, // '\'
    1, // ']' (gen-delim)
    2, // '^'
    0, // '_' (unreserved)

    2, // '`'
    0, 0, 0, 0, 0, // 'a' to 'e' (unreserved)
    0, 0, 0, 0, 0, // 'f' to 'j' (unreserved)
    0, 0, 0, 0, 0, // 'k' to 'o' (unreserved)
    0, 0, 0, 0, 0, // 'p' to 't' (unreserved)
    0, 0, 0, 0, 0, 0, // 'u' to 'z' (unreserved)
    2, // '{'
    2, // '|'
    2, // '}'
    0, // '~' (unreserved)

    2 // DEL (0x7f)
};
112 | |
113 | // mask tables, in negative polarity |
114 | // 0x00 if it belongs to this category |
115 | // 0xff if it doesn't |
116 | |
// Mask applied to the action table (entry by entry, with bitwise AND) when
// QUrl::DecodeReserved is requested. Indexed by (c - ' ') like the action
// table. A 0x00 entry forces the action to 0 (DecodeCharacter); a 0xff entry
// leaves the existing action unchanged. The 0x00 entries are exactly the
// printable characters that the ABNF lists as neither reserved nor unreserved.
static const uchar reservedMask[96] = {
    0xff, // space
    0xff, // '!' (sub-delim)
    0x00, // '"'
    0xff, // '#' (gen-delim)
    0xff, // '$' (sub-delim)
    0xff, // '%' (percent)
    0xff, // '&' (sub-delim)
    0xff, // "'" (sub-delim)
    0xff, // '(' (sub-delim)
    0xff, // ')' (sub-delim)
    0xff, // '*' (sub-delim)
    0xff, // '+' (sub-delim)
    0xff, // ',' (sub-delim)
    0xff, // '-' (unreserved)
    0xff, // '.' (unreserved)
    0xff, // '/' (gen-delim)

    0xff, 0xff, 0xff, 0xff, 0xff, // '0' to '4' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, // '5' to '9' (unreserved)
    0xff, // ':' (gen-delim)
    0xff, // ';' (sub-delim)
    0x00, // '<'
    0xff, // '=' (sub-delim)
    0x00, // '>'
    0xff, // '?' (gen-delim)

    0xff, // '@' (gen-delim)
    0xff, 0xff, 0xff, 0xff, 0xff, // 'A' to 'E' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, // 'F' to 'J' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, // 'K' to 'O' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, // 'P' to 'T' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'U' to 'Z' (unreserved)
    0xff, // '[' (gen-delim)
    0x00, // '\'
    0xff, // ']' (gen-delim)
    0x00, // '^'
    0xff, // '_' (unreserved)

    0x00, // '`'
    0xff, 0xff, 0xff, 0xff, 0xff, // 'a' to 'e' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, // 'f' to 'j' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, // 'k' to 'o' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, // 'p' to 't' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'u' to 'z' (unreserved)
    0x00, // '{'
    0x00, // '|'
    0x00, // '}'
    0xff, // '~' (unreserved)

    0xff // DEL (0x7f)
};
169 | |
// Returns true if the UTF-16 code unit c is an ASCII hexadecimal digit
// ('0'-'9', 'A'-'F' or 'a'-'f').
static inline bool isHex(ushort c)
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'A' && c <= 'F')
        return true;
    return c >= 'a' && c <= 'f';
}
176 | |
// Returns true if the hex digit c needs no upper-casing, i.e. it is a decimal
// digit or one of 'A'-'F'. The result is meaningless unless c is a hex digit:
// the test only checks that c lies below 0x60, the row where the lowercase
// letters start.
static inline bool isUpperHex(ushort c)
{
    const ushort lowercaseRowStart = 0x60;
    return c < lowercaseRowStart;
}
182 | |
183 | static inline ushort toUpperHex(ushort c) |
184 | { |
185 | return isUpperHex(c) ? c : c - 0x20; |
186 | } |
187 | |
// Converts the hex digit c to its numeric value (0-15).
// The caller must have checked that c is a hex digit; the ranges are tested
// from the highest ('a'-'f') down to the lowest ('0'-'9').
static inline ushort decodeNibble(ushort c)
{
    if (c >= 'a')
        return c - 'a' + 0xA;
    if (c >= 'A')
        return c - 'A' + 0xA;
    return c - '0';
}
193 | |
194 | // if the sequence at input is 2*HEXDIG, returns its decoding |
195 | // returns -1 if it isn't. |
196 | // assumes that the range has been checked already |
197 | static inline ushort decodePercentEncoding(const ushort *input) |
198 | { |
199 | ushort c1 = input[1]; |
200 | ushort c2 = input[2]; |
201 | if (!isHex(c: c1) || !isHex(c: c2)) |
202 | return ushort(-1); |
203 | return decodeNibble(c: c1) << 4 | decodeNibble(c: c2); |
204 | } |
205 | |
206 | static inline ushort encodeNibble(ushort c) |
207 | { |
208 | return ushort(QtMiscUtils::toHexUpper(value: c)); |
209 | } |
210 | |
211 | static void ensureDetached(QString &result, ushort *&output, const ushort *begin, const ushort *input, const ushort *end, |
212 | int add = 0) |
213 | { |
214 | if (!output) { |
215 | // now detach |
216 | // create enough space if the rest of the string needed to be percent-encoded |
217 | int charsProcessed = input - begin; |
218 | int charsRemaining = end - input; |
219 | int spaceNeeded = end - begin + 2 * charsRemaining + add; |
220 | int origSize = result.size(); |
221 | result.resize(size: origSize + spaceNeeded); |
222 | |
223 | // we know that resize() above detached, so we bypass the reference count check |
224 | output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData())) |
225 | + origSize; |
226 | |
227 | // copy the chars we've already processed |
228 | int i; |
229 | for (i = 0; i < charsProcessed; ++i) |
230 | output[i] = begin[i]; |
231 | output += i; |
232 | } |
233 | } |
234 | |
235 | namespace { |
236 | struct QUrlUtf8Traits : public QUtf8BaseTraitsNoAscii |
237 | { |
238 | // From RFC 3987: |
239 | // iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar |
240 | // |
241 | // ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF |
242 | // / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD |
243 | // / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD |
244 | // / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD |
245 | // / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD |
246 | // / %xD0000-DFFFD / %xE1000-EFFFD |
247 | // |
248 | // iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD |
249 | // |
250 | // That RFC allows iprivate only as part of iquery, but we don't know here |
251 | // whether we're looking at a query or another part of an URI, so we accept |
252 | // them too. The definition above excludes U+FFF0 to U+FFFD from appearing |
253 | // unencoded, but we see no reason for its exclusion, so we allow them to |
254 | // be decoded (and we need U+FFFD the replacement character to indicate |
255 | // failure to decode). |
256 | // |
257 | // That means we must disallow: |
258 | // * unpaired surrogates (QUtf8Functions takes care of that for us) |
259 | // * non-characters |
260 | static const bool allowNonCharacters = false; |
261 | |
262 | // override: our "bytes" are three percent-encoded UTF-16 characters |
263 | static void appendByte(ushort *&ptr, uchar b) |
264 | { |
265 | // b >= 0x80, by construction, so percent-encode |
266 | *ptr++ = '%'; |
267 | *ptr++ = encodeNibble(c: b >> 4); |
268 | *ptr++ = encodeNibble(c: b & 0xf); |
269 | } |
270 | |
271 | static uchar peekByte(const ushort *ptr, int n = 0) |
272 | { |
273 | // decodePercentEncoding returns ushort(-1) if it can't decode, |
274 | // which means we return 0xff, which is not a valid continuation byte. |
275 | // If ptr[i * 3] is not '%', we'll multiply by zero and return 0, |
276 | // also not a valid continuation byte (if it's '%', we multiply by 1). |
277 | return uchar(decodePercentEncoding(input: ptr + n * 3)) |
278 | * uchar(ptr[n * 3] == '%'); |
279 | } |
280 | |
281 | static qptrdiff availableBytes(const ushort *ptr, const ushort *end) |
282 | { |
283 | return (end - ptr) / 3; |
284 | } |
285 | |
286 | static void advanceByte(const ushort *&ptr, int n = 1) |
287 | { |
288 | ptr += n * 3; |
289 | } |
290 | }; |
291 | } |
292 | |
293 | // returns true if we performed an UTF-8 decoding |
294 | static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input, |
295 | const ushort *end, ushort decoded) |
296 | { |
297 | uint ucs4, *dst = &ucs4; |
298 | const ushort *src = input + 3;// skip the %XX that yielded \a decoded |
299 | int charsNeeded = QUtf8Functions::fromUtf8<QUrlUtf8Traits>(b: decoded, dst, src, end); |
300 | if (charsNeeded < 0) |
301 | return false; |
302 | |
303 | if (!QChar::requiresSurrogates(ucs4)) { |
304 | // UTF-8 decoded and no surrogates are required |
305 | // detach if necessary |
306 | // possibilities are: 6 chars (%XX%XX) -> one char; 9 chars (%XX%XX%XX) -> one char |
307 | ensureDetached(result, output, begin, input, end, add: -3 * charsNeeded + 1); |
308 | *output++ = ucs4; |
309 | } else { |
310 | // UTF-8 decoded to something that requires a surrogate pair |
311 | // compressing from %XX%XX%XX%XX (12 chars) to two |
312 | ensureDetached(result, output, begin, input, end, add: -10); |
313 | *output++ = QChar::highSurrogate(ucs4); |
314 | *output++ = QChar::lowSurrogate(ucs4); |
315 | } |
316 | |
317 | input = src - 1; |
318 | return true; |
319 | } |
320 | |
321 | static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *begin, |
322 | const ushort *&input, const ushort *end, ushort decoded) |
323 | { |
324 | // calculate the utf8 length and ensure enough space is available |
325 | int utf8len = QChar::isHighSurrogate(ucs4: decoded) ? 4 : decoded >= 0x800 ? 3 : 2; |
326 | |
327 | // detach |
328 | if (!output) { |
329 | // we need 3 * utf8len for the encoded UTF-8 sequence |
330 | // but ensureDetached already adds 3 for the char we're processing |
331 | ensureDetached(result, output, begin, input, end, add: 3*utf8len - 3); |
332 | } else { |
333 | // verify that there's enough space or expand |
334 | int charsRemaining = end - input - 1; // not including this one |
335 | int pos = output - reinterpret_cast<const ushort *>(result.constData()); |
336 | int spaceRemaining = result.size() - pos; |
337 | if (spaceRemaining < 3*charsRemaining + 3*utf8len) { |
338 | // must resize |
339 | result.resize(size: result.size() + 3*utf8len); |
340 | |
341 | // we know that resize() above detached, so we bypass the reference count check |
342 | output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData())); |
343 | output += pos; |
344 | } |
345 | } |
346 | |
347 | ++input; |
348 | int res = QUtf8Functions::toUtf8<QUrlUtf8Traits>(u: decoded, dst&: output, src&: input, end); |
349 | --input; |
350 | if (res < 0) { |
351 | // bad surrogate pair sequence |
352 | // we will encode bad UTF-16 to UTF-8 |
353 | // but they don't get decoded back |
354 | |
355 | // first of three bytes |
356 | uchar c = 0xe0 | uchar(decoded >> 12); |
357 | *output++ = '%'; |
358 | *output++ = 'E'; |
359 | *output++ = encodeNibble(c: c & 0xf); |
360 | |
361 | // second byte |
362 | c = 0x80 | (uchar(decoded >> 6) & 0x3f); |
363 | *output++ = '%'; |
364 | *output++ = encodeNibble(c: c >> 4); |
365 | *output++ = encodeNibble(c: c & 0xf); |
366 | |
367 | // third byte |
368 | c = 0x80 | (decoded & 0x3f); |
369 | *output++ = '%'; |
370 | *output++ = encodeNibble(c: c >> 4); |
371 | *output++ = encodeNibble(c: c & 0xf); |
372 | } |
373 | } |
374 | |
375 | static int recode(QString &result, const ushort *begin, const ushort *end, QUrl::ComponentFormattingOptions encoding, |
376 | const uchar *actionTable, bool retryBadEncoding) |
377 | { |
378 | const int origSize = result.size(); |
379 | const ushort *input = begin; |
380 | ushort *output = nullptr; |
381 | |
382 | EncodingAction action = EncodeCharacter; |
383 | for ( ; input != end; ++input) { |
384 | ushort c; |
385 | // try a run where no change is necessary |
386 | for ( ; input != end; ++input) { |
387 | c = *input; |
388 | if (c < 0x20U) |
389 | action = EncodeCharacter; |
390 | if (c < 0x20U || c >= 0x80U) // also: (c - 0x20 < 0x60U) |
391 | goto non_trivial; |
392 | action = EncodingAction(actionTable[c - ' ']); |
393 | if (action == EncodeCharacter) |
394 | goto non_trivial; |
395 | if (output) |
396 | *output++ = c; |
397 | } |
398 | break; |
399 | |
400 | non_trivial: |
401 | uint decoded; |
402 | if (c == '%' && retryBadEncoding) { |
403 | // always write "%25" |
404 | ensureDetached(result, output, begin, input, end); |
405 | *output++ = '%'; |
406 | *output++ = '2'; |
407 | *output++ = '5'; |
408 | continue; |
409 | } else if (c == '%') { |
410 | // check if the input is valid |
411 | if (input + 2 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) { |
412 | // not valid, retry |
413 | result.resize(size: origSize); |
414 | return recode(result, begin, end, encoding, actionTable, retryBadEncoding: true); |
415 | } |
416 | |
417 | if (decoded >= 0x80) { |
418 | // decode the UTF-8 sequence |
419 | if (!(encoding & QUrl::EncodeUnicode) && |
420 | encodedUtf8ToUtf16(result, output, begin, input, end, decoded)) |
421 | continue; |
422 | |
423 | // decoding the encoded UTF-8 failed |
424 | action = LeaveCharacter; |
425 | } else if (decoded >= 0x20) { |
426 | action = EncodingAction(actionTable[decoded - ' ']); |
427 | } |
428 | } else { |
429 | decoded = c; |
430 | if (decoded >= 0x80 && encoding & QUrl::EncodeUnicode) { |
431 | // encode the UTF-8 sequence |
432 | unicodeToEncodedUtf8(result, output, begin, input, end, decoded); |
433 | continue; |
434 | } else if (decoded >= 0x80) { |
435 | if (output) |
436 | *output++ = c; |
437 | continue; |
438 | } |
439 | } |
440 | |
441 | // there are six possibilities: |
442 | // current \ action | DecodeCharacter | LeaveCharacter | EncodeCharacter |
443 | // decoded | 1:leave | 2:leave | 3:encode |
444 | // encoded | 4:decode | 5:leave | 6:leave |
445 | // cases 1 and 2 were handled before this section |
446 | |
447 | if (c == '%' && action != DecodeCharacter) { |
448 | // cases 5 and 6: it's encoded and we're leaving it as it is |
449 | // except we're pedantic and we'll uppercase the hex |
450 | if (output || !isUpperHex(c: input[1]) || !isUpperHex(c: input[2])) { |
451 | ensureDetached(result, output, begin, input, end); |
452 | *output++ = '%'; |
453 | *output++ = toUpperHex(c: *++input); |
454 | *output++ = toUpperHex(c: *++input); |
455 | } |
456 | } else if (c == '%' && action == DecodeCharacter) { |
457 | // case 4: we need to decode |
458 | ensureDetached(result, output, begin, input, end); |
459 | *output++ = decoded; |
460 | input += 2; |
461 | } else { |
462 | // must be case 3: we need to encode |
463 | ensureDetached(result, output, begin, input, end); |
464 | *output++ = '%'; |
465 | *output++ = encodeNibble(c: c >> 4); |
466 | *output++ = encodeNibble(c: c & 0xf); |
467 | } |
468 | } |
469 | |
470 | if (output) { |
471 | int len = output - reinterpret_cast<const ushort *>(result.constData()); |
472 | result.truncate(pos: len); |
473 | return len - origSize; |
474 | } |
475 | return 0; |
476 | } |
477 | |
478 | /* |
479 | * Returns true if the input it checked (if it checked anything) is not |
480 | * encoded. A return of false indicates there's a percent at \a input that |
481 | * needs to be decoded. |
482 | */ |
483 | #ifdef __SSE2__ |
484 | static bool simdCheckNonEncoded(ushort *&output, const ushort *&input, const ushort *end) |
485 | { |
486 | # ifdef __AVX2__ |
487 | const __m256i percents256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128('%')); |
488 | const __m128i percents = _mm256_castsi256_si128(percents256); |
489 | # else |
490 | const __m128i percents = _mm_set1_epi16(w: '%'); |
491 | # endif |
492 | |
493 | uint idx = 0; |
494 | quint32 mask = 0; |
495 | if (input + 16 <= end) { |
496 | qptrdiff offset = 0; |
497 | for ( ; input + offset + 16 <= end; offset += 16) { |
498 | # ifdef __AVX2__ |
499 | // do 32 bytes at a time using AVX2 |
500 | __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(input + offset)); |
501 | __m256i comparison = _mm256_cmpeq_epi16(data, percents256); |
502 | mask = _mm256_movemask_epi8(comparison); |
503 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + offset), data); |
504 | # else |
505 | // do 32 bytes at a time using unrolled SSE2 |
506 | __m128i data1 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset)); |
507 | __m128i data2 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset + 8)); |
508 | __m128i comparison1 = _mm_cmpeq_epi16(a: data1, b: percents); |
509 | __m128i comparison2 = _mm_cmpeq_epi16(a: data2, b: percents); |
510 | uint mask1 = _mm_movemask_epi8(a: comparison1); |
511 | uint mask2 = _mm_movemask_epi8(a: comparison2); |
512 | |
513 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset), b: data1); |
514 | if (!mask1) |
515 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset + 8), b: data2); |
516 | mask = mask1 | (mask2 << 16); |
517 | # endif |
518 | |
519 | if (mask) { |
520 | idx = qCountTrailingZeroBits(v: mask) / 2; |
521 | break; |
522 | } |
523 | } |
524 | |
525 | input += offset; |
526 | if (output) |
527 | output += offset; |
528 | } else if (input + 8 <= end) { |
529 | // do 16 bytes at a time |
530 | __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input)); |
531 | __m128i comparison = _mm_cmpeq_epi16(a: data, b: percents); |
532 | mask = _mm_movemask_epi8(a: comparison); |
533 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output), b: data); |
534 | idx = qCountTrailingZeroBits(v: quint16(mask)) / 2; |
535 | } else if (input + 4 <= end) { |
536 | // do 8 bytes only |
537 | __m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(input)); |
538 | __m128i comparison = _mm_cmpeq_epi16(a: data, b: percents); |
539 | mask = _mm_movemask_epi8(a: comparison) & 0xffu; |
540 | _mm_storel_epi64(p: reinterpret_cast<__m128i *>(output), a: data); |
541 | idx = qCountTrailingZeroBits(v: quint8(mask)) / 2; |
542 | } else { |
543 | // no percents found (because we didn't check) |
544 | return true; |
545 | } |
546 | |
547 | // advance to the next non-encoded |
548 | input += idx; |
549 | output += idx; |
550 | |
551 | return !mask; |
552 | } |
553 | #else |
554 | static bool simdCheckNonEncoded(...) |
555 | { |
556 | return true; |
557 | } |
558 | #endif |
559 | |
560 | /*! |
561 | \since 5.0 |
562 | \internal |
563 | |
564 | This function decodes a percent-encoded string located from \a begin to \a |
565 | end, by appending each character to \a appendTo. It returns the number of |
566 | characters appended. Each percent-encoded sequence is decoded as follows: |
567 | |
568 | \list |
569 | \li from %00 to %7F: the exact decoded value is appended; |
570 | \li from %80 to %FF: QChar::ReplacementCharacter is appended; |
571 | \li bad encoding: original input is copied to the output, undecoded. |
572 | \endlist |
573 | |
574 | Given the above, it's important for the input to already have all UTF-8 |
575 | percent sequences decoded by qt_urlRecode (that is, the input should not |
576 | have been processed with QUrl::EncodeUnicode). |
577 | |
578 | The input should also be a valid percent-encoded sequence (the output of |
579 | qt_urlRecode is always valid). |
580 | */ |
581 | static int decode(QString &appendTo, const ushort *begin, const ushort *end) |
582 | { |
583 | // fast check whether there's anything to be decoded in the first place |
584 | const ushort *input = QtPrivate::qustrchr(str: QStringView(begin, end), ch: '%'); |
585 | if (Q_LIKELY(input == end)) |
586 | return 0; // nothing to do, it was already decoded! |
587 | |
588 | // detach |
589 | const int origSize = appendTo.size(); |
590 | appendTo.resize(size: origSize + (end - begin)); |
591 | ushort *output = reinterpret_cast<ushort *>(appendTo.begin()) + origSize; |
592 | memcpy(dest: static_cast<void *>(output), src: static_cast<const void *>(begin), n: (input - begin) * sizeof(ushort)); |
593 | output += input - begin; |
594 | |
595 | while (input != end) { |
596 | // something was encoded |
597 | Q_ASSERT(*input == '%'); |
598 | |
599 | if (Q_UNLIKELY(end - input < 3 || !isHex(input[1]) || !isHex(input[2]))) { |
600 | // badly-encoded data |
601 | appendTo.resize(size: origSize + (end - begin)); |
602 | memcpy(dest: static_cast<void *>(appendTo.begin() + origSize), src: static_cast<const void *>(begin), n: (end - begin) * sizeof(ushort)); |
603 | return end - begin; |
604 | } |
605 | |
606 | ++input; |
607 | *output++ = decodeNibble(c: input[0]) << 4 | decodeNibble(c: input[1]); |
608 | if (output[-1] >= 0x80) |
609 | output[-1] = QChar::ReplacementCharacter; |
610 | input += 2; |
611 | |
612 | // search for the next percent, copying from input to output |
613 | if (simdCheckNonEncoded(output, input, end)) { |
614 | while (input != end) { |
615 | ushort uc = *input; |
616 | if (uc == '%') |
617 | break; |
618 | *output++ = uc; |
619 | ++input; |
620 | } |
621 | } |
622 | } |
623 | |
624 | int len = output - reinterpret_cast<ushort *>(appendTo.begin()); |
625 | appendTo.truncate(pos: len); |
626 | return len - origSize; |
627 | } |
628 | |
629 | template <size_t N> |
630 | static void maskTable(uchar (&table)[N], const uchar (&mask)[N]) |
631 | { |
632 | for (size_t i = 0; i < N; ++i) |
633 | table[i] &= mask[i]; |
634 | } |
635 | |
636 | /*! |
637 | \internal |
638 | |
639 | Recodes the string from \a begin to \a end. If any transformations are |
640 | done, append them to \a appendTo and return the number of characters added. |
641 | If no transformations were required, return 0. |
642 | |
643 | The \a encoding option modifies the default behaviour: |
644 | \list |
645 | \li QUrl::DecodeReserved: if set, reserved characters will be decoded; |
646 | if unset, reserved characters will be encoded |
647 | \li QUrl::EncodeSpaces: if set, spaces will be encoded to "%20"; if unset, they will be " " |
648 | \li QUrl::EncodeUnicode: if set, characters above U+0080 will be encoded to their UTF-8 |
649 | percent-encoded form; if unset, they will be decoded to UTF-16 |
650 | \li QUrl::FullyDecoded: if set, this function will decode all percent-encoded sequences, |
651 | including that of the percent character. The resulting string |
652 | will not be percent-encoded anymore. Use with caution! |
653 | In this mode, the behaviour is undefined if the input string |
654 | contains any percent-encoding sequences above %80. |
655 | Also, the function will not correct bad % sequences. |
656 | \endlist |
657 | |
658 | Other flags are ignored (including QUrl::EncodeReserved). |
659 | |
660 | The \a tableModifications argument can be used to supply extra |
661 | modifications to the tables, to be applied after the flags above are |
662 | handled. It consists of a sequence of 16-bit values, where the low 8 bits |
663 | indicate the character in question and the high 8 bits are either \c |
664 | EncodeCharacter, \c LeaveCharacter or \c DecodeCharacter. |
665 | |
666 | This function corrects percent-encoded errors by interpreting every '%' as |
667 | meaning "%25" (all percents in the same content). |
668 | */ |
669 | |
670 | Q_AUTOTEST_EXPORT int |
671 | qt_urlRecode(QString &appendTo, const QChar *begin, const QChar *end, |
672 | QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications) |
673 | { |
674 | uchar actionTable[sizeof defaultActionTable]; |
675 | if ((encoding & QUrl::FullyDecoded) == QUrl::FullyDecoded) { |
676 | return decode(appendTo, begin: reinterpret_cast<const ushort *>(begin), end: reinterpret_cast<const ushort *>(end)); |
677 | } |
678 | |
679 | memcpy(dest: actionTable, src: defaultActionTable, n: sizeof actionTable); |
680 | if (encoding & QUrl::DecodeReserved) |
681 | maskTable(table&: actionTable, mask: reservedMask); |
682 | if (!(encoding & QUrl::EncodeSpaces)) |
683 | actionTable[0] = DecodeCharacter; // decode |
684 | |
685 | if (tableModifications) { |
686 | for (const ushort *p = tableModifications; *p; ++p) |
687 | actionTable[uchar(*p) - ' '] = *p >> 8; |
688 | } |
689 | |
690 | return recode(result&: appendTo, begin: reinterpret_cast<const ushort *>(begin), end: reinterpret_cast<const ushort *>(end), |
691 | encoding, actionTable, retryBadEncoding: false); |
692 | } |
693 | |
694 | // qstring.cpp |
695 | bool qt_is_ascii(const char *&ptr, const char *end) noexcept; |
696 | |
697 | /*! |
698 | \internal |
699 | \since 5.0 |
700 | |
701 | \a ba contains an 8-bit form of the component and it might be |
702 | percent-encoded already. We can't use QString::fromUtf8 because it might |
703 | contain non-UTF8 sequences. We can't use QByteArray::toPercentEncoding |
704 | because it might already contain percent-encoded sequences. We can't use |
705 | qt_urlRecode because it needs UTF-16 input. |
706 | */ |
707 | Q_AUTOTEST_EXPORT |
708 | QString qt_urlRecodeByteArray(const QByteArray &ba) |
709 | { |
710 | if (ba.isNull()) |
711 | return QString(); |
712 | |
713 | // scan ba for anything above or equal to 0x80 |
714 | // control points below 0x20 are fine in QString |
715 | const char *in = ba.constData(); |
716 | const char *const end = ba.constEnd(); |
717 | if (qt_is_ascii(ptr&: in, end)) { |
718 | // no non-ASCII found, we're safe to convert to QString |
719 | return QString::fromLatin1(str: ba, size: ba.size()); |
720 | } |
721 | |
722 | // we found something that we need to encode |
723 | QByteArray intermediate = ba; |
724 | intermediate.resize(size: ba.size() * 3 - (in - ba.constData())); |
725 | uchar *out = reinterpret_cast<uchar *>(intermediate.data() + (in - ba.constData())); |
726 | for ( ; in < end; ++in) { |
727 | if (*in & 0x80) { |
728 | // encode |
729 | *out++ = '%'; |
730 | *out++ = encodeNibble(c: uchar(*in) >> 4); |
731 | *out++ = encodeNibble(c: uchar(*in) & 0xf); |
732 | } else { |
733 | // keep |
734 | *out++ = uchar(*in); |
735 | } |
736 | } |
737 | |
738 | // now it's safe to call fromLatin1 |
739 | return QString::fromLatin1(str: intermediate, size: out - reinterpret_cast<uchar *>(intermediate.data())); |
740 | } |
741 | |
742 | QT_END_NAMESPACE |
743 | |