1 | // Copyright (C) 2016 Intel Corporation. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
3 | |
4 | #include "qurl.h" |
5 | #include "private/qstringconverter_p.h" |
6 | #include "private/qtools_p.h" |
7 | #include "private/qsimd_p.h" |
8 | |
9 | QT_BEGIN_NAMESPACE |
10 | |
// ### move to qurl_p.h
// What to do with a character, as stored in the per-character action tables.
// The numeric values are significant: the tables hold them as plain uchar and
// maskTable() turns an entry into DecodeCharacter by ANDing it with 0x00.
enum EncodingAction {
    DecodeCharacter = 0,    // write the character itself, even if the input had it as %XX
    LeaveCharacter = 1,     // keep the character in whichever form the input had
    EncodeCharacter = 2     // write the character as %XX
};
17 | |
18 | // From RFC 3896, Appendix A Collected ABNF for URI |
19 | // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" |
20 | // reserved = gen-delims / sub-delims |
21 | // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" |
22 | // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" |
23 | // / "*" / "+" / "," / ";" / "=" |
24 | static const uchar defaultActionTable[96] = { |
25 | 2, // space |
26 | 1, // '!' (sub-delim) |
27 | 2, // '"' |
28 | 1, // '#' (gen-delim) |
29 | 1, // '$' (gen-delim) |
30 | 2, // '%' (percent) |
31 | 1, // '&' (gen-delim) |
32 | 1, // "'" (sub-delim) |
33 | 1, // '(' (sub-delim) |
34 | 1, // ')' (sub-delim) |
35 | 1, // '*' (sub-delim) |
36 | 1, // '+' (sub-delim) |
37 | 1, // ',' (sub-delim) |
38 | 0, // '-' (unreserved) |
39 | 0, // '.' (unreserved) |
40 | 1, // '/' (gen-delim) |
41 | |
42 | 0, 0, 0, 0, 0, // '0' to '4' (unreserved) |
43 | 0, 0, 0, 0, 0, // '5' to '9' (unreserved) |
44 | 1, // ':' (gen-delim) |
45 | 1, // ';' (sub-delim) |
46 | 2, // '<' |
47 | 1, // '=' (sub-delim) |
48 | 2, // '>' |
49 | 1, // '?' (gen-delim) |
50 | |
51 | 1, // '@' (gen-delim) |
52 | 0, 0, 0, 0, 0, // 'A' to 'E' (unreserved) |
53 | 0, 0, 0, 0, 0, // 'F' to 'J' (unreserved) |
54 | 0, 0, 0, 0, 0, // 'K' to 'O' (unreserved) |
55 | 0, 0, 0, 0, 0, // 'P' to 'T' (unreserved) |
56 | 0, 0, 0, 0, 0, 0, // 'U' to 'Z' (unreserved) |
57 | 1, // '[' (gen-delim) |
58 | 2, // '\' |
59 | 1, // ']' (gen-delim) |
60 | 2, // '^' |
61 | 0, // '_' (unreserved) |
62 | |
63 | 2, // '`' |
64 | 0, 0, 0, 0, 0, // 'a' to 'e' (unreserved) |
65 | 0, 0, 0, 0, 0, // 'f' to 'j' (unreserved) |
66 | 0, 0, 0, 0, 0, // 'k' to 'o' (unreserved) |
67 | 0, 0, 0, 0, 0, // 'p' to 't' (unreserved) |
68 | 0, 0, 0, 0, 0, 0, // 'u' to 'z' (unreserved) |
69 | 2, // '{' |
70 | 2, // '|' |
71 | 2, // '}' |
72 | 0, // '~' (unreserved) |
73 | |
74 | 2 // BSKP |
75 | }; |
76 | |
77 | // mask tables, in negative polarity |
78 | // 0x00 if it belongs to this category |
79 | // 0xff if it doesn't |
80 | |
81 | static const uchar reservedMask[96] = { |
82 | 0xff, // space |
83 | 0xff, // '!' (sub-delim) |
84 | 0x00, // '"' |
85 | 0xff, // '#' (gen-delim) |
86 | 0xff, // '$' (gen-delim) |
87 | 0xff, // '%' (percent) |
88 | 0xff, // '&' (gen-delim) |
89 | 0xff, // "'" (sub-delim) |
90 | 0xff, // '(' (sub-delim) |
91 | 0xff, // ')' (sub-delim) |
92 | 0xff, // '*' (sub-delim) |
93 | 0xff, // '+' (sub-delim) |
94 | 0xff, // ',' (sub-delim) |
95 | 0xff, // '-' (unreserved) |
96 | 0xff, // '.' (unreserved) |
97 | 0xff, // '/' (gen-delim) |
98 | |
99 | 0xff, 0xff, 0xff, 0xff, 0xff, // '0' to '4' (unreserved) |
100 | 0xff, 0xff, 0xff, 0xff, 0xff, // '5' to '9' (unreserved) |
101 | 0xff, // ':' (gen-delim) |
102 | 0xff, // ';' (sub-delim) |
103 | 0x00, // '<' |
104 | 0xff, // '=' (sub-delim) |
105 | 0x00, // '>' |
106 | 0xff, // '?' (gen-delim) |
107 | |
108 | 0xff, // '@' (gen-delim) |
109 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'A' to 'E' (unreserved) |
110 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'F' to 'J' (unreserved) |
111 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'K' to 'O' (unreserved) |
112 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'P' to 'T' (unreserved) |
113 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'U' to 'Z' (unreserved) |
114 | 0xff, // '[' (gen-delim) |
115 | 0x00, // '\' |
116 | 0xff, // ']' (gen-delim) |
117 | 0x00, // '^' |
118 | 0xff, // '_' (unreserved) |
119 | |
120 | 0x00, // '`' |
121 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'a' to 'e' (unreserved) |
122 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'f' to 'j' (unreserved) |
123 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'k' to 'o' (unreserved) |
124 | 0xff, 0xff, 0xff, 0xff, 0xff, // 'p' to 't' (unreserved) |
125 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'u' to 'z' (unreserved) |
126 | 0x00, // '{' |
127 | 0x00, // '|' |
128 | 0x00, // '}' |
129 | 0xff, // '~' (unreserved) |
130 | |
131 | 0xff // BSKP |
132 | }; |
133 | |
// Returns true if \a c is an ASCII hexadecimal digit: 0-9, A-F or a-f.
static inline bool isHex(char16_t c)
{
    if (c >= u'0' && c <= u'9')
        return true;
    if (c >= u'A' && c <= u'F')
        return true;
    return c >= u'a' && c <= u'f';
}
138 | |
// Returns true if the hex digit \a c needs no upper-casing, that is, if it is
// a decimal digit or one of A-F.
// Precondition: \a c is a hex digit; the result is meaningless otherwise.
static inline bool isUpperHex(char16_t c)
{
    // '0'-'9' (0x30-0x39) and 'A'-'F' (0x41-0x46) are below 0x60,
    // 'a'-'f' (0x61-0x66) are above it
    constexpr char16_t FirstLowerCaseColumn = 0x60;
    return c < FirstLowerCaseColumn;
}
144 | |
145 | static inline char16_t toUpperHex(char16_t c) |
146 | { |
147 | return isUpperHex(c) ? c : c - 0x20; |
148 | } |
149 | |
// Returns the value (0-15) of the hex digit \a c.
// Precondition: \a c is a hex digit; no validation is done here.
static inline ushort decodeNibble(char16_t c)
{
    // test the ranges from the highest code points down: a-f, then A-F, then 0-9
    if (c >= u'a')
        return c - u'a' + 0xA;
    if (c >= u'A')
        return c - u'A' + 0xA;
    return c - u'0';
}
154 | |
155 | // if the sequence at input is 2*HEXDIG, returns its decoding |
156 | // returns -1 if it isn't. |
157 | // assumes that the range has been checked already |
158 | static inline char16_t decodePercentEncoding(const char16_t *input) |
159 | { |
160 | char16_t c1 = input[1]; |
161 | char16_t c2 = input[2]; |
162 | if (!isHex(c: c1) || !isHex(c: c2)) |
163 | return char16_t(-1); |
164 | return decodeNibble(c: c1) << 4 | decodeNibble(c: c2); |
165 | } |
166 | |
167 | static inline char16_t encodeNibble(ushort c) |
168 | { |
169 | return QtMiscUtils::toHexUpper(value: c); |
170 | } |
171 | |
172 | static void ensureDetached(QString &result, char16_t *&output, const char16_t *begin, const char16_t *input, const char16_t *end, |
173 | int add = 0) |
174 | { |
175 | if (!output) { |
176 | // now detach |
177 | // create enough space if the rest of the string needed to be percent-encoded |
178 | int charsProcessed = input - begin; |
179 | int charsRemaining = end - input; |
180 | int spaceNeeded = end - begin + 2 * charsRemaining + add; |
181 | int origSize = result.size(); |
182 | result.resize(size: origSize + spaceNeeded); |
183 | |
184 | // we know that resize() above detached, so we bypass the reference count check |
185 | output = const_cast<char16_t *>(reinterpret_cast<const char16_t *>(result.constData())) |
186 | + origSize; |
187 | |
188 | // copy the chars we've already processed |
189 | int i; |
190 | for (i = 0; i < charsProcessed; ++i) |
191 | output[i] = begin[i]; |
192 | output += i; |
193 | } |
194 | } |
195 | |
196 | namespace { |
197 | struct QUrlUtf8Traits : public QUtf8BaseTraitsNoAscii |
198 | { |
199 | // From RFC 3987: |
200 | // iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar |
201 | // |
202 | // ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF |
203 | // / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD |
204 | // / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD |
205 | // / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD |
206 | // / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD |
207 | // / %xD0000-DFFFD / %xE1000-EFFFD |
208 | // |
209 | // iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD |
210 | // |
211 | // That RFC allows iprivate only as part of iquery, but we don't know here |
212 | // whether we're looking at a query or another part of an URI, so we accept |
213 | // them too. The definition above excludes U+FFF0 to U+FFFD from appearing |
214 | // unencoded, but we see no reason for its exclusion, so we allow them to |
215 | // be decoded (and we need U+FFFD the replacement character to indicate |
216 | // failure to decode). |
217 | // |
218 | // That means we must disallow: |
219 | // * unpaired surrogates (QUtf8Functions takes care of that for us) |
220 | // * non-characters |
221 | static const bool allowNonCharacters = false; |
222 | |
223 | // override: our "bytes" are three percent-encoded UTF-16 characters |
224 | static void appendByte(char16_t *&ptr, uchar b) |
225 | { |
226 | // b >= 0x80, by construction, so percent-encode |
227 | *ptr++ = '%'; |
228 | *ptr++ = encodeNibble(c: b >> 4); |
229 | *ptr++ = encodeNibble(c: b & 0xf); |
230 | } |
231 | |
232 | static uchar peekByte(const char16_t *ptr, qsizetype n = 0) |
233 | { |
234 | // decodePercentEncoding returns char16_t(-1) if it can't decode, |
235 | // which means we return 0xff, which is not a valid continuation byte. |
236 | // If ptr[i * 3] is not '%', we'll multiply by zero and return 0, |
237 | // also not a valid continuation byte (if it's '%', we multiply by 1). |
238 | return uchar(decodePercentEncoding(input: ptr + n * 3)) |
239 | * uchar(ptr[n * 3] == '%'); |
240 | } |
241 | |
242 | static qptrdiff availableBytes(const char16_t *ptr, const char16_t *end) |
243 | { |
244 | return (end - ptr) / 3; |
245 | } |
246 | |
247 | static void advanceByte(const char16_t *&ptr, int n = 1) |
248 | { |
249 | ptr += n * 3; |
250 | } |
251 | }; |
252 | } |
253 | |
254 | // returns true if we performed an UTF-8 decoding |
255 | static bool encodedUtf8ToUtf16(QString &result, char16_t *&output, const char16_t *begin, |
256 | const char16_t *&input, const char16_t *end, char16_t decoded) |
257 | { |
258 | char32_t ucs4 = 0, *dst = &ucs4; |
259 | const char16_t *src = input + 3;// skip the %XX that yielded \a decoded |
260 | int charsNeeded = QUtf8Functions::fromUtf8<QUrlUtf8Traits>(b: decoded, dst, src, end); |
261 | if (charsNeeded < 0) |
262 | return false; |
263 | |
264 | if (!QChar::requiresSurrogates(ucs4)) { |
265 | // UTF-8 decoded and no surrogates are required |
266 | // detach if necessary |
267 | // possibilities are: 6 chars (%XX%XX) -> one char; 9 chars (%XX%XX%XX) -> one char |
268 | ensureDetached(result, output, begin, input, end, add: -3 * charsNeeded + 1); |
269 | *output++ = ucs4; |
270 | } else { |
271 | // UTF-8 decoded to something that requires a surrogate pair |
272 | // compressing from %XX%XX%XX%XX (12 chars) to two |
273 | ensureDetached(result, output, begin, input, end, add: -10); |
274 | *output++ = QChar::highSurrogate(ucs4); |
275 | *output++ = QChar::lowSurrogate(ucs4); |
276 | } |
277 | |
278 | input = src - 1; |
279 | return true; |
280 | } |
281 | |
282 | static void unicodeToEncodedUtf8(QString &result, char16_t *&output, const char16_t *begin, |
283 | const char16_t *&input, const char16_t *end, char16_t decoded) |
284 | { |
285 | // calculate the utf8 length and ensure enough space is available |
286 | int utf8len = QChar::isHighSurrogate(ucs4: decoded) ? 4 : decoded >= 0x800 ? 3 : 2; |
287 | |
288 | // detach |
289 | if (!output) { |
290 | // we need 3 * utf8len for the encoded UTF-8 sequence |
291 | // but ensureDetached already adds 3 for the char we're processing |
292 | ensureDetached(result, output, begin, input, end, add: 3*utf8len - 3); |
293 | } else { |
294 | // verify that there's enough space or expand |
295 | int charsRemaining = end - input - 1; // not including this one |
296 | int pos = output - reinterpret_cast<const char16_t *>(result.constData()); |
297 | int spaceRemaining = result.size() - pos; |
298 | if (spaceRemaining < 3*charsRemaining + 3*utf8len) { |
299 | // must resize |
300 | result.resize(size: result.size() + 3*utf8len); |
301 | |
302 | // we know that resize() above detached, so we bypass the reference count check |
303 | output = const_cast<char16_t *>(reinterpret_cast<const char16_t *>(result.constData())); |
304 | output += pos; |
305 | } |
306 | } |
307 | |
308 | ++input; |
309 | int res = QUtf8Functions::toUtf8<QUrlUtf8Traits>(u: decoded, dst&: output, src&: input, end); |
310 | --input; |
311 | if (res < 0) { |
312 | // bad surrogate pair sequence |
313 | // we will encode bad UTF-16 to UTF-8 |
314 | // but they don't get decoded back |
315 | |
316 | // first of three bytes |
317 | uchar c = 0xe0 | uchar(decoded >> 12); |
318 | *output++ = '%'; |
319 | *output++ = 'E'; |
320 | *output++ = encodeNibble(c: c & 0xf); |
321 | |
322 | // second byte |
323 | c = 0x80 | (uchar(decoded >> 6) & 0x3f); |
324 | *output++ = '%'; |
325 | *output++ = encodeNibble(c: c >> 4); |
326 | *output++ = encodeNibble(c: c & 0xf); |
327 | |
328 | // third byte |
329 | c = 0x80 | (decoded & 0x3f); |
330 | *output++ = '%'; |
331 | *output++ = encodeNibble(c: c >> 4); |
332 | *output++ = encodeNibble(c: c & 0xf); |
333 | } |
334 | } |
335 | |
336 | static int recode(QString &result, const char16_t *begin, const char16_t *end, |
337 | QUrl::ComponentFormattingOptions encoding, const uchar *actionTable, |
338 | bool retryBadEncoding) |
339 | { |
340 | const int origSize = result.size(); |
341 | const char16_t *input = begin; |
342 | char16_t *output = nullptr; |
343 | |
344 | EncodingAction action = EncodeCharacter; |
345 | for ( ; input != end; ++input) { |
346 | char16_t c; |
347 | // try a run where no change is necessary |
348 | for ( ; input != end; ++input) { |
349 | c = *input; |
350 | if (c < 0x20U) |
351 | action = EncodeCharacter; |
352 | if (c < 0x20U || c >= 0x80U) // also: (c - 0x20 < 0x60U) |
353 | goto non_trivial; |
354 | action = EncodingAction(actionTable[c - ' ']); |
355 | if (action == EncodeCharacter) |
356 | goto non_trivial; |
357 | if (output) |
358 | *output++ = c; |
359 | } |
360 | break; |
361 | |
362 | non_trivial: |
363 | char16_t decoded; |
364 | if (c == '%' && retryBadEncoding) { |
365 | // always write "%25" |
366 | ensureDetached(result, output, begin, input, end); |
367 | *output++ = '%'; |
368 | *output++ = '2'; |
369 | *output++ = '5'; |
370 | continue; |
371 | } else if (c == '%') { |
372 | // check if the input is valid |
373 | if (input + 2 >= end || (decoded = decodePercentEncoding(input)) == char16_t(-1)) { |
374 | // not valid, retry |
375 | result.resize(size: origSize); |
376 | return recode(result, begin, end, encoding, actionTable, retryBadEncoding: true); |
377 | } |
378 | |
379 | if (decoded >= 0x80) { |
380 | // decode the UTF-8 sequence |
381 | if (!(encoding & QUrl::EncodeUnicode) && |
382 | encodedUtf8ToUtf16(result, output, begin, input, end, decoded)) |
383 | continue; |
384 | |
385 | // decoding the encoded UTF-8 failed |
386 | action = LeaveCharacter; |
387 | } else if (decoded >= 0x20) { |
388 | action = EncodingAction(actionTable[decoded - ' ']); |
389 | } |
390 | } else { |
391 | decoded = c; |
392 | if (decoded >= 0x80 && encoding & QUrl::EncodeUnicode) { |
393 | // encode the UTF-8 sequence |
394 | unicodeToEncodedUtf8(result, output, begin, input, end, decoded); |
395 | continue; |
396 | } else if (decoded >= 0x80) { |
397 | if (output) |
398 | *output++ = c; |
399 | continue; |
400 | } |
401 | } |
402 | |
403 | // there are six possibilities: |
404 | // current \ action | DecodeCharacter | LeaveCharacter | EncodeCharacter |
405 | // decoded | 1:leave | 2:leave | 3:encode |
406 | // encoded | 4:decode | 5:leave | 6:leave |
407 | // cases 1 and 2 were handled before this section |
408 | |
409 | if (c == '%' && action != DecodeCharacter) { |
410 | // cases 5 and 6: it's encoded and we're leaving it as it is |
411 | // except we're pedantic and we'll uppercase the hex |
412 | if (output || !isUpperHex(c: input[1]) || !isUpperHex(c: input[2])) { |
413 | ensureDetached(result, output, begin, input, end); |
414 | *output++ = '%'; |
415 | *output++ = toUpperHex(c: *++input); |
416 | *output++ = toUpperHex(c: *++input); |
417 | } |
418 | } else if (c == '%' && action == DecodeCharacter) { |
419 | // case 4: we need to decode |
420 | ensureDetached(result, output, begin, input, end); |
421 | *output++ = decoded; |
422 | input += 2; |
423 | } else { |
424 | // must be case 3: we need to encode |
425 | ensureDetached(result, output, begin, input, end); |
426 | *output++ = '%'; |
427 | *output++ = encodeNibble(c: c >> 4); |
428 | *output++ = encodeNibble(c: c & 0xf); |
429 | } |
430 | } |
431 | |
432 | if (output) { |
433 | int len = output - reinterpret_cast<const char16_t *>(result.constData()); |
434 | result.truncate(pos: len); |
435 | return len - origSize; |
436 | } |
437 | return 0; |
438 | } |
439 | |
440 | /* |
441 | * Returns true if the input it checked (if it checked anything) is not |
442 | * encoded. A return of false indicates there's a percent at \a input that |
443 | * needs to be decoded. |
444 | */ |
445 | #ifdef __SSE2__ |
446 | static bool simdCheckNonEncoded(QChar *&output, const char16_t *&input, const char16_t *end) |
447 | { |
448 | # ifdef __AVX2__ |
449 | const __m256i percents256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128('%')); |
450 | const __m128i percents = _mm256_castsi256_si128(percents256); |
451 | # else |
452 | const __m128i percents = _mm_set1_epi16(w: '%'); |
453 | # endif |
454 | |
455 | uint idx = 0; |
456 | quint32 mask = 0; |
457 | if (input + 16 <= end) { |
458 | qptrdiff offset = 0; |
459 | for ( ; input + offset + 16 <= end; offset += 16) { |
460 | # ifdef __AVX2__ |
461 | // do 32 bytes at a time using AVX2 |
462 | __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(input + offset)); |
463 | __m256i comparison = _mm256_cmpeq_epi16(data, percents256); |
464 | mask = _mm256_movemask_epi8(comparison); |
465 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + offset), data); |
466 | # else |
467 | // do 32 bytes at a time using unrolled SSE2 |
468 | __m128i data1 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset)); |
469 | __m128i data2 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset + 8)); |
470 | __m128i comparison1 = _mm_cmpeq_epi16(a: data1, b: percents); |
471 | __m128i comparison2 = _mm_cmpeq_epi16(a: data2, b: percents); |
472 | uint mask1 = _mm_movemask_epi8(a: comparison1); |
473 | uint mask2 = _mm_movemask_epi8(a: comparison2); |
474 | |
475 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset), b: data1); |
476 | if (!mask1) |
477 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset + 8), b: data2); |
478 | mask = mask1 | (mask2 << 16); |
479 | # endif |
480 | |
481 | if (mask) { |
482 | idx = qCountTrailingZeroBits(v: mask) / 2; |
483 | break; |
484 | } |
485 | } |
486 | |
487 | input += offset; |
488 | if (output) |
489 | output += offset; |
490 | } else if (input + 8 <= end) { |
491 | // do 16 bytes at a time |
492 | __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input)); |
493 | __m128i comparison = _mm_cmpeq_epi16(a: data, b: percents); |
494 | mask = _mm_movemask_epi8(a: comparison); |
495 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output), b: data); |
496 | idx = qCountTrailingZeroBits(v: quint16(mask)) / 2; |
497 | } else if (input + 4 <= end) { |
498 | // do 8 bytes only |
499 | __m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(input)); |
500 | __m128i comparison = _mm_cmpeq_epi16(a: data, b: percents); |
501 | mask = _mm_movemask_epi8(a: comparison) & 0xffu; |
502 | _mm_storel_epi64(p: reinterpret_cast<__m128i *>(output), a: data); |
503 | idx = qCountTrailingZeroBits(v: quint8(mask)) / 2; |
504 | } else { |
505 | // no percents found (because we didn't check) |
506 | return true; |
507 | } |
508 | |
509 | // advance to the next non-encoded |
510 | input += idx; |
511 | output += idx; |
512 | |
513 | return !mask; |
514 | } |
515 | #else |
516 | static bool simdCheckNonEncoded(...) |
517 | { |
518 | return true; |
519 | } |
520 | #endif |
521 | |
522 | /*! |
523 | \since 5.0 |
524 | \internal |
525 | |
526 | This function decodes a percent-encoded string located in \a in |
527 | by appending each character to \a appendTo. It returns the number of |
528 | characters appended. Each percent-encoded sequence is decoded as follows: |
529 | |
530 | \list |
531 | \li from %00 to %7F: the exact decoded value is appended; |
532 | \li from %80 to %FF: QChar::ReplacementCharacter is appended; |
533 | \li bad encoding: original input is copied to the output, undecoded. |
534 | \endlist |
535 | |
536 | Given the above, it's important for the input to already have all UTF-8 |
537 | percent sequences decoded by qt_urlRecode (that is, the input should not |
538 | have been processed with QUrl::EncodeUnicode). |
539 | |
540 | The input should also be a valid percent-encoded sequence (the output of |
541 | qt_urlRecode is always valid). |
542 | */ |
543 | static qsizetype decode(QString &appendTo, QStringView in) |
544 | { |
545 | const char16_t *begin = in.utf16(); |
546 | const char16_t *end = begin + in.size(); |
547 | |
548 | // fast check whether there's anything to be decoded in the first place |
549 | const char16_t *input = QtPrivate::qustrchr(str: in, ch: '%'); |
550 | |
551 | if (Q_LIKELY(input == end)) |
552 | return 0; // nothing to do, it was already decoded! |
553 | |
554 | // detach |
555 | const int origSize = appendTo.size(); |
556 | appendTo.resize(size: origSize + (end - begin)); |
557 | QChar *output = appendTo.data() + origSize; |
558 | memcpy(dest: static_cast<void *>(output), src: static_cast<const void *>(begin), n: (input - begin) * sizeof(QChar)); |
559 | output += input - begin; |
560 | |
561 | while (input != end) { |
562 | // something was encoded |
563 | Q_ASSERT(*input == '%'); |
564 | |
565 | if (Q_UNLIKELY(end - input < 3 || !isHex(input[1]) || !isHex(input[2]))) { |
566 | // badly-encoded data |
567 | appendTo.resize(size: origSize + (end - begin)); |
568 | memcpy(dest: static_cast<void *>(appendTo.begin() + origSize), |
569 | src: static_cast<const void *>(begin), n: (end - begin) * sizeof(*end)); |
570 | return end - begin; |
571 | } |
572 | |
573 | ++input; |
574 | *output++ = QChar::fromUcs2(c: decodeNibble(c: input[0]) << 4 | decodeNibble(c: input[1])); |
575 | if (output[-1].unicode() >= 0x80) |
576 | output[-1] = QChar::ReplacementCharacter; |
577 | input += 2; |
578 | |
579 | // search for the next percent, copying from input to output |
580 | if (simdCheckNonEncoded(output, input, end)) { |
581 | while (input != end) { |
582 | const char16_t uc = *input; |
583 | if (uc == '%') |
584 | break; |
585 | *output++ = uc; |
586 | ++input; |
587 | } |
588 | } |
589 | } |
590 | |
591 | const qsizetype len = output - appendTo.begin(); |
592 | appendTo.truncate(pos: len); |
593 | return len - origSize; |
594 | } |
595 | |
596 | template <size_t N> |
597 | static void maskTable(uchar (&table)[N], const uchar (&mask)[N]) |
598 | { |
599 | for (size_t i = 0; i < N; ++i) |
600 | table[i] &= mask[i]; |
601 | } |
602 | |
603 | /*! |
604 | \internal |
605 | |
606 | Recodes the string from \a begin to \a end. If any transformations are |
607 | done, append them to \a appendTo and return the number of characters added. |
608 | If no transformations were required, return 0. |
609 | |
610 | The \a encoding option modifies the default behaviour: |
611 | \list |
612 | \li QUrl::DecodeReserved: if set, reserved characters will be decoded; |
613 | if unset, reserved characters will be encoded |
614 | \li QUrl::EncodeSpaces: if set, spaces will be encoded to "%20"; if unset, they will be " " |
615 | \li QUrl::EncodeUnicode: if set, characters above U+0080 will be encoded to their UTF-8 |
616 | percent-encoded form; if unset, they will be decoded to UTF-16 |
617 | \li QUrl::FullyDecoded: if set, this function will decode all percent-encoded sequences, |
618 | including that of the percent character. The resulting string |
619 | will not be percent-encoded anymore. Use with caution! |
620 | In this mode, the behaviour is undefined if the input string |
621 | contains any percent-encoding sequences above %80. |
622 | Also, the function will not correct bad % sequences. |
623 | \endlist |
624 | |
625 | Other flags are ignored (including QUrl::EncodeReserved). |
626 | |
627 | The \a tableModifications argument can be used to supply extra |
628 | modifications to the tables, to be applied after the flags above are |
629 | handled. It consists of a sequence of 16-bit values, where the low 8 bits |
630 | indicate the character in question and the high 8 bits are either \c |
631 | EncodeCharacter, \c LeaveCharacter or \c DecodeCharacter. |
632 | |
633 | This function corrects percent-encoded errors by interpreting every '%' as |
634 | meaning "%25" (all percents in the same content). |
635 | */ |
636 | |
637 | Q_AUTOTEST_EXPORT qsizetype |
638 | qt_urlRecode(QString &appendTo, QStringView in, |
639 | QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications) |
640 | { |
641 | uchar actionTable[sizeof defaultActionTable]; |
642 | if ((encoding & QUrl::FullyDecoded) == QUrl::FullyDecoded) { |
643 | return decode(appendTo, in); |
644 | } |
645 | |
646 | memcpy(dest: actionTable, src: defaultActionTable, n: sizeof actionTable); |
647 | if (encoding & QUrl::DecodeReserved) |
648 | maskTable(table&: actionTable, mask: reservedMask); |
649 | if (!(encoding & QUrl::EncodeSpaces)) |
650 | actionTable[0] = DecodeCharacter; // decode |
651 | |
652 | if (tableModifications) { |
653 | for (const ushort *p = tableModifications; *p; ++p) |
654 | actionTable[uchar(*p) - ' '] = *p >> 8; |
655 | } |
656 | |
657 | return recode(result&: appendTo, begin: reinterpret_cast<const char16_t *>(in.begin()), |
658 | end: reinterpret_cast<const char16_t *>(in.end()), encoding, actionTable, retryBadEncoding: false); |
659 | } |
660 | |
661 | QT_END_NAMESPACE |
662 | |