qurlrecode.cpp source code [qtbase/src/corelib/io/qurlrecode.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2016 Intel Corporation.
4	** Contact: https://www.qt.io/licensing/
5	**
6	** This file is part of the QtCore module of the Qt Toolkit.
7	**
8	** $QT_BEGIN_LICENSE:LGPL$
9	** Commercial License Usage
10	** Licensees holding valid commercial Qt licenses may use this file in
11	** accordance with the commercial license agreement provided with the
12	** Software or, alternatively, in accordance with the terms contained in
13	** a written agreement between you and The Qt Company. For licensing terms
14	** and conditions see https://www.qt.io/terms-conditions. For further
15	** information use the contact form at https://www.qt.io/contact-us.
16	**
17	** GNU Lesser General Public License Usage
18	** Alternatively, this file may be used under the terms of the GNU Lesser
19	** General Public License version 3 as published by the Free Software
20	** Foundation and appearing in the file LICENSE.LGPL3 included in the
21	** packaging of this file. Please review the following information to
22	** ensure the GNU Lesser General Public License version 3 requirements
23	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24	**
25	** GNU General Public License Usage
26	** Alternatively, this file may be used under the terms of the GNU
27	** General Public License version 2.0 or (at your option) the GNU General
28	** Public license version 3 or any later version approved by the KDE Free
29	** Qt Foundation. The licenses are as published by the Free Software
30	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31	** included in the packaging of this file. Please review the following
32	** information to ensure the GNU General Public License requirements will
33	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34	** https://www.gnu.org/licenses/gpl-3.0.html.
35	**
36	** $QT_END_LICENSE$
37	**
38	****************************************************************************/
39
40	#include "qurl.h"
41	#include "private/qutfcodec_p.h"
42	#include "private/qtools_p.h"
43	#include "private/qsimd_p.h"
44
45	QT_BEGIN_NAMESPACE
46
// ### move to qurl_p.h
// Action to take for a character while recoding. The numeric values are
// significant: the action tables below store them as raw uchar entries, and
// maskTable() AND-s entries with 0x00 to turn them into DecodeCharacter (0).
enum EncodingAction {
    DecodeCharacter = 0,    // character must appear in its decoded form
    LeaveCharacter = 1,     // keep the character as it is (encoded or not)
    EncodeCharacter = 2     // character must appear percent-encoded
};
53
54	// From RFC 3896, Appendix A Collected ABNF for URI
55	// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
56	// reserved = gen-delims / sub-delims
57	// gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
58	// sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
59	// / "" / "+" / "," / ";" / "="*
60	static const uchar defaultActionTable[`96`] = {
61	`2`, // space
62	`1`, // '!' (sub-delim)
63	`2`, // '"'
64	`1`, // '#' (gen-delim)
65	`1`, // '$' (gen-delim)
66	`2`, // '%' (percent)
67	`1`, // '&' (gen-delim)
68	`1`, // "'" (sub-delim)
69	`1`, // '(' (sub-delim)
70	`1`, // ')' (sub-delim)
71	`1`, // '' (sub-delim)*
72	`1`, // '+' (sub-delim)
73	`1`, // ',' (sub-delim)
74	`0`, // '-' (unreserved)
75	`0`, // '.' (unreserved)
76	`1`, // '/' (gen-delim)
77
78	`0`, `0`, `0`, `0`, `0`, // '0' to '4' (unreserved)
79	`0`, `0`, `0`, `0`, `0`, // '5' to '9' (unreserved)
80	`1`, // ':' (gen-delim)
81	`1`, // ';' (sub-delim)
82	`2`, // '<'
83	`1`, // '=' (sub-delim)
84	`2`, // '>'
85	`1`, // '?' (gen-delim)
86
87	`1`, // '@' (gen-delim)
88	`0`, `0`, `0`, `0`, `0`, // 'A' to 'E' (unreserved)
89	`0`, `0`, `0`, `0`, `0`, // 'F' to 'J' (unreserved)
90	`0`, `0`, `0`, `0`, `0`, // 'K' to 'O' (unreserved)
91	`0`, `0`, `0`, `0`, `0`, // 'P' to 'T' (unreserved)
92	`0`, `0`, `0`, `0`, `0`, `0`, // 'U' to 'Z' (unreserved)
93	`1`, // '[' (gen-delim)
94	`2`, // '\'
95	`1`, // ']' (gen-delim)
96	`2`, // '^'
97	`0`, // '_' (unreserved)
98
99	`2`, // '`'
100	`0`, `0`, `0`, `0`, `0`, // 'a' to 'e' (unreserved)
101	`0`, `0`, `0`, `0`, `0`, // 'f' to 'j' (unreserved)
102	`0`, `0`, `0`, `0`, `0`, // 'k' to 'o' (unreserved)
103	`0`, `0`, `0`, `0`, `0`, // 'p' to 't' (unreserved)
104	`0`, `0`, `0`, `0`, `0`, `0`, // 'u' to 'z' (unreserved)
105	`2`, // '{'
106	`2`, // '\|'
107	`2`, // '}'
108	`0`, // '~' (unreserved)
109
110	`2` // BSKP
111	};
112
113	// mask tables, in negative polarity
114	// 0x00 if it belongs to this category
115	// 0xff if it doesn't
116
117	static const uchar reservedMask[`96`] = {
118	`0xff`, // space
119	`0xff`, // '!' (sub-delim)
120	`0x00`, // '"'
121	`0xff`, // '#' (gen-delim)
122	`0xff`, // '$' (gen-delim)
123	`0xff`, // '%' (percent)
124	`0xff`, // '&' (gen-delim)
125	`0xff`, // "'" (sub-delim)
126	`0xff`, // '(' (sub-delim)
127	`0xff`, // ')' (sub-delim)
128	`0xff`, // '' (sub-delim)*
129	`0xff`, // '+' (sub-delim)
130	`0xff`, // ',' (sub-delim)
131	`0xff`, // '-' (unreserved)
132	`0xff`, // '.' (unreserved)
133	`0xff`, // '/' (gen-delim)
134
135	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // '0' to '4' (unreserved)
136	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // '5' to '9' (unreserved)
137	`0xff`, // ':' (gen-delim)
138	`0xff`, // ';' (sub-delim)
139	`0x00`, // '<'
140	`0xff`, // '=' (sub-delim)
141	`0x00`, // '>'
142	`0xff`, // '?' (gen-delim)
143
144	`0xff`, // '@' (gen-delim)
145	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'A' to 'E' (unreserved)
146	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'F' to 'J' (unreserved)
147	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'K' to 'O' (unreserved)
148	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'P' to 'T' (unreserved)
149	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'U' to 'Z' (unreserved)
150	`0xff`, // '[' (gen-delim)
151	`0x00`, // '\'
152	`0xff`, // ']' (gen-delim)
153	`0x00`, // '^'
154	`0xff`, // '_' (unreserved)
155
156	`0x00`, // '`'
157	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'a' to 'e' (unreserved)
158	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'f' to 'j' (unreserved)
159	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'k' to 'o' (unreserved)
160	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'p' to 't' (unreserved)
161	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'u' to 'z' (unreserved)
162	`0x00`, // '{'
163	`0x00`, // '\|'
164	`0x00`, // '}'
165	`0xff`, // '~' (unreserved)
166
167	`0xff` // BSKP
168	};
169
// Returns true if \a c is an ASCII hexadecimal digit: 0-9, A-F or a-f.
static inline bool isHex(ushort c)
{
    return (c >= 'a' && c <= 'f') ||
            (c >= 'A' && c <= 'F') ||
            (c >= '0' && c <= '9');
}
176
// Returns true if the hex digit \a c needs no uppercasing, that is, if it is
// a decimal digit or one of 'A' to 'F' (all of which sort below 0x60).
static inline bool isUpperHex(ushort c)
{
    // the result is meaningless if c isn't a hex char!
    return c < 0x60;
}
182
183	static inline ushort toUpperHex(ushort c)
184	{
185	return isUpperHex(c) ? c : c - `0x20`;
186	}
187
// Returns the numeric value (0 to 15) of the hex digit \a c.
// \a c must be a hex digit (see isHex()); no validation is done here.
static inline ushort decodeNibble(ushort c)
{
    return c >= 'a' ? c - 'a' + 0xA :
           c >= 'A' ? c - 'A' + 0xA : c - '0';
}
193
194	// if the sequence at input is 2HEXDIG, returns its decoding*
195	// returns -1 if it isn't.
196	// assumes that the range has been checked already
197	static inline ushort decodePercentEncoding(const ushort *input)
198	{
199	ushort c1 = input[`1`];
200	ushort c2 = input[`2`];
201	if (!isHex(c: c1) \|\| !isHex(c: c2))
202	return ushort(-`1`);
203	return decodeNibble(c: c1) << `4` \| decodeNibble(c: c2);
204	}
205
206	static inline ushort encodeNibble(ushort c)
207	{
208	return ushort(QtMiscUtils::toHexUpper(value: c));
209	}
210
211	static void ensureDetached(QString &result, ushort &output, const* ushort begin, const* ushort input, const* ushort *end,
212	int add = `0`)
213	{
214	if (!output) {
215	// now detach
216	// create enough space if the rest of the string needed to be percent-encoded
217	int charsProcessed = input - begin;
218	int charsRemaining = end - input;
219	int spaceNeeded = end - begin + `2` * charsRemaining + add;
220	int origSize = result.size();
221	result.resize(size: origSize + spaceNeeded);
222
223	// we know that resize() above detached, so we bypass the reference count check
224	output = const_cast<ushort >(reinterpret_cast<const* ushort *>(result.constData()))
225	+ origSize;
226
227	// copy the chars we've already processed
228	int i;
229	for (i = `0`; i < charsProcessed; ++i)
230	output[i] = begin[i];
231	output += i;
232	}
233	}
234
235	namespace {
236	struct QUrlUtf8Traits : public QUtf8BaseTraitsNoAscii
237	{
238	// From RFC 3987:
239	// iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
240	//
241	// ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
242	// / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
243	// / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
244	// / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
245	// / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
246	// / %xD0000-DFFFD / %xE1000-EFFFD
247	//
248	// iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
249	//
250	// That RFC allows iprivate only as part of iquery, but we don't know here
251	// whether we're looking at a query or another part of an URI, so we accept
252	// them too. The definition above excludes U+FFF0 to U+FFFD from appearing
253	// unencoded, but we see no reason for its exclusion, so we allow them to
254	// be decoded (and we need U+FFFD the replacement character to indicate
255	// failure to decode).
256	//
257	// That means we must disallow:
258	// unpaired surrogates (QUtf8Functions takes care of that for us)*
259	// non-characters*
260	static const bool allowNonCharacters = false;
261
262	// override: our "bytes" are three percent-encoded UTF-16 characters
263	static void appendByte(ushort *&ptr, uchar b)
264	{
265	// b >= 0x80, by construction, so percent-encode
266	*ptr++ = `'%'`;
267	*ptr++ = encodeNibble(c: b >> `4`);
268	*ptr++ = encodeNibble(c: b & `0xf`);
269	}
270
271	static uchar peekByte(const ushort ptr, int* n = `0`)
272	{
273	// decodePercentEncoding returns ushort(-1) if it can't decode,
274	// which means we return 0xff, which is not a valid continuation byte.
275	// If ptr[i 3] is not '%', we'll multiply by zero and return 0,*
276	// also not a valid continuation byte (if it's '%', we multiply by 1).
277	return uchar(decodePercentEncoding(input: ptr + n * `3`))
278	* uchar(ptr[n * `3`] == `'%'`);
279	}
280
281	static qptrdiff availableBytes(const ushort ptr, const* ushort *end)
282	{
283	return (end - ptr) / `3`;
284	}
285
286	static void advanceByte(const ushort &ptr, int* n = `1`)
287	{
288	ptr += n * `3`;
289	}
290	};
291	}
292
293	// returns true if we performed an UTF-8 decoding
294	static bool encodedUtf8ToUtf16(QString &result, ushort &output, const* ushort begin, const* ushort *&input,
295	const ushort *end, ushort decoded)
296	{
297	uint ucs4, *dst = &ucs4;
298	const ushort src = input + `3`;// skip the %XX that yielded \a decoded*
299	int charsNeeded = QUtf8Functions::fromUtf8<QUrlUtf8Traits>(b: decoded, dst, src, end);
300	if (charsNeeded < `0`)
301	return false;
302
303	if (!QChar::requiresSurrogates(ucs4)) {
304	// UTF-8 decoded and no surrogates are required
305	// detach if necessary
306	// possibilities are: 6 chars (%XX%XX) -> one char; 9 chars (%XX%XX%XX) -> one char
307	ensureDetached(result, output, begin, input, end, add: -`3` * charsNeeded + `1`);
308	*output++ = ucs4;
309	} else {
310	// UTF-8 decoded to something that requires a surrogate pair
311	// compressing from %XX%XX%XX%XX (12 chars) to two
312	ensureDetached(result, output, begin, input, end, add: -`10`);
313	*output++ = QChar::highSurrogate(ucs4);
314	*output++ = QChar::lowSurrogate(ucs4);
315	}
316
317	input = src - `1`;
318	return true;
319	}
320
321	static void unicodeToEncodedUtf8(QString &result, ushort &output, const* ushort *begin,
322	const ushort &input, const* ushort *end, ushort decoded)
323	{
324	// calculate the utf8 length and ensure enough space is available
325	int utf8len = QChar::isHighSurrogate(ucs4: decoded) ? `4` : decoded >= `0x800` ? `3` : `2`;
326
327	// detach
328	if (!output) {
329	// we need 3 utf8len for the encoded UTF-8 sequence*
330	// but ensureDetached already adds 3 for the char we're processing
331	ensureDetached(result, output, begin, input, end, add: `3`*utf8len - `3`);
332	} else {
333	// verify that there's enough space or expand
334	int charsRemaining = end - input - `1`; // not including this one
335	int pos = output - reinterpret_cast<const ushort *>(result.constData());
336	int spaceRemaining = result.size() - pos;
337	if (spaceRemaining < `3`charsRemaining + `3`utf8len) {
338	// must resize
339	result.resize(size: result.size() + `3`*utf8len);
340
341	// we know that resize() above detached, so we bypass the reference count check
342	output = const_cast<ushort >(reinterpret_cast<const* ushort *>(result.constData()));
343	output += pos;
344	}
345	}
346
347	++input;
348	int res = QUtf8Functions::toUtf8<QUrlUtf8Traits>(u: decoded, dst&: output, src&: input, end);
349	--input;
350	if (res < `0`) {
351	// bad surrogate pair sequence
352	// we will encode bad UTF-16 to UTF-8
353	// but they don't get decoded back
354
355	// first of three bytes
356	uchar c = `0xe0` \| uchar(decoded >> `12`);
357	*output++ = `'%'`;
358	*output++ = `'E'`;
359	*output++ = encodeNibble(c: c & `0xf`);
360
361	// second byte
362	c = `0x80` \| (uchar(decoded >> `6`) & `0x3f`);
363	*output++ = `'%'`;
364	*output++ = encodeNibble(c: c >> `4`);
365	*output++ = encodeNibble(c: c & `0xf`);
366
367	// third byte
368	c = `0x80` \| (decoded & `0x3f`);
369	*output++ = `'%'`;
370	*output++ = encodeNibble(c: c >> `4`);
371	*output++ = encodeNibble(c: c & `0xf`);
372	}
373	}
374
375	static int recode(QString &result, const ushort begin, const* ushort *end, QUrl::ComponentFormattingOptions encoding,
376	const uchar actionTable, bool* retryBadEncoding)
377	{
378	const int origSize = result.size();
379	const ushort *input = begin;
380	ushort output = nullptr*;
381
382	EncodingAction action = EncodeCharacter;
383	for ( ; input != end; ++input) {
384	ushort c;
385	// try a run where no change is necessary
386	for ( ; input != end; ++input) {
387	c = *input;
388	if (c < `0x20U`)
389	action = EncodeCharacter;
390	if (c < `0x20U` \|\| c >= `0x80U`) // also: (c - 0x20 < 0x60U)
391	goto non_trivial;
392	action = EncodingAction(actionTable[c - `' '`]);
393	if (action == EncodeCharacter)
394	goto non_trivial;
395	if (output)
396	*output++ = c;
397	}
398	break;
399
400	non_trivial:
401	uint decoded;
402	if (c == `'%'` && retryBadEncoding) {
403	// always write "%25"
404	ensureDetached(result, output, begin, input, end);
405	*output++ = `'%'`;
406	*output++ = `'2'`;
407	*output++ = `'5'`;
408	continue;
409	} else if (c == `'%'`) {
410	// check if the input is valid
411	if (input + `2` >= end \|\| (decoded = decodePercentEncoding(input)) == ushort(-`1`)) {
412	// not valid, retry
413	result.resize(size: origSize);
414	return recode(result, begin, end, encoding, actionTable, retryBadEncoding: true);
415	}
416
417	if (decoded >= `0x80`) {
418	// decode the UTF-8 sequence
419	if (!(encoding & QUrl::EncodeUnicode) &&
420	encodedUtf8ToUtf16(result, output, begin, input, end, decoded))
421	continue;
422
423	// decoding the encoded UTF-8 failed
424	action = LeaveCharacter;
425	} else if (decoded >= `0x20`) {
426	action = EncodingAction(actionTable[decoded - `' '`]);
427	}
428	} else {
429	decoded = c;
430	if (decoded >= `0x80` && encoding & QUrl::EncodeUnicode) {
431	// encode the UTF-8 sequence
432	unicodeToEncodedUtf8(result, output, begin, input, end, decoded);
433	continue;
434	} else if (decoded >= `0x80`) {
435	if (output)
436	*output++ = c;
437	continue;
438	}
439	}
440
441	// there are six possibilities:
442	// current \ action \| DecodeCharacter \| LeaveCharacter \| EncodeCharacter
443	// decoded \| 1:leave \| 2:leave \| 3:encode
444	// encoded \| 4:decode \| 5:leave \| 6:leave
445	// cases 1 and 2 were handled before this section
446
447	if (c == `'%'` && action != DecodeCharacter) {
448	// cases 5 and 6: it's encoded and we're leaving it as it is
449	// except we're pedantic and we'll uppercase the hex
450	if (output \|\| !isUpperHex(c: input[`1`]) \|\| !isUpperHex(c: input[`2`])) {
451	ensureDetached(result, output, begin, input, end);
452	*output++ = `'%'`;
453	output++ = toUpperHex(c: ++input);
454	output++ = toUpperHex(c: ++input);
455	}
456	} else if (c == `'%'` && action == DecodeCharacter) {
457	// case 4: we need to decode
458	ensureDetached(result, output, begin, input, end);
459	*output++ = decoded;
460	input += `2`;
461	} else {
462	// must be case 3: we need to encode
463	ensureDetached(result, output, begin, input, end);
464	*output++ = `'%'`;
465	*output++ = encodeNibble(c: c >> `4`);
466	*output++ = encodeNibble(c: c & `0xf`);
467	}
468	}
469
470	if (output) {
471	int len = output - reinterpret_cast<const ushort *>(result.constData());
472	result.truncate(pos: len);
473	return len - origSize;
474	}
475	return `0`;
476	}
477
478	/*
479	* Returns true if the input it checked (if it checked anything) is not
480	* encoded. A return of false indicates there's a percent at \a input that
481	* needs to be decoded.
482	*/
483	#ifdef __SSE2__
484	static bool simdCheckNonEncoded(ushort &output, const* ushort &input, const* ushort *end)
485	{
486	# ifdef __AVX2__
487	const __m256i percents256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(`'%'`));
488	const __m128i percents = _mm256_castsi256_si128(percents256);
489	# else
490	const __m128i percents = _mm_set1_epi16(w: `'%'`);
491	# endif
492
493	uint idx = `0`;
494	quint32 mask = `0`;
495	if (input + `16` <= end) {
496	qptrdiff offset = `0`;
497	for ( ; input + offset + `16` <= end; offset += `16`) {
498	# ifdef __AVX2__
499	// do 32 bytes at a time using AVX2
500	__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(input + offset));
501	__m256i comparison = _mm256_cmpeq_epi16(data, percents256);
502	mask = _mm256_movemask_epi8(comparison);
503	_mm256_storeu_si256(reinterpret_cast<__m256i *>(output + offset), data);
504	# else
505	// do 32 bytes at a time using unrolled SSE2
506	__m128i data1 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset));
507	__m128i data2 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset + `8`));
508	__m128i comparison1 = _mm_cmpeq_epi16(a: data1, b: percents);
509	__m128i comparison2 = _mm_cmpeq_epi16(a: data2, b: percents);
510	uint mask1 = _mm_movemask_epi8(a: comparison1);
511	uint mask2 = _mm_movemask_epi8(a: comparison2);
512
513	_mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset), b: data1);
514	if (!mask1)
515	_mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset + `8`), b: data2);
516	mask = mask1 \| (mask2 << `16`);
517	# endif
518
519	if (mask) {
520	idx = qCountTrailingZeroBits(v: mask) / `2`;
521	break;
522	}
523	}
524
525	input += offset;
526	if (output)
527	output += offset;
528	} else if (input + `8` <= end) {
529	// do 16 bytes at a time
530	__m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input));
531	__m128i comparison = _mm_cmpeq_epi16(a: data, b: percents);
532	mask = _mm_movemask_epi8(a: comparison);
533	_mm_storeu_si128(p: reinterpret_cast<__m128i *>(output), b: data);
534	idx = qCountTrailingZeroBits(v: quint16(mask)) / `2`;
535	} else if (input + `4` <= end) {
536	// do 8 bytes only
537	__m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(input));
538	__m128i comparison = _mm_cmpeq_epi16(a: data, b: percents);
539	mask = _mm_movemask_epi8(a: comparison) & `0xffu`;
540	_mm_storel_epi64(p: reinterpret_cast<__m128i *>(output), a: data);
541	idx = qCountTrailingZeroBits(v: quint8(mask)) / `2`;
542	} else {
543	// no percents found (because we didn't check)
544	return true;
545	}
546
547	// advance to the next non-encoded
548	input += idx;
549	output += idx;
550
551	return !mask;
552	}
553	#else
554	static bool simdCheckNonEncoded(...)
555	{
556	return true;
557	}
558	#endif
559
560	/!*
561	\since 5.0
562	\internal
563
564	This function decodes a percent-encoded string located from \a begin to \a
565	end, by appending each character to \a appendTo. It returns the number of
566	characters appended. Each percent-encoded sequence is decoded as follows:
567
568	\list
569	\li from %00 to %7F: the exact decoded value is appended;
570	\li from %80 to %FF: QChar::ReplacementCharacter is appended;
571	\li bad encoding: original input is copied to the output, undecoded.
572	\endlist
573
574	Given the above, it's important for the input to already have all UTF-8
575	percent sequences decoded by qt_urlRecode (that is, the input should not
576	have been processed with QUrl::EncodeUnicode).
577
578	The input should also be a valid percent-encoded sequence (the output of
579	qt_urlRecode is always valid).
580	*/
581	static int decode(QString &appendTo, const ushort begin, const* ushort *end)
582	{
583	// fast check whether there's anything to be decoded in the first place
584	const ushort *input = QtPrivate::qustrchr(str: QStringView (begin, end), ch: `'%'`);
585	if (Q_LIKELY(input == end))
586	return `0`; // nothing to do, it was already decoded!
587
588	// detach
589	const int origSize = appendTo.size();
590	appendTo.resize(size: origSize + (end - begin));
591	ushort output = reinterpret_cast<ushort >(appendTo.begin()) + origSize;
592	memcpy(dest: static_cast<void >(output), src: static_cast<const* void >(begin), n: (input - begin) sizeof(ushort));
593	output += input - begin;
594
595	while (input != end) {
596	// something was encoded
597	Q_ASSERT(*input == `'%'`);
598
599	if (Q_UNLIKELY(end - input < `3` \|\| !isHex(input[`1`]) \|\| !isHex(input[`2`]))) {
600	// badly-encoded data
601	appendTo.resize(size: origSize + (end - begin));
602	memcpy(dest: static_cast<void >(appendTo.begin() + origSize), src: static_cast<const* void >(begin), n: (end - begin) sizeof(ushort));
603	return end - begin;
604	}
605
606	++input;
607	*output++ = decodeNibble(c: input[`0`]) << `4` \| decodeNibble(c: input[`1`]);
608	if (output[-`1`] >= `0x80`)
609	output[-`1`] = QChar::ReplacementCharacter;
610	input += `2`;
611
612	// search for the next percent, copying from input to output
613	if (simdCheckNonEncoded(output, input, end)) {
614	while (input != end) {
615	ushort uc = *input;
616	if (uc == `'%'`)
617	break;
618	*output++ = uc;
619	++input;
620	}
621	}
622	}
623
624	int len = output - reinterpret_cast<ushort *>(appendTo.begin());
625	appendTo.truncate(pos: len);
626	return len - origSize;
627	}
628
629	template <size_t N>
630	static void maskTable(uchar (&table)[N], const uchar (&mask)[N])
631	{
632	for (size_t i = `0`; i < N; ++i)
633	table[i] &= mask[i];
634	}
635
636	/!*
637	\internal
638
639	Recodes the string from \a begin to \a end. If any transformations are
640	done, append them to \a appendTo and return the number of characters added.
641	If no transformations were required, return 0.
642
643	The \a encoding option modifies the default behaviour:
644	\list
645	\li QUrl::DecodeReserved: if set, reserved characters will be decoded;
646	if unset, reserved characters will be encoded
647	\li QUrl::EncodeSpaces: if set, spaces will be encoded to "%20"; if unset, they will be " "
648	\li QUrl::EncodeUnicode: if set, characters above U+0080 will be encoded to their UTF-8
649	percent-encoded form; if unset, they will be decoded to UTF-16
650	\li QUrl::FullyDecoded: if set, this function will decode all percent-encoded sequences,
651	including that of the percent character. The resulting string
652	will not be percent-encoded anymore. Use with caution!
653	In this mode, the behaviour is undefined if the input string
654	contains any percent-encoding sequences above %80.
655	Also, the function will not correct bad % sequences.
656	\endlist
657
658	Other flags are ignored (including QUrl::EncodeReserved).
659
660	The \a tableModifications argument can be used to supply extra
661	modifications to the tables, to be applied after the flags above are
662	handled. It consists of a sequence of 16-bit values, where the low 8 bits
663	indicate the character in question and the high 8 bits are either \c
664	EncodeCharacter, \c LeaveCharacter or \c DecodeCharacter.
665
666	This function corrects percent-encoded errors by interpreting every '%' as
667	meaning "%25" (all percents in the same content).
668	*/
669
670	Q_AUTOTEST_EXPORT int
671	qt_urlRecode(QString &appendTo, const QChar begin, const* QChar *end,
672	QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications)
673	{
674	uchar actionTable[sizeof defaultActionTable];
675	if ((encoding & QUrl::FullyDecoded) == QUrl::FullyDecoded) {
676	return decode(appendTo, begin: reinterpret_cast<const ushort >(begin), end: reinterpret_cast<const* ushort *>(end));
677	}
678
679	memcpy(dest: actionTable, src: defaultActionTable, n: sizeof actionTable);
680	if (encoding & QUrl::DecodeReserved)
681	maskTable(table&: actionTable, mask: reservedMask);
682	if (!(encoding & QUrl::EncodeSpaces))
683	actionTable[`0`] = DecodeCharacter; // decode
684
685	if (tableModifications) {
686	for (const ushort p = tableModifications; p; ++p)
687	actionTable[uchar(p) - `' '`] = p >> `8`;
688	}
689
690	return recode(result&: appendTo, begin: reinterpret_cast<const ushort >(begin), end: reinterpret_cast<const* ushort *>(end),
691	encoding, actionTable, retryBadEncoding: false);
692	}
693
694	// qstring.cpp
695	bool qt_is_ascii(const char &ptr, const* char end) noexcept*;
696
697	/!*
698	\internal
699	\since 5.0
700
701	\a ba contains an 8-bit form of the component and it might be
702	percent-encoded already. We can't use QString::fromUtf8 because it might
703	contain non-UTF8 sequences. We can't use QByteArray::toPercentEncoding
704	because it might already contain percent-encoded sequences. We can't use
705	qt_urlRecode because it needs UTF-16 input.
706	*/
707	Q_AUTOTEST_EXPORT
708	QString qt_urlRecodeByteArray(const QByteArray &ba)
709	{
710	if (ba.isNull())
711	return QString ();
712
713	// scan ba for anything above or equal to 0x80
714	// control points below 0x20 are fine in QString
715	const char *in = ba.constData();
716	const char *const end = ba.constEnd();
717	if (qt_is_ascii(ptr&: in, end)) {
718	// no non-ASCII found, we're safe to convert to QString
719	return QString::fromLatin1(str: ba, size: ba.size());
720	}
721
722	// we found something that we need to encode
723	QByteArray intermediate = ba;
724	intermediate.resize(size: ba.size() * `3` - (in - ba.constData()));
725	uchar out = reinterpret_cast<uchar >(intermediate.data() + (in - ba.constData()));
726	for ( ; in < end; ++in) {
727	if (*in & `0x80`) {
728	// encode
729	*out++ = `'%'`;
730	out++ = encodeNibble(c: uchar(in) >> `4`);
731	out++ = encodeNibble(c: uchar(in) & `0xf`);
732	} else {
733	// keep
734	out++ = uchar(in);
735	}
736	}
737
738	// now it's safe to call fromLatin1
739	return QString::fromLatin1(str: intermediate, size: out - reinterpret_cast<uchar *>(intermediate.data()));
740	}
741
742	QT_END_NAMESPACE
743

Provided by KDAB

Definitions

source code of qtbase/src/corelib/io/qurlrecode.cpp