qurlidna.cpp source code [qtbase/src/corelib/io/qurlidna.cpp]

1	// Copyright (C) 2016 The Qt Company Ltd.
2	// Copyright (C) 2016 Intel Corporation.
3	// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5	#include "qurl_p.h"
6
7	#include <QtCore/qstringlist.h>
8	#include <QtCore/private/qnumeric_p.h>
9	#include <QtCore/private/qoffsetstringarray_p.h>
10	#include <QtCore/private/qstringiterator_p.h>
11	#include <QtCore/private/qunicodetables_p.h>
12
13	#include <algorithm>
14
15	QT_BEGIN_NAMESPACE
16
17	using namespace Qt::StringLiterals;
18
19	// needed by the punycode encoder/decoder
20	static const uint base = `36`;
21	static const uint tmin = `1`;
22	static const uint tmax = `26`;
23	static const uint skew = `38`;
24	static const uint damp = `700`;
25	static const uint initial_bias = `72`;
26	static const uint initial_n = `128`;
27
28	static constexpr qsizetype MaxDomainLabelLength = `63`;
29
/*
    Maps a Punycode digit value to the code point of its ASCII digit:
    0..25 become 'a'..'z' (97..122) and 26..35 become '0'..'9' (48..57).

    The comparison yields 0 or 1, so the extra offset of 75 is only added for
    the letter range. Values above 35 are not valid digits.
*/
static inline uint encodeDigit(uint digit)
{
    return digit + 22 + 75 * (digit < 26);
}
34
35	static inline uint adapt(uint delta, uint numpoints, bool firsttime)
36	{
37	delta /= (firsttime ? damp : `2`);
38	delta += (delta / numpoints);
39
40	uint k = `0`;
41	for (; delta > ((base - tmin) * tmax) / `2`; k += base)
42	delta /= (base - tmin);
43
44	return k + (((base - tmin + `1`) * delta) / (delta + skew));
45	}
46
47	static inline void appendEncode(QString *output, uint delta, uint bias)
48	{
49	uint qq;
50	uint k;
51	uint t;
52
53	// insert the variable length delta integer.
54	for (qq = delta, k = base;; k += base) {
55	// stop generating digits when the threshold is
56	// detected.
57	t = (k <= bias) ? tmin : (k >= bias + tmax) ? tmax : k - bias;
58	if (qq < t) break;
59
60	*output += QChar (encodeDigit(digit: t + (qq - t) % (base - t)));
61	qq = (qq - t) / (base - t);
62	}
63
64	*output += QChar (encodeDigit(digit: qq));
65	}
66
67	Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output)
68	{
69	uint n = initial_n;
70	uint delta = `0`;
71	uint bias = initial_bias;
72
73	// Do not try to encode strings that certainly will result in output
74	// that is longer than allowable domain name label length. Note that
75	// non-BMP codepoints are encoded as two QChars.
76	if (in.size() > MaxDomainLabelLength * `2`)
77	return;
78
79	int outLen = output->size();
80	output->resize(size: outLen + in.size());
81
82	QChar *d = output->data() + outLen;
83	bool skipped = false;
84	// copy all basic code points verbatim to output.
85	for (QChar c : in) {
86	if (c.unicode() < `0x80`)
87	*d++ = c;
88	else
89	skipped = true;
90	}
91
92	// if there were only basic code points, just return them
93	// directly; don't do any encoding.
94	if (!skipped)
95	return;
96
97	output->truncate(pos: d - output->constData());
98	int copied = output->size() - outLen;
99
100	// h and b now contain the number of basic code points in input.
101	uint b = copied;
102	uint h = copied;
103
104	// if basic code points were copied, add the delimiter character.
105	if (h > `0`)
106	*output += u`'-'`;
107
108	// compute the input length in Unicode code points.
109	uint inputLength = `0`;
110	for (QStringIterator iter(in); iter.hasNext();) {
111	inputLength++;
112
113	if (iter.next(invalidAs: char32_t(-`1`)) == char32_t(-`1`)) {
114	output->truncate(pos: outLen);
115	return; // invalid surrogate pair
116	}
117	}
118
119	// while there are still unprocessed non-basic code points left in
120	// the input string...
121	while (h < inputLength) {
122	// find the character in the input string with the lowest unprocessed value.
123	uint m = std::numeric_limits<uint>::max();
124	for (QStringIterator iter(in); iter.hasNext();) {
125	auto c = iter.nextUnchecked();
126	static_assert(std::numeric_limits<decltype(m)>::max()
127	>= std::numeric_limits<decltype(c)>::max(),
128	"Punycode uint should be able to cover all codepoints");
129	if (c >= n && c < m)
130	m = c;
131	}
132
133	// delta = delta + (m - n) (h + 1), fail on overflow*
134	uint tmp;
135	if (qMulOverflow<uint>(v1: m - n, v2: h + `1`, r: &tmp) \|\| qAddOverflow<uint>(v1: delta, v2: tmp, r: &delta)) {
136	output->truncate(pos: outLen);
137	return; // punycode_overflow
138	}
139	n = m;
140
141	for (QStringIterator iter(in); iter.hasNext();) {
142	auto c = iter.nextUnchecked();
143
144	// increase delta until we reach the character processed in this iteration;
145	// fail if delta overflows.
146	if (c < n) {
147	if (qAddOverflow<uint>(v1: delta, v2: `1`, r: &delta)) {
148	output->truncate(pos: outLen);
149	return; // punycode_overflow
150	}
151	}
152
153	if (c == n) {
154	appendEncode(output, delta, bias);
155
156	bias = adapt(delta, numpoints: h + `1`, firsttime: h == b);
157	delta = `0`;
158	++h;
159	}
160	}
161
162	++delta;
163	++n;
164	}
165
166	// prepend ACE prefix
167	output->insert(i: outLen, s: "xn--"_L1);
168	return;
169	}
170
171	Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc)
172	{
173	uint n = initial_n;
174	uint i = `0`;
175	uint bias = initial_bias;
176
177	// Do not try to decode strings longer than allowable for a domain label.
178	// Non-ASCII strings are not allowed here anyway, so there is no need
179	// to account for surrogates.
180	if (pc.size() > MaxDomainLabelLength)
181	return QString ();
182
183	// strip any ACE prefix
184	int start = pc.startsWith(s: "xn--"_L1) ? `4` : `0`;
185	if (!start)
186	return pc;
187
188	// find the last delimiter character '-' in the input array. copy
189	// all data before this delimiter directly to the output array.
190	int delimiterPos = pc.lastIndexOf(c: u`'-'`);
191	auto output = delimiterPos < `4` ? std::u32string ()
192	: pc.mid(position: start, n: delimiterPos - start).toStdU32String();
193
194	// if a delimiter was found, skip to the position after it;
195	// otherwise start at the front of the input string. everything
196	// before the delimiter is assumed to be basic code points.
197	uint cnt = delimiterPos + `1`;
198
199	// loop through the rest of the input string, inserting non-basic
200	// characters into output as we go.
201	while (cnt < (uint) pc.size()) {
202	uint oldi = i;
203	uint w = `1`;
204
205	// find the next index for inserting a non-basic character.
206	for (uint k = base; cnt < (uint) pc.size(); k += base) {
207	// grab a character from the punycode input and find its
208	// delta digit (each digit code is part of the
209	// variable-length integer delta)
210	uint digit = pc.at(i: cnt++).unicode();
211	if (digit - `48` < `10`) digit -= `22`;
212	else if (digit - `65` < `26`) digit -= `65`;
213	else if (digit - `97` < `26`) digit -= `97`;
214	else digit = base;
215
216	// Fail if the code point has no digit value
217	if (digit >= base)
218	return QString ();
219
220	// i = i + digit w, fail on overflow*
221	uint tmp;
222	if (qMulOverflow<uint>(v1: digit, v2: w, r: &tmp) \|\| qAddOverflow<uint>(v1: i, v2: tmp, r: &i))
223	return QString ();
224
225	// detect threshold to stop reading delta digits
226	uint t;
227	if (k <= bias) t = tmin;
228	else if (k >= bias + tmax) t = tmax;
229	else t = k - bias;
230
231	if (digit < t) break;
232
233	// w = w (base - t), fail on overflow*
234	if (qMulOverflow<uint>(v1: w, v2: base - t, r: &w))
235	return QString ();
236	}
237
238	// find new bias and calculate the next non-basic code
239	// character.
240	uint outputLength = static_cast<uint>(output.length());
241	bias = adapt(delta: i - oldi, numpoints: outputLength + `1`, firsttime: oldi == `0`);
242
243	// n = n + i div (length(output) + 1), fail on overflow
244	if (qAddOverflow<uint>(v1: n, v2: i / (outputLength + `1`), r: &n))
245	return QString ();
246
247	// allow the deltas to wrap around
248	i %= (outputLength + `1`);
249
250	// if n is a basic code point then fail; this should not happen with
251	// correct implementation of Punycode, but check just n case.
252	if (n < initial_n) {
253	// Don't use Q_ASSERT() to avoid possibility of DoS
254	qWarning(msg: "Attempt to insert a basic codepoint. Unhandled overflow?");
255	return QString ();
256	}
257
258	// Surrogates should normally be rejected later by other IDNA code.
259	// But because of Qt's use of UTF-16 to represent strings the
260	// IDNA code is not able to distinguish characters represented as pairs
261	// of surrogates from normal code points. This is why surrogates are
262	// not allowed here.
263	//
264	// Allowing surrogates would lead to non-unique (after normalization)
265	// encoding of strings with non-BMP characters.
266	//
267	// Punycode that encodes characters outside the Unicode range is also
268	// invalid and is rejected here.
269	if (QChar::isSurrogate(ucs4: n) \|\| n > QChar::LastValidCodePoint)
270	return QString ();
271
272	// insert the character n at position i
273	output.insert(pos: i, n: `1`, c: static_cast<char32_t>(n));
274	++i;
275	}
276
277	return QString::fromStdU32String(s: output);
278	}
279
280	static constexpr auto idn_whitelist = qOffsetStringArray(
281	strings: "ac", strings: "ar", strings: "asia", strings: "at",
282	strings: "biz", strings: "br",
283	strings: "cat", strings: "ch", strings: "cl", strings: "cn", strings: "com",
284	strings: "de", strings: "dk",
285	strings: "es",
286	strings: "fi",
287	strings: "gr",
288	strings: "hu",
289	strings: "il", strings: "info", strings: "io", strings: "ir", strings: "is",
290	strings: "jp",
291	strings: "kr",
292	strings: "li", strings: "lt", strings: "lu", strings: "lv",
293	strings: "museum",
294	strings: "name", strings: "net", strings: "no", strings: "nu", strings: "nz",
295	strings: "org",
296	strings: "pl", strings: "pr",
297	strings: "se", strings: "sh",
298	strings: "tel", strings: "th", strings: "tm", strings: "tw",
299	strings: "ua",
300	strings: "vn",
301	strings: "xn--fiqs8s", // China
302	strings: "xn--fiqz9s", // China
303	strings: "xn--fzc2c9e2c", // Sri Lanka
304	strings: "xn--j6w193g", // Hong Kong
305	strings: "xn--kprw13d", // Taiwan
306	strings: "xn--kpry57d", // Taiwan
307	strings: "xn--mgba3a4f16a", // Iran
308	strings: "xn--mgba3a4fra", // Iran
309	strings: "xn--mgbaam7a8h", // UAE
310	strings: "xn--mgbayh7gpa", // Jordan
311	strings: "xn--mgberp4a5d4ar", // Saudi Arabia
312	strings: "xn--ogbpf8fl", // Syria
313	strings: "xn--p1ai", // Russian Federation
314	strings: "xn--wgbh1c", // Egypt
315	strings: "xn--wgbl6a", // Qatar
316	strings: "xn--xkc2al3hye2a" // Sri Lanka
317	);
318
319	Q_CONSTINIT static QStringList user_idn_whitelist = nullptr*;
320
321	static bool lessThan(const QChar a, int* l, const char *c)
322	{
323	const auto uc = reinterpret_cast<const* char16_t *>(a);
324	const char16_t *e = uc + l;
325
326	if (!c \|\| *c == `0`)
327	return false;
328
329	while (*c) {
330	if (uc == e \|\| uc != static_cast<unsigned* char>(*c))
331	break;
332	++uc;
333	++c;
334	}
335	return uc == e ? c : (uc < static_cast<unsigned char>(*c));
336	}
337
338	static bool equal(const QChar a, int* l, const char *b)
339	{
340	while (l && a->unicode() && *b) {
341	if (a != QLatin1Char (b))
342	return false;
343	++a;
344	++b;
345	--l;
346	}
347	return l == `0`;
348	}
349
350	static bool qt_is_idn_enabled(QStringView aceDomain)
351	{
352	auto idx = aceDomain.lastIndexOf(c: u`'.'`);
353	if (idx == -`1`)
354	return false;
355
356	auto tldString = aceDomain.mid(pos: idx + `1`);
357	const auto len = tldString.size();
358
359	const QChar *tld = tldString.constData();
360
361	if (user_idn_whitelist)
362	return user_idn_whitelist->contains(str: tldString);
363
364	int l = `0`;
365	int r = idn_whitelist.count() - `1`;
366	int i = (l + r + `1`) / `2`;
367
368	while (r != l) {
369	if (lessThan(a: tld, l: len, c: idn_whitelist.at(index: i)))
370	r = i - `1`;
371	else
372	l = i;
373	i = (l + r + `1`) / `2`;
374	}
375	return equal(a: tld, l: len, b: idn_whitelist.at(index: i));
376	}
377
/*
    Returns true if \a c may appear in an already normalized ASCII label:
    a hyphen, an underscore, a digit or a lower-case ASCII letter.
*/
template<typename C>
static inline bool isValidInNormalizedAsciiLabel(C c)
{
    return c == u'-' || c == u'_' || (c >= u'0' && c <= u'9') || (c >= u'a' && c <= u'z');
}
383
/*
    Returns true if \a c may appear in an already normalized ASCII domain
    name: any character valid in a label, or the label separator '.'.
*/
template<typename C>
static inline bool isValidInNormalizedAsciiName(C c)
{
    return isValidInNormalizedAsciiLabel(c) || c == u'.';
}
389
390	/*
391	Map domain name according to algorithm in UTS #46, 4.1
392
393	Returns empty string if there are disallowed characters in the input.
394
395	Sets resultIsAscii if the result is known for sure to be all ASCII.
396	*/
397	static QString mapDomainName(const QString &in, QUrl::AceProcessingOptions options,
398	bool *resultIsAscii)
399	{
400	resultIsAscii = true*;
401
402	// Check if the input is already normalized ASCII first and can be returned as is.
403	int i = `0`;
404	for (auto c : in) {
405	if (c.unicode() >= `0x80` \|\| !isValidInNormalizedAsciiName(c))
406	break;
407	i++;
408	}
409
410	if (i == in.size())
411	return in;
412
413	QString result;
414	result.reserve(asize: in.size());
415	result.append(uc: in.constData(), len: i);
416	bool allAscii = true;
417
418	for (QStringIterator iter(QStringView (in).sliced(pos: i)); iter.hasNext();) {
419	char32_t uc = iter.next();
420
421	// Fast path for ASCII-only inputs
422	if (Q_LIKELY(uc < `0x80`)) {
423	if (uc >= U`'A'` && uc <= U`'Z'`)
424	uc \|= `0x20`; // lower-case it
425
426	if (isValidInNormalizedAsciiName(c: uc)) {
427	result.append(c: static_cast<char16_t>(uc));
428	continue;
429	}
430	}
431
432	allAscii = false;
433
434	// Capital sharp S is a special case since UTR #46 revision 31 (Unicode 15.1)
435	if (uc == `0x1E9E` && options.testFlag(flag: QUrl::AceTransitionalProcessing)) {
436	result.append(s: u"ss"_s);
437	continue;
438	}
439
440	QUnicodeTables::IdnaStatus status = QUnicodeTables::idnaStatus(ucs4: uc);
441
442	if (status == QUnicodeTables::IdnaStatus::Deviation)
443	status = options.testFlag(flag: QUrl::AceTransitionalProcessing)
444	? QUnicodeTables::IdnaStatus::Mapped
445	: QUnicodeTables::IdnaStatus::Valid;
446
447	switch (status) {
448	case QUnicodeTables::IdnaStatus::Ignored:
449	continue;
450	case QUnicodeTables::IdnaStatus::Valid:
451	case QUnicodeTables::IdnaStatus::Disallowed:
452	for (auto c : QChar::fromUcs4(c: uc))
453	result.append(c);
454	break;
455	case QUnicodeTables::IdnaStatus::Mapped:
456	result.append(v: QUnicodeTables::idnaMapping(usc4: uc));
457	break;
458	default:
459	Q_UNREACHABLE();
460	}
461	}
462
463	*resultIsAscii = allAscii;
464	return result;
465	}
466
467	/*
468	Check the rules for an ASCII label.
469
470	Check the size restriction and that the label does not start or end with dashes.
471
472	The label should be nonempty.
473	*/
474	static bool validateAsciiLabel(QStringView label)
475	{
476	if (label.size() > MaxDomainLabelLength)
477	return false;
478
479	if (label.first() == u`'-'` \|\| label.last() == u`'-'`)
480	return false;
481
482	return std::all_of(first: label.begin(), last: label.end(), pred: isValidInNormalizedAsciiLabel<QChar>);
483	}
484
485	namespace {
486
487	class DomainValidityChecker
488	{
489	bool domainNameIsBidi = false;
490	bool hadBidiErrors = false;
491	bool ignoreBidiErrors;
492
493	static constexpr char32_t ZWNJ = U`'\u200C'`;
494	static constexpr char32_t ZWJ = U`'\u200D'`;
495
496	public:
497	DomainValidityChecker(bool ignoreBidiErrors = false) : ignoreBidiErrors(ignoreBidiErrors) { }
498	bool checkLabel(const QString &label, QUrl::AceProcessingOptions options);
499
500	private:
501	static bool checkContextJRules(QStringView label);
502	static bool checkBidiRules(QStringView label);
503	};
504
505	} // anonymous namespace
506
507	/*
508	Check CONTEXTJ rules according to RFC 5892, appendix A.1 & A.2.
509
510	Rule Set for U+200C (ZWNJ):
511
512	False;
513
514	If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
515
516	If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)\u200C*
517
518	(Joining_Type:T)(Joining_Type:{R,D})) Then True;*
519
520	Rule Set for U+200D (ZWJ):
521
522	False;
523
524	If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
525
526	*/
527	bool DomainValidityChecker::checkContextJRules(QStringView label)
528	{
529	constexpr unsigned char CombiningClassVirama = `9`;
530
531	enum class State {
532	Initial,
533	LD_T, // L,D with possible following T*
534	ZWNJ_T, // ZWNJ with possible following T*
535	};
536	State regexpState = State::Initial;
537	bool previousIsVirama = false;
538
539	for (QStringIterator iter(label); iter.hasNext();) {
540	auto ch = iter.next();
541
542	if (ch == ZWJ) {
543	if (!previousIsVirama)
544	return false;
545	regexpState = State::Initial;
546	} else if (ch == ZWNJ) {
547	if (!previousIsVirama && regexpState != State::LD_T)
548	return false;
549	regexpState = previousIsVirama ? State::Initial : State::ZWNJ_T;
550	} else {
551	switch (QChar::joiningType(ucs4: ch)) {
552	case QChar::Joining_Left:
553	if (regexpState == State::ZWNJ_T)
554	return false;
555	regexpState = State::LD_T;
556	break;
557	case QChar::Joining_Right:
558	regexpState = State::Initial;
559	break;
560	case QChar::Joining_Dual:
561	regexpState = State::LD_T;
562	break;
563	case QChar::Joining_Transparent:
564	break;
565	default:
566	regexpState = State::Initial;
567	break;
568	}
569	}
570
571	previousIsVirama = QChar::combiningClass(ucs4: ch) == CombiningClassVirama;
572	}
573
574	return regexpState != State::ZWNJ_T;
575	}
576
577	/*
578	Check if the label conforms to BiDi rule of RFC 5893.
579
580	1. The first character must be a character with Bidi property L, R,
581	or AL. If it has the R or AL property, it is an RTL label; if it
582	has the L property, it is an LTR label.
583
584	2. In an RTL label, only characters with the Bidi properties R, AL,
585	AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
586
587	3. In an RTL label, the end of the label must be a character with
588	Bidi property R, AL, EN, or AN, followed by zero or more
589	characters with Bidi property NSM.
590
591	4. In an RTL label, if an EN is present, no AN may be present, and
592	vice versa.
593
594	5. In an LTR label, only characters with the Bidi properties L, EN,
595	ES, CS, ET, ON, BN, or NSM are allowed.
596
597	6. In an LTR label, the end of the label must be a character with
598	Bidi property L or EN, followed by zero or more characters with
599	Bidi property NSM.
600	*/
601	bool DomainValidityChecker::checkBidiRules(QStringView label)
602	{
603	if (label.isEmpty())
604	return true;
605
606	QStringIterator iter(label);
607	Q_ASSERT(iter.hasNext());
608
609	char32_t ch = iter.next();
610	bool labelIsRTL = false;
611
612	switch (QChar::direction(ucs4: ch)) {
613	case QChar::DirL:
614	break;
615	case QChar::DirR:
616	case QChar::DirAL:
617	labelIsRTL = true;
618	break;
619	default:
620	return false;
621	}
622
623	bool tailOk = true;
624	bool labelHasEN = false;
625	bool labelHasAN = false;
626
627	while (iter.hasNext()) {
628	ch = iter.next();
629
630	switch (QChar::direction(ucs4: ch)) {
631	case QChar::DirR:
632	case QChar::DirAL:
633	if (!labelIsRTL)
634	return false;
635	tailOk = true;
636	break;
637
638	case QChar::DirL:
639	if (labelIsRTL)
640	return false;
641	tailOk = true;
642	break;
643
644	case QChar::DirES:
645	case QChar::DirCS:
646	case QChar::DirET:
647	case QChar::DirON:
648	case QChar::DirBN:
649	tailOk = false;
650	break;
651
652	case QChar::DirNSM:
653	break;
654
655	case QChar::DirAN:
656	if (labelIsRTL) {
657	if (labelHasEN)
658	return false;
659	labelHasAN = true;
660	tailOk = true;
661	} else {
662	return false;
663	}
664	break;
665
666	case QChar::DirEN:
667	if (labelIsRTL) {
668	if (labelHasAN)
669	return false;
670	labelHasEN = true;
671	}
672	tailOk = true;
673	break;
674
675	default:
676	return false;
677	}
678	}
679
680	return tailOk;
681	}
682
683	/*
684	Check if the given label is valid according to UTS #46 validity criteria.
685
686	NFC check can be skipped if the label was transformed to NFC before calling
687	this function (as optimization).
688
689	The domain name is considered invalid if this function returns false at least
690	once.
691
692	1. The label must be in Unicode Normalization Form NFC.
693	2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character
694	in both the third and fourth positions.
695	3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character.
696	4. The label must not contain a U+002E ( . ) FULL STOP.
697	5. The label must not begin with a combining mark, that is: General_Category=Mark.
698	6. Each code point in the label must only have certain status values according to Section 5,
699	IDNA Mapping Table:
700	1. For Transitional Processing, each value must be valid.
701	2. For Nontransitional Processing, each value must be either valid or deviation.
702	7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in The Unicode
703	Code Points and Internationalized Domain Names for Applications (IDNA).
704	8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must satisfy
705	all six of the numbered conditions in RFC 5893, Section 2.
706
707	NOTE: Don't use QStringView for label, so that call to QString::normalized() can avoid
708	memory allocation when there is nothing to normalize.
709	*/
710	bool DomainValidityChecker::checkLabel(const QString &label, QUrl::AceProcessingOptions options)
711	{
712	if (label.isEmpty())
713	return true;
714
715	if (label != label.normalized(mode: QString::NormalizationForm_C))
716	return false;
717
718	if (label.size() >= `4`) {
719	// This assumes that the first two characters are in BMP, but that's ok
720	// because non-BMP characters are unlikely to be used for specifying
721	// future extensions.
722	if (label [`2`] == u`'-'` && label [`3`] == u`'-'`)
723	return ignoreBidiErrors && label.startsWith(s: u"xn") && validateAsciiLabel(label);
724	}
725
726	if (label.startsWith(c: u`'-'`) \|\| label.endsWith(c: u`'-'`))
727	return false;
728
729	if (label.contains(c: u`'.'`))
730	return false;
731
732	QStringIterator iter(label);
733	auto c = iter.next();
734
735	if (QChar::isMark(ucs4: c))
736	return false;
737
738	// As optimization, CONTEXTJ rules check can be skipped if no
739	// ZWJ/ZWNJ characters were found during the first pass.
740	bool hasJoiners = false;
741
742	for (;;) {
743	hasJoiners = hasJoiners \|\| c == ZWNJ \|\| c == ZWJ;
744
745	if (!ignoreBidiErrors && !domainNameIsBidi) {
746	switch (QChar::direction(ucs4: c)) {
747	case QChar::DirR:
748	case QChar::DirAL:
749	case QChar::DirAN:
750	domainNameIsBidi = true;
751	if (hadBidiErrors)
752	return false;
753	break;
754	default:
755	break;
756	}
757	}
758
759	switch (QUnicodeTables::idnaStatus(ucs4: c)) {
760	case QUnicodeTables::IdnaStatus::Valid:
761	break;
762	case QUnicodeTables::IdnaStatus::Deviation:
763	if (options.testFlag(flag: QUrl::AceTransitionalProcessing))
764	return false;
765	break;
766	default:
767	return false;
768	}
769
770	if (!iter.hasNext())
771	break;
772	c = iter.next();
773	}
774
775	if (hasJoiners && !checkContextJRules(label))
776	return false;
777
778	hadBidiErrors = hadBidiErrors \|\| !checkBidiRules(label);
779
780	if (domainNameIsBidi && hadBidiErrors)
781	return false;
782
783	return true;
784	}
785
786	static QString convertToAscii(QStringView normalizedDomain, AceLeadingDot dot)
787	{
788	qsizetype lastIdx = `0`;
789	QString aceForm; // this variable is here for caching
790	QString aceResult;
791
792	while (true) {
793	qsizetype idx = normalizedDomain.indexOf(c: u`'.'`, from: lastIdx);
794	if (idx == -`1`)
795	idx = normalizedDomain.size();
796
797	const qsizetype labelLength = idx - lastIdx;
798	if (labelLength) {
799	const auto label = normalizedDomain.sliced(pos: lastIdx, n: labelLength);
800	aceForm.clear();
801	qt_punycodeEncoder(in: label, output: &aceForm);
802	if (aceForm.isEmpty())
803	return {};
804
805	aceResult.append(s: aceForm);
806	}
807
808	if (idx == normalizedDomain.size())
809	break;
810
811	if (labelLength == `0` && (dot == ForbidLeadingDot \|\| idx > `0`))
812	return {}; // two delimiters in a row -- empty label not allowed
813
814	lastIdx = idx + `1`;
815	aceResult += u`'.'`;
816	}
817
818	return aceResult;
819	}
820
821	static bool checkAsciiDomainName(QStringView normalizedDomain, AceLeadingDot dot,
822	bool *usesPunycode)
823	{
824	qsizetype lastIdx = `0`;
825	bool hasPunycode = false;
826	usesPunycode = false*;
827
828	while (lastIdx < normalizedDomain.size()) {
829	auto idx = normalizedDomain.indexOf(c: u`'.'`, from: lastIdx);
830	if (idx == -`1`)
831	idx = normalizedDomain.size();
832
833	const auto labelLength = idx - lastIdx;
834	if (labelLength == `0`) {
835	if (idx == normalizedDomain.size())
836	break;
837	if (dot == ForbidLeadingDot \|\| idx > `0`)
838	return false; // two delimiters in a row -- empty label not allowed
839	} else {
840	const auto label = normalizedDomain.sliced(pos: lastIdx, n: labelLength);
841	if (!validateAsciiLabel(label))
842	return false;
843
844	hasPunycode = hasPunycode \|\| label.startsWith(s: "xn--"_L1);
845	}
846
847	lastIdx = idx + `1`;
848	}
849
850	*usesPunycode = hasPunycode;
851	return true;
852	}
853
854	static QString convertToUnicode(const QString &asciiDomain, QUrl::AceProcessingOptions options)
855	{
856	QString result;
857	result.reserve(asize: asciiDomain.size());
858	qsizetype lastIdx = `0`;
859
860	DomainValidityChecker checker;
861
862	while (true) {
863	auto idx = asciiDomain.indexOf(c: u`'.'`, from: lastIdx);
864	if (idx == -`1`)
865	idx = asciiDomain.size();
866
867	const auto labelLength = idx - lastIdx;
868	if (labelLength == `0`) {
869	if (idx == asciiDomain.size())
870	break;
871	} else {
872	const auto label = asciiDomain.sliced(pos: lastIdx, n: labelLength);
873	const auto unicodeLabel = qt_punycodeDecoder(pc: label);
874
875	if (unicodeLabel.isEmpty())
876	return asciiDomain;
877
878	if (!checker.checkLabel(label: unicodeLabel, options))
879	return asciiDomain;
880
881	result.append(s: unicodeLabel);
882	}
883
884	if (idx == asciiDomain.size())
885	break;
886
887	lastIdx = idx + `1`;
888	result += u`'.'`;
889	}
890	return result;
891	}
892
893	static bool checkUnicodeName(const QString &domainName, QUrl::AceProcessingOptions options)
894	{
895	qsizetype lastIdx = `0`;
896
897	DomainValidityChecker checker(true);
898
899	while (true) {
900	qsizetype idx = domainName.indexOf(c: u`'.'`, from: lastIdx);
901	if (idx == -`1`)
902	idx = domainName.size();
903
904	const qsizetype labelLength = idx - lastIdx;
905	if (labelLength) {
906	const auto label = domainName.sliced(pos: lastIdx, n: labelLength);
907
908	if (!checker.checkLabel(label, options))
909	return false;
910	}
911
912	if (idx == domainName.size())
913	break;
914
915	lastIdx = idx + `1`;
916	}
917	return true;
918	}
919
920	QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot,
921	QUrl::AceProcessingOptions options)
922	{
923	if (domain.isEmpty())
924	return {};
925
926	bool mappedToAscii;
927	const QString mapped = mapDomainName(in: domain, options, resultIsAscii: &mappedToAscii);
928	const QString normalized =
929	mappedToAscii ? mapped : mapped.normalized(mode: QString::NormalizationForm_C);
930
931	if (normalized.isEmpty())
932	return {};
933
934	if (!mappedToAscii && !checkUnicodeName(domainName: normalized, options))
935	return {};
936
937	bool needsConversionToUnicode;
938	const QString aceResult = mappedToAscii ? normalized : convertToAscii(normalizedDomain: normalized, dot);
939	if (aceResult.isEmpty() \|\| !checkAsciiDomainName(normalizedDomain: aceResult, dot, usesPunycode: &needsConversionToUnicode))
940	return {};
941
942	if (op == ToAceOnly \|\| !needsConversionToUnicode
943	\|\| (!options.testFlag(flag: QUrl::IgnoreIDNWhitelist) && !qt_is_idn_enabled(aceDomain: aceResult))) {
944	return aceResult;
945	}
946
947	return convertToUnicode(asciiDomain: aceResult, options);
948	}
949
950	/!*
951	\since 4.2
952
953	Returns the current whitelist of top-level domains that are allowed
954	to have non-ASCII characters in their compositions.
955
956	See setIdnWhitelist() for the rationale of this list.
957
958	\sa AceProcessingOption
959	*/
960	QStringList QUrl::idnWhitelist()
961	{
962	if (user_idn_whitelist)
963	return *user_idn_whitelist;
964	static const QStringList list = [] {
965	QStringList list;
966	list.reserve(asize: idn_whitelist.count());
967	int i = `0`;
968	while (i < idn_whitelist.count()) {
969	list << QLatin1StringView (idn_whitelist.at(index: i));
970	++i;
971	}
972	return list;
973	}();
974	return list;
975	}
976
977	/!*
978	\since 4.2
979
980	Sets the whitelist of Top-Level Domains (TLDs) that are allowed to have
981	non-ASCII characters in domains to the value of \a list.
982
983	Note that if you call this function, you need to do so \e before
984	you start any threads that might access idnWhitelist().
985
986	Qt comes with a default list that contains the Internet top-level domains
987	that have published support for Internationalized Domain Names (IDNs)
988	and rules to guarantee that no deception can happen between similarly-looking
989	characters (such as the Latin lowercase letter \c 'a' and the Cyrillic
990	equivalent, which in most fonts are visually identical).
991
992	This list is periodically maintained, as registrars publish new rules.
993
994	This function is provided for those who need to manipulate the list, in
995	order to add or remove a TLD. It is not recommended to change its value
996	for purposes other than testing, as it may expose users to security risks.
997	*/
998	void QUrl::setIdnWhitelist(const QStringList &list)
999	{
1000	if (!user_idn_whitelist)
1001	user_idn_whitelist = new QStringList;
1002	*user_idn_whitelist = list;
1003	}
1004
1005	QT_END_NAMESPACE
1006

Provided by KDAB

Definitions

source code of qtbase/src/corelib/io/qurlidna.cpp