qstringconverter.cpp source code [qtbase/src/corelib/text/qstringconverter.cpp]

1	// Copyright (C) 2020 The Qt Company Ltd.
2	// Copyright (C) 2020 Intel Corporation.
3	// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5	#include <qstringconverter.h>
6	#include <private/qstringconverter_p.h>
7	#include "qendian.h"
8
9	#include "private/qsimd_p.h"
10	#include "private/qstringiterator_p.h"
11	#include "private/qtools_p.h"
12	#include "qbytearraymatcher.h"
13	#include "qcontainertools_impl.h"
14	#include <QtCore/qbytearraylist.h>
15
16	#if QT_CONFIG(icu)
17	#include <unicode/ucnv.h>
18	#include <unicode/ucnv_cb.h>
19	#include <unicode/ucnv_err.h>
20	#include <unicode/ustring.h>
21	#endif
22
23	#ifdef Q_OS_WIN
24	#include <qt_windows.h>
25	#ifndef QT_BOOTSTRAPPED
26	#include <QtCore/qvarlengtharray.h>
27	#include <QtCore/q20iterator.h>
28	#include <QtCore/q26numeric.h>
29	#endif // !QT_BOOTSTRAPPED
30	#endif
31
32	#include <array>
33	#if __has_include(<bit>) && __cplusplus > 201703L
34	#include <bit>
35	#endif
36	#include <string>
37
38	QT_BEGIN_NAMESPACE
39
40	using namespace QtMiscUtils;
41
42	static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
43	static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
44	static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
45	static_assert(std::is_nothrow_move_assignable_v<QStringDecoder>);
46
47	enum { Endian = `0`, Data = `1` };
48
49	static const uchar utf8bom[] = { `0xef`, `0xbb`, `0xbf` };
50
51	#if defined(__SSE2__) \|\| defined(__ARM_NEON__)
52	static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
53	{
54	#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
55	return std::bit_width(v) - `1`;
56	#else
57	uint result = qCountLeadingZeroBits(v);
58	// Now Invert the result: clz will count down* from the msb to the lsb, so the msb index is 31*
59	// and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
60	// counting up: msb index is 0 (because it starts there), and the lsb index is 31.
61	result ^= sizeof(unsigned) * `8` - `1`;
62	return result;
63	#endif
64	}
65	#endif
66
67	#if defined(__SSE2__)
68	static inline bool simdEncodeAscii(uchar &dst, const* char16_t &nextAscii, const* char16_t &src, const* char16_t *end)
69	{
70	// do sixteen characters at a time
71	for ( ; end - src >= `16`; src += `16`, dst += `16`) {
72	# ifdef __AVX2__
73	__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
74	__m128i data1 = _mm256_castsi256_si128(data);
75	__m128i data2 = _mm256_extracti128_si256(data, `1`);
76	# else
77	__m128i data1 = _mm_loadu_si128(p: (const __m128i*)src);
78	__m128i data2 = _mm_loadu_si128(p: `1`+(const __m128i*)src);
79	# endif
80
81	// check if everything is ASCII
82	// the highest ASCII value is U+007F
83	// Do the packing directly:
84	// The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
85	// with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
86	// while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
87	// we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
88	// "non-ASCII", but it's an acceptable compromise.
89	__m128i packed = _mm_packus_epi16(a: data1, b: data2);
90	__m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
91
92	// store, even if there are non-ASCII characters here
93	_mm_storeu_si128(p: (__m128i*)dst, b: packed);
94
95	// n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
96	ushort n = ~_mm_movemask_epi8(a: nonAscii);
97	if (n) {
98	// find the next probable ASCII character
99	// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
100	// characters still coming
101	nextAscii = src + qBitScanReverse(v: n) + `1`;
102
103	n = qCountTrailingZeroBits(v: n);
104	dst += n;
105	src += n;
106	return false;
107	}
108	}
109
110	if (end - src >= `8`) {
111	// do eight characters at a time
112	__m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src));
113	__m128i packed = _mm_packus_epi16(a: data, b: data);
114	__m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
115
116	// store even non-ASCII
117	_mm_storel_epi64(p: reinterpret_cast<__m128i *>(dst), a: packed);
118
119	uchar n = ~_mm_movemask_epi8(a: nonAscii);
120	if (n) {
121	nextAscii = src + qBitScanReverse(v: n) + `1`;
122	n = qCountTrailingZeroBits(v: n);
123	dst += n;
124	src += n;
125	return false;
126	}
127	}
128
129	return src == end;
130	}
131
132	static inline bool simdDecodeAscii(char16_t &dst, const* uchar &nextAscii, const* uchar &src, const* uchar *end)
133	{
134	// do sixteen characters at a time
135	for ( ; end - src >= `16`; src += `16`, dst += `16`) {
136	__m128i data = _mm_loadu_si128(p: (const __m128i*)src);
137
138	#ifdef __AVX2__
139	const int BitSpacing = `2`;
140	// load and zero extend to an YMM register
141	const __m256i extended = _mm256_cvtepu8_epi16(data);
142
143	uint n = _mm256_movemask_epi8(extended);
144	if (!n) {
145	// store
146	_mm256_storeu_si256((__m256i*)dst, extended);
147	continue;
148	}
149	#else
150	const int BitSpacing = `1`;
151
152	// check if everything is ASCII
153	// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
154	uint n = _mm_movemask_epi8(a: data);
155	if (!n) {
156	// unpack
157	_mm_storeu_si128(p: (__m128i*)dst, b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
158	_mm_storeu_si128(p: `1`+(__m128i*)dst, b: _mm_unpackhi_epi8(a: data, b: _mm_setzero_si128()));
159	continue;
160	}
161	#endif
162
163	// copy the front part that is still ASCII
164	while (!(n & `1`)) {
165	dst++ = src++;
166	n >>= BitSpacing;
167	}
168
169	// find the next probable ASCII character
170	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
171	// characters still coming
172	n = qBitScanReverse(v: n);
173	nextAscii = src + (n / BitSpacing) + `1`;
174	return false;
175
176	}
177
178	if (end - src >= `8`) {
179	__m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src));
180	uint n = _mm_movemask_epi8(a: data) & `0xff`;
181	if (!n) {
182	// unpack and store
183	_mm_storeu_si128(p: reinterpret_cast<__m128i *>(dst), b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
184	} else {
185	while (!(n & `1`)) {
186	dst++ = src++;
187	n >>= `1`;
188	}
189
190	n = qBitScanReverse(v: n);
191	nextAscii = src + n + `1`;
192	return false;
193	}
194	}
195
196	return src == end;
197	}
198
199	static inline const uchar simdFindNonAscii(const* uchar src, const* uchar end, const* uchar *&nextAscii)
200	{
201	#ifdef __AVX2__
202	// do 32 characters at a time
203	// (this is similar to simdTestMask in qstring.cpp)
204	const __m256i mask = _mm256_set1_epi8(char(`0x80`));
205	for ( ; end - src >= `32`; src += `32`) {
206	__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
207	if (_mm256_testz_si256(mask, data))
208	continue;
209
210	uint n = _mm256_movemask_epi8(data);
211	Q_ASSERT(n);
212
213	// find the next probable ASCII character
214	// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
215	// characters still coming
216	nextAscii = src + qBitScanReverse(n) + `1`;
217
218	// return the non-ASCII character
219	return src + qCountTrailingZeroBits(n);
220	}
221	#endif
222
223	// do sixteen characters at a time
224	for ( ; end - src >= `16`; src += `16`) {
225	__m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i*>(src));
226
227	// check if everything is ASCII
228	// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
229	uint n = _mm_movemask_epi8(a: data);
230	if (!n)
231	continue;
232
233	// find the next probable ASCII character
234	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
235	// characters still coming
236	nextAscii = src + qBitScanReverse(v: n) + `1`;
237
238	// return the non-ASCII character
239	return src + qCountTrailingZeroBits(v: n);
240	}
241
242	// do four characters at a time
243	for ( ; end - src >= `4`; src += `4`) {
244	quint32 data = qFromUnaligned<quint32>(src);
245	data &= `0x80808080U`;
246	if (!data)
247	continue;
248
249	// We don't try to guess which of the three bytes is ASCII and which
250	// one isn't. The chance that at least two of them are non-ASCII is
251	// better than 75%.
252	nextAscii = src;
253	return src;
254	}
255	nextAscii = end;
256	return src;
257	}
258
259	// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
260	// and advance src8 and src16 to the first character that could not be compared
261	static void simdCompareAscii(const qchar8_t &src8, const* qchar8_t end8, const* char16_t &src16, const* char16_t *end16)
262	{
263	int bitSpacing = `1`;
264	qptrdiff len = qMin(a: end8 - src8, b: end16 - src16);
265	qptrdiff offset = `0`;
266	uint mask = `0`;
267
268	// do sixteen characters at a time
269	for ( ; offset + `16` < len; offset += `16`) {
270	__m128i data8 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src8 + offset));
271	#ifdef __AVX2__
272	// AVX2 version, use 256-bit registers and VPMOVXZBW
273	__m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));
274
275	// expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
276	__m256i datax8 = _mm256_cvtepu8_epi16(data8);
277	mask = _mm256_movemask_epi8(datax8);
278	if (mask)
279	break;
280
281	// compare Latin1 to UTF-16
282	__m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
283	mask = ~_mm256_movemask_epi8(latin1cmp);
284	if (mask)
285	break;
286	#else
287	// non-AVX2 code
288	__m128i datalo16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset));
289	__m128i datahi16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset) + `1`);
290
291	// expand US-ASCII as if it were Latin1, we'll confirm later
292	__m128i datalo8 = _mm_unpacklo_epi8(a: data8, b: _mm_setzero_si128());
293	__m128i datahi8 = _mm_unpackhi_epi8(a: data8, b: _mm_setzero_si128());
294
295	// compare Latin1 to UTF-16
296	__m128i latin1cmplo = _mm_cmpeq_epi16(a: datalo8, b: datalo16);
297	__m128i latin1cmphi = _mm_cmpeq_epi16(a: datahi8, b: datahi16);
298	mask = _mm_movemask_epi8(a: latin1cmphi) << `16`;
299	mask \|= ushort(_mm_movemask_epi8(a: latin1cmplo));
300	mask = ~mask;
301	if (mask)
302	break;
303
304	// confirm it was US-ASCII
305	mask = _mm_movemask_epi8(a: data8);
306	if (mask) {
307	bitSpacing = `0`;
308	break;
309	}
310	#endif
311	}
312
313	// helper for comparing 4 or 8 characters
314	auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
315	// n = 4 -> sizemask = 0xff
316	// n = 8 -> sizemask = 0xffff
317	unsigned sizemask = (`1U` << (`2` * n)) - `1`;
318
319	// expand as if Latin1
320	data8 = _mm_unpacklo_epi8(a: data8, b: _mm_setzero_si128());
321
322	// compare and confirm it's US-ASCII
323	__m128i latin1cmp = _mm_cmpeq_epi16(a: data8, b: data16);
324	mask = ~_mm_movemask_epi8(a: latin1cmp) & sizemask;
325	mask \|= _mm_movemask_epi8(a: data8);
326	if (mask == `0`)
327	offset += n;
328	};
329
330	// do eight characters at a time
331	if (mask == `0` && offset + `8` < len) {
332	__m128i data8 = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src8 + offset));
333	__m128i data16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset));
334	cmp_lt_16 (`8`, data8, data16);
335	}
336
337	// do four characters
338	if (mask == `0` && offset + `4` < len) {
339	__m128i data8 = _mm_cvtsi32_si128(a: qFromUnaligned<quint32>(src: src8 + offset));
340	__m128i data16 = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src16 + offset));
341	cmp_lt_16 (`4`, data8, data16);
342	}
343
344	// correct the source pointers to point to the first character we couldn't deal with
345	if (mask)
346	offset += qCountTrailingZeroBits(v: mask) >> bitSpacing;
347	src8 += offset;
348	src16 += offset;
349	}
350	#elif defined(__ARM_NEON__)
351	static inline bool simdEncodeAscii(uchar &dst, const* char16_t &nextAscii, const* char16_t &src, const* char16_t *end)
352	{
353	uint16x8_t maxAscii = vdupq_n_u16(`0x7f`);
354	uint16x8_t mask1 = qvsetq_n_u16(`1`, `1` << `2`, `1` << `4`, `1` << `6`, `1` << `8`, `1` << `10`, `1` << `12`, `1` << `14` );
355	uint16x8_t mask2 = vshlq_n_u16(mask1, `1`);
356
357	// do sixteen characters at a time
358	for ( ; end - src >= `16`; src += `16`, dst += `16`) {
359	// load 2 lanes (or: "load interleaved")
360	uint16x8x2_t in = vld2q_u16(reinterpret_cast<const uint16_t *>(src));
361
362	// check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
363	// add those together into a scalar, and merge the scalars.
364	uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[`0`], maxAscii), mask1))
365	\| vaddvq_u16(vandq_u16(vcgtq_u16(in.val[`1`], maxAscii), mask2));
366
367	// merge the two lanes by shifting the values of the second by 8 and inserting them
368	uint16x8_t out = vsliq_n_u16(in.val[`0`], in.val[`1`], `8`);
369
370	// store, even if there are non-ASCII characters here
371	vst1q_u8(dst, vreinterpretq_u8_u16(out));
372
373	if (nonAscii) {
374	// find the next probable ASCII character
375	// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
376	// characters still coming
377	nextAscii = src + qBitScanReverse(nonAscii) + `1`;
378
379	nonAscii = qCountTrailingZeroBits(nonAscii);
380	dst += nonAscii;
381	src += nonAscii;
382	return false;
383	}
384	}
385	return src == end;
386	}
387
388	static inline bool simdDecodeAscii(char16_t &dst, const* uchar &nextAscii, const* uchar &src, const* uchar *end)
389	{
390	// do eight characters at a time
391	uint8x8_t msb_mask = vdup_n_u8(`0x80`);
392	uint8x8_t add_mask = qvset_n_u8(`1`, `1` << `1`, `1` << `2`, `1` << `3`, `1` << `4`, `1` << `5`, `1` << `6`, `1` << `7` );
393	for ( ; end - src >= `8`; src += `8`, dst += `8`) {
394	uint8x8_t c = vld1_u8(src);
395	uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
396	if (!n) {
397	// store
398	vst1q_u16(reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
399	continue;
400	}
401
402	// copy the front part that is still ASCII
403	while (!(n & `1`)) {
404	dst++ = src++;
405	n >>= `1`;
406	}
407
408	// find the next probable ASCII character
409	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
410	// characters still coming
411	n = qBitScanReverse(n);
412	nextAscii = src + n + `1`;
413	return false;
414
415	}
416	return src == end;
417	}
418
419	static inline const uchar simdFindNonAscii(const* uchar src, const* uchar end, const* uchar *&nextAscii)
420	{
421	// The SIMD code below is untested, so just force an early return until
422	// we've had the time to verify it works.
423	nextAscii = end;
424	return src;
425
426	// do eight characters at a time
427	uint8x8_t msb_mask = vdup_n_u8(`0x80`);
428	uint8x8_t add_mask = qvset_n_u8(`1`, `1` << `1`, `1` << `2`, `1` << `3`, `1` << `4`, `1` << `5`, `1` << `6`, `1` << `7`);
429	for ( ; end - src >= `8`; src += `8`) {
430	uint8x8_t c = vld1_u8(src);
431	uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
432	if (!n)
433	continue;
434
435	// find the next probable ASCII character
436	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
437	// characters still coming
438	nextAscii = src + qBitScanReverse(n) + `1`;
439
440	// return the non-ASCII character
441	return src + qCountTrailingZeroBits(n);
442	}
443	nextAscii = end;
444	return src;
445	}
446
447	static void simdCompareAscii(const qchar8_t &, const* qchar8_t , const* char16_t &, const* char16_t *)
448	{
449	}
450	#else
451	static inline bool simdEncodeAscii(uchar , const* char16_t , const* char16_t , const* char16_t *)
452	{
453	return false;
454	}
455
456	static inline bool simdDecodeAscii(char16_t , const* uchar , const* uchar , const* uchar *)
457	{
458	return false;
459	}
460
461	static inline const uchar simdFindNonAscii(const* uchar src, const* uchar end, const* uchar *&nextAscii)
462	{
463	nextAscii = end;
464	return src;
465	}
466
467	static void simdCompareAscii(const qchar8_t &, const* qchar8_t , const* char16_t &, const* char16_t *)
468	{
469	}
470	#endif
471
// Bit in QStringConverter::State::internalState that is set once the byte
// order mark has been dealt with (written by an encoder, or looked for by a
// decoder), so that it is only handled at the start of a stream.
enum { HeaderDone = 1 };
473
474	QByteArray QUtf8::convertFromUnicode(QStringView in)
475	{
476	qsizetype len = in.size();
477
478	// create a QByteArray with the worst case scenario size
479	QByteArray result(len * `3`, Qt::Uninitialized);
480	uchar dst = reinterpret_cast<uchar >(const_cast<char *>(result.constData()));
481	const char16_t src = reinterpret_cast<const* char16_t *>(in.data());
482	const char16_t *const end = src + len;
483
484	while (src != end) {
485	const char16_t *nextAscii = end;
486	if (simdEncodeAscii(dst, nextAscii, src, end))
487	break;
488
489	do {
490	char16_t u = *src++;
491	int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
492	if (res < `0`) {
493	// encoding error - append '?'
494	*dst++ = `'?'`;
495	}
496	} while (src < nextAscii);
497	}
498
499	result.truncate(pos: dst - reinterpret_cast<uchar >(const_cast<char* *>(result.constData())));
500	return result;
501	}
502
503	QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverterBase::State *state)
504	{
505	QByteArray ba(`3`*in.size() +`3`, Qt::Uninitialized);
506	char *end = convertFromUnicode(out: ba.data(), in, state);
507	ba.truncate(pos: end - ba.data());
508	return ba;
509	}
510
511	char QUtf8::convertFromUnicode(char* out, QStringView in, QStringConverter::State state)
512	{
513	Q_ASSERT(state);
514	qsizetype len = in.size();
515	if (!len)
516	return out;
517
518	auto appendReplacementChar = [state](uchar cursor) -> uchar {
519	if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
520	*cursor++ = `0`;
521	} else {
522	// QChar::replacement encoded in utf8
523	*cursor++ = `0xef`;
524	*cursor++ = `0xbf`;
525	*cursor++ = `0xbd`;
526	}
527	return cursor;
528	};
529
530	uchar cursor = reinterpret_cast<uchar >(out);
531	const char16_t *src = in.utf16();
532	const char16_t *const end = src + len;
533
534	if (!(state->flags & QStringDecoder::Flag::Stateless)) {
535	if (state->remainingChars) {
536	int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: state->state_data[`0`], dst&: cursor, src, end);
537	if (res < `0`)
538	cursor = appendReplacementChar (cursor);
539	state->state_data[`0`] = `0`;
540	state->remainingChars = `0`;
541	} else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
542	// append UTF-8 BOM
543	*cursor++ = utf8bom[`0`];
544	*cursor++ = utf8bom[`1`];
545	*cursor++ = utf8bom[`2`];
546	state->internalState \|= HeaderDone;
547	}
548	}
549
550	while (src != end) {
551	const char16_t *nextAscii = end;
552	if (simdEncodeAscii(dst&: cursor, nextAscii, src, end))
553	break;
554
555	do {
556	char16_t uc = *src++;
557	int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst&: cursor, src, end);
558	if (Q_LIKELY(res >= `0`))
559	continue;
560
561	if (res == QUtf8BaseTraits::Error) {
562	// encoding error
563	++state->invalidChars;
564	cursor = appendReplacementChar (cursor);
565	} else if (res == QUtf8BaseTraits::EndOfString) {
566	if (state->flags & QStringConverter::Flag::Stateless) {
567	++state->invalidChars;
568	cursor = appendReplacementChar (cursor);
569	} else {
570	state->remainingChars = `1`;
571	state->state_data[`0`] = uc;
572	}
573	return reinterpret_cast<char *>(cursor);
574	}
575	} while (src < nextAscii);
576	}
577
578	return reinterpret_cast<char *>(cursor);
579	}
580
581	char QUtf8::convertFromLatin1(char* *out, QLatin1StringView in)
582	{
583	// ### SIMD-optimize:
584	for (uchar ch : in) {
585	if (ch < `128`) {
586	*out++ = ch;
587	} else {
588	// as per https://en.wikipedia.org/wiki/UTF-8#Encoding, 2nd row
589	*out++ = `0b110'0'0000u` \| (ch >> `6`);
590	*out++ = `0b10'00'0000u` \| (ch & `0b0011'1111`);
591	}
592	}
593	return out;
594	}
595
596	QString QUtf8::convertToUnicode(QByteArrayView in)
597	{
598	// UTF-8 to UTF-16 always needs the exact same number of words or less:
599	// UTF-8 UTF-16
600	// 1 byte 1 word
601	// 2 bytes 1 word
602	// 3 bytes 1 word
603	// 4 bytes 2 words (one surrogate pair)
604	// That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
605	// half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
606	// non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
607	//
608	// The table holds for invalid sequences too: we'll insert one replacement char
609	// per invalid byte.
610	QString result(in.size(), Qt::Uninitialized);
611	QChar data = const_cast<QChar>(result.constData()); // we know we're not shared
612	const QChar *end = convertToUnicode(buffer: data, in);
613	result.truncate(pos: end - data);
614	return result;
615	}
616
617	/! \internal*
618	\since 6.6
619	\overload
620
621	Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of
622	QChar starting at \a dst in the destination buffer. The buffer is expected
623	to be large enough to hold the result. An upper bound for the size of the
624	buffer is \c in.size() QChars.
625
626	If, during decoding, an error occurs, a QChar::ReplacementCharacter is
627	written.
628
629	Returns a pointer to one past the last QChar written.
630
631	This function never throws.
632
633	For QChar buffers, instead of casting manually, you can use the static
634	QUtf8::convertToUnicode(QChar , QByteArrayView) directly.*
635	*/
636	char16_t QUtf8::convertToUnicode(char16_t* dst, QByteArrayView in) noexcept*
637	{
638	const uchar *const start = reinterpret_cast<const uchar *>(in.data());
639	const uchar *src = start;
640	const uchar *end = src + in.size();
641
642	// attempt to do a full decoding in SIMD
643	const uchar *nextAscii = end;
644	if (!simdDecodeAscii(dst, nextAscii, src, end)) {
645	// at least one non-ASCII entry
646	// check if we failed to decode the UTF-8 BOM; if so, skip it
647	if (Q_UNLIKELY(src == start)
648	&& end - src >= `3`
649	&& Q_UNLIKELY(src[`0`] == utf8bom[`0`] && src[`1`] == utf8bom[`1`] && src[`2`] == utf8bom[`2`])) {
650	src += `3`;
651	}
652
653	while (src < end) {
654	nextAscii = end;
655	if (simdDecodeAscii(dst, nextAscii, src, end))
656	break;
657
658	do {
659	uchar b = *src++;
660	const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
661	if (res < `0`) {
662	// decoding error
663	*dst++ = QChar::ReplacementCharacter;
664	}
665	} while (src < nextAscii);
666	}
667	}
668
669	return dst;
670	}
671
672	QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
673	{
674	// See above for buffer requirements for stateless decoding. However, that
675	// fails if the state is not empty. The following situations can add to the
676	// requirements:
677	// state contains chars starts with requirement
678	// 1 of 2 bytes valid continuation 0
679	// 2 of 3 bytes same 0
680	// 3 bytes of 4 same +1 (need to insert surrogate pair)
681	// 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
682	// 2 of 3 bytes same +1 (same)
683	// 3 of 4 bytes same +1 (same)
684	QString result(in.size() + `1`, Qt::Uninitialized);
685	QChar *end = convertToUnicode(out: result.data(), in, state);
686	result.truncate(pos: end - result.constData());
687	return result;
688	}
689
690	char16_t QUtf8::convertToUnicode(char16_t* dst, QByteArrayView in, QStringConverter::State state)
691	{
692	qsizetype len = in.size();
693
694	Q_ASSERT(state);
695	if (!len)
696	return dst;
697
698
699	char16_t replacement = QChar::ReplacementCharacter;
700	if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
701	replacement = QChar::Null;
702
703	qsizetype res;
704	uchar ch = `0`;
705
706	const uchar src = reinterpret_cast<const* uchar *>(in.data());
707	const uchar *end = src + len;
708
709	if (!(state->flags & QStringConverter::Flag::Stateless)) {
710	bool headerdone = state->internalState & HeaderDone \|\| state->flags & QStringConverter::Flag::ConvertInitialBom;
711	if (state->remainingChars \|\| !headerdone) {
712	// handle incoming state first
713	uchar remainingCharsData[`4`]; // longest UTF-8 sequence possible
714	qsizetype remainingCharsCount = state->remainingChars;
715	qsizetype newCharsToCopy = qMin<qsizetype>(a: sizeof(remainingCharsData) - remainingCharsCount, b: end - src);
716
717	memset(s: remainingCharsData, c: `0`, n: sizeof(remainingCharsData));
718	memcpy(dest: remainingCharsData, src: &state->state_data[`0`], n: remainingCharsCount);
719	memcpy(dest: remainingCharsData + remainingCharsCount, src: src, n: newCharsToCopy);
720
721	const uchar *begin = &remainingCharsData[`1`];
722	res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: remainingCharsData[`0`], dst, src&: begin,
723	end: static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
724	if (res == QUtf8BaseTraits::Error) {
725	++state->invalidChars;
726	*dst++ = replacement;
727	++src;
728	} else if (res == QUtf8BaseTraits::EndOfString) {
729	// if we got EndOfString again, then there were too few bytes in src;
730	// copy to our state and return
731	state->remainingChars = remainingCharsCount + newCharsToCopy;
732	memcpy(dest: &state->state_data[`0`], src: remainingCharsData, n: state->remainingChars);
733	return dst;
734	} else if (!headerdone) {
735	// eat the UTF-8 BOM
736	if (dst[-`1`] == `0xfeff`)
737	--dst;
738	}
739	state->internalState \|= HeaderDone;
740
741	// adjust src now that we have maybe consumed a few chars
742	if (res >= `0`) {
743	Q_ASSERT(res > remainingCharsCount);
744	src += res - remainingCharsCount;
745	}
746	}
747	} else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
748	// stateless, remove initial BOM
749	if (len > `2` && src[`0`] == utf8bom[`0`] && src[`1`] == utf8bom[`1`] && src[`2`] == utf8bom[`2`])
750	// skip BOM
751	src += `3`;
752	}
753
754	// main body, stateless decoding
755	res = `0`;
756	const uchar *nextAscii = src;
757	while (res >= `0` && src < end) {
758	if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
759	break;
760
761	ch = *src++;
762	res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: ch, dst, src, end);
763	if (res == QUtf8BaseTraits::Error) {
764	res = `0`;
765	++state->invalidChars;
766	*dst++ = replacement;
767	}
768	}
769
770	if (res == QUtf8BaseTraits::EndOfString) {
771	// unterminated UTF sequence
772	if (state->flags & QStringConverter::Flag::Stateless) {
773	*dst++ = QChar::ReplacementCharacter;
774	++state->invalidChars;
775	while (src++ < end) {
776	*dst++ = QChar::ReplacementCharacter;
777	++state->invalidChars;
778	}
779	state->remainingChars = `0`;
780	} else {
781	--src; // unread the byte in ch
782	state->remainingChars = end - src;
783	memcpy(dest: &state->state_data[`0`], src: src, n: end - src);
784	}
785	} else {
786	state->remainingChars = `0`;
787	}
788
789	return dst;
790	}
791
792	struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
793	{
794	struct NoOutput {};
795	static void appendUtf16(const NoOutput &, char16_t) {}
796	static void appendUcs4(const NoOutput &, char32_t) {}
797	};
798
799	QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
800	{
801	const uchar src = reinterpret_cast<const* uchar *>(in.data());
802	const uchar *end = src + in.size();
803	const uchar *nextAscii = src;
804	bool isValidAscii = true;
805
806	while (src < end) {
807	if (src >= nextAscii)
808	src = simdFindNonAscii(src, end, nextAscii);
809	if (src == end)
810	break;
811
812	do {
813	uchar b = *src++;
814	if ((b & `0x80`) == `0`)
815	continue;
816
817	isValidAscii = false;
818	QUtf8NoOutputTraits::NoOutput output;
819	const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, dst&: output, src, end);
820	if (res < `0`) {
821	// decoding error
822	return { .isValidUtf8: false, .isValidAscii: false };
823	}
824	} while (src < nextAscii);
825	}
826
827	return { .isValidUtf8: true, .isValidAscii: isValidAscii };
828	}
829
830	int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs) noexcept
831	{
832	auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
833	auto end1 = src1 + utf8.size();
834	auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
835	auto end2 = src2 + utf16.size();
836
837	do {
838	simdCompareAscii(src8&: src1, end8: end1, src16&: src2, end16: end2);
839
840	if (src1 < end1 && src2 < end2) {
841	char32_t uc1 = *src1++;
842	char32_t uc2 = *src2++;
843
844	if (uc1 >= `0x80`) {
845	char32_t *output = &uc1;
846	qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(b: uc1, dst&: output, src&: src1, end: end1);
847	if (res < `0`) {
848	// decoding error
849	uc1 = QChar::ReplacementCharacter;
850	}
851
852	// Only decode the UTF-16 surrogate pair if the UTF-8 code point
853	// wasn't US-ASCII (a surrogate cannot match US-ASCII).
854	if (QChar::isHighSurrogate(ucs4: uc2) && src2 < end2 && QChar::isLowSurrogate(ucs4: *src2))
855	uc2 = QChar::surrogateToUcs4(high: uc2, low: *src2++);
856	}
857	if (cs == Qt::CaseInsensitive) {
858	uc1 = QChar::toCaseFolded(ucs4: uc1);
859	uc2 = QChar::toCaseFolded(ucs4: uc2);
860	}
861	if (uc1 != uc2)
862	return int(uc1) - int(uc2);
863	}
864	} while (src1 < end1 && src2 < end2);
865
866	// the shorter string sorts first
867	return (end1 > src1) - int(end2 > src2);
868	}
869
870	int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs)
871	{
872	char32_t uc1 = QChar::Null;
873	auto src1 = reinterpret_cast<const uchar *>(utf8.data());
874	auto end1 = src1 + utf8.size();
875	auto src2 = reinterpret_cast<const uchar *>(s.latin1());
876	auto end2 = src2 + s.size();
877
878	while (src1 < end1 && src2 < end2) {
879	uchar b = *src1++;
880	char32_t *output = &uc1;
881	const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1);
882	if (res < `0`) {
883	// decoding error
884	uc1 = QChar::ReplacementCharacter;
885	}
886
887	char32_t uc2 = *src2++;
888	if (cs == Qt::CaseInsensitive) {
889	uc1 = QChar::toCaseFolded(ucs4: uc1);
890	uc2 = QChar::toCaseFolded(ucs4: uc2);
891	}
892	if (uc1 != uc2)
893	return int(uc1) - int(uc2);
894	}
895
896	// the shorter string sorts first
897	return (end1 > src1) - (end2 > src2);
898	}
899
900	int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs) noexcept
901	{
902	if (lhs.isEmpty())
903	return qt_lencmp(lhs: `0`, rhs: rhs.size());
904
905	if (cs == Qt::CaseSensitive) {
906	const auto l = std::min(a: lhs.size(), b: rhs.size());
907	int r = memcmp(s1: lhs.data(), s2: rhs.data(), n: l);
908	return r ? r : qt_lencmp(lhs: lhs.size(), rhs: rhs.size());
909	}
910
911	char32_t uc1 = QChar::Null;
912	auto src1 = reinterpret_cast<const uchar *>(lhs.data());
913	auto end1 = src1 + lhs.size();
914	char32_t uc2 = QChar::Null;
915	auto src2 = reinterpret_cast<const uchar *>(rhs.data());
916	auto end2 = src2 + rhs.size();
917
918	while (src1 < end1 && src2 < end2) {
919	uchar b = *src1++;
920	char32_t *output = &uc1;
921	qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1);
922	if (res < `0`) {
923	// decoding error
924	uc1 = QChar::ReplacementCharacter;
925	}
926
927	b = *src2++;
928	output = &uc2;
929	res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src2, end: end2);
930	if (res < `0`) {
931	// decoding error
932	uc2 = QChar::ReplacementCharacter;
933	}
934
935	uc1 = QChar::toCaseFolded(ucs4: uc1);
936	uc2 = QChar::toCaseFolded(ucs4: uc2);
937	if (uc1 != uc2)
938	return int(uc1) - int(uc2);
939	}
940
941	// the shorter string sorts first
942	return (end1 > src1) - (end2 > src2);
943	}
944
945	#ifndef QT_BOOTSTRAPPED
946	QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
947	{
948	bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
949	qsizetype length = `2` * in.size();
950	if (writeBom)
951	length += `2`;
952
953	QByteArray d(length, Qt::Uninitialized);
954	char *end = convertFromUnicode(out: d.data(), in, state, endian);
955	Q_ASSERT(end - d.constData() == d.size());
956	Q_UNUSED(end);
957	return d;
958	}
959
960	char QUtf16::convertFromUnicode(char* out, QStringView in, QStringConverter::State state, DataEndianness endian)
961	{
962	Q_ASSERT(state);
963	bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
964
965	if (endian == DetectEndianness)
966	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
967
968	if (writeBom) {
969	// set them up the BOM
970	QChar bom(QChar::ByteOrderMark);
971	if (endian == BigEndianness)
972	qToBigEndian(src: bom.unicode(), dest: out);
973	else
974	qToLittleEndian(src: bom.unicode(), dest: out);
975	out += `2`;
976	}
977	if (endian == BigEndianness)
978	qToBigEndian<char16_t>(source: in.data(), count: in.size(), dest: out);
979	else
980	qToLittleEndian<char16_t>(source: in.data(), count: in.size(), dest: out);
981
982	state->remainingChars = `0`;
983	state->internalState \|= HeaderDone;
984	return out + `2`*in.size();
985	}
986
987	QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
988	{
989	QString result((in.size() + `1`) >> `1`, Qt::Uninitialized); // worst case
990	QChar *qch = convertToUnicode(out: result.data(), in, state, endian);
991	result.truncate(pos: qch - result.constData());
992	return result;
993	}
994
995	QChar QUtf16::convertToUnicode(QChar out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
996	{
997	qsizetype len = in.size();
998	const char *chars = in.data();
999
1000	Q_ASSERT(state);
1001
1002	if (endian == DetectEndianness)
1003	endian = (DataEndianness)state->state_data[Endian];
1004
1005	const char *end = chars + len;
1006
1007	// make sure we can decode at least one char
1008	if (state->remainingChars + len < `2`) {
1009	if (len) {
1010	Q_ASSERT(state->remainingChars == `0` && len == `1`);
1011	state->remainingChars = `1`;
1012	state->state_data[Data] = *chars;
1013	}
1014	return out;
1015	}
1016
1017	bool headerdone = state && state->internalState & HeaderDone;
1018	if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1019	headerdone = true;
1020
1021	if (!headerdone \|\| state->remainingChars) {
1022	uchar buf;
1023	if (state->remainingChars)
1024	buf = state->state_data[Data];
1025	else
1026	buf = *chars++;
1027
1028	// detect BOM, set endianness
1029	state->internalState \|= HeaderDone;
1030	QChar ch(buf, *chars++);
1031	if (endian == DetectEndianness) {
1032	// someone set us up the BOM
1033	if (ch == QChar::ByteOrderSwapped) {
1034	endian = BigEndianness;
1035	} else if (ch == QChar::ByteOrderMark) {
1036	endian = LittleEndianness;
1037	} else {
1038	if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1039	endian = BigEndianness;
1040	} else {
1041	endian = LittleEndianness;
1042	}
1043	}
1044	}
1045	if (endian == BigEndianness)
1046	ch = QChar::fromUcs2(c: (ch.unicode() >> `8`) \| ((ch.unicode() & `0xff`) << `8`));
1047	if (headerdone \|\| ch != QChar::ByteOrderMark)
1048	*out++ = ch;
1049	} else if (endian == DetectEndianness) {
1050	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1051	}
1052
1053	qsizetype nPairs = (end - chars) >> `1`;
1054	if (endian == BigEndianness)
1055	qFromBigEndian<char16_t>(source: chars, count: nPairs, dest: out);
1056	else
1057	qFromLittleEndian<char16_t>(source: chars, count: nPairs, dest: out);
1058	out += nPairs;
1059
1060	state->state_data[Endian] = endian;
1061	state->remainingChars = `0`;
1062	if ((end - chars) & `1`) {
1063	if (state->flags & QStringConverter::Flag::Stateless) {
1064	*out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
1065	} else {
1066	state->remainingChars = `1`;
1067	state->state_data[Data] = *(end - `1`);
1068	}
1069	} else {
1070	state->state_data[Data] = `0`;
1071	}
1072
1073	return out;
1074	}
1075
1076	QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
1077	{
1078	bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1079	qsizetype length = `4`*in.size();
1080	if (writeBom)
1081	length += `4`;
1082	QByteArray ba(length, Qt::Uninitialized);
1083	char *end = convertFromUnicode(out: ba.data(), in, state, endian);
1084	ba.truncate(pos: end - ba.constData());
1085	return ba;
1086	}
1087
1088	char QUtf32::convertFromUnicode(char* out, QStringView in, QStringConverter::State state, DataEndianness endian)
1089	{
1090	Q_ASSERT(state);
1091
1092	bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1093	if (endian == DetectEndianness)
1094	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1095
1096	if (writeBom) {
1097	// set them up the BOM
1098	if (endian == BigEndianness) {
1099	out[`0`] = `0`;
1100	out[`1`] = `0`;
1101	out[`2`] = (char)`0xfe`;
1102	out[`3`] = (char)`0xff`;
1103	} else {
1104	out[`0`] = (char)`0xff`;
1105	out[`1`] = (char)`0xfe`;
1106	out[`2`] = `0`;
1107	out[`3`] = `0`;
1108	}
1109	out += `4`;
1110	state->internalState \|= HeaderDone;
1111	}
1112
1113	const QChar *uc = in.data();
1114	const QChar *end = in.data() + in.size();
1115	QChar ch;
1116	char32_t ucs4;
1117	if (state->remainingChars == `1`) {
1118	auto character = state->state_data[Data];
1119	Q_ASSERT(character <= `0xFFFF`);
1120	ch = QChar (character);
1121	// this is ugly, but shortcuts a whole lot of logic that would otherwise be required
1122	state->remainingChars = `0`;
1123	goto decode_surrogate;
1124	}
1125
1126	while (uc < end) {
1127	ch = *uc++;
1128	if (Q_LIKELY(!ch.isSurrogate())) {
1129	ucs4 = ch.unicode();
1130	} else if (Q_LIKELY(ch.isHighSurrogate())) {
1131	decode_surrogate:
1132	if (uc == end) {
1133	if (state->flags & QStringConverter::Flag::Stateless) {
1134	ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? `0` : QChar::ReplacementCharacter;
1135	} else {
1136	state->remainingChars = `1`;
1137	state->state_data[Data] = ch.unicode();
1138	return out;
1139	}
1140	} else if (uc->isLowSurrogate()) {
1141	ucs4 = QChar::surrogateToUcs4(high: ch, low: *uc++);
1142	} else {
1143	ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? `0` : QChar::ReplacementCharacter;
1144	}
1145	} else {
1146	ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? `0` : QChar::ReplacementCharacter;
1147	}
1148	if (endian == BigEndianness)
1149	qToBigEndian(src: ucs4, dest: out);
1150	else
1151	qToLittleEndian(src: ucs4, dest: out);
1152	out += `4`;
1153	}
1154
1155	return out;
1156	}
1157
1158	QString QUtf32::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1159	{
1160	QString result;
1161	result.resize(size: (in.size() + `7`) >> `1`); // worst case
1162	QChar *end = convertToUnicode(out: result.data(), in, state, endian);
1163	result.truncate(pos: end - result.constData());
1164	return result;
1165	}
1166
1167	QChar QUtf32::convertToUnicode(QChar out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1168	{
1169	qsizetype len = in.size();
1170	const char *chars = in.data();
1171
1172	Q_ASSERT(state);
1173	if (endian == DetectEndianness)
1174	endian = (DataEndianness)state->state_data[Endian];
1175
1176	const char *end = chars + len;
1177
1178	uchar tuple[`4`];
1179	memcpy(dest: tuple, src: &state->state_data[Data], n: `4`);
1180
1181	// make sure we can decode at least one char
1182	if (state->remainingChars + len < `4`) {
1183	if (len) {
1184	while (chars < end) {
1185	tuple[state->remainingChars] = *chars;
1186	++state->remainingChars;
1187	++chars;
1188	}
1189	Q_ASSERT(state->remainingChars < `4`);
1190	memcpy(dest: &state->state_data[Data], src: tuple, n: `4`);
1191	}
1192	return out;
1193	}
1194
1195	bool headerdone = state->internalState & HeaderDone;
1196	if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1197	headerdone = true;
1198
1199	qsizetype num = state->remainingChars;
1200	state->remainingChars = `0`;
1201
1202	if (!headerdone \|\| endian == DetectEndianness \|\| num) {
1203	while (num < `4`)
1204	tuple[num++] = *chars++;
1205	if (endian == DetectEndianness) {
1206	// someone set us up the BOM?
1207	if (tuple[`0`] == `0xff` && tuple[`1`] == `0xfe` && tuple[`2`] == `0` && tuple[`3`] == `0`) {
1208	endian = LittleEndianness;
1209	} else if (tuple[`0`] == `0` && tuple[`1`] == `0` && tuple[`2`] == `0xfe` && tuple[`3`] == `0xff`) {
1210	endian = BigEndianness;
1211	} else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1212	endian = BigEndianness;
1213	} else {
1214	endian = LittleEndianness;
1215	}
1216	}
1217	char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(src: tuple) : qFromLittleEndian<char32_t>(src: tuple);
1218	if (headerdone \|\| code != QChar::ByteOrderMark) {
1219	if (QChar::requiresSurrogates(ucs4: code)) {
1220	*out++ = QChar (QChar::highSurrogate(ucs4: code));
1221	*out++ = QChar (QChar::lowSurrogate(ucs4: code));
1222	} else {
1223	*out++ = QChar (code);
1224	}
1225	}
1226	num = `0`;
1227	} else if (endian == DetectEndianness) {
1228	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1229	}
1230	state->state_data[Endian] = endian;
1231	state->internalState \|= HeaderDone;
1232
1233	while (chars < end) {
1234	tuple[num++] = *chars++;
1235	if (num == `4`) {
1236	char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(src: tuple) : qFromLittleEndian<char32_t>(src: tuple);
1237	for (char16_t c : QChar::fromUcs4(c: code))
1238	*out++ = c;
1239	num = `0`;
1240	}
1241	}
1242
1243	if (num) {
1244	if (state->flags & QStringDecoder::Flag::Stateless) {
1245	*out++ = QChar::ReplacementCharacter;
1246	} else {
1247	state->state_data[Endian] = endian;
1248	state->remainingChars = num;
1249	memcpy(dest: &state->state_data[Data], src: tuple, n: `4`);
1250	}
1251	}
1252
1253	return out;
1254	}
1255	#endif // !QT_BOOTSTRAPPED
1256
1257	#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
1258	int QLocal8Bit::checkUtf8()
1259	{
1260	return GetACP() == CP_UTF8 ? `1` : -`1`;
1261	}
1262
1263	QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
1264	{
1265	return convertToUnicode_sys(in, CP_ACP, state);
1266	}
1267
1268	QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
1269	QStringConverter::State *state)
1270	{
1271	const char *mb = in.data();
1272	qsizetype mblen = in.size();
1273
1274	Q_ASSERT(state);
1275	qsizetype &invalidChars = state->invalidChars;
1276	using Flag = QStringConverter::Flag;
1277	const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull);
1278	const char16_t replacementCharacter = useNullForReplacement ? QChar::Null
1279	: QChar::ReplacementCharacter;
1280	if (state->flags & Flag::Stateless) {
1281	Q_ASSERT(state->remainingChars == `0`);
1282	state = nullptr;
1283	}
1284
1285	if (!mb \|\| !mblen)
1286	return QString();
1287
1288	// Use a local stack-buffer at first to allow us a decently large container
1289	// to avoid a lot of resizing, without also returning an overallocated
1290	// QString to the user for small strings.
1291	// Then we can be fast for small strings and take the hit of extra resizes
1292	// and measuring how much storage is needed for large strings.
1293	std::array<wchar_t, `4096`> buf;
1294	wchar_t *out = buf.data();
1295	qsizetype outlen = buf.size();
1296
1297	QString sp;
1298
1299	// Return a pointer to storage where we have enough space for `size`
1300	const auto growOut = [&](qsizetype size) -> std::tuple<wchar_t *, qsizetype> {
1301	if (outlen >= size)
1302	return {out, outlen};
1303	const bool wasStackBuffer = sp.isEmpty();
1304	const auto begin = wasStackBuffer ? buf.data() : reinterpret_cast<wchar_t *>(sp.data());
1305	const qsizetype offset = qsizetype(std::distance(begin, out));
1306	qsizetype newSize = `0`;
1307	if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1308	Q_CHECK_PTR(false);
1309	return {nullptr, `0`};
1310	}
1311	sp.resize(newSize);
1312	auto it = reinterpret_cast<wchar_t *>(sp.data());
1313	if (wasStackBuffer)
1314	it = std::copy_n(buf.data(), offset, it);
1315	else
1316	it += offset;
1317	return {it, size};
1318	};
1319
1320	// Convert the pending characters (if available)
1321	while (state && state->remainingChars && mblen) {
1322	QStringConverter::State localState;
1323	localState.flags = state->flags;
1324	// Use at most 6 characters as a guess for the longest encoded character
1325	// in any multibyte encoding.
1326	// Even with a total of 2 bytes of overhead that would leave around
1327	// 2^(4 8) possible characters*
1328	std::array<char, `6`> prev = {`0`};
1329	Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
1330	qsizetype index = `0`;
1331	for (; index < state->remainingChars; ++index)
1332	prev[index] = state->state_data[index];
1333	const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen);
1334	for (qsizetype i = `0`; i < toCopy; ++i, ++index)
1335	prev[index] = mb[i];
1336	mb += toCopy;
1337	mblen -= toCopy;
1338
1339	// Recursing:
1340	// Since we are using a clean local state it will try to decode what was
1341	// stored in our state + some extra octets from input (`prev`). If some
1342	// part fails we will have those characters stored in the local state's
1343	// storage, and we can extract those. It may also output some
1344	// replacement characters, which we'll count in the invalidChars.
1345	// In the best case we only do this once, but we will loop until we have
1346	// resolved all the remaining characters or we have run out of new input
1347	// in which case we may still have remaining characters.
1348	const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage,
1349	&localState);
1350	std::tie(out, outlen) = growOut(tmp.size());
1351	if (!out)
1352	return {};
1353	out = std::copy_n(reinterpret_cast<const wchar_t *>(tmp.constData()), tmp.size(), out);
1354	outlen -= tmp.size();
1355	const qsizetype tail = toCopy - localState.remainingChars;
1356	if (tail >= `0`) {
1357	// Everything left to process comes from `in`, so we can stop
1358	// looping. Adjust the window for `in` and unset remainingChars to
1359	// signal that we're done.
1360	mb -= localState.remainingChars;
1361	mblen += localState.remainingChars;
1362	localState.remainingChars = `0`;
1363	}
1364	state->remainingChars = localState.remainingChars;
1365	state->invalidChars += localState.invalidChars;
1366	std::copy_n(localState.state_data, state->remainingChars, state->state_data);
1367	}
1368
1369	Q_ASSERT(!state \|\| state->remainingChars == `0` \|\| mblen == `0`);
1370
1371	// Need it in this scope, since we try to decrease our window size if we
1372	// encounter an error
1373	int nextIn = q26::saturate_cast<int>(mblen);
1374	while (mblen > `0`) {
1375	std::tie(out, outlen) = growOut(`1`); // Need space for at least one character
1376	if (!out)
1377	return {};
1378	const int nextOut = q26::saturate_cast<int>(outlen);
1379	int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut);
1380	if (len) {
1381	mb += nextIn;
1382	mblen -= nextIn;
1383	out += len;
1384	outlen -= len;
1385	} else {
1386	int r = GetLastError();
1387	if (r == ERROR_INSUFFICIENT_BUFFER) {
1388	const int wclen = MultiByteToWideChar(codePage, `0`, mb, nextIn, `0`, `0`);
1389	std::tie(out, outlen) = growOut(wclen);
1390	if (!out)
1391	return {};
1392	} else if (r == ERROR_NO_UNICODE_TRANSLATION) {
1393	// Can't decode the current window, so either store the state,
1394	// reduce window size or output a replacement character.
1395
1396	// Check if we can store all remaining characters in the state
1397	// to be used next time we're called:
1398	if (state && mblen <= q20::ssize(state->state_data)) {
1399	state->remainingChars = mblen;
1400	std::copy_n(mb, mblen, state->state_data);
1401	mb += mblen;
1402	mblen = `0`;
1403	break;
1404	}
1405
1406	// .. if not, try to find the last valid character in the window
1407	// and try again with a shrunken window:
1408	if (nextIn > `1`) {
1409	// There may be some incomplete data at the end of our current
1410	// window, so decrease the window size and try again.
1411	// In the worst case scenario there is gigs of undecodable
1412	// garbage, but what are we supposed to do about that?
1413	const auto it = CharPrevExA(codePage, mb, mb + nextIn, `0`);
1414	if (it != mb)
1415	nextIn = int(it - mb);
1416	else
1417	--nextIn;
1418	continue;
1419	}
1420
1421	// Finally, we are forced to output a replacement character for
1422	// the first byte in the window:
1423	std::tie(out, outlen) = growOut(`1`);
1424	if (!out)
1425	return {};
1426	*out = replacementCharacter;
1427	++invalidChars;
1428	++out;
1429	--outlen;
1430	++mb;
1431	--mblen;
1432	} else {
1433	// Fail.
1434	qWarning("MultiByteToWideChar: Cannot convert multibyte text");
1435	break;
1436	}
1437	}
1438	nextIn = q26::saturate_cast<int>(mblen);
1439	}
1440
1441	if (sp.isEmpty()) {
1442	// We must have only used the stack buffer
1443	if (out != buf.data()) // else: we return null-string
1444	sp = QStringView(buf.data(), out).toString();
1445	} else{
1446	const auto begin = reinterpret_cast<wchar_t *>(sp.data());
1447	sp.truncate(std::distance(begin, out));
1448	}
1449
1450	if (sp.size() && sp.back().isNull())
1451	sp.chop(`1`);
1452
1453	if (!state && mblen > `0`) {
1454	// We have trailing character(s) that could not be converted, and
1455	// nowhere to cache them
1456	sp.resize(sp.size() + mblen, replacementCharacter);
1457	invalidChars += mblen;
1458	}
1459	return sp;
1460	}
1461
1462	QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
1463	{
1464	return convertFromUnicode_sys(in, CP_ACP, state);
1465	}
1466
1467	QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
1468	QStringConverter::State *state)
1469	{
1470	const wchar_t ch = reinterpret_cast<const* wchar_t *>(in.data());
1471	qsizetype uclen = in.size();
1472
1473	Q_ASSERT(state);
1474	// The Windows API has a boolean* out-parameter that says if a replacement*
1475	// character was used, but it gives us no way to know _how many_ were used.
1476	// Since we cannot simply scan the string for replacement characters
1477	// (which is potentially a question mark, and thus a valid character),
1478	// we simply do not track the number of invalid characters here.
1479	// auto &invalidChars = state->invalidChars;
1480
1481	using Flag = QStringConverter::Flag;
1482	if (state->flags & Flag::Stateless) { // temporary
1483	Q_ASSERT(state->remainingChars == `0`);
1484	state = nullptr;
1485	}
1486
1487	if (!ch)
1488	return QByteArray();
1489	if (uclen == `0`)
1490	return QByteArray("");
1491
1492	// Use a local stack-buffer at first to allow us a decently large container
1493	// to avoid a lot of resizing, without also returning an overallocated
1494	// QByteArray to the user for small strings.
1495	// Then we can be fast for small strings and take the hit of extra resizes
1496	// and measuring how much storage is needed for large strings.
1497	std::array<char, `4096`> buf;
1498	char *out = buf.data();
1499	qsizetype outlen = buf.size();
1500	QByteArray mb;
1501
1502	if (state && state->remainingChars > `0`) {
1503	Q_ASSERT(state->remainingChars == `1`);
1504	// Let's try to decode the pending character
1505	wchar_t wc[`2`] = { wchar_t(state->state_data[`0`]), ch[`0`] };
1506	// Check if the second character is a valid low surrogate,
1507	// otherwise we'll just decode the first character, for which windows
1508	// will output a replacement character.
1509	const bool validCodePoint = QChar::isLowSurrogate(wc[`1`]);
1510	int len = WideCharToMultiByte(codePage, `0`, wc, validCodePoint ? `2` : `1`, out, outlen, nullptr,
1511	nullptr);
1512	if (!len)
1513	return {}; // Cannot recover, and I refuse to believe it was a size limitation
1514	out += len;
1515	outlen -= len;
1516	if (validCodePoint) {
1517	++ch;
1518	--uclen;
1519	}
1520	state->remainingChars = `0`;
1521	state->state_data[`0`] = `0`;
1522	if (uclen == `0`)
1523	return QByteArrayView(buf.data(), len).toByteArray();
1524	}
1525
1526	if (state && QChar::isHighSurrogate(ch[uclen - `1`])) {
1527	// We can handle a missing low surrogate at the end of the string,
1528	// so if there is one, exclude it now and store it in the state.
1529	state->remainingChars = `1`;
1530	state->state_data[`0`] = ch[uclen - `1`];
1531	--uclen;
1532	if (uclen == `0`)
1533	return QByteArray();
1534	}
1535
1536	Q_ASSERT(uclen > `0`);
1537
1538	// Return a pointer to storage where we have enough space for `size`
1539	const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> {
1540	if (outlen >= size)
1541	return {out, outlen};
1542	const bool wasStackBuffer = mb.isEmpty();
1543	const auto begin = wasStackBuffer ? buf.data() : mb.data();
1544	const qsizetype offset = qsizetype(std::distance(begin, out));
1545	qsizetype newSize = `0`;
1546	if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1547	Q_CHECK_PTR(false);
1548	return {nullptr, `0`};
1549	}
1550	mb.resize(newSize);
1551	auto it = mb.data();
1552	if (wasStackBuffer)
1553	it = std::copy_n(buf.data(), offset, it);
1554	else
1555	it += offset;
1556	return {it, size};
1557	};
1558
1559	const auto getNextWindowSize = [&]() {
1560	int nextIn = q26::saturate_cast<int>(uclen);
1561	// The Windows API has some issues if the current window ends in the
1562	// middle of a surrogate pair, so we avoid that:
1563	if (nextIn > `1` && QChar::isHighSurrogate(ch[nextIn - `1`]))
1564	--nextIn;
1565	return nextIn;
1566	};
1567
1568	int len = `0`;
1569	while (uclen > `0`) {
1570	const int nextIn = getNextWindowSize();
1571	std::tie(out, outlen) = growOut(`1`); // We need at least one byte
1572	if (!out)
1573	return {};
1574	const int nextOut = q26::saturate_cast<int>(outlen);
1575	len = WideCharToMultiByte(codePage, `0`, ch, nextIn, out, nextOut, nullptr, nullptr);
1576	if (len > `0`) {
1577	ch += nextIn;
1578	uclen -= nextIn;
1579	out += len;
1580	outlen -= len;
1581	} else {
1582	int r = GetLastError();
1583	if (r == ERROR_INSUFFICIENT_BUFFER) {
1584	int neededLength = WideCharToMultiByte(codePage, `0`, ch, nextIn, nullptr, `0`,
1585	nullptr, nullptr);
1586	if (neededLength <= `0`) {
1587	// Fail. Observed with UTF8 where the input window was max int and ended in an
1588	// incomplete sequence, probably a Windows bug. We try to avoid that from
1589	// happening by reducing the window size in that case. But let's keep this
1590	// branch just in case of other bugs.
1591	#ifndef QT_NO_DEBUG
1592	r = GetLastError();
1593	fprintf(stderr,
1594	"WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
1595	#endif // !QT_NO_DEBUG
1596	break;
1597	}
1598	std::tie(out, outlen) = growOut(neededLength);
1599	if (!out)
1600	return {};
1601	// and try again...
1602	} else {
1603	// Fail. Probably can't happen in fact (dwFlags is 0).
1604	#ifndef QT_NO_DEBUG
1605	// Can't use qWarning(), as it'll recurse to handle %ls
1606	fprintf(stderr,
1607	"WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n", r,
1608	reinterpret_cast<const wchar_t *>(
1609	QStringView(ch, uclen).left(`100`).toString().utf16()));
1610	#endif
1611	break;
1612	}
1613	}
1614	}
1615	if (mb.isEmpty()) {
1616	// We must have only used the stack buffer
1617	if (out != buf.data()) // else: we return null-array
1618	mb = QByteArrayView(buf.data(), out).toByteArray();
1619	} else {
1620	mb.truncate(std::distance(mb.data(), out));
1621	}
1622	return mb;
1623	}
1624	#endif
1625
1626	void QStringConverter::State::clear() noexcept
1627	{
1628	if (clearFn)
1629	clearFn(this);
1630	else
1631	state_data[`0`] = state_data[`1`] = state_data[`2`] = state_data[`3`] = `0`;
1632	remainingChars = `0`;
1633	invalidChars = `0`;
1634	internalState = `0`;
1635	}
1636
1637	void QStringConverter::State::reset() noexcept
1638	{
1639	if (flags & Flag::UsesIcu) {
1640	#if QT_CONFIG(icu)
1641	UConverter converter = static_cast<UConverter >(d[`0`]);
1642	if (converter)
1643	ucnv_reset(converter);
1644	#else
1645	Q_UNREACHABLE();
1646	#endif
1647	} else {
1648	clear();
1649	}
1650	}
1651
1652	#ifndef QT_BOOTSTRAPPED
1653	static QChar fromUtf16(QChar out, QByteArrayView in, QStringConverter::State *state)
1654	{
1655	return QUtf16::convertToUnicode(out, in, state, endian: DetectEndianness);
1656	}
1657
1658	static char toUtf16(char* out, QStringView in, QStringConverter::State state)
1659	{
1660	return QUtf16::convertFromUnicode(out, in, state, endian: DetectEndianness);
1661	}
1662
1663	static QChar fromUtf16BE(QChar out, QByteArrayView in, QStringConverter::State *state)
1664	{
1665	return QUtf16::convertToUnicode(out, in, state, endian: BigEndianness);
1666	}
1667
1668	static char toUtf16BE(char* out, QStringView in, QStringConverter::State state)
1669	{
1670	return QUtf16::convertFromUnicode(out, in, state, endian: BigEndianness);
1671	}
1672
1673	static QChar fromUtf16LE(QChar out, QByteArrayView in, QStringConverter::State *state)
1674	{
1675	return QUtf16::convertToUnicode(out, in, state, endian: LittleEndianness);
1676	}
1677
1678	static char toUtf16LE(char* out, QStringView in, QStringConverter::State state)
1679	{
1680	return QUtf16::convertFromUnicode(out, in, state, endian: LittleEndianness);
1681	}
1682
1683	static QChar fromUtf32(QChar out, QByteArrayView in, QStringConverter::State *state)
1684	{
1685	return QUtf32::convertToUnicode(out, in, state, endian: DetectEndianness);
1686	}
1687
1688	static char toUtf32(char* out, QStringView in, QStringConverter::State state)
1689	{
1690	return QUtf32::convertFromUnicode(out, in, state, endian: DetectEndianness);
1691	}
1692
1693	static QChar fromUtf32BE(QChar out, QByteArrayView in, QStringConverter::State *state)
1694	{
1695	return QUtf32::convertToUnicode(out, in, state, endian: BigEndianness);
1696	}
1697
1698	static char toUtf32BE(char* out, QStringView in, QStringConverter::State state)
1699	{
1700	return QUtf32::convertFromUnicode(out, in, state, endian: BigEndianness);
1701	}
1702
1703	static QChar fromUtf32LE(QChar out, QByteArrayView in, QStringConverter::State *state)
1704	{
1705	return QUtf32::convertToUnicode(out, in, state, endian: LittleEndianness);
1706	}
1707
1708	static char toUtf32LE(char* out, QStringView in, QStringConverter::State state)
1709	{
1710	return QUtf32::convertFromUnicode(out, in, state, endian: LittleEndianness);
1711	}
1712	#endif // !QT_BOOTSTRAPPED
1713
1714	char QLatin1::convertFromUnicode(char* out, QStringView in, QStringConverter::State state) noexcept
1715	{
1716	Q_ASSERT(state);
1717	if (state->flags & QStringConverter::Flag::Stateless) // temporary
1718	state = nullptr;
1719
1720	const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? `0` : `'?'`;
1721	qsizetype invalid = `0`;
1722	for (qsizetype i = `0`; i < in.size(); ++i) {
1723	if (in [i] > QChar (`0xff`)) {
1724	*out = replacement;
1725	++invalid;
1726	} else {
1727	out = (char*)in [i].cell();
1728	}
1729	++out;
1730	}
1731	if (state)
1732	state->invalidChars += invalid;
1733	return out;
1734	}
1735
1736	static QChar fromLocal8Bit(QChar out, QByteArrayView in, QStringConverter::State *state)
1737	{
1738	QString s = QLocal8Bit::convertToUnicode(in, state);
1739	memcpy(dest: out, src: s.constData(), n: s.size()*sizeof(QChar));
1740	return out + s.size();
1741	}
1742
1743	static char toLocal8Bit(char* out, QStringView in, QStringConverter::State state)
1744	{
1745	QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
1746	memcpy(dest: out, src: s.constData(), n: s.size());
1747	return out + s.size();
1748	}
1749
1750
1751	static qsizetype fromUtf8Len(qsizetype l) { return l + `1`; }
1752	static qsizetype toUtf8Len(qsizetype l) { return `3`*(l + `1`); }
1753
1754	#ifndef QT_BOOTSTRAPPED
1755	static qsizetype fromUtf16Len(qsizetype l) { return l/`2` + `2`; }
1756	static qsizetype toUtf16Len(qsizetype l) { return `2`*(l + `1`); }
1757
1758	static qsizetype fromUtf32Len(qsizetype l) { return l/`2` + `2`; }
1759	static qsizetype toUtf32Len(qsizetype l) { return `4`*(l + `1`); }
1760	#endif
1761
1762	static qsizetype fromLatin1Len(qsizetype l) { return l + `1`; }
1763	static qsizetype toLatin1Len(qsizetype l) { return l + `1`; }
1764
1765
1766
/*!
1768	\class QStringConverterBase
1769	\internal
1770
1771	Just a common base class for QStringConverter and QTextCodec
1772	*/
1773
/*!
1775	\class QStringConverter
1776	\inmodule QtCore
1777	\brief The QStringConverter class provides a base class for encoding and decoding text.
1778	\reentrant
1779	\ingroup i18n
1780
1781	Qt uses UTF-16 to store, draw and manipulate strings. In many
1782	situations you may wish to deal with data that uses a different
1783	encoding. Most text data transferred over files and network connections is encoded
1784	in UTF-8.
1785
1786	The QStringConverter class is a base class for the \l {QStringEncoder} and
1787	\l {QStringDecoder} classes that help with converting between different
1788	text encodings. QStringDecoder can decode a string from an encoded representation
1789	into UTF-16, the format Qt uses internally. QStringEncoder does the opposite
1790	operation, encoding UTF-16 encoded data (usually in the form of a QString) to
1791	the requested encoding.
1792
1793	The following encodings are always supported:
1794
1795	\list
1796	\li UTF-8
1797	\li UTF-16
1798	\li UTF-16BE
1799	\li UTF-16LE
1800	\li UTF-32
1801	\li UTF-32BE
1802	\li UTF-32LE
1803	\li ISO-8859-1 (Latin-1)
1804	\li The system encoding
1805	\endlist
1806
1807	QStringConverter may support more encodings depending on how Qt was
1808	compiled. If more codecs are supported, they can be listed using
1809	availableCodecs().
1810
1811	\l {QStringConverter}s can be used as follows to convert some encoded
1812	string to and from UTF-16.
1813
1814	Suppose you have some string encoded in UTF-8, and
1815	want to convert it to a QString. The simple way
1816	to do it is to use a \l {QStringDecoder} like this:
1817
1818	\snippet code/src_corelib_text_qstringconverter.cpp 0
1819
1820	After this, \c string holds the text in decoded form.
1821	Converting a string from Unicode to the local encoding is just as
1822	easy using the \l {QStringEncoder} class:
1823
1824	\snippet code/src_corelib_text_qstringconverter.cpp 1
1825
1826	To read or write text files in various encodings, use QTextStream and
1827	its \l{QTextStream::setEncoding()}{setEncoding()} function.
1828
1829	Some care must be taken when trying to convert the data in chunks,
1830	for example, when receiving it over a network. In such cases it is
1831	possible that a multi-byte character will be split over two
1832	chunks. At best this might result in the loss of a character and
1833	at worst cause the entire conversion to fail.
1834
1835	Both QStringEncoder and QStringDecoder make this easy, by tracking
1836	this in an internal state. So simply calling the encoder or decoder
1837	again with the next chunk of data will automatically continue encoding
1838	or decoding the data correctly:
1839
1840	\snippet code/src_corelib_text_qstringconverter.cpp 2
1841
1842	The QStringDecoder object maintains state between chunks and therefore
1843	works correctly even if a multi-byte character is split between
1844	chunks.
1845
1846	QStringConverter objects can't be copied because of their internal state, but
1847	can be moved.
1848
1849	\sa QTextStream, QStringDecoder, QStringEncoder
1850	*/
1851
/*!
1853	\enum QStringConverter::Flag
1854
1855	\value Default Default conversion rules apply.
1856	\value ConvertInvalidToNull If this flag is set, each invalid input
1857	character is output as a null character. If it is not set,
1858	invalid input characters are represented as QChar::ReplacementCharacter
1859	if the output encoding can represent that character, otherwise as a question mark.
1860	\value WriteBom When converting from a QString to an output encoding, write a QChar::ByteOrderMark as the first
1861	character if the output encoding supports this. This is the case for UTF-8, UTF-16 and UTF-32
1862	encodings.
\value ConvertInitialBom When converting from an input encoding to a QString the QStringDecoder usually skips a
leading QChar::ByteOrderMark. When this flag is set, the byte order mark will not be
skipped, but converted to UTF-16 and inserted at the start of the created QString.
1866	\value Stateless Ignore possible converter states between different function calls
1867	to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
1868	sequence of data is encountered.
1869	\omitvalue UsesIcu
1870	*/
1871
/*!
1873	\enum QStringConverter::Encoding
1874	\value Utf8 Create a converter to or from UTF-8
1875	\value Utf16 Create a converter to or from UTF-16. When decoding, the byte order will get automatically
1876	detected by a leading byte order mark. If none exists or when encoding, the system byte order will
1877	be assumed.
1878	\value Utf16BE Create a converter to or from big-endian UTF-16.
1879	\value Utf16LE Create a converter to or from little-endian UTF-16.
1880	\value Utf32 Create a converter to or from UTF-32. When decoding, the byte order will get automatically
1881	detected by a leading byte order mark. If none exists or when encoding, the system byte order will
1882	be assumed.
1883	\value Utf32BE Create a converter to or from big-endian UTF-32.
1884	\value Utf32LE Create a converter to or from little-endian UTF-32.
1885	\value Latin1 Create a converter to or from ISO-8859-1 (Latin1).
\value System Create a converter to or from the underlying encoding of the
operating system's locale. This is always assumed to be UTF-8 for Unix-based
systems. On Windows, this converts to and from the locale code page.
1889	\omitvalue LastEncoding
1890	*/
1891
/*!
    \struct QStringConverter::Interface
    \internal
*/
1896
1897	const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + `1`] =
1898	{
1899	{ .name: "UTF-8", .toUtf16: QUtf8::convertToUnicode, .toUtf16Len: fromUtf8Len, .fromUtf16: QUtf8::convertFromUnicode, .fromUtf16Len: toUtf8Len },
1900	#ifndef QT_BOOTSTRAPPED
1901	{ .name: "UTF-16", .toUtf16: fromUtf16, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16, .fromUtf16Len: toUtf16Len },
1902	{ .name: "UTF-16LE", .toUtf16: fromUtf16LE, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16LE, .fromUtf16Len: toUtf16Len },
1903	{ .name: "UTF-16BE", .toUtf16: fromUtf16BE, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16BE, .fromUtf16Len: toUtf16Len },
1904	{ .name: "UTF-32", .toUtf16: fromUtf32, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32, .fromUtf16Len: toUtf32Len },
1905	{ .name: "UTF-32LE", .toUtf16: fromUtf32LE, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32LE, .fromUtf16Len: toUtf32Len },
1906	{ .name: "UTF-32BE", .toUtf16: fromUtf32BE, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32BE, .fromUtf16Len: toUtf32Len },
1907	#endif
1908	{ .name: "ISO-8859-1", .toUtf16: QLatin1::convertToUnicode, .toUtf16Len: fromLatin1Len, .fromUtf16: QLatin1::convertFromUnicode, .fromUtf16Len: toLatin1Len },
1909	{ .name: "Locale", .toUtf16: fromLocal8Bit, .toUtf16Len: fromUtf8Len, .fromUtf16: toLocal8Bit, .fromUtf16Len: toUtf8Len }
1910	};
1911
1912	// match names case insensitive and skipping '-' and '_'
1913	template <typename Char>
1914	static bool nameMatch_impl_impl(const char a, const* Char b, const* Char *b_end)
1915	{
1916	do {
1917	while (a == `'-'` \|\| a == `'_'`)
1918	++a;
1919	while (b != b_end && (b == Char{`'-'`} \|\| b == Char{`'_'`}))
1920	++b;
1921	if (!a && b == b_end) // end of both strings*
1922	return true;
1923	if (char16_t(*b) > `127`)
1924	return false; // non-US-ASCII cannot match US-ASCII (prevents narrowing below)
1925	} while (QtMiscUtils::toAsciiLower(ch: a++) == QtMiscUtils::toAsciiLower(ch: char(b++)));
1926
1927	return false;
1928	}
1929
1930	static bool nameMatch_impl(const char *a, QLatin1StringView b)
1931	{
1932	return nameMatch_impl_impl(a, b: b.begin(), b_end: b.end());
1933	}
1934
1935	static bool nameMatch_impl(const char *a, QUtf8StringView b)
1936	{
1937	return nameMatch_impl(a, b: QLatin1StringView {QByteArrayView {b}});
1938	}
1939
1940	static bool nameMatch_impl(const char *a, QStringView b)
1941	{
1942	return nameMatch_impl_impl(a, b: b.utf16(), b_end: b.utf16() + b.size()); // uses char16_t, not QChar
1943	}
1944
1945	static bool nameMatch(const char *a, QAnyStringView b)
1946	{
1947	return b.visit(v: [a](auto b) { return nameMatch_impl(a, b); });
1948	}
1949
1950
/*!
    \fn constexpr QStringConverter::QStringConverter()
    \internal
*/
1955
/*!
    \fn constexpr QStringConverter::QStringConverter(Encoding, Flags)
    \internal
*/
1960
1961
1962	#if QT_CONFIG(icu)
1963	// only derives from QStringConverter to get access to protected types
1964	struct QStringConverterICU : QStringConverter
1965	{
1966	static void clear_function(QStringConverterBase::State state) noexcept*
1967	{
1968	ucnv_close(converter: static_cast<UConverter *>(state->d[`0`]));
1969	state->d[`0`] = nullptr;
1970	}
1971
1972	static void ensureConverter(QStringConverter::State *state)
1973	{
1974	// old code might reset the state via clear instead of reset
1975	// in that case, the converter has been closed, and we have to reopen it
1976	if (state->d[`0`] == nullptr)
1977	state->d[`0`] = createConverterForName(name: static_cast<const char *>(state->d[`1`]), state);
1978	}
1979
1980	static QChar toUtf16(QChar out, QByteArrayView in, QStringConverter::State *state)
1981	{
1982	ensureConverter(state);
1983
1984	auto icu_conv = static_cast<UConverter *>(state->d[`0`]);
1985	UErrorCode err = U_ZERO_ERROR;
1986	auto source = in.data();
1987	auto sourceLimit = in.data() + in.size();
1988
1989	qsizetype length = toLen(inLength: in.size());
1990
1991	UChar target = reinterpret_cast<UChar >(out);
1992	auto targetLimit = target + length;
1993	// We explicitly clean up anyway, so no need to set flush to true,
1994	// which would just reset the converter.
1995	UBool flush = false;
1996
1997	// If the QStringConverter was moved, the state that we used as a context is stale now.
1998	UConverterToUCallback action;
1999	const void *context;
2000	ucnv_getToUCallBack(converter: icu_conv, action: &action, context: &context);
2001	if (context != state)
2002	ucnv_setToUCallBack(converter: icu_conv, newAction: action, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &err);
2003
2004	ucnv_toUnicode(converter: icu_conv, target: &target, targetLimit, source: &source, sourceLimit, offsets: nullptr, flush, err: &err);
2005	// We did reserve enough space:
2006	Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
2007	if (state->flags.testFlag(flag: QStringConverter::Flag::Stateless)) {
2008	if (auto leftOver = ucnv_toUCountPending(cnv: icu_conv, status: &err)) {
2009	ucnv_reset(converter: icu_conv);
2010	state->invalidChars += leftOver;
2011	}
2012	}
2013	return reinterpret_cast<QChar *>(target);
2014	}
2015
2016	static char fromUtf16(char* out, QStringView in, QStringConverter::State state)
2017	{
2018	ensureConverter(state);
2019	auto icu_conv = static_cast<UConverter *>(state->d[`0`]);
2020	UErrorCode err = U_ZERO_ERROR;
2021	auto source = reinterpret_cast<const UChar *>(in.data());
2022	auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size());
2023
2024	qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv));
2025
2026	char *target = out;
2027	char *targetLimit = out + length;
2028	UBool flush = false;
2029
2030	// If the QStringConverter was moved, the state that we used as a context is stale now.
2031	UConverterFromUCallback action;
2032	const void *context;
2033	ucnv_getFromUCallBack(converter: icu_conv, action: &action, context: &context);
2034	if (context != state)
2035	ucnv_setFromUCallBack(converter: icu_conv, newAction: action, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &err);
2036
2037	ucnv_fromUnicode(converter: icu_conv, target: &target, targetLimit, source: &source, sourceLimit, offsets: nullptr, flush, err: &err);
2038	// We did reserve enough space:
2039	Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
2040	if (state->flags.testFlag(flag: QStringConverter::Flag::Stateless)) {
2041	if (auto leftOver = ucnv_fromUCountPending(cnv: icu_conv, status: &err)) {
2042	ucnv_reset(converter: icu_conv);
2043	state->invalidChars += leftOver;
2044	}
2045	}
2046	return target;
2047	}
2048
2049	Q_DISABLE_COPY_MOVE(QStringConverterICU)
2050
2051	template<qsizetype X>
2052	static qsizetype fromLen(qsizetype inLength)
2053	{
2054	return X * inLength * sizeof(UChar);
2055	}
2056
2057	static qsizetype toLen(qsizetype inLength)
2058	{
2059
2060	/ Assumption: each input char might map to a different codepoint*
2061	Each codepoint can take up to 4 bytes == 2 QChar
2062	We can ignore reserving space for a BOM, as only UTF encodings use one
2063	and those are not handled by the ICU converter.
2064	*/
2065	return `2` * inLength;
2066	}
2067
2068	static constexpr QStringConverter::Interface forLength[] = {
2069	{.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<`1`>},
2070	{.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<`2`>},
2071	{.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<`3`>},
2072	{.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<`4`>},
2073	{.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<`5`>},
2074	{.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<`6`>},
2075	{.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<`7`>},
2076	{.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<`8`>}
2077	};
2078
2079	static UConverter createConverterForName(const* char name, const* State *state)
2080	{
2081	Q_ASSERT(name);
2082	Q_ASSERT(state);
2083	UErrorCode status = U_ZERO_ERROR;
2084	UConverter *conv = ucnv_open(converterName: name, err: &status);
2085	if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
2086	ucnv_close(converter: conv);
2087	return nullptr;
2088	}
2089
2090	if (state->flags.testFlag(flag: Flag::ConvertInvalidToNull)) {
2091	UErrorCode error = U_ZERO_ERROR;
2092
2093	auto nullToSubstituter = [](const void context, UConverterToUnicodeArgs toUArgs,
2094	const char *, int32_t length,
2095	UConverterCallbackReason reason, UErrorCode *err) {
2096	if (reason <= UCNV_IRREGULAR) {
2097	*err = U_ZERO_ERROR;
2098	UChar c = `'\0'`;
2099	ucnv_cbToUWriteUChars(args: toUArgs, source: &c, length: `1`, offsetIndex: `0`, err);
2100	// Recover outer scope's state (which isn't const) from context:
2101	auto state = const_cast<State >(static_cast<const* State *>(context));
2102	state->invalidChars += length;
2103	}
2104	};
2105	ucnv_setToUCallBack(converter: conv, newAction: nullToSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2106
2107	auto nullFromSubstituter = [](const void context, UConverterFromUnicodeArgs fromUArgs,
2108	const UChar *, int32_t length,
2109	UChar32, UConverterCallbackReason reason, UErrorCode *err) {
2110	if (reason <= UCNV_IRREGULAR) {
2111	*err = U_ZERO_ERROR;
2112	const UChar replacement[] = { `0` };
2113	const UChar *stringBegin = std::begin(arr: replacement);
2114	ucnv_cbFromUWriteUChars(args: fromUArgs, source: &stringBegin, sourceLimit: std::end(arr: replacement), offsetIndex: `0`, err);
2115	// Recover outer scope's state (which isn't const) from context:
2116	auto state = const_cast<State >(static_cast<const* State *>(context));
2117	state->invalidChars += length;
2118	}
2119	};
2120	ucnv_setFromUCallBack(converter: conv, newAction: nullFromSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2121	} else {
2122	UErrorCode error = U_ZERO_ERROR;
2123
2124	auto qmarkToSubstituter = [](const void context, UConverterToUnicodeArgs toUArgs,
2125	const char *codeUnits,int32_t length,
2126	UConverterCallbackReason reason, UErrorCode *err) {
2127	if (reason <= UCNV_IRREGULAR) {
2128	// Recover outer scope's state (which isn't const) from context:
2129	auto state = const_cast<State >(static_cast<const* State *>(context));
2130	state->invalidChars += length;
2131	}
2132	// use existing ICU callback for logic
2133	UCNV_TO_U_CALLBACK_SUBSTITUTE(context: nullptr, toUArgs, codeUnits, length, reason, err);
2134
2135	};
2136	ucnv_setToUCallBack(converter: conv, newAction: qmarkToSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2137
2138	auto qmarkFromSubstituter = [](const void context, UConverterFromUnicodeArgs fromUArgs,
2139	const UChar *codeUnits, int32_t length,
2140	UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
2141	if (reason <= UCNV_IRREGULAR) {
2142	// Recover outer scope's state (which isn't const) from context:
2143	auto state = const_cast<State >(static_cast<const* State *>(context));
2144	state->invalidChars += length;
2145	}
2146	// use existing ICU callback for logic
2147	UCNV_FROM_U_CALLBACK_SUBSTITUTE(context: nullptr, fromUArgs, codeUnits, length,
2148	codePoint, reason, err);
2149	};
2150	ucnv_setFromUCallBack(converter: conv, newAction: qmarkFromSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2151	}
2152	return conv;
2153	}
2154
2155	static std::string nul_terminate_impl(QLatin1StringView name)
2156	{ return name.isNull() ? std::string () : std::string {name.data(), size_t(name.size())}; }
2157
2158	static std::string nul_terminate_impl(QUtf8StringView name)
2159	{ return nul_terminate_impl(name: QLatin1StringView {QByteArrayView {name}}); }
2160
2161	static std::string nul_terminate_impl(QStringView name)
2162	{
2163	std::string result;
2164	const auto convert = [&](char *p, size_t n) {
2165	const auto sz = QLatin1::convertFromUnicode(out: p, in: name) - p;
2166	Q_ASSERT(size_t(sz) <= n);
2167	return sz;
2168	};
2169	#ifdef __cpp_lib_string_resize_and_overwrite
2170	result.resize_and_overwrite(size_t(name.size()), convert);
2171	#else
2172	result.resize(n: size_t(name.size()));
2173	result.resize(n: convert(result.data(), result.size()));
2174	#endif // __cpp_lib_string_resize_and_overwrite
2175	return result;
2176	}
2177
2178	static std::string nul_terminate(QAnyStringView name)
2179	{ return name.visit(v: [](auto name) { return nul_terminate_impl(name); }); }
2180
2181	static const QStringConverter::Interface *
2182	make_icu_converter(QStringConverterBase::State *state, QAnyStringView name)
2183	{ return make_icu_converter(state, name: nul_terminate(name).data()); }
2184
2185	static const QStringConverter::Interface *make_icu_converter(
2186	QStringConverterBase::State *state,
2187	const char *name)
2188	{
2189	UErrorCode status = U_ZERO_ERROR;
2190	UConverter *conv = createConverterForName(name, state);
2191	if (!conv)
2192	return nullptr;
2193
2194	const char *icuName = ucnv_getName(converter: conv, err: &status);
2195	// ucnv_getStandardName returns a name which is owned by the library
2196	// we can thus store it in the state without worrying aobut its lifetime
2197	const char *persistentName = ucnv_getStandardName(name: icuName, standard: "MIME", pErrorCode: &status);
2198	if (U_FAILURE(code: status) \|\| !persistentName) {
2199	status = U_ZERO_ERROR;
2200	persistentName = ucnv_getStandardName(name: icuName, standard: "IANA", pErrorCode: &status);
2201	}
2202	state->d[`1`] = const_cast<char *>(persistentName);
2203	state->d[`0`] = conv;
2204	state->flags \|= QStringConverterBase::Flag::UsesIcu;
2205	qsizetype maxCharSize = ucnv_getMaxCharSize(converter: conv);
2206	state->clearFn = QStringConverterICU::clear_function;
2207	if (maxCharSize > `8` \|\| maxCharSize < `1`) {
2208	qWarning(msg: "Encountered unexpected codec \"%s\" which requires >8x space", name);
2209	return nullptr;
2210	} else {
2211	return &forLength[maxCharSize - `1`];
2212	}
2213
2214	}
2215
2216	};
2217	#endif
2218
2219	/!*
2220	\internal
2221	*/
2222	QStringConverter::QStringConverter(QAnyStringView name, Flags f)
2223	: iface(nullptr), state (f)
2224	{
2225	auto e = encodingForName(name);
2226	if (e)
2227	iface = encodingInterfaces + int(*e);
2228	#if QT_CONFIG(icu)
2229	else
2230	iface = QStringConverterICU::make_icu_converter(state: &state, name);
2231	#endif
2232	}
2233
2234
2235	const char QStringConverter::name() const* noexcept
2236	{
2237	if (!iface)
2238	return nullptr;
2239	if (state.flags & QStringConverter::Flag::UsesIcu) {
2240	#if QT_CONFIG(icu)
2241	return static_cast<const char*>(state.d[`1`]);
2242	#else
2243	return nullptr;
2244	#endif
2245	} else {
2246	return iface->name;
2247	}
2248	}
2249
/*!
2251	\fn bool QStringConverter::isValid() const
2252
2253	Returns true if this is a valid string converter that can be used for encoding or
2254	decoding text.
2255
2256	Default constructed string converters or converters constructed with an unsupported
2257	name are not valid.
2258	*/
2259
/*!
2261	\fn void QStringConverter::resetState()
2262
2263	Resets the internal state of the converter, clearing potential errors or partial
2264	conversions.
2265	*/
2266
/*!
2268	\fn bool QStringConverter::hasError() const
2269
2270	Returns true if a conversion could not correctly convert a character. This could for example
2271	get triggered by an invalid UTF-8 sequence or when a character can't get converted due to
2272	limitations in the target encoding.
2273	*/
2274
/*!
\fn const char *QStringConverter::name() const
2277
2278	Returns the canonical name of the encoding this QStringConverter can encode or decode.
2279	Returns a nullptr if the converter is not valid.
2280	The returned name is UTF-8 encoded.
2281
2282	\sa isValid()
2283	*/
2284
2285	/!*
2286	Convert \a name to the corresponding \l Encoding member, if there is one.
2287
2288	If the \a name is not the name of a codec listed in the Encoding enumeration,
2289	\c{std::nullopt} is returned. Such a name may, none the less, be accepted by
2290	the QStringConverter constructor when Qt is built with ICU, if ICU provides a
2291	converter with the given name.
2292
2293	\note In Qt versions prior to 6.8, this function took only a \c{const char },*
2294	which was expected to be UTF-8-encoded.
2295	*/
2296	std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(QAnyStringView name) noexcept
2297	{
2298	if (name.isEmpty())
2299	return std::nullopt;
2300	for (qsizetype i = `0`; i < LastEncoding + `1`; ++i) {
2301	if (nameMatch(a: encodingInterfaces[i].name, b: name))
2302	return QStringConverter::Encoding(i);
2303	}
2304	if (nameMatch(a: "latin1", b: name))
2305	return QStringConverter::Latin1;
2306	return std::nullopt;
2307	}
2308
2309	#ifndef QT_BOOTSTRAPPED
2310	/!*
2311	Returns the encoding for the content of \a data if it can be determined.
2312	\a expectedFirstCharacter can be passed as an additional hint to help determine
2313	the encoding.
2314
2315	The returned optional is empty, if the encoding is unclear.
2316	*/
2317	std::optional<QStringConverter::Encoding>
2318	QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter) noexcept
2319	{
2320	// someone set us up the BOM?
2321	qsizetype arraySize = data.size();
2322	if (arraySize > `3`) {
2323	char32_t uc = qFromUnaligned<char32_t>(src: data.data());
2324	if (uc == qToBigEndian(source: char32_t(QChar::ByteOrderMark)))
2325	return QStringConverter::Utf32BE;
2326	if (uc == qToLittleEndian(source: char32_t(QChar::ByteOrderMark)))
2327	return QStringConverter::Utf32LE;
2328	if (expectedFirstCharacter) {
2329	// catch also anything starting with the expected character
2330	if (qToLittleEndian(source: uc) == expectedFirstCharacter)
2331	return QStringConverter::Utf32LE;
2332	else if (qToBigEndian(source: uc) == expectedFirstCharacter)
2333	return QStringConverter::Utf32BE;
2334	}
2335	}
2336
2337	if (arraySize > `2`) {
2338	if (memcmp(s1: data.data(), s2: utf8bom, n: sizeof(utf8bom)) == `0`)
2339	return QStringConverter::Utf8;
2340	}
2341
2342	if (arraySize > `1`) {
2343	char16_t uc = qFromUnaligned<char16_t>(src: data.data());
2344	if (uc == qToBigEndian(source: char16_t(QChar::ByteOrderMark)))
2345	return QStringConverter::Utf16BE;
2346	if (uc == qToLittleEndian(source: char16_t(QChar::ByteOrderMark)))
2347	return QStringConverter::Utf16LE;
2348	if (expectedFirstCharacter) {
2349	// catch also anything starting with the expected character
2350	if (qToLittleEndian(source: uc) == expectedFirstCharacter)
2351	return QStringConverter::Utf16LE;
2352	else if (qToBigEndian(source: uc) == expectedFirstCharacter)
2353	return QStringConverter::Utf16BE;
2354	}
2355	}
2356	return std::nullopt;
2357	}
2358
2359	static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
2360	{
2361	static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher(pattern: "meta ");
2362	static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher(pattern: "charset=");
2363
2364	QByteArray header = data.first(n: qMin(a: data.size(), b: qsizetype(`1024`))).toByteArray().toLower();
2365	qsizetype pos = metaSearcher.indexIn(haystack: header);
2366	if (pos != -`1`) {
2367	pos = charsetSearcher.indexIn(haystack: header, from: pos);
2368	if (pos != -`1`) {
2369	pos += qstrlen(str: "charset=");
2370	if (pos < header.size() && (header.at(i: pos) == `'\"'` \|\| header.at(i: pos) == `'\''`))
2371	++pos;
2372
2373	qsizetype pos2 = pos;
2374	// The attribute can be closed with either """, "'", ">" or "/",
2375	// none of which are valid charset characters.
2376	while (++pos2 < header.size()) {
2377	char ch = header.at(i: pos2);
2378	if (ch == `'\"'` \|\| ch == `'\''` \|\| ch == `'>'` \|\| ch == `'/'`) {
2379	QByteArray name = header.mid(index: pos, len: pos2 - pos);
2380	qsizetype colon = name.indexOf(c: `':'`);
2381	if (colon > `0`)
2382	name = name.left(n: colon);
2383	name = name.simplified();
2384	if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
2385	name = QByteArrayLiteral("UTF-8");
2386	if (!name.isEmpty())
2387	return name;
2388	}
2389	}
2390	}
2391	}
2392	return QByteArray ();
2393	}
2394
2395	/!*
2396	Tries to determine the encoding of the HTML in \a data by looking at leading byte
2397	order marks or a charset specifier in the HTML meta tag. If the optional is empty,
2398	the encoding specified is not supported by QStringConverter. If no encoding is
2399	detected, the method returns Utf8.
2400
2401	\sa QStringDecoder::decoderForHtml()
2402	*/
2403	std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
2404	{
2405	// determine charset
2406	std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2407	if (encoding)
2408	// trust the initial BOM
2409	return encoding;
2410
2411	QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2412	if (!encodingTag.isEmpty())
2413	return encodingForName(name: encodingTag);
2414
2415	return Utf8;
2416	}
2417
2418	static qsizetype availableCodecCount()
2419	{
2420	#if !QT_CONFIG(icu)
2421	return QStringConverter::Encoding::LastEncoding;
2422	#else
2423	/ icu contains also the names of what Qt provides*
2424	except for the special Locale one (so add one for it)
2425	*/
2426	return `1` + ucnv_countAvailable();
2427	#endif
2428	}
2429
2430	/!*
2431	Returns a list of names of supported codecs. The names returned
2432	by this function can be passed to QStringEncoder's and
2433	QStringDecoder's constructor to create a en- or decoder for
2434	the given codec.
2435
2436	This function may be used to obtain a listing of additional codecs beyond
2437	the standard ones. Support for additional codecs requires Qt be compiled
2438	with support for the ICU library.
2439
2440	\note The order of codecs is an internal implementation detail
2441	and not guaranteed to be stable.
2442	*/
2443	QStringList QStringConverter::availableCodecs()
2444	{
2445	auto availableCodec = [](qsizetype index) -> QString
2446	{
2447	#if !QT_CONFIG(icu)
2448	return QString::fromLatin1(encodingInterfaces[index].name);
2449	#else
2450	if (index == `0`) // "Locale", not provided by icu
2451	return QString::fromLatin1(
2452	ba: encodingInterfaces[QStringConverter::Encoding::System].name);
2453	// this mirrors the setup we do to set a converters name
2454	UErrorCode status = U_ZERO_ERROR;
2455	auto icuName = ucnv_getAvailableName(n: int32_t(index - `1`));
2456	const char *standardName = ucnv_getStandardName(name: icuName, standard: "MIME", pErrorCode: &status);
2457	if (U_FAILURE(code: status) \|\| !standardName) {
2458	status = U_ZERO_ERROR;
2459	standardName = ucnv_getStandardName(name: icuName, standard: "IANA", pErrorCode: &status);
2460	}
2461	if (!standardName)
2462	standardName = icuName;
2463	return QString::fromLatin1(ba: standardName);
2464	#endif
2465	};
2466
2467	qsizetype codecCount = availableCodecCount();
2468	QStringList result;
2469	result.reserve(asize: codecCount);
2470	for (qsizetype i = `0`; i < codecCount; ++i)
2471	result.push_back(t: availableCodec (i));
2472	return result;
2473	}
2474
2475	/!*
2476	Tries to determine the encoding of the HTML in \a data by looking at leading byte
2477	order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder
2478	matching the encoding. If the returned decoder is not valid,
2479	the encoding specified is not supported by QStringConverter. If no encoding is
2480	detected, the method returns a decoder for Utf8.
2481
2482	\sa isValid()
2483	*/
2484	QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data)
2485	{
2486	// determine charset
2487	std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2488	if (encoding)
2489	// trust the initial BOM
2490	return QStringDecoder (encoding.value());
2491
2492	QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2493	if (!encodingTag.isEmpty())
2494	return QStringDecoder (encodingTag);
2495
2496	return QStringDecoder (Utf8);
2497	}
2498	#endif // !QT_BOOTSTRAPPED
2499
2500	/!*
2501	Returns the canonical name for encoding \a e.
2502	*/
2503	const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
2504	{
2505	return encodingInterfaces[int(e)].name;
2506	}
2507
/*!
2509	\class QStringEncoder
2510	\inmodule QtCore
2511	\brief The QStringEncoder class provides a state-based encoder for text.
2512	\reentrant
2513	\ingroup i18n
2514
2515	A text encoder converts text from Qt's internal representation into an encoded
2516	text format using a specific encoding.
2517
2518	Converting a string from Unicode to the local encoding can be achieved
2519	using the following code:
2520
2521	\snippet code/src_corelib_text_qstringconverter.cpp 1
2522
2523	The encoder remembers any state that is required between calls, so converting
2524	data received in chunks, for example, when receiving it over a network, is just as
2525	easy, by calling the encoder whenever new data is available:
2526
2527	\snippet code/src_corelib_text_qstringconverter.cpp 3
2528
2529	The QStringEncoder object maintains state between chunks and therefore
2530	works correctly even if a UTF-16 surrogate character is split between
2531	chunks.
2532
2533	QStringEncoder objects can't be copied because of their internal state, but
2534	can be moved.
2535
2536	\sa QStringConverter, QStringDecoder
2537	*/
2538
/*!
    \fn constexpr QStringEncoder::QStringEncoder(const Interface *i)
    \internal
*/
2543
/*!
2545	\fn constexpr QStringEncoder::QStringEncoder()
2546
2547	Default constructs an encoder. The default encoder is not valid,
2548	and can't be used for converting text.
2549	*/
2550
/*!
2552	\fn constexpr QStringEncoder::QStringEncoder(Encoding encoding, Flags flags = Flag::Default)
2553
2554	Creates an encoder object using \a encoding and \a flags.
2555	*/
2556
/*!
    \fn QStringEncoder::QStringEncoder(QAnyStringView name, Flags flags = Flag::Default)

    Creates an encoder object using \a name and \a flags.
    If \a name is not the name of a known encoding an invalid converter will get created.

    \note In Qt versions prior to 6.8, this function took only a \c{const char *},
    which was expected to be UTF-8-encoded.

    \sa isValid()
*/
2568
/*!
2570	\fn QStringEncoder::DecodedData<const QString &> QStringEncoder::encode(const QString &in)
2571	\fn QStringEncoder::DecodedData<QStringView> QStringEncoder::encode(QStringView in)
2572	\fn QStringEncoder::DecodedData<const QString &> QStringEncoder::operator()(const QString &in)
2573	\fn QStringEncoder::DecodedData<QStringView> QStringEncoder::operator()(QStringView in)
2574
2575	Converts \a in and returns a struct that is implicitly convertible to QByteArray.
2576
2577	\snippet code/src_corelib_text_qstringconverter.cpp 5
2578	*/
2579
/*!
2581	\fn qsizetype QStringEncoder::requiredSpace(qsizetype inputLength) const
2582
2583	Returns the maximum amount of characters required to be able to process
2584	\a inputLength decoded data.
2585
2586	\sa appendToBuffer()
2587	*/
2588
/*!
    \fn char *QStringEncoder::appendToBuffer(char *out, QStringView in)

    Encodes \a in and writes the encoded result into the buffer
    starting at \a out. Returns a pointer to the end of the data written.

    \note \a out must be large enough to be able to hold all the encoded data. Use
    requiredSpace() to determine the maximum size requirement to be able to encode
    \a in.

    \sa requiredSpace()
*/
2601
/*!
2603	\class QStringDecoder
2604	\inmodule QtCore
2605	\brief The QStringDecoder class provides a state-based decoder for text.
2606	\reentrant
2607	\ingroup i18n
2608
A text decoder converts text from an encoded text format that uses a specific encoding
into Qt's internal representation.
2611
2612	Converting encoded data into a QString can be achieved
2613	using the following code:
2614
2615	\snippet code/src_corelib_text_qstringconverter.cpp 0
2616
2617	The decoder remembers any state that is required between calls, so converting
2618	data received in chunks, for example, when receiving it over a network, is just as
2619	easy, by calling the decoder whenever new data is available:
2620
2621	\snippet code/src_corelib_text_qstringconverter.cpp 2
2622
2623	The QStringDecoder object maintains state between chunks and therefore
2624	works correctly even if chunks are split in the middle of a multi-byte character
2625	sequence.
2626
2627	QStringDecoder objects can't be copied because of their internal state, but
2628	can be moved.
2629
2630	\sa QStringConverter, QStringEncoder
2631	*/
2632
2633	/*!
2634	\fn constexpr QStringDecoder::QStringDecoder(const Interface *i)
2635	\internal
2636	*/
2637
2638	/*!
2639	\fn constexpr QStringDecoder::QStringDecoder()
2640
2641	Default constructs a decoder. The default decoder is not valid,
2642	and can't be used for converting text.
2643	*/
2644
2645	/*!
2646	\fn constexpr QStringDecoder::QStringDecoder(Encoding encoding, Flags flags = Flag::Default)
2647
2648	Creates a decoder object using \a encoding and \a flags.
2649	*/
2650
2651	/*!
2652	\fn QStringDecoder::QStringDecoder(QAnyStringView name, Flags flags = Flag::Default)
2653
2654	Creates a decoder object using \a name and \a flags.
2655	If \a name is not the name of a known encoding, an invalid converter will be created.
2656
2657	\note In Qt versions prior to 6.8, this function took only a \c{const char *},
2658	which was expected to be UTF-8-encoded.
2659
2660	\sa isValid()
2661	*/
2662
2663	/*!
2664	\fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::operator()(const QByteArray &ba)
2665	\fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::decode(const QByteArray &ba)
2666	\fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::operator()(QByteArrayView ba)
2667	\fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::decode(QByteArrayView ba)
2668
2669	Converts \a ba and returns a struct that is implicitly convertible to QString.
2670
2671
2672	\snippet code/src_corelib_text_qstringconverter.cpp 4
2673	*/
2674
2675	/*!
2676	\fn qsizetype QStringDecoder::requiredSpace(qsizetype inputLength) const
2677
2678	Returns the maximum number of UTF-16 code units required to be able to process
2679	\a inputLength encoded data.
2680
2681	\sa appendToBuffer
2682	*/
2683
2684	/*!
2685	\fn QChar *QStringDecoder::appendToBuffer(QChar *out, QByteArrayView in)
2686
2687	Decodes the sequence of bytes viewed by \a in and writes the decoded result into
2688	the buffer starting at \a out. Returns a pointer to the end of data written.
2689
2690	\a out needs to be large enough to be able to hold all the decoded data. Use
2691	\l{requiredSpace} to determine the maximum size requirements to decode an encoded
2692	data buffer of \c in.size() bytes.
2693
2694	\sa requiredSpace
2695	*/
2696
2697	/*!
2698	\fn char16_t *QStringDecoder::appendToBuffer(char16_t *out, QByteArrayView in)
2699	\since 6.6
2700	\overload
2701	*/
2702
2703	QT_END_NAMESPACE
2704

Provided by KDAB

Definitions

utf8bom
qBitScanReverse
simdEncodeAscii
simdDecodeAscii
simdFindNonAscii
simdCompareAscii
convertFromUnicode
convertFromUnicode
convertFromUnicode
convertFromLatin1
convertToUnicode
convertToUnicode
convertToUnicode
convertToUnicode
QUtf8NoOutputTraits
NoOutput
appendUtf16
appendUcs4
isValidUtf8
compareUtf8
compareUtf8
compareUtf8
convertFromUnicode
convertFromUnicode
convertToUnicode
convertToUnicode
convertFromUnicode
convertFromUnicode
convertToUnicode
convertToUnicode
clear
reset
fromUtf16
toUtf16
fromUtf16BE
toUtf16BE
fromUtf16LE
toUtf16LE
fromUtf32
toUtf32
fromUtf32BE
toUtf32BE
fromUtf32LE
toUtf32LE
convertFromUnicode
fromLocal8Bit
toLocal8Bit
fromUtf8Len
toUtf8Len
fromUtf16Len
toUtf16Len
fromUtf32Len
toUtf32Len
fromLatin1Len
toLatin1Len
encodingInterfaces
nameMatch_impl_impl
nameMatch_impl
nameMatch_impl
nameMatch_impl
nameMatch
QStringConverterICU
clear_function
ensureConverter
toUtf16
fromUtf16
QStringConverterICU
fromLen
toLen
forLength
createConverterForName
nul_terminate_impl
nul_terminate_impl
nul_terminate_impl
nul_terminate
make_icu_converter
make_icu_converter
QStringConverter
name
encodingForName
encodingForData
parseHtmlMetaForEncoding
encodingForHtml
availableCodecCount
availableCodecs
decoderForHtml

Learn to use CMake with our Intro Training

Find out more

Definitions

source code of qtbase/src/corelib/text/qstringconverter.cpp