qutfcodec.cpp source code [qtbase/src/corelib/codecs/qutfcodec.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2016 The Qt Company Ltd.
4	** Copyright (C) 2018 Intel Corporation.
5	** Contact: https://www.qt.io/licensing/
6	**
7	** This file is part of the QtCore module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial License Usage
11	** Licensees holding valid commercial Qt licenses may use this file in
12	** accordance with the commercial license agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and The Qt Company. For licensing terms
15	** and conditions see https://www.qt.io/terms-conditions. For further
16	** information use the contact form at https://www.qt.io/contact-us.
17	**
18	** GNU Lesser General Public License Usage
19	** Alternatively, this file may be used under the terms of the GNU Lesser
20	** General Public License version 3 as published by the Free Software
21	** Foundation and appearing in the file LICENSE.LGPL3 included in the
22	** packaging of this file. Please review the following information to
23	** ensure the GNU Lesser General Public License version 3 requirements
24	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25	**
26	** GNU General Public License Usage
27	** Alternatively, this file may be used under the terms of the GNU
28	** General Public License version 2.0 or (at your option) the GNU General
29	** Public license version 3 or any later version approved by the KDE Free
30	** Qt Foundation. The licenses are as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32	** included in the packaging of this file. Please review the following
33	** information to ensure the GNU General Public License requirements will
34	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35	** https://www.gnu.org/licenses/gpl-3.0.html.
36	**
37	** $QT_END_LICENSE$
38	**
39	****************************************************************************/
40
41	#include "qutfcodec_p.h"
42	#include "qlist.h"
43	#include "qendian.h"
44	#include "qchar.h"
45
46	#include "private/qsimd_p.h"
47	#include "private/qstringiterator_p.h"
48
49	QT_BEGIN_NAMESPACE
50
// Indices into QTextCodec::ConverterState::state_data used by the UTF-16 and
// UTF-32 decoders below: slot Endian keeps the endianness in use, slot Data
// keeps the byte(s) of an incomplete code unit carried over to the next call.
enum { Endian = 0, Data = 1 };
52
53	static const uchar utf8bom[] = { `0xef`, `0xbb`, `0xbf` };
54
#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
    || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64))
// Converts the leading-zero count of v into a bit index counted from the
// least significant bit (bit 0), as used by the SIMD helpers below to locate
// the last flagged element of a mask.
static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
{
    uint result = qCountLeadingZeroBits(v);
    // Now invert the result: clz counts *down* from the msb to the lsb, so it
    // yields 0 for the msb and 31 for the lsb. The result for
    // _bit_scan_reverse is expected to be the index when counting *up* from
    // the lsb: the lsb is index 0 and the msb is index 31.
    result ^= sizeof(unsigned) * 8 - 1;
    return result;
}
#endif
67
68	#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
69	static inline bool simdEncodeAscii(uchar &dst, const* ushort &nextAscii, const* ushort &src, const* ushort *end)
70	{
71	// do sixteen characters at a time
72	for ( ; end - src >= `16`; src += `16`, dst += `16`) {
73	# ifdef __AVX2__
74	__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
75	__m128i data1 = _mm256_castsi256_si128(data);
76	__m128i data2 = _mm256_extracti128_si256(data, `1`);
77	# else
78	__m128i data1 = _mm_loadu_si128(p: (const __m128i*)src);
79	__m128i data2 = _mm_loadu_si128(p: `1`+(const __m128i*)src);
80	# endif
81
82	// check if everything is ASCII
83	// the highest ASCII value is U+007F
84	// Do the packing directly:
85	// The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
86	// with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
87	// while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
88	// we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
89	// "non-ASCII", but it's an acceptable compromise.
90	__m128i packed = _mm_packus_epi16(a: data1, b: data2);
91	__m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
92
93	// store, even if there are non-ASCII characters here
94	_mm_storeu_si128(p: (__m128i*)dst, b: packed);
95
96	// n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
97	ushort n = ~_mm_movemask_epi8(a: nonAscii);
98	if (n) {
99	// find the next probable ASCII character
100	// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
101	// characters still coming
102	nextAscii = src + qBitScanReverse(v: n) + `1`;
103
104	n = qCountTrailingZeroBits(v: n);
105	dst += n;
106	src += n;
107	return false;
108	}
109	}
110
111	if (end - src >= `8`) {
112	// do eight characters at a time
113	__m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src));
114	__m128i packed = _mm_packus_epi16(a: data, b: data);
115	__m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
116
117	// store even non-ASCII
118	_mm_storel_epi64(p: reinterpret_cast<__m128i *>(dst), a: packed);
119
120	uchar n = ~_mm_movemask_epi8(a: nonAscii);
121	if (n) {
122	nextAscii = src + qBitScanReverse(v: n) + `1`;
123	n = qCountTrailingZeroBits(v: n);
124	dst += n;
125	src += n;
126	return false;
127	}
128	}
129
130	return src == end;
131	}
132
133	static inline bool simdDecodeAscii(ushort &dst, const* uchar &nextAscii, const* uchar &src, const* uchar *end)
134	{
135	// do sixteen characters at a time
136	for ( ; end - src >= `16`; src += `16`, dst += `16`) {
137	__m128i data = _mm_loadu_si128(p: (const __m128i*)src);
138
139	#ifdef __AVX2__
140	const int BitSpacing = `2`;
141	// load and zero extend to an YMM register
142	const __m256i extended = _mm256_cvtepu8_epi16(data);
143
144	uint n = _mm256_movemask_epi8(extended);
145	if (!n) {
146	// store
147	_mm256_storeu_si256((__m256i*)dst, extended);
148	continue;
149	}
150	#else
151	const int BitSpacing = `1`;
152
153	// check if everything is ASCII
154	// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
155	uint n = _mm_movemask_epi8(a: data);
156	if (!n) {
157	// unpack
158	_mm_storeu_si128(p: (__m128i*)dst, b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
159	_mm_storeu_si128(p: `1`+(__m128i*)dst, b: _mm_unpackhi_epi8(a: data, b: _mm_setzero_si128()));
160	continue;
161	}
162	#endif
163
164	// copy the front part that is still ASCII
165	while (!(n & `1`)) {
166	dst++ = src++;
167	n >>= BitSpacing;
168	}
169
170	// find the next probable ASCII character
171	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
172	// characters still coming
173	n = qBitScanReverse(v: n);
174	nextAscii = src + (n / BitSpacing) + `1`;
175	return false;
176
177	}
178
179	if (end - src >= `8`) {
180	__m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src));
181	uint n = _mm_movemask_epi8(a: data) & `0xff`;
182	if (!n) {
183	// unpack and store
184	_mm_storeu_si128(p: reinterpret_cast<__m128i *>(dst), b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
185	} else {
186	while (!(n & `1`)) {
187	dst++ = src++;
188	n >>= `1`;
189	}
190
191	n = qBitScanReverse(v: n);
192	nextAscii = src + n + `1`;
193	return false;
194	}
195	}
196
197	return src == end;
198	}
199
200	static inline const uchar simdFindNonAscii(const* uchar src, const* uchar end, const* uchar *&nextAscii)
201	{
202	#ifdef __AVX2__
203	// do 32 characters at a time
204	// (this is similar to simdTestMask in qstring.cpp)
205	const __m256i mask = _mm256_set1_epi8(`0x80`);
206	for ( ; end - src >= `32`; src += `32`) {
207	__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
208	if (_mm256_testz_si256(mask, data))
209	continue;
210
211	uint n = _mm256_movemask_epi8(data);
212	Q_ASSUME(n);
213
214	// find the next probable ASCII character
215	// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
216	// characters still coming
217	nextAscii = src + qBitScanReverse(n) + `1`;
218
219	// return the non-ASCII character
220	return src + qCountTrailingZeroBits(n);
221	}
222	#endif
223
224	// do sixteen characters at a time
225	for ( ; end - src >= `16`; src += `16`) {
226	__m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i*>(src));
227
228	// check if everything is ASCII
229	// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
230	uint n = _mm_movemask_epi8(a: data);
231	if (!n)
232	continue;
233
234	// find the next probable ASCII character
235	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
236	// characters still coming
237	nextAscii = src + qBitScanReverse(v: n) + `1`;
238
239	// return the non-ASCII character
240	return src + qCountTrailingZeroBits(v: n);
241	}
242
243	// do four characters at a time
244	for ( ; end - src >= `4`; src += `4`) {
245	quint32 data = qFromUnaligned<quint32>(src);
246	data &= `0x80808080U`;
247	if (!data)
248	continue;
249
250	// We don't try to guess which of the three bytes is ASCII and which
251	// one isn't. The chance that at least two of them are non-ASCII is
252	// better than 75%.
253	nextAscii = src;
254	return src;
255	}
256	nextAscii = end;
257	return src;
258	}
259	#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
260	static inline bool simdEncodeAscii(uchar &dst, const* ushort &nextAscii, const* ushort &src, const* ushort *end)
261	{
262	uint16x8_t maxAscii = vdupq_n_u16(`0x7f`);
263	uint16x8_t mask1 = { `1`, `1` << `2`, `1` << `4`, `1` << `6`, `1` << `8`, `1` << `10`, `1` << `12`, `1` << `14` };
264	uint16x8_t mask2 = vshlq_n_u16(mask1, `1`);
265
266	// do sixteen characters at a time
267	for ( ; end - src >= `16`; src += `16`, dst += `16`) {
268	// load 2 lanes (or: "load interleaved")
269	uint16x8x2_t in = vld2q_u16(src);
270
271	// check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
272	// add those together into a scalar, and merge the scalars.
273	uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[`0`], maxAscii), mask1))
274	\| vaddvq_u16(vandq_u16(vcgtq_u16(in.val[`1`], maxAscii), mask2));
275
276	// merge the two lanes by shifting the values of the second by 8 and inserting them
277	uint16x8_t out = vsliq_n_u16(in.val[`0`], in.val[`1`], `8`);
278
279	// store, even if there are non-ASCII characters here
280	vst1q_u8(dst, vreinterpretq_u8_u16(out));
281
282	if (nonAscii) {
283	// find the next probable ASCII character
284	// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
285	// characters still coming
286	nextAscii = src + qBitScanReverse(nonAscii) + `1`;
287
288	nonAscii = qCountTrailingZeroBits(nonAscii);
289	dst += nonAscii;
290	src += nonAscii;
291	return false;
292	}
293	}
294	return src == end;
295	}
296
297	static inline bool simdDecodeAscii(ushort &dst, const* uchar &nextAscii, const* uchar &src, const* uchar *end)
298	{
299	// do eight characters at a time
300	uint8x8_t msb_mask = vdup_n_u8(`0x80`);
301	uint8x8_t add_mask = { `1`, `1` << `1`, `1` << `2`, `1` << `3`, `1` << `4`, `1` << `5`, `1` << `6`, `1` << `7` };
302	for ( ; end - src >= `8`; src += `8`, dst += `8`) {
303	uint8x8_t c = vld1_u8(src);
304	uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
305	if (!n) {
306	// store
307	vst1q_u16(dst, vmovl_u8(c));
308	continue;
309	}
310
311	// copy the front part that is still ASCII
312	while (!(n & `1`)) {
313	dst++ = src++;
314	n >>= `1`;
315	}
316
317	// find the next probable ASCII character
318	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
319	// characters still coming
320	n = qBitScanReverse(n);
321	nextAscii = src + n + `1`;
322	return false;
323
324	}
325	return src == end;
326	}
327
328	static inline const uchar simdFindNonAscii(const* uchar src, const* uchar end, const* uchar *&nextAscii)
329	{
330	// The SIMD code below is untested, so just force an early return until
331	// we've had the time to verify it works.
332	nextAscii = end;
333	return src;
334
335	// do eight characters at a time
336	uint8x8_t msb_mask = vdup_n_u8(`0x80`);
337	uint8x8_t add_mask = { `1`, `1` << `1`, `1` << `2`, `1` << `3`, `1` << `4`, `1` << `5`, `1` << `6`, `1` << `7` };
338	for ( ; end - src >= `8`; src += `8`) {
339	uint8x8_t c = vld1_u8(src);
340	uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
341	if (!n)
342	continue;
343
344	// find the next probable ASCII character
345	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
346	// characters still coming
347	nextAscii = src + qBitScanReverse(n) + `1`;
348
349	// return the non-ASCII character
350	return src + qCountTrailingZeroBits(n);
351	}
352	nextAscii = end;
353	return src;
354	}
355	#else
356	static inline bool simdEncodeAscii(uchar , const* ushort , const* ushort , const* ushort *)
357	{
358	return false;
359	}
360
361	static inline bool simdDecodeAscii(ushort , const* uchar , const* uchar , const* uchar *)
362	{
363	return false;
364	}
365
366	static inline const uchar simdFindNonAscii(const* uchar src, const* uchar end, const* uchar *&nextAscii)
367	{
368	nextAscii = end;
369	return src;
370	}
371	#endif
372
373	QByteArray QUtf8::convertFromUnicode(const QChar uc, int* len)
374	{
375	// create a QByteArray with the worst case scenario size
376	QByteArray result(len * `3`, Qt::Uninitialized);
377	uchar dst = reinterpret_cast<uchar >(const_cast<char *>(result.constData()));
378	const ushort src = reinterpret_cast<const* ushort *>(uc);
379	const ushort *const end = src + len;
380
381	while (src != end) {
382	const ushort *nextAscii = end;
383	if (simdEncodeAscii(dst, nextAscii, src, end))
384	break;
385
386	do {
387	ushort uc = *src++;
388	int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst, src, end);
389	if (res < `0`) {
390	// encoding error - append '?'
391	*dst++ = `'?'`;
392	}
393	} while (src < nextAscii);
394	}
395
396	result.truncate(pos: dst - reinterpret_cast<uchar >(const_cast<char* *>(result.constData())));
397	return result;
398	}
399
400	QByteArray QUtf8::convertFromUnicode(const QChar uc, int* len, QTextCodec::ConverterState *state)
401	{
402	uchar replacement = `'?'`;
403	int rlen = `3`*len;
404	int surrogate_high = -`1`;
405	if (state) {
406	if (state->flags & QTextCodec::ConvertInvalidToNull)
407	replacement = `0`;
408	if (!(state->flags & QTextCodec::IgnoreHeader))
409	rlen += `3`;
410	if (state->remainingChars)
411	surrogate_high = state->state_data[`0`];
412	}
413
414
415	QByteArray rstr(rlen, Qt::Uninitialized);
416	uchar cursor = reinterpret_cast<uchar >(const_cast<char *>(rstr.constData()));
417	const ushort src = reinterpret_cast<const* ushort *>(uc);
418	const ushort *const end = src + len;
419
420	int invalid = `0`;
421	if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
422	// append UTF-8 BOM
423	*cursor++ = utf8bom[`0`];
424	*cursor++ = utf8bom[`1`];
425	*cursor++ = utf8bom[`2`];
426	}
427
428	const ushort *nextAscii = src;
429	while (src != end) {
430	int res;
431	ushort uc;
432	if (surrogate_high != -`1`) {
433	uc = surrogate_high;
434	surrogate_high = -`1`;
435	res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst&: cursor, src, end);
436	} else {
437	if (src >= nextAscii && simdEncodeAscii(dst&: cursor, nextAscii, src, end))
438	break;
439
440	uc = *src++;
441	res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst&: cursor, src, end);
442	}
443	if (Q_LIKELY(res >= `0`))
444	continue;
445
446	if (res == QUtf8BaseTraits::Error) {
447	// encoding error
448	++invalid;
449	*cursor++ = replacement;
450	} else if (res == QUtf8BaseTraits::EndOfString) {
451	surrogate_high = uc;
452	break;
453	}
454	}
455
456	rstr.resize(size: cursor - (const uchar*)rstr.constData());
457	if (state) {
458	state->invalidChars += invalid;
459	state->flags \|= QTextCodec::IgnoreHeader;
460	state->remainingChars = `0`;
461	if (surrogate_high >= `0`) {
462	state->remainingChars = `1`;
463	state->state_data[`0`] = surrogate_high;
464	}
465	}
466	return rstr;
467	}
468
469	QString QUtf8::convertToUnicode(const char chars, int* len)
470	{
471	// UTF-8 to UTF-16 always needs the exact same number of words or less:
472	// UTF-8 UTF-16
473	// 1 byte 1 word
474	// 2 bytes 1 word
475	// 3 bytes 1 word
476	// 4 bytes 2 words (one surrogate pair)
477	// That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
478	// half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
479	// non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
480	//
481	// The table holds for invalid sequences too: we'll insert one replacement char
482	// per invalid byte.
483	QString result(len, Qt::Uninitialized);
484	QChar data = const_cast<QChar>(result.constData()); // we know we're not shared
485	const QChar *end = convertToUnicode(data, chars, len);
486	result.truncate(pos: end - data);
487	return result;
488	}
489
490	/!*
491	\since 5.7
492	\overload
493
494	Converts the UTF-8 sequence of \a len octets beginning at \a chars to
495	a sequence of QChar starting at \a buffer. The buffer is expected to be
496	large enough to hold the result. An upper bound for the size of the
497	buffer is \a len QChars.
498
499	If, during decoding, an error occurs, a QChar::ReplacementCharacter is
500	written.
501
502	Returns a pointer to one past the last QChar written.
503
504	This function never throws.
505	*/
506
507	QChar QUtf8::convertToUnicode(QChar buffer, const char chars, int* len) noexcept
508	{
509	ushort dst = reinterpret_cast<ushort >(buffer);
510	const uchar src = reinterpret_cast<const* uchar *>(chars);
511	const uchar *end = src + len;
512
513	// attempt to do a full decoding in SIMD
514	const uchar *nextAscii = end;
515	if (!simdDecodeAscii(dst, nextAscii, src, end)) {
516	// at least one non-ASCII entry
517	// check if we failed to decode the UTF-8 BOM; if so, skip it
518	if (Q_UNLIKELY(src == reinterpret_cast<const uchar *>(chars))
519	&& end - src >= `3`
520	&& Q_UNLIKELY(src[`0`] == utf8bom[`0`] && src[`1`] == utf8bom[`1`] && src[`2`] == utf8bom[`2`])) {
521	src += `3`;
522	}
523
524	while (src < end) {
525	nextAscii = end;
526	if (simdDecodeAscii(dst, nextAscii, src, end))
527	break;
528
529	do {
530	uchar b = *src++;
531	int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
532	if (res < `0`) {
533	// decoding error
534	*dst++ = QChar::ReplacementCharacter;
535	}
536	} while (src < nextAscii);
537	}
538	}
539
540	return reinterpret_cast<QChar *>(dst);
541	}
542
543	QString QUtf8::convertToUnicode(const char chars, int* len, QTextCodec::ConverterState *state)
544	{
545	bool headerdone = false;
546	ushort replacement = QChar::ReplacementCharacter;
547	int invalid = `0`;
548	int res;
549	uchar ch = `0`;
550
551	// See above for buffer requirements for stateless decoding. However, that
552	// fails if the state is not empty. The following situations can add to the
553	// requirements:
554	// state contains chars starts with requirement
555	// 1 of 2 bytes valid continuation 0
556	// 2 of 3 bytes same 0
557	// 3 bytes of 4 same +1 (need to insert surrogate pair)
558	// 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
559	// 2 of 3 bytes same +1 (same)
560	// 3 of 4 bytes same +1 (same)
561	QString result(len + `1`, Qt::Uninitialized);
562
563	ushort dst = reinterpret_cast<ushort >(const_cast<QChar *>(result.constData()));
564	const uchar src = reinterpret_cast<const* uchar *>(chars);
565	const uchar *end = src + len;
566
567	if (state) {
568	if (state->flags & QTextCodec::IgnoreHeader)
569	headerdone = true;
570	if (state->flags & QTextCodec::ConvertInvalidToNull)
571	replacement = QChar::Null;
572	if (state->remainingChars) {
573	// handle incoming state first
574	uchar remainingCharsData[`4`]; // longest UTF-8 sequence possible
575	int remainingCharsCount = state->remainingChars;
576	int newCharsToCopy = qMin<int>(a: sizeof(remainingCharsData) - remainingCharsCount, b: end - src);
577
578	memset(s: remainingCharsData, c: `0`, n: sizeof(remainingCharsData));
579	memcpy(dest: remainingCharsData, src: &state->state_data[`0`], n: remainingCharsCount);
580	memcpy(dest: remainingCharsData + remainingCharsCount, src: src, n: newCharsToCopy);
581
582	const uchar *begin = &remainingCharsData[`1`];
583	res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: remainingCharsData[`0`], dst, src&: begin,
584	end: static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
585	if (res == QUtf8BaseTraits::Error \|\| (res == QUtf8BaseTraits::EndOfString && len == `0`)) {
586	// special case for len == 0:
587	// if we were supplied an empty string, terminate the previous, unfinished sequence with error
588	++invalid;
589	*dst++ = replacement;
590	} else if (res == QUtf8BaseTraits::EndOfString) {
591	// if we got EndOfString again, then there were too few bytes in src;
592	// copy to our state and return
593	state->remainingChars = remainingCharsCount + newCharsToCopy;
594	memcpy(dest: &state->state_data[`0`], src: remainingCharsData, n: state->remainingChars);
595	return QString ();
596	} else if (!headerdone && res >= `0`) {
597	// eat the UTF-8 BOM
598	headerdone = true;
599	if (dst[-`1`] == `0xfeff`)
600	--dst;
601	}
602
603	// adjust src now that we have maybe consumed a few chars
604	if (res >= `0`) {
605	Q_ASSERT(res > remainingCharsCount);
606	src += res - remainingCharsCount;
607	}
608	}
609	}
610
611	// main body, stateless decoding
612	res = `0`;
613	const uchar *nextAscii = src;
614	const uchar *start = src;
615	while (res >= `0` && src < end) {
616	if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
617	break;
618
619	ch = *src++;
620	res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: ch, dst, src, end);
621	if (!headerdone && res >= `0`) {
622	headerdone = true;
623	if (src == start + `3`) { // 3 == sizeof(utf8-bom)
624	// eat the UTF-8 BOM (it can only appear at the beginning of the string).
625	if (dst[-`1`] == `0xfeff`)
626	--dst;
627	}
628	}
629	if (res == QUtf8BaseTraits::Error) {
630	res = `0`;
631	++invalid;
632	*dst++ = replacement;
633	}
634	}
635
636	if (!state && res == QUtf8BaseTraits::EndOfString) {
637	// unterminated UTF sequence
638	*dst++ = QChar::ReplacementCharacter;
639	while (src++ < end)
640	*dst++ = QChar::ReplacementCharacter;
641	}
642
643	result.truncate(pos: dst - (const ushort *)result.unicode());
644	if (state) {
645	state->invalidChars += invalid;
646	if (headerdone)
647	state->flags \|= QTextCodec::IgnoreHeader;
648	if (res == QUtf8BaseTraits::EndOfString) {
649	--src; // unread the byte in ch
650	state->remainingChars = end - src;
651	memcpy(dest: &state->state_data[`0`], src: src, n: end - src);
652	} else {
653	state->remainingChars = `0`;
654	}
655	}
656	return result;
657	}
658
659	struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
660	{
661	struct NoOutput {};
662	static void appendUtf16(const NoOutput &, ushort) {}
663	static void appendUcs4(const NoOutput &, uint) {}
664	};
665
666	QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len)
667	{
668	const uchar src = reinterpret_cast<const* uchar *>(chars);
669	const uchar *end = src + len;
670	const uchar *nextAscii = src;
671	bool isValidAscii = true;
672
673	while (src < end) {
674	if (src >= nextAscii)
675	src = simdFindNonAscii(src, end, nextAscii);
676	if (src == end)
677	break;
678
679	do {
680	uchar b = *src++;
681	if ((b & `0x80`) == `0`)
682	continue;
683
684	isValidAscii = false;
685	QUtf8NoOutputTraits::NoOutput output;
686	int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, dst&: output, src, end);
687	if (res < `0`) {
688	// decoding error
689	return { .isValidUtf8: false, .isValidAscii: false };
690	}
691	} while (src < nextAscii);
692	}
693
694	return { .isValidUtf8: true, .isValidAscii: isValidAscii };
695	}
696
697	int QUtf8::compareUtf8(const char utf8, qsizetype u8len, const* QChar utf16, int* u16len)
698	{
699	uint uc1, uc2;
700	auto src1 = reinterpret_cast<const uchar *>(utf8);
701	auto end1 = src1 + u8len;
702	QStringIterator src2(utf16, utf16 + u16len);
703
704	while (src1 < end1 && src2.hasNext()) {
705	uchar b = *src1++;
706	uint *output = &uc1;
707	int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1);
708	if (res < `0`) {
709	// decoding error
710	uc1 = QChar::ReplacementCharacter;
711	}
712
713	uc2 = src2.next();
714	if (uc1 != uc2)
715	return int(uc1) - int(uc2);
716	}
717
718	// the shorter string sorts first
719	return (end1 > src1) - int(src2.hasNext());
720	}
721
722	int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s)
723	{
724	uint uc1;
725	auto src1 = reinterpret_cast<const uchar *>(utf8);
726	auto end1 = src1 + u8len;
727	auto src2 = reinterpret_cast<const uchar *>(s.latin1());
728	auto end2 = src2 + s.size();
729
730	while (src1 < end1 && src2 < end2) {
731	uchar b = *src1++;
732	uint *output = &uc1;
733	int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1);
734	if (res < `0`) {
735	// decoding error
736	uc1 = QChar::ReplacementCharacter;
737	}
738
739	uint uc2 = *src2++;
740	if (uc1 != uc2)
741	return int(uc1) - int(uc2);
742	}
743
744	// the shorter string sorts first
745	return (end1 > src1) - (end2 > src2);
746	}
747
748	QByteArray QUtf16::convertFromUnicode(const QChar uc, int* len, QTextCodec::ConverterState *state, DataEndianness e)
749	{
750	DataEndianness endian = e;
751	int length = `2`*len;
752	if (!state \|\| (!(state->flags & QTextCodec::IgnoreHeader))) {
753	length += `2`;
754	}
755	if (e == DetectEndianness) {
756	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
757	}
758
759	QByteArray d;
760	d.resize(size: length);
761	char *data = d.data();
762	if (!state \|\| !(state->flags & QTextCodec::IgnoreHeader)) {
763	QChar bom(QChar::ByteOrderMark);
764	if (endian == BigEndianness)
765	qToBigEndian(src: bom.unicode(), dest: data);
766	else
767	qToLittleEndian(src: bom.unicode(), dest: data);
768	data += `2`;
769	}
770	if (endian == BigEndianness)
771	qToBigEndian<ushort>(source: uc, count: len, dest: data);
772	else
773	qToLittleEndian<ushort>(source: uc, count: len, dest: data);
774
775	if (state) {
776	state->remainingChars = `0`;
777	state->flags \|= QTextCodec::IgnoreHeader;
778	}
779	return d;
780	}
781
782	QString QUtf16::convertToUnicode(const char chars, int* len, QTextCodec::ConverterState *state, DataEndianness e)
783	{
784	DataEndianness endian = e;
785	bool half = false;
786	uchar buf = `0`;
787	bool headerdone = false;
788	if (state) {
789	headerdone = state->flags & QTextCodec::IgnoreHeader;
790	if (endian == DetectEndianness)
791	endian = (DataEndianness)state->state_data[Endian];
792	if (state->remainingChars) {
793	half = true;
794	buf = state->state_data[Data];
795	}
796	}
797	if (headerdone && endian == DetectEndianness)
798	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
799
800	QString result(len, Qt::Uninitialized); // worst case
801	QChar qch = (QChar )result.data();
802	while (len--) {
803	if (half) {
804	QChar ch;
805	if (endian == LittleEndianness) {
806	ch.setRow(*chars++);
807	ch.setCell(buf);
808	} else {
809	ch.setRow(buf);
810	ch.setCell(*chars++);
811	}
812	if (!headerdone) {
813	headerdone = true;
814	if (endian == DetectEndianness) {
815	if (ch == QChar::ByteOrderSwapped) {
816	endian = LittleEndianness;
817	} else if (ch == QChar::ByteOrderMark) {
818	endian = BigEndianness;
819	} else {
820	if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
821	endian = BigEndianness;
822	} else {
823	endian = LittleEndianness;
824	ch = QChar ((ch.unicode() >> `8`) \| ((ch.unicode() & `0xff`) << `8`));
825	}
826	*qch++ = ch;
827	}
828	} else if (ch != QChar::ByteOrderMark) {
829	*qch++ = ch;
830	}
831	} else {
832	*qch++ = ch;
833	}
834	half = false;
835	} else {
836	buf = *chars++;
837	half = true;
838	}
839	}
840	result.truncate(pos: qch - result.unicode());
841
842	if (state) {
843	if (headerdone)
844	state->flags \|= QTextCodec::IgnoreHeader;
845	state->state_data[Endian] = endian;
846	if (half) {
847	state->remainingChars = `1`;
848	state->state_data[Data] = buf;
849	} else {
850	state->remainingChars = `0`;
851	state->state_data[Data] = `0`;
852	}
853	}
854	return result;
855	}
856
857	QByteArray QUtf32::convertFromUnicode(const QChar uc, int* len, QTextCodec::ConverterState *state, DataEndianness e)
858	{
859	DataEndianness endian = e;
860	int length = `4`*len;
861	if (!state \|\| (!(state->flags & QTextCodec::IgnoreHeader))) {
862	length += `4`;
863	}
864	if (e == DetectEndianness) {
865	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
866	}
867
868	QByteArray d(length, Qt::Uninitialized);
869	char *data = d.data();
870	if (!state \|\| !(state->flags & QTextCodec::IgnoreHeader)) {
871	if (endian == BigEndianness) {
872	data[`0`] = `0`;
873	data[`1`] = `0`;
874	data[`2`] = (char)`0xfe`;
875	data[`3`] = (char)`0xff`;
876	} else {
877	data[`0`] = (char)`0xff`;
878	data[`1`] = (char)`0xfe`;
879	data[`2`] = `0`;
880	data[`3`] = `0`;
881	}
882	data += `4`;
883	}
884
885	QStringIterator i(uc, uc + len);
886	if (endian == BigEndianness) {
887	while (i.hasNext()) {
888	uint cp = i.next();
889	qToBigEndian(src: cp, dest: data);
890	data += `4`;
891	}
892	} else {
893	while (i.hasNext()) {
894	uint cp = i.next();
895	qToLittleEndian(src: cp, dest: data);
896	data += `4`;
897	}
898	}
899
900	if (state) {
901	state->remainingChars = `0`;
902	state->flags \|= QTextCodec::IgnoreHeader;
903	}
904	return d;
905	}
906
907	QString QUtf32::convertToUnicode(const char chars, int* len, QTextCodec::ConverterState *state, DataEndianness e)
908	{
909	DataEndianness endian = e;
910	uchar tuple[`4`];
911	int num = `0`;
912	bool headerdone = false;
913	if (state) {
914	headerdone = state->flags & QTextCodec::IgnoreHeader;
915	if (endian == DetectEndianness) {
916	endian = (DataEndianness)state->state_data[Endian];
917	}
918	num = state->remainingChars;
919	memcpy(dest: tuple, src: &state->state_data[Data], n: `4`);
920	}
921	if (headerdone && endian == DetectEndianness)
922	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
923
924	QString result;
925	result.resize(size: (num + len) >> `2` << `1`); // worst case
926	QChar qch = (QChar )result.data();
927
928	const char *end = chars + len;
929	while (chars < end) {
930	tuple[num++] = *chars++;
931	if (num == `4`) {
932	if (!headerdone) {
933	headerdone = true;
934	if (endian == DetectEndianness) {
935	if (tuple[`0`] == `0xff` && tuple[`1`] == `0xfe` && tuple[`2`] == `0` && tuple[`3`] == `0` && endian != BigEndianness) {
936	endian = LittleEndianness;
937	num = `0`;
938	continue;
939	} else if (tuple[`0`] == `0` && tuple[`1`] == `0` && tuple[`2`] == `0xfe` && tuple[`3`] == `0xff` && endian != LittleEndianness) {
940	endian = BigEndianness;
941	num = `0`;
942	continue;
943	} else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
944	endian = BigEndianness;
945	} else {
946	endian = LittleEndianness;
947	}
948	} else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(src: tuple) : qFromLittleEndian<quint32>(src: tuple)) == QChar::ByteOrderMark) {
949	num = `0`;
950	continue;
951	}
952	}
953	uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(src: tuple) : qFromLittleEndian<quint32>(src: tuple);
954	if (QChar::requiresSurrogates(ucs4: code)) {
955	*qch++ = QChar (QChar::highSurrogate(ucs4: code));
956	*qch++ = QChar (QChar::lowSurrogate(ucs4: code));
957	} else {
958	*qch++ = QChar (code);
959	}
960	num = `0`;
961	}
962	}
963	result.truncate(pos: qch - result.unicode());
964
965	if (state) {
966	if (headerdone)
967	state->flags \|= QTextCodec::IgnoreHeader;
968	state->state_data[Endian] = endian;
969	state->remainingChars = num;
970	memcpy(dest: &state->state_data[Data], src: tuple, n: `4`);
971	}
972	return result;
973	}
974
975
976	#if QT_CONFIG(textcodec)
977
978	QUtf8Codec::~QUtf8Codec()
979	{
980	}
981
982	QByteArray QUtf8Codec::convertFromUnicode(const QChar uc, int* len, ConverterState state) const*
983	{
984	return QUtf8::convertFromUnicode(uc, len, state);
985	}
986
987	void QUtf8Codec::convertToUnicode(QString target, const* char chars, int* len, ConverterState state) const*
988	{
989	*target += QUtf8::convertToUnicode(chars, len, state);
990	}
991
992	QString QUtf8Codec::convertToUnicode(const char chars, int* len, ConverterState state) const*
993	{
994	return QUtf8::convertToUnicode(chars, len, state);
995	}
996
997	QByteArray QUtf8Codec::name() const
998	{
999	return "UTF-8";
1000	}
1001
1002	int QUtf8Codec::mibEnum() const
1003	{
1004	return `106`;
1005	}
1006
1007	QUtf16Codec::~QUtf16Codec()
1008	{
1009	}
1010
1011	QByteArray QUtf16Codec::convertFromUnicode(const QChar uc, int* len, ConverterState state) const*
1012	{
1013	return QUtf16::convertFromUnicode(uc, len, state, e);
1014	}
1015
1016	QString QUtf16Codec::convertToUnicode(const char chars, int* len, ConverterState state) const*
1017	{
1018	return QUtf16::convertToUnicode(chars, len, state, e);
1019	}
1020
1021	int QUtf16Codec::mibEnum() const
1022	{
1023	return `1015`;
1024	}
1025
1026	QByteArray QUtf16Codec::name() const
1027	{
1028	return "UTF-16";
1029	}
1030
1031	QList<QByteArray> QUtf16Codec::aliases() const
1032	{
1033	return QList<QByteArray>();
1034	}
1035
1036	int QUtf16BECodec::mibEnum() const
1037	{
1038	return `1013`;
1039	}
1040
1041	QByteArray QUtf16BECodec::name() const
1042	{
1043	return "UTF-16BE";
1044	}
1045
1046	QList<QByteArray> QUtf16BECodec::aliases() const
1047	{
1048	QList<QByteArray> list;
1049	return list;
1050	}
1051
1052	int QUtf16LECodec::mibEnum() const
1053	{
1054	return `1014`;
1055	}
1056
1057	QByteArray QUtf16LECodec::name() const
1058	{
1059	return "UTF-16LE";
1060	}
1061
1062	QList<QByteArray> QUtf16LECodec::aliases() const
1063	{
1064	QList<QByteArray> list;
1065	return list;
1066	}
1067
1068	QUtf32Codec::~QUtf32Codec()
1069	{
1070	}
1071
1072	QByteArray QUtf32Codec::convertFromUnicode(const QChar uc, int* len, ConverterState state) const*
1073	{
1074	return QUtf32::convertFromUnicode(uc, len, state, e);
1075	}
1076
1077	QString QUtf32Codec::convertToUnicode(const char chars, int* len, ConverterState state) const*
1078	{
1079	return QUtf32::convertToUnicode(chars, len, state, e);
1080	}
1081
1082	int QUtf32Codec::mibEnum() const
1083	{
1084	return `1017`;
1085	}
1086
1087	QByteArray QUtf32Codec::name() const
1088	{
1089	return "UTF-32";
1090	}
1091
1092	QList<QByteArray> QUtf32Codec::aliases() const
1093	{
1094	QList<QByteArray> list;
1095	return list;
1096	}
1097
1098	int QUtf32BECodec::mibEnum() const
1099	{
1100	return `1018`;
1101	}
1102
1103	QByteArray QUtf32BECodec::name() const
1104	{
1105	return "UTF-32BE";
1106	}
1107
1108	QList<QByteArray> QUtf32BECodec::aliases() const
1109	{
1110	QList<QByteArray> list;
1111	return list;
1112	}
1113
1114	int QUtf32LECodec::mibEnum() const
1115	{
1116	return `1019`;
1117	}
1118
1119	QByteArray QUtf32LECodec::name() const
1120	{
1121	return "UTF-32LE";
1122	}
1123
1124	QList<QByteArray> QUtf32LECodec::aliases() const
1125	{
1126	QList<QByteArray> list;
1127	return list;
1128	}
1129
1130	#endif // textcodec
1131
1132	QT_END_NAMESPACE
1133

source code of qtbase/src/corelib/codecs/qutfcodec.cpp