1// Copyright (C) 2020 The Qt Company Ltd.
2// Copyright (C) 2020 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4// Qt-Security score:critical reason:data-parser
5
6#include <qstringconverter.h>
7#include <private/qstringconverter_p.h>
8#include "qendian.h"
9
10#include "private/qsimd_p.h"
11#include "private/qstringiterator_p.h"
12#include "private/qtools_p.h"
13#include "qbytearraymatcher.h"
14#include "qcontainertools_impl.h"
15#include <QtCore/qbytearraylist.h>
16
17#if QT_CONFIG(icu)
18
19#include <unicode/ucnv.h>
20#include <unicode/ucnv_cb.h>
21#include <unicode/ucnv_err.h>
22#include <unicode/ustring.h>
23#define QT_USE_ICU_CODECS
24#define QT_COM_THREAD_INIT
25
26#elif QT_CONFIG(winsdkicu)
27
28#include <icu.h>
29#include <private/qfunctions_win_p.h>
30#define QT_USE_ICU_CODECS
31#define QT_COM_THREAD_INIT qt_win_ensureComInitializedOnThisThread();
32
33#endif // QT_CONFIG(icu) || QT_CONFIG(winsdkicu)
34
35#ifdef Q_OS_WIN
36#include <qt_windows.h>
37#ifndef QT_BOOTSTRAPPED
38#include <QtCore/qvarlengtharray.h>
39#include <QtCore/private/wcharhelpers_win_p.h>
40
41#include <QtCore/q20iterator.h>
42#include <QtCore/q26numeric.h>
43#endif // !QT_BOOTSTRAPPED
44#endif
45
46#include <array>
47#if __has_include(<bit>) && __cplusplus > 201703L
48#include <bit>
49#endif
50#include <string>
51#include <QtCore/q20utility.h>
52
53QT_BEGIN_NAMESPACE
54
55using namespace QtMiscUtils;
56
// Compile-time checks: both converter classes must be nothrow move-constructible
// and nothrow move-assignable.
static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
static_assert(std::is_nothrow_move_assignable_v<QStringDecoder>);

// NOTE(review): not used in the part of the file visible here; presumably
// indices into QStringConverter::State::state_data for other codecs - confirm.
enum { Endian = 0, Data = 1 };

// The three bytes of the UTF-8 byte order mark, written by the encoder when
// WriteBom is requested and skipped by the stateless decoder.
static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
65
66#if defined(__SSE2__) || defined(__ARM_NEON__)
67Q_ALWAYS_INLINE static uint qBitScanReverse(unsigned v) noexcept
68{
69#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
70 return std::bit_width(v) - 1;
71#else
72 uint result = qCountLeadingZeroBits(v);
73 // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
74 // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
75 // counting up: msb index is 0 (because it starts there), and the lsb index is 31.
76 result ^= sizeof(unsigned) * 8 - 1;
77 return result;
78#endif
79}
80#endif
81
82#if defined(__SSE2__)
83template <QCpuFeatureType Cpu = _compilerCpuFeatures> Q_ALWAYS_INLINE static bool
84simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
85{
86 size_t sizeBytes = reinterpret_cast<const char *>(end) - reinterpret_cast<const char *>(src);
87
88 // do sixteen characters at a time
89 auto process16Chars = [](uchar *dst, const char16_t *src) {
90 __m128i data1 = _mm_loadu_si128(p: (const __m128i*)src);
91 __m128i data2 = _mm_loadu_si128(p: 1+(const __m128i*)src);
92
93 // check if everything is ASCII
94 // the highest ASCII value is U+007F
95 // Do the packing directly:
96 // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
97 // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
98 // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
99 // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
100 // "non-ASCII", but it's an acceptable compromise.
101 __m128i packed = _mm_packus_epi16(a: data1, b: data2);
102 __m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
103
104 // store, even if there are non-ASCII characters here
105 _mm_storeu_si128(p: (__m128i*)dst, b: packed);
106
107 // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
108 ushort n = ~_mm_movemask_epi8(a: nonAscii);
109 return n;
110 };
111 auto maybeFoundNonAscii = [&](auto n, qptrdiff offset = 0) {
112 if (n) {
113 // find the next probable ASCII character
114 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
115 // characters still coming
116 src += offset;
117 dst += offset;
118 nextAscii = src + qBitScanReverse(n) + 1;
119
120 n = qCountTrailingZeroBits(n);
121 dst += n;
122 src += n;
123 return false;
124 }
125 return src == end;
126 };
127 auto adjustToEnd = [&] {
128 dst += sizeBytes / sizeof(char16_t);
129 src = end;
130 };
131
132 if constexpr (Cpu & CpuFeatureAVX2) {
133 // The 256-bit VPACKUSWB[1] instruction interleaves the two input
134 // operands, so we need an extra permutation to get them back in-order.
135 // VPERMW takes 2 cyles to run while VPERMQ takes only 1.
136 // [1] https://www.felixcloutier.com/x86/PACKUSWB.html
137 constexpr size_t Step = 32;
138 auto process32Chars = [](const char16_t *src, uchar *dst) {
139 __m256i data1 = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src));
140 __m256i data2 = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src) + 1);
141 __m256i packed = _mm256_packus_epi16(a: data1, b: data2); // will be [A, B, A, B]
142 __m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
143 __m256i nonAscii = _mm256_cmpgt_epi8(a: permuted, b: _mm256_setzero_si256());
144
145 // store, even if there are non-ASCII characters here
146 _mm256_storeu_si256(p: reinterpret_cast<__m256i *>(dst), a: permuted);
147
148 return ~_mm256_movemask_epi8(a: nonAscii);
149 };
150
151 if constexpr (Cpu & CpuFeatureAVX512VL) {
152 // with AVX512/AXV10, we always process everything
153 if (sizeBytes <= Step * sizeof(char16_t)) {
154 uint mask = _bzhi_u32(X: -1, Y: uint(sizeBytes / 2));
155 __m256i data1 = _mm256_maskz_loadu_epi16(U: mask, P: src);
156 __m256i data2 = _mm256_maskz_loadu_epi16(U: mask >> 16, P: src + Step / 2);
157 __m256i packed = _mm256_packus_epi16(a: data1, b: data2);
158 __m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
159 __mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, permuted, _mm256_setzero_si256());
160
161 // store, even if there are non-ASCII characters here
162 _mm256_mask_storeu_epi8(P: dst, U: mask, A: permuted);
163 if (nonAscii)
164 return maybeFoundNonAscii(nonAscii);
165 adjustToEnd();
166 return true;
167 }
168 }
169
170 if (sizeBytes >= Step * sizeof(char16_t)) {
171 // do 32 characters at a time
172 qptrdiff offset = 0;
173 for ( ; (offset + Step) * sizeof(char16_t) < sizeBytes; offset += Step) {
174 if (uint n = process32Chars(src + offset, dst + offset))
175 return maybeFoundNonAscii(n, offset);
176 }
177
178 // do 32 characters again, possibly overlapping with the loop above
179 adjustToEnd();
180 uint n = process32Chars(src - Step, dst - Step);
181 return maybeFoundNonAscii(n, -int(Step));
182 }
183 }
184
185 constexpr size_t Step = 16;
186 if (sizeBytes >= Step * sizeof(char16_t)) {
187
188 qptrdiff offset = 0;
189 for ( ; (offset + Step) * sizeof(char16_t) < sizeBytes; offset += Step) {
190 ushort n = process16Chars(dst + offset, src + offset);
191 if (n)
192 return maybeFoundNonAscii(n, offset);
193 if (Cpu & CpuFeatureAVX2)
194 break; // we can only ever loop once because of the code above
195 }
196
197 // do sixteen characters again, possibly overlapping with the loop above
198 adjustToEnd();
199 ushort n = process16Chars(dst - Step, src - Step);
200 return maybeFoundNonAscii(n, -int(Step));
201 }
202
203# if !defined(__OPTIMIZE_SIZE__)
204 if (sizeBytes >= 8 * sizeof(char16_t)) {
205 // do eight characters at a time
206 __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src));
207 __m128i data2 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(end - 8));
208 __m128i packed = _mm_packus_epi16(a: data, b: data);
209 __m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
210
211 // store even non-ASCII
212 _mm_storel_epi64(p: reinterpret_cast<__m128i *>(dst), a: packed);
213
214 uchar n = ~_mm_movemask_epi8(a: nonAscii);
215 if (n)
216 return maybeFoundNonAscii(n);
217
218 adjustToEnd();
219 packed = _mm_packus_epi16(a: data2, b: data2);
220 nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
221 _mm_storel_epi64(p: reinterpret_cast<__m128i *>(dst - 8), a: packed);
222 n = ~_mm_movemask_epi8(a: nonAscii);
223 return maybeFoundNonAscii(n, -8);
224 } else if (sizeBytes >= 4 * sizeof(char16_t)) {
225 // do four characters at a time
226 __m128i data1 = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src));
227 __m128i data2 = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(end - 4));
228 __m128i packed = _mm_packus_epi16(a: data1, b: data1);
229 __m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
230
231 // store even non-ASCII
232 qToUnaligned(src: _mm_cvtsi128_si32(a: packed), dest: dst);
233
234 uchar n = uchar(_mm_movemask_epi8(a: nonAscii) ^ 0xf);
235 if (n)
236 return maybeFoundNonAscii(n);
237
238 adjustToEnd();
239 packed = _mm_packus_epi16(a: data2, b: data2);
240 nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
241 qToUnaligned(src: _mm_cvtsi128_si32(a: packed), dest: dst - 4);
242 n = uchar(_mm_movemask_epi8(a: nonAscii) ^ 0xf);
243 return maybeFoundNonAscii(n, -4);
244 }
245#endif
246
247 return src == end;
248}
249
250template <QCpuFeatureType Cpu = _compilerCpuFeatures> Q_ALWAYS_INLINE static bool
251simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
252{
253 // do sixteen characters at a time
254 auto process16Chars = [](char16_t *dst, const uchar *src) {
255 __m128i data = _mm_loadu_si128(p: (const __m128i*)src);
256
257 // check if everything is ASCII
258 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
259 uint n = _mm_movemask_epi8(a: data);
260
261 // store everything, even mojibake
262 _mm_storeu_si128(p: (__m128i*)dst, b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
263 _mm_storeu_si128(p: 1+(__m128i*)dst, b: _mm_unpackhi_epi8(a: data, b: _mm_setzero_si128()));
264 return ushort(n);
265 };
266 auto maybeFoundNonAscii = [&](uint n, qptrdiff offset = 0) {
267 // find the next probable ASCII character
268 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
269 // characters still coming
270 if (n) {
271 uint c = qCountTrailingZeroBits(v: n);
272 src += offset;
273 dst += offset;
274 n = qBitScanReverse(v: n);
275 nextAscii = src + n + 1;
276 src += c;
277 dst += c;
278 }
279 return src == end;
280 };
281 auto adjustToEnd = [&] {
282 dst += end - src;
283 src = end;
284 };
285
286 if constexpr (Cpu & CpuFeatureAVX2) {
287 constexpr qsizetype Step = 32;
288 auto process32Chars = [](char16_t *dst, const uchar *src) {
289 __m128i data1 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src));
290 __m128i data2 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src) + 1);
291
292 // the processor can execute this VPOR (dispatches 3/cycle) faster
293 // than waiting for the VPMOVMSKB (1/cycle) of both data to check
294 // their masks
295 __m128i ored = _mm_or_si128(a: data1, b: data2);
296 bool any = _mm_movemask_epi8(a: ored);
297
298 // store everything, even mojibake
299 __m256i extended1 = _mm256_cvtepu8_epi16(V: data1);
300 __m256i extended2 = _mm256_cvtepu8_epi16(V: data2);
301 _mm256_storeu_si256(p: reinterpret_cast<__m256i *>(dst), a: extended1);
302 _mm256_storeu_si256(p: reinterpret_cast<__m256i *>(dst) + 1, a: extended2);
303
304 uint n1 = _mm_movemask_epi8(a: data1);
305 uint n2 = _mm_movemask_epi8(a: data2);
306 struct R {
307 uint n1, n2;
308 bool any;
309 operator bool() const { return any; }
310 operator uint() const { return n1|(n2 << 16); }
311 };
312 return R{ n1, n2, any };
313 };
314
315 if constexpr (Cpu & CpuFeatureAVX512VL) {
316 // with AVX512/AXV10, we always process everything
317 if (end - src <= Step) {
318 __mmask32 mask = _bzhi_u32(X: -1, Y: uint(end - src));
319 __m256i data = _mm256_maskz_loadu_epi8(U: mask, P: src);
320 __mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, data, _mm256_setzero_si256());
321
322 // store everything, even mojibake
323 __m256i extended1 = _mm256_cvtepu8_epi16(V: _mm256_castsi256_si128(a: data));
324 __m256i extended2 = _mm256_cvtepu8_epi16(_mm256_extracti64x2_epi64(data, 1));
325 _mm256_mask_storeu_epi16(P: dst, U: mask, A: extended1);
326 _mm256_mask_storeu_epi16(P: dst + Step/2, U: mask >> 16, A: extended2);
327 if (nonAscii)
328 return maybeFoundNonAscii(nonAscii);
329 adjustToEnd();
330 return true;
331 }
332 }
333
334 if (end - src >= Step) {
335 // do 32 characters at a time
336 qptrdiff offset = 0;
337 for ( ; offset + Step < end - src; offset += Step) {
338 auto r = process32Chars(dst + offset, src + offset);
339 if (r)
340 return maybeFoundNonAscii(r, offset);
341 }
342
343 // do 32 characters again, possibly overlapping with the loop above
344 adjustToEnd();
345 auto r = process32Chars(dst - Step, src - Step);
346 return maybeFoundNonAscii(r, -Step);
347 }
348 }
349
350 constexpr qsizetype Step = 16;
351 if (end - src >= Step) {
352 qptrdiff offset = 0;
353 for ( ; offset + Step < end - src; offset += Step) {
354 ushort n = process16Chars(dst + offset, src + offset);
355 if (n)
356 return maybeFoundNonAscii(n, offset);
357 if (Cpu & CpuFeatureAVX2)
358 break; // we can only ever loop once because of the code above
359 }
360
361 // do one chunk again, possibly overlapping with the loop above
362 adjustToEnd();
363 return maybeFoundNonAscii(process16Chars(dst - Step, src - Step), -Step);
364 }
365
366# if !defined(__OPTIMIZE_SIZE__)
367 if (end - src >= 8) {
368 __m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src));
369 __m128i data2 = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(end - 8));
370 uint n = _mm_movemask_epi8(a: data) & 0xff;
371 // store everything, even mojibake
372 _mm_storeu_si128(p: reinterpret_cast<__m128i *>(dst), b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
373 if (n)
374 return maybeFoundNonAscii(n);
375
376 // do one chunk again, possibly overlapping the above
377 adjustToEnd();
378 n = _mm_movemask_epi8(a: data2) & 0xff;
379 data2 = _mm_unpacklo_epi8(a: data2, b: _mm_setzero_si128());
380 _mm_storeu_si128(p: reinterpret_cast<__m128i *>(dst - 8), b: data2);
381 return maybeFoundNonAscii(n, -8);
382 }
383 if (end - src >= 4) {
384 __m128i data = _mm_cvtsi32_si128(a: qFromUnaligned<quint32>(src));
385 __m128i data2 = _mm_cvtsi32_si128(a: qFromUnaligned<quint32>(src: end - 4));
386 uchar n = uchar(_mm_movemask_epi8(a: data) & 0xf);
387 // store everything, even mojibake
388 data = _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128());
389 _mm_storel_epi64(p: reinterpret_cast<__m128i *>(dst), a: data);
390 if (n)
391 return maybeFoundNonAscii(n);
392
393 // do one chunk again, possibly overlapping the above
394 adjustToEnd();
395 n = uchar(_mm_movemask_epi8(a: data2) & 0xf);
396 data2 = _mm_unpacklo_epi8(a: data2, b: _mm_setzero_si128());
397 _mm_storel_epi64(p: reinterpret_cast<__m128i *>(dst - 4), a: data2);
398 return maybeFoundNonAscii(n, -4);
399 }
400#endif
401
402 return src == end;
403}
404
405static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
406{
407#ifdef __AVX2__
408 // do 32 characters at a time
409 // (this is similar to simdTestMask in qstring.cpp)
410 const __m256i mask = _mm256_set1_epi8(char(0x80));
411 for ( ; end - src >= 32; src += 32) {
412 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
413 if (_mm256_testz_si256(mask, data))
414 continue;
415
416 uint n = _mm256_movemask_epi8(data);
417 Q_ASSERT(n);
418
419 // find the next probable ASCII character
420 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
421 // characters still coming
422 nextAscii = src + qBitScanReverse(n) + 1;
423
424 // return the non-ASCII character
425 return src + qCountTrailingZeroBits(n);
426 }
427#endif
428
429 // do sixteen characters at a time
430 for ( ; end - src >= 16; src += 16) {
431 __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i*>(src));
432
433 // check if everything is ASCII
434 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
435 uint n = _mm_movemask_epi8(a: data);
436 if (!n)
437 continue;
438
439 // find the next probable ASCII character
440 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
441 // characters still coming
442 nextAscii = src + qBitScanReverse(v: n) + 1;
443
444 // return the non-ASCII character
445 return src + qCountTrailingZeroBits(v: n);
446 }
447
448 // do four characters at a time
449 for ( ; end - src >= 4; src += 4) {
450 quint32 data = qFromUnaligned<quint32>(src);
451 data &= 0x80808080U;
452 if (!data)
453 continue;
454
455 // We don't try to guess which of the three bytes is ASCII and which
456 // one isn't. The chance that at least two of them are non-ASCII is
457 // better than 75%.
458 nextAscii = src;
459 return src;
460 }
461 nextAscii = end;
462 return src;
463}
464
465// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
466// and advance src8 and src16 to the first character that could not be compared
467static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const char16_t *&src16, const char16_t *end16)
468{
469 int bitSpacing = 1;
470 qptrdiff len = qMin(a: end8 - src8, b: end16 - src16);
471 qptrdiff offset = 0;
472 uint mask = 0;
473
474 // do sixteen characters at a time
475 for ( ; offset + 16 < len; offset += 16) {
476 __m128i data8 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src8 + offset));
477#ifdef __AVX2__
478 // AVX2 version, use 256-bit registers and VPMOVXZBW
479 __m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));
480
481 // expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
482 __m256i datax8 = _mm256_cvtepu8_epi16(data8);
483 mask = _mm256_movemask_epi8(datax8);
484 if (mask)
485 break;
486
487 // compare Latin1 to UTF-16
488 __m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
489 mask = ~_mm256_movemask_epi8(latin1cmp);
490 if (mask)
491 break;
492#else
493 // non-AVX2 code
494 __m128i datalo16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset));
495 __m128i datahi16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset) + 1);
496
497 // expand US-ASCII as if it were Latin1, we'll confirm later
498 __m128i datalo8 = _mm_unpacklo_epi8(a: data8, b: _mm_setzero_si128());
499 __m128i datahi8 = _mm_unpackhi_epi8(a: data8, b: _mm_setzero_si128());
500
501 // compare Latin1 to UTF-16
502 __m128i latin1cmplo = _mm_cmpeq_epi16(a: datalo8, b: datalo16);
503 __m128i latin1cmphi = _mm_cmpeq_epi16(a: datahi8, b: datahi16);
504 mask = _mm_movemask_epi8(a: latin1cmphi) << 16;
505 mask |= ushort(_mm_movemask_epi8(a: latin1cmplo));
506 mask = ~mask;
507 if (mask)
508 break;
509
510 // confirm it was US-ASCII
511 mask = _mm_movemask_epi8(a: data8);
512 if (mask) {
513 bitSpacing = 0;
514 break;
515 }
516#endif
517 }
518
519 // helper for comparing 4 or 8 characters
520 auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
521 // n = 4 -> sizemask = 0xff
522 // n = 8 -> sizemask = 0xffff
523 unsigned sizemask = (1U << (2 * n)) - 1;
524
525 // expand as if Latin1
526 data8 = _mm_unpacklo_epi8(a: data8, b: _mm_setzero_si128());
527
528 // compare and confirm it's US-ASCII
529 __m128i latin1cmp = _mm_cmpeq_epi16(a: data8, b: data16);
530 mask = ~_mm_movemask_epi8(a: latin1cmp) & sizemask;
531 mask |= _mm_movemask_epi8(a: data8);
532 if (mask == 0)
533 offset += n;
534 };
535
536 // do eight characters at a time
537 if (mask == 0 && offset + 8 < len) {
538 __m128i data8 = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src8 + offset));
539 __m128i data16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset));
540 cmp_lt_16(8, data8, data16);
541 }
542
543 // do four characters
544 if (mask == 0 && offset + 4 < len) {
545 __m128i data8 = _mm_cvtsi32_si128(a: qFromUnaligned<quint32>(src: src8 + offset));
546 __m128i data16 = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src16 + offset));
547 cmp_lt_16(4, data8, data16);
548 }
549
550 // correct the source pointers to point to the first character we couldn't deal with
551 if (mask)
552 offset += qCountTrailingZeroBits(v: mask) >> bitSpacing;
553 src8 += offset;
554 src16 += offset;
555}
556#elif defined(__ARM_NEON__)
557static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
558{
559 uint16x8_t maxAscii = vdupq_n_u16(0x7f);
560 uint16x8_t mask1 = qvsetq_n_u16(1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 );
561 uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
562
563 // do sixteen characters at a time
564 for ( ; end - src >= 16; src += 16, dst += 16) {
565 // load 2 lanes (or: "load interleaved")
566 uint16x8x2_t in = vld2q_u16(reinterpret_cast<const uint16_t *>(src));
567
568 // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
569 // add those together into a scalar, and merge the scalars.
570 uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
571 | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
572
573 // merge the two lanes by shifting the values of the second by 8 and inserting them
574 uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
575
576 // store, even if there are non-ASCII characters here
577 vst1q_u8(dst, vreinterpretq_u8_u16(out));
578
579 if (nonAscii) {
580 // find the next probable ASCII character
581 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
582 // characters still coming
583 nextAscii = src + qBitScanReverse(nonAscii) + 1;
584
585 nonAscii = qCountTrailingZeroBits(nonAscii);
586 dst += nonAscii;
587 src += nonAscii;
588 return false;
589 }
590 }
591 return src == end;
592}
593
594static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
595{
596 // do eight characters at a time
597 uint8x8_t msb_mask = vdup_n_u8(0x80);
598 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 );
599 for ( ; end - src >= 8; src += 8, dst += 8) {
600 uint8x8_t c = vld1_u8(src);
601 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
602 if (!n) {
603 // store
604 vst1q_u16(reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
605 continue;
606 }
607
608 // copy the front part that is still ASCII
609 while (!(n & 1)) {
610 *dst++ = *src++;
611 n >>= 1;
612 }
613
614 // find the next probable ASCII character
615 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
616 // characters still coming
617 n = qBitScanReverse(n);
618 nextAscii = src + n + 1;
619 return false;
620
621 }
622 return src == end;
623}
624
625static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
626{
627 // The SIMD code below is untested, so just force an early return until
628 // we've had the time to verify it works.
629 nextAscii = end;
630 return src;
631
632 // do eight characters at a time
633 uint8x8_t msb_mask = vdup_n_u8(0x80);
634 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7);
635 for ( ; end - src >= 8; src += 8) {
636 uint8x8_t c = vld1_u8(src);
637 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
638 if (!n)
639 continue;
640
641 // find the next probable ASCII character
642 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
643 // characters still coming
644 nextAscii = src + qBitScanReverse(n) + 1;
645
646 // return the non-ASCII character
647 return src + qCountTrailingZeroBits(n);
648 }
649 nextAscii = end;
650 return src;
651}
652
// NEON build: no vectorised comparison is implemented. The pointers are left
// untouched, i.e. no characters are reported as already compared.
static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
{
}
656#else
// Build without SSE2 or NEON: nothing is converted. The arguments are taken
// by value and ignored; the function always returns false, which makes the
// caller encode the input with its scalar loop.
static inline bool simdEncodeAscii(uchar *, const char16_t *, const char16_t *, const char16_t *)
{
    return false;
}
661
// Build without SSE2 or NEON: nothing is converted. The arguments are taken
// by value and ignored; the function always returns false, which makes the
// caller decode the input with its scalar loop.
static inline bool simdDecodeAscii(char16_t *, const uchar *, const uchar *, const uchar *)
{
    return false;
}
666
// Build without SSE2 or NEON: no scanning is done. Returns src unchanged and
// sets nextAscii to end.
static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
{
    nextAscii = end;
    return src;
}
672
// Build without SSE2 or NEON: no vectorised comparison is implemented. The
// pointers are left untouched, i.e. no characters are reported as compared.
static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
{
}
676#endif
677
// Bit in QStringConverter::State::internalState. The UTF-8 encoder sets it
// after writing the BOM; the UTF-8 decoder tests it to decide whether the
// start-of-stream (BOM) handling is still pending.
enum { HeaderDone = 1 };
679
680template <typename OnErrorLambda> Q_ALWAYS_INLINE
681char *QUtf8::convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept
682{
683 qsizetype len = in.size();
684
685 uchar *dst = reinterpret_cast<uchar *>(out);
686 const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
687 const char16_t *const end = src + len;
688
689 while (src != end) {
690 const char16_t *nextAscii = end;
691 if (simdEncodeAscii(dst, nextAscii, src, end))
692 break;
693
694 do {
695 char16_t u = *src++;
696 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
697 if (Q_UNLIKELY(res < 0))
698 onError(dst, u, res);
699 } while (src < nextAscii);
700 }
701
702 return reinterpret_cast<char *>(dst);
703}
704
705char *QUtf8::convertFromUnicode(char *dst, QStringView in) noexcept
706{
707 return convertFromUnicode(out: dst, in, onError: [](auto *dst, ...) {
708 // encoding error - append '?'
709 *dst++ = '?';
710 });
711}
712
713QByteArray QUtf8::convertFromUnicode(QStringView in)
714{
715 qsizetype len = in.size();
716
717 // create a QByteArray with the worst case scenario size
718 QByteArray result(len * 3, Qt::Uninitialized);
719 char *dst = const_cast<char *>(result.constData());
720 dst = convertFromUnicode(dst, in);
721 result.truncate(pos: dst - result.constData());
722 return result;
723}
724
725QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverter::State *state)
726{
727 QByteArray ba(3*in.size() +3, Qt::Uninitialized);
728 char *end = convertFromUnicode(out: ba.data(), in, state);
729 ba.truncate(pos: end - ba.data());
730 return ba;
731}
732
733char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state)
734{
735 Q_ASSERT(state);
736 qsizetype len = in.size();
737 if (!len)
738 return out;
739
740 auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
741 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
742 *cursor++ = 0;
743 } else {
744 // QChar::replacement encoded in utf8
745 *cursor++ = 0xef;
746 *cursor++ = 0xbf;
747 *cursor++ = 0xbd;
748 }
749 return cursor;
750 };
751
752 uchar *cursor = reinterpret_cast<uchar *>(out);
753 const char16_t *src = in.utf16();
754 const char16_t *const end = src + len;
755
756 if (!(state->flags & QStringDecoder::Flag::Stateless)) {
757 if (state->remainingChars) {
758 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: state->state_data[0], dst&: cursor, src, end);
759 if (res < 0)
760 cursor = appendReplacementChar(cursor);
761 state->state_data[0] = 0;
762 state->remainingChars = 0;
763 } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
764 // append UTF-8 BOM
765 *cursor++ = utf8bom[0];
766 *cursor++ = utf8bom[1];
767 *cursor++ = utf8bom[2];
768 state->internalState |= HeaderDone;
769 }
770 }
771
772 out = reinterpret_cast<char *>(cursor);
773 return convertFromUnicode(out, in: { src, end }, onError: [&](uchar *&cursor, char16_t uc, int res) {
774 if (res == QUtf8BaseTraits::Error) {
775 // encoding error
776 ++state->invalidChars;
777 cursor = appendReplacementChar(cursor);
778 } else if (res == QUtf8BaseTraits::EndOfString) {
779 if (state->flags & QStringConverter::Flag::Stateless) {
780 ++state->invalidChars;
781 cursor = appendReplacementChar(cursor);
782 } else {
783 state->remainingChars = 1;
784 state->state_data[0] = uc;
785 }
786 }
787 });
788}
789
790char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in)
791{
792 // ### SIMD-optimize:
793 for (uchar ch : in) {
794 if (ch < 128) {
795 *out++ = ch;
796 } else {
797 // as per https://en.wikipedia.org/wiki/UTF-8#Encoding, 2nd row
798 *out++ = 0b110'0'0000u | (ch >> 6);
799 *out++ = 0b10'00'0000u | (ch & 0b0011'1111);
800 }
801 }
802 return out;
803}
804
805QString QUtf8::convertToUnicode(QByteArrayView in)
806{
807 // UTF-8 to UTF-16 always needs the exact same number of words or less:
808 // UTF-8 UTF-16
809 // 1 byte 1 word
810 // 2 bytes 1 word
811 // 3 bytes 1 word
812 // 4 bytes 2 words (one surrogate pair)
813 // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
814 // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
815 // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
816 //
817 // The table holds for invalid sequences too: we'll insert one replacement char
818 // per invalid byte.
819 QString result(in.size(), Qt::Uninitialized);
820 QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
821 const QChar *end = convertToUnicode(buffer: data, in);
822 result.truncate(pos: end - data);
823 return result;
824}
825
826/*! \internal
827 \since 6.6
828 \overload
829
830 Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of
831 QChar starting at \a dst in the destination buffer. The buffer is expected
832 to be large enough to hold the result. An upper bound for the size of the
833 buffer is \c in.size() QChars.
834
835 If, during decoding, an error occurs, a QChar::ReplacementCharacter is
836 written.
837
838 Returns a pointer to one past the last QChar written.
839
840 This function never throws.
841
842 For QChar buffers, instead of casting manually, you can use the static
843 QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
844*/
845char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
846{
847 // check if have to skip a BOM
848 auto bom = QByteArrayView::fromArray(data: utf8bom);
849 if (in.size() >= bom.size() && in.first(n: bom.size()) == bom)
850 in.slice(pos: sizeof(utf8bom));
851
852 return convertToUnicode(dst, in, onError: [](char16_t *&dst, ...) {
853 // decoding error
854 *dst++ = QChar::ReplacementCharacter;
855 return true; // continue decoding
856 });
857}
858
859template <typename OnErrorLambda> Q_ALWAYS_INLINE char16_t *
860QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept
861{
862 const uchar *const start = reinterpret_cast<const uchar *>(in.data());
863 const uchar *src = start;
864 const uchar *end = src + in.size();
865
866 // attempt to do a full decoding in SIMD
867 const uchar *nextAscii = end;
868 while (src < end) {
869 nextAscii = end;
870 if (simdDecodeAscii(dst, nextAscii, src, end))
871 break;
872
873 do {
874 uchar b = *src++;
875 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
876 if (Q_LIKELY(res >= 0))
877 continue;
878 // decoding error
879 if (!onError(dst, src, res))
880 return dst;
881 } while (src < nextAscii);
882 }
883
884 return dst;
885}
886
887QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
888{
889 // See above for buffer requirements for stateless decoding. However, that
890 // fails if the state is not empty. The following situations can add to the
891 // requirements:
892 // state contains chars starts with requirement
893 // 1 of 2 bytes valid continuation 0
894 // 2 of 3 bytes same 0
895 // 3 bytes of 4 same +1 (need to insert surrogate pair)
896 // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
897 // 2 of 3 bytes same +1 (same)
898 // 3 of 4 bytes same +1 (same)
899 QString result(in.size() + 1, Qt::Uninitialized);
900 QChar *end = convertToUnicode(out: result.data(), in, state);
901 result.truncate(pos: end - result.constData());
902 return result;
903}
904
905char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state)
906{
907 qsizetype len = in.size();
908
909 Q_ASSERT(state);
910 if (!len)
911 return dst;
912
913
914 char16_t replacement = QChar::ReplacementCharacter;
915 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
916 replacement = QChar::Null;
917
918 qsizetype res;
919
920 const uchar *src = reinterpret_cast<const uchar *>(in.data());
921 const uchar *end = src + len;
922
923 if (!(state->flags & QStringConverter::Flag::Stateless)) {
924 bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
925 if (state->remainingChars || !headerdone) {
926 // handle incoming state first
927 uchar remainingCharsData[4]; // longest UTF-8 sequence possible
928 qsizetype remainingCharsCount = state->remainingChars;
929 qsizetype newCharsToCopy = qMin<qsizetype>(a: sizeof(remainingCharsData) - remainingCharsCount, b: end - src);
930
931 memset(s: remainingCharsData, c: 0, n: sizeof(remainingCharsData));
932 memcpy(dest: remainingCharsData, src: &state->state_data[0], n: remainingCharsCount);
933 memcpy(dest: remainingCharsData + remainingCharsCount, src: src, n: newCharsToCopy);
934
935 const uchar *begin = &remainingCharsData[1];
936 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: remainingCharsData[0], dst, src&: begin,
937 end: static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
938 if (res == QUtf8BaseTraits::Error) {
939 ++state->invalidChars;
940 *dst++ = replacement;
941 ++src;
942 } else if (res == QUtf8BaseTraits::EndOfString) {
943 // if we got EndOfString again, then there were too few bytes in src;
944 // copy to our state and return
945 state->remainingChars = remainingCharsCount + newCharsToCopy;
946 memcpy(dest: &state->state_data[0], src: remainingCharsData, n: state->remainingChars);
947 return dst;
948 } else if (!headerdone) {
949 // eat the UTF-8 BOM
950 if (dst[-1] == 0xfeff)
951 --dst;
952 }
953 state->internalState |= HeaderDone;
954
955 // adjust src now that we have maybe consumed a few chars
956 if (res >= 0) {
957 Q_ASSERT(res > remainingCharsCount);
958 src += res - remainingCharsCount;
959 }
960 }
961 } else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
962 // stateless, remove initial BOM
963 if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
964 // skip BOM
965 src += 3;
966 }
967
968 // main body, stateless decoding
969 res = 0;
970 dst = convertToUnicode(dst, in: { src, end }, onError: [&](char16_t *&dst, const uchar *src_, int res_) {
971 res = res_;
972 src = src_;
973 if (res == QUtf8BaseTraits::Error) {
974 res = 0;
975 ++state->invalidChars;
976 *dst++ = replacement;
977 }
978 return res == 0; // continue if plain decoding error
979 });
980
981 if (res == QUtf8BaseTraits::EndOfString) {
982 // unterminated UTF sequence
983 if (state->flags & QStringConverter::Flag::Stateless) {
984 *dst++ = QChar::ReplacementCharacter;
985 ++state->invalidChars;
986 while (src++ < end) {
987 *dst++ = QChar::ReplacementCharacter;
988 ++state->invalidChars;
989 }
990 state->remainingChars = 0;
991 } else {
992 --src; // unread the byte in ch
993 state->remainingChars = end - src;
994 memcpy(dest: &state->state_data[0], src: src, n: end - src);
995 }
996 } else {
997 state->remainingChars = 0;
998 }
999
1000 return dst;
1001}
1002
1003struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
1004{
1005 struct NoOutput {};
1006 static void appendUtf16(const NoOutput &, char16_t) {}
1007 static void appendUcs4(const NoOutput &, char32_t) {}
1008};
1009
1010QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
1011{
1012 const uchar *src = reinterpret_cast<const uchar *>(in.data());
1013 const uchar *end = src + in.size();
1014 const uchar *nextAscii = src;
1015 bool isValidAscii = true;
1016
1017 while (src < end) {
1018 if (src >= nextAscii)
1019 src = simdFindNonAscii(src, end, nextAscii);
1020 if (src == end)
1021 break;
1022
1023 do {
1024 uchar b = *src++;
1025 if ((b & 0x80) == 0)
1026 continue;
1027
1028 isValidAscii = false;
1029 QUtf8NoOutputTraits::NoOutput output;
1030 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, dst&: output, src, end);
1031 if (res < 0) {
1032 // decoding error
1033 return { .isValidUtf8: false, .isValidAscii: false };
1034 }
1035 } while (src < nextAscii);
1036 }
1037
1038 return { .isValidUtf8: true, .isValidAscii: isValidAscii };
1039}
1040
1041int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs) noexcept
1042{
1043 auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
1044 auto end1 = src1 + utf8.size();
1045 auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
1046 auto end2 = src2 + utf16.size();
1047
1048 do {
1049 simdCompareAscii(src8&: src1, end8: end1, src16&: src2, end16: end2);
1050
1051 if (src1 < end1 && src2 < end2) {
1052 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src&: src1, end: end1);
1053 char32_t uc2 = *src2++;
1054
1055 if (uc1 >= 0x80) {
1056 // Only decode the UTF-16 surrogate pair if the UTF-8 code point
1057 // wasn't US-ASCII (a surrogate cannot match US-ASCII).
1058 if (QChar::isHighSurrogate(ucs4: uc2) && src2 < end2 && QChar::isLowSurrogate(ucs4: *src2))
1059 uc2 = QChar::surrogateToUcs4(high: uc2, low: *src2++);
1060 }
1061 if (cs == Qt::CaseInsensitive) {
1062 uc1 = QChar::toCaseFolded(ucs4: uc1);
1063 uc2 = QChar::toCaseFolded(ucs4: uc2);
1064 }
1065 if (uc1 != uc2)
1066 return int(uc1) - int(uc2);
1067 }
1068 } while (src1 < end1 && src2 < end2);
1069
1070 // the shorter string sorts first
1071 return (end1 > src1) - int(end2 > src2);
1072}
1073
1074int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs)
1075{
1076 auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
1077 auto end1 = src1 + utf8.size();
1078 auto src2 = reinterpret_cast<const uchar *>(s.latin1());
1079 auto end2 = src2 + s.size();
1080
1081 while (src1 < end1 && src2 < end2) {
1082 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src&: src1, end: end1);
1083 char32_t uc2 = *src2++;
1084 if (cs == Qt::CaseInsensitive) {
1085 uc1 = QChar::toCaseFolded(ucs4: uc1);
1086 uc2 = QChar::toCaseFolded(ucs4: uc2);
1087 }
1088 if (uc1 != uc2)
1089 return int(uc1) - int(uc2);
1090 }
1091
1092 // the shorter string sorts first
1093 return (end1 > src1) - (end2 > src2);
1094}
1095
1096int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs) noexcept
1097{
1098 if (lhs.isEmpty())
1099 return qt_lencmp(lhs: 0, rhs: rhs.size());
1100
1101 if (rhs.isEmpty())
1102 return qt_lencmp(lhs: lhs.size(), rhs: 0);
1103
1104 if (cs == Qt::CaseSensitive) {
1105 const auto l = std::min(a: lhs.size(), b: rhs.size());
1106 int r = memcmp(s1: lhs.data(), s2: rhs.data(), n: l);
1107 return r ? r : qt_lencmp(lhs: lhs.size(), rhs: rhs.size());
1108 }
1109
1110 auto src1 = reinterpret_cast<const qchar8_t *>(lhs.data());
1111 auto end1 = src1 + lhs.size();
1112 auto src2 = reinterpret_cast<const qchar8_t *>(rhs.data());
1113 auto end2 = src2 + rhs.size();
1114
1115 while (src1 < end1 && src2 < end2) {
1116 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src&: src1, end: end1);
1117 char32_t uc2 = QUtf8Functions::nextUcs4FromUtf8(src&: src2, end: end2);
1118
1119 uc1 = QChar::toCaseFolded(ucs4: uc1);
1120 uc2 = QChar::toCaseFolded(ucs4: uc2);
1121 if (uc1 != uc2)
1122 return int(uc1) - int(uc2);
1123 }
1124
1125 // the shorter string sorts first
1126 return (end1 > src1) - (end2 > src2);
1127}
1128
1129#ifndef QT_BOOTSTRAPPED
1130QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
1131{
1132 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1133 qsizetype length = 2 * in.size();
1134 if (writeBom)
1135 length += 2;
1136
1137 QByteArray d(length, Qt::Uninitialized);
1138 char *end = convertFromUnicode(out: d.data(), in, state, endian);
1139 Q_ASSERT(end - d.constData() == d.size());
1140 Q_UNUSED(end);
1141 return d;
1142}
1143
1144char *QUtf16::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
1145{
1146 Q_ASSERT(state);
1147 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1148
1149 if (endian == DetectEndianness)
1150 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1151
1152 if (writeBom) {
1153 // set them up the BOM
1154 QChar bom(QChar::ByteOrderMark);
1155 if (endian == BigEndianness)
1156 qToBigEndian(src: bom.unicode(), dest: out);
1157 else
1158 qToLittleEndian(src: bom.unicode(), dest: out);
1159 out += 2;
1160 }
1161 if (endian == BigEndianness)
1162 qToBigEndian<char16_t>(source: in.data(), count: in.size(), dest: out);
1163 else
1164 qToLittleEndian<char16_t>(source: in.data(), count: in.size(), dest: out);
1165
1166 state->remainingChars = 0;
1167 state->internalState |= HeaderDone;
1168 return out + 2*in.size();
1169}
1170
1171QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1172{
1173 QString result((in.size() + 1) >> 1, Qt::Uninitialized); // worst case
1174 QChar *qch = convertToUnicode(out: result.data(), in, state, endian);
1175 result.truncate(pos: qch - result.constData());
1176 return result;
1177}
1178
1179QChar *QUtf16::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1180{
1181 qsizetype len = in.size();
1182 const char *chars = in.data();
1183
1184 Q_ASSERT(state);
1185
1186 if (endian == DetectEndianness)
1187 endian = (DataEndianness)state->state_data[Endian];
1188
1189 const char *end = chars + len;
1190
1191 // make sure we can decode at least one char
1192 if (state->remainingChars + len < 2) {
1193 if (len) {
1194 Q_ASSERT(state->remainingChars == 0 && len == 1);
1195 state->remainingChars = 1;
1196 state->state_data[Data] = *chars;
1197 }
1198 return out;
1199 }
1200
1201 bool headerdone = state && state->internalState & HeaderDone;
1202 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1203 headerdone = true;
1204
1205 if (!headerdone || state->remainingChars) {
1206 uchar buf;
1207 if (state->remainingChars)
1208 buf = state->state_data[Data];
1209 else
1210 buf = *chars++;
1211
1212 // detect BOM, set endianness
1213 state->internalState |= HeaderDone;
1214 QChar ch(buf, *chars++);
1215 if (endian == DetectEndianness) {
1216 // someone set us up the BOM
1217 if (ch == QChar::ByteOrderSwapped) {
1218 endian = BigEndianness;
1219 } else if (ch == QChar::ByteOrderMark) {
1220 endian = LittleEndianness;
1221 } else {
1222 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1223 endian = BigEndianness;
1224 } else {
1225 endian = LittleEndianness;
1226 }
1227 }
1228 }
1229 if (endian == BigEndianness)
1230 ch = QChar::fromUcs2(c: (ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
1231 if (headerdone || ch != QChar::ByteOrderMark)
1232 *out++ = ch;
1233 } else if (endian == DetectEndianness) {
1234 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1235 }
1236
1237 qsizetype nPairs = (end - chars) >> 1;
1238 if (endian == BigEndianness)
1239 qFromBigEndian<char16_t>(source: chars, count: nPairs, dest: out);
1240 else
1241 qFromLittleEndian<char16_t>(source: chars, count: nPairs, dest: out);
1242 out += nPairs;
1243
1244 state->state_data[Endian] = endian;
1245 state->remainingChars = 0;
1246 if ((end - chars) & 1) {
1247 if (state->flags & QStringConverter::Flag::Stateless) {
1248 *out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
1249 } else {
1250 state->remainingChars = 1;
1251 state->state_data[Data] = *(end - 1);
1252 }
1253 } else {
1254 state->state_data[Data] = 0;
1255 }
1256
1257 return out;
1258}
1259
1260QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
1261{
1262 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1263 qsizetype length = 4*in.size();
1264 if (writeBom)
1265 length += 4;
1266 QByteArray ba(length, Qt::Uninitialized);
1267 char *end = convertFromUnicode(out: ba.data(), in, state, endian);
1268 ba.truncate(pos: end - ba.constData());
1269 return ba;
1270}
1271
1272char *QUtf32::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
1273{
1274 Q_ASSERT(state);
1275
1276 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1277 if (endian == DetectEndianness)
1278 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1279
1280 if (writeBom) {
1281 // set them up the BOM
1282 if (endian == BigEndianness) {
1283 out[0] = 0;
1284 out[1] = 0;
1285 out[2] = (char)0xfe;
1286 out[3] = (char)0xff;
1287 } else {
1288 out[0] = (char)0xff;
1289 out[1] = (char)0xfe;
1290 out[2] = 0;
1291 out[3] = 0;
1292 }
1293 out += 4;
1294 state->internalState |= HeaderDone;
1295 }
1296
1297 const QChar *uc = in.data();
1298 const QChar *end = in.data() + in.size();
1299 QChar ch;
1300 char32_t ucs4;
1301 if (state->remainingChars == 1) {
1302 auto character = state->state_data[Data];
1303 Q_ASSERT(character <= 0xFFFF);
1304 ch = QChar(character);
1305 // this is ugly, but shortcuts a whole lot of logic that would otherwise be required
1306 state->remainingChars = 0;
1307 goto decode_surrogate;
1308 }
1309
1310 while (uc < end) {
1311 ch = *uc++;
1312 if (Q_LIKELY(!ch.isSurrogate())) {
1313 ucs4 = ch.unicode();
1314 } else if (Q_LIKELY(ch.isHighSurrogate())) {
1315decode_surrogate:
1316 if (uc == end) {
1317 if (state->flags & QStringConverter::Flag::Stateless) {
1318 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1319 } else {
1320 state->remainingChars = 1;
1321 state->state_data[Data] = ch.unicode();
1322 return out;
1323 }
1324 } else if (uc->isLowSurrogate()) {
1325 ucs4 = QChar::surrogateToUcs4(high: ch, low: *uc++);
1326 } else {
1327 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1328 }
1329 } else {
1330 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1331 }
1332 if (endian == BigEndianness)
1333 qToBigEndian(src: ucs4, dest: out);
1334 else
1335 qToLittleEndian(src: ucs4, dest: out);
1336 out += 4;
1337 }
1338
1339 return out;
1340}
1341
1342QString QUtf32::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1343{
1344 QString result;
1345 result.resize(size: (in.size() + 7) >> 1); // worst case
1346 QChar *end = convertToUnicode(out: result.data(), in, state, endian);
1347 result.truncate(pos: end - result.constData());
1348 return result;
1349}
1350
1351QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1352{
1353 qsizetype len = in.size();
1354 const char *chars = in.data();
1355
1356 Q_ASSERT(state);
1357 if (endian == DetectEndianness)
1358 endian = (DataEndianness)state->state_data[Endian];
1359
1360 const char *end = chars + len;
1361
1362 uchar tuple[4];
1363 memcpy(dest: tuple, src: &state->state_data[Data], n: 4);
1364
1365 // make sure we can decode at least one char
1366 if (state->remainingChars + len < 4) {
1367 if (len) {
1368 while (chars < end) {
1369 tuple[state->remainingChars] = *chars;
1370 ++state->remainingChars;
1371 ++chars;
1372 }
1373 Q_ASSERT(state->remainingChars < 4);
1374 memcpy(dest: &state->state_data[Data], src: tuple, n: 4);
1375 }
1376 return out;
1377 }
1378
1379 bool headerdone = state->internalState & HeaderDone;
1380 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1381 headerdone = true;
1382
1383 qsizetype num = state->remainingChars;
1384 state->remainingChars = 0;
1385
1386 if (!headerdone || endian == DetectEndianness || num) {
1387 while (num < 4)
1388 tuple[num++] = *chars++;
1389 if (endian == DetectEndianness) {
1390 // someone set us up the BOM?
1391 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
1392 endian = LittleEndianness;
1393 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
1394 endian = BigEndianness;
1395 } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1396 endian = BigEndianness;
1397 } else {
1398 endian = LittleEndianness;
1399 }
1400 }
1401 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(src: tuple) : qFromLittleEndian<char32_t>(src: tuple);
1402 if (headerdone || code != QChar::ByteOrderMark) {
1403 if (QChar::requiresSurrogates(ucs4: code)) {
1404 *out++ = QChar(QChar::highSurrogate(ucs4: code));
1405 *out++ = QChar(QChar::lowSurrogate(ucs4: code));
1406 } else {
1407 *out++ = QChar(code);
1408 }
1409 }
1410 num = 0;
1411 } else if (endian == DetectEndianness) {
1412 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1413 }
1414 state->state_data[Endian] = endian;
1415 state->internalState |= HeaderDone;
1416
1417 while (chars < end) {
1418 tuple[num++] = *chars++;
1419 if (num == 4) {
1420 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(src: tuple) : qFromLittleEndian<char32_t>(src: tuple);
1421 for (char16_t c : QChar::fromUcs4(c: code))
1422 *out++ = c;
1423 num = 0;
1424 }
1425 }
1426
1427 if (num) {
1428 if (state->flags & QStringDecoder::Flag::Stateless) {
1429 *out++ = QChar::ReplacementCharacter;
1430 } else {
1431 state->state_data[Endian] = endian;
1432 state->remainingChars = num;
1433 memcpy(dest: &state->state_data[Data], src: tuple, n: 4);
1434 }
1435 }
1436
1437 return out;
1438}
1439#endif // !QT_BOOTSTRAPPED
1440
1441#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
1442int QLocal8Bit::checkUtf8()
1443{
1444 return GetACP() == CP_UTF8 ? 1 : -1;
1445}
1446
1447QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
1448{
1449 return convertToUnicode_sys(in, CP_ACP, state);
1450}
1451
1452QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
1453 QStringConverter::State *state)
1454{
1455 const char *mb = in.data();
1456 qsizetype mblen = in.size();
1457
1458 Q_ASSERT(state);
1459 qsizetype &invalidChars = state->invalidChars;
1460 using Flag = QStringConverter::Flag;
1461 const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull);
1462 const char16_t replacementCharacter = useNullForReplacement ? QChar::Null
1463 : QChar::ReplacementCharacter;
1464 if (state->flags & Flag::Stateless) {
1465 Q_ASSERT(state->remainingChars == 0);
1466 state = nullptr;
1467 }
1468
1469 if (!mb || !mblen)
1470 return QString();
1471
1472 // Use a local stack-buffer at first to allow us a decently large container
1473 // to avoid a lot of resizing, without also returning an overallocated
1474 // QString to the user for small strings.
1475 // Then we can be fast for small strings and take the hit of extra resizes
1476 // and measuring how much storage is needed for large strings.
1477 std::array<wchar_t, 4096> buf;
1478 wchar_t *out = buf.data();
1479 qsizetype outlen = buf.size();
1480
1481 QString sp;
1482
1483 // Return a pointer to storage where we have enough space for `size`
1484 const auto growOut = [&](qsizetype size) -> std::tuple<wchar_t *, qsizetype> {
1485 if (outlen >= size)
1486 return {out, outlen};
1487 const bool wasStackBuffer = sp.isEmpty();
1488 const auto begin = wasStackBuffer ? buf.data() : reinterpret_cast<wchar_t *>(sp.data());
1489 const qsizetype offset = qsizetype(std::distance(begin, out));
1490 qsizetype newSize = 0;
1491 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1492 Q_CHECK_PTR(false);
1493 return {nullptr, 0};
1494 }
1495 sp.resize(newSize);
1496 auto it = reinterpret_cast<wchar_t *>(sp.data());
1497 if (wasStackBuffer)
1498 it = std::copy_n(buf.data(), offset, it);
1499 else
1500 it += offset;
1501 return {it, size};
1502 };
1503
1504 // Convert the pending characters (if available)
1505 while (state && state->remainingChars && mblen) {
1506 QStringConverter::State localState;
1507 localState.flags = state->flags;
1508 // Use at most 6 characters as a guess for the longest encoded character
1509 // in any multibyte encoding.
1510 // Even with a total of 2 bytes of overhead that would leave around
1511 // 2^(4 * 8) possible characters
1512 std::array<char, 6> prev = {0};
1513 Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
1514 qsizetype index = 0;
1515 for (; index < state->remainingChars; ++index)
1516 prev[index] = state->state_data[index];
1517 const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen);
1518 for (qsizetype i = 0; i < toCopy; ++i, ++index)
1519 prev[index] = mb[i];
1520 mb += toCopy;
1521 mblen -= toCopy;
1522
1523 // Recursing:
1524 // Since we are using a clean local state it will try to decode what was
1525 // stored in our state + some extra octets from input (`prev`). If some
1526 // part fails we will have those characters stored in the local state's
1527 // storage, and we can extract those. It may also output some
1528 // replacement characters, which we'll count in the invalidChars.
1529 // In the best case we only do this once, but we will loop until we have
1530 // resolved all the remaining characters or we have run out of new input
1531 // in which case we may still have remaining characters.
1532 const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage,
1533 &localState);
1534 std::tie(out, outlen) = growOut(tmp.size());
1535 if (!out)
1536 return {};
1537 out = std::copy_n(reinterpret_cast<const wchar_t *>(tmp.constData()), tmp.size(), out);
1538 outlen -= tmp.size();
1539 const qsizetype tail = toCopy - localState.remainingChars;
1540 if (tail >= 0) {
1541 // Everything left to process comes from `in`, so we can stop
1542 // looping. Adjust the window for `in` and unset remainingChars to
1543 // signal that we're done.
1544 mb -= localState.remainingChars;
1545 mblen += localState.remainingChars;
1546 localState.remainingChars = 0;
1547 }
1548 state->remainingChars = localState.remainingChars;
1549 state->invalidChars += localState.invalidChars;
1550 std::copy_n(localState.state_data, state->remainingChars, state->state_data);
1551 }
1552
1553 Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0);
1554
1555 // Need it in this scope, since we try to decrease our window size if we
1556 // encounter an error
1557 int nextIn = q26::saturate_cast<int>(mblen);
1558 while (mblen > 0) {
1559 std::tie(out, outlen) = growOut(1); // Need space for at least one character
1560 if (!out)
1561 return {};
1562 const int nextOut = q26::saturate_cast<int>(outlen);
1563 int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut);
1564 if (len) {
1565 mb += nextIn;
1566 mblen -= nextIn;
1567 out += len;
1568 outlen -= len;
1569 } else {
1570 int r = GetLastError();
1571 if (r == ERROR_INSUFFICIENT_BUFFER) {
1572 const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
1573 std::tie(out, outlen) = growOut(wclen);
1574 if (!out)
1575 return {};
1576 } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
1577 // Can't decode the current window, so either store the state,
1578 // reduce window size or output a replacement character.
1579
1580 // Check if we can store all remaining characters in the state
1581 // to be used next time we're called:
1582 if (state && mblen <= q20::ssize(state->state_data)) {
1583 state->remainingChars = mblen;
1584 std::copy_n(mb, mblen, state->state_data);
1585 mb += mblen;
1586 mblen = 0;
1587 break;
1588 }
1589
1590 // .. if not, try to find the last valid character in the window
1591 // and try again with a shrunken window:
1592 if (nextIn > 1) {
1593 // There may be some incomplete data at the end of our current
1594 // window, so decrease the window size and try again.
1595 // In the worst case scenario there is gigs of undecodable
1596 // garbage, but what are we supposed to do about that?
1597 const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
1598 if (it != mb)
1599 nextIn = int(it - mb);
1600 else
1601 --nextIn;
1602 continue;
1603 }
1604
1605 // Finally, we are forced to output a replacement character for
1606 // the first byte in the window:
1607 std::tie(out, outlen) = growOut(1);
1608 if (!out)
1609 return {};
1610 *out = replacementCharacter;
1611 ++invalidChars;
1612 ++out;
1613 --outlen;
1614 ++mb;
1615 --mblen;
1616 } else {
1617 // Fail.
1618 qWarning("MultiByteToWideChar: Cannot convert multibyte text");
1619 break;
1620 }
1621 }
1622 nextIn = q26::saturate_cast<int>(mblen);
1623 }
1624
1625 if (sp.isEmpty()) {
1626 // We must have only used the stack buffer
1627 if (out != buf.data()) // else: we return null-string
1628 sp = QStringView(buf.data(), out).toString();
1629 } else{
1630 const auto begin = reinterpret_cast<wchar_t *>(sp.data());
1631 sp.truncate(std::distance(begin, out));
1632 }
1633
1634 if (sp.size() && sp.back().isNull())
1635 sp.chop(1);
1636
1637 if (!state && mblen > 0) {
1638 // We have trailing character(s) that could not be converted, and
1639 // nowhere to cache them
1640 sp.resize(sp.size() + mblen, replacementCharacter);
1641 invalidChars += mblen;
1642 }
1643 return sp;
1644}
1645
1646QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
1647{
1648 return convertFromUnicode_sys(in, CP_ACP, state);
1649}
1650
1651QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
1652 QStringConverter::State *state)
1653{
1654 const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data());
1655 qsizetype uclen = in.size();
1656
1657 Q_ASSERT(state);
1658 // The Windows API has a *boolean* out-parameter that says if a replacement
1659 // character was used, but it gives us no way to know _how many_ were used.
1660 // Since we cannot simply scan the string for replacement characters
1661 // (which is potentially a question mark, and thus a valid character),
1662 // we simply do not track the number of invalid characters here.
1663 // auto &invalidChars = state->invalidChars;
1664
1665 using Flag = QStringConverter::Flag;
1666 if (state->flags & Flag::Stateless) { // temporary
1667 Q_ASSERT(state->remainingChars == 0);
1668 state = nullptr;
1669 }
1670
1671 if (!ch)
1672 return QByteArray();
1673 if (uclen == 0)
1674 return QByteArray("");
1675
1676 // Use a local stack-buffer at first to allow us a decently large container
1677 // to avoid a lot of resizing, without also returning an overallocated
1678 // QByteArray to the user for small strings.
1679 // Then we can be fast for small strings and take the hit of extra resizes
1680 // and measuring how much storage is needed for large strings.
1681 std::array<char, 4096> buf;
1682 char *out = buf.data();
1683 qsizetype outlen = buf.size();
1684 QByteArray mb;
1685
1686 if (state && state->remainingChars > 0) {
1687 Q_ASSERT(state->remainingChars == 1);
1688 // Let's try to decode the pending character
1689 wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] };
1690 // Check if the second character is a valid low surrogate,
1691 // otherwise we'll just decode the first character, for which windows
1692 // will output a replacement character.
1693 const bool validCodePoint = QChar::isLowSurrogate(wc[1]);
1694 int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 2 : 1, out, outlen, nullptr,
1695 nullptr);
1696 if (!len)
1697 return {}; // Cannot recover, and I refuse to believe it was a size limitation
1698 out += len;
1699 outlen -= len;
1700 if (validCodePoint) {
1701 ++ch;
1702 --uclen;
1703 }
1704 state->remainingChars = 0;
1705 state->state_data[0] = 0;
1706 if (uclen == 0)
1707 return QByteArrayView(buf.data(), len).toByteArray();
1708 }
1709
1710 if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
1711 // We can handle a missing low surrogate at the end of the string,
1712 // so if there is one, exclude it now and store it in the state.
1713 state->remainingChars = 1;
1714 state->state_data[0] = ch[uclen - 1];
1715 --uclen;
1716 if (uclen == 0)
1717 return QByteArray();
1718 }
1719
1720 Q_ASSERT(uclen > 0);
1721
1722 // Return a pointer to storage where we have enough space for `size`
1723 const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> {
1724 if (outlen >= size)
1725 return {out, outlen};
1726 const bool wasStackBuffer = mb.isEmpty();
1727 const auto begin = wasStackBuffer ? buf.data() : mb.data();
1728 const qsizetype offset = qsizetype(std::distance(begin, out));
1729 qsizetype newSize = 0;
1730 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1731 Q_CHECK_PTR(false);
1732 return {nullptr, 0};
1733 }
1734 mb.resize(newSize);
1735 auto it = mb.data();
1736 if (wasStackBuffer)
1737 it = std::copy_n(buf.data(), offset, it);
1738 else
1739 it += offset;
1740 return {it, size};
1741 };
1742
1743 const auto getNextWindowSize = [&]() {
1744 int nextIn = q26::saturate_cast<int>(uclen);
1745 // The Windows API has some issues if the current window ends in the
1746 // middle of a surrogate pair, so we avoid that:
1747 if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1]))
1748 --nextIn;
1749 return nextIn;
1750 };
1751
1752 int len = 0;
1753 while (uclen > 0) {
1754 const int nextIn = getNextWindowSize();
1755 std::tie(out, outlen) = growOut(1); // We need at least one byte
1756 if (!out)
1757 return {};
1758 const int nextOut = q26::saturate_cast<int>(outlen);
1759 len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr);
1760 if (len > 0) {
1761 ch += nextIn;
1762 uclen -= nextIn;
1763 out += len;
1764 outlen -= len;
1765 } else {
1766 int r = GetLastError();
1767 if (r == ERROR_INSUFFICIENT_BUFFER) {
1768 int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0,
1769 nullptr, nullptr);
1770 if (neededLength <= 0) {
1771 // Fail. Observed with UTF8 where the input window was max int and ended in an
1772 // incomplete sequence, probably a Windows bug. We try to avoid that from
1773 // happening by reducing the window size in that case. But let's keep this
1774 // branch just in case of other bugs.
1775#ifndef QT_NO_DEBUG
1776 r = GetLastError();
1777 fprintf(stderr,
1778 "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
1779#endif // !QT_NO_DEBUG
1780 break;
1781 }
1782 std::tie(out, outlen) = growOut(neededLength);
1783 if (!out)
1784 return {};
1785 // and try again...
1786 } else {
1787 // Fail. Probably can't happen in fact (dwFlags is 0).
1788#ifndef QT_NO_DEBUG
1789 // Can't use qWarning(), as it'll recurse to handle %ls
1790 fprintf(stderr,
1791 "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
1792 r, qt_castToWchar(QStringView(ch, uclen).left(100).toString()));
1793#endif
1794 break;
1795 }
1796 }
1797 }
1798 if (mb.isEmpty()) {
1799 // We must have only used the stack buffer
1800 if (out != buf.data()) // else: we return null-array
1801 mb = QByteArrayView(buf.data(), out).toByteArray();
1802 } else {
1803 mb.truncate(std::distance(mb.data(), out));
1804 }
1805 return mb;
1806}
1807#endif
1808
1809void QStringConverter::State::clear() noexcept
1810{
1811 if (clearFn)
1812 clearFn(this);
1813 else
1814 state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
1815 remainingChars = 0;
1816 invalidChars = 0;
1817 internalState = 0;
1818}
1819
1820void QStringConverter::State::reset() noexcept
1821{
1822 if (flags & Flag::UsesIcu) {
1823#if defined(QT_USE_ICU_CODECS)
1824 QT_COM_THREAD_INIT
1825 UConverter *converter = static_cast<UConverter *>(d[0]);
1826 if (converter)
1827 ucnv_reset(converter);
1828#else
1829 Q_UNREACHABLE();
1830#endif
1831 } else {
1832 clear();
1833 }
1834}
1835
1836#ifndef QT_BOOTSTRAPPED
1837static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
1838{
1839 return QUtf16::convertToUnicode(out, in, state, endian: DetectEndianness);
1840}
1841
1842static char *toUtf16(char *out, QStringView in, QStringConverter::State *state)
1843{
1844 return QUtf16::convertFromUnicode(out, in, state, endian: DetectEndianness);
1845}
1846
1847static QChar *fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1848{
1849 return QUtf16::convertToUnicode(out, in, state, endian: BigEndianness);
1850}
1851
1852static char *toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
1853{
1854 return QUtf16::convertFromUnicode(out, in, state, endian: BigEndianness);
1855}
1856
1857static QChar *fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1858{
1859 return QUtf16::convertToUnicode(out, in, state, endian: LittleEndianness);
1860}
1861
1862static char *toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
1863{
1864 return QUtf16::convertFromUnicode(out, in, state, endian: LittleEndianness);
1865}
1866
1867static QChar *fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
1868{
1869 return QUtf32::convertToUnicode(out, in, state, endian: DetectEndianness);
1870}
1871
1872static char *toUtf32(char *out, QStringView in, QStringConverter::State *state)
1873{
1874 return QUtf32::convertFromUnicode(out, in, state, endian: DetectEndianness);
1875}
1876
1877static QChar *fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1878{
1879 return QUtf32::convertToUnicode(out, in, state, endian: BigEndianness);
1880}
1881
1882static char *toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
1883{
1884 return QUtf32::convertFromUnicode(out, in, state, endian: BigEndianness);
1885}
1886
1887static QChar *fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1888{
1889 return QUtf32::convertToUnicode(out, in, state, endian: LittleEndianness);
1890}
1891
1892static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
1893{
1894 return QUtf32::convertFromUnicode(out, in, state, endian: LittleEndianness);
1895}
1896#endif // !QT_BOOTSTRAPPED
1897
1898char *QLatin1::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept
1899{
1900 Q_ASSERT(state);
1901 if (state->flags & QStringConverter::Flag::Stateless) // temporary
1902 state = nullptr;
1903
1904 const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
1905 qsizetype invalid = 0;
1906 for (qsizetype i = 0; i < in.size(); ++i) {
1907 if (in[i] > QChar(0xff)) {
1908 *out = replacement;
1909 ++invalid;
1910 } else {
1911 *out = (char)in[i].cell();
1912 }
1913 ++out;
1914 }
1915 if (state)
1916 state->invalidChars += invalid;
1917 return out;
1918}
1919
1920static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
1921{
1922 QString s = QLocal8Bit::convertToUnicode(in, state);
1923 memcpy(dest: out, src: s.constData(), n: s.size()*sizeof(QChar));
1924 return out + s.size();
1925}
1926
1927static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
1928{
1929 QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
1930 memcpy(dest: out, src: s.constData(), n: s.size());
1931 return out + s.size();
1932}
1933
1934
1935static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
1936static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
1937
1938#ifndef QT_BOOTSTRAPPED
1939static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
1940static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
1941
1942static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
1943static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
1944#endif
1945
1946static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
1947static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
1948
1949
1950
1951/*!
1952 \class QStringConverter
1953 \inmodule QtCore
1954 \brief The QStringConverter class provides a base class for encoding and decoding text.
1955 \reentrant
1956 \ingroup i18n
1957 \ingroup string-processing
1958
1959 Qt uses UTF-16 to store, draw and manipulate strings. In many
1960 situations you may wish to deal with data that uses a different
1961 encoding. Most text data transferred over files and network connections is encoded
1962 in UTF-8.
1963
1964 The QStringConverter class is a base class for the \l {QStringEncoder} and
1965 \l {QStringDecoder} classes that help with converting between different
1966 text encodings. QStringDecoder can decode a string from an encoded representation
1967 into UTF-16, the format Qt uses internally. QStringEncoder does the opposite
1968 operation, encoding UTF-16 encoded data (usually in the form of a QString) to
1969 the requested encoding.
1970
1971 The following encodings are always supported:
1972
1973 \list
1974 \li UTF-8
1975 \li UTF-16
1976 \li UTF-16BE
1977 \li UTF-16LE
1978 \li UTF-32
1979 \li UTF-32BE
1980 \li UTF-32LE
1981 \li ISO-8859-1 (Latin-1)
1982 \li The system encoding
1983 \endlist
1984
1985 QStringConverter may support more encodings depending on how Qt was
1986 compiled. If more codecs are supported, they can be listed using
1987 availableCodecs().
1988
1989 \l {QStringConverter}s can be used as follows to convert some encoded
1990 string to and from UTF-16.
1991
1992 Suppose you have some string encoded in UTF-8, and
1993 want to convert it to a QString. The simple way
1994 to do it is to use a \l {QStringDecoder} like this:
1995
1996 \snippet code/src_corelib_text_qstringconverter.cpp 0
1997
1998 After this, \c string holds the text in decoded form.
1999 Converting a string from Unicode to the local encoding is just as
2000 easy using the \l {QStringEncoder} class:
2001
2002 \snippet code/src_corelib_text_qstringconverter.cpp 1
2003
2004 To read or write text files in various encodings, use QTextStream and
2005 its \l{QTextStream::setEncoding()}{setEncoding()} function.
2006
2007 Some care must be taken when trying to convert the data in chunks,
2008 for example, when receiving it over a network. In such cases it is
2009 possible that a multi-byte character will be split over two
2010 chunks. At best this might result in the loss of a character and
2011 at worst cause the entire conversion to fail.
2012
2013 Both QStringEncoder and QStringDecoder make this easy, by tracking
2014 this in an internal state. So simply calling the encoder or decoder
2015 again with the next chunk of data will automatically continue encoding
2016 or decoding the data correctly:
2017
2018 \snippet code/src_corelib_text_qstringconverter.cpp 2
2019
2020 The QStringDecoder object maintains state between chunks and therefore
2021 works correctly even if a multi-byte character is split between
2022 chunks.
2023
2024 QStringConverter objects can't be copied because of their internal state, but
2025 can be moved.
2026
2027 \sa QTextStream, QStringDecoder, QStringEncoder
2028*/
2029
2030/*!
2031 \enum QStringConverter::Flag
2032
2033 \value Default Default conversion rules apply.
2034 \value ConvertInvalidToNull If this flag is set, each invalid input
2035 character is output as a null character. If it is not set,
2036 invalid input characters are represented as QChar::ReplacementCharacter
2037 if the output encoding can represent that character, otherwise as a question mark.
2038 \value WriteBom When converting from a QString to an output encoding, write a QChar::ByteOrderMark as the first
2039 character if the output encoding supports this. This is the case for UTF-8, UTF-16 and UTF-32
2040 encodings.
    \value ConvertInitialBom When converting from an input encoding to a QString, the QStringDecoder usually skips a
              leading QChar::ByteOrderMark. When this flag is set, the byte order mark will not be
              skipped, but converted to UTF-16 and inserted at the start of the created QString.
2044 \value Stateless Ignore possible converter states between different function calls
2045 to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
2046 sequence of data is encountered.
2047 \omitvalue UsesIcu
2048*/
2049
2050/*!
2051 \enum QStringConverter::Encoding
2052 \value Utf8 Create a converter to or from UTF-8
2053 \value Utf16 Create a converter to or from UTF-16. When decoding, the byte order will get automatically
2054 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
2055 be assumed.
2056 \value Utf16BE Create a converter to or from big-endian UTF-16.
2057 \value Utf16LE Create a converter to or from little-endian UTF-16.
2058 \value Utf32 Create a converter to or from UTF-32. When decoding, the byte order will get automatically
2059 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
2060 be assumed.
2061 \value Utf32BE Create a converter to or from big-endian UTF-32.
2062 \value Utf32LE Create a converter to or from little-endian UTF-32.
2063 \value Latin1 Create a converter to or from ISO-8859-1 (Latin1).
    \value System Create a converter to or from the underlying encoding of the
           operating system's locale. This is always assumed to be UTF-8 for Unix-based
           systems. On Windows, this converts to and from the locale code page.
2067 \omitvalue LastEncoding
2068*/
2069
2070/*!
2071 \struct QStringConverter::Interface
2072 \internal
2073*/
2074
2075const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
2076{
2077 { .name: "UTF-8", .toUtf16: QUtf8::convertToUnicode, .toUtf16Len: fromUtf8Len, .fromUtf16: QUtf8::convertFromUnicode, .fromUtf16Len: toUtf8Len },
2078#ifndef QT_BOOTSTRAPPED
2079 { .name: "UTF-16", .toUtf16: fromUtf16, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16, .fromUtf16Len: toUtf16Len },
2080 { .name: "UTF-16LE", .toUtf16: fromUtf16LE, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16LE, .fromUtf16Len: toUtf16Len },
2081 { .name: "UTF-16BE", .toUtf16: fromUtf16BE, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16BE, .fromUtf16Len: toUtf16Len },
2082 { .name: "UTF-32", .toUtf16: fromUtf32, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32, .fromUtf16Len: toUtf32Len },
2083 { .name: "UTF-32LE", .toUtf16: fromUtf32LE, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32LE, .fromUtf16Len: toUtf32Len },
2084 { .name: "UTF-32BE", .toUtf16: fromUtf32BE, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32BE, .fromUtf16Len: toUtf32Len },
2085#endif
2086 { .name: "ISO-8859-1", .toUtf16: QLatin1::convertToUnicode, .toUtf16Len: fromLatin1Len, .fromUtf16: QLatin1::convertFromUnicode, .fromUtf16Len: toLatin1Len },
2087 { .name: "Locale", .toUtf16: fromLocal8Bit, .toUtf16Len: fromUtf8Len, .fromUtf16: toLocal8Bit, .fromUtf16Len: toUtf8Len }
2088};
2089
2090// match names case insensitive and skipping '-' and '_'
2091template <typename Char>
2092static bool nameMatch_impl_impl(const char *a, const Char *b, const Char *b_end)
2093{
2094 do {
2095 while (*a == '-' || *a == '_')
2096 ++a;
2097 while (b != b_end && (*b == Char{'-'} || *b == Char{'_'}))
2098 ++b;
2099 if (!*a && b == b_end) // end of both strings
2100 return true;
2101 if (char16_t(*b) > 127)
2102 return false; // non-US-ASCII cannot match US-ASCII (prevents narrowing below)
2103 } while (QtMiscUtils::toAsciiLower(ch: *a++) == QtMiscUtils::toAsciiLower(ch: char(*b++)));
2104
2105 return false;
2106}
2107
2108static bool nameMatch_impl(const char *a, QLatin1StringView b)
2109{
2110 return nameMatch_impl_impl(a, b: b.begin(), b_end: b.end());
2111}
2112
2113static bool nameMatch_impl(const char *a, QUtf8StringView b)
2114{
2115 return nameMatch_impl(a, b: QLatin1StringView{QByteArrayView{b}});
2116}
2117
2118static bool nameMatch_impl(const char *a, QStringView b)
2119{
2120 return nameMatch_impl_impl(a, b: b.utf16(), b_end: b.utf16() + b.size()); // uses char16_t*, not QChar*
2121}
2122
2123static bool nameMatch(const char *a, QAnyStringView b)
2124{
2125 return b.visit(v: [a](auto b) { return nameMatch_impl(a, b); });
2126}
2127
2128
2129/*!
2130 \fn constexpr QStringConverter::QStringConverter()
2131 \internal
2132*/
2133
2134/*!
2135 \fn constexpr QStringConverter::QStringConverter(Encoding, Flags)
2136 \internal
2137*/
2138
2139
2140#if defined(QT_USE_ICU_CODECS)
2141// only derives from QStringConverter to get access to protected types
2142struct QStringConverterICU : QStringConverter
2143{
2144 static void clear_function(QStringConverter::State *state) noexcept
2145 {
2146 QT_COM_THREAD_INIT
2147 ucnv_close(converter: static_cast<UConverter *>(state->d[0]));
2148 state->d[0] = nullptr;
2149 }
2150
2151 static void ensureConverter(QStringConverter::State *state)
2152 {
2153 // old code might reset the state via clear instead of reset
2154 // in that case, the converter has been closed, and we have to reopen it
2155 if (state->d[0] == nullptr)
2156 state->d[0] = createConverterForName(name: static_cast<const char *>(state->d[1]), state);
2157 }
2158
2159 static QChar *toUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
2160 {
2161 QT_COM_THREAD_INIT
2162 ensureConverter(state);
2163
2164 auto icu_conv = static_cast<UConverter *>(state->d[0]);
2165 UErrorCode err = U_ZERO_ERROR;
2166 auto source = in.data();
2167 auto sourceLimit = in.data() + in.size();
2168
2169 qsizetype length = toLen(inLength: in.size());
2170
2171 UChar *target = reinterpret_cast<UChar *>(out);
2172 auto targetLimit = target + length;
2173 // We explicitly clean up anyway, so no need to set flush to true,
2174 // which would just reset the converter.
2175 UBool flush = false;
2176
2177 // If the QStringConverter was moved, the state that we used as a context is stale now.
2178 UConverterToUCallback action;
2179 const void *context;
2180 ucnv_getToUCallBack(converter: icu_conv, action: &action, context: &context);
2181 if (context != state)
2182 ucnv_setToUCallBack(converter: icu_conv, newAction: action, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &err);
2183
2184 ucnv_toUnicode(converter: icu_conv, target: &target, targetLimit, source: &source, sourceLimit, offsets: nullptr, flush, err: &err);
2185 // We did reserve enough space:
2186 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
2187 if (state->flags.testFlag(flag: QStringConverter::Flag::Stateless)) {
2188 if (auto leftOver = ucnv_toUCountPending(cnv: icu_conv, status: &err)) {
2189 ucnv_reset(converter: icu_conv);
2190 state->invalidChars += leftOver;
2191 }
2192 }
2193 return reinterpret_cast<QChar *>(target);
2194 }
2195
2196 static char *fromUtf16(char *out, QStringView in, QStringConverter::State *state)
2197 {
2198 QT_COM_THREAD_INIT
2199 ensureConverter(state);
2200 auto icu_conv = static_cast<UConverter *>(state->d[0]);
2201 UErrorCode err = U_ZERO_ERROR;
2202 auto source = reinterpret_cast<const UChar *>(in.data());
2203 auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size());
2204
2205 qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv));
2206
2207 char *target = out;
2208 char *targetLimit = out + length;
2209 UBool flush = false;
2210
2211 // If the QStringConverter was moved, the state that we used as a context is stale now.
2212 UConverterFromUCallback action;
2213 const void *context;
2214 ucnv_getFromUCallBack(converter: icu_conv, action: &action, context: &context);
2215 if (context != state)
2216 ucnv_setFromUCallBack(converter: icu_conv, newAction: action, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &err);
2217
2218 ucnv_fromUnicode(converter: icu_conv, target: &target, targetLimit, source: &source, sourceLimit, offsets: nullptr, flush, err: &err);
2219 // We did reserve enough space:
2220 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
2221 if (state->flags.testFlag(flag: QStringConverter::Flag::Stateless)) {
2222 if (auto leftOver = ucnv_fromUCountPending(cnv: icu_conv, status: &err)) {
2223 ucnv_reset(converter: icu_conv);
2224 state->invalidChars += leftOver;
2225 }
2226 }
2227 return target;
2228 }
2229
2230 Q_DISABLE_COPY_MOVE(QStringConverterICU)
2231
2232 template<qsizetype X>
2233 static qsizetype fromLen(qsizetype inLength)
2234 {
2235 return X * inLength * sizeof(UChar);
2236 }
2237
2238 static qsizetype toLen(qsizetype inLength)
2239 {
2240
2241 /* Assumption: each input char might map to a different codepoint
2242 Each codepoint can take up to 4 bytes == 2 QChar
2243 We can ignore reserving space for a BOM, as only UTF encodings use one
2244 and those are not handled by the ICU converter.
2245 */
2246 return 2 * inLength;
2247 }
2248
2249 static constexpr QStringConverter::Interface forLength[] = {
2250 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<1>},
2251 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<2>},
2252 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<3>},
2253 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<4>},
2254 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<5>},
2255 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<6>},
2256 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<7>},
2257 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<8>}
2258 };
2259
2260 static UConverter *createConverterForName(const char *name, const State *state)
2261 {
2262 Q_ASSERT(name);
2263 Q_ASSERT(state);
2264 QT_COM_THREAD_INIT
2265 UErrorCode status = U_ZERO_ERROR;
2266 UConverter *conv = ucnv_open(converterName: name, err: &status);
2267 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
2268 ucnv_close(converter: conv);
2269 return nullptr;
2270 }
2271
2272 if (state->flags.testFlag(flag: Flag::ConvertInvalidToNull)) {
2273 UErrorCode error = U_ZERO_ERROR;
2274
2275 auto nullToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
2276 const char *, int32_t length,
2277 UConverterCallbackReason reason, UErrorCode *err) {
2278 if (reason <= UCNV_IRREGULAR) {
2279 *err = U_ZERO_ERROR;
2280 UChar c = '\0';
2281 ucnv_cbToUWriteUChars(args: toUArgs, source: &c, length: 1, offsetIndex: 0, err);
2282 // Recover outer scope's state (which isn't const) from context:
2283 auto state = const_cast<State *>(static_cast<const State *>(context));
2284 state->invalidChars += length;
2285 }
2286 };
2287 ucnv_setToUCallBack(converter: conv, newAction: nullToSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2288
2289 auto nullFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
2290 const UChar *, int32_t length,
2291 UChar32, UConverterCallbackReason reason, UErrorCode *err) {
2292 if (reason <= UCNV_IRREGULAR) {
2293 *err = U_ZERO_ERROR;
2294 const UChar replacement[] = { 0 };
2295 const UChar *stringBegin = std::begin(arr: replacement);
2296 ucnv_cbFromUWriteUChars(args: fromUArgs, source: &stringBegin, sourceLimit: std::end(arr: replacement), offsetIndex: 0, err);
2297 // Recover outer scope's state (which isn't const) from context:
2298 auto state = const_cast<State *>(static_cast<const State *>(context));
2299 state->invalidChars += length;
2300 }
2301 };
2302 ucnv_setFromUCallBack(converter: conv, newAction: nullFromSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2303 } else {
2304 UErrorCode error = U_ZERO_ERROR;
2305
2306 auto qmarkToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
2307 const char *codeUnits,int32_t length,
2308 UConverterCallbackReason reason, UErrorCode *err) {
2309 if (reason <= UCNV_IRREGULAR) {
2310 // Recover outer scope's state (which isn't const) from context:
2311 auto state = const_cast<State *>(static_cast<const State *>(context));
2312 state->invalidChars += length;
2313 }
2314 // use existing ICU callback for logic
2315 UCNV_TO_U_CALLBACK_SUBSTITUTE(context: nullptr, toUArgs, codeUnits, length, reason, err);
2316
2317 };
2318 ucnv_setToUCallBack(converter: conv, newAction: qmarkToSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2319
2320 auto qmarkFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
2321 const UChar *codeUnits, int32_t length,
2322 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
2323 if (reason <= UCNV_IRREGULAR) {
2324 // Recover outer scope's state (which isn't const) from context:
2325 auto state = const_cast<State *>(static_cast<const State *>(context));
2326 state->invalidChars += length;
2327 }
2328 // use existing ICU callback for logic
2329 UCNV_FROM_U_CALLBACK_SUBSTITUTE(context: nullptr, fromUArgs, codeUnits, length,
2330 codePoint, reason, err);
2331 };
2332 ucnv_setFromUCallBack(converter: conv, newAction: qmarkFromSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2333 }
2334 return conv;
2335 }
2336
2337 static std::string nul_terminate_impl(QLatin1StringView name)
2338 { return name.isNull() ? std::string() : std::string{name.data(), size_t(name.size())}; }
2339
2340 static std::string nul_terminate_impl(QUtf8StringView name)
2341 { return nul_terminate_impl(name: QLatin1StringView{QByteArrayView{name}}); }
2342
2343 static std::string nul_terminate_impl(QStringView name)
2344 {
2345 std::string result;
2346 const auto convert = [&](char *p, size_t n) {
2347 const auto sz = QLatin1::convertFromUnicode(out: p, in: name) - p;
2348 Q_ASSERT(q20::cmp_less_equal(sz, n));
2349 return sz;
2350 };
2351#ifdef __cpp_lib_string_resize_and_overwrite
2352 result.resize_and_overwrite(size_t(name.size()), convert);
2353#else
2354 result.resize(n: size_t(name.size()));
2355 result.resize(n: convert(result.data(), result.size()));
2356#endif // __cpp_lib_string_resize_and_overwrite
2357 return result;
2358 }
2359
2360 static std::string nul_terminate(QAnyStringView name)
2361 { return name.visit(v: [](auto name) { return nul_terminate_impl(name); }); }
2362
2363 static const QStringConverter::Interface *
2364 make_icu_converter(QStringConverter::State *state, QAnyStringView name)
2365 { return make_icu_converter(state, name: nul_terminate(name).data()); }
2366
2367 static const QStringConverter::Interface *make_icu_converter(
2368 QStringConverter::State *state,
2369 const char *name)
2370 {
2371 QT_COM_THREAD_INIT
2372 UErrorCode status = U_ZERO_ERROR;
2373 UConverter *conv = createConverterForName(name, state);
2374 if (!conv)
2375 return nullptr;
2376
2377 const char *icuName = ucnv_getName(converter: conv, err: &status);
2378 // ucnv_getStandardName returns a name which is owned by the library
2379 // we can thus store it in the state without worrying aobut its lifetime
2380 const char *persistentName = ucnv_getStandardName(name: icuName, standard: "MIME", pErrorCode: &status);
2381 if (U_FAILURE(code: status) || !persistentName) {
2382 status = U_ZERO_ERROR;
2383 persistentName = ucnv_getStandardName(name: icuName, standard: "IANA", pErrorCode: &status);
2384 }
2385 state->d[1] = const_cast<char *>(persistentName);
2386 state->d[0] = conv;
2387 state->flags |= QStringConverter::Flag::UsesIcu;
2388 qsizetype maxCharSize = ucnv_getMaxCharSize(converter: conv);
2389 state->clearFn = QStringConverterICU::clear_function;
2390 if (maxCharSize > 8 || maxCharSize < 1) {
2391 qWarning(msg: "Encountered unexpected codec \"%s\" which requires >8x space", name);
2392 return nullptr;
2393 } else {
2394 return &forLength[maxCharSize - 1];
2395 }
2396
2397 }
2398
2399};
2400#endif
2401
2402/*!
2403 \internal
2404*/
2405QStringConverter::QStringConverter(QAnyStringView name, Flags f)
2406 : iface(nullptr), state(f)
2407{
2408 auto e = encodingForName(name);
2409 if (e)
2410 iface = encodingInterfaces + int(*e);
2411#if defined(QT_USE_ICU_CODECS)
2412 else
2413 iface = QStringConverterICU::make_icu_converter(state: &state, name);
2414#endif
2415}
2416
2417
2418const char *QStringConverter::name() const noexcept
2419{
2420 if (!iface)
2421 return nullptr;
2422 if (state.flags & QStringConverter::Flag::UsesIcu) {
2423#if defined(QT_USE_ICU_CODECS)
2424 return static_cast<const char*>(state.d[1]);
2425#else
2426 return nullptr;
2427#endif
2428 } else {
2429 return iface->name;
2430 }
2431}
2432
2433/*!
2434 \fn bool QStringConverter::isValid() const
2435
2436 Returns true if this is a valid string converter that can be used for encoding or
2437 decoding text.
2438
2439 Default constructed string converters or converters constructed with an unsupported
2440 name are not valid.
2441*/
2442
2443/*!
2444 \fn void QStringConverter::resetState()
2445
2446 Resets the internal state of the converter, clearing potential errors or partial
2447 conversions.
2448*/
2449
2450/*!
2451 \fn bool QStringConverter::hasError() const
2452
2453 Returns true if a conversion could not correctly convert a character. This could for example
2454 get triggered by an invalid UTF-8 sequence or when a character can't get converted due to
2455 limitations in the target encoding.
2456*/
2457
2458/*!
2459 \fn const char *QStringConverter::name() const
2460
2461 Returns the canonical name of the encoding this QStringConverter can encode or decode.
2462 Returns a nullptr if the converter is not valid.
2463 The returned name is UTF-8 encoded.
2464
2465 \sa isValid()
2466*/
2467
2468/*!
2469 Convert \a name to the corresponding \l Encoding member, if there is one.
2470
2471 If the \a name is not the name of a codec listed in the Encoding enumeration,
2472 \c{std::nullopt} is returned. Such a name may, none the less, be accepted by
2473 the QStringConverter constructor when Qt is built with ICU, if ICU provides a
2474 converter with the given name.
2475
2476 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
2477 which was expected to be UTF-8-encoded.
2478*/
2479std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(QAnyStringView name) noexcept
2480{
2481 if (name.isEmpty())
2482 return std::nullopt;
2483 for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
2484 if (nameMatch(a: encodingInterfaces[i].name, b: name))
2485 return QStringConverter::Encoding(i);
2486 }
2487 if (nameMatch(a: "latin1", b: name))
2488 return QStringConverter::Latin1;
2489 return std::nullopt;
2490}
2491
2492#ifndef QT_BOOTSTRAPPED
2493/*!
2494 Returns the encoding for the content of \a data if it can be determined.
2495 \a expectedFirstCharacter can be passed as an additional hint to help determine
2496 the encoding.
2497
2498 The returned optional is empty, if the encoding is unclear.
2499 */
2500std::optional<QStringConverter::Encoding>
2501QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter) noexcept
2502{
2503 // someone set us up the BOM?
2504 qsizetype arraySize = data.size();
2505 if (arraySize > 3) {
2506 char32_t uc = qFromUnaligned<char32_t>(src: data.data());
2507 if (uc == qToBigEndian(source: char32_t(QChar::ByteOrderMark)))
2508 return QStringConverter::Utf32BE;
2509 if (uc == qToLittleEndian(source: char32_t(QChar::ByteOrderMark)))
2510 return QStringConverter::Utf32LE;
2511 if (expectedFirstCharacter) {
2512 // catch also anything starting with the expected character
2513 if (qToLittleEndian(source: uc) == expectedFirstCharacter)
2514 return QStringConverter::Utf32LE;
2515 else if (qToBigEndian(source: uc) == expectedFirstCharacter)
2516 return QStringConverter::Utf32BE;
2517 }
2518 }
2519
2520 if (arraySize > 2) {
2521 if (memcmp(s1: data.data(), s2: utf8bom, n: sizeof(utf8bom)) == 0)
2522 return QStringConverter::Utf8;
2523 }
2524
2525 if (arraySize > 1) {
2526 char16_t uc = qFromUnaligned<char16_t>(src: data.data());
2527 if (uc == qToBigEndian(source: char16_t(QChar::ByteOrderMark)))
2528 return QStringConverter::Utf16BE;
2529 if (uc == qToLittleEndian(source: char16_t(QChar::ByteOrderMark)))
2530 return QStringConverter::Utf16LE;
2531 if (expectedFirstCharacter) {
2532 // catch also anything starting with the expected character
2533 if (qToLittleEndian(source: uc) == expectedFirstCharacter)
2534 return QStringConverter::Utf16LE;
2535 else if (qToBigEndian(source: uc) == expectedFirstCharacter)
2536 return QStringConverter::Utf16BE;
2537 }
2538 }
2539 return std::nullopt;
2540}
2541
2542static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
2543{
2544 static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher(pattern: "meta ");
2545 static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher(pattern: "charset=");
2546
2547 QByteArray header = data.first(n: qMin(a: data.size(), b: qsizetype(1024))).toByteArray().toLower();
2548 qsizetype pos = metaSearcher.indexIn(haystack: header);
2549 if (pos != -1) {
2550 pos = charsetSearcher.indexIn(haystack: header, from: pos);
2551 if (pos != -1) {
2552 pos += qstrlen(str: "charset=");
2553 if (pos < header.size() && (header.at(i: pos) == '\"' || header.at(i: pos) == '\''))
2554 ++pos;
2555
2556 qsizetype pos2 = pos;
2557 // The attribute can be closed with either """, "'", ">" or "/",
2558 // none of which are valid charset characters.
2559 while (++pos2 < header.size()) {
2560 char ch = header.at(i: pos2);
2561 if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
2562 QByteArray name = header.mid(index: pos, len: pos2 - pos);
2563 qsizetype colon = name.indexOf(c: ':');
2564 if (colon > 0)
2565 name = name.left(n: colon);
2566 name = name.simplified();
2567 if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
2568 name = QByteArrayLiteral("UTF-8");
2569 if (!name.isEmpty())
2570 return name;
2571 }
2572 }
2573 }
2574 }
2575 return QByteArray();
2576}
2577
2578/*!
2579 Tries to determine the encoding of the HTML in \a data by looking at leading byte
2580 order marks or a charset specifier in the HTML meta tag. If the optional is empty,
2581 the encoding specified is not supported by QStringConverter. If no encoding is
2582 detected, the method returns Utf8.
2583
2584 \sa QStringDecoder::decoderForHtml()
2585*/
2586std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
2587{
2588 // determine charset
2589 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2590 if (encoding)
2591 // trust the initial BOM
2592 return encoding;
2593
2594 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2595 if (!encodingTag.isEmpty())
2596 return encodingForName(name: encodingTag);
2597
2598 return Utf8;
2599}
2600
2601static qsizetype availableCodecCount()
2602{
2603#if !defined(QT_USE_ICU_CODECS)
2604 return QStringConverter::Encoding::LastEncoding;
2605#else
2606 QT_COM_THREAD_INIT
2607 /* icu contains also the names of what Qt provides
2608 except for the special Locale one (so add one for it)
2609 */
2610 return 1 + ucnv_countAvailable();
2611#endif
2612}
2613
2614/*!
2615 Returns a list of names of supported codecs. The names returned
2616 by this function can be passed to QStringEncoder's and
2617 QStringDecoder's constructor to create an en- or decoder for
2618 the given codec.
2619
2620 This function may be used to obtain a listing of additional codecs beyond
2621 the standard ones. Support for additional codecs requires Qt be compiled
2622 with support for the ICU library.
2623
2624 \note The order of codecs is an internal implementation detail
2625 and not guaranteed to be stable.
2626 */
2627QStringList QStringConverter::availableCodecs()
2628{
2629 auto availableCodec = [](qsizetype index) -> QString
2630 {
2631 #if !defined(QT_USE_ICU_CODECS)
2632 return QString::fromLatin1(encodingInterfaces[index].name);
2633 #else
2634 if (index == 0) // "Locale", not provided by icu
2635 return QString::fromLatin1(
2636 ba: encodingInterfaces[QStringConverter::Encoding::System].name);
2637 QT_COM_THREAD_INIT
2638 // this mirrors the setup we do to set a converters name
2639 UErrorCode status = U_ZERO_ERROR;
2640 auto icuName = ucnv_getAvailableName(n: int32_t(index - 1));
2641 const char *standardName = ucnv_getStandardName(name: icuName, standard: "MIME", pErrorCode: &status);
2642 if (U_FAILURE(code: status) || !standardName) {
2643 status = U_ZERO_ERROR;
2644 standardName = ucnv_getStandardName(name: icuName, standard: "IANA", pErrorCode: &status);
2645 }
2646 if (!standardName)
2647 standardName = icuName;
2648 return QString::fromLatin1(ba: standardName);
2649 #endif
2650 };
2651
2652 qsizetype codecCount = availableCodecCount();
2653 QStringList result;
2654 result.reserve(asize: codecCount);
2655 for (qsizetype i = 0; i < codecCount; ++i)
2656 result.push_back(t: availableCodec(i));
2657 return result;
2658}
2659
2660/*!
2661 Tries to determine the encoding of the HTML in \a data by looking at leading byte
2662 order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder
2663 matching the encoding. If the returned decoder is not valid,
2664 the encoding specified is not supported by QStringConverter. If no encoding is
2665 detected, the method returns a decoder for Utf8.
2666
2667 \sa isValid()
2668*/
2669QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data)
2670{
2671 // determine charset
2672 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2673 if (encoding)
2674 // trust the initial BOM
2675 return QStringDecoder(encoding.value());
2676
2677 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2678 if (!encodingTag.isEmpty())
2679 return QStringDecoder(encodingTag);
2680
2681 return QStringDecoder(Utf8);
2682}
2683#endif // !QT_BOOTSTRAPPED
2684
2685/*!
2686 Returns the canonical name for encoding \a e or \nullptr if \a e is an
2687 invalid value.
2688
2689 \note In Qt versions prior to 6.10, 6.9.1, 6.8.4 or 6.5.9, calling this
2690 function with an invalid argument resulted in undefined behavior. Since the
2691 above-mentioned Qt versions, it returns nullptr instead.
2692*/
2693const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e) noexcept
2694{
2695 auto i = size_t(e);
2696 if (Q_UNLIKELY(i >= std::size(encodingInterfaces)))
2697 return nullptr;
2698 return encodingInterfaces[i].name;
2699}
2700
2701/*!
2702 \class QStringEncoder
2703 \inmodule QtCore
2704 \brief The QStringEncoder class provides a state-based encoder for text.
2705 \reentrant
2706 \ingroup i18n
2707 \ingroup string-processing
2708
2709 A text encoder converts text from Qt's internal representation into an encoded
2710 text format using a specific encoding.
2711
2712 Converting a string from Unicode to the local encoding can be achieved
2713 using the following code:
2714
2715 \snippet code/src_corelib_text_qstringconverter.cpp 1
2716
2717 The encoder remembers any state that is required between calls, so converting
2718 data received in chunks, for example, when receiving it over a network, is just as
2719 easy, by calling the encoder whenever new data is available:
2720
2721 \snippet code/src_corelib_text_qstringconverter.cpp 3
2722
2723 The QStringEncoder object maintains state between chunks and therefore
2724 works correctly even if a UTF-16 surrogate character is split between
2725 chunks.
2726
2727 QStringEncoder objects can't be copied because of their internal state, but
2728 can be moved.
2729
2730 \sa QStringConverter, QStringDecoder
2731*/
2732
2733/*!
2734 \fn constexpr QStringEncoder::QStringEncoder(const Interface *i)
2735 \internal
2736*/
2737
2738/*!
2739 \fn constexpr QStringEncoder::QStringEncoder()
2740
2741 Default constructs an encoder. The default encoder is not valid,
2742 and can't be used for converting text.
2743*/
2744
2745/*!
2746 \fn constexpr QStringEncoder::QStringEncoder(Encoding encoding, Flags flags = Flag::Default)
2747
2748 Creates an encoder object using \a encoding and \a flags.
2749*/
2750
2751/*!
2752 \fn QStringEncoder::QStringEncoder(QAnyStringView name, Flags flags = Flag::Default)
2753
2754 Creates an encoder object using \a name and \a flags.
2755 If \a name is not the name of a known encoding an invalid converter will get created.
2756
2757 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
2758 which was expected to be UTF-8-encoded.
2759
2760 \sa isValid()
2761*/
2762
2763/*!
2764 \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::encode(const QString &in)
2765 \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::encode(QStringView in)
2766 \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::operator()(const QString &in)
2767 \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::operator()(QStringView in)
2768
2769 Converts \a in and returns a struct that is implicitly convertible to QByteArray.
2770
2771 \snippet code/src_corelib_text_qstringconverter.cpp 5
2772*/
2773
2774/*!
2775 \fn qsizetype QStringEncoder::requiredSpace(qsizetype inputLength) const
2776
2777 Returns the maximum amount of characters required to be able to process
2778 \a inputLength decoded data.
2779
2780 \sa appendToBuffer()
2781*/
2782
2783/*!
2784 \fn char *QStringEncoder::appendToBuffer(char *out, QStringView in)
2785
2786 Encodes \a in and writes the encoded result into the buffer
2787 starting at \a out. Returns a pointer to the end of the data written.
2788
2789 \note \a out must be large enough to be able to hold all the encoded data. Use
2790 requiredSpace() to determine the maximum size requirement to be able to encode
2791 \a in.
2792
2793 \sa requiredSpace()
2794*/
2795
2796/*!
2797 \class QStringDecoder
2798 \inmodule QtCore
2799 \brief The QStringDecoder class provides a state-based decoder for text.
2800 \reentrant
2801 \ingroup i18n
2802 \ingroup string-processing
2803
2804 A text decoder converts text from an encoded text format that uses a specific encoding
2805 into Qt's internal representation.
2806
2807 Converting encoded data into a QString can be achieved
2808 using the following code:
2809
2810 \snippet code/src_corelib_text_qstringconverter.cpp 0
2811
2812 The decoder remembers any state that is required between calls, so converting
2813 data received in chunks, for example, when receiving it over a network, is just as
2814 easy, by calling the decoder whenever new data is available:
2815
2816 \snippet code/src_corelib_text_qstringconverter.cpp 2
2817
2818 The QStringDecoder object maintains state between chunks and therefore
2819 works correctly even if chunks are split in the middle of a multi-byte character
2820 sequence.
2821
2822 QStringDecoder objects can't be copied because of their internal state, but
2823 can be moved.
2824
2825 \sa QStringConverter, QStringEncoder
2826*/
2827
2828/*!
2829 \fn constexpr QStringDecoder::QStringDecoder(const Interface *i)
2830 \internal
2831*/
2832
2833/*!
2834 \fn constexpr QStringDecoder::QStringDecoder()
2835
2836 Default constructs a decoder. The default decoder is not valid,
2837 and can't be used for converting text.
2838*/
2839
2840/*!
2841 \fn constexpr QStringDecoder::QStringDecoder(Encoding encoding, Flags flags = Flag::Default)
2842
2843 Creates a decoder object using \a encoding and \a flags.
2844*/
2845
2846/*!
2847 \fn QStringDecoder::QStringDecoder(QAnyStringView name, Flags flags = Flag::Default)
2848
2849 Creates a decoder object using \a name and \a flags.
2850 If \a name is not the name of a known encoding an invalid converter will get created.
2851
2852 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
2853 which was expected to be UTF-8-encoded.
2854
2855 \sa isValid()
2856*/
2857
2858/*!
2859 \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::operator()(const QByteArray &ba)
2860 \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::decode(const QByteArray &ba)
2861 \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::operator()(QByteArrayView ba)
2862 \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::decode(QByteArrayView ba)
2863
2864 Converts \a ba and returns a struct that is implicitly convertible to QString.
2865
2866
2867 \snippet code/src_corelib_text_qstringconverter.cpp 4
2868*/
2869
2870/*!
2871 \fn qsizetype QStringDecoder::requiredSpace(qsizetype inputLength) const
2872
2873 Returns the maximum amount of UTF-16 code units required to be able to process
2874 \a inputLength encoded data.
2875
2876 \sa appendToBuffer
2877*/
2878
2879/*!
2880 \fn QChar *QStringDecoder::appendToBuffer(QChar *out, QByteArrayView in)
2881
2882 Decodes the sequence of bytes viewed by \a in and writes the decoded result into
2883 the buffer starting at \a out. Returns a pointer to the end of data written.
2884
2885 \a out needs to be large enough to be able to hold all the decoded data. Use
2886 \l{requiredSpace} to determine the maximum size requirements to decode an encoded
2887 data buffer of \c in.size() bytes.
2888
2889 \sa requiredSpace
2890*/
2891
2892/*!
2893 \fn char16_t *QStringDecoder::appendToBuffer(char16_t *out, QByteArrayView in)
2894 \since 6.6
2895 \overload
2896*/
2897
2898QT_END_NAMESPACE
2899

source code of qtbase/src/corelib/text/qstringconverter.cpp