1// Copyright (C) 2020 The Qt Company Ltd.
2// Copyright (C) 2020 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#include <qstringconverter.h>
6#include <private/qstringconverter_p.h>
7#include "qendian.h"
8
9#include "private/qsimd_p.h"
10#include "private/qstringiterator_p.h"
11#include "private/qtools_p.h"
12#include "qbytearraymatcher.h"
13#include "qcontainertools_impl.h"
14#include <QtCore/qbytearraylist.h>
15
16#if QT_CONFIG(icu)
17#include <unicode/ucnv.h>
18#include <unicode/ucnv_cb.h>
19#include <unicode/ucnv_err.h>
20#include <unicode/ustring.h>
21#endif
22
23#ifdef Q_OS_WIN
24#include <qt_windows.h>
25#ifndef QT_BOOTSTRAPPED
26#include <QtCore/qvarlengtharray.h>
27#include <QtCore/q20iterator.h>
28#include <QtCore/q26numeric.h>
29#endif // !QT_BOOTSTRAPPED
30#endif
31
32#include <array>
33#if __has_include(<bit>) && __cplusplus > 201703L
34#include <bit>
35#endif
36#include <string>
37
38QT_BEGIN_NAMESPACE
39
40using namespace QtMiscUtils;
41
42static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
43static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
44static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
45static_assert(std::is_nothrow_move_assignable_v<QStringDecoder>);
46
// Slot indices into QStringConverter::State::state_data as used by the UTF-16
// decoder further down: slot Endian is read back as the byte order, slot Data
// parks a single input byte that could not be decoded yet.
enum {
    Endian = 0,
    Data = 1
};
48
49static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
50
51#if defined(__SSE2__) || defined(__ARM_NEON__)
52static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
53{
54#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
55 return std::bit_width(v) - 1;
56#else
57 uint result = qCountLeadingZeroBits(v);
58 // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
59 // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
60 // counting up: msb index is 0 (because it starts there), and the lsb index is 31.
61 result ^= sizeof(unsigned) * 8 - 1;
62 return result;
63#endif
64}
65#endif
66
67#if defined(__SSE2__)
68static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
69{
70 // do sixteen characters at a time
71 for ( ; end - src >= 16; src += 16, dst += 16) {
72# ifdef __AVX2__
73 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
74 __m128i data1 = _mm256_castsi256_si128(data);
75 __m128i data2 = _mm256_extracti128_si256(data, 1);
76# else
77 __m128i data1 = _mm_loadu_si128(p: (const __m128i*)src);
78 __m128i data2 = _mm_loadu_si128(p: 1+(const __m128i*)src);
79# endif
80
81 // check if everything is ASCII
82 // the highest ASCII value is U+007F
83 // Do the packing directly:
84 // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
85 // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
86 // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
87 // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
88 // "non-ASCII", but it's an acceptable compromise.
89 __m128i packed = _mm_packus_epi16(a: data1, b: data2);
90 __m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
91
92 // store, even if there are non-ASCII characters here
93 _mm_storeu_si128(p: (__m128i*)dst, b: packed);
94
95 // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
96 ushort n = ~_mm_movemask_epi8(a: nonAscii);
97 if (n) {
98 // find the next probable ASCII character
99 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
100 // characters still coming
101 nextAscii = src + qBitScanReverse(v: n) + 1;
102
103 n = qCountTrailingZeroBits(v: n);
104 dst += n;
105 src += n;
106 return false;
107 }
108 }
109
110 if (end - src >= 8) {
111 // do eight characters at a time
112 __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src));
113 __m128i packed = _mm_packus_epi16(a: data, b: data);
114 __m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
115
116 // store even non-ASCII
117 _mm_storel_epi64(p: reinterpret_cast<__m128i *>(dst), a: packed);
118
119 uchar n = ~_mm_movemask_epi8(a: nonAscii);
120 if (n) {
121 nextAscii = src + qBitScanReverse(v: n) + 1;
122 n = qCountTrailingZeroBits(v: n);
123 dst += n;
124 src += n;
125 return false;
126 }
127 }
128
129 return src == end;
130}
131
132static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
133{
134 // do sixteen characters at a time
135 for ( ; end - src >= 16; src += 16, dst += 16) {
136 __m128i data = _mm_loadu_si128(p: (const __m128i*)src);
137
138#ifdef __AVX2__
139 const int BitSpacing = 2;
140 // load and zero extend to an YMM register
141 const __m256i extended = _mm256_cvtepu8_epi16(data);
142
143 uint n = _mm256_movemask_epi8(extended);
144 if (!n) {
145 // store
146 _mm256_storeu_si256((__m256i*)dst, extended);
147 continue;
148 }
149#else
150 const int BitSpacing = 1;
151
152 // check if everything is ASCII
153 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
154 uint n = _mm_movemask_epi8(a: data);
155 if (!n) {
156 // unpack
157 _mm_storeu_si128(p: (__m128i*)dst, b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
158 _mm_storeu_si128(p: 1+(__m128i*)dst, b: _mm_unpackhi_epi8(a: data, b: _mm_setzero_si128()));
159 continue;
160 }
161#endif
162
163 // copy the front part that is still ASCII
164 while (!(n & 1)) {
165 *dst++ = *src++;
166 n >>= BitSpacing;
167 }
168
169 // find the next probable ASCII character
170 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
171 // characters still coming
172 n = qBitScanReverse(v: n);
173 nextAscii = src + (n / BitSpacing) + 1;
174 return false;
175
176 }
177
178 if (end - src >= 8) {
179 __m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src));
180 uint n = _mm_movemask_epi8(a: data) & 0xff;
181 if (!n) {
182 // unpack and store
183 _mm_storeu_si128(p: reinterpret_cast<__m128i *>(dst), b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
184 } else {
185 while (!(n & 1)) {
186 *dst++ = *src++;
187 n >>= 1;
188 }
189
190 n = qBitScanReverse(v: n);
191 nextAscii = src + n + 1;
192 return false;
193 }
194 }
195
196 return src == end;
197}
198
199static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
200{
201#ifdef __AVX2__
202 // do 32 characters at a time
203 // (this is similar to simdTestMask in qstring.cpp)
204 const __m256i mask = _mm256_set1_epi8(char(0x80));
205 for ( ; end - src >= 32; src += 32) {
206 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
207 if (_mm256_testz_si256(mask, data))
208 continue;
209
210 uint n = _mm256_movemask_epi8(data);
211 Q_ASSERT(n);
212
213 // find the next probable ASCII character
214 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
215 // characters still coming
216 nextAscii = src + qBitScanReverse(n) + 1;
217
218 // return the non-ASCII character
219 return src + qCountTrailingZeroBits(n);
220 }
221#endif
222
223 // do sixteen characters at a time
224 for ( ; end - src >= 16; src += 16) {
225 __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i*>(src));
226
227 // check if everything is ASCII
228 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
229 uint n = _mm_movemask_epi8(a: data);
230 if (!n)
231 continue;
232
233 // find the next probable ASCII character
234 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
235 // characters still coming
236 nextAscii = src + qBitScanReverse(v: n) + 1;
237
238 // return the non-ASCII character
239 return src + qCountTrailingZeroBits(v: n);
240 }
241
242 // do four characters at a time
243 for ( ; end - src >= 4; src += 4) {
244 quint32 data = qFromUnaligned<quint32>(src);
245 data &= 0x80808080U;
246 if (!data)
247 continue;
248
249 // We don't try to guess which of the three bytes is ASCII and which
250 // one isn't. The chance that at least two of them are non-ASCII is
251 // better than 75%.
252 nextAscii = src;
253 return src;
254 }
255 nextAscii = end;
256 return src;
257}
258
259// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
260// and advance src8 and src16 to the first character that could not be compared
261static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const char16_t *&src16, const char16_t *end16)
262{
263 int bitSpacing = 1;
264 qptrdiff len = qMin(a: end8 - src8, b: end16 - src16);
265 qptrdiff offset = 0;
266 uint mask = 0;
267
268 // do sixteen characters at a time
269 for ( ; offset + 16 < len; offset += 16) {
270 __m128i data8 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src8 + offset));
271#ifdef __AVX2__
272 // AVX2 version, use 256-bit registers and VPMOVXZBW
273 __m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));
274
275 // expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
276 __m256i datax8 = _mm256_cvtepu8_epi16(data8);
277 mask = _mm256_movemask_epi8(datax8);
278 if (mask)
279 break;
280
281 // compare Latin1 to UTF-16
282 __m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
283 mask = ~_mm256_movemask_epi8(latin1cmp);
284 if (mask)
285 break;
286#else
287 // non-AVX2 code
288 __m128i datalo16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset));
289 __m128i datahi16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset) + 1);
290
291 // expand US-ASCII as if it were Latin1, we'll confirm later
292 __m128i datalo8 = _mm_unpacklo_epi8(a: data8, b: _mm_setzero_si128());
293 __m128i datahi8 = _mm_unpackhi_epi8(a: data8, b: _mm_setzero_si128());
294
295 // compare Latin1 to UTF-16
296 __m128i latin1cmplo = _mm_cmpeq_epi16(a: datalo8, b: datalo16);
297 __m128i latin1cmphi = _mm_cmpeq_epi16(a: datahi8, b: datahi16);
298 mask = _mm_movemask_epi8(a: latin1cmphi) << 16;
299 mask |= ushort(_mm_movemask_epi8(a: latin1cmplo));
300 mask = ~mask;
301 if (mask)
302 break;
303
304 // confirm it was US-ASCII
305 mask = _mm_movemask_epi8(a: data8);
306 if (mask) {
307 bitSpacing = 0;
308 break;
309 }
310#endif
311 }
312
313 // helper for comparing 4 or 8 characters
314 auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
315 // n = 4 -> sizemask = 0xff
316 // n = 8 -> sizemask = 0xffff
317 unsigned sizemask = (1U << (2 * n)) - 1;
318
319 // expand as if Latin1
320 data8 = _mm_unpacklo_epi8(a: data8, b: _mm_setzero_si128());
321
322 // compare and confirm it's US-ASCII
323 __m128i latin1cmp = _mm_cmpeq_epi16(a: data8, b: data16);
324 mask = ~_mm_movemask_epi8(a: latin1cmp) & sizemask;
325 mask |= _mm_movemask_epi8(a: data8);
326 if (mask == 0)
327 offset += n;
328 };
329
330 // do eight characters at a time
331 if (mask == 0 && offset + 8 < len) {
332 __m128i data8 = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src8 + offset));
333 __m128i data16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset));
334 cmp_lt_16(8, data8, data16);
335 }
336
337 // do four characters
338 if (mask == 0 && offset + 4 < len) {
339 __m128i data8 = _mm_cvtsi32_si128(a: qFromUnaligned<quint32>(src: src8 + offset));
340 __m128i data16 = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src16 + offset));
341 cmp_lt_16(4, data8, data16);
342 }
343
344 // correct the source pointers to point to the first character we couldn't deal with
345 if (mask)
346 offset += qCountTrailingZeroBits(v: mask) >> bitSpacing;
347 src8 += offset;
348 src16 += offset;
349}
350#elif defined(__ARM_NEON__)
351static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
352{
353 uint16x8_t maxAscii = vdupq_n_u16(0x7f);
354 uint16x8_t mask1 = qvsetq_n_u16(1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 );
355 uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
356
357 // do sixteen characters at a time
358 for ( ; end - src >= 16; src += 16, dst += 16) {
359 // load 2 lanes (or: "load interleaved")
360 uint16x8x2_t in = vld2q_u16(reinterpret_cast<const uint16_t *>(src));
361
362 // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
363 // add those together into a scalar, and merge the scalars.
364 uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
365 | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
366
367 // merge the two lanes by shifting the values of the second by 8 and inserting them
368 uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
369
370 // store, even if there are non-ASCII characters here
371 vst1q_u8(dst, vreinterpretq_u8_u16(out));
372
373 if (nonAscii) {
374 // find the next probable ASCII character
375 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
376 // characters still coming
377 nextAscii = src + qBitScanReverse(nonAscii) + 1;
378
379 nonAscii = qCountTrailingZeroBits(nonAscii);
380 dst += nonAscii;
381 src += nonAscii;
382 return false;
383 }
384 }
385 return src == end;
386}
387
388static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
389{
390 // do eight characters at a time
391 uint8x8_t msb_mask = vdup_n_u8(0x80);
392 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 );
393 for ( ; end - src >= 8; src += 8, dst += 8) {
394 uint8x8_t c = vld1_u8(src);
395 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
396 if (!n) {
397 // store
398 vst1q_u16(reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
399 continue;
400 }
401
402 // copy the front part that is still ASCII
403 while (!(n & 1)) {
404 *dst++ = *src++;
405 n >>= 1;
406 }
407
408 // find the next probable ASCII character
409 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
410 // characters still coming
411 n = qBitScanReverse(n);
412 nextAscii = src + n + 1;
413 return false;
414
415 }
416 return src == end;
417}
418
419static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
420{
421 // The SIMD code below is untested, so just force an early return until
422 // we've had the time to verify it works.
423 nextAscii = end;
424 return src;
425
426 // do eight characters at a time
427 uint8x8_t msb_mask = vdup_n_u8(0x80);
428 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7);
429 for ( ; end - src >= 8; src += 8) {
430 uint8x8_t c = vld1_u8(src);
431 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
432 if (!n)
433 continue;
434
435 // find the next probable ASCII character
436 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
437 // characters still coming
438 nextAscii = src + qBitScanReverse(n) + 1;
439
440 // return the non-ASCII character
441 return src + qCountTrailingZeroBits(n);
442 }
443 nextAscii = end;
444 return src;
445}
446
447static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
448{
449}
450#else
451static inline bool simdEncodeAscii(uchar *, const char16_t *, const char16_t *, const char16_t *)
452{
453 return false;
454}
455
456static inline bool simdDecodeAscii(char16_t *, const uchar *, const uchar *, const uchar *)
457{
458 return false;
459}
460
461static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
462{
463 nextAscii = end;
464 return src;
465}
466
467static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
468{
469}
470#endif
471
// Bit in QStringConverter::State::internalState that the codecs below set once
// the byte order mark at the start of a stream has been dealt with.
enum {
    HeaderDone = 1
};
473
474QByteArray QUtf8::convertFromUnicode(QStringView in)
475{
476 qsizetype len = in.size();
477
478 // create a QByteArray with the worst case scenario size
479 QByteArray result(len * 3, Qt::Uninitialized);
480 uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
481 const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
482 const char16_t *const end = src + len;
483
484 while (src != end) {
485 const char16_t *nextAscii = end;
486 if (simdEncodeAscii(dst, nextAscii, src, end))
487 break;
488
489 do {
490 char16_t u = *src++;
491 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
492 if (res < 0) {
493 // encoding error - append '?'
494 *dst++ = '?';
495 }
496 } while (src < nextAscii);
497 }
498
499 result.truncate(pos: dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
500 return result;
501}
502
503QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverterBase::State *state)
504{
505 QByteArray ba(3*in.size() +3, Qt::Uninitialized);
506 char *end = convertFromUnicode(out: ba.data(), in, state);
507 ba.truncate(pos: end - ba.data());
508 return ba;
509}
510
511char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state)
512{
513 Q_ASSERT(state);
514 qsizetype len = in.size();
515 if (!len)
516 return out;
517
518 auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
519 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
520 *cursor++ = 0;
521 } else {
522 // QChar::replacement encoded in utf8
523 *cursor++ = 0xef;
524 *cursor++ = 0xbf;
525 *cursor++ = 0xbd;
526 }
527 return cursor;
528 };
529
530 uchar *cursor = reinterpret_cast<uchar *>(out);
531 const char16_t *src = in.utf16();
532 const char16_t *const end = src + len;
533
534 if (!(state->flags & QStringDecoder::Flag::Stateless)) {
535 if (state->remainingChars) {
536 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: state->state_data[0], dst&: cursor, src, end);
537 if (res < 0)
538 cursor = appendReplacementChar(cursor);
539 state->state_data[0] = 0;
540 state->remainingChars = 0;
541 } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
542 // append UTF-8 BOM
543 *cursor++ = utf8bom[0];
544 *cursor++ = utf8bom[1];
545 *cursor++ = utf8bom[2];
546 state->internalState |= HeaderDone;
547 }
548 }
549
550 while (src != end) {
551 const char16_t *nextAscii = end;
552 if (simdEncodeAscii(dst&: cursor, nextAscii, src, end))
553 break;
554
555 do {
556 char16_t uc = *src++;
557 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst&: cursor, src, end);
558 if (Q_LIKELY(res >= 0))
559 continue;
560
561 if (res == QUtf8BaseTraits::Error) {
562 // encoding error
563 ++state->invalidChars;
564 cursor = appendReplacementChar(cursor);
565 } else if (res == QUtf8BaseTraits::EndOfString) {
566 if (state->flags & QStringConverter::Flag::Stateless) {
567 ++state->invalidChars;
568 cursor = appendReplacementChar(cursor);
569 } else {
570 state->remainingChars = 1;
571 state->state_data[0] = uc;
572 }
573 return reinterpret_cast<char *>(cursor);
574 }
575 } while (src < nextAscii);
576 }
577
578 return reinterpret_cast<char *>(cursor);
579}
580
581char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in)
582{
583 // ### SIMD-optimize:
584 for (uchar ch : in) {
585 if (ch < 128) {
586 *out++ = ch;
587 } else {
588 // as per https://en.wikipedia.org/wiki/UTF-8#Encoding, 2nd row
589 *out++ = 0b110'0'0000u | (ch >> 6);
590 *out++ = 0b10'00'0000u | (ch & 0b0011'1111);
591 }
592 }
593 return out;
594}
595
596QString QUtf8::convertToUnicode(QByteArrayView in)
597{
598 // UTF-8 to UTF-16 always needs the exact same number of words or less:
599 // UTF-8 UTF-16
600 // 1 byte 1 word
601 // 2 bytes 1 word
602 // 3 bytes 1 word
603 // 4 bytes 2 words (one surrogate pair)
604 // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
605 // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
606 // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
607 //
608 // The table holds for invalid sequences too: we'll insert one replacement char
609 // per invalid byte.
610 QString result(in.size(), Qt::Uninitialized);
611 QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
612 const QChar *end = convertToUnicode(buffer: data, in);
613 result.truncate(pos: end - data);
614 return result;
615}
616
617/*! \internal
618 \since 6.6
619 \overload
620
621 Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of
622 QChar starting at \a dst in the destination buffer. The buffer is expected
623 to be large enough to hold the result. An upper bound for the size of the
624 buffer is \c in.size() QChars.
625
626 If, during decoding, an error occurs, a QChar::ReplacementCharacter is
627 written.
628
629 Returns a pointer to one past the last QChar written.
630
631 This function never throws.
632
633 For QChar buffers, instead of casting manually, you can use the static
634 QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
635*/
636char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
637{
638 const uchar *const start = reinterpret_cast<const uchar *>(in.data());
639 const uchar *src = start;
640 const uchar *end = src + in.size();
641
642 // attempt to do a full decoding in SIMD
643 const uchar *nextAscii = end;
644 if (!simdDecodeAscii(dst, nextAscii, src, end)) {
645 // at least one non-ASCII entry
646 // check if we failed to decode the UTF-8 BOM; if so, skip it
647 if (Q_UNLIKELY(src == start)
648 && end - src >= 3
649 && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
650 src += 3;
651 }
652
653 while (src < end) {
654 nextAscii = end;
655 if (simdDecodeAscii(dst, nextAscii, src, end))
656 break;
657
658 do {
659 uchar b = *src++;
660 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
661 if (res < 0) {
662 // decoding error
663 *dst++ = QChar::ReplacementCharacter;
664 }
665 } while (src < nextAscii);
666 }
667 }
668
669 return dst;
670}
671
672QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
673{
674 // See above for buffer requirements for stateless decoding. However, that
675 // fails if the state is not empty. The following situations can add to the
676 // requirements:
677 // state contains chars starts with requirement
678 // 1 of 2 bytes valid continuation 0
679 // 2 of 3 bytes same 0
680 // 3 bytes of 4 same +1 (need to insert surrogate pair)
681 // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
682 // 2 of 3 bytes same +1 (same)
683 // 3 of 4 bytes same +1 (same)
684 QString result(in.size() + 1, Qt::Uninitialized);
685 QChar *end = convertToUnicode(out: result.data(), in, state);
686 result.truncate(pos: end - result.constData());
687 return result;
688}
689
690char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state)
691{
692 qsizetype len = in.size();
693
694 Q_ASSERT(state);
695 if (!len)
696 return dst;
697
698
699 char16_t replacement = QChar::ReplacementCharacter;
700 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
701 replacement = QChar::Null;
702
703 qsizetype res;
704 uchar ch = 0;
705
706 const uchar *src = reinterpret_cast<const uchar *>(in.data());
707 const uchar *end = src + len;
708
709 if (!(state->flags & QStringConverter::Flag::Stateless)) {
710 bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
711 if (state->remainingChars || !headerdone) {
712 // handle incoming state first
713 uchar remainingCharsData[4]; // longest UTF-8 sequence possible
714 qsizetype remainingCharsCount = state->remainingChars;
715 qsizetype newCharsToCopy = qMin<qsizetype>(a: sizeof(remainingCharsData) - remainingCharsCount, b: end - src);
716
717 memset(s: remainingCharsData, c: 0, n: sizeof(remainingCharsData));
718 memcpy(dest: remainingCharsData, src: &state->state_data[0], n: remainingCharsCount);
719 memcpy(dest: remainingCharsData + remainingCharsCount, src: src, n: newCharsToCopy);
720
721 const uchar *begin = &remainingCharsData[1];
722 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: remainingCharsData[0], dst, src&: begin,
723 end: static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
724 if (res == QUtf8BaseTraits::Error) {
725 ++state->invalidChars;
726 *dst++ = replacement;
727 ++src;
728 } else if (res == QUtf8BaseTraits::EndOfString) {
729 // if we got EndOfString again, then there were too few bytes in src;
730 // copy to our state and return
731 state->remainingChars = remainingCharsCount + newCharsToCopy;
732 memcpy(dest: &state->state_data[0], src: remainingCharsData, n: state->remainingChars);
733 return dst;
734 } else if (!headerdone) {
735 // eat the UTF-8 BOM
736 if (dst[-1] == 0xfeff)
737 --dst;
738 }
739 state->internalState |= HeaderDone;
740
741 // adjust src now that we have maybe consumed a few chars
742 if (res >= 0) {
743 Q_ASSERT(res > remainingCharsCount);
744 src += res - remainingCharsCount;
745 }
746 }
747 } else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
748 // stateless, remove initial BOM
749 if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
750 // skip BOM
751 src += 3;
752 }
753
754 // main body, stateless decoding
755 res = 0;
756 const uchar *nextAscii = src;
757 while (res >= 0 && src < end) {
758 if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
759 break;
760
761 ch = *src++;
762 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: ch, dst, src, end);
763 if (res == QUtf8BaseTraits::Error) {
764 res = 0;
765 ++state->invalidChars;
766 *dst++ = replacement;
767 }
768 }
769
770 if (res == QUtf8BaseTraits::EndOfString) {
771 // unterminated UTF sequence
772 if (state->flags & QStringConverter::Flag::Stateless) {
773 *dst++ = QChar::ReplacementCharacter;
774 ++state->invalidChars;
775 while (src++ < end) {
776 *dst++ = QChar::ReplacementCharacter;
777 ++state->invalidChars;
778 }
779 state->remainingChars = 0;
780 } else {
781 --src; // unread the byte in ch
782 state->remainingChars = end - src;
783 memcpy(dest: &state->state_data[0], src: src, n: end - src);
784 }
785 } else {
786 state->remainingChars = 0;
787 }
788
789 return dst;
790}
791
792struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
793{
794 struct NoOutput {};
795 static void appendUtf16(const NoOutput &, char16_t) {}
796 static void appendUcs4(const NoOutput &, char32_t) {}
797};
798
799QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
800{
801 const uchar *src = reinterpret_cast<const uchar *>(in.data());
802 const uchar *end = src + in.size();
803 const uchar *nextAscii = src;
804 bool isValidAscii = true;
805
806 while (src < end) {
807 if (src >= nextAscii)
808 src = simdFindNonAscii(src, end, nextAscii);
809 if (src == end)
810 break;
811
812 do {
813 uchar b = *src++;
814 if ((b & 0x80) == 0)
815 continue;
816
817 isValidAscii = false;
818 QUtf8NoOutputTraits::NoOutput output;
819 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, dst&: output, src, end);
820 if (res < 0) {
821 // decoding error
822 return { .isValidUtf8: false, .isValidAscii: false };
823 }
824 } while (src < nextAscii);
825 }
826
827 return { .isValidUtf8: true, .isValidAscii: isValidAscii };
828}
829
830int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs) noexcept
831{
832 auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
833 auto end1 = src1 + utf8.size();
834 auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
835 auto end2 = src2 + utf16.size();
836
837 do {
838 simdCompareAscii(src8&: src1, end8: end1, src16&: src2, end16: end2);
839
840 if (src1 < end1 && src2 < end2) {
841 char32_t uc1 = *src1++;
842 char32_t uc2 = *src2++;
843
844 if (uc1 >= 0x80) {
845 char32_t *output = &uc1;
846 qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(b: uc1, dst&: output, src&: src1, end: end1);
847 if (res < 0) {
848 // decoding error
849 uc1 = QChar::ReplacementCharacter;
850 }
851
852 // Only decode the UTF-16 surrogate pair if the UTF-8 code point
853 // wasn't US-ASCII (a surrogate cannot match US-ASCII).
854 if (QChar::isHighSurrogate(ucs4: uc2) && src2 < end2 && QChar::isLowSurrogate(ucs4: *src2))
855 uc2 = QChar::surrogateToUcs4(high: uc2, low: *src2++);
856 }
857 if (cs == Qt::CaseInsensitive) {
858 uc1 = QChar::toCaseFolded(ucs4: uc1);
859 uc2 = QChar::toCaseFolded(ucs4: uc2);
860 }
861 if (uc1 != uc2)
862 return int(uc1) - int(uc2);
863 }
864 } while (src1 < end1 && src2 < end2);
865
866 // the shorter string sorts first
867 return (end1 > src1) - int(end2 > src2);
868}
869
870int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs)
871{
872 char32_t uc1 = QChar::Null;
873 auto src1 = reinterpret_cast<const uchar *>(utf8.data());
874 auto end1 = src1 + utf8.size();
875 auto src2 = reinterpret_cast<const uchar *>(s.latin1());
876 auto end2 = src2 + s.size();
877
878 while (src1 < end1 && src2 < end2) {
879 uchar b = *src1++;
880 char32_t *output = &uc1;
881 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1);
882 if (res < 0) {
883 // decoding error
884 uc1 = QChar::ReplacementCharacter;
885 }
886
887 char32_t uc2 = *src2++;
888 if (cs == Qt::CaseInsensitive) {
889 uc1 = QChar::toCaseFolded(ucs4: uc1);
890 uc2 = QChar::toCaseFolded(ucs4: uc2);
891 }
892 if (uc1 != uc2)
893 return int(uc1) - int(uc2);
894 }
895
896 // the shorter string sorts first
897 return (end1 > src1) - (end2 > src2);
898}
899
900int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs) noexcept
901{
902 if (lhs.isEmpty())
903 return qt_lencmp(lhs: 0, rhs: rhs.size());
904
905 if (cs == Qt::CaseSensitive) {
906 const auto l = std::min(a: lhs.size(), b: rhs.size());
907 int r = memcmp(s1: lhs.data(), s2: rhs.data(), n: l);
908 return r ? r : qt_lencmp(lhs: lhs.size(), rhs: rhs.size());
909 }
910
911 char32_t uc1 = QChar::Null;
912 auto src1 = reinterpret_cast<const uchar *>(lhs.data());
913 auto end1 = src1 + lhs.size();
914 char32_t uc2 = QChar::Null;
915 auto src2 = reinterpret_cast<const uchar *>(rhs.data());
916 auto end2 = src2 + rhs.size();
917
918 while (src1 < end1 && src2 < end2) {
919 uchar b = *src1++;
920 char32_t *output = &uc1;
921 qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1);
922 if (res < 0) {
923 // decoding error
924 uc1 = QChar::ReplacementCharacter;
925 }
926
927 b = *src2++;
928 output = &uc2;
929 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src2, end: end2);
930 if (res < 0) {
931 // decoding error
932 uc2 = QChar::ReplacementCharacter;
933 }
934
935 uc1 = QChar::toCaseFolded(ucs4: uc1);
936 uc2 = QChar::toCaseFolded(ucs4: uc2);
937 if (uc1 != uc2)
938 return int(uc1) - int(uc2);
939 }
940
941 // the shorter string sorts first
942 return (end1 > src1) - (end2 > src2);
943}
944
945#ifndef QT_BOOTSTRAPPED
946QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
947{
948 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
949 qsizetype length = 2 * in.size();
950 if (writeBom)
951 length += 2;
952
953 QByteArray d(length, Qt::Uninitialized);
954 char *end = convertFromUnicode(out: d.data(), in, state, endian);
955 Q_ASSERT(end - d.constData() == d.size());
956 Q_UNUSED(end);
957 return d;
958}
959
960char *QUtf16::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
961{
962 Q_ASSERT(state);
963 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
964
965 if (endian == DetectEndianness)
966 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
967
968 if (writeBom) {
969 // set them up the BOM
970 QChar bom(QChar::ByteOrderMark);
971 if (endian == BigEndianness)
972 qToBigEndian(src: bom.unicode(), dest: out);
973 else
974 qToLittleEndian(src: bom.unicode(), dest: out);
975 out += 2;
976 }
977 if (endian == BigEndianness)
978 qToBigEndian<char16_t>(source: in.data(), count: in.size(), dest: out);
979 else
980 qToLittleEndian<char16_t>(source: in.data(), count: in.size(), dest: out);
981
982 state->remainingChars = 0;
983 state->internalState |= HeaderDone;
984 return out + 2*in.size();
985}
986
987QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
988{
989 QString result((in.size() + 1) >> 1, Qt::Uninitialized); // worst case
990 QChar *qch = convertToUnicode(out: result.data(), in, state, endian);
991 result.truncate(pos: qch - result.constData());
992 return result;
993}
994
// Decodes the UTF-16 bytes in `in` into `out` and returns the position one
// past the last QChar written. `state` must not be null; across calls it
// carries the byte order (state_data[Endian]), a single left-over byte
// (remainingChars / state_data[Data]) and the HeaderDone bit.
995QChar *QUtf16::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
996{
997 qsizetype len = in.size();
998 const char *chars = in.data();
999
1000 Q_ASSERT(state);
1001
 // When asked to detect, first reuse whatever byte order is stored in the state
 // (it is written back at the end of every full call).
1002 if (endian == DetectEndianness)
1003 endian = (DataEndianness)state->state_data[Endian];
1004
1005 const char *end = chars + len;
1006
1007 // make sure we can decode at least one char
1008 if (state->remainingChars + len < 2) {
1009 if (len) {
 // exactly one byte and nothing pending: stash it for the next call
1010 Q_ASSERT(state->remainingChars == 0 && len == 1);
1011 state->remainingChars = 1;
1012 state->state_data[Data] = *chars;
1013 }
1014 return out;
1015 }
1016
 // NOTE(review): the `state &&` test is redundant after Q_ASSERT(state) above.
1017 bool headerdone = state && state->internalState & HeaderDone;
 // With ConvertInitialBom the header counts as done, so the check below
 // that drops a leading BOM always lets the character through.
1018 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1019 headerdone = true;
1020
 // The first code unit is assembled by hand when it may be a BOM or when
 // its first byte was carried over from the previous call.
1021 if (!headerdone || state->remainingChars) {
1022 uchar buf;
1023 if (state->remainingChars)
1024 buf = state->state_data[Data];
1025 else
1026 buf = *chars++;
1027
1028 // detect BOM, set endianness
1029 state->internalState |= HeaderDone;
 // ch is built from (buf, next byte); the comparisons below treat
 // ch == ByteOrderMark as little-endian input and ByteOrderSwapped as big-endian.
1030 QChar ch(buf, *chars++);
1031 if (endian == DetectEndianness) {
1032 // someone set us up the BOM
1033 if (ch == QChar::ByteOrderSwapped) {
1034 endian = BigEndianness;
1035 } else if (ch == QChar::ByteOrderMark) {
1036 endian = LittleEndianness;
1037 } else {
 // no BOM: fall back to the byte order of the host
1038 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1039 endian = BigEndianness;
1040 } else {
1041 endian = LittleEndianness;
1042 }
1043 }
1044 }
 // swap the two bytes of ch for big-endian input
1045 if (endian == BigEndianness)
1046 ch = QChar::fromUcs2(c: (ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
 // a BOM is dropped unless the header was already considered done
1047 if (headerdone || ch != QChar::ByteOrderMark)
1048 *out++ = ch;
1049 } else if (endian == DetectEndianness) {
1050 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1051 }
1052
 // bulk-convert every remaining complete byte pair
1053 qsizetype nPairs = (end - chars) >> 1;
1054 if (endian == BigEndianness)
1055 qFromBigEndian<char16_t>(source: chars, count: nPairs, dest: out);
1056 else
1057 qFromLittleEndian<char16_t>(source: chars, count: nPairs, dest: out);
1058 out += nPairs;
1059
1060 state->state_data[Endian] = endian;
1061 state->remainingChars = 0;
 // An odd trailing byte is either reported as an invalid character right away
 // (Stateless) or kept in the state for the next call.
1062 if ((end - chars) & 1) {
1063 if (state->flags & QStringConverter::Flag::Stateless) {
1064 *out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
1065 } else {
1066 state->remainingChars = 1;
1067 state->state_data[Data] = *(end - 1);
1068 }
1069 } else {
1070 state->state_data[Data] = 0;
1071 }
1072
1073 return out;
1074}
1075
1076QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
1077{
1078 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1079 qsizetype length = 4*in.size();
1080 if (writeBom)
1081 length += 4;
1082 QByteArray ba(length, Qt::Uninitialized);
1083 char *end = convertFromUnicode(out: ba.data(), in, state, endian);
1084 ba.truncate(pos: end - ba.constData());
1085 return ba;
1086}
1087
// Writes `in` as UTF-32 to `out`, four bytes per code point, and returns the
// position one past the last byte written. `state` must not be null. A high
// surrogate at the very end of the input is kept in the state (unless
// Stateless is set) and completed on the next call.
1088char *QUtf32::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
1089{
1090 Q_ASSERT(state);
1091
1092 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
 // DetectEndianness means "use the byte order of the host" when encoding
1093 if (endian == DetectEndianness)
1094 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1095
1096 if (writeBom) {
1097 // set them up the BOM
1098 if (endian == BigEndianness) {
1099 out[0] = 0;
1100 out[1] = 0;
1101 out[2] = (char)0xfe;
1102 out[3] = (char)0xff;
1103 } else {
1104 out[0] = (char)0xff;
1105 out[1] = (char)0xfe;
1106 out[2] = 0;
1107 out[3] = 0;
1108 }
1109 out += 4;
 // note: in this function HeaderDone is only set when a BOM was actually written
1110 state->internalState |= HeaderDone;
1111 }
1112
1113 const QChar *uc = in.data();
1114 const QChar *end = in.data() + in.size();
1115 QChar ch;
1116 char32_t ucs4;
 // A high surrogate saved by the previous call is reloaded into ch and
 // processing resumes inside the loop, at the point where its partner is examined.
1117 if (state->remainingChars == 1) {
1118 auto character = state->state_data[Data];
1119 Q_ASSERT(character <= 0xFFFF);
1120 ch = QChar(character);
1121 // this is ugly, but shortcuts a whole lot of logic that would otherwise be required
1122 state->remainingChars = 0;
1123 goto decode_surrogate;
1124 }
1125
1126 while (uc < end) {
1127 ch = *uc++;
1128 if (Q_LIKELY(!ch.isSurrogate())) {
1129 ucs4 = ch.unicode();
1130 } else if (Q_LIKELY(ch.isHighSurrogate())) {
1131decode_surrogate:
1132 if (uc == end) {
 // high surrogate is the last unit of the input
1133 if (state->flags & QStringConverter::Flag::Stateless) {
1134 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1135 } else {
 // keep it for the next call; nothing is written for it now
1136 state->remainingChars = 1;
1137 state->state_data[Data] = ch.unicode();
1138 return out;
1139 }
1140 } else if (uc->isLowSurrogate()) {
1141 ucs4 = QChar::surrogateToUcs4(high: ch, low: *uc++);
1142 } else {
 // unpaired high surrogate: uc is not advanced, so the following
 // unit is encoded on the next iteration
1143 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1144 }
1145 } else {
 // low surrogate without a preceding high surrogate
1146 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1147 }
1148 if (endian == BigEndianness)
1149 qToBigEndian(src: ucs4, dest: out);
1150 else
1151 qToLittleEndian(src: ucs4, dest: out);
1152 out += 4;
1153 }
1154
1155 return out;
1156}
1157
1158QString QUtf32::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1159{
1160 QString result;
1161 result.resize(size: (in.size() + 7) >> 1); // worst case
1162 QChar *end = convertToUnicode(out: result.data(), in, state, endian);
1163 result.truncate(pos: end - result.constData());
1164 return result;
1165}
1166
// Decodes the UTF-32 bytes in `in` into UTF-16 at `out` and returns the
// position one past the last QChar written. `state` must not be null; across
// calls it carries the byte order (state_data[Endian]), up to three bytes of
// an incomplete 4-byte tuple (remainingChars / state_data[Data]) and the
// HeaderDone bit.
1167QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1168{
1169 qsizetype len = in.size();
1170 const char *chars = in.data();
1171
1172 Q_ASSERT(state);
 // when asked to detect, first reuse the byte order stored in the state
1173 if (endian == DetectEndianness)
1174 endian = (DataEndianness)state->state_data[Endian];
1175
1176 const char *end = chars + len;
1177
 // start from the bytes of the partial tuple saved by the previous call
1178 uchar tuple[4];
1179 memcpy(dest: tuple, src: &state->state_data[Data], n: 4);
1180
1181 // make sure we can decode at least one char
1182 if (state->remainingChars + len < 4) {
1183 if (len) {
 // still fewer than four bytes in total: append and save them again
1184 while (chars < end) {
1185 tuple[state->remainingChars] = *chars;
1186 ++state->remainingChars;
1187 ++chars;
1188 }
1189 Q_ASSERT(state->remainingChars < 4);
1190 memcpy(dest: &state->state_data[Data], src: tuple, n: 4);
1191 }
1192 return out;
1193 }
1194
1195 bool headerdone = state->internalState & HeaderDone;
 // with ConvertInitialBom a leading BOM is converted instead of skipped
1196 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1197 headerdone = true;
1198
1199 qsizetype num = state->remainingChars;
1200 state->remainingChars = 0;
1201
 // The first code point is handled separately when it may be a BOM, when the
 // byte order is still unknown, or when part of it was carried over.
1202 if (!headerdone || endian == DetectEndianness || num) {
1203 while (num < 4)
1204 tuple[num++] = *chars++;
1205 if (endian == DetectEndianness) {
1206 // someone set us up the BOM?
1207 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
1208 endian = LittleEndianness;
1209 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
1210 endian = BigEndianness;
1211 } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1212 endian = BigEndianness;
1213 } else {
1214 endian = LittleEndianness;
1215 }
1216 }
1217 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(src: tuple) : qFromLittleEndian<char32_t>(src: tuple);
 // a BOM is dropped unless the header was already considered done
1218 if (headerdone || code != QChar::ByteOrderMark) {
1219 if (QChar::requiresSurrogates(ucs4: code)) {
1220 *out++ = QChar(QChar::highSurrogate(ucs4: code));
1221 *out++ = QChar(QChar::lowSurrogate(ucs4: code));
1222 } else {
1223 *out++ = QChar(code);
1224 }
1225 }
1226 num = 0;
 // NOTE(review): this branch looks unreachable, because endian == DetectEndianness
 // already makes the preceding if-condition true.
1227 } else if (endian == DetectEndianness) {
1228 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1229 }
1230 state->state_data[Endian] = endian;
1231 state->internalState |= HeaderDone;
1232
 // collect the remaining input four bytes at a time and emit each code point
1233 while (chars < end) {
1234 tuple[num++] = *chars++;
1235 if (num == 4) {
1236 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(src: tuple) : qFromLittleEndian<char32_t>(src: tuple);
1237 for (char16_t c : QChar::fromUcs4(c: code))
1238 *out++ = c;
1239 num = 0;
1240 }
1241 }
1242
 // Incomplete trailing tuple: report it immediately (Stateless) or save it.
 // NOTE(review): unlike the UTF-16 decoder, ConvertInvalidToNull is not
 // consulted here; confirm whether that is intended.
1243 if (num) {
1244 if (state->flags & QStringDecoder::Flag::Stateless) {
1245 *out++ = QChar::ReplacementCharacter;
1246 } else {
1247 state->state_data[Endian] = endian;
1248 state->remainingChars = num;
1249 memcpy(dest: &state->state_data[Data], src: tuple, n: 4);
1250 }
1251 }
1252
1253 return out;
1254}
1255#endif // !QT_BOOTSTRAPPED
1256
1257#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
1258int QLocal8Bit::checkUtf8()
1259{
1260 return GetACP() == CP_UTF8 ? 1 : -1;
1261}
1262
1263QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
1264{
1265 return convertToUnicode_sys(in, CP_ACP, state);
1266}
1267
// Decodes the multibyte text in `in` from the Windows code page `codePage`
// into a QString using MultiByteToWideChar. `state` must not be null.
//
// Bytes that cannot be decoded are replaced by U+FFFD (or by a null character
// with ConvertInvalidToNull) and counted in state->invalidChars. Unless the
// Stateless flag is set, an undecodable tail that fits into
// state->state_data is stored there and retried, together with new input,
// on the next call.
1268QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
1269 QStringConverter::State *state)
1270{
1271 const char *mb = in.data();
1272 qsizetype mblen = in.size();
1273
1274 Q_ASSERT(state);
 // Take the counter reference and the replacement policy now: `state` may be
 // reset to nullptr below, but invalidChars must still be updated.
1275 qsizetype &invalidChars = state->invalidChars;
1276 using Flag = QStringConverter::Flag;
1277 const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull);
1278 const char16_t replacementCharacter = useNullForReplacement ? QChar::Null
1279 : QChar::ReplacementCharacter;
 // From here on a null `state` means "do not carry anything over between calls".
1280 if (state->flags & Flag::Stateless) {
1281 Q_ASSERT(state->remainingChars == 0);
1282 state = nullptr;
1283 }
1284
1285 if (!mb || !mblen)
1286 return QString();
1287
1288 // Use a local stack-buffer at first to allow us a decently large container
1289 // to avoid a lot of resizing, without also returning an overallocated
1290 // QString to the user for small strings.
1291 // Then we can be fast for small strings and take the hit of extra resizes
1292 // and measuring how much storage is needed for large strings.
1293 std::array<wchar_t, 4096> buf;
1294 wchar_t *out = buf.data();
1295 qsizetype outlen = buf.size();
1296
1297 QString sp;
1298
1299 // Return a pointer to storage where we have enough space for `size`
 // The first time the current buffer is too small, the stack buffer's contents
 // are copied into `sp`; afterwards `sp` itself is resized. Returns
 // {nullptr, 0} if the new size would overflow qsizetype.
1300 const auto growOut = [&](qsizetype size) -> std::tuple<wchar_t *, qsizetype> {
1301 if (outlen >= size)
1302 return {out, outlen};
1303 const bool wasStackBuffer = sp.isEmpty();
1304 const auto begin = wasStackBuffer ? buf.data() : reinterpret_cast<wchar_t *>(sp.data());
1305 const qsizetype offset = qsizetype(std::distance(begin, out));
1306 qsizetype newSize = 0;
1307 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1308 Q_CHECK_PTR(false);
1309 return {nullptr, 0};
1310 }
1311 sp.resize(newSize);
1312 auto it = reinterpret_cast<wchar_t *>(sp.data());
1313 if (wasStackBuffer)
1314 it = std::copy_n(buf.data(), offset, it);
1315 else
1316 it += offset;
1317 return {it, size};
1318 };
1319
1320 // Convert the pending characters (if available)
1321 while (state && state->remainingChars && mblen) {
1322 QStringConverter::State localState;
1323 localState.flags = state->flags;
1324 // Use at most 6 characters as a guess for the longest encoded character
1325 // in any multibyte encoding.
1326 // Even with a total of 2 bytes of overhead that would leave around
1327 // 2^(4 * 8) possible characters
1328 std::array<char, 6> prev = {0};
1329 Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
 // `prev` = bytes saved in the state, followed by as many new input bytes as fit
1330 qsizetype index = 0;
1331 for (; index < state->remainingChars; ++index)
1332 prev[index] = state->state_data[index];
1333 const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen);
1334 for (qsizetype i = 0; i < toCopy; ++i, ++index)
1335 prev[index] = mb[i];
1336 mb += toCopy;
1337 mblen -= toCopy;
1338
1339 // Recursing:
1340 // Since we are using a clean local state it will try to decode what was
1341 // stored in our state + some extra octets from input (`prev`). If some
1342 // part fails we will have those characters stored in the local state's
1343 // storage, and we can extract those. It may also output some
1344 // replacement characters, which we'll count in the invalidChars.
1345 // In the best case we only do this once, but we will loop until we have
1346 // resolved all the remaining characters or we have run out of new input
1347 // in which case we may still have remaining characters.
1348 const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage,
1349 &localState);
1350 std::tie(out, outlen) = growOut(tmp.size());
1351 if (!out)
1352 return {};
1353 out = std::copy_n(reinterpret_cast<const wchar_t *>(tmp.constData()), tmp.size(), out);
1354 outlen -= tmp.size();
 // tail >= 0: every byte the recursive call left undecoded came from `in`
1355 const qsizetype tail = toCopy - localState.remainingChars;
1356 if (tail >= 0) {
1357 // Everything left to process comes from `in`, so we can stop
1358 // looping. Adjust the window for `in` and unset remainingChars to
1359 // signal that we're done.
1360 mb -= localState.remainingChars;
1361 mblen += localState.remainingChars;
1362 localState.remainingChars = 0;
1363 }
1364 state->remainingChars = localState.remainingChars;
1365 state->invalidChars += localState.invalidChars;
1366 std::copy_n(localState.state_data, state->remainingChars, state->state_data);
1367 }
1368
1369 Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0);
1370
1371 // Need it in this scope, since we try to decrease our window size if we
1372 // encounter an error
 // The Windows API takes int lengths, so the input is processed in windows
 // of at most INT_MAX bytes.
1373 int nextIn = q26::saturate_cast<int>(mblen);
1374 while (mblen > 0) {
1375 std::tie(out, outlen) = growOut(1); // Need space for at least one character
1376 if (!out)
1377 return {};
1378 const int nextOut = q26::saturate_cast<int>(outlen);
1379 int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut);
1380 if (len) {
 // the whole window was converted: advance input and output
1381 mb += nextIn;
1382 mblen -= nextIn;
1383 out += len;
1384 outlen -= len;
1385 } else {
1386 int r = GetLastError();
1387 if (r == ERROR_INSUFFICIENT_BUFFER) {
 // ask how many wide characters the window needs, grow, and retry
1388 const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
1389 std::tie(out, outlen) = growOut(wclen);
1390 if (!out)
1391 return {};
1392 } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
1393 // Can't decode the current window, so either store the state,
1394 // reduce window size or output a replacement character.
1395
1396 // Check if we can store all remaining characters in the state
1397 // to be used next time we're called:
1398 if (state && mblen <= q20::ssize(state->state_data)) {
1399 state->remainingChars = mblen;
1400 std::copy_n(mb, mblen, state->state_data);
1401 mb += mblen;
1402 mblen = 0;
1403 break;
1404 }
1405
1406 // .. if not, try to find the last valid character in the window
1407 // and try again with a shrunken window:
1408 if (nextIn > 1) {
1409 // There may be some incomplete data at the end of our current
1410 // window, so decrease the window size and try again.
1411 // In the worst case scenario there is gigs of undecodable
1412 // garbage, but what are we supposed to do about that?
1413 const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
1414 if (it != mb)
1415 nextIn = int(it - mb);
1416 else
1417 --nextIn;
 // `continue` skips the window reset at the bottom of the loop
1418 continue;
1419 }
1420
1421 // Finally, we are forced to output a replacement character for
1422 // the first byte in the window:
1423 std::tie(out, outlen) = growOut(1);
1424 if (!out)
1425 return {};
1426 *out = replacementCharacter;
1427 ++invalidChars;
1428 ++out;
1429 --outlen;
1430 ++mb;
1431 --mblen;
1432 } else {
1433 // Fail.
1434 qWarning("MultiByteToWideChar: Cannot convert multibyte text");
1435 break;
1436 }
1437 }
 // start the next attempt with the full remaining input again
1438 nextIn = q26::saturate_cast<int>(mblen);
1439 }
1440
1441 if (sp.isEmpty()) {
1442 // We must have only used the stack buffer
1443 if (out != buf.data()) // else: we return null-string
1444 sp = QStringView(buf.data(), out).toString();
1445 } else{
1446 const auto begin = reinterpret_cast<wchar_t *>(sp.data());
1447 sp.truncate(std::distance(begin, out));
1448 }
1449
 // drop a single trailing null character from the result
1450 if (sp.size() && sp.back().isNull())
1451 sp.chop(1);
1452
1453 if (!state && mblen > 0) {
1454 // We have trailing character(s) that could not be converted, and
1455 // nowhere to cache them
1456 sp.resize(sp.size() + mblen, replacementCharacter);
1457 invalidChars += mblen;
1458 }
1459 return sp;
1460}
1461
1462QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
1463{
1464 return convertFromUnicode_sys(in, CP_ACP, state);
1465}
1466
1467QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
1468 QStringConverter::State *state)
1469{
1470 const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data());
1471 qsizetype uclen = in.size();
1472
1473 Q_ASSERT(state);
1474 // The Windows API has a *boolean* out-parameter that says if a replacement
1475 // character was used, but it gives us no way to know _how many_ were used.
1476 // Since we cannot simply scan the string for replacement characters
1477 // (which is potentially a question mark, and thus a valid character),
1478 // we simply do not track the number of invalid characters here.
1479 // auto &invalidChars = state->invalidChars;
1480
1481 using Flag = QStringConverter::Flag;
1482 if (state->flags & Flag::Stateless) { // temporary
1483 Q_ASSERT(state->remainingChars == 0);
1484 state = nullptr;
1485 }
1486
1487 if (!ch)
1488 return QByteArray();
1489 if (uclen == 0)
1490 return QByteArray("");
1491
1492 // Use a local stack-buffer at first to allow us a decently large container
1493 // to avoid a lot of resizing, without also returning an overallocated
1494 // QByteArray to the user for small strings.
1495 // Then we can be fast for small strings and take the hit of extra resizes
1496 // and measuring how much storage is needed for large strings.
1497 std::array<char, 4096> buf;
1498 char *out = buf.data();
1499 qsizetype outlen = buf.size();
1500 QByteArray mb;
1501
1502 if (state && state->remainingChars > 0) {
1503 Q_ASSERT(state->remainingChars == 1);
1504 // Let's try to decode the pending character
1505 wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] };
1506 // Check if the second character is a valid low surrogate,
1507 // otherwise we'll just decode the first character, for which windows
1508 // will output a replacement character.
1509 const bool validCodePoint = QChar::isLowSurrogate(wc[1]);
1510 int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 2 : 1, out, outlen, nullptr,
1511 nullptr);
1512 if (!len)
1513 return {}; // Cannot recover, and I refuse to believe it was a size limitation
1514 out += len;
1515 outlen -= len;
1516 if (validCodePoint) {
1517 ++ch;
1518 --uclen;
1519 }
1520 state->remainingChars = 0;
1521 state->state_data[0] = 0;
1522 if (uclen == 0)
1523 return QByteArrayView(buf.data(), len).toByteArray();
1524 }
1525
1526 if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
1527 // We can handle a missing low surrogate at the end of the string,
1528 // so if there is one, exclude it now and store it in the state.
1529 state->remainingChars = 1;
1530 state->state_data[0] = ch[uclen - 1];
1531 --uclen;
1532 if (uclen == 0)
1533 return QByteArray();
1534 }
1535
1536 Q_ASSERT(uclen > 0);
1537
1538 // Return a pointer to storage where we have enough space for `size`
1539 const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> {
1540 if (outlen >= size)
1541 return {out, outlen};
1542 const bool wasStackBuffer = mb.isEmpty();
1543 const auto begin = wasStackBuffer ? buf.data() : mb.data();
1544 const qsizetype offset = qsizetype(std::distance(begin, out));
1545 qsizetype newSize = 0;
1546 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1547 Q_CHECK_PTR(false);
1548 return {nullptr, 0};
1549 }
1550 mb.resize(newSize);
1551 auto it = mb.data();
1552 if (wasStackBuffer)
1553 it = std::copy_n(buf.data(), offset, it);
1554 else
1555 it += offset;
1556 return {it, size};
1557 };
1558
1559 const auto getNextWindowSize = [&]() {
1560 int nextIn = q26::saturate_cast<int>(uclen);
1561 // The Windows API has some issues if the current window ends in the
1562 // middle of a surrogate pair, so we avoid that:
1563 if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1]))
1564 --nextIn;
1565 return nextIn;
1566 };
1567
1568 int len = 0;
1569 while (uclen > 0) {
1570 const int nextIn = getNextWindowSize();
1571 std::tie(out, outlen) = growOut(1); // We need at least one byte
1572 if (!out)
1573 return {};
1574 const int nextOut = q26::saturate_cast<int>(outlen);
1575 len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr);
1576 if (len > 0) {
1577 ch += nextIn;
1578 uclen -= nextIn;
1579 out += len;
1580 outlen -= len;
1581 } else {
1582 int r = GetLastError();
1583 if (r == ERROR_INSUFFICIENT_BUFFER) {
1584 int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0,
1585 nullptr, nullptr);
1586 if (neededLength <= 0) {
1587 // Fail. Observed with UTF8 where the input window was max int and ended in an
1588 // incomplete sequence, probably a Windows bug. We try to avoid that from
1589 // happening by reducing the window size in that case. But let's keep this
1590 // branch just in case of other bugs.
1591#ifndef QT_NO_DEBUG
1592 r = GetLastError();
1593 fprintf(stderr,
1594 "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
1595#endif // !QT_NO_DEBUG
1596 break;
1597 }
1598 std::tie(out, outlen) = growOut(neededLength);
1599 if (!out)
1600 return {};
1601 // and try again...
1602 } else {
1603 // Fail. Probably can't happen in fact (dwFlags is 0).
1604#ifndef QT_NO_DEBUG
1605 // Can't use qWarning(), as it'll recurse to handle %ls
1606 fprintf(stderr,
1607 "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n", r,
1608 reinterpret_cast<const wchar_t *>(
1609 QStringView(ch, uclen).left(100).toString().utf16()));
1610#endif
1611 break;
1612 }
1613 }
1614 }
1615 if (mb.isEmpty()) {
1616 // We must have only used the stack buffer
1617 if (out != buf.data()) // else: we return null-array
1618 mb = QByteArrayView(buf.data(), out).toByteArray();
1619 } else {
1620 mb.truncate(std::distance(mb.data(), out));
1621 }
1622 return mb;
1623}
1624#endif
1625
1626void QStringConverter::State::clear() noexcept
1627{
1628 if (clearFn)
1629 clearFn(this);
1630 else
1631 state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
1632 remainingChars = 0;
1633 invalidChars = 0;
1634 internalState = 0;
1635}
1636
1637void QStringConverter::State::reset() noexcept
1638{
1639 if (flags & Flag::UsesIcu) {
1640#if QT_CONFIG(icu)
1641 UConverter *converter = static_cast<UConverter *>(d[0]);
1642 if (converter)
1643 ucnv_reset(converter);
1644#else
1645 Q_UNREACHABLE();
1646#endif
1647 } else {
1648 clear();
1649 }
1650}
1651
1652#ifndef QT_BOOTSTRAPPED
1653static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
1654{
1655 return QUtf16::convertToUnicode(out, in, state, endian: DetectEndianness);
1656}
1657
1658static char *toUtf16(char *out, QStringView in, QStringConverter::State *state)
1659{
1660 return QUtf16::convertFromUnicode(out, in, state, endian: DetectEndianness);
1661}
1662
1663static QChar *fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1664{
1665 return QUtf16::convertToUnicode(out, in, state, endian: BigEndianness);
1666}
1667
1668static char *toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
1669{
1670 return QUtf16::convertFromUnicode(out, in, state, endian: BigEndianness);
1671}
1672
1673static QChar *fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1674{
1675 return QUtf16::convertToUnicode(out, in, state, endian: LittleEndianness);
1676}
1677
1678static char *toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
1679{
1680 return QUtf16::convertFromUnicode(out, in, state, endian: LittleEndianness);
1681}
1682
1683static QChar *fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
1684{
1685 return QUtf32::convertToUnicode(out, in, state, endian: DetectEndianness);
1686}
1687
1688static char *toUtf32(char *out, QStringView in, QStringConverter::State *state)
1689{
1690 return QUtf32::convertFromUnicode(out, in, state, endian: DetectEndianness);
1691}
1692
1693static QChar *fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1694{
1695 return QUtf32::convertToUnicode(out, in, state, endian: BigEndianness);
1696}
1697
1698static char *toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
1699{
1700 return QUtf32::convertFromUnicode(out, in, state, endian: BigEndianness);
1701}
1702
1703static QChar *fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1704{
1705 return QUtf32::convertToUnicode(out, in, state, endian: LittleEndianness);
1706}
1707
1708static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
1709{
1710 return QUtf32::convertFromUnicode(out, in, state, endian: LittleEndianness);
1711}
1712#endif // !QT_BOOTSTRAPPED
1713
1714char *QLatin1::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept
1715{
1716 Q_ASSERT(state);
1717 if (state->flags & QStringConverter::Flag::Stateless) // temporary
1718 state = nullptr;
1719
1720 const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
1721 qsizetype invalid = 0;
1722 for (qsizetype i = 0; i < in.size(); ++i) {
1723 if (in[i] > QChar(0xff)) {
1724 *out = replacement;
1725 ++invalid;
1726 } else {
1727 *out = (char)in[i].cell();
1728 }
1729 ++out;
1730 }
1731 if (state)
1732 state->invalidChars += invalid;
1733 return out;
1734}
1735
1736static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
1737{
1738 QString s = QLocal8Bit::convertToUnicode(in, state);
1739 memcpy(dest: out, src: s.constData(), n: s.size()*sizeof(QChar));
1740 return out + s.size();
1741}
1742
1743static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
1744{
1745 QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
1746 memcpy(dest: out, src: s.constData(), n: s.size());
1747 return out + s.size();
1748}
1749
1750
1751static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
1752static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
1753
1754#ifndef QT_BOOTSTRAPPED
1755static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
1756static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
1757
1758static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
1759static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
1760#endif
1761
1762static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
1763static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
1764
1765
1766
1767/*!
1768 \class QStringConverterBase
1769 \internal
1770
1771 Just a common base class for QStringConverter and QTextCodec
1772*/
1773
1774/*!
1775 \class QStringConverter
1776 \inmodule QtCore
1777 \brief The QStringConverter class provides a base class for encoding and decoding text.
1778 \reentrant
1779 \ingroup i18n
1780
1781 Qt uses UTF-16 to store, draw and manipulate strings. In many
1782 situations you may wish to deal with data that uses a different
1783 encoding. Most text data transferred over files and network connections is encoded
1784 in UTF-8.
1785
1786 The QStringConverter class is a base class for the \l {QStringEncoder} and
1787 \l {QStringDecoder} classes that help with converting between different
1788 text encodings. QStringDecoder can decode a string from an encoded representation
1789 into UTF-16, the format Qt uses internally. QStringEncoder does the opposite
1790 operation, encoding UTF-16 encoded data (usually in the form of a QString) to
1791 the requested encoding.
1792
1793 The following encodings are always supported:
1794
1795 \list
1796 \li UTF-8
1797 \li UTF-16
1798 \li UTF-16BE
1799 \li UTF-16LE
1800 \li UTF-32
1801 \li UTF-32BE
1802 \li UTF-32LE
1803 \li ISO-8859-1 (Latin-1)
1804 \li The system encoding
1805 \endlist
1806
1807 QStringConverter may support more encodings depending on how Qt was
1808 compiled. If more codecs are supported, they can be listed using
1809 availableCodecs().
1810
1811 \l {QStringConverter}s can be used as follows to convert some encoded
1812 string to and from UTF-16.
1813
1814 Suppose you have some string encoded in UTF-8, and
1815 want to convert it to a QString. The simple way
1816 to do it is to use a \l {QStringDecoder} like this:
1817
1818 \snippet code/src_corelib_text_qstringconverter.cpp 0
1819
1820 After this, \c string holds the text in decoded form.
1821 Converting a string from Unicode to the local encoding is just as
1822 easy using the \l {QStringEncoder} class:
1823
1824 \snippet code/src_corelib_text_qstringconverter.cpp 1
1825
1826 To read or write text files in various encodings, use QTextStream and
1827 its \l{QTextStream::setEncoding()}{setEncoding()} function.
1828
1829 Some care must be taken when trying to convert the data in chunks,
1830 for example, when receiving it over a network. In such cases it is
1831 possible that a multi-byte character will be split over two
1832 chunks. At best this might result in the loss of a character and
1833 at worst cause the entire conversion to fail.
1834
1835 Both QStringEncoder and QStringDecoder make this easy, by tracking
1836 this in an internal state. So simply calling the encoder or decoder
1837 again with the next chunk of data will automatically continue encoding
1838 or decoding the data correctly:
1839
1840 \snippet code/src_corelib_text_qstringconverter.cpp 2
1841
1842 The QStringDecoder object maintains state between chunks and therefore
1843 works correctly even if a multi-byte character is split between
1844 chunks.
1845
1846 QStringConverter objects can't be copied because of their internal state, but
1847 can be moved.
1848
1849 \sa QTextStream, QStringDecoder, QStringEncoder
1850*/
1851
1852/*!
1853 \enum QStringConverter::Flag
1854
1855 \value Default Default conversion rules apply.
1856 \value ConvertInvalidToNull If this flag is set, each invalid input
1857 character is output as a null character. If it is not set,
1858 invalid input characters are represented as QChar::ReplacementCharacter
1859 if the output encoding can represent that character, otherwise as a question mark.
1860 \value WriteBom When converting from a QString to an output encoding, write a QChar::ByteOrderMark as the first
1861 character if the output encoding supports this. This is the case for UTF-8, UTF-16 and UTF-32
1862 encodings.
1863 \value ConvertInitialBom When converting from an input encoding to a QString the QStringDecoder usually skips a
1864 leading QChar::ByteOrderMark. When this flag is set, the byte order mark will not be
1865 skipped, but converted to UTF-16 and inserted at the start of the created QString.
1866 \value Stateless Ignore possible converter states between different function calls
1867 to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
1868 sequence of data is encountered.
1869 \omitvalue UsesIcu
1870*/
1871
1872/*!
1873 \enum QStringConverter::Encoding
1874 \value Utf8 Create a converter to or from UTF-8
1875 \value Utf16 Create a converter to or from UTF-16. When decoding, the byte order will get automatically
1876 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
1877 be assumed.
1878 \value Utf16BE Create a converter to or from big-endian UTF-16.
1879 \value Utf16LE Create a converter to or from little-endian UTF-16.
1880 \value Utf32 Create a converter to or from UTF-32. When decoding, the byte order will get automatically
1881 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
1882 be assumed.
1883 \value Utf32BE Create a converter to or from big-endian UTF-32.
1884 \value Utf32LE Create a converter to or from little-endian UTF-32.
1885 \value Latin1 Create a converter to or from ISO-8859-1 (Latin1).
    \value System Create a converter to or from the underlying encoding of the
           operating system's locale. This is always assumed to be UTF-8 for Unix based
           systems. On Windows, this converts to and from the locale code page.
1889 \omitvalue LastEncoding
1890*/
1891
1892/*!
1893 \struct QStringConverter::Interface
1894 \internal
1895*/
1896
1897const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
1898{
1899 { .name: "UTF-8", .toUtf16: QUtf8::convertToUnicode, .toUtf16Len: fromUtf8Len, .fromUtf16: QUtf8::convertFromUnicode, .fromUtf16Len: toUtf8Len },
1900#ifndef QT_BOOTSTRAPPED
1901 { .name: "UTF-16", .toUtf16: fromUtf16, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16, .fromUtf16Len: toUtf16Len },
1902 { .name: "UTF-16LE", .toUtf16: fromUtf16LE, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16LE, .fromUtf16Len: toUtf16Len },
1903 { .name: "UTF-16BE", .toUtf16: fromUtf16BE, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16BE, .fromUtf16Len: toUtf16Len },
1904 { .name: "UTF-32", .toUtf16: fromUtf32, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32, .fromUtf16Len: toUtf32Len },
1905 { .name: "UTF-32LE", .toUtf16: fromUtf32LE, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32LE, .fromUtf16Len: toUtf32Len },
1906 { .name: "UTF-32BE", .toUtf16: fromUtf32BE, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32BE, .fromUtf16Len: toUtf32Len },
1907#endif
1908 { .name: "ISO-8859-1", .toUtf16: QLatin1::convertToUnicode, .toUtf16Len: fromLatin1Len, .fromUtf16: QLatin1::convertFromUnicode, .fromUtf16Len: toLatin1Len },
1909 { .name: "Locale", .toUtf16: fromLocal8Bit, .toUtf16Len: fromUtf8Len, .fromUtf16: toLocal8Bit, .fromUtf16Len: toUtf8Len }
1910};
1911
1912// match names case insensitive and skipping '-' and '_'
1913template <typename Char>
1914static bool nameMatch_impl_impl(const char *a, const Char *b, const Char *b_end)
1915{
1916 do {
1917 while (*a == '-' || *a == '_')
1918 ++a;
1919 while (b != b_end && (*b == Char{'-'} || *b == Char{'_'}))
1920 ++b;
1921 if (!*a && b == b_end) // end of both strings
1922 return true;
1923 if (char16_t(*b) > 127)
1924 return false; // non-US-ASCII cannot match US-ASCII (prevents narrowing below)
1925 } while (QtMiscUtils::toAsciiLower(ch: *a++) == QtMiscUtils::toAsciiLower(ch: char(*b++)));
1926
1927 return false;
1928}
1929
1930static bool nameMatch_impl(const char *a, QLatin1StringView b)
1931{
1932 return nameMatch_impl_impl(a, b: b.begin(), b_end: b.end());
1933}
1934
1935static bool nameMatch_impl(const char *a, QUtf8StringView b)
1936{
1937 return nameMatch_impl(a, b: QLatin1StringView{QByteArrayView{b}});
1938}
1939
1940static bool nameMatch_impl(const char *a, QStringView b)
1941{
1942 return nameMatch_impl_impl(a, b: b.utf16(), b_end: b.utf16() + b.size()); // uses char16_t*, not QChar*
1943}
1944
1945static bool nameMatch(const char *a, QAnyStringView b)
1946{
1947 return b.visit(v: [a](auto b) { return nameMatch_impl(a, b); });
1948}
1949
1950
1951/*!
1952 \fn constexpr QStringConverter::QStringConverter()
1953 \internal
1954*/
1955
1956/*!
1957 \fn constexpr QStringConverter::QStringConverter(Encoding, Flags)
1958 \internal
1959*/
1960
1961
1962#if QT_CONFIG(icu)
1963// only derives from QStringConverter to get access to protected types
1964struct QStringConverterICU : QStringConverter
1965{
1966 static void clear_function(QStringConverterBase::State *state) noexcept
1967 {
1968 ucnv_close(converter: static_cast<UConverter *>(state->d[0]));
1969 state->d[0] = nullptr;
1970 }
1971
1972 static void ensureConverter(QStringConverter::State *state)
1973 {
1974 // old code might reset the state via clear instead of reset
1975 // in that case, the converter has been closed, and we have to reopen it
1976 if (state->d[0] == nullptr)
1977 state->d[0] = createConverterForName(name: static_cast<const char *>(state->d[1]), state);
1978 }
1979
1980 static QChar *toUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
1981 {
1982 ensureConverter(state);
1983
1984 auto icu_conv = static_cast<UConverter *>(state->d[0]);
1985 UErrorCode err = U_ZERO_ERROR;
1986 auto source = in.data();
1987 auto sourceLimit = in.data() + in.size();
1988
1989 qsizetype length = toLen(inLength: in.size());
1990
1991 UChar *target = reinterpret_cast<UChar *>(out);
1992 auto targetLimit = target + length;
1993 // We explicitly clean up anyway, so no need to set flush to true,
1994 // which would just reset the converter.
1995 UBool flush = false;
1996
1997 // If the QStringConverter was moved, the state that we used as a context is stale now.
1998 UConverterToUCallback action;
1999 const void *context;
2000 ucnv_getToUCallBack(converter: icu_conv, action: &action, context: &context);
2001 if (context != state)
2002 ucnv_setToUCallBack(converter: icu_conv, newAction: action, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &err);
2003
2004 ucnv_toUnicode(converter: icu_conv, target: &target, targetLimit, source: &source, sourceLimit, offsets: nullptr, flush, err: &err);
2005 // We did reserve enough space:
2006 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
2007 if (state->flags.testFlag(flag: QStringConverter::Flag::Stateless)) {
2008 if (auto leftOver = ucnv_toUCountPending(cnv: icu_conv, status: &err)) {
2009 ucnv_reset(converter: icu_conv);
2010 state->invalidChars += leftOver;
2011 }
2012 }
2013 return reinterpret_cast<QChar *>(target);
2014 }
2015
2016 static char *fromUtf16(char *out, QStringView in, QStringConverter::State *state)
2017 {
2018 ensureConverter(state);
2019 auto icu_conv = static_cast<UConverter *>(state->d[0]);
2020 UErrorCode err = U_ZERO_ERROR;
2021 auto source = reinterpret_cast<const UChar *>(in.data());
2022 auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size());
2023
2024 qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv));
2025
2026 char *target = out;
2027 char *targetLimit = out + length;
2028 UBool flush = false;
2029
2030 // If the QStringConverter was moved, the state that we used as a context is stale now.
2031 UConverterFromUCallback action;
2032 const void *context;
2033 ucnv_getFromUCallBack(converter: icu_conv, action: &action, context: &context);
2034 if (context != state)
2035 ucnv_setFromUCallBack(converter: icu_conv, newAction: action, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &err);
2036
2037 ucnv_fromUnicode(converter: icu_conv, target: &target, targetLimit, source: &source, sourceLimit, offsets: nullptr, flush, err: &err);
2038 // We did reserve enough space:
2039 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
2040 if (state->flags.testFlag(flag: QStringConverter::Flag::Stateless)) {
2041 if (auto leftOver = ucnv_fromUCountPending(cnv: icu_conv, status: &err)) {
2042 ucnv_reset(converter: icu_conv);
2043 state->invalidChars += leftOver;
2044 }
2045 }
2046 return target;
2047 }
2048
2049 Q_DISABLE_COPY_MOVE(QStringConverterICU)
2050
2051 template<qsizetype X>
2052 static qsizetype fromLen(qsizetype inLength)
2053 {
2054 return X * inLength * sizeof(UChar);
2055 }
2056
2057 static qsizetype toLen(qsizetype inLength)
2058 {
2059
2060 /* Assumption: each input char might map to a different codepoint
2061 Each codepoint can take up to 4 bytes == 2 QChar
2062 We can ignore reserving space for a BOM, as only UTF encodings use one
2063 and those are not handled by the ICU converter.
2064 */
2065 return 2 * inLength;
2066 }
2067
2068 static constexpr QStringConverter::Interface forLength[] = {
2069 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<1>},
2070 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<2>},
2071 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<3>},
2072 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<4>},
2073 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<5>},
2074 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<6>},
2075 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<7>},
2076 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<8>}
2077 };
2078
2079 static UConverter *createConverterForName(const char *name, const State *state)
2080 {
2081 Q_ASSERT(name);
2082 Q_ASSERT(state);
2083 UErrorCode status = U_ZERO_ERROR;
2084 UConverter *conv = ucnv_open(converterName: name, err: &status);
2085 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
2086 ucnv_close(converter: conv);
2087 return nullptr;
2088 }
2089
2090 if (state->flags.testFlag(flag: Flag::ConvertInvalidToNull)) {
2091 UErrorCode error = U_ZERO_ERROR;
2092
2093 auto nullToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
2094 const char *, int32_t length,
2095 UConverterCallbackReason reason, UErrorCode *err) {
2096 if (reason <= UCNV_IRREGULAR) {
2097 *err = U_ZERO_ERROR;
2098 UChar c = '\0';
2099 ucnv_cbToUWriteUChars(args: toUArgs, source: &c, length: 1, offsetIndex: 0, err);
2100 // Recover outer scope's state (which isn't const) from context:
2101 auto state = const_cast<State *>(static_cast<const State *>(context));
2102 state->invalidChars += length;
2103 }
2104 };
2105 ucnv_setToUCallBack(converter: conv, newAction: nullToSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2106
2107 auto nullFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
2108 const UChar *, int32_t length,
2109 UChar32, UConverterCallbackReason reason, UErrorCode *err) {
2110 if (reason <= UCNV_IRREGULAR) {
2111 *err = U_ZERO_ERROR;
2112 const UChar replacement[] = { 0 };
2113 const UChar *stringBegin = std::begin(arr: replacement);
2114 ucnv_cbFromUWriteUChars(args: fromUArgs, source: &stringBegin, sourceLimit: std::end(arr: replacement), offsetIndex: 0, err);
2115 // Recover outer scope's state (which isn't const) from context:
2116 auto state = const_cast<State *>(static_cast<const State *>(context));
2117 state->invalidChars += length;
2118 }
2119 };
2120 ucnv_setFromUCallBack(converter: conv, newAction: nullFromSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2121 } else {
2122 UErrorCode error = U_ZERO_ERROR;
2123
2124 auto qmarkToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
2125 const char *codeUnits,int32_t length,
2126 UConverterCallbackReason reason, UErrorCode *err) {
2127 if (reason <= UCNV_IRREGULAR) {
2128 // Recover outer scope's state (which isn't const) from context:
2129 auto state = const_cast<State *>(static_cast<const State *>(context));
2130 state->invalidChars += length;
2131 }
2132 // use existing ICU callback for logic
2133 UCNV_TO_U_CALLBACK_SUBSTITUTE(context: nullptr, toUArgs, codeUnits, length, reason, err);
2134
2135 };
2136 ucnv_setToUCallBack(converter: conv, newAction: qmarkToSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2137
2138 auto qmarkFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
2139 const UChar *codeUnits, int32_t length,
2140 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
2141 if (reason <= UCNV_IRREGULAR) {
2142 // Recover outer scope's state (which isn't const) from context:
2143 auto state = const_cast<State *>(static_cast<const State *>(context));
2144 state->invalidChars += length;
2145 }
2146 // use existing ICU callback for logic
2147 UCNV_FROM_U_CALLBACK_SUBSTITUTE(context: nullptr, fromUArgs, codeUnits, length,
2148 codePoint, reason, err);
2149 };
2150 ucnv_setFromUCallBack(converter: conv, newAction: qmarkFromSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
2151 }
2152 return conv;
2153 }
2154
2155 static std::string nul_terminate_impl(QLatin1StringView name)
2156 { return name.isNull() ? std::string() : std::string{name.data(), size_t(name.size())}; }
2157
2158 static std::string nul_terminate_impl(QUtf8StringView name)
2159 { return nul_terminate_impl(name: QLatin1StringView{QByteArrayView{name}}); }
2160
2161 static std::string nul_terminate_impl(QStringView name)
2162 {
2163 std::string result;
2164 const auto convert = [&](char *p, size_t n) {
2165 const auto sz = QLatin1::convertFromUnicode(out: p, in: name) - p;
2166 Q_ASSERT(size_t(sz) <= n);
2167 return sz;
2168 };
2169#ifdef __cpp_lib_string_resize_and_overwrite
2170 result.resize_and_overwrite(size_t(name.size()), convert);
2171#else
2172 result.resize(n: size_t(name.size()));
2173 result.resize(n: convert(result.data(), result.size()));
2174#endif // __cpp_lib_string_resize_and_overwrite
2175 return result;
2176 }
2177
2178 static std::string nul_terminate(QAnyStringView name)
2179 { return name.visit(v: [](auto name) { return nul_terminate_impl(name); }); }
2180
2181 static const QStringConverter::Interface *
2182 make_icu_converter(QStringConverterBase::State *state, QAnyStringView name)
2183 { return make_icu_converter(state, name: nul_terminate(name).data()); }
2184
2185 static const QStringConverter::Interface *make_icu_converter(
2186 QStringConverterBase::State *state,
2187 const char *name)
2188 {
2189 UErrorCode status = U_ZERO_ERROR;
2190 UConverter *conv = createConverterForName(name, state);
2191 if (!conv)
2192 return nullptr;
2193
2194 const char *icuName = ucnv_getName(converter: conv, err: &status);
2195 // ucnv_getStandardName returns a name which is owned by the library
2196 // we can thus store it in the state without worrying aobut its lifetime
2197 const char *persistentName = ucnv_getStandardName(name: icuName, standard: "MIME", pErrorCode: &status);
2198 if (U_FAILURE(code: status) || !persistentName) {
2199 status = U_ZERO_ERROR;
2200 persistentName = ucnv_getStandardName(name: icuName, standard: "IANA", pErrorCode: &status);
2201 }
2202 state->d[1] = const_cast<char *>(persistentName);
2203 state->d[0] = conv;
2204 state->flags |= QStringConverterBase::Flag::UsesIcu;
2205 qsizetype maxCharSize = ucnv_getMaxCharSize(converter: conv);
2206 state->clearFn = QStringConverterICU::clear_function;
2207 if (maxCharSize > 8 || maxCharSize < 1) {
2208 qWarning(msg: "Encountered unexpected codec \"%s\" which requires >8x space", name);
2209 return nullptr;
2210 } else {
2211 return &forLength[maxCharSize - 1];
2212 }
2213
2214 }
2215
2216};
2217#endif
2218
2219/*!
2220 \internal
2221*/
2222QStringConverter::QStringConverter(QAnyStringView name, Flags f)
2223 : iface(nullptr), state(f)
2224{
2225 auto e = encodingForName(name);
2226 if (e)
2227 iface = encodingInterfaces + int(*e);
2228#if QT_CONFIG(icu)
2229 else
2230 iface = QStringConverterICU::make_icu_converter(state: &state, name);
2231#endif
2232}
2233
2234
2235const char *QStringConverter::name() const noexcept
2236{
2237 if (!iface)
2238 return nullptr;
2239 if (state.flags & QStringConverter::Flag::UsesIcu) {
2240#if QT_CONFIG(icu)
2241 return static_cast<const char*>(state.d[1]);
2242#else
2243 return nullptr;
2244#endif
2245 } else {
2246 return iface->name;
2247 }
2248}
2249
2250/*!
2251 \fn bool QStringConverter::isValid() const
2252
2253 Returns true if this is a valid string converter that can be used for encoding or
2254 decoding text.
2255
2256 Default constructed string converters or converters constructed with an unsupported
2257 name are not valid.
2258*/
2259
2260/*!
2261 \fn void QStringConverter::resetState()
2262
2263 Resets the internal state of the converter, clearing potential errors or partial
2264 conversions.
2265*/
2266
2267/*!
2268 \fn bool QStringConverter::hasError() const
2269
2270 Returns true if a conversion could not correctly convert a character. This could for example
2271 get triggered by an invalid UTF-8 sequence or when a character can't get converted due to
2272 limitations in the target encoding.
2273*/
2274
2275/*!
2276 \fn const char *QStringConverter::name() const
2277
2278 Returns the canonical name of the encoding this QStringConverter can encode or decode.
2279 Returns a nullptr if the converter is not valid.
2280 The returned name is UTF-8 encoded.
2281
2282 \sa isValid()
2283*/
2284
2285/*!
2286 Convert \a name to the corresponding \l Encoding member, if there is one.
2287
2288 If the \a name is not the name of a codec listed in the Encoding enumeration,
2289 \c{std::nullopt} is returned. Such a name may, none the less, be accepted by
2290 the QStringConverter constructor when Qt is built with ICU, if ICU provides a
2291 converter with the given name.
2292
2293 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
2294 which was expected to be UTF-8-encoded.
2295*/
2296std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(QAnyStringView name) noexcept
2297{
2298 if (name.isEmpty())
2299 return std::nullopt;
2300 for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
2301 if (nameMatch(a: encodingInterfaces[i].name, b: name))
2302 return QStringConverter::Encoding(i);
2303 }
2304 if (nameMatch(a: "latin1", b: name))
2305 return QStringConverter::Latin1;
2306 return std::nullopt;
2307}
2308
2309#ifndef QT_BOOTSTRAPPED
2310/*!
2311 Returns the encoding for the content of \a data if it can be determined.
2312 \a expectedFirstCharacter can be passed as an additional hint to help determine
2313 the encoding.
2314
2315 The returned optional is empty, if the encoding is unclear.
2316 */
2317std::optional<QStringConverter::Encoding>
2318QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter) noexcept
2319{
2320 // someone set us up the BOM?
2321 qsizetype arraySize = data.size();
2322 if (arraySize > 3) {
2323 char32_t uc = qFromUnaligned<char32_t>(src: data.data());
2324 if (uc == qToBigEndian(source: char32_t(QChar::ByteOrderMark)))
2325 return QStringConverter::Utf32BE;
2326 if (uc == qToLittleEndian(source: char32_t(QChar::ByteOrderMark)))
2327 return QStringConverter::Utf32LE;
2328 if (expectedFirstCharacter) {
2329 // catch also anything starting with the expected character
2330 if (qToLittleEndian(source: uc) == expectedFirstCharacter)
2331 return QStringConverter::Utf32LE;
2332 else if (qToBigEndian(source: uc) == expectedFirstCharacter)
2333 return QStringConverter::Utf32BE;
2334 }
2335 }
2336
2337 if (arraySize > 2) {
2338 if (memcmp(s1: data.data(), s2: utf8bom, n: sizeof(utf8bom)) == 0)
2339 return QStringConverter::Utf8;
2340 }
2341
2342 if (arraySize > 1) {
2343 char16_t uc = qFromUnaligned<char16_t>(src: data.data());
2344 if (uc == qToBigEndian(source: char16_t(QChar::ByteOrderMark)))
2345 return QStringConverter::Utf16BE;
2346 if (uc == qToLittleEndian(source: char16_t(QChar::ByteOrderMark)))
2347 return QStringConverter::Utf16LE;
2348 if (expectedFirstCharacter) {
2349 // catch also anything starting with the expected character
2350 if (qToLittleEndian(source: uc) == expectedFirstCharacter)
2351 return QStringConverter::Utf16LE;
2352 else if (qToBigEndian(source: uc) == expectedFirstCharacter)
2353 return QStringConverter::Utf16BE;
2354 }
2355 }
2356 return std::nullopt;
2357}
2358
2359static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
2360{
2361 static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher(pattern: "meta ");
2362 static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher(pattern: "charset=");
2363
2364 QByteArray header = data.first(n: qMin(a: data.size(), b: qsizetype(1024))).toByteArray().toLower();
2365 qsizetype pos = metaSearcher.indexIn(haystack: header);
2366 if (pos != -1) {
2367 pos = charsetSearcher.indexIn(haystack: header, from: pos);
2368 if (pos != -1) {
2369 pos += qstrlen(str: "charset=");
2370 if (pos < header.size() && (header.at(i: pos) == '\"' || header.at(i: pos) == '\''))
2371 ++pos;
2372
2373 qsizetype pos2 = pos;
2374 // The attribute can be closed with either """, "'", ">" or "/",
2375 // none of which are valid charset characters.
2376 while (++pos2 < header.size()) {
2377 char ch = header.at(i: pos2);
2378 if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
2379 QByteArray name = header.mid(index: pos, len: pos2 - pos);
2380 qsizetype colon = name.indexOf(c: ':');
2381 if (colon > 0)
2382 name = name.left(n: colon);
2383 name = name.simplified();
2384 if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
2385 name = QByteArrayLiteral("UTF-8");
2386 if (!name.isEmpty())
2387 return name;
2388 }
2389 }
2390 }
2391 }
2392 return QByteArray();
2393}
2394
2395/*!
2396 Tries to determine the encoding of the HTML in \a data by looking at leading byte
2397 order marks or a charset specifier in the HTML meta tag. If the optional is empty,
2398 the encoding specified is not supported by QStringConverter. If no encoding is
2399 detected, the method returns Utf8.
2400
2401 \sa QStringDecoder::decoderForHtml()
2402*/
2403std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
2404{
2405 // determine charset
2406 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2407 if (encoding)
2408 // trust the initial BOM
2409 return encoding;
2410
2411 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2412 if (!encodingTag.isEmpty())
2413 return encodingForName(name: encodingTag);
2414
2415 return Utf8;
2416}
2417
2418static qsizetype availableCodecCount()
2419{
2420#if !QT_CONFIG(icu)
2421 return QStringConverter::Encoding::LastEncoding;
2422#else
2423 /* icu contains also the names of what Qt provides
2424 except for the special Locale one (so add one for it)
2425 */
2426 return 1 + ucnv_countAvailable();
2427#endif
2428}
2429
2430/*!
    Returns a list of names of supported codecs. The names returned
    by this function can be passed to QStringEncoder's and
    QStringDecoder's constructor to create an encoder or decoder for
    the given codec.
2435
2436 This function may be used to obtain a listing of additional codecs beyond
2437 the standard ones. Support for additional codecs requires Qt be compiled
2438 with support for the ICU library.
2439
2440 \note The order of codecs is an internal implementation detail
2441 and not guaranteed to be stable.
2442 */
2443QStringList QStringConverter::availableCodecs()
2444{
2445 auto availableCodec = [](qsizetype index) -> QString
2446 {
2447 #if !QT_CONFIG(icu)
2448 return QString::fromLatin1(encodingInterfaces[index].name);
2449 #else
2450 if (index == 0) // "Locale", not provided by icu
2451 return QString::fromLatin1(
2452 ba: encodingInterfaces[QStringConverter::Encoding::System].name);
2453 // this mirrors the setup we do to set a converters name
2454 UErrorCode status = U_ZERO_ERROR;
2455 auto icuName = ucnv_getAvailableName(n: int32_t(index - 1));
2456 const char *standardName = ucnv_getStandardName(name: icuName, standard: "MIME", pErrorCode: &status);
2457 if (U_FAILURE(code: status) || !standardName) {
2458 status = U_ZERO_ERROR;
2459 standardName = ucnv_getStandardName(name: icuName, standard: "IANA", pErrorCode: &status);
2460 }
2461 if (!standardName)
2462 standardName = icuName;
2463 return QString::fromLatin1(ba: standardName);
2464 #endif
2465 };
2466
2467 qsizetype codecCount = availableCodecCount();
2468 QStringList result;
2469 result.reserve(asize: codecCount);
2470 for (qsizetype i = 0; i < codecCount; ++i)
2471 result.push_back(t: availableCodec(i));
2472 return result;
2473}
2474
2475/*!
2476 Tries to determine the encoding of the HTML in \a data by looking at leading byte
2477 order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder
2478 matching the encoding. If the returned decoder is not valid,
2479 the encoding specified is not supported by QStringConverter. If no encoding is
2480 detected, the method returns a decoder for Utf8.
2481
2482 \sa isValid()
2483*/
2484QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data)
2485{
2486 // determine charset
2487 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2488 if (encoding)
2489 // trust the initial BOM
2490 return QStringDecoder(encoding.value());
2491
2492 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2493 if (!encodingTag.isEmpty())
2494 return QStringDecoder(encodingTag);
2495
2496 return QStringDecoder(Utf8);
2497}
2498#endif // !QT_BOOTSTRAPPED
2499
2500/*!
2501 Returns the canonical name for encoding \a e.
2502*/
2503const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
2504{
2505 return encodingInterfaces[int(e)].name;
2506}
2507
2508/*!
2509 \class QStringEncoder
2510 \inmodule QtCore
2511 \brief The QStringEncoder class provides a state-based encoder for text.
2512 \reentrant
2513 \ingroup i18n
2514
2515 A text encoder converts text from Qt's internal representation into an encoded
2516 text format using a specific encoding.
2517
2518 Converting a string from Unicode to the local encoding can be achieved
2519 using the following code:
2520
2521 \snippet code/src_corelib_text_qstringconverter.cpp 1
2522
2523 The encoder remembers any state that is required between calls, so converting
2524 data received in chunks, for example, when receiving it over a network, is just as
2525 easy, by calling the encoder whenever new data is available:
2526
2527 \snippet code/src_corelib_text_qstringconverter.cpp 3
2528
2529 The QStringEncoder object maintains state between chunks and therefore
2530 works correctly even if a UTF-16 surrogate character is split between
2531 chunks.
2532
2533 QStringEncoder objects can't be copied because of their internal state, but
2534 can be moved.
2535
2536 \sa QStringConverter, QStringDecoder
2537*/
2538
2539/*!
2540 \fn constexpr QStringEncoder::QStringEncoder(const Interface *i)
2541 \internal
2542*/
2543
2544/*!
2545 \fn constexpr QStringEncoder::QStringEncoder()
2546
2547 Default constructs an encoder. The default encoder is not valid,
2548 and can't be used for converting text.
2549*/
2550
2551/*!
2552 \fn constexpr QStringEncoder::QStringEncoder(Encoding encoding, Flags flags = Flag::Default)
2553
2554 Creates an encoder object using \a encoding and \a flags.
2555*/
2556
2557/*!
2558 \fn QStringEncoder::QStringEncoder(QAnyStringView name, Flags flags = Flag::Default)
2559
2560 Creates an encoder object using \a name and \a flags.
2561 If \a name is not the name of a known encoding an invalid converter will get created.
2562
2563 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
2564 which was expected to be UTF-8-encoded.
2565
2566 \sa isValid()
2567*/
2568
2569/*!
2570 \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::encode(const QString &in)
2571 \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::encode(QStringView in)
2572 \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::operator()(const QString &in)
2573 \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::operator()(QStringView in)
2574
2575 Converts \a in and returns a struct that is implicitly convertible to QByteArray.
2576
2577 \snippet code/src_corelib_text_qstringconverter.cpp 5
2578*/
2579
2580/*!
2581 \fn qsizetype QStringEncoder::requiredSpace(qsizetype inputLength) const
2582
2583 Returns the maximum amount of characters required to be able to process
2584 \a inputLength decoded data.
2585
2586 \sa appendToBuffer()
2587*/
2588
2589/*!
2590 \fn char *QStringEncoder::appendToBuffer(char *out, QStringView in)
2591
2592 Encodes \a in and writes the encoded result into the buffer
2593 starting at \a out. Returns a pointer to the end of the data written.
2594
    \note \a out must be large enough to be able to hold all the encoded data. Use
    requiredSpace() to determine the maximum size requirement to be able to encode
    \a in.
2598
2599 \sa requiredSpace()
2600*/
2601
2602/*!
2603 \class QStringDecoder
2604 \inmodule QtCore
2605 \brief The QStringDecoder class provides a state-based decoder for text.
2606 \reentrant
2607 \ingroup i18n
2608
    A text decoder converts text from an encoded text format that uses a specific encoding
    into Qt's internal representation.
2611
2612 Converting encoded data into a QString can be achieved
2613 using the following code:
2614
2615 \snippet code/src_corelib_text_qstringconverter.cpp 0
2616
2617 The decoder remembers any state that is required between calls, so converting
2618 data received in chunks, for example, when receiving it over a network, is just as
2619 easy, by calling the decoder whenever new data is available:
2620
2621 \snippet code/src_corelib_text_qstringconverter.cpp 2
2622
2623 The QStringDecoder object maintains state between chunks and therefore
2624 works correctly even if chunks are split in the middle of a multi-byte character
2625 sequence.
2626
2627 QStringDecoder objects can't be copied because of their internal state, but
2628 can be moved.
2629
2630 \sa QStringConverter, QStringEncoder
2631*/
2632
2633/*!
2634 \fn constexpr QStringDecoder::QStringDecoder(const Interface *i)
2635 \internal
2636*/
2637
2638/*!
2639 \fn constexpr QStringDecoder::QStringDecoder()
2640
    Default constructs a decoder. The default decoder is not valid,
    and can't be used for converting text.
2643*/
2644
2645/*!
2646 \fn constexpr QStringDecoder::QStringDecoder(Encoding encoding, Flags flags = Flag::Default)
2647
    Creates a decoder object using \a encoding and \a flags.
2649*/
2650
2651/*!
2652 \fn QStringDecoder::QStringDecoder(QAnyStringView name, Flags flags = Flag::Default)
2653
    Creates a decoder object using \a name and \a flags.
    If \a name is not the name of a known encoding an invalid converter will get created.
2656
2657 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
2658 which was expected to be UTF-8-encoded.
2659
2660 \sa isValid()
2661*/
2662
2663/*!
2664 \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::operator()(const QByteArray &ba)
2665 \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::decode(const QByteArray &ba)
2666 \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::operator()(QByteArrayView ba)
2667 \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::decode(QByteArrayView ba)
2668
2669 Converts \a ba and returns a struct that is implicitly convertible to QString.
2670
2671
2672 \snippet code/src_corelib_text_qstringconverter.cpp 4
2673*/
2674
2675/*!
2676 \fn qsizetype QStringDecoder::requiredSpace(qsizetype inputLength) const
2677
2678 Returns the maximum amount of UTF-16 code units required to be able to process
2679 \a inputLength encoded data.
2680
2681 \sa appendToBuffer
2682*/
2683
2684/*!
2685 \fn QChar *QStringDecoder::appendToBuffer(QChar *out, QByteArrayView in)
2686
2687 Decodes the sequence of bytes viewed by \a in and writes the decoded result into
2688 the buffer starting at \a out. Returns a pointer to the end of data written.
2689
2690 \a out needs to be large enough to be able to hold all the decoded data. Use
2691 \l{requiredSpace} to determine the maximum size requirements to decode an encoded
2692 data buffer of \c in.size() bytes.
2693
2694 \sa requiredSpace
2695*/
2696
2697/*!
2698 \fn char16_t *QStringDecoder::appendToBuffer(char16_t *out, QByteArrayView in)
2699 \since 6.6
2700 \overload
2701*/
2702
2703QT_END_NAMESPACE
2704

Provided by KDAB

Privacy Policy
Start learning QML with our Intro Training
Find out more

source code of qtbase/src/corelib/text/qstringconverter.cpp