1// Copyright (C) 2020 The Qt Company Ltd.
2// Copyright (C) 2020 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#include <qstringconverter.h>
6#include <private/qstringconverter_p.h>
7#include "qendian.h"
8
9#include "private/qsimd_p.h"
10#include "private/qstringiterator_p.h"
11#include "private/qtools_p.h"
12#include "qbytearraymatcher.h"
13
14#if QT_CONFIG(icu)
15#include <unicode/ucnv.h>
16#include <unicode/ucnv_cb.h>
17#include <unicode/ucnv_err.h>
18#include <unicode/ustring.h>
19#endif
20
21#ifdef Q_OS_WIN
22#include <qt_windows.h>
23#ifndef QT_BOOTSTRAPPED
24#include <QtCore/qvarlengtharray.h>
25#endif // !QT_BOOTSTRAPPED
26#endif
27
28#if __has_include(<bit>) && __cplusplus > 201703L
29#include <bit>
30#endif
31
32QT_BEGIN_NAMESPACE
33
34using namespace QtMiscUtils;
35
36static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
37static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
38static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
39static_assert(std::is_nothrow_move_assignable_v<QStringDecoder>);
40
41enum { Endian = 0, Data = 1 };
42
43static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
44
45#if defined(__SSE2__) || defined(__ARM_NEON__)
46static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
47{
48#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
49 return std::bit_width(v) - 1;
50#else
51 uint result = qCountLeadingZeroBits(v);
52 // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
53 // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
54 // counting up: msb index is 0 (because it starts there), and the lsb index is 31.
55 result ^= sizeof(unsigned) * 8 - 1;
56 return result;
57#endif
58}
59#endif
60
61#if defined(__SSE2__)
62static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
63{
64 // do sixteen characters at a time
65 for ( ; end - src >= 16; src += 16, dst += 16) {
66# ifdef __AVX2__
67 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
68 __m128i data1 = _mm256_castsi256_si128(data);
69 __m128i data2 = _mm256_extracti128_si256(data, 1);
70# else
71 __m128i data1 = _mm_loadu_si128(p: (const __m128i*)src);
72 __m128i data2 = _mm_loadu_si128(p: 1+(const __m128i*)src);
73# endif
74
75 // check if everything is ASCII
76 // the highest ASCII value is U+007F
77 // Do the packing directly:
78 // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
79 // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
80 // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
81 // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
82 // "non-ASCII", but it's an acceptable compromise.
83 __m128i packed = _mm_packus_epi16(a: data1, b: data2);
84 __m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
85
86 // store, even if there are non-ASCII characters here
87 _mm_storeu_si128(p: (__m128i*)dst, b: packed);
88
89 // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
90 ushort n = ~_mm_movemask_epi8(a: nonAscii);
91 if (n) {
92 // find the next probable ASCII character
93 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
94 // characters still coming
95 nextAscii = src + qBitScanReverse(v: n) + 1;
96
97 n = qCountTrailingZeroBits(v: n);
98 dst += n;
99 src += n;
100 return false;
101 }
102 }
103
104 if (end - src >= 8) {
105 // do eight characters at a time
106 __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src));
107 __m128i packed = _mm_packus_epi16(a: data, b: data);
108 __m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
109
110 // store even non-ASCII
111 _mm_storel_epi64(p: reinterpret_cast<__m128i *>(dst), a: packed);
112
113 uchar n = ~_mm_movemask_epi8(a: nonAscii);
114 if (n) {
115 nextAscii = src + qBitScanReverse(v: n) + 1;
116 n = qCountTrailingZeroBits(v: n);
117 dst += n;
118 src += n;
119 return false;
120 }
121 }
122
123 return src == end;
124}
125
126static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
127{
128 // do sixteen characters at a time
129 for ( ; end - src >= 16; src += 16, dst += 16) {
130 __m128i data = _mm_loadu_si128(p: (const __m128i*)src);
131
132#ifdef __AVX2__
133 const int BitSpacing = 2;
134 // load and zero extend to an YMM register
135 const __m256i extended = _mm256_cvtepu8_epi16(data);
136
137 uint n = _mm256_movemask_epi8(extended);
138 if (!n) {
139 // store
140 _mm256_storeu_si256((__m256i*)dst, extended);
141 continue;
142 }
143#else
144 const int BitSpacing = 1;
145
146 // check if everything is ASCII
147 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
148 uint n = _mm_movemask_epi8(a: data);
149 if (!n) {
150 // unpack
151 _mm_storeu_si128(p: (__m128i*)dst, b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
152 _mm_storeu_si128(p: 1+(__m128i*)dst, b: _mm_unpackhi_epi8(a: data, b: _mm_setzero_si128()));
153 continue;
154 }
155#endif
156
157 // copy the front part that is still ASCII
158 while (!(n & 1)) {
159 *dst++ = *src++;
160 n >>= BitSpacing;
161 }
162
163 // find the next probable ASCII character
164 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
165 // characters still coming
166 n = qBitScanReverse(v: n);
167 nextAscii = src + (n / BitSpacing) + 1;
168 return false;
169
170 }
171
172 if (end - src >= 8) {
173 __m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src));
174 uint n = _mm_movemask_epi8(a: data) & 0xff;
175 if (!n) {
176 // unpack and store
177 _mm_storeu_si128(p: reinterpret_cast<__m128i *>(dst), b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
178 } else {
179 while (!(n & 1)) {
180 *dst++ = *src++;
181 n >>= 1;
182 }
183
184 n = qBitScanReverse(v: n);
185 nextAscii = src + n + 1;
186 return false;
187 }
188 }
189
190 return src == end;
191}
192
193static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
194{
195#ifdef __AVX2__
196 // do 32 characters at a time
197 // (this is similar to simdTestMask in qstring.cpp)
198 const __m256i mask = _mm256_set1_epi8(char(0x80));
199 for ( ; end - src >= 32; src += 32) {
200 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
201 if (_mm256_testz_si256(mask, data))
202 continue;
203
204 uint n = _mm256_movemask_epi8(data);
205 Q_ASSUME(n);
206
207 // find the next probable ASCII character
208 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
209 // characters still coming
210 nextAscii = src + qBitScanReverse(n) + 1;
211
212 // return the non-ASCII character
213 return src + qCountTrailingZeroBits(n);
214 }
215#endif
216
217 // do sixteen characters at a time
218 for ( ; end - src >= 16; src += 16) {
219 __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i*>(src));
220
221 // check if everything is ASCII
222 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
223 uint n = _mm_movemask_epi8(a: data);
224 if (!n)
225 continue;
226
227 // find the next probable ASCII character
228 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
229 // characters still coming
230 nextAscii = src + qBitScanReverse(v: n) + 1;
231
232 // return the non-ASCII character
233 return src + qCountTrailingZeroBits(v: n);
234 }
235
236 // do four characters at a time
237 for ( ; end - src >= 4; src += 4) {
238 quint32 data = qFromUnaligned<quint32>(src);
239 data &= 0x80808080U;
240 if (!data)
241 continue;
242
243 // We don't try to guess which of the three bytes is ASCII and which
244 // one isn't. The chance that at least two of them are non-ASCII is
245 // better than 75%.
246 nextAscii = src;
247 return src;
248 }
249 nextAscii = end;
250 return src;
251}
252
253// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
254// and advance src8 and src16 to the first character that could not be compared
255static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const char16_t *&src16, const char16_t *end16)
256{
257 int bitSpacing = 1;
258 qptrdiff len = qMin(a: end8 - src8, b: end16 - src16);
259 qptrdiff offset = 0;
260 uint mask = 0;
261
262 // do sixteen characters at a time
263 for ( ; offset + 16 < len; offset += 16) {
264 __m128i data8 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src8 + offset));
265#ifdef __AVX2__
266 // AVX2 version, use 256-bit registers and VPMOVXZBW
267 __m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));
268
269 // expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
270 __m256i datax8 = _mm256_cvtepu8_epi16(data8);
271 mask = _mm256_movemask_epi8(datax8);
272 if (mask)
273 break;
274
275 // compare Latin1 to UTF-16
276 __m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
277 mask = ~_mm256_movemask_epi8(latin1cmp);
278 if (mask)
279 break;
280#else
281 // non-AVX2 code
282 __m128i datalo16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset));
283 __m128i datahi16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset) + 1);
284
285 // expand US-ASCII as if it were Latin1, we'll confirm later
286 __m128i datalo8 = _mm_unpacklo_epi8(a: data8, b: _mm_setzero_si128());
287 __m128i datahi8 = _mm_unpackhi_epi8(a: data8, b: _mm_setzero_si128());
288
289 // compare Latin1 to UTF-16
290 __m128i latin1cmplo = _mm_cmpeq_epi16(a: datalo8, b: datalo16);
291 __m128i latin1cmphi = _mm_cmpeq_epi16(a: datahi8, b: datahi16);
292 mask = _mm_movemask_epi8(a: latin1cmphi) << 16;
293 mask |= ushort(_mm_movemask_epi8(a: latin1cmplo));
294 mask = ~mask;
295 if (mask)
296 break;
297
298 // confirm it was US-ASCII
299 mask = _mm_movemask_epi8(a: data8);
300 if (mask) {
301 bitSpacing = 0;
302 break;
303 }
304#endif
305 }
306
307 // helper for comparing 4 or 8 characters
308 auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
309 // n = 4 -> sizemask = 0xff
310 // n = 8 -> sizemask = 0xffff
311 unsigned sizemask = (1U << (2 * n)) - 1;
312
313 // expand as if Latin1
314 data8 = _mm_unpacklo_epi8(a: data8, b: _mm_setzero_si128());
315
316 // compare and confirm it's US-ASCII
317 __m128i latin1cmp = _mm_cmpeq_epi16(a: data8, b: data16);
318 mask = ~_mm_movemask_epi8(a: latin1cmp) & sizemask;
319 mask |= _mm_movemask_epi8(a: data8);
320 if (mask == 0)
321 offset += n;
322 };
323
324 // do eight characters at a time
325 if (mask == 0 && offset + 8 < len) {
326 __m128i data8 = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src8 + offset));
327 __m128i data16 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src16 + offset));
328 cmp_lt_16(8, data8, data16);
329 }
330
331 // do four characters
332 if (mask == 0 && offset + 4 < len) {
333 __m128i data8 = _mm_cvtsi32_si128(a: qFromUnaligned<quint32>(src: src8 + offset));
334 __m128i data16 = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src16 + offset));
335 cmp_lt_16(4, data8, data16);
336 }
337
338 // correct the source pointers to point to the first character we couldn't deal with
339 if (mask)
340 offset += qCountTrailingZeroBits(v: mask) >> bitSpacing;
341 src8 += offset;
342 src16 += offset;
343}
344#elif defined(__ARM_NEON__)
345static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
346{
347 uint16x8_t maxAscii = vdupq_n_u16(0x7f);
348 uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
349 uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
350
351 // do sixteen characters at a time
352 for ( ; end - src >= 16; src += 16, dst += 16) {
353 // load 2 lanes (or: "load interleaved")
354 uint16x8x2_t in = vld2q_u16(reinterpret_cast<const uint16_t *>(src));
355
356 // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
357 // add those together into a scalar, and merge the scalars.
358 uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
359 | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
360
361 // merge the two lanes by shifting the values of the second by 8 and inserting them
362 uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
363
364 // store, even if there are non-ASCII characters here
365 vst1q_u8(dst, vreinterpretq_u8_u16(out));
366
367 if (nonAscii) {
368 // find the next probable ASCII character
369 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
370 // characters still coming
371 nextAscii = src + qBitScanReverse(nonAscii) + 1;
372
373 nonAscii = qCountTrailingZeroBits(nonAscii);
374 dst += nonAscii;
375 src += nonAscii;
376 return false;
377 }
378 }
379 return src == end;
380}
381
382static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
383{
384 // do eight characters at a time
385 uint8x8_t msb_mask = vdup_n_u8(0x80);
386 uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
387 for ( ; end - src >= 8; src += 8, dst += 8) {
388 uint8x8_t c = vld1_u8(src);
389 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
390 if (!n) {
391 // store
392 vst1q_u16(reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
393 continue;
394 }
395
396 // copy the front part that is still ASCII
397 while (!(n & 1)) {
398 *dst++ = *src++;
399 n >>= 1;
400 }
401
402 // find the next probable ASCII character
403 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
404 // characters still coming
405 n = qBitScanReverse(n);
406 nextAscii = src + n + 1;
407 return false;
408
409 }
410 return src == end;
411}
412
413static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
414{
415 // The SIMD code below is untested, so just force an early return until
416 // we've had the time to verify it works.
417 nextAscii = end;
418 return src;
419
420 // do eight characters at a time
421 uint8x8_t msb_mask = vdup_n_u8(0x80);
422 uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
423 for ( ; end - src >= 8; src += 8) {
424 uint8x8_t c = vld1_u8(src);
425 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
426 if (!n)
427 continue;
428
429 // find the next probable ASCII character
430 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
431 // characters still coming
432 nextAscii = src + qBitScanReverse(n) + 1;
433
434 // return the non-ASCII character
435 return src + qCountTrailingZeroBits(n);
436 }
437 nextAscii = end;
438 return src;
439}
440
441static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
442{
443}
444#else
445static inline bool simdEncodeAscii(uchar *, const char16_t *, const char16_t *, const char16_t *)
446{
447 return false;
448}
449
450static inline bool simdDecodeAscii(char16_t *, const uchar *, const uchar *, const uchar *)
451{
452 return false;
453}
454
455static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
456{
457 nextAscii = end;
458 return src;
459}
460
461static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
462{
463}
464#endif
465
// Bit in QStringConverter::State::internalState: set once the byte order
// mark has been written (encoders) or looked for (decoders).
enum { HeaderDone = 1 };
467
468QByteArray QUtf8::convertFromUnicode(QStringView in)
469{
470 qsizetype len = in.size();
471
472 // create a QByteArray with the worst case scenario size
473 QByteArray result(len * 3, Qt::Uninitialized);
474 uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
475 const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
476 const char16_t *const end = src + len;
477
478 while (src != end) {
479 const char16_t *nextAscii = end;
480 if (simdEncodeAscii(dst, nextAscii, src, end))
481 break;
482
483 do {
484 char16_t u = *src++;
485 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
486 if (res < 0) {
487 // encoding error - append '?'
488 *dst++ = '?';
489 }
490 } while (src < nextAscii);
491 }
492
493 result.truncate(pos: dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
494 return result;
495}
496
497QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverterBase::State *state)
498{
499 QByteArray ba(3*in.size() +3, Qt::Uninitialized);
500 char *end = convertFromUnicode(out: ba.data(), in, state);
501 ba.truncate(pos: end - ba.data());
502 return ba;
503}
504
505char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state)
506{
507 Q_ASSERT(state);
508 qsizetype len = in.size();
509 if (!len)
510 return out;
511
512 auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
513 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
514 *cursor++ = 0;
515 } else {
516 // QChar::replacement encoded in utf8
517 *cursor++ = 0xef;
518 *cursor++ = 0xbf;
519 *cursor++ = 0xbd;
520 }
521 return cursor;
522 };
523
524 uchar *cursor = reinterpret_cast<uchar *>(out);
525 const char16_t *src = in.utf16();
526 const char16_t *const end = src + len;
527
528 if (!(state->flags & QStringDecoder::Flag::Stateless)) {
529 if (state->remainingChars) {
530 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: state->state_data[0], dst&: cursor, src, end);
531 if (res < 0)
532 cursor = appendReplacementChar(cursor);
533 state->state_data[0] = 0;
534 state->remainingChars = 0;
535 } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
536 // append UTF-8 BOM
537 *cursor++ = utf8bom[0];
538 *cursor++ = utf8bom[1];
539 *cursor++ = utf8bom[2];
540 state->internalState |= HeaderDone;
541 }
542 }
543
544 while (src != end) {
545 const char16_t *nextAscii = end;
546 if (simdEncodeAscii(dst&: cursor, nextAscii, src, end))
547 break;
548
549 do {
550 char16_t uc = *src++;
551 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst&: cursor, src, end);
552 if (Q_LIKELY(res >= 0))
553 continue;
554
555 if (res == QUtf8BaseTraits::Error) {
556 // encoding error
557 ++state->invalidChars;
558 cursor = appendReplacementChar(cursor);
559 } else if (res == QUtf8BaseTraits::EndOfString) {
560 if (state->flags & QStringConverter::Flag::Stateless) {
561 ++state->invalidChars;
562 cursor = appendReplacementChar(cursor);
563 } else {
564 state->remainingChars = 1;
565 state->state_data[0] = uc;
566 }
567 return reinterpret_cast<char *>(cursor);
568 }
569 } while (src < nextAscii);
570 }
571
572 return reinterpret_cast<char *>(cursor);
573}
574
575char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in)
576{
577 // ### SIMD-optimize:
578 for (uchar ch : in) {
579 if (ch < 128) {
580 *out++ = ch;
581 } else {
582 // as per https://en.wikipedia.org/wiki/UTF-8#Encoding, 2nd row
583 *out++ = 0b110'0'0000u | (ch >> 6);
584 *out++ = 0b10'00'0000u | (ch & 0b0011'1111);
585 }
586 }
587 return out;
588}
589
590QString QUtf8::convertToUnicode(QByteArrayView in)
591{
592 // UTF-8 to UTF-16 always needs the exact same number of words or less:
593 // UTF-8 UTF-16
594 // 1 byte 1 word
595 // 2 bytes 1 word
596 // 3 bytes 1 word
597 // 4 bytes 2 words (one surrogate pair)
598 // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
599 // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
600 // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
601 //
602 // The table holds for invalid sequences too: we'll insert one replacement char
603 // per invalid byte.
604 QString result(in.size(), Qt::Uninitialized);
605 QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
606 const QChar *end = convertToUnicode(buffer: data, in);
607 result.truncate(pos: end - data);
608 return result;
609}
610
611/*! \internal
612 \since 6.6
613 \overload
614
615 Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of
616 QChar starting at \a dst in the destination buffer. The buffer is expected
617 to be large enough to hold the result. An upper bound for the size of the
618 buffer is \c in.size() QChars.
619
620 If, during decoding, an error occurs, a QChar::ReplacementCharacter is
621 written.
622
623 Returns a pointer to one past the last QChar written.
624
625 This function never throws.
626
627 For QChar buffers, instead of casting manually, you can use the static
628 QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
629*/
630char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
631{
632 const uchar *const start = reinterpret_cast<const uchar *>(in.data());
633 const uchar *src = start;
634 const uchar *end = src + in.size();
635
636 // attempt to do a full decoding in SIMD
637 const uchar *nextAscii = end;
638 if (!simdDecodeAscii(dst, nextAscii, src, end)) {
639 // at least one non-ASCII entry
640 // check if we failed to decode the UTF-8 BOM; if so, skip it
641 if (Q_UNLIKELY(src == start)
642 && end - src >= 3
643 && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
644 src += 3;
645 }
646
647 while (src < end) {
648 nextAscii = end;
649 if (simdDecodeAscii(dst, nextAscii, src, end))
650 break;
651
652 do {
653 uchar b = *src++;
654 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
655 if (res < 0) {
656 // decoding error
657 *dst++ = QChar::ReplacementCharacter;
658 }
659 } while (src < nextAscii);
660 }
661 }
662
663 return dst;
664}
665
666QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
667{
668 // See above for buffer requirements for stateless decoding. However, that
669 // fails if the state is not empty. The following situations can add to the
670 // requirements:
671 // state contains chars starts with requirement
672 // 1 of 2 bytes valid continuation 0
673 // 2 of 3 bytes same 0
674 // 3 bytes of 4 same +1 (need to insert surrogate pair)
675 // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
676 // 2 of 3 bytes same +1 (same)
677 // 3 of 4 bytes same +1 (same)
678 QString result(in.size() + 1, Qt::Uninitialized);
679 QChar *end = convertToUnicode(out: result.data(), in, state);
680 result.truncate(pos: end - result.constData());
681 return result;
682}
683
684char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state)
685{
686 qsizetype len = in.size();
687
688 Q_ASSERT(state);
689 if (!len)
690 return dst;
691
692
693 char16_t replacement = QChar::ReplacementCharacter;
694 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
695 replacement = QChar::Null;
696
697 qsizetype res;
698 uchar ch = 0;
699
700 const uchar *src = reinterpret_cast<const uchar *>(in.data());
701 const uchar *end = src + len;
702
703 if (!(state->flags & QStringConverter::Flag::Stateless)) {
704 bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
705 if (state->remainingChars || !headerdone) {
706 // handle incoming state first
707 uchar remainingCharsData[4]; // longest UTF-8 sequence possible
708 qsizetype remainingCharsCount = state->remainingChars;
709 qsizetype newCharsToCopy = qMin<qsizetype>(a: sizeof(remainingCharsData) - remainingCharsCount, b: end - src);
710
711 memset(s: remainingCharsData, c: 0, n: sizeof(remainingCharsData));
712 memcpy(dest: remainingCharsData, src: &state->state_data[0], n: remainingCharsCount);
713 memcpy(dest: remainingCharsData + remainingCharsCount, src: src, n: newCharsToCopy);
714
715 const uchar *begin = &remainingCharsData[1];
716 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: remainingCharsData[0], dst, src&: begin,
717 end: static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
718 if (res == QUtf8BaseTraits::Error) {
719 ++state->invalidChars;
720 *dst++ = replacement;
721 ++src;
722 } else if (res == QUtf8BaseTraits::EndOfString) {
723 // if we got EndOfString again, then there were too few bytes in src;
724 // copy to our state and return
725 state->remainingChars = remainingCharsCount + newCharsToCopy;
726 memcpy(dest: &state->state_data[0], src: remainingCharsData, n: state->remainingChars);
727 return dst;
728 } else if (!headerdone) {
729 // eat the UTF-8 BOM
730 if (dst[-1] == 0xfeff)
731 --dst;
732 }
733 state->internalState |= HeaderDone;
734
735 // adjust src now that we have maybe consumed a few chars
736 if (res >= 0) {
737 Q_ASSERT(res > remainingCharsCount);
738 src += res - remainingCharsCount;
739 }
740 }
741 } else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
742 // stateless, remove initial BOM
743 if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
744 // skip BOM
745 src += 3;
746 }
747
748 // main body, stateless decoding
749 res = 0;
750 const uchar *nextAscii = src;
751 while (res >= 0 && src < end) {
752 if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
753 break;
754
755 ch = *src++;
756 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: ch, dst, src, end);
757 if (res == QUtf8BaseTraits::Error) {
758 res = 0;
759 ++state->invalidChars;
760 *dst++ = replacement;
761 }
762 }
763
764 if (res == QUtf8BaseTraits::EndOfString) {
765 // unterminated UTF sequence
766 if (state->flags & QStringConverter::Flag::Stateless) {
767 *dst++ = QChar::ReplacementCharacter;
768 ++state->invalidChars;
769 while (src++ < end) {
770 *dst++ = QChar::ReplacementCharacter;
771 ++state->invalidChars;
772 }
773 state->remainingChars = 0;
774 } else {
775 --src; // unread the byte in ch
776 state->remainingChars = end - src;
777 memcpy(dest: &state->state_data[0], src: src, n: end - src);
778 }
779 } else {
780 state->remainingChars = 0;
781 }
782
783 return dst;
784}
785
786struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
787{
788 struct NoOutput {};
789 static void appendUtf16(const NoOutput &, char16_t) {}
790 static void appendUcs4(const NoOutput &, char32_t) {}
791};
792
793QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
794{
795 const uchar *src = reinterpret_cast<const uchar *>(in.data());
796 const uchar *end = src + in.size();
797 const uchar *nextAscii = src;
798 bool isValidAscii = true;
799
800 while (src < end) {
801 if (src >= nextAscii)
802 src = simdFindNonAscii(src, end, nextAscii);
803 if (src == end)
804 break;
805
806 do {
807 uchar b = *src++;
808 if ((b & 0x80) == 0)
809 continue;
810
811 isValidAscii = false;
812 QUtf8NoOutputTraits::NoOutput output;
813 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, dst&: output, src, end);
814 if (res < 0) {
815 // decoding error
816 return { .isValidUtf8: false, .isValidAscii: false };
817 }
818 } while (src < nextAscii);
819 }
820
821 return { .isValidUtf8: true, .isValidAscii: isValidAscii };
822}
823
824int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs) noexcept
825{
826 auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
827 auto end1 = src1 + utf8.size();
828 auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
829 auto end2 = src2 + utf16.size();
830
831 do {
832 simdCompareAscii(src8&: src1, end8: end1, src16&: src2, end16: end2);
833
834 if (src1 < end1 && src2 < end2) {
835 char32_t uc1 = *src1++;
836 char32_t uc2 = *src2++;
837
838 if (uc1 >= 0x80) {
839 char32_t *output = &uc1;
840 qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(b: uc1, dst&: output, src&: src1, end: end1);
841 if (res < 0) {
842 // decoding error
843 uc1 = QChar::ReplacementCharacter;
844 }
845
846 // Only decode the UTF-16 surrogate pair if the UTF-8 code point
847 // wasn't US-ASCII (a surrogate cannot match US-ASCII).
848 if (QChar::isHighSurrogate(ucs4: uc2) && src2 < end2 && QChar::isLowSurrogate(ucs4: *src2))
849 uc2 = QChar::surrogateToUcs4(high: uc2, low: *src2++);
850 }
851 if (cs == Qt::CaseInsensitive) {
852 uc1 = QChar::toCaseFolded(ucs4: uc1);
853 uc2 = QChar::toCaseFolded(ucs4: uc2);
854 }
855 if (uc1 != uc2)
856 return int(uc1) - int(uc2);
857 }
858 } while (src1 < end1 && src2 < end2);
859
860 // the shorter string sorts first
861 return (end1 > src1) - int(end2 > src2);
862}
863
864int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs)
865{
866 char32_t uc1 = QChar::Null;
867 auto src1 = reinterpret_cast<const uchar *>(utf8.data());
868 auto end1 = src1 + utf8.size();
869 auto src2 = reinterpret_cast<const uchar *>(s.latin1());
870 auto end2 = src2 + s.size();
871
872 while (src1 < end1 && src2 < end2) {
873 uchar b = *src1++;
874 char32_t *output = &uc1;
875 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1);
876 if (res < 0) {
877 // decoding error
878 uc1 = QChar::ReplacementCharacter;
879 }
880
881 char32_t uc2 = *src2++;
882 if (cs == Qt::CaseInsensitive) {
883 uc1 = QChar::toCaseFolded(ucs4: uc1);
884 uc2 = QChar::toCaseFolded(ucs4: uc2);
885 }
886 if (uc1 != uc2)
887 return int(uc1) - int(uc2);
888 }
889
890 // the shorter string sorts first
891 return (end1 > src1) - (end2 > src2);
892}
893
894int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs) noexcept
895{
896 if (lhs.isEmpty())
897 return qt_lencmp(lhs: 0, rhs: rhs.size());
898
899 if (cs == Qt::CaseSensitive) {
900 const auto l = std::min(a: lhs.size(), b: rhs.size());
901 int r = memcmp(s1: lhs.data(), s2: rhs.data(), n: l);
902 return r ? r : qt_lencmp(lhs: lhs.size(), rhs: rhs.size());
903 }
904
905 char32_t uc1 = QChar::Null;
906 auto src1 = reinterpret_cast<const uchar *>(lhs.data());
907 auto end1 = src1 + lhs.size();
908 char32_t uc2 = QChar::Null;
909 auto src2 = reinterpret_cast<const uchar *>(rhs.data());
910 auto end2 = src2 + rhs.size();
911
912 while (src1 < end1 && src2 < end2) {
913 uchar b = *src1++;
914 char32_t *output = &uc1;
915 qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1);
916 if (res < 0) {
917 // decoding error
918 uc1 = QChar::ReplacementCharacter;
919 }
920
921 b = *src2++;
922 output = &uc2;
923 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src2, end: end2);
924 if (res < 0) {
925 // decoding error
926 uc2 = QChar::ReplacementCharacter;
927 }
928
929 uc1 = QChar::toCaseFolded(ucs4: uc1);
930 uc2 = QChar::toCaseFolded(ucs4: uc2);
931 if (uc1 != uc2)
932 return int(uc1) - int(uc2);
933 }
934
935 // the shorter string sorts first
936 return (end1 > src1) - (end2 > src2);
937}
938
939QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
940{
941 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
942 qsizetype length = 2 * in.size();
943 if (writeBom)
944 length += 2;
945
946 QByteArray d(length, Qt::Uninitialized);
947 char *end = convertFromUnicode(out: d.data(), in, state, endian);
948 Q_ASSERT(end - d.constData() == d.size());
949 Q_UNUSED(end);
950 return d;
951}
952
953char *QUtf16::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
954{
955 Q_ASSERT(state);
956 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
957
958 if (endian == DetectEndianness)
959 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
960
961 if (writeBom) {
962 // set them up the BOM
963 QChar bom(QChar::ByteOrderMark);
964 if (endian == BigEndianness)
965 qToBigEndian(src: bom.unicode(), dest: out);
966 else
967 qToLittleEndian(src: bom.unicode(), dest: out);
968 out += 2;
969 }
970 if (endian == BigEndianness)
971 qToBigEndian<char16_t>(source: in.data(), count: in.size(), dest: out);
972 else
973 qToLittleEndian<char16_t>(source: in.data(), count: in.size(), dest: out);
974
975 state->remainingChars = 0;
976 state->internalState |= HeaderDone;
977 return out + 2*in.size();
978}
979
980QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
981{
982 QString result((in.size() + 1) >> 1, Qt::Uninitialized); // worst case
983 QChar *qch = convertToUnicode(out: result.data(), in, state, endian);
984 result.truncate(pos: qch - result.constData());
985 return result;
986}
987
988QChar *QUtf16::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
989{
990 qsizetype len = in.size();
991 const char *chars = in.data();
992
993 Q_ASSERT(state);
994
995 if (endian == DetectEndianness)
996 endian = (DataEndianness)state->state_data[Endian];
997
998 const char *end = chars + len;
999
1000 // make sure we can decode at least one char
1001 if (state->remainingChars + len < 2) {
1002 if (len) {
1003 Q_ASSERT(state->remainingChars == 0 && len == 1);
1004 state->remainingChars = 1;
1005 state->state_data[Data] = *chars;
1006 }
1007 return out;
1008 }
1009
1010 bool headerdone = state && state->internalState & HeaderDone;
1011 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1012 headerdone = true;
1013
1014 if (!headerdone || state->remainingChars) {
1015 uchar buf;
1016 if (state->remainingChars)
1017 buf = state->state_data[Data];
1018 else
1019 buf = *chars++;
1020
1021 // detect BOM, set endianness
1022 state->internalState |= HeaderDone;
1023 QChar ch(buf, *chars++);
1024 if (endian == DetectEndianness) {
1025 // someone set us up the BOM
1026 if (ch == QChar::ByteOrderSwapped) {
1027 endian = BigEndianness;
1028 } else if (ch == QChar::ByteOrderMark) {
1029 endian = LittleEndianness;
1030 } else {
1031 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1032 endian = BigEndianness;
1033 } else {
1034 endian = LittleEndianness;
1035 }
1036 }
1037 }
1038 if (endian == BigEndianness)
1039 ch = QChar::fromUcs2(c: (ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
1040 if (headerdone || ch != QChar::ByteOrderMark)
1041 *out++ = ch;
1042 } else if (endian == DetectEndianness) {
1043 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1044 }
1045
1046 qsizetype nPairs = (end - chars) >> 1;
1047 if (endian == BigEndianness)
1048 qFromBigEndian<char16_t>(source: chars, count: nPairs, dest: out);
1049 else
1050 qFromLittleEndian<char16_t>(source: chars, count: nPairs, dest: out);
1051 out += nPairs;
1052
1053 state->state_data[Endian] = endian;
1054 state->remainingChars = 0;
1055 if ((end - chars) & 1) {
1056 if (state->flags & QStringConverter::Flag::Stateless) {
1057 *out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
1058 } else {
1059 state->remainingChars = 1;
1060 state->state_data[Data] = *(end - 1);
1061 }
1062 } else {
1063 state->state_data[Data] = 0;
1064 }
1065
1066 return out;
1067}
1068
1069QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
1070{
1071 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1072 qsizetype length = 4*in.size();
1073 if (writeBom)
1074 length += 4;
1075 QByteArray ba(length, Qt::Uninitialized);
1076 char *end = convertFromUnicode(out: ba.data(), in, state, endian);
1077 ba.truncate(pos: end - ba.constData());
1078 return ba;
1079}
1080
1081char *QUtf32::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
1082{
1083 Q_ASSERT(state);
1084
1085 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1086 if (endian == DetectEndianness)
1087 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1088
1089 if (writeBom) {
1090 // set them up the BOM
1091 if (endian == BigEndianness) {
1092 out[0] = 0;
1093 out[1] = 0;
1094 out[2] = (char)0xfe;
1095 out[3] = (char)0xff;
1096 } else {
1097 out[0] = (char)0xff;
1098 out[1] = (char)0xfe;
1099 out[2] = 0;
1100 out[3] = 0;
1101 }
1102 out += 4;
1103 state->internalState |= HeaderDone;
1104 }
1105
1106 const QChar *uc = in.data();
1107 const QChar *end = in.data() + in.size();
1108 QChar ch;
1109 char32_t ucs4;
1110 if (state->remainingChars == 1) {
1111 auto character = state->state_data[Data];
1112 Q_ASSERT(character <= 0xFFFF);
1113 ch = QChar(character);
1114 // this is ugly, but shortcuts a whole lot of logic that would otherwise be required
1115 state->remainingChars = 0;
1116 goto decode_surrogate;
1117 }
1118
1119 while (uc < end) {
1120 ch = *uc++;
1121 if (Q_LIKELY(!ch.isSurrogate())) {
1122 ucs4 = ch.unicode();
1123 } else if (Q_LIKELY(ch.isHighSurrogate())) {
1124decode_surrogate:
1125 if (uc == end) {
1126 if (state->flags & QStringConverter::Flag::Stateless) {
1127 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1128 } else {
1129 state->remainingChars = 1;
1130 state->state_data[Data] = ch.unicode();
1131 return out;
1132 }
1133 } else if (uc->isLowSurrogate()) {
1134 ucs4 = QChar::surrogateToUcs4(high: ch, low: *uc++);
1135 } else {
1136 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1137 }
1138 } else {
1139 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1140 }
1141 if (endian == BigEndianness)
1142 qToBigEndian(src: ucs4, dest: out);
1143 else
1144 qToLittleEndian(src: ucs4, dest: out);
1145 out += 4;
1146 }
1147
1148 return out;
1149}
1150
1151QString QUtf32::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1152{
1153 QString result;
1154 result.resize(size: (in.size() + 7) >> 1); // worst case
1155 QChar *end = convertToUnicode(out: result.data(), in, state, endian);
1156 result.truncate(pos: end - result.constData());
1157 return result;
1158}
1159
1160QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1161{
1162 qsizetype len = in.size();
1163 const char *chars = in.data();
1164
1165 Q_ASSERT(state);
1166 if (endian == DetectEndianness)
1167 endian = (DataEndianness)state->state_data[Endian];
1168
1169 const char *end = chars + len;
1170
1171 uchar tuple[4];
1172 memcpy(dest: tuple, src: &state->state_data[Data], n: 4);
1173
1174 // make sure we can decode at least one char
1175 if (state->remainingChars + len < 4) {
1176 if (len) {
1177 while (chars < end) {
1178 tuple[state->remainingChars] = *chars;
1179 ++state->remainingChars;
1180 ++chars;
1181 }
1182 Q_ASSERT(state->remainingChars < 4);
1183 memcpy(dest: &state->state_data[Data], src: tuple, n: 4);
1184 }
1185 return out;
1186 }
1187
1188 bool headerdone = state->internalState & HeaderDone;
1189 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1190 headerdone = true;
1191
1192 qsizetype num = state->remainingChars;
1193 state->remainingChars = 0;
1194
1195 if (!headerdone || endian == DetectEndianness || num) {
1196 while (num < 4)
1197 tuple[num++] = *chars++;
1198 if (endian == DetectEndianness) {
1199 // someone set us up the BOM?
1200 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
1201 endian = LittleEndianness;
1202 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
1203 endian = BigEndianness;
1204 } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1205 endian = BigEndianness;
1206 } else {
1207 endian = LittleEndianness;
1208 }
1209 }
1210 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(src: tuple) : qFromLittleEndian<char32_t>(src: tuple);
1211 if (headerdone || code != QChar::ByteOrderMark) {
1212 if (QChar::requiresSurrogates(ucs4: code)) {
1213 *out++ = QChar(QChar::highSurrogate(ucs4: code));
1214 *out++ = QChar(QChar::lowSurrogate(ucs4: code));
1215 } else {
1216 *out++ = QChar(code);
1217 }
1218 }
1219 num = 0;
1220 } else if (endian == DetectEndianness) {
1221 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1222 }
1223 state->state_data[Endian] = endian;
1224 state->internalState |= HeaderDone;
1225
1226 while (chars < end) {
1227 tuple[num++] = *chars++;
1228 if (num == 4) {
1229 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(src: tuple) : qFromLittleEndian<char32_t>(src: tuple);
1230 for (char16_t c : QChar::fromUcs4(c: code))
1231 *out++ = c;
1232 num = 0;
1233 }
1234 }
1235
1236 if (num) {
1237 if (state->flags & QStringDecoder::Flag::Stateless) {
1238 *out++ = QChar::ReplacementCharacter;
1239 } else {
1240 state->state_data[Endian] = endian;
1241 state->remainingChars = num;
1242 memcpy(dest: &state->state_data[Data], src: tuple, n: 4);
1243 }
1244 }
1245
1246 return out;
1247}
1248
1249#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
1250int QLocal8Bit::checkUtf8()
1251{
1252 return GetACP() == CP_UTF8 ? 1 : -1;
1253}
1254
1255static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state)
1256{
1257 qsizetype length = in.size();
1258 const char *chars = in.data();
1259
1260 Q_ASSERT(state);
1261 if (state->flags & QStringConverter::Flag::Stateless) // temporary
1262 state = nullptr;
1263
1264 if (!chars || !length)
1265 return QString();
1266
1267 qsizetype copyLocation = 0;
1268 qsizetype extra = 2;
1269 if (state && state->remainingChars) {
1270 copyLocation = state->remainingChars;
1271 extra += copyLocation;
1272 }
1273 qsizetype newLength = length + extra;
1274 char *mbcs = new char[newLength];
1275 //ensure that we have a NULL terminated string
1276 mbcs[newLength-1] = 0;
1277 mbcs[newLength-2] = 0;
1278 memcpy(&(mbcs[copyLocation]), chars, length);
1279 if (copyLocation) {
1280 //copy the last character from the state
1281 mbcs[0] = (char)state->state_data[0];
1282 state->remainingChars = 0;
1283 }
1284 const char *mb = mbcs;
1285 const char *next = 0;
1286 QString s;
1287 while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
1288 wchar_t wc[2] ={0};
1289 int charlength = int(next - mb); // always just a few bytes
1290 int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
1291 if (len>0) {
1292 s.append(QChar(wc[0]));
1293 } else {
1294 int r = GetLastError();
1295 //check if the character being dropped is the last character
1296 if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
1297 state->remainingChars = 1;
1298 state->state_data[0] = (char)*mb;
1299 }
1300 }
1301 mb = next;
1302 }
1303 delete [] mbcs;
1304 return s;
1305}
1306
1307
1308QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
1309{
1310 qsizetype length = in.size();
1311
1312 Q_ASSERT(length < INT_MAX); // ### FIXME
1313 const char *mb = in.data();
1314 int mblen = length;
1315
1316 if (!mb || !mblen)
1317 return QString();
1318
1319 QVarLengthArray<wchar_t, 4096> wc(4096);
1320 int len;
1321 QString sp;
1322 bool prepend = false;
1323 char state_data = 0;
1324 int remainingChars = 0;
1325
1326 //save the current state information
1327 if (state) {
1328 state_data = (char)state->state_data[0];
1329 remainingChars = state->remainingChars;
1330 }
1331
1332 //convert the pending character (if available)
1333 if (state && remainingChars) {
1334 char prev[3] = {0};
1335 prev[0] = state_data;
1336 prev[1] = mb[0];
1337 remainingChars = 0;
1338 len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
1339 prev, 2, wc.data(), wc.length());
1340 if (len) {
1341 sp.append(QChar(wc[0]));
1342 if (mblen == 1) {
1343 state->remainingChars = 0;
1344 return sp;
1345 }
1346 prepend = true;
1347 mb++;
1348 mblen--;
1349 wc[0] = 0;
1350 }
1351 }
1352
1353 while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
1354 mb, mblen, wc.data(), wc.length()))) {
1355 int r = GetLastError();
1356 if (r == ERROR_INSUFFICIENT_BUFFER) {
1357 const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
1358 mb, mblen, 0, 0);
1359 wc.resize(wclen);
1360 } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
1361 //find the last non NULL character
1362 while (mblen > 1 && !(mb[mblen-1]))
1363 mblen--;
1364 //check whether, we hit an invalid character in the middle
1365 if ((mblen <= 1) || (remainingChars && state_data))
1366 return convertToUnicodeCharByChar(in, state);
1367 //Remove the last character and try again...
1368 state_data = mb[mblen-1];
1369 remainingChars = 1;
1370 mblen--;
1371 } else {
1372 // Fail.
1373 qWarning("MultiByteToWideChar: Cannot convert multibyte text");
1374 break;
1375 }
1376 }
1377
1378 if (len <= 0)
1379 return QString();
1380
1381 if (wc[len-1] == 0) // len - 1: we don't want terminator
1382 --len;
1383
1384 //save the new state information
1385 if (state) {
1386 state->state_data[0] = (char)state_data;
1387 state->remainingChars = remainingChars;
1388 }
1389 QString s((QChar*)wc.data(), len);
1390 if (prepend) {
1391 return sp+s;
1392 }
1393 return s;
1394}
1395
1396QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
1397{
1398 const QChar *ch = in.data();
1399 qsizetype uclen = in.size();
1400
1401 Q_ASSERT(uclen < INT_MAX); // ### FIXME
1402 Q_ASSERT(state);
1403 Q_UNUSED(state); // ### Fixme
1404 if (state->flags & QStringConverter::Flag::Stateless) // temporary
1405 state = nullptr;
1406
1407 if (!ch)
1408 return QByteArray();
1409 if (uclen == 0)
1410 return QByteArray("");
1411 BOOL used_def;
1412 QByteArray mb(4096, 0);
1413 int len;
1414 while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
1415 mb.data(), mb.size()-1, 0, &used_def)))
1416 {
1417 int r = GetLastError();
1418 if (r == ERROR_INSUFFICIENT_BUFFER) {
1419 mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
1420 (const wchar_t*)ch, uclen,
1421 0, 0, 0, &used_def));
1422 // and try again...
1423 } else {
1424 // Fail. Probably can't happen in fact (dwFlags is 0).
1425#ifndef QT_NO_DEBUG
1426 // Can't use qWarning(), as it'll recurse to handle %ls
1427 fprintf(stderr,
1428 "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
1429 r, reinterpret_cast<const wchar_t*>(QString(ch, uclen).utf16()));
1430#endif
1431 break;
1432 }
1433 }
1434 mb.resize(len);
1435 return mb;
1436}
1437#endif
1438
1439void QStringConverter::State::clear() noexcept
1440{
1441 if (clearFn)
1442 clearFn(this);
1443 else
1444 state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
1445 remainingChars = 0;
1446 invalidChars = 0;
1447 internalState = 0;
1448}
1449
1450void QStringConverter::State::reset() noexcept
1451{
1452 if (flags & Flag::UsesIcu) {
1453#if QT_CONFIG(icu)
1454 UConverter *converter = static_cast<UConverter *>(d[0]);
1455 if (converter)
1456 ucnv_reset(converter);
1457#else
1458 Q_UNREACHABLE();
1459#endif
1460 } else {
1461 clear();
1462 }
1463}
1464
1465static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
1466{
1467 return QUtf16::convertToUnicode(out, in, state, endian: DetectEndianness);
1468}
1469
1470static char *toUtf16(char *out, QStringView in, QStringConverter::State *state)
1471{
1472 return QUtf16::convertFromUnicode(out, in, state, endian: DetectEndianness);
1473}
1474
1475static QChar *fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1476{
1477 return QUtf16::convertToUnicode(out, in, state, endian: BigEndianness);
1478}
1479
1480static char *toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
1481{
1482 return QUtf16::convertFromUnicode(out, in, state, endian: BigEndianness);
1483}
1484
1485static QChar *fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1486{
1487 return QUtf16::convertToUnicode(out, in, state, endian: LittleEndianness);
1488}
1489
1490static char *toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
1491{
1492 return QUtf16::convertFromUnicode(out, in, state, endian: LittleEndianness);
1493}
1494
1495static QChar *fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
1496{
1497 return QUtf32::convertToUnicode(out, in, state, endian: DetectEndianness);
1498}
1499
1500static char *toUtf32(char *out, QStringView in, QStringConverter::State *state)
1501{
1502 return QUtf32::convertFromUnicode(out, in, state, endian: DetectEndianness);
1503}
1504
1505static QChar *fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1506{
1507 return QUtf32::convertToUnicode(out, in, state, endian: BigEndianness);
1508}
1509
1510static char *toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
1511{
1512 return QUtf32::convertFromUnicode(out, in, state, endian: BigEndianness);
1513}
1514
1515static QChar *fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1516{
1517 return QUtf32::convertToUnicode(out, in, state, endian: LittleEndianness);
1518}
1519
1520static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
1521{
1522 return QUtf32::convertFromUnicode(out, in, state, endian: LittleEndianness);
1523}
1524
1525char *QLatin1::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept
1526{
1527 Q_ASSERT(state);
1528 if (state->flags & QStringConverter::Flag::Stateless) // temporary
1529 state = nullptr;
1530
1531 const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
1532 qsizetype invalid = 0;
1533 for (qsizetype i = 0; i < in.size(); ++i) {
1534 if (in[i] > QChar(0xff)) {
1535 *out = replacement;
1536 ++invalid;
1537 } else {
1538 *out = (char)in[i].cell();
1539 }
1540 ++out;
1541 }
1542 if (state)
1543 state->invalidChars += invalid;
1544 return out;
1545}
1546
1547static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
1548{
1549 QString s = QLocal8Bit::convertToUnicode(in, state);
1550 memcpy(dest: out, src: s.constData(), n: s.size()*sizeof(QChar));
1551 return out + s.size();
1552}
1553
1554static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
1555{
1556 QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
1557 memcpy(dest: out, src: s.constData(), n: s.size());
1558 return out + s.size();
1559}
1560
1561
1562static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
1563static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
1564
1565static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
1566static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
1567
1568static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
1569static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
1570
1571static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
1572static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
1573
1574
1575
1576/*!
1577 \class QStringConverterBase
1578 \internal
1579
1580 Just a common base class for QStringConverter and QTextCodec
1581*/
1582
1583/*!
1584 \class QStringConverter
1585 \inmodule QtCore
1586 \brief The QStringConverter class provides a base class for encoding and decoding text.
1587 \reentrant
1588 \ingroup i18n
1589
1590 Qt uses UTF-16 to store, draw and manipulate strings. In many
1591 situations you may wish to deal with data that uses a different
1592 encoding. Most text data transferred over files and network connections is encoded
1593 in UTF-8.
1594
1595 The QStringConverter class is a base class for the \l {QStringEncoder} and
1596 \l {QStringDecoder} classes that help with converting between different
1597 text encodings. QStringDecoder can decode a string from an encoded representation
1598 into UTF-16, the format Qt uses internally. QStringEncoder does the opposite
1599 operation, encoding UTF-16 encoded data (usually in the form of a QString) to
1600 the requested encoding.
1601
1602 The supported encodings are:
1603
1604 \list
1605 \li UTF-8
1606 \li UTF-16
1607 \li UTF-16BE
1608 \li UTF-16LE
1609 \li UTF-32
1610 \li UTF-32BE
1611 \li UTF-32LE
1612 \li ISO-8859-1 (Latin-1)
1613 \li The system encoding
1614 \endlist
1615
1616 \l {QStringConverter}s can be used as follows to convert some encoded
1617 string to and from UTF-16.
1618
1619 Suppose you have some string encoded in UTF-8, and
1620 want to convert it to a QString. The simple way
1621 to do it is to use a \l {QStringDecoder} like this:
1622
1623 \snippet code/src_corelib_text_qstringconverter.cpp 0
1624
1625 After this, \c string holds the text in decoded form.
1626 Converting a string from Unicode to the local encoding is just as
1627 easy using the \l {QStringEncoder} class:
1628
1629 \snippet code/src_corelib_text_qstringconverter.cpp 1
1630
1631 To read or write text files in various encodings, use QTextStream and
1632 its \l{QTextStream::setEncoding()}{setEncoding()} function.
1633
1634 Some care must be taken when trying to convert the data in chunks,
1635 for example, when receiving it over a network. In such cases it is
1636 possible that a multi-byte character will be split over two
1637 chunks. At best this might result in the loss of a character and
1638 at worst cause the entire conversion to fail.
1639
1640 Both QStringEncoder and QStringDecoder make this easy, by tracking
1641 this in an internal state. So simply calling the encoder or decoder
1642 again with the next chunk of data will automatically continue encoding
1643 or decoding the data correctly:
1644
1645 \snippet code/src_corelib_text_qstringconverter.cpp 2
1646
1647 The QStringDecoder object maintains state between chunks and therefore
1648 works correctly even if a multi-byte character is split between
1649 chunks.
1650
1651 QStringConverter objects can't be copied because of their internal state, but
1652 can be moved.
1653
1654 \sa QTextStream, QStringDecoder, QStringEncoder
1655*/
1656
1657/*!
1658 \enum QStringConverter::Flag
1659
1660 \value Default Default conversion rules apply.
1661 \value ConvertInvalidToNull If this flag is set, each invalid input
1662 character is output as a null character. If it is not set,
1663 invalid input characters are represented as QChar::ReplacementCharacter
1664 if the output encoding can represent that character, otherwise as a question mark.
1665 \value WriteBom When converting from a QString to an output encoding, write a QChar::ByteOrderMark as the first
1666 character if the output encoding supports this. This is the case for UTF-8, UTF-16 and UTF-32
1667 encodings.
    \value ConvertInitialBom When converting from an input encoding to a QString the QStringDecoder usually skips a
            leading QChar::ByteOrderMark. When this flag is set, the byte order mark will not be
            skipped, but converted to UTF-16 and inserted at the start of the created QString.
1671 \value Stateless Ignore possible converter states between different function calls
1672 to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
1673 sequence of data is encountered.
1674 \omitvalue UsesIcu
1675*/
1676
1677/*!
1678 \enum QStringConverter::Encoding
1679 \value Utf8 Create a converter to or from UTF-8
1680 \value Utf16 Create a converter to or from UTF-16. When decoding, the byte order will get automatically
1681 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
1682 be assumed.
1683 \value Utf16BE Create a converter to or from big-endian UTF-16.
1684 \value Utf16LE Create a converter to or from little-endian UTF-16.
1685 \value Utf32 Create a converter to or from UTF-32. When decoding, the byte order will get automatically
1686 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
1687 be assumed.
1688 \value Utf32BE Create a converter to or from big-endian UTF-32.
1689 \value Utf32LE Create a converter to or from little-endian UTF-32.
1690 \value Latin1 Create a converter to or from ISO-8859-1 (Latin1).
    \value System Create a converter to or from the underlying encoding of the
           operating system's locale. This is always assumed to be UTF-8 for Unix-based
           systems. On Windows, this converts to and from the locale code page.
1694 \omitvalue LastEncoding
1695*/
1696
1697/*!
1698 \struct QStringConverter::Interface
1699 \internal
1700*/
1701
1702const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
1703{
1704 { .name: "UTF-8", .toUtf16: QUtf8::convertToUnicode, .toUtf16Len: fromUtf8Len, .fromUtf16: QUtf8::convertFromUnicode, .fromUtf16Len: toUtf8Len },
1705 { .name: "UTF-16", .toUtf16: fromUtf16, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16, .fromUtf16Len: toUtf16Len },
1706 { .name: "UTF-16LE", .toUtf16: fromUtf16LE, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16LE, .fromUtf16Len: toUtf16Len },
1707 { .name: "UTF-16BE", .toUtf16: fromUtf16BE, .toUtf16Len: fromUtf16Len, .fromUtf16: toUtf16BE, .fromUtf16Len: toUtf16Len },
1708 { .name: "UTF-32", .toUtf16: fromUtf32, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32, .fromUtf16Len: toUtf32Len },
1709 { .name: "UTF-32LE", .toUtf16: fromUtf32LE, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32LE, .fromUtf16Len: toUtf32Len },
1710 { .name: "UTF-32BE", .toUtf16: fromUtf32BE, .toUtf16Len: fromUtf32Len, .fromUtf16: toUtf32BE, .fromUtf16Len: toUtf32Len },
1711 { .name: "ISO-8859-1", .toUtf16: QLatin1::convertToUnicode, .toUtf16Len: fromLatin1Len, .fromUtf16: QLatin1::convertFromUnicode, .fromUtf16Len: toLatin1Len },
1712 { .name: "Locale", .toUtf16: fromLocal8Bit, .toUtf16Len: fromUtf8Len, .fromUtf16: toLocal8Bit, .fromUtf16Len: toUtf8Len }
1713};
1714
1715// match names case insensitive and skipping '-' and '_'
1716static bool nameMatch(const char *a, const char *b)
1717{
1718 while (*a && *b) {
1719 if (*a == '-' || *a == '_') {
1720 ++a;
1721 continue;
1722 }
1723 if (*b == '-' || *b == '_') {
1724 ++b;
1725 continue;
1726 }
1727 if (QtMiscUtils::toAsciiLower(ch: *a) != QtMiscUtils::toAsciiLower(ch: *b))
1728 return false;
1729 ++a;
1730 ++b;
1731 }
1732 return !*a && !*b;
1733}
1734
1735
1736/*!
1737 \fn constexpr QStringConverter::QStringConverter()
1738 \internal
1739*/
1740
1741/*!
1742 \fn constexpr QStringConverter::QStringConverter(Encoding, Flags)
1743 \internal
1744*/
1745
1746
1747#if QT_CONFIG(icu)
1748// only derives from QStringConverter to get access to protected types
1749struct QStringConverterICU : QStringConverter
1750{
1751 static void clear_function(QStringConverterBase::State *state) noexcept
1752 {
1753 ucnv_close(converter: static_cast<UConverter *>(state->d[0]));
1754 state->d[0] = nullptr;
1755 }
1756
1757 static void ensureConverter(QStringConverter::State *state)
1758 {
1759 // old code might reset the state via clear instead of reset
1760 // in that case, the converter has been closed, and we have to reopen it
1761 if (state->d[0] == nullptr)
1762 state->d[0] = createConverterForName(name: static_cast<const char *>(state->d[1]), state);
1763 }
1764
1765 static QChar *toUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
1766 {
1767 ensureConverter(state);
1768
1769 auto icu_conv = static_cast<UConverter *>(state->d[0]);
1770 UErrorCode err = U_ZERO_ERROR;
1771 auto source = in.data();
1772 auto sourceLimit = in.data() + in.size();
1773
1774 qsizetype length = toLen(inLength: in.size());
1775
1776 UChar *target = reinterpret_cast<UChar *>(out);
1777 auto targetLimit = target + length;
1778 // We explicitly clean up anyway, so no need to set flush to true,
1779 // which would just reset the converter.
1780 UBool flush = false;
1781
1782 // If the QStringConverter was moved, the state that we used as a context is stale now.
1783 UConverterToUCallback action;
1784 const void *context;
1785 ucnv_getToUCallBack(converter: icu_conv, action: &action, context: &context);
1786 if (context != state)
1787 ucnv_setToUCallBack(converter: icu_conv, newAction: action, newContext: &state, oldAction: nullptr, oldContext: nullptr, err: &err);
1788
1789 ucnv_toUnicode(converter: icu_conv, target: &target, targetLimit, source: &source, sourceLimit, offsets: nullptr, flush, err: &err);
1790 // We did reserve enough space:
1791 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
1792 if (state->flags.testFlag(flag: QStringConverter::Flag::Stateless)) {
1793 if (auto leftOver = ucnv_toUCountPending(cnv: icu_conv, status: &err)) {
1794 ucnv_reset(converter: icu_conv);
1795 state->invalidChars += leftOver;
1796 }
1797 }
1798 return reinterpret_cast<QChar *>(target);
1799 }
1800
1801 static char *fromUtf16(char *out, QStringView in, QStringConverter::State *state)
1802 {
1803 ensureConverter(state);
1804 auto icu_conv = static_cast<UConverter *>(state->d[0]);
1805 UErrorCode err = U_ZERO_ERROR;
1806 auto source = reinterpret_cast<const UChar *>(in.data());
1807 auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size());
1808
1809 qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv));
1810
1811 char *target = out;
1812 char *targetLimit = out + length;
1813 UBool flush = false;
1814
1815 // If the QStringConverter was moved, the state that we used as a context is stale now.
1816 UConverterFromUCallback action;
1817 const void *context;
1818 ucnv_getFromUCallBack(converter: icu_conv, action: &action, context: &context);
1819 if (context != state)
1820 ucnv_setFromUCallBack(converter: icu_conv, newAction: action, newContext: &state, oldAction: nullptr, oldContext: nullptr, err: &err);
1821
1822 ucnv_fromUnicode(converter: icu_conv, target: &target, targetLimit, source: &source, sourceLimit, offsets: nullptr, flush, err: &err);
1823 // We did reserve enough space:
1824 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
1825 if (state->flags.testFlag(flag: QStringConverter::Flag::Stateless)) {
1826 if (auto leftOver = ucnv_fromUCountPending(cnv: icu_conv, status: &err)) {
1827 ucnv_reset(converter: icu_conv);
1828 state->invalidChars += leftOver;
1829 }
1830 }
1831 return target;
1832 }
1833
1834 Q_DISABLE_COPY_MOVE(QStringConverterICU)
1835
1836 template<qsizetype X>
1837 static qsizetype fromLen(qsizetype inLength)
1838 {
1839 return X * inLength * sizeof(UChar);
1840 }
1841
1842 static qsizetype toLen(qsizetype inLength)
1843 {
1844
1845 /* Assumption: each input char might map to a different codepoint
1846 Each codepoint can take up to 4 bytes == 2 QChar
1847 We can ignore reserving space for a BOM, as only UTF encodings use one
1848 and those are not handled by the ICU converter.
1849 */
1850 return 2 * inLength;
1851 }
1852
1853 static constexpr QStringConverter::Interface forLength[] = {
1854 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<1>},
1855 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<2>},
1856 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<3>},
1857 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<4>},
1858 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<5>},
1859 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<6>},
1860 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<7>},
1861 {.name: "icu, recompile if you see this", .toUtf16: QStringConverterICU::toUtf16, .toUtf16Len: QStringConverterICU::toLen, .fromUtf16: QStringConverterICU::fromUtf16, .fromUtf16Len: QStringConverterICU::fromLen<8>}
1862 };
1863
1864 static UConverter *createConverterForName(const char *name, const State *state)
1865 {
1866 Q_ASSERT(name);
1867 Q_ASSERT(state);
1868 UErrorCode status = U_ZERO_ERROR;
1869 UConverter *conv = ucnv_open(converterName: name, err: &status);
1870 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
1871 ucnv_close(converter: conv);
1872 return nullptr;
1873 }
1874
1875 if (state->flags.testFlag(flag: Flag::ConvertInvalidToNull)) {
1876 UErrorCode error = U_ZERO_ERROR;
1877
1878 auto nullToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
1879 const char *, int32_t length,
1880 UConverterCallbackReason reason, UErrorCode *err) {
1881 if (reason <= UCNV_IRREGULAR) {
1882 *err = U_ZERO_ERROR;
1883 UChar c = '\0';
1884 ucnv_cbToUWriteUChars(args: toUArgs, source: &c, length: 1, offsetIndex: 0, err);
1885 // Recover outer scope's state (which isn't const) from context:
1886 auto state = const_cast<State *>(static_cast<const State *>(context));
1887 state->invalidChars += length;
1888 }
1889 };
1890 ucnv_setToUCallBack(converter: conv, newAction: nullToSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
1891
1892 auto nullFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
1893 const UChar *, int32_t length,
1894 UChar32, UConverterCallbackReason reason, UErrorCode *err) {
1895 if (reason <= UCNV_IRREGULAR) {
1896 *err = U_ZERO_ERROR;
1897 const UChar replacement[] = { 0 };
1898 const UChar *stringBegin = std::begin(arr: replacement);
1899 ucnv_cbFromUWriteUChars(args: fromUArgs, source: &stringBegin, sourceLimit: std::end(arr: replacement), offsetIndex: 0, err);
1900 // Recover outer scope's state (which isn't const) from context:
1901 auto state = const_cast<State *>(static_cast<const State *>(context));
1902 state->invalidChars += length;
1903 }
1904 };
1905 ucnv_setFromUCallBack(converter: conv, newAction: nullFromSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
1906 } else {
1907 UErrorCode error = U_ZERO_ERROR;
1908
1909 auto qmarkToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
1910 const char *codeUnits,int32_t length,
1911 UConverterCallbackReason reason, UErrorCode *err) {
1912 if (reason <= UCNV_IRREGULAR) {
1913 // Recover outer scope's state (which isn't const) from context:
1914 auto state = const_cast<State *>(static_cast<const State *>(context));
1915 state->invalidChars += length;
1916 }
1917 // use existing ICU callback for logic
1918 UCNV_TO_U_CALLBACK_SUBSTITUTE(context: nullptr, toUArgs, codeUnits, length, reason, err);
1919
1920 };
1921 ucnv_setToUCallBack(converter: conv, newAction: qmarkToSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
1922
1923 auto qmarkFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
1924 const UChar *codeUnits, int32_t length,
1925 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
1926 if (reason <= UCNV_IRREGULAR) {
1927 // Recover outer scope's state (which isn't const) from context:
1928 auto state = const_cast<State *>(static_cast<const State *>(context));
1929 state->invalidChars += length;
1930 }
1931 // use existing ICU callback for logic
1932 UCNV_FROM_U_CALLBACK_SUBSTITUTE(context: nullptr, fromUArgs, codeUnits, length,
1933 codePoint, reason, err);
1934 };
1935 ucnv_setFromUCallBack(converter: conv, newAction: qmarkFromSubstituter, newContext: state, oldAction: nullptr, oldContext: nullptr, err: &error);
1936 }
1937 return conv;
1938 }
1939
1940 static const QStringConverter::Interface *make_icu_converter(
1941 QStringConverterBase::State *state,
1942 const char *name)
1943 {
1944 UErrorCode status = U_ZERO_ERROR;
1945 UConverter *conv = createConverterForName(name, state);
1946 if (!conv)
1947 return nullptr;
1948
1949 const char *icuName = ucnv_getName(converter: conv, err: &status);
1950 // ucnv_getStandardName returns a name which is owned by the library
1951 // we can thus store it in the state without worrying aobut its lifetime
1952 const char *persistentName = ucnv_getStandardName(name: icuName, standard: "MIME", pErrorCode: &status);
1953 if (U_FAILURE(code: status) || !persistentName) {
1954 status = U_ZERO_ERROR;
1955 persistentName = ucnv_getStandardName(name: icuName, standard: "IANA", pErrorCode: &status);
1956 }
1957 state->d[1] = const_cast<char *>(persistentName);
1958 state->d[0] = conv;
1959 state->flags |= QStringConverterBase::Flag::UsesIcu;
1960 qsizetype maxCharSize = ucnv_getMaxCharSize(converter: conv);
1961 state->clearFn = QStringConverterICU::clear_function;
1962 if (maxCharSize > 8 || maxCharSize < 1) {
1963 qWarning(msg: "Encountered unexpected codec \"%s\" which requires >8x space", name);
1964 return nullptr;
1965 } else {
1966 return &forLength[maxCharSize - 1];
1967 }
1968
1969 }
1970
1971};
1972#endif
1973
1974/*!
1975 \internal
1976*/
1977QStringConverter::QStringConverter(const char *name, Flags f)
1978 : iface(nullptr), state(f)
1979{
1980 auto e = encodingForName(name);
1981 if (e)
1982 iface = encodingInterfaces + int(*e);
1983#if QT_CONFIG(icu)
1984 else
1985 iface = QStringConverterICU::make_icu_converter(state: &state, name);
1986#endif
1987}
1988
1989
1990const char *QStringConverter::name() const noexcept
1991{
1992 if (!iface)
1993 return nullptr;
1994 if (state.flags & QStringConverter::Flag::UsesIcu) {
1995#if QT_CONFIG(icu)
1996 return static_cast<const char*>(state.d[1]);
1997#else
1998 return nullptr;
1999#endif
2000 } else {
2001 return iface->name;
2002 }
2003}
2004
2005/*!
2006 \fn bool QStringConverter::isValid() const
2007
2008 Returns true if this is a valid string converter that can be used for encoding or
2009 decoding text.
2010
2011 Default constructed string converters or converters constructed with an unsupported
2012 name are not valid.
2013*/
2014
2015/*!
2016 \fn void QStringConverter::resetState()
2017
2018 Resets the internal state of the converter, clearing potential errors or partial
2019 conversions.
2020*/
2021
2022/*!
2023 \fn bool QStringConverter::hasError() const
2024
2025 Returns true if a conversion could not correctly convert a character. This could for example
2026 get triggered by an invalid UTF-8 sequence or when a character can't get converted due to
2027 limitations in the target encoding.
2028*/
2029
2030/*!
2031 \fn const char *QStringConverter::name() const
2032
2033 Returns the canonical name of the encoding this QStringConverter can encode or decode.
2034 Returns a nullptr if the converter is not valid.
2035
2036 \sa isValid()
2037*/
2038
2039/*!
2040 Convert \a name to the corresponding \l Encoding member, if there is one.
2041
2042 If the \a name is not the name of a codec listed in the Encoding enumeration,
2043 \c{std::nullopt} is returned. Such a name may, none the less, be accepted by
2044 the QStringConverter constructor when Qt is built with ICU, if ICU provides a
2045 converter with the given name.
2046*/
2047std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name) noexcept
2048{
2049 for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
2050 if (nameMatch(a: encodingInterfaces[i].name, b: name))
2051 return QStringConverter::Encoding(i);
2052 }
2053 if (nameMatch(a: name, b: "latin1"))
2054 return QStringConverter::Latin1;
2055 return std::nullopt;
2056}
2057
2058/*!
2059 Returns the encoding for the content of \a data if it can be determined.
2060 \a expectedFirstCharacter can be passed as an additional hint to help determine
2061 the encoding.
2062
2063 The returned optional is empty, if the encoding is unclear.
2064 */
2065std::optional<QStringConverter::Encoding>
2066QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter) noexcept
2067{
2068 // someone set us up the BOM?
2069 qsizetype arraySize = data.size();
2070 if (arraySize > 3) {
2071 char32_t uc = qFromUnaligned<char32_t>(src: data.data());
2072 if (uc == qToBigEndian(source: char32_t(QChar::ByteOrderMark)))
2073 return QStringConverter::Utf32BE;
2074 if (uc == qToLittleEndian(source: char32_t(QChar::ByteOrderMark)))
2075 return QStringConverter::Utf32LE;
2076 if (expectedFirstCharacter) {
2077 // catch also anything starting with the expected character
2078 if (qToLittleEndian(source: uc) == expectedFirstCharacter)
2079 return QStringConverter::Utf32LE;
2080 else if (qToBigEndian(source: uc) == expectedFirstCharacter)
2081 return QStringConverter::Utf32BE;
2082 }
2083 }
2084
2085 if (arraySize > 2) {
2086 if (memcmp(s1: data.data(), s2: utf8bom, n: sizeof(utf8bom)) == 0)
2087 return QStringConverter::Utf8;
2088 }
2089
2090 if (arraySize > 1) {
2091 char16_t uc = qFromUnaligned<char16_t>(src: data.data());
2092 if (uc == qToBigEndian(source: char16_t(QChar::ByteOrderMark)))
2093 return QStringConverter::Utf16BE;
2094 if (uc == qToLittleEndian(source: char16_t(QChar::ByteOrderMark)))
2095 return QStringConverter::Utf16LE;
2096 if (expectedFirstCharacter) {
2097 // catch also anything starting with the expected character
2098 if (qToLittleEndian(source: uc) == expectedFirstCharacter)
2099 return QStringConverter::Utf16LE;
2100 else if (qToBigEndian(source: uc) == expectedFirstCharacter)
2101 return QStringConverter::Utf16BE;
2102 }
2103 }
2104 return std::nullopt;
2105}
2106
2107static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
2108{
2109 static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher(pattern: "meta ");
2110 static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher(pattern: "charset=");
2111
2112 QByteArray header = data.first(n: qMin(a: data.size(), b: qsizetype(1024))).toByteArray().toLower();
2113 qsizetype pos = metaSearcher.indexIn(haystack: header);
2114 if (pos != -1) {
2115 pos = charsetSearcher.indexIn(haystack: header, from: pos);
2116 if (pos != -1) {
2117 pos += qstrlen(str: "charset=");
2118 if (pos < header.size() && (header.at(i: pos) == '\"' || header.at(i: pos) == '\''))
2119 ++pos;
2120
2121 qsizetype pos2 = pos;
2122 // The attribute can be closed with either """, "'", ">" or "/",
2123 // none of which are valid charset characters.
2124 while (++pos2 < header.size()) {
2125 char ch = header.at(i: pos2);
2126 if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
2127 QByteArray name = header.mid(index: pos, len: pos2 - pos);
2128 qsizetype colon = name.indexOf(c: ':');
2129 if (colon > 0)
2130 name = name.left(len: colon);
2131 name = name.simplified();
2132 if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
2133 name = QByteArrayLiteral("UTF-8");
2134 if (!name.isEmpty())
2135 return name;
2136 }
2137 }
2138 }
2139 }
2140 return QByteArray();
2141}
2142
2143/*!
2144 Tries to determine the encoding of the HTML in \a data by looking at leading byte
2145 order marks or a charset specifier in the HTML meta tag. If the optional is empty,
2146 the encoding specified is not supported by QStringConverter. If no encoding is
2147 detected, the method returns Utf8.
2148
2149 \sa QStringDecoder::decoderForHtml()
2150*/
2151std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
2152{
2153 // determine charset
2154 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2155 if (encoding)
2156 // trust the initial BOM
2157 return encoding;
2158
2159 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2160 if (!encodingTag.isEmpty())
2161 return encodingForName(name: encodingTag);
2162
2163 return Utf8;
2164}
2165
2166/*!
2167 Tries to determine the encoding of the HTML in \a data by looking at leading byte
2168 order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder
2169 matching the encoding. If the returned decoder is not valid,
2170 the encoding specified is not supported by QStringConverter. If no encoding is
2171 detected, the method returns a decoder for Utf8.
2172
2173 \sa isValid()
2174*/
2175QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data)
2176{
2177 // determine charset
2178 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2179 if (encoding)
2180 // trust the initial BOM
2181 return QStringDecoder(encoding.value());
2182
2183 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2184 if (!encodingTag.isEmpty())
2185 return QStringDecoder(encodingTag);
2186
2187 return QStringDecoder(Utf8);
2188}
2189
2190
2191/*!
2192 Returns the canonical name for encoding \a e.
2193*/
2194const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
2195{
2196 return encodingInterfaces[int(e)].name;
2197}
2198
2199/*!
2200 \class QStringEncoder
2201 \inmodule QtCore
2202 \brief The QStringEncoder class provides a state-based encoder for text.
2203 \reentrant
2204 \ingroup i18n
2205
2206 A text encoder converts text from Qt's internal representation into an encoded
2207 text format using a specific encoding.
2208
2209 Converting a string from Unicode to the local encoding can be achieved
2210 using the following code:
2211
2212 \snippet code/src_corelib_text_qstringconverter.cpp 1
2213
2214 The encoder remembers any state that is required between calls, so converting
2215 data received in chunks, for example, when receiving it over a network, is just as
2216 easy, by calling the encoder whenever new data is available:
2217
2218 \snippet code/src_corelib_text_qstringconverter.cpp 3
2219
2220 The QStringEncoder object maintains state between chunks and therefore
2221 works correctly even if a UTF-16 surrogate character is split between
2222 chunks.
2223
2224 QStringEncoder objects can't be copied because of their internal state, but
2225 can be moved.
2226
2227 \sa QStringConverter, QStringDecoder
2228*/
2229
2230/*!
2231 \fn constexpr QStringEncoder::QStringEncoder(const Interface *i)
2232 \internal
2233*/
2234
2235/*!
2236 \fn constexpr QStringEncoder::QStringEncoder()
2237
2238 Default constructs an encoder. The default encoder is not valid,
2239 and can't be used for converting text.
2240*/
2241
2242/*!
2243 \fn constexpr QStringEncoder::QStringEncoder(Encoding encoding, Flags flags = Flag::Default)
2244
2245 Creates an encoder object using \a encoding and \a flags.
2246*/
2247
2248/*!
2249 \fn constexpr QStringEncoder::QStringEncoder(const char *name, Flags flags = Flag::Default)
2250
2251 Creates an encoder object using \a name and \a flags.
2252 If \a name is not the name of a known encoding an invalid converter will get created.
2253
2254 \sa isValid()
2255*/
2256
2257/*!
2258 \fn QByteArray QStringEncoder::encode(const QString &in)
2259 \fn QByteArray QStringEncoder::encode(QStringView in)
2260 \fn QByteArray QStringEncoder::operator()(const QString &in)
2261 \fn QByteArray QStringEncoder::operator()(QStringView in)
2262
2263 Converts \a in and returns the data as a byte array.
2264*/
2265
2266/*!
2267 \fn qsizetype QStringEncoder::requiredSpace(qsizetype inputLength) const
2268
    Returns the maximum number of bytes required to encode
    \a inputLength UTF-16 code units.
2271
2272 \sa appendToBuffer()
2273*/
2274
2275/*!
2276 \fn char *QStringEncoder::appendToBuffer(char *out, QStringView in)
2277
2278 Encodes \a in and writes the encoded result into the buffer
2279 starting at \a out. Returns a pointer to the end of the data written.
2280
    \note \a out must be large enough to be able to hold all the encoded data. Use
    requiredSpace() to determine the maximum size requirement to be able to encode
    \a in.
2284
2285 \sa requiredSpace()
2286*/
2287
2288/*!
2289 \class QStringDecoder
2290 \inmodule QtCore
2291 \brief The QStringDecoder class provides a state-based decoder for text.
2292 \reentrant
2293 \ingroup i18n
2294
    A text decoder converts text from an encoded text format that uses a specific
    encoding into Qt's internal representation.
2297
2298 Converting encoded data into a QString can be achieved
2299 using the following code:
2300
2301 \snippet code/src_corelib_text_qstringconverter.cpp 0
2302
2303 The decoder remembers any state that is required between calls, so converting
2304 data received in chunks, for example, when receiving it over a network, is just as
2305 easy, by calling the decoder whenever new data is available:
2306
2307 \snippet code/src_corelib_text_qstringconverter.cpp 2
2308
2309 The QStringDecoder object maintains state between chunks and therefore
2310 works correctly even if chunks are split in the middle of a multi-byte character
2311 sequence.
2312
2313 QStringDecoder objects can't be copied because of their internal state, but
2314 can be moved.
2315
2316 \sa QStringConverter, QStringEncoder
2317*/
2318
2319/*!
2320 \fn constexpr QStringDecoder::QStringDecoder(const Interface *i)
2321 \internal
2322*/
2323
2324/*!
2325 \fn constexpr QStringDecoder::QStringDecoder()
2326
    Default constructs a decoder. The default decoder is not valid,
2328 and can't be used for converting text.
2329*/
2330
2331/*!
2332 \fn constexpr QStringDecoder::QStringDecoder(Encoding encoding, Flags flags = Flag::Default)
2333
    Creates a decoder object using \a encoding and \a flags.
2335*/
2336
2337/*!
2338 \fn constexpr QStringDecoder::QStringDecoder(const char *name, Flags flags = Flag::Default)
2339
    Creates a decoder object using \a name and \a flags.
2341 If \a name is not the name of a known encoding an invalid converter will get created.
2342
2343 \sa isValid()
2344*/
2345
2346/*!
2347 \fn QString QStringDecoder::operator()(const QByteArray &ba)
2348 \fn QString QStringDecoder::decode(const QByteArray &ba)
2349 \fn QString QStringDecoder::operator()(QByteArrayView ba)
2350 \fn QString QStringDecoder::decode(QByteArrayView ba)
2351
2352 Converts \a ba and returns the data as a QString.
2353*/
2354
2355/*!
2356 \fn qsizetype QStringDecoder::requiredSpace(qsizetype inputLength) const
2357
2358 Returns the maximum amount of UTF-16 code units required to be able to process
2359 \a inputLength encoded data.
2360
2361 \sa appendToBuffer
2362*/
2363
2364/*!
2365 \fn QChar *QStringDecoder::appendToBuffer(QChar *out, QByteArrayView in)
2366
2367 Decodes the sequence of bytes viewed by \a in and writes the decoded result into
2368 the buffer starting at \a out. Returns a pointer to the end of data written.
2369
2370 \a out needs to be large enough to be able to hold all the decoded data. Use
2371 \l{requiredSpace} to determine the maximum size requirements to decode an encoded
2372 data buffer of \c in.size() bytes.
2373
2374 \sa requiredSpace
2375*/
2376
2377/*!
2378 \fn char16_t *QStringDecoder::appendToBuffer(char16_t *out, QByteArrayView in)
2379 \since 6.6
2380 \overload
2381*/
2382
2383QT_END_NAMESPACE
2384

source code of qtbase/src/corelib/text/qstringconverter.cpp