1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2016 The Qt Company Ltd. |
4 | ** Copyright (C) 2018 Intel Corporation. |
5 | ** Contact: https://www.qt.io/licensing/ |
6 | ** |
7 | ** This file is part of the QtCore module of the Qt Toolkit. |
8 | ** |
9 | ** $QT_BEGIN_LICENSE:LGPL$ |
10 | ** Commercial License Usage |
11 | ** Licensees holding valid commercial Qt licenses may use this file in |
12 | ** accordance with the commercial license agreement provided with the |
13 | ** Software or, alternatively, in accordance with the terms contained in |
14 | ** a written agreement between you and The Qt Company. For licensing terms |
15 | ** and conditions see https://www.qt.io/terms-conditions. For further |
16 | ** information use the contact form at https://www.qt.io/contact-us. |
17 | ** |
18 | ** GNU Lesser General Public License Usage |
19 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
20 | ** General Public License version 3 as published by the Free Software |
21 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
22 | ** packaging of this file. Please review the following information to |
23 | ** ensure the GNU Lesser General Public License version 3 requirements |
24 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
25 | ** |
26 | ** GNU General Public License Usage |
27 | ** Alternatively, this file may be used under the terms of the GNU |
28 | ** General Public License version 2.0 or (at your option) the GNU General |
29 | ** Public license version 3 or any later version approved by the KDE Free |
30 | ** Qt Foundation. The licenses are as published by the Free Software |
31 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
32 | ** included in the packaging of this file. Please review the following |
33 | ** information to ensure the GNU General Public License requirements will |
34 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
35 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
36 | ** |
37 | ** $QT_END_LICENSE$ |
38 | ** |
39 | ****************************************************************************/ |
40 | |
41 | #include "qutfcodec_p.h" |
42 | #include "qlist.h" |
43 | #include "qendian.h" |
44 | #include "qchar.h" |
45 | |
46 | #include "private/qsimd_p.h" |
47 | #include "private/qstringiterator_p.h" |
48 | |
49 | QT_BEGIN_NAMESPACE |
50 | |
51 | enum { Endian = 0, Data = 1 }; |
52 | |
53 | static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf }; |
54 | |
55 | #if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \ |
56 | || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64)) |
57 | static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept |
58 | { |
59 | uint result = qCountLeadingZeroBits(v); |
60 | // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31 |
61 | // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when |
62 | // counting up: msb index is 0 (because it starts there), and the lsb index is 31. |
63 | result ^= sizeof(unsigned) * 8 - 1; |
64 | return result; |
65 | } |
66 | #endif |
67 | |
68 | #if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2) |
69 | static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) |
70 | { |
71 | // do sixteen characters at a time |
72 | for ( ; end - src >= 16; src += 16, dst += 16) { |
73 | # ifdef __AVX2__ |
74 | __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); |
75 | __m128i data1 = _mm256_castsi256_si128(data); |
76 | __m128i data2 = _mm256_extracti128_si256(data, 1); |
77 | # else |
78 | __m128i data1 = _mm_loadu_si128(p: (const __m128i*)src); |
79 | __m128i data2 = _mm_loadu_si128(p: 1+(const __m128i*)src); |
80 | # endif |
81 | |
82 | // check if everything is ASCII |
83 | // the highest ASCII value is U+007F |
84 | // Do the packing directly: |
85 | // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit |
86 | // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff, |
87 | // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII, |
88 | // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as |
89 | // "non-ASCII", but it's an acceptable compromise. |
90 | __m128i packed = _mm_packus_epi16(a: data1, b: data2); |
91 | __m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128()); |
92 | |
93 | // store, even if there are non-ASCII characters here |
94 | _mm_storeu_si128(p: (__m128i*)dst, b: packed); |
95 | |
96 | // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL) |
97 | ushort n = ~_mm_movemask_epi8(a: nonAscii); |
98 | if (n) { |
99 | // find the next probable ASCII character |
100 | // we don't want to load 32 bytes again in this loop if we know there are non-ASCII |
101 | // characters still coming |
102 | nextAscii = src + qBitScanReverse(v: n) + 1; |
103 | |
104 | n = qCountTrailingZeroBits(v: n); |
105 | dst += n; |
106 | src += n; |
107 | return false; |
108 | } |
109 | } |
110 | |
111 | if (end - src >= 8) { |
112 | // do eight characters at a time |
113 | __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src)); |
114 | __m128i packed = _mm_packus_epi16(a: data, b: data); |
115 | __m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128()); |
116 | |
117 | // store even non-ASCII |
118 | _mm_storel_epi64(p: reinterpret_cast<__m128i *>(dst), a: packed); |
119 | |
120 | uchar n = ~_mm_movemask_epi8(a: nonAscii); |
121 | if (n) { |
122 | nextAscii = src + qBitScanReverse(v: n) + 1; |
123 | n = qCountTrailingZeroBits(v: n); |
124 | dst += n; |
125 | src += n; |
126 | return false; |
127 | } |
128 | } |
129 | |
130 | return src == end; |
131 | } |
132 | |
133 | static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end) |
134 | { |
135 | // do sixteen characters at a time |
136 | for ( ; end - src >= 16; src += 16, dst += 16) { |
137 | __m128i data = _mm_loadu_si128(p: (const __m128i*)src); |
138 | |
139 | #ifdef __AVX2__ |
140 | const int BitSpacing = 2; |
141 | // load and zero extend to an YMM register |
142 | const __m256i extended = _mm256_cvtepu8_epi16(data); |
143 | |
144 | uint n = _mm256_movemask_epi8(extended); |
145 | if (!n) { |
146 | // store |
147 | _mm256_storeu_si256((__m256i*)dst, extended); |
148 | continue; |
149 | } |
150 | #else |
151 | const int BitSpacing = 1; |
152 | |
153 | // check if everything is ASCII |
154 | // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII |
155 | uint n = _mm_movemask_epi8(a: data); |
156 | if (!n) { |
157 | // unpack |
158 | _mm_storeu_si128(p: (__m128i*)dst, b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128())); |
159 | _mm_storeu_si128(p: 1+(__m128i*)dst, b: _mm_unpackhi_epi8(a: data, b: _mm_setzero_si128())); |
160 | continue; |
161 | } |
162 | #endif |
163 | |
164 | // copy the front part that is still ASCII |
165 | while (!(n & 1)) { |
166 | *dst++ = *src++; |
167 | n >>= BitSpacing; |
168 | } |
169 | |
170 | // find the next probable ASCII character |
171 | // we don't want to load 16 bytes again in this loop if we know there are non-ASCII |
172 | // characters still coming |
173 | n = qBitScanReverse(v: n); |
174 | nextAscii = src + (n / BitSpacing) + 1; |
175 | return false; |
176 | |
177 | } |
178 | |
179 | if (end - src >= 8) { |
180 | __m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src)); |
181 | uint n = _mm_movemask_epi8(a: data) & 0xff; |
182 | if (!n) { |
183 | // unpack and store |
184 | _mm_storeu_si128(p: reinterpret_cast<__m128i *>(dst), b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128())); |
185 | } else { |
186 | while (!(n & 1)) { |
187 | *dst++ = *src++; |
188 | n >>= 1; |
189 | } |
190 | |
191 | n = qBitScanReverse(v: n); |
192 | nextAscii = src + n + 1; |
193 | return false; |
194 | } |
195 | } |
196 | |
197 | return src == end; |
198 | } |
199 | |
200 | static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) |
201 | { |
202 | #ifdef __AVX2__ |
203 | // do 32 characters at a time |
204 | // (this is similar to simdTestMask in qstring.cpp) |
205 | const __m256i mask = _mm256_set1_epi8(0x80); |
206 | for ( ; end - src >= 32; src += 32) { |
207 | __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); |
208 | if (_mm256_testz_si256(mask, data)) |
209 | continue; |
210 | |
211 | uint n = _mm256_movemask_epi8(data); |
212 | Q_ASSUME(n); |
213 | |
214 | // find the next probable ASCII character |
215 | // we don't want to load 32 bytes again in this loop if we know there are non-ASCII |
216 | // characters still coming |
217 | nextAscii = src + qBitScanReverse(n) + 1; |
218 | |
219 | // return the non-ASCII character |
220 | return src + qCountTrailingZeroBits(n); |
221 | } |
222 | #endif |
223 | |
224 | // do sixteen characters at a time |
225 | for ( ; end - src >= 16; src += 16) { |
226 | __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i*>(src)); |
227 | |
228 | // check if everything is ASCII |
229 | // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII |
230 | uint n = _mm_movemask_epi8(a: data); |
231 | if (!n) |
232 | continue; |
233 | |
234 | // find the next probable ASCII character |
235 | // we don't want to load 16 bytes again in this loop if we know there are non-ASCII |
236 | // characters still coming |
237 | nextAscii = src + qBitScanReverse(v: n) + 1; |
238 | |
239 | // return the non-ASCII character |
240 | return src + qCountTrailingZeroBits(v: n); |
241 | } |
242 | |
243 | // do four characters at a time |
244 | for ( ; end - src >= 4; src += 4) { |
245 | quint32 data = qFromUnaligned<quint32>(src); |
246 | data &= 0x80808080U; |
247 | if (!data) |
248 | continue; |
249 | |
250 | // We don't try to guess which of the three bytes is ASCII and which |
251 | // one isn't. The chance that at least two of them are non-ASCII is |
252 | // better than 75%. |
253 | nextAscii = src; |
254 | return src; |
255 | } |
256 | nextAscii = end; |
257 | return src; |
258 | } |
259 | #elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64 |
260 | static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) |
261 | { |
262 | uint16x8_t maxAscii = vdupq_n_u16(0x7f); |
263 | uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 }; |
264 | uint16x8_t mask2 = vshlq_n_u16(mask1, 1); |
265 | |
266 | // do sixteen characters at a time |
267 | for ( ; end - src >= 16; src += 16, dst += 16) { |
268 | // load 2 lanes (or: "load interleaved") |
269 | uint16x8x2_t in = vld2q_u16(src); |
270 | |
271 | // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc), |
272 | // add those together into a scalar, and merge the scalars. |
273 | uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1)) |
274 | | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2)); |
275 | |
276 | // merge the two lanes by shifting the values of the second by 8 and inserting them |
277 | uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8); |
278 | |
279 | // store, even if there are non-ASCII characters here |
280 | vst1q_u8(dst, vreinterpretq_u8_u16(out)); |
281 | |
282 | if (nonAscii) { |
283 | // find the next probable ASCII character |
284 | // we don't want to load 32 bytes again in this loop if we know there are non-ASCII |
285 | // characters still coming |
286 | nextAscii = src + qBitScanReverse(nonAscii) + 1; |
287 | |
288 | nonAscii = qCountTrailingZeroBits(nonAscii); |
289 | dst += nonAscii; |
290 | src += nonAscii; |
291 | return false; |
292 | } |
293 | } |
294 | return src == end; |
295 | } |
296 | |
297 | static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end) |
298 | { |
299 | // do eight characters at a time |
300 | uint8x8_t msb_mask = vdup_n_u8(0x80); |
301 | uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; |
302 | for ( ; end - src >= 8; src += 8, dst += 8) { |
303 | uint8x8_t c = vld1_u8(src); |
304 | uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); |
305 | if (!n) { |
306 | // store |
307 | vst1q_u16(dst, vmovl_u8(c)); |
308 | continue; |
309 | } |
310 | |
311 | // copy the front part that is still ASCII |
312 | while (!(n & 1)) { |
313 | *dst++ = *src++; |
314 | n >>= 1; |
315 | } |
316 | |
317 | // find the next probable ASCII character |
318 | // we don't want to load 16 bytes again in this loop if we know there are non-ASCII |
319 | // characters still coming |
320 | n = qBitScanReverse(n); |
321 | nextAscii = src + n + 1; |
322 | return false; |
323 | |
324 | } |
325 | return src == end; |
326 | } |
327 | |
328 | static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) |
329 | { |
330 | // The SIMD code below is untested, so just force an early return until |
331 | // we've had the time to verify it works. |
332 | nextAscii = end; |
333 | return src; |
334 | |
335 | // do eight characters at a time |
336 | uint8x8_t msb_mask = vdup_n_u8(0x80); |
337 | uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; |
338 | for ( ; end - src >= 8; src += 8) { |
339 | uint8x8_t c = vld1_u8(src); |
340 | uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); |
341 | if (!n) |
342 | continue; |
343 | |
344 | // find the next probable ASCII character |
345 | // we don't want to load 16 bytes again in this loop if we know there are non-ASCII |
346 | // characters still coming |
347 | nextAscii = src + qBitScanReverse(n) + 1; |
348 | |
349 | // return the non-ASCII character |
350 | return src + qCountTrailingZeroBits(n); |
351 | } |
352 | nextAscii = end; |
353 | return src; |
354 | } |
355 | #else |
356 | static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *) |
357 | { |
358 | return false; |
359 | } |
360 | |
361 | static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *) |
362 | { |
363 | return false; |
364 | } |
365 | |
366 | static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) |
367 | { |
368 | nextAscii = end; |
369 | return src; |
370 | } |
371 | #endif |
372 | |
373 | QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len) |
374 | { |
375 | // create a QByteArray with the worst case scenario size |
376 | QByteArray result(len * 3, Qt::Uninitialized); |
377 | uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData())); |
378 | const ushort *src = reinterpret_cast<const ushort *>(uc); |
379 | const ushort *const end = src + len; |
380 | |
381 | while (src != end) { |
382 | const ushort *nextAscii = end; |
383 | if (simdEncodeAscii(dst, nextAscii, src, end)) |
384 | break; |
385 | |
386 | do { |
387 | ushort uc = *src++; |
388 | int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst, src, end); |
389 | if (res < 0) { |
390 | // encoding error - append '?' |
391 | *dst++ = '?'; |
392 | } |
393 | } while (src < nextAscii); |
394 | } |
395 | |
396 | result.truncate(pos: dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData()))); |
397 | return result; |
398 | } |
399 | |
400 | QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state) |
401 | { |
402 | uchar replacement = '?'; |
403 | int rlen = 3*len; |
404 | int surrogate_high = -1; |
405 | if (state) { |
406 | if (state->flags & QTextCodec::ConvertInvalidToNull) |
407 | replacement = 0; |
408 | if (!(state->flags & QTextCodec::IgnoreHeader)) |
409 | rlen += 3; |
410 | if (state->remainingChars) |
411 | surrogate_high = state->state_data[0]; |
412 | } |
413 | |
414 | |
415 | QByteArray rstr(rlen, Qt::Uninitialized); |
416 | uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData())); |
417 | const ushort *src = reinterpret_cast<const ushort *>(uc); |
418 | const ushort *const end = src + len; |
419 | |
420 | int invalid = 0; |
421 | if (state && !(state->flags & QTextCodec::IgnoreHeader)) { |
422 | // append UTF-8 BOM |
423 | *cursor++ = utf8bom[0]; |
424 | *cursor++ = utf8bom[1]; |
425 | *cursor++ = utf8bom[2]; |
426 | } |
427 | |
428 | const ushort *nextAscii = src; |
429 | while (src != end) { |
430 | int res; |
431 | ushort uc; |
432 | if (surrogate_high != -1) { |
433 | uc = surrogate_high; |
434 | surrogate_high = -1; |
435 | res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst&: cursor, src, end); |
436 | } else { |
437 | if (src >= nextAscii && simdEncodeAscii(dst&: cursor, nextAscii, src, end)) |
438 | break; |
439 | |
440 | uc = *src++; |
441 | res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst&: cursor, src, end); |
442 | } |
443 | if (Q_LIKELY(res >= 0)) |
444 | continue; |
445 | |
446 | if (res == QUtf8BaseTraits::Error) { |
447 | // encoding error |
448 | ++invalid; |
449 | *cursor++ = replacement; |
450 | } else if (res == QUtf8BaseTraits::EndOfString) { |
451 | surrogate_high = uc; |
452 | break; |
453 | } |
454 | } |
455 | |
456 | rstr.resize(size: cursor - (const uchar*)rstr.constData()); |
457 | if (state) { |
458 | state->invalidChars += invalid; |
459 | state->flags |= QTextCodec::IgnoreHeader; |
460 | state->remainingChars = 0; |
461 | if (surrogate_high >= 0) { |
462 | state->remainingChars = 1; |
463 | state->state_data[0] = surrogate_high; |
464 | } |
465 | } |
466 | return rstr; |
467 | } |
468 | |
469 | QString QUtf8::convertToUnicode(const char *chars, int len) |
470 | { |
471 | // UTF-8 to UTF-16 always needs the exact same number of words or less: |
472 | // UTF-8 UTF-16 |
473 | // 1 byte 1 word |
474 | // 2 bytes 1 word |
475 | // 3 bytes 1 word |
476 | // 4 bytes 2 words (one surrogate pair) |
477 | // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8), |
478 | // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or |
479 | // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK). |
480 | // |
481 | // The table holds for invalid sequences too: we'll insert one replacement char |
482 | // per invalid byte. |
483 | QString result(len, Qt::Uninitialized); |
484 | QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared |
485 | const QChar *end = convertToUnicode(data, chars, len); |
486 | result.truncate(pos: end - data); |
487 | return result; |
488 | } |
489 | |
490 | /*! |
491 | \since 5.7 |
492 | \overload |
493 | |
494 | Converts the UTF-8 sequence of \a len octets beginning at \a chars to |
495 | a sequence of QChar starting at \a buffer. The buffer is expected to be |
496 | large enough to hold the result. An upper bound for the size of the |
497 | buffer is \a len QChars. |
498 | |
499 | If, during decoding, an error occurs, a QChar::ReplacementCharacter is |
500 | written. |
501 | |
502 | Returns a pointer to one past the last QChar written. |
503 | |
504 | This function never throws. |
505 | */ |
506 | |
507 | QChar *QUtf8::convertToUnicode(QChar *buffer, const char *chars, int len) noexcept |
508 | { |
509 | ushort *dst = reinterpret_cast<ushort *>(buffer); |
510 | const uchar *src = reinterpret_cast<const uchar *>(chars); |
511 | const uchar *end = src + len; |
512 | |
513 | // attempt to do a full decoding in SIMD |
514 | const uchar *nextAscii = end; |
515 | if (!simdDecodeAscii(dst, nextAscii, src, end)) { |
516 | // at least one non-ASCII entry |
517 | // check if we failed to decode the UTF-8 BOM; if so, skip it |
518 | if (Q_UNLIKELY(src == reinterpret_cast<const uchar *>(chars)) |
519 | && end - src >= 3 |
520 | && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) { |
521 | src += 3; |
522 | } |
523 | |
524 | while (src < end) { |
525 | nextAscii = end; |
526 | if (simdDecodeAscii(dst, nextAscii, src, end)) |
527 | break; |
528 | |
529 | do { |
530 | uchar b = *src++; |
531 | int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end); |
532 | if (res < 0) { |
533 | // decoding error |
534 | *dst++ = QChar::ReplacementCharacter; |
535 | } |
536 | } while (src < nextAscii); |
537 | } |
538 | } |
539 | |
540 | return reinterpret_cast<QChar *>(dst); |
541 | } |
542 | |
543 | QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state) |
544 | { |
545 | bool = false; |
546 | ushort replacement = QChar::ReplacementCharacter; |
547 | int invalid = 0; |
548 | int res; |
549 | uchar ch = 0; |
550 | |
551 | // See above for buffer requirements for stateless decoding. However, that |
552 | // fails if the state is not empty. The following situations can add to the |
553 | // requirements: |
554 | // state contains chars starts with requirement |
555 | // 1 of 2 bytes valid continuation 0 |
556 | // 2 of 3 bytes same 0 |
557 | // 3 bytes of 4 same +1 (need to insert surrogate pair) |
558 | // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart) |
559 | // 2 of 3 bytes same +1 (same) |
560 | // 3 of 4 bytes same +1 (same) |
561 | QString result(len + 1, Qt::Uninitialized); |
562 | |
563 | ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); |
564 | const uchar *src = reinterpret_cast<const uchar *>(chars); |
565 | const uchar *end = src + len; |
566 | |
567 | if (state) { |
568 | if (state->flags & QTextCodec::IgnoreHeader) |
569 | headerdone = true; |
570 | if (state->flags & QTextCodec::ConvertInvalidToNull) |
571 | replacement = QChar::Null; |
572 | if (state->remainingChars) { |
573 | // handle incoming state first |
574 | uchar remainingCharsData[4]; // longest UTF-8 sequence possible |
575 | int remainingCharsCount = state->remainingChars; |
576 | int newCharsToCopy = qMin<int>(a: sizeof(remainingCharsData) - remainingCharsCount, b: end - src); |
577 | |
578 | memset(s: remainingCharsData, c: 0, n: sizeof(remainingCharsData)); |
579 | memcpy(dest: remainingCharsData, src: &state->state_data[0], n: remainingCharsCount); |
580 | memcpy(dest: remainingCharsData + remainingCharsCount, src: src, n: newCharsToCopy); |
581 | |
582 | const uchar *begin = &remainingCharsData[1]; |
583 | res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: remainingCharsData[0], dst, src&: begin, |
584 | end: static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy); |
585 | if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) { |
586 | // special case for len == 0: |
587 | // if we were supplied an empty string, terminate the previous, unfinished sequence with error |
588 | ++invalid; |
589 | *dst++ = replacement; |
590 | } else if (res == QUtf8BaseTraits::EndOfString) { |
591 | // if we got EndOfString again, then there were too few bytes in src; |
592 | // copy to our state and return |
593 | state->remainingChars = remainingCharsCount + newCharsToCopy; |
594 | memcpy(dest: &state->state_data[0], src: remainingCharsData, n: state->remainingChars); |
595 | return QString(); |
596 | } else if (!headerdone && res >= 0) { |
597 | // eat the UTF-8 BOM |
598 | headerdone = true; |
599 | if (dst[-1] == 0xfeff) |
600 | --dst; |
601 | } |
602 | |
603 | // adjust src now that we have maybe consumed a few chars |
604 | if (res >= 0) { |
605 | Q_ASSERT(res > remainingCharsCount); |
606 | src += res - remainingCharsCount; |
607 | } |
608 | } |
609 | } |
610 | |
611 | // main body, stateless decoding |
612 | res = 0; |
613 | const uchar *nextAscii = src; |
614 | const uchar *start = src; |
615 | while (res >= 0 && src < end) { |
616 | if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end)) |
617 | break; |
618 | |
619 | ch = *src++; |
620 | res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: ch, dst, src, end); |
621 | if (!headerdone && res >= 0) { |
622 | headerdone = true; |
623 | if (src == start + 3) { // 3 == sizeof(utf8-bom) |
624 | // eat the UTF-8 BOM (it can only appear at the beginning of the string). |
625 | if (dst[-1] == 0xfeff) |
626 | --dst; |
627 | } |
628 | } |
629 | if (res == QUtf8BaseTraits::Error) { |
630 | res = 0; |
631 | ++invalid; |
632 | *dst++ = replacement; |
633 | } |
634 | } |
635 | |
636 | if (!state && res == QUtf8BaseTraits::EndOfString) { |
637 | // unterminated UTF sequence |
638 | *dst++ = QChar::ReplacementCharacter; |
639 | while (src++ < end) |
640 | *dst++ = QChar::ReplacementCharacter; |
641 | } |
642 | |
643 | result.truncate(pos: dst - (const ushort *)result.unicode()); |
644 | if (state) { |
645 | state->invalidChars += invalid; |
646 | if (headerdone) |
647 | state->flags |= QTextCodec::IgnoreHeader; |
648 | if (res == QUtf8BaseTraits::EndOfString) { |
649 | --src; // unread the byte in ch |
650 | state->remainingChars = end - src; |
651 | memcpy(dest: &state->state_data[0], src: src, n: end - src); |
652 | } else { |
653 | state->remainingChars = 0; |
654 | } |
655 | } |
656 | return result; |
657 | } |
658 | |
659 | struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii |
660 | { |
661 | struct NoOutput {}; |
662 | static void appendUtf16(const NoOutput &, ushort) {} |
663 | static void appendUcs4(const NoOutput &, uint) {} |
664 | }; |
665 | |
666 | QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len) |
667 | { |
668 | const uchar *src = reinterpret_cast<const uchar *>(chars); |
669 | const uchar *end = src + len; |
670 | const uchar *nextAscii = src; |
671 | bool isValidAscii = true; |
672 | |
673 | while (src < end) { |
674 | if (src >= nextAscii) |
675 | src = simdFindNonAscii(src, end, nextAscii); |
676 | if (src == end) |
677 | break; |
678 | |
679 | do { |
680 | uchar b = *src++; |
681 | if ((b & 0x80) == 0) |
682 | continue; |
683 | |
684 | isValidAscii = false; |
685 | QUtf8NoOutputTraits::NoOutput output; |
686 | int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, dst&: output, src, end); |
687 | if (res < 0) { |
688 | // decoding error |
689 | return { .isValidUtf8: false, .isValidAscii: false }; |
690 | } |
691 | } while (src < nextAscii); |
692 | } |
693 | |
694 | return { .isValidUtf8: true, .isValidAscii: isValidAscii }; |
695 | } |
696 | |
697 | int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, int u16len) |
698 | { |
699 | uint uc1, uc2; |
700 | auto src1 = reinterpret_cast<const uchar *>(utf8); |
701 | auto end1 = src1 + u8len; |
702 | QStringIterator src2(utf16, utf16 + u16len); |
703 | |
704 | while (src1 < end1 && src2.hasNext()) { |
705 | uchar b = *src1++; |
706 | uint *output = &uc1; |
707 | int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1); |
708 | if (res < 0) { |
709 | // decoding error |
710 | uc1 = QChar::ReplacementCharacter; |
711 | } |
712 | |
713 | uc2 = src2.next(); |
714 | if (uc1 != uc2) |
715 | return int(uc1) - int(uc2); |
716 | } |
717 | |
718 | // the shorter string sorts first |
719 | return (end1 > src1) - int(src2.hasNext()); |
720 | } |
721 | |
722 | int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s) |
723 | { |
724 | uint uc1; |
725 | auto src1 = reinterpret_cast<const uchar *>(utf8); |
726 | auto end1 = src1 + u8len; |
727 | auto src2 = reinterpret_cast<const uchar *>(s.latin1()); |
728 | auto end2 = src2 + s.size(); |
729 | |
730 | while (src1 < end1 && src2 < end2) { |
731 | uchar b = *src1++; |
732 | uint *output = &uc1; |
733 | int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1); |
734 | if (res < 0) { |
735 | // decoding error |
736 | uc1 = QChar::ReplacementCharacter; |
737 | } |
738 | |
739 | uint uc2 = *src2++; |
740 | if (uc1 != uc2) |
741 | return int(uc1) - int(uc2); |
742 | } |
743 | |
744 | // the shorter string sorts first |
745 | return (end1 > src1) - (end2 > src2); |
746 | } |
747 | |
748 | QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e) |
749 | { |
750 | DataEndianness endian = e; |
751 | int length = 2*len; |
752 | if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) { |
753 | length += 2; |
754 | } |
755 | if (e == DetectEndianness) { |
756 | endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; |
757 | } |
758 | |
759 | QByteArray d; |
760 | d.resize(size: length); |
761 | char *data = d.data(); |
762 | if (!state || !(state->flags & QTextCodec::IgnoreHeader)) { |
763 | QChar bom(QChar::ByteOrderMark); |
764 | if (endian == BigEndianness) |
765 | qToBigEndian(src: bom.unicode(), dest: data); |
766 | else |
767 | qToLittleEndian(src: bom.unicode(), dest: data); |
768 | data += 2; |
769 | } |
770 | if (endian == BigEndianness) |
771 | qToBigEndian<ushort>(source: uc, count: len, dest: data); |
772 | else |
773 | qToLittleEndian<ushort>(source: uc, count: len, dest: data); |
774 | |
775 | if (state) { |
776 | state->remainingChars = 0; |
777 | state->flags |= QTextCodec::IgnoreHeader; |
778 | } |
779 | return d; |
780 | } |
781 | |
782 | QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e) |
783 | { |
784 | DataEndianness endian = e; |
785 | bool half = false; |
786 | uchar buf = 0; |
787 | bool = false; |
788 | if (state) { |
789 | headerdone = state->flags & QTextCodec::IgnoreHeader; |
790 | if (endian == DetectEndianness) |
791 | endian = (DataEndianness)state->state_data[Endian]; |
792 | if (state->remainingChars) { |
793 | half = true; |
794 | buf = state->state_data[Data]; |
795 | } |
796 | } |
797 | if (headerdone && endian == DetectEndianness) |
798 | endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; |
799 | |
800 | QString result(len, Qt::Uninitialized); // worst case |
801 | QChar *qch = (QChar *)result.data(); |
802 | while (len--) { |
803 | if (half) { |
804 | QChar ch; |
805 | if (endian == LittleEndianness) { |
806 | ch.setRow(*chars++); |
807 | ch.setCell(buf); |
808 | } else { |
809 | ch.setRow(buf); |
810 | ch.setCell(*chars++); |
811 | } |
812 | if (!headerdone) { |
813 | headerdone = true; |
814 | if (endian == DetectEndianness) { |
815 | if (ch == QChar::ByteOrderSwapped) { |
816 | endian = LittleEndianness; |
817 | } else if (ch == QChar::ByteOrderMark) { |
818 | endian = BigEndianness; |
819 | } else { |
820 | if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { |
821 | endian = BigEndianness; |
822 | } else { |
823 | endian = LittleEndianness; |
824 | ch = QChar((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8)); |
825 | } |
826 | *qch++ = ch; |
827 | } |
828 | } else if (ch != QChar::ByteOrderMark) { |
829 | *qch++ = ch; |
830 | } |
831 | } else { |
832 | *qch++ = ch; |
833 | } |
834 | half = false; |
835 | } else { |
836 | buf = *chars++; |
837 | half = true; |
838 | } |
839 | } |
840 | result.truncate(pos: qch - result.unicode()); |
841 | |
842 | if (state) { |
843 | if (headerdone) |
844 | state->flags |= QTextCodec::IgnoreHeader; |
845 | state->state_data[Endian] = endian; |
846 | if (half) { |
847 | state->remainingChars = 1; |
848 | state->state_data[Data] = buf; |
849 | } else { |
850 | state->remainingChars = 0; |
851 | state->state_data[Data] = 0; |
852 | } |
853 | } |
854 | return result; |
855 | } |
856 | |
857 | QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e) |
858 | { |
859 | DataEndianness endian = e; |
860 | int length = 4*len; |
861 | if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) { |
862 | length += 4; |
863 | } |
864 | if (e == DetectEndianness) { |
865 | endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; |
866 | } |
867 | |
868 | QByteArray d(length, Qt::Uninitialized); |
869 | char *data = d.data(); |
870 | if (!state || !(state->flags & QTextCodec::IgnoreHeader)) { |
871 | if (endian == BigEndianness) { |
872 | data[0] = 0; |
873 | data[1] = 0; |
874 | data[2] = (char)0xfe; |
875 | data[3] = (char)0xff; |
876 | } else { |
877 | data[0] = (char)0xff; |
878 | data[1] = (char)0xfe; |
879 | data[2] = 0; |
880 | data[3] = 0; |
881 | } |
882 | data += 4; |
883 | } |
884 | |
885 | QStringIterator i(uc, uc + len); |
886 | if (endian == BigEndianness) { |
887 | while (i.hasNext()) { |
888 | uint cp = i.next(); |
889 | qToBigEndian(src: cp, dest: data); |
890 | data += 4; |
891 | } |
892 | } else { |
893 | while (i.hasNext()) { |
894 | uint cp = i.next(); |
895 | qToLittleEndian(src: cp, dest: data); |
896 | data += 4; |
897 | } |
898 | } |
899 | |
900 | if (state) { |
901 | state->remainingChars = 0; |
902 | state->flags |= QTextCodec::IgnoreHeader; |
903 | } |
904 | return d; |
905 | } |
906 | |
907 | QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e) |
908 | { |
909 | DataEndianness endian = e; |
910 | uchar tuple[4]; |
911 | int num = 0; |
912 | bool = false; |
913 | if (state) { |
914 | headerdone = state->flags & QTextCodec::IgnoreHeader; |
915 | if (endian == DetectEndianness) { |
916 | endian = (DataEndianness)state->state_data[Endian]; |
917 | } |
918 | num = state->remainingChars; |
919 | memcpy(dest: tuple, src: &state->state_data[Data], n: 4); |
920 | } |
921 | if (headerdone && endian == DetectEndianness) |
922 | endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; |
923 | |
924 | QString result; |
925 | result.resize(size: (num + len) >> 2 << 1); // worst case |
926 | QChar *qch = (QChar *)result.data(); |
927 | |
928 | const char *end = chars + len; |
929 | while (chars < end) { |
930 | tuple[num++] = *chars++; |
931 | if (num == 4) { |
932 | if (!headerdone) { |
933 | headerdone = true; |
934 | if (endian == DetectEndianness) { |
935 | if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) { |
936 | endian = LittleEndianness; |
937 | num = 0; |
938 | continue; |
939 | } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) { |
940 | endian = BigEndianness; |
941 | num = 0; |
942 | continue; |
943 | } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { |
944 | endian = BigEndianness; |
945 | } else { |
946 | endian = LittleEndianness; |
947 | } |
948 | } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(src: tuple) : qFromLittleEndian<quint32>(src: tuple)) == QChar::ByteOrderMark) { |
949 | num = 0; |
950 | continue; |
951 | } |
952 | } |
953 | uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(src: tuple) : qFromLittleEndian<quint32>(src: tuple); |
954 | if (QChar::requiresSurrogates(ucs4: code)) { |
955 | *qch++ = QChar(QChar::highSurrogate(ucs4: code)); |
956 | *qch++ = QChar(QChar::lowSurrogate(ucs4: code)); |
957 | } else { |
958 | *qch++ = QChar(code); |
959 | } |
960 | num = 0; |
961 | } |
962 | } |
963 | result.truncate(pos: qch - result.unicode()); |
964 | |
965 | if (state) { |
966 | if (headerdone) |
967 | state->flags |= QTextCodec::IgnoreHeader; |
968 | state->state_data[Endian] = endian; |
969 | state->remainingChars = num; |
970 | memcpy(dest: &state->state_data[Data], src: tuple, n: 4); |
971 | } |
972 | return result; |
973 | } |
974 | |
975 | |
976 | #if QT_CONFIG(textcodec) |
977 | |
978 | QUtf8Codec::~QUtf8Codec() |
979 | { |
980 | } |
981 | |
982 | QByteArray QUtf8Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const |
983 | { |
984 | return QUtf8::convertFromUnicode(uc, len, state); |
985 | } |
986 | |
987 | void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, ConverterState *state) const |
988 | { |
989 | *target += QUtf8::convertToUnicode(chars, len, state); |
990 | } |
991 | |
992 | QString QUtf8Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const |
993 | { |
994 | return QUtf8::convertToUnicode(chars, len, state); |
995 | } |
996 | |
997 | QByteArray QUtf8Codec::name() const |
998 | { |
999 | return "UTF-8" ; |
1000 | } |
1001 | |
1002 | int QUtf8Codec::mibEnum() const |
1003 | { |
1004 | return 106; |
1005 | } |
1006 | |
1007 | QUtf16Codec::~QUtf16Codec() |
1008 | { |
1009 | } |
1010 | |
1011 | QByteArray QUtf16Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const |
1012 | { |
1013 | return QUtf16::convertFromUnicode(uc, len, state, e); |
1014 | } |
1015 | |
1016 | QString QUtf16Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const |
1017 | { |
1018 | return QUtf16::convertToUnicode(chars, len, state, e); |
1019 | } |
1020 | |
1021 | int QUtf16Codec::mibEnum() const |
1022 | { |
1023 | return 1015; |
1024 | } |
1025 | |
1026 | QByteArray QUtf16Codec::name() const |
1027 | { |
1028 | return "UTF-16" ; |
1029 | } |
1030 | |
1031 | QList<QByteArray> QUtf16Codec::aliases() const |
1032 | { |
1033 | return QList<QByteArray>(); |
1034 | } |
1035 | |
1036 | int QUtf16BECodec::mibEnum() const |
1037 | { |
1038 | return 1013; |
1039 | } |
1040 | |
1041 | QByteArray QUtf16BECodec::name() const |
1042 | { |
1043 | return "UTF-16BE" ; |
1044 | } |
1045 | |
1046 | QList<QByteArray> QUtf16BECodec::aliases() const |
1047 | { |
1048 | QList<QByteArray> list; |
1049 | return list; |
1050 | } |
1051 | |
1052 | int QUtf16LECodec::mibEnum() const |
1053 | { |
1054 | return 1014; |
1055 | } |
1056 | |
1057 | QByteArray QUtf16LECodec::name() const |
1058 | { |
1059 | return "UTF-16LE" ; |
1060 | } |
1061 | |
1062 | QList<QByteArray> QUtf16LECodec::aliases() const |
1063 | { |
1064 | QList<QByteArray> list; |
1065 | return list; |
1066 | } |
1067 | |
1068 | QUtf32Codec::~QUtf32Codec() |
1069 | { |
1070 | } |
1071 | |
1072 | QByteArray QUtf32Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const |
1073 | { |
1074 | return QUtf32::convertFromUnicode(uc, len, state, e); |
1075 | } |
1076 | |
1077 | QString QUtf32Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const |
1078 | { |
1079 | return QUtf32::convertToUnicode(chars, len, state, e); |
1080 | } |
1081 | |
1082 | int QUtf32Codec::mibEnum() const |
1083 | { |
1084 | return 1017; |
1085 | } |
1086 | |
1087 | QByteArray QUtf32Codec::name() const |
1088 | { |
1089 | return "UTF-32" ; |
1090 | } |
1091 | |
1092 | QList<QByteArray> QUtf32Codec::aliases() const |
1093 | { |
1094 | QList<QByteArray> list; |
1095 | return list; |
1096 | } |
1097 | |
1098 | int QUtf32BECodec::mibEnum() const |
1099 | { |
1100 | return 1018; |
1101 | } |
1102 | |
1103 | QByteArray QUtf32BECodec::name() const |
1104 | { |
1105 | return "UTF-32BE" ; |
1106 | } |
1107 | |
1108 | QList<QByteArray> QUtf32BECodec::aliases() const |
1109 | { |
1110 | QList<QByteArray> list; |
1111 | return list; |
1112 | } |
1113 | |
1114 | int QUtf32LECodec::mibEnum() const |
1115 | { |
1116 | return 1019; |
1117 | } |
1118 | |
1119 | QByteArray QUtf32LECodec::name() const |
1120 | { |
1121 | return "UTF-32LE" ; |
1122 | } |
1123 | |
1124 | QList<QByteArray> QUtf32LECodec::aliases() const |
1125 | { |
1126 | QList<QByteArray> list; |
1127 | return list; |
1128 | } |
1129 | |
1130 | #endif // textcodec |
1131 | |
1132 | QT_END_NAMESPACE |
1133 | |