1// Copyright (C) 2020 The Qt Company Ltd.
2// Copyright (C) 2020 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4// Qt-Security score:critical reason:data-parser
5
6#ifndef QSTRINGCONVERTER_P_H
7#define QSTRINGCONVERTER_P_H
8
9//
10// W A R N I N G
11// -------------
12//
13// This file is not part of the Qt API. It exists purely as an
14// implementation detail. This header file may change from version to
15// version without notice, or even be removed.
16//
17// We mean it.
18//
19
20#include <QtCore/qstring.h>
21#include <QtCore/qendian.h>
22#include <QtCore/qstringconverter.h>
23#include <QtCore/private/qglobal_p.h>
24
25QT_BEGIN_NAMESPACE
26
27#ifndef __cpp_char8_t
28enum qchar8_t : uchar {};
29#else
30using qchar8_t = char8_t;
31#endif
32
33struct QLatin1
34{
35 // Defined in qstring.cpp
36 static char16_t *convertToUnicode(char16_t *dst, QLatin1StringView in) noexcept;
37
38 static QChar *convertToUnicode(QChar *buffer, QLatin1StringView in) noexcept
39 {
40 char16_t *dst = reinterpret_cast<char16_t *>(buffer);
41 dst = convertToUnicode(dst, in);
42 return reinterpret_cast<QChar *>(dst);
43 }
44
45 static QChar *convertToUnicode(QChar *dst, QByteArrayView in,
46 [[maybe_unused]] QStringConverter::State *state) noexcept
47 {
48 Q_ASSERT(state);
49
50 return convertToUnicode(buffer: dst, in: QLatin1StringView(in.data(), in.size()));
51 }
52
53 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept;
54
55 // Defined in qstring.cpp
56 Q_CORE_EXPORT
57 static char *convertFromUnicode(char *out, QStringView in) noexcept;
58};
59
60struct QUtf8BaseTraits
61{
62 static const bool isTrusted = false;
63 static const bool allowNonCharacters = true;
64 static const bool skipAsciiHandling = false;
65 static const int Error = -1;
66 static const int EndOfString = -2;
67
68 static void appendByte(uchar *&ptr, uchar b)
69 { *ptr++ = b; }
70
71 static void appendByte(qchar8_t *&ptr, qchar8_t b)
72 { *ptr++ = b; }
73
74 static uchar peekByte(const char *ptr, qsizetype n = 0)
75 { return ptr[n]; }
76
77 static uchar peekByte(const uchar *ptr, qsizetype n = 0)
78 { return ptr[n]; }
79
80 static uchar peekByte(const qchar8_t *ptr, qsizetype n = 0)
81 { return ptr[n]; }
82
83 static qptrdiff availableBytes(const char *ptr, const char *end)
84 { return end - ptr; }
85
86 static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
87 { return end - ptr; }
88
89 static qptrdiff availableBytes(const qchar8_t *ptr, const qchar8_t *end)
90 { return end - ptr; }
91
92 static void advanceByte(const char *&ptr, qsizetype n = 1)
93 { ptr += n; }
94
95 static void advanceByte(const uchar *&ptr, qsizetype n = 1)
96 { ptr += n; }
97
98 static void advanceByte(const qchar8_t *&ptr, qsizetype n = 1)
99 { ptr += n; }
100
101 static void appendUtf16(char16_t *&ptr, char16_t uc)
102 { *ptr++ = char16_t(uc); }
103
104 static void appendUcs4(char16_t *&ptr, char32_t uc)
105 {
106 appendUtf16(ptr, uc: QChar::highSurrogate(ucs4: uc));
107 appendUtf16(ptr, uc: QChar::lowSurrogate(ucs4: uc));
108 }
109
110 static char16_t peekUtf16(const char16_t *ptr, qsizetype n = 0) { return ptr[n]; }
111
112 static qptrdiff availableUtf16(const char16_t *ptr, const char16_t *end)
113 { return end - ptr; }
114
115 static void advanceUtf16(const char16_t *&ptr, qsizetype n = 1) { ptr += n; }
116
117 static void appendUtf16(char32_t *&ptr, char16_t uc)
118 { *ptr++ = char32_t(uc); }
119
120 static void appendUcs4(char32_t *&ptr, char32_t uc)
121 { *ptr++ = uc; }
122};
123
124struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
125{
126 static const bool skipAsciiHandling = true;
127};
128
129namespace QUtf8Functions
130{
131 /// returns 0 on success; errors can only happen if \a u is a surrogate:
132 /// Error if \a u is a low surrogate;
133 /// if \a u is a high surrogate, Error if the next isn't a low one,
134 /// EndOfString if we run into the end of the string.
135 template <typename Traits, typename OutputPtr, typename InputPtr> inline
136 int toUtf8(char16_t u, OutputPtr &dst, InputPtr &src, InputPtr end)
137 {
138 if (!Traits::skipAsciiHandling && u < 0x80) {
139 // U+0000 to U+007F (US-ASCII) - one byte
140 Traits::appendByte(dst, uchar(u));
141 return 0;
142 } else if (u < 0x0800) {
143 // U+0080 to U+07FF - two bytes
144 // first of two bytes
145 Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
146 } else {
147 if (!QChar::isSurrogate(ucs4: u)) {
148 // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
149 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: u))
150 return Traits::Error;
151
152 // first of three bytes
153 Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
154 } else {
155 // U+10000 to U+10FFFF - four bytes
156 // need to get one extra codepoint
157 if (Traits::availableUtf16(src, end) == 0)
158 return Traits::EndOfString;
159
160 char16_t low = Traits::peekUtf16(src);
161 if (!QChar::isHighSurrogate(ucs4: u))
162 return Traits::Error;
163 if (!QChar::isLowSurrogate(ucs4: low))
164 return Traits::Error;
165
166 Traits::advanceUtf16(src);
167 char32_t ucs4 = QChar::surrogateToUcs4(high: u, low);
168
169 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
170 return Traits::Error;
171
172 // first byte
173 Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
174
175 // second of four bytes
176 Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
177
178 // for the rest of the bytes
179 u = char16_t(ucs4);
180 }
181
182 // second to last byte
183 Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
184 }
185
186 // last byte
187 Traits::appendByte(dst, 0x80 | (u & 0x3f));
188 return 0;
189 }
190
191 inline bool isContinuationByte(uchar b)
192 {
193 return (b & 0xc0) == 0x80;
194 }
195
196 /// returns the number of characters consumed (including \a b) in case of success;
197 /// returns negative in case of error: Traits::Error or Traits::EndOfString
198 template <typename Traits, typename OutputPtr, typename InputPtr> inline
199 qsizetype fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
200 {
201 qsizetype charsNeeded;
202 char32_t min_uc;
203 char32_t uc;
204
205 if (!Traits::skipAsciiHandling && b < 0x80) {
206 // US-ASCII
207 Traits::appendUtf16(dst, b);
208 return 1;
209 }
210
211 if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
212 // an UTF-8 first character must be at least 0xC0
213 // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
214 return Traits::Error;
215 } else if (b < 0xe0) {
216 charsNeeded = 2;
217 min_uc = 0x80;
218 uc = b & 0x1f;
219 } else if (b < 0xf0) {
220 charsNeeded = 3;
221 min_uc = 0x800;
222 uc = b & 0x0f;
223 } else if (b < 0xf5) {
224 charsNeeded = 4;
225 min_uc = 0x10000;
226 uc = b & 0x07;
227 } else {
228 // the last Unicode character is U+10FFFF
229 // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
230 // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
231 return Traits::Error;
232 }
233
234 qptrdiff bytesAvailable = Traits::availableBytes(src, end);
235 if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
236 // it's possible that we have an error instead of just unfinished bytes
237 if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
238 return Traits::Error;
239 if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
240 return Traits::Error;
241 return Traits::EndOfString;
242 }
243
244 // first continuation character
245 b = Traits::peekByte(src, 0);
246 if (!isContinuationByte(b))
247 return Traits::Error;
248 uc <<= 6;
249 uc |= b & 0x3f;
250
251 if (charsNeeded > 2) {
252 // second continuation character
253 b = Traits::peekByte(src, 1);
254 if (!isContinuationByte(b))
255 return Traits::Error;
256 uc <<= 6;
257 uc |= b & 0x3f;
258
259 if (charsNeeded > 3) {
260 // third continuation character
261 b = Traits::peekByte(src, 2);
262 if (!isContinuationByte(b))
263 return Traits::Error;
264 uc <<= 6;
265 uc |= b & 0x3f;
266 }
267 }
268
269 // we've decoded something; safety-check it
270 if (!Traits::isTrusted) {
271 if (uc < min_uc)
272 return Traits::Error;
273 if (QChar::isSurrogate(ucs4: uc) || uc > QChar::LastValidCodePoint)
274 return Traits::Error;
275 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: uc))
276 return Traits::Error;
277 }
278
279 // write the UTF-16 sequence
280 if (!QChar::requiresSurrogates(ucs4: uc)) {
281 // UTF-8 decoded and no surrogates are required
282 // detach if necessary
283 Traits::appendUtf16(dst, char16_t(uc));
284 } else {
285 // UTF-8 decoded to something that requires a surrogate pair
286 Traits::appendUcs4(dst, uc);
287 }
288
289 Traits::advanceByte(src, charsNeeded - 1);
290 return charsNeeded;
291 }
292
293 /// wrapper around fromUtf8<Traits> to provide a simpler interface for a common case
294 template <typename Traits = QUtf8BaseTraits>
295 char32_t nextUcs4FromUtf8(const qchar8_t *&src, const qchar8_t *end,
296 char32_t errorChar = QChar::ReplacementCharacter)
297 {
298 auto ch = *src++;
299 char32_t buffer[1];
300 auto *output = buffer;
301 if (QUtf8Functions::fromUtf8<Traits>(ch, output, src, end) < 0)
302 return errorChar; // decoding error
303 Q_ASSERT(output == buffer + 1);
304 return buffer[0];
305 }
306}
307
// Byte-order selector passed to the QUtf16 and QUtf32 converters.
// DetectEndianness is the default argument of their QString/QByteArray
// overloads.
enum DataEndianness
{
    DetectEndianness = 0,
    BigEndianness = 1,
    LittleEndianness = 2,
};
314
315struct QUtf8
316{
317 static QChar *convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
318 {
319 char16_t *dst = reinterpret_cast<char16_t *>(buffer);
320 dst = QUtf8::convertToUnicode(dst, in);
321 return reinterpret_cast<QChar *>(dst);
322 }
323
324 Q_CORE_EXPORT static char16_t* convertToUnicode(char16_t *dst, QByteArrayView in) noexcept;
325 static QString convertToUnicode(QByteArrayView in);
326 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state);
327
328 static QChar *convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state)
329 {
330 char16_t *buffer = reinterpret_cast<char16_t *>(out);
331 buffer = convertToUnicode(dst: buffer, in, state);
332 return reinterpret_cast<QChar *>(buffer);
333 }
334
335 static char16_t *convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state);
336
337 static char *convertFromUnicode(char *dst, QStringView in) noexcept;
338 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in);
339 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state);
340 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state);
341 Q_CORE_EXPORT static char *convertFromLatin1(char *out, QLatin1StringView in);
342 struct ValidUtf8Result {
343 bool isValidUtf8;
344 bool isValidAscii;
345 };
346 static ValidUtf8Result isValidUtf8(QByteArrayView in);
347 static int compareUtf8(QByteArrayView utf8, QStringView utf16,
348 Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
349 static int compareUtf8(QByteArrayView utf8, QLatin1StringView s,
350 Qt::CaseSensitivity cs = Qt::CaseSensitive);
351 static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs,
352 Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
353
354private:
355 template <typename OnErrorLambda> static char *
356 convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept;
357 template <typename OnErrorLambda> static char16_t *
358 convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept;
359};
360
361struct QUtf16
362{
363 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness);
364 static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian);
365 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness);
366 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian);
367};
368
369struct QUtf32
370{
371 static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian);
372 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness);
373 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness);
374 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian);
375};
376
377struct Q_CORE_EXPORT QLocal8Bit
378{
379#if !defined(Q_OS_WIN) || defined(QT_BOOTSTRAPPED)
380 static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
381 { return QUtf8::convertToUnicode(in, state); }
382 static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
383 { return QUtf8::convertFromUnicode(in, state); }
384#else
385 static int checkUtf8();
386 static bool isUtf8()
387 {
388 Q_CONSTINIT
389 static QBasicAtomicInteger<qint8> result = { 0 };
390 int r = result.loadRelaxed();
391 if (r == 0) {
392 r = checkUtf8();
393 result.storeRelaxed(r);
394 }
395 return r > 0;
396 }
397 static QString convertToUnicode_sys(QByteArrayView, quint32, QStringConverter::State *);
398 static QString convertToUnicode_sys(QByteArrayView, QStringConverter::State *);
399 static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
400 {
401 if (isUtf8())
402 return QUtf8::convertToUnicode(in, state);
403 return convertToUnicode_sys(in, state);
404 }
405 static QByteArray convertFromUnicode_sys(QStringView, quint32, QStringConverter::State *);
406 static QByteArray convertFromUnicode_sys(QStringView, QStringConverter::State *);
407 static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
408 {
409 if (isUtf8())
410 return QUtf8::convertFromUnicode(in, state);
411 return convertFromUnicode_sys(in, state);
412 }
413#endif
414};
415
416QT_END_NAMESPACE
417
418#endif // QSTRINGCONVERTER_P_H
419

// Source: qtbase/src/corelib/text/qstringconverter_p.h