1// Copyright (C) 2020 The Qt Company Ltd.
2// Copyright (C) 2020 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#ifndef QSTRINGCONVERTER_P_H
6#define QSTRINGCONVERTER_P_H
7
8//
9// W A R N I N G
10// -------------
11//
12// This file is not part of the Qt API. It exists purely as an
13// implementation detail. This header file may change from version to
14// version without notice, or even be removed.
15//
16// We mean it.
17//
18
19#include <QtCore/qstring.h>
20#include <QtCore/qendian.h>
21#include <QtCore/qstringconverter.h>
22#include <QtCore/private/qglobal_p.h>
23
24QT_BEGIN_NAMESPACE
25
26#ifndef __cpp_char8_t
27enum qchar8_t : uchar {};
28#else
29using qchar8_t = char8_t;
30#endif
31
32struct QLatin1
33{
34 // Defined in qstring.cpp
35 static char16_t *convertToUnicode(char16_t *dst, QLatin1StringView in) noexcept;
36
37 static QChar *convertToUnicode(QChar *buffer, QLatin1StringView in) noexcept
38 {
39 char16_t *dst = reinterpret_cast<char16_t *>(buffer);
40 dst = convertToUnicode(dst, in);
41 return reinterpret_cast<QChar *>(dst);
42 }
43
44 static QChar *convertToUnicode(QChar *dst, QByteArrayView in,
45 [[maybe_unused]] QStringConverterBase::State *state) noexcept
46 {
47 Q_ASSERT(state);
48
49 return convertToUnicode(buffer: dst, in: QLatin1StringView(in.data(), in.size()));
50 }
51
52 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept;
53
54 // Defined in qstring.cpp
55 Q_CORE_EXPORT
56 static char *convertFromUnicode(char *out, QStringView in) noexcept;
57};
58
59struct QUtf8BaseTraits
60{
61 static const bool isTrusted = false;
62 static const bool allowNonCharacters = true;
63 static const bool skipAsciiHandling = false;
64 static const int Error = -1;
65 static const int EndOfString = -2;
66
67 static void appendByte(uchar *&ptr, uchar b)
68 { *ptr++ = b; }
69
70 static void appendByte(qchar8_t *&ptr, qchar8_t b)
71 { *ptr++ = b; }
72
73 static uchar peekByte(const char *ptr, qsizetype n = 0)
74 { return ptr[n]; }
75
76 static uchar peekByte(const uchar *ptr, qsizetype n = 0)
77 { return ptr[n]; }
78
79 static uchar peekByte(const qchar8_t *ptr, qsizetype n = 0)
80 { return ptr[n]; }
81
82 static qptrdiff availableBytes(const char *ptr, const char *end)
83 { return end - ptr; }
84
85 static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
86 { return end - ptr; }
87
88 static qptrdiff availableBytes(const qchar8_t *ptr, const qchar8_t *end)
89 { return end - ptr; }
90
91 static void advanceByte(const char *&ptr, qsizetype n = 1)
92 { ptr += n; }
93
94 static void advanceByte(const uchar *&ptr, qsizetype n = 1)
95 { ptr += n; }
96
97 static void advanceByte(const qchar8_t *&ptr, qsizetype n = 1)
98 { ptr += n; }
99
100 static void appendUtf16(char16_t *&ptr, char16_t uc)
101 { *ptr++ = char16_t(uc); }
102
103 static void appendUcs4(char16_t *&ptr, char32_t uc)
104 {
105 appendUtf16(ptr, uc: QChar::highSurrogate(ucs4: uc));
106 appendUtf16(ptr, uc: QChar::lowSurrogate(ucs4: uc));
107 }
108
109 static char16_t peekUtf16(const char16_t *ptr, qsizetype n = 0) { return ptr[n]; }
110
111 static qptrdiff availableUtf16(const char16_t *ptr, const char16_t *end)
112 { return end - ptr; }
113
114 static void advanceUtf16(const char16_t *&ptr, qsizetype n = 1) { ptr += n; }
115
116 static void appendUtf16(char32_t *&ptr, char16_t uc)
117 { *ptr++ = char32_t(uc); }
118
119 static void appendUcs4(char32_t *&ptr, char32_t uc)
120 { *ptr++ = uc; }
121};
122
123struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
124{
125 static const bool skipAsciiHandling = true;
126};
127
128namespace QUtf8Functions
129{
130 /// returns 0 on success; errors can only happen if \a u is a surrogate:
131 /// Error if \a u is a low surrogate;
132 /// if \a u is a high surrogate, Error if the next isn't a low one,
133 /// EndOfString if we run into the end of the string.
134 template <typename Traits, typename OutputPtr, typename InputPtr> inline
135 int toUtf8(char16_t u, OutputPtr &dst, InputPtr &src, InputPtr end)
136 {
137 if (!Traits::skipAsciiHandling && u < 0x80) {
138 // U+0000 to U+007F (US-ASCII) - one byte
139 Traits::appendByte(dst, uchar(u));
140 return 0;
141 } else if (u < 0x0800) {
142 // U+0080 to U+07FF - two bytes
143 // first of two bytes
144 Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
145 } else {
146 if (!QChar::isSurrogate(ucs4: u)) {
147 // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
148 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: u))
149 return Traits::Error;
150
151 // first of three bytes
152 Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
153 } else {
154 // U+10000 to U+10FFFF - four bytes
155 // need to get one extra codepoint
156 if (Traits::availableUtf16(src, end) == 0)
157 return Traits::EndOfString;
158
159 char16_t low = Traits::peekUtf16(src);
160 if (!QChar::isHighSurrogate(ucs4: u))
161 return Traits::Error;
162 if (!QChar::isLowSurrogate(ucs4: low))
163 return Traits::Error;
164
165 Traits::advanceUtf16(src);
166 char32_t ucs4 = QChar::surrogateToUcs4(high: u, low);
167
168 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
169 return Traits::Error;
170
171 // first byte
172 Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
173
174 // second of four bytes
175 Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
176
177 // for the rest of the bytes
178 u = char16_t(ucs4);
179 }
180
181 // second to last byte
182 Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
183 }
184
185 // last byte
186 Traits::appendByte(dst, 0x80 | (u & 0x3f));
187 return 0;
188 }
189
190 inline bool isContinuationByte(uchar b)
191 {
192 return (b & 0xc0) == 0x80;
193 }
194
195 /// returns the number of characters consumed (including \a b) in case of success;
196 /// returns negative in case of error: Traits::Error or Traits::EndOfString
197 template <typename Traits, typename OutputPtr, typename InputPtr> inline
198 qsizetype fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
199 {
200 qsizetype charsNeeded;
201 char32_t min_uc;
202 char32_t uc;
203
204 if (!Traits::skipAsciiHandling && b < 0x80) {
205 // US-ASCII
206 Traits::appendUtf16(dst, b);
207 return 1;
208 }
209
210 if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
211 // an UTF-8 first character must be at least 0xC0
212 // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
213 return Traits::Error;
214 } else if (b < 0xe0) {
215 charsNeeded = 2;
216 min_uc = 0x80;
217 uc = b & 0x1f;
218 } else if (b < 0xf0) {
219 charsNeeded = 3;
220 min_uc = 0x800;
221 uc = b & 0x0f;
222 } else if (b < 0xf5) {
223 charsNeeded = 4;
224 min_uc = 0x10000;
225 uc = b & 0x07;
226 } else {
227 // the last Unicode character is U+10FFFF
228 // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
229 // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
230 return Traits::Error;
231 }
232
233 qptrdiff bytesAvailable = Traits::availableBytes(src, end);
234 if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
235 // it's possible that we have an error instead of just unfinished bytes
236 if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
237 return Traits::Error;
238 if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
239 return Traits::Error;
240 return Traits::EndOfString;
241 }
242
243 // first continuation character
244 b = Traits::peekByte(src, 0);
245 if (!isContinuationByte(b))
246 return Traits::Error;
247 uc <<= 6;
248 uc |= b & 0x3f;
249
250 if (charsNeeded > 2) {
251 // second continuation character
252 b = Traits::peekByte(src, 1);
253 if (!isContinuationByte(b))
254 return Traits::Error;
255 uc <<= 6;
256 uc |= b & 0x3f;
257
258 if (charsNeeded > 3) {
259 // third continuation character
260 b = Traits::peekByte(src, 2);
261 if (!isContinuationByte(b))
262 return Traits::Error;
263 uc <<= 6;
264 uc |= b & 0x3f;
265 }
266 }
267
268 // we've decoded something; safety-check it
269 if (!Traits::isTrusted) {
270 if (uc < min_uc)
271 return Traits::Error;
272 if (QChar::isSurrogate(ucs4: uc) || uc > QChar::LastValidCodePoint)
273 return Traits::Error;
274 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: uc))
275 return Traits::Error;
276 }
277
278 // write the UTF-16 sequence
279 if (!QChar::requiresSurrogates(ucs4: uc)) {
280 // UTF-8 decoded and no surrogates are required
281 // detach if necessary
282 Traits::appendUtf16(dst, char16_t(uc));
283 } else {
284 // UTF-8 decoded to something that requires a surrogate pair
285 Traits::appendUcs4(dst, uc);
286 }
287
288 Traits::advanceByte(src, charsNeeded - 1);
289 return charsNeeded;
290 }
291}
292
// Byte-order selector taken by the QUtf16 and QUtf32 converters.
enum DataEndianness
{
    DetectEndianness = 0,   // default argument of the QString/QByteArray overloads
    BigEndianness = 1,
    LittleEndianness = 2
};
299
300struct QUtf8
301{
302 static QChar *convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
303 {
304 char16_t *dst = reinterpret_cast<char16_t *>(buffer);
305 dst = QUtf8::convertToUnicode(dst, in);
306 return reinterpret_cast<QChar *>(dst);
307 }
308
309 Q_CORE_EXPORT static char16_t* convertToUnicode(char16_t *dst, QByteArrayView in) noexcept;
310 static QString convertToUnicode(QByteArrayView in);
311 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state);
312
313 static QChar *convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state)
314 {
315 char16_t *buffer = reinterpret_cast<char16_t *>(out);
316 buffer = convertToUnicode(dst: buffer, in, state);
317 return reinterpret_cast<QChar *>(buffer);
318 }
319
320 static char16_t *convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state);
321
322 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in);
323 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in, QStringConverterBase::State *state);
324 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state);
325 Q_CORE_EXPORT static char *convertFromLatin1(char *out, QLatin1StringView in);
326 struct ValidUtf8Result {
327 bool isValidUtf8;
328 bool isValidAscii;
329 };
330 static ValidUtf8Result isValidUtf8(QByteArrayView in);
331 static int compareUtf8(QByteArrayView utf8, QStringView utf16,
332 Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
333 static int compareUtf8(QByteArrayView utf8, QLatin1StringView s,
334 Qt::CaseSensitivity cs = Qt::CaseSensitive);
335 static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs,
336 Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
337};
338
339struct QUtf16
340{
341 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness);
342 static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian);
343 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness);
344 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian);
345};
346
347struct QUtf32
348{
349 static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian);
350 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness);
351 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness);
352 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian);
353};
354
355struct Q_CORE_EXPORT QLocal8Bit
356{
357#if !defined(Q_OS_WIN) || defined(QT_BOOTSTRAPPED)
358 static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
359 { return QUtf8::convertToUnicode(in, state); }
360 static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
361 { return QUtf8::convertFromUnicode(in, state); }
362#else
363 static int checkUtf8();
364 static bool isUtf8()
365 {
366 Q_CONSTINIT
367 static QBasicAtomicInteger<qint8> result = { 0 };
368 int r = result.loadRelaxed();
369 if (r == 0) {
370 r = checkUtf8();
371 result.storeRelaxed(r);
372 }
373 return r > 0;
374 }
375 static QString convertToUnicode_sys(QByteArrayView, quint32, QStringConverter::State *);
376 static QString convertToUnicode_sys(QByteArrayView, QStringConverter::State *);
377 static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
378 {
379 if (isUtf8())
380 return QUtf8::convertToUnicode(in, state);
381 return convertToUnicode_sys(in, state);
382 }
383 static QByteArray convertFromUnicode_sys(QStringView, quint32, QStringConverter::State *);
384 static QByteArray convertFromUnicode_sys(QStringView, QStringConverter::State *);
385 static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
386 {
387 if (isUtf8())
388 return QUtf8::convertFromUnicode(in, state);
389 return convertFromUnicode_sys(in, state);
390 }
391#endif
392};
393
394QT_END_NAMESPACE
395
396#endif // QSTRINGCONVERTER_P_H
397

Provided by KDAB

Privacy Policy
Start learning QML with our Intro Training
Find out more

source code of qtbase/src/corelib/text/qstringconverter_p.h