1 | // Copyright (C) 2020 The Qt Company Ltd. |
---|---|
2 | // Copyright (C) 2020 Intel Corporation. |
3 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
4 | |
5 | #ifndef QSTRINGCONVERTER_P_H |
6 | #define QSTRINGCONVERTER_P_H |
7 | |
8 | // |
9 | // W A R N I N G |
10 | // ------------- |
11 | // |
12 | // This file is not part of the Qt API. It exists purely as an |
13 | // implementation detail. This header file may change from version to |
14 | // version without notice, or even be removed. |
15 | // |
16 | // We mean it. |
17 | // |
18 | |
19 | #include <QtCore/qstring.h> |
20 | #include <QtCore/qendian.h> |
21 | #include <QtCore/qstringconverter.h> |
22 | #include <QtCore/private/qglobal_p.h> |
23 | |
24 | QT_BEGIN_NAMESPACE |
25 | |
26 | #ifndef __cpp_char8_t |
27 | enum qchar8_t : uchar {}; |
28 | #else |
29 | using qchar8_t = char8_t; |
30 | #endif |
31 | |
32 | struct QLatin1 |
33 | { |
34 | // Defined in qstring.cpp |
35 | static char16_t *convertToUnicode(char16_t *dst, QLatin1StringView in) noexcept; |
36 | |
37 | static QChar *convertToUnicode(QChar *buffer, QLatin1StringView in) noexcept |
38 | { |
39 | char16_t *dst = reinterpret_cast<char16_t *>(buffer); |
40 | dst = convertToUnicode(dst, in); |
41 | return reinterpret_cast<QChar *>(dst); |
42 | } |
43 | |
44 | static QChar *convertToUnicode(QChar *dst, QByteArrayView in, |
45 | [[maybe_unused]] QStringConverterBase::State *state) noexcept |
46 | { |
47 | Q_ASSERT(state); |
48 | |
49 | return convertToUnicode(buffer: dst, in: QLatin1StringView(in.data(), in.size())); |
50 | } |
51 | |
52 | static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept; |
53 | |
54 | // Defined in qstring.cpp |
55 | Q_CORE_EXPORT |
56 | static char *convertFromUnicode(char *out, QStringView in) noexcept; |
57 | }; |
58 | |
59 | struct QUtf8BaseTraits |
60 | { |
61 | static const bool isTrusted = false; |
62 | static const bool allowNonCharacters = true; |
63 | static const bool skipAsciiHandling = false; |
64 | static const int Error = -1; |
65 | static const int EndOfString = -2; |
66 | |
67 | static void appendByte(uchar *&ptr, uchar b) |
68 | { *ptr++ = b; } |
69 | |
70 | static void appendByte(qchar8_t *&ptr, qchar8_t b) |
71 | { *ptr++ = b; } |
72 | |
73 | static uchar peekByte(const char *ptr, qsizetype n = 0) |
74 | { return ptr[n]; } |
75 | |
76 | static uchar peekByte(const uchar *ptr, qsizetype n = 0) |
77 | { return ptr[n]; } |
78 | |
79 | static uchar peekByte(const qchar8_t *ptr, qsizetype n = 0) |
80 | { return ptr[n]; } |
81 | |
82 | static qptrdiff availableBytes(const char *ptr, const char *end) |
83 | { return end - ptr; } |
84 | |
85 | static qptrdiff availableBytes(const uchar *ptr, const uchar *end) |
86 | { return end - ptr; } |
87 | |
88 | static qptrdiff availableBytes(const qchar8_t *ptr, const qchar8_t *end) |
89 | { return end - ptr; } |
90 | |
91 | static void advanceByte(const char *&ptr, qsizetype n = 1) |
92 | { ptr += n; } |
93 | |
94 | static void advanceByte(const uchar *&ptr, qsizetype n = 1) |
95 | { ptr += n; } |
96 | |
97 | static void advanceByte(const qchar8_t *&ptr, qsizetype n = 1) |
98 | { ptr += n; } |
99 | |
100 | static void appendUtf16(char16_t *&ptr, char16_t uc) |
101 | { *ptr++ = char16_t(uc); } |
102 | |
103 | static void appendUcs4(char16_t *&ptr, char32_t uc) |
104 | { |
105 | appendUtf16(ptr, uc: QChar::highSurrogate(ucs4: uc)); |
106 | appendUtf16(ptr, uc: QChar::lowSurrogate(ucs4: uc)); |
107 | } |
108 | |
109 | static char16_t peekUtf16(const char16_t *ptr, qsizetype n = 0) { return ptr[n]; } |
110 | |
111 | static qptrdiff availableUtf16(const char16_t *ptr, const char16_t *end) |
112 | { return end - ptr; } |
113 | |
114 | static void advanceUtf16(const char16_t *&ptr, qsizetype n = 1) { ptr += n; } |
115 | |
116 | static void appendUtf16(char32_t *&ptr, char16_t uc) |
117 | { *ptr++ = char32_t(uc); } |
118 | |
119 | static void appendUcs4(char32_t *&ptr, char32_t uc) |
120 | { *ptr++ = uc; } |
121 | }; |
122 | |
123 | struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits |
124 | { |
125 | static const bool skipAsciiHandling = true; |
126 | }; |
127 | |
128 | namespace QUtf8Functions |
129 | { |
130 | /// returns 0 on success; errors can only happen if \a u is a surrogate: |
131 | /// Error if \a u is a low surrogate; |
132 | /// if \a u is a high surrogate, Error if the next isn't a low one, |
133 | /// EndOfString if we run into the end of the string. |
134 | template <typename Traits, typename OutputPtr, typename InputPtr> inline |
135 | int toUtf8(char16_t u, OutputPtr &dst, InputPtr &src, InputPtr end) |
136 | { |
137 | if (!Traits::skipAsciiHandling && u < 0x80) { |
138 | // U+0000 to U+007F (US-ASCII) - one byte |
139 | Traits::appendByte(dst, uchar(u)); |
140 | return 0; |
141 | } else if (u < 0x0800) { |
142 | // U+0080 to U+07FF - two bytes |
143 | // first of two bytes |
144 | Traits::appendByte(dst, 0xc0 | uchar(u >> 6)); |
145 | } else { |
146 | if (!QChar::isSurrogate(ucs4: u)) { |
147 | // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes |
148 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: u)) |
149 | return Traits::Error; |
150 | |
151 | // first of three bytes |
152 | Traits::appendByte(dst, 0xe0 | uchar(u >> 12)); |
153 | } else { |
154 | // U+10000 to U+10FFFF - four bytes |
155 | // need to get one extra codepoint |
156 | if (Traits::availableUtf16(src, end) == 0) |
157 | return Traits::EndOfString; |
158 | |
159 | char16_t low = Traits::peekUtf16(src); |
160 | if (!QChar::isHighSurrogate(ucs4: u)) |
161 | return Traits::Error; |
162 | if (!QChar::isLowSurrogate(ucs4: low)) |
163 | return Traits::Error; |
164 | |
165 | Traits::advanceUtf16(src); |
166 | char32_t ucs4 = QChar::surrogateToUcs4(high: u, low); |
167 | |
168 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4)) |
169 | return Traits::Error; |
170 | |
171 | // first byte |
172 | Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf)); |
173 | |
174 | // second of four bytes |
175 | Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f)); |
176 | |
177 | // for the rest of the bytes |
178 | u = char16_t(ucs4); |
179 | } |
180 | |
181 | // second to last byte |
182 | Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f)); |
183 | } |
184 | |
185 | // last byte |
186 | Traits::appendByte(dst, 0x80 | (u & 0x3f)); |
187 | return 0; |
188 | } |
189 | |
190 | inline bool isContinuationByte(uchar b) |
191 | { |
192 | return (b & 0xc0) == 0x80; |
193 | } |
194 | |
195 | /// returns the number of characters consumed (including \a b) in case of success; |
196 | /// returns negative in case of error: Traits::Error or Traits::EndOfString |
197 | template <typename Traits, typename OutputPtr, typename InputPtr> inline |
198 | qsizetype fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end) |
199 | { |
200 | qsizetype charsNeeded; |
201 | char32_t min_uc; |
202 | char32_t uc; |
203 | |
204 | if (!Traits::skipAsciiHandling && b < 0x80) { |
205 | // US-ASCII |
206 | Traits::appendUtf16(dst, b); |
207 | return 1; |
208 | } |
209 | |
210 | if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) { |
211 | // an UTF-8 first character must be at least 0xC0 |
212 | // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences |
213 | return Traits::Error; |
214 | } else if (b < 0xe0) { |
215 | charsNeeded = 2; |
216 | min_uc = 0x80; |
217 | uc = b & 0x1f; |
218 | } else if (b < 0xf0) { |
219 | charsNeeded = 3; |
220 | min_uc = 0x800; |
221 | uc = b & 0x0f; |
222 | } else if (b < 0xf5) { |
223 | charsNeeded = 4; |
224 | min_uc = 0x10000; |
225 | uc = b & 0x07; |
226 | } else { |
227 | // the last Unicode character is U+10FFFF |
228 | // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF" |
229 | // therefore, a byte higher than 0xF4 is not the UTF-8 first byte |
230 | return Traits::Error; |
231 | } |
232 | |
233 | qptrdiff bytesAvailable = Traits::availableBytes(src, end); |
234 | if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) { |
235 | // it's possible that we have an error instead of just unfinished bytes |
236 | if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0))) |
237 | return Traits::Error; |
238 | if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1))) |
239 | return Traits::Error; |
240 | return Traits::EndOfString; |
241 | } |
242 | |
243 | // first continuation character |
244 | b = Traits::peekByte(src, 0); |
245 | if (!isContinuationByte(b)) |
246 | return Traits::Error; |
247 | uc <<= 6; |
248 | uc |= b & 0x3f; |
249 | |
250 | if (charsNeeded > 2) { |
251 | // second continuation character |
252 | b = Traits::peekByte(src, 1); |
253 | if (!isContinuationByte(b)) |
254 | return Traits::Error; |
255 | uc <<= 6; |
256 | uc |= b & 0x3f; |
257 | |
258 | if (charsNeeded > 3) { |
259 | // third continuation character |
260 | b = Traits::peekByte(src, 2); |
261 | if (!isContinuationByte(b)) |
262 | return Traits::Error; |
263 | uc <<= 6; |
264 | uc |= b & 0x3f; |
265 | } |
266 | } |
267 | |
268 | // we've decoded something; safety-check it |
269 | if (!Traits::isTrusted) { |
270 | if (uc < min_uc) |
271 | return Traits::Error; |
272 | if (QChar::isSurrogate(ucs4: uc) || uc > QChar::LastValidCodePoint) |
273 | return Traits::Error; |
274 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: uc)) |
275 | return Traits::Error; |
276 | } |
277 | |
278 | // write the UTF-16 sequence |
279 | if (!QChar::requiresSurrogates(ucs4: uc)) { |
280 | // UTF-8 decoded and no surrogates are required |
281 | // detach if necessary |
282 | Traits::appendUtf16(dst, char16_t(uc)); |
283 | } else { |
284 | // UTF-8 decoded to something that requires a surrogate pair |
285 | Traits::appendUcs4(dst, uc); |
286 | } |
287 | |
288 | Traits::advanceByte(src, charsNeeded - 1); |
289 | return charsNeeded; |
290 | } |
291 | } |
292 | |
// Byte-order selector taken by the QUtf16 / QUtf32 conversion functions in
// this header; DetectEndianness is the default argument of their exported
// overloads.
enum DataEndianness
{
    DetectEndianness = 0,
    BigEndianness = 1,
    LittleEndianness = 2
};
299 | |
300 | struct QUtf8 |
301 | { |
302 | static QChar *convertToUnicode(QChar *buffer, QByteArrayView in) noexcept |
303 | { |
304 | char16_t *dst = reinterpret_cast<char16_t *>(buffer); |
305 | dst = QUtf8::convertToUnicode(dst, in); |
306 | return reinterpret_cast<QChar *>(dst); |
307 | } |
308 | |
309 | Q_CORE_EXPORT static char16_t* convertToUnicode(char16_t *dst, QByteArrayView in) noexcept; |
310 | static QString convertToUnicode(QByteArrayView in); |
311 | Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state); |
312 | |
313 | static QChar *convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state) |
314 | { |
315 | char16_t *buffer = reinterpret_cast<char16_t *>(out); |
316 | buffer = convertToUnicode(dst: buffer, in, state); |
317 | return reinterpret_cast<QChar *>(buffer); |
318 | } |
319 | |
320 | static char16_t *convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state); |
321 | |
322 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in); |
323 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in, QStringConverterBase::State *state); |
324 | static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state); |
325 | Q_CORE_EXPORT static char *convertFromLatin1(char *out, QLatin1StringView in); |
326 | struct ValidUtf8Result { |
327 | bool isValidUtf8; |
328 | bool isValidAscii; |
329 | }; |
330 | static ValidUtf8Result isValidUtf8(QByteArrayView in); |
331 | static int compareUtf8(QByteArrayView utf8, QStringView utf16, |
332 | Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; |
333 | static int compareUtf8(QByteArrayView utf8, QLatin1StringView s, |
334 | Qt::CaseSensitivity cs = Qt::CaseSensitive); |
335 | static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs, |
336 | Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; |
337 | }; |
338 | |
339 | struct QUtf16 |
340 | { |
341 | Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness); |
342 | static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian); |
343 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness); |
344 | static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian); |
345 | }; |
346 | |
347 | struct QUtf32 |
348 | { |
349 | static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian); |
350 | Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness); |
351 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness); |
352 | static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian); |
353 | }; |
354 | |
355 | struct Q_CORE_EXPORT QLocal8Bit |
356 | { |
357 | #if !defined(Q_OS_WIN) || defined(QT_BOOTSTRAPPED) |
358 | static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state) |
359 | { return QUtf8::convertToUnicode(in, state); } |
360 | static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state) |
361 | { return QUtf8::convertFromUnicode(in, state); } |
362 | #else |
363 | static int checkUtf8(); |
364 | static bool isUtf8() |
365 | { |
366 | Q_CONSTINIT |
367 | static QBasicAtomicInteger<qint8> result = { 0 }; |
368 | int r = result.loadRelaxed(); |
369 | if (r == 0) { |
370 | r = checkUtf8(); |
371 | result.storeRelaxed(r); |
372 | } |
373 | return r > 0; |
374 | } |
375 | static QString convertToUnicode_sys(QByteArrayView, quint32, QStringConverter::State *); |
376 | static QString convertToUnicode_sys(QByteArrayView, QStringConverter::State *); |
377 | static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state) |
378 | { |
379 | if (isUtf8()) |
380 | return QUtf8::convertToUnicode(in, state); |
381 | return convertToUnicode_sys(in, state); |
382 | } |
383 | static QByteArray convertFromUnicode_sys(QStringView, quint32, QStringConverter::State *); |
384 | static QByteArray convertFromUnicode_sys(QStringView, QStringConverter::State *); |
385 | static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state) |
386 | { |
387 | if (isUtf8()) |
388 | return QUtf8::convertFromUnicode(in, state); |
389 | return convertFromUnicode_sys(in, state); |
390 | } |
391 | #endif |
392 | }; |
393 | |
394 | QT_END_NAMESPACE |
395 | |
396 | #endif // QSTRINGCONVERTER_P_H |
397 |
Definitions
- qchar8_t
- QLatin1
- convertToUnicode
- convertToUnicode
- QUtf8BaseTraits
- appendByte
- appendByte
- peekByte
- peekByte
- peekByte
- availableBytes
- availableBytes
- availableBytes
- advanceByte
- advanceByte
- advanceByte
- appendUtf16
- appendUcs4
- peekUtf16
- availableUtf16
- advanceUtf16
- appendUtf16
- appendUcs4
- QUtf8BaseTraitsNoAscii
- toUtf8
- isContinuationByte
- fromUtf8
- DataEndianness
- QUtf8
- convertToUnicode
- convertToUnicode
- ValidUtf8Result
- QUtf16
- QUtf32
- QLocal8Bit
- convertToUnicode
Start learning QML with our Intro Training
Find out more