1 | // Copyright (C) 2020 The Qt Company Ltd. |
2 | // Copyright (C) 2020 Intel Corporation. |
3 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
4 | |
5 | #ifndef QSTRINGCONVERTER_P_H |
6 | #define QSTRINGCONVERTER_P_H |
7 | |
8 | // |
9 | // W A R N I N G |
10 | // ------------- |
11 | // |
12 | // This file is not part of the Qt API. It exists purely as an |
13 | // implementation detail. This header file may change from version to |
14 | // version without notice, or even be removed. |
15 | // |
16 | // We mean it. |
17 | // |
18 | |
19 | #include <QtCore/qstring.h> |
20 | #include <QtCore/qendian.h> |
21 | #include <QtCore/qstringconverter.h> |
22 | #include <QtCore/private/qglobal_p.h> |
23 | |
24 | QT_BEGIN_NAMESPACE |
25 | |
26 | #ifndef __cpp_char8_t |
27 | enum qchar8_t : uchar {}; |
28 | #else |
29 | using qchar8_t = char8_t; |
30 | #endif |
31 | |
32 | struct QLatin1 |
33 | { |
34 | // Defined in qstring.cpp |
35 | static char16_t *convertToUnicode(char16_t *dst, QLatin1StringView in) noexcept; |
36 | |
37 | static QChar *convertToUnicode(QChar *buffer, QLatin1StringView in) noexcept |
38 | { |
39 | char16_t *dst = reinterpret_cast<char16_t *>(buffer); |
40 | dst = convertToUnicode(dst, in); |
41 | return reinterpret_cast<QChar *>(dst); |
42 | } |
43 | |
44 | static QChar *convertToUnicode(QChar *dst, QByteArrayView in, |
45 | [[maybe_unused]] QStringConverterBase::State *state) noexcept |
46 | { |
47 | Q_ASSERT(state); |
48 | |
49 | return convertToUnicode(buffer: dst, in: QLatin1StringView(in.data(), in.size())); |
50 | } |
51 | |
52 | static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept; |
53 | |
54 | // Defined in qstring.cpp |
55 | static char *convertFromUnicode(char *out, QStringView in) noexcept; |
56 | }; |
57 | |
58 | struct QUtf8BaseTraits |
59 | { |
60 | static const bool isTrusted = false; |
61 | static const bool allowNonCharacters = true; |
62 | static const bool skipAsciiHandling = false; |
63 | static const int Error = -1; |
64 | static const int EndOfString = -2; |
65 | |
66 | static void appendByte(uchar *&ptr, uchar b) |
67 | { *ptr++ = b; } |
68 | |
69 | static void appendByte(qchar8_t *&ptr, qchar8_t b) |
70 | { *ptr++ = b; } |
71 | |
72 | static uchar peekByte(const uchar *ptr, qsizetype n = 0) |
73 | { return ptr[n]; } |
74 | |
75 | static uchar peekByte(const qchar8_t *ptr, qsizetype n = 0) |
76 | { return ptr[n]; } |
77 | |
78 | static qptrdiff availableBytes(const uchar *ptr, const uchar *end) |
79 | { return end - ptr; } |
80 | |
81 | static qptrdiff availableBytes(const qchar8_t *ptr, const qchar8_t *end) |
82 | { return end - ptr; } |
83 | |
84 | static void advanceByte(const uchar *&ptr, qsizetype n = 1) |
85 | { ptr += n; } |
86 | |
87 | static void advanceByte(const qchar8_t *&ptr, qsizetype n = 1) |
88 | { ptr += n; } |
89 | |
90 | static void appendUtf16(char16_t *&ptr, char16_t uc) |
91 | { *ptr++ = char16_t(uc); } |
92 | |
93 | static void appendUcs4(char16_t *&ptr, char32_t uc) |
94 | { |
95 | appendUtf16(ptr, uc: QChar::highSurrogate(ucs4: uc)); |
96 | appendUtf16(ptr, uc: QChar::lowSurrogate(ucs4: uc)); |
97 | } |
98 | |
99 | static char16_t peekUtf16(const char16_t *ptr, qsizetype n = 0) { return ptr[n]; } |
100 | |
101 | static qptrdiff availableUtf16(const char16_t *ptr, const char16_t *end) |
102 | { return end - ptr; } |
103 | |
104 | static void advanceUtf16(const char16_t *&ptr, qsizetype n = 1) { ptr += n; } |
105 | |
106 | static void appendUtf16(char32_t *&ptr, char16_t uc) |
107 | { *ptr++ = char32_t(uc); } |
108 | |
109 | static void appendUcs4(char32_t *&ptr, char32_t uc) |
110 | { *ptr++ = uc; } |
111 | }; |
112 | |
113 | struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits |
114 | { |
115 | static const bool skipAsciiHandling = true; |
116 | }; |
117 | |
118 | namespace QUtf8Functions |
119 | { |
120 | /// returns 0 on success; errors can only happen if \a u is a surrogate: |
121 | /// Error if \a u is a low surrogate; |
122 | /// if \a u is a high surrogate, Error if the next isn't a low one, |
123 | /// EndOfString if we run into the end of the string. |
124 | template <typename Traits, typename OutputPtr, typename InputPtr> inline |
125 | int toUtf8(char16_t u, OutputPtr &dst, InputPtr &src, InputPtr end) |
126 | { |
127 | if (!Traits::skipAsciiHandling && u < 0x80) { |
128 | // U+0000 to U+007F (US-ASCII) - one byte |
129 | Traits::appendByte(dst, uchar(u)); |
130 | return 0; |
131 | } else if (u < 0x0800) { |
132 | // U+0080 to U+07FF - two bytes |
133 | // first of two bytes |
134 | Traits::appendByte(dst, 0xc0 | uchar(u >> 6)); |
135 | } else { |
136 | if (!QChar::isSurrogate(ucs4: u)) { |
137 | // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes |
138 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: u)) |
139 | return Traits::Error; |
140 | |
141 | // first of three bytes |
142 | Traits::appendByte(dst, 0xe0 | uchar(u >> 12)); |
143 | } else { |
144 | // U+10000 to U+10FFFF - four bytes |
145 | // need to get one extra codepoint |
146 | if (Traits::availableUtf16(src, end) == 0) |
147 | return Traits::EndOfString; |
148 | |
149 | char16_t low = Traits::peekUtf16(src); |
150 | if (!QChar::isHighSurrogate(ucs4: u)) |
151 | return Traits::Error; |
152 | if (!QChar::isLowSurrogate(ucs4: low)) |
153 | return Traits::Error; |
154 | |
155 | Traits::advanceUtf16(src); |
156 | char32_t ucs4 = QChar::surrogateToUcs4(high: u, low); |
157 | |
158 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4)) |
159 | return Traits::Error; |
160 | |
161 | // first byte |
162 | Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf)); |
163 | |
164 | // second of four bytes |
165 | Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f)); |
166 | |
167 | // for the rest of the bytes |
168 | u = char16_t(ucs4); |
169 | } |
170 | |
171 | // second to last byte |
172 | Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f)); |
173 | } |
174 | |
175 | // last byte |
176 | Traits::appendByte(dst, 0x80 | (u & 0x3f)); |
177 | return 0; |
178 | } |
179 | |
180 | inline bool isContinuationByte(uchar b) |
181 | { |
182 | return (b & 0xc0) == 0x80; |
183 | } |
184 | |
185 | /// returns the number of characters consumed (including \a b) in case of success; |
186 | /// returns negative in case of error: Traits::Error or Traits::EndOfString |
187 | template <typename Traits, typename OutputPtr, typename InputPtr> inline |
188 | qsizetype fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end) |
189 | { |
190 | qsizetype charsNeeded; |
191 | char32_t min_uc; |
192 | char32_t uc; |
193 | |
194 | if (!Traits::skipAsciiHandling && b < 0x80) { |
195 | // US-ASCII |
196 | Traits::appendUtf16(dst, b); |
197 | return 1; |
198 | } |
199 | |
200 | if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) { |
201 | // an UTF-8 first character must be at least 0xC0 |
202 | // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences |
203 | return Traits::Error; |
204 | } else if (b < 0xe0) { |
205 | charsNeeded = 2; |
206 | min_uc = 0x80; |
207 | uc = b & 0x1f; |
208 | } else if (b < 0xf0) { |
209 | charsNeeded = 3; |
210 | min_uc = 0x800; |
211 | uc = b & 0x0f; |
212 | } else if (b < 0xf5) { |
213 | charsNeeded = 4; |
214 | min_uc = 0x10000; |
215 | uc = b & 0x07; |
216 | } else { |
217 | // the last Unicode character is U+10FFFF |
218 | // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF" |
219 | // therefore, a byte higher than 0xF4 is not the UTF-8 first byte |
220 | return Traits::Error; |
221 | } |
222 | |
223 | qptrdiff bytesAvailable = Traits::availableBytes(src, end); |
224 | if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) { |
225 | // it's possible that we have an error instead of just unfinished bytes |
226 | if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0))) |
227 | return Traits::Error; |
228 | if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1))) |
229 | return Traits::Error; |
230 | return Traits::EndOfString; |
231 | } |
232 | |
233 | // first continuation character |
234 | b = Traits::peekByte(src, 0); |
235 | if (!isContinuationByte(b)) |
236 | return Traits::Error; |
237 | uc <<= 6; |
238 | uc |= b & 0x3f; |
239 | |
240 | if (charsNeeded > 2) { |
241 | // second continuation character |
242 | b = Traits::peekByte(src, 1); |
243 | if (!isContinuationByte(b)) |
244 | return Traits::Error; |
245 | uc <<= 6; |
246 | uc |= b & 0x3f; |
247 | |
248 | if (charsNeeded > 3) { |
249 | // third continuation character |
250 | b = Traits::peekByte(src, 2); |
251 | if (!isContinuationByte(b)) |
252 | return Traits::Error; |
253 | uc <<= 6; |
254 | uc |= b & 0x3f; |
255 | } |
256 | } |
257 | |
258 | // we've decoded something; safety-check it |
259 | if (!Traits::isTrusted) { |
260 | if (uc < min_uc) |
261 | return Traits::Error; |
262 | if (QChar::isSurrogate(ucs4: uc) || uc > QChar::LastValidCodePoint) |
263 | return Traits::Error; |
264 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: uc)) |
265 | return Traits::Error; |
266 | } |
267 | |
268 | // write the UTF-16 sequence |
269 | if (!QChar::requiresSurrogates(ucs4: uc)) { |
270 | // UTF-8 decoded and no surrogates are required |
271 | // detach if necessary |
272 | Traits::appendUtf16(dst, char16_t(uc)); |
273 | } else { |
274 | // UTF-8 decoded to something that requires a surrogate pair |
275 | Traits::appendUcs4(dst, uc); |
276 | } |
277 | |
278 | Traits::advanceByte(src, charsNeeded - 1); |
279 | return charsNeeded; |
280 | } |
281 | } |
282 | |
// Byte-order selector taken by the QUtf16 and QUtf32 conversion functions.
// DetectEndianness is the default argument of their QString/QByteArray
// returning overloads.
enum DataEndianness
{
    DetectEndianness = 0,
    BigEndianness = 1,
    LittleEndianness = 2,
};
289 | |
290 | struct QUtf8 |
291 | { |
292 | static QChar *convertToUnicode(QChar *buffer, QByteArrayView in) noexcept |
293 | { |
294 | char16_t *dst = reinterpret_cast<char16_t *>(buffer); |
295 | dst = QUtf8::convertToUnicode(dst, in); |
296 | return reinterpret_cast<QChar *>(dst); |
297 | } |
298 | |
299 | Q_CORE_EXPORT static char16_t* convertToUnicode(char16_t *dst, QByteArrayView in) noexcept; |
300 | static QString convertToUnicode(QByteArrayView in); |
301 | Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state); |
302 | |
303 | static QChar *convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state) |
304 | { |
305 | char16_t *buffer = reinterpret_cast<char16_t *>(out); |
306 | buffer = convertToUnicode(dst: buffer, in, state); |
307 | return reinterpret_cast<QChar *>(buffer); |
308 | } |
309 | |
310 | static char16_t *convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state); |
311 | |
312 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in); |
313 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in, QStringConverterBase::State *state); |
314 | static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state); |
315 | Q_CORE_EXPORT static char *convertFromLatin1(char *out, QLatin1StringView in); |
316 | struct ValidUtf8Result { |
317 | bool isValidUtf8; |
318 | bool isValidAscii; |
319 | }; |
320 | static ValidUtf8Result isValidUtf8(QByteArrayView in); |
321 | static int compareUtf8(QByteArrayView utf8, QStringView utf16, |
322 | Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; |
323 | static int compareUtf8(QByteArrayView utf8, QLatin1StringView s, |
324 | Qt::CaseSensitivity cs = Qt::CaseSensitive); |
325 | static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs, |
326 | Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; |
327 | }; |
328 | |
329 | struct QUtf16 |
330 | { |
331 | Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness); |
332 | static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian); |
333 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness); |
334 | static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian); |
335 | }; |
336 | |
337 | struct QUtf32 |
338 | { |
339 | static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian); |
340 | Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness); |
341 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness); |
342 | static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian); |
343 | }; |
344 | |
345 | struct Q_CORE_EXPORT QLocal8Bit |
346 | { |
347 | #if !defined(Q_OS_WIN) || defined(QT_BOOTSTRAPPED) |
348 | static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state) |
349 | { return QUtf8::convertToUnicode(in, state); } |
350 | static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state) |
351 | { return QUtf8::convertFromUnicode(in, state); } |
352 | #else |
353 | static int checkUtf8(); |
354 | static bool isUtf8() |
355 | { |
356 | Q_CONSTINIT |
357 | static QBasicAtomicInteger<qint8> result = { 0 }; |
358 | int r = result.loadRelaxed(); |
359 | if (r == 0) { |
360 | r = checkUtf8(); |
361 | result.storeRelaxed(r); |
362 | } |
363 | return r > 0; |
364 | } |
365 | static QString convertToUnicode_sys(QByteArrayView, QStringConverter::State *); |
366 | static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state) |
367 | { |
368 | if (isUtf8()) |
369 | return QUtf8::convertToUnicode(in, state); |
370 | return convertToUnicode_sys(in, state); |
371 | } |
372 | static QByteArray convertFromUnicode_sys(QStringView, QStringConverter::State *); |
373 | static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state) |
374 | { |
375 | if (isUtf8()) |
376 | return QUtf8::convertFromUnicode(in, state); |
377 | return convertFromUnicode_sys(in, state); |
378 | } |
379 | #endif |
380 | }; |
381 | |
382 | QT_END_NAMESPACE |
383 | |
384 | #endif // QSTRINGCONVERTER_P_H |
385 | |