1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2018 The Qt Company Ltd. |
4 | ** Copyright (C) 2018 Intel Corporation. |
5 | ** Contact: https://www.qt.io/licensing/ |
6 | ** |
7 | ** This file is part of the QtCore module of the Qt Toolkit. |
8 | ** |
9 | ** $QT_BEGIN_LICENSE:LGPL$ |
10 | ** Commercial License Usage |
11 | ** Licensees holding valid commercial Qt licenses may use this file in |
12 | ** accordance with the commercial license agreement provided with the |
13 | ** Software or, alternatively, in accordance with the terms contained in |
14 | ** a written agreement between you and The Qt Company. For licensing terms |
15 | ** and conditions see https://www.qt.io/terms-conditions. For further |
16 | ** information use the contact form at https://www.qt.io/contact-us. |
17 | ** |
18 | ** GNU Lesser General Public License Usage |
19 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
20 | ** General Public License version 3 as published by the Free Software |
21 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
22 | ** packaging of this file. Please review the following information to |
23 | ** ensure the GNU Lesser General Public License version 3 requirements |
24 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
25 | ** |
26 | ** GNU General Public License Usage |
27 | ** Alternatively, this file may be used under the terms of the GNU |
28 | ** General Public License version 2.0 or (at your option) the GNU General |
29 | ** Public license version 3 or any later version approved by the KDE Free |
30 | ** Qt Foundation. The licenses are as published by the Free Software |
31 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
32 | ** included in the packaging of this file. Please review the following |
33 | ** information to ensure the GNU General Public License requirements will |
34 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
35 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
36 | ** |
37 | ** $QT_END_LICENSE$ |
38 | ** |
39 | ****************************************************************************/ |
40 | |
41 | #ifndef QUTFCODEC_P_H |
42 | #define QUTFCODEC_P_H |
43 | |
44 | // |
45 | // W A R N I N G |
46 | // ------------- |
47 | // |
48 | // This file is not part of the Qt API. It exists purely as an |
49 | // implementation detail. This header file may change from version to |
50 | // version without notice, or even be removed. |
51 | // |
52 | // We mean it. |
53 | // |
54 | |
55 | #include <QtCore/qstring.h> |
56 | #include <QtCore/qlist.h> |
57 | |
58 | #if QT_CONFIG(textcodec) |
59 | #include "QtCore/qtextcodec.h" |
60 | #endif |
61 | |
62 | #include "private/qtextcodec_p.h" |
63 | |
64 | QT_BEGIN_NAMESPACE |
65 | |
66 | struct QUtf8BaseTraits |
67 | { |
68 | static const bool isTrusted = false; |
69 | static const bool allowNonCharacters = true; |
70 | static const bool skipAsciiHandling = false; |
71 | static const int Error = -1; |
72 | static const int EndOfString = -2; |
73 | |
74 | static bool isValidCharacter(uint u) |
75 | { return int(u) >= 0; } |
76 | |
77 | static void appendByte(uchar *&ptr, uchar b) |
78 | { *ptr++ = b; } |
79 | |
80 | static uchar peekByte(const uchar *ptr, int n = 0) |
81 | { return ptr[n]; } |
82 | |
83 | static qptrdiff availableBytes(const uchar *ptr, const uchar *end) |
84 | { return end - ptr; } |
85 | |
86 | static void advanceByte(const uchar *&ptr, int n = 1) |
87 | { ptr += n; } |
88 | |
89 | static void appendUtf16(ushort *&ptr, ushort uc) |
90 | { *ptr++ = uc; } |
91 | |
92 | static void appendUcs4(ushort *&ptr, uint uc) |
93 | { |
94 | appendUtf16(ptr, uc: QChar::highSurrogate(ucs4: uc)); |
95 | appendUtf16(ptr, uc: QChar::lowSurrogate(ucs4: uc)); |
96 | } |
97 | |
98 | static ushort peekUtf16(const ushort *ptr, int n = 0) |
99 | { return ptr[n]; } |
100 | |
101 | static qptrdiff availableUtf16(const ushort *ptr, const ushort *end) |
102 | { return end - ptr; } |
103 | |
104 | static void advanceUtf16(const ushort *&ptr, int n = 1) |
105 | { ptr += n; } |
106 | |
107 | // it's possible to output to UCS-4 too |
108 | static void appendUtf16(uint *&ptr, ushort uc) |
109 | { *ptr++ = uc; } |
110 | |
111 | static void appendUcs4(uint *&ptr, uint uc) |
112 | { *ptr++ = uc; } |
113 | }; |
114 | |
115 | struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits |
116 | { |
117 | static const bool skipAsciiHandling = true; |
118 | }; |
119 | |
120 | namespace QUtf8Functions |
121 | { |
122 | /// returns 0 on success; errors can only happen if \a u is a surrogate: |
123 | /// Error if \a u is a low surrogate; |
124 | /// if \a u is a high surrogate, Error if the next isn't a low one, |
125 | /// EndOfString if we run into the end of the string. |
126 | template <typename Traits, typename OutputPtr, typename InputPtr> inline |
127 | int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end) |
128 | { |
129 | if (!Traits::skipAsciiHandling && u < 0x80) { |
130 | // U+0000 to U+007F (US-ASCII) - one byte |
131 | Traits::appendByte(dst, uchar(u)); |
132 | return 0; |
133 | } else if (u < 0x0800) { |
134 | // U+0080 to U+07FF - two bytes |
135 | // first of two bytes |
136 | Traits::appendByte(dst, 0xc0 | uchar(u >> 6)); |
137 | } else { |
138 | if (!QChar::isSurrogate(ucs4: u)) { |
139 | // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes |
140 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: u)) |
141 | return Traits::Error; |
142 | |
143 | // first of three bytes |
144 | Traits::appendByte(dst, 0xe0 | uchar(u >> 12)); |
145 | } else { |
146 | // U+10000 to U+10FFFF - four bytes |
147 | // need to get one extra codepoint |
148 | if (Traits::availableUtf16(src, end) == 0) |
149 | return Traits::EndOfString; |
150 | |
151 | ushort low = Traits::peekUtf16(src); |
152 | if (!QChar::isHighSurrogate(ucs4: u)) |
153 | return Traits::Error; |
154 | if (!QChar::isLowSurrogate(ucs4: low)) |
155 | return Traits::Error; |
156 | |
157 | Traits::advanceUtf16(src); |
158 | uint ucs4 = QChar::surrogateToUcs4(high: u, low); |
159 | |
160 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4)) |
161 | return Traits::Error; |
162 | |
163 | // first byte |
164 | Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf)); |
165 | |
166 | // second of four bytes |
167 | Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f)); |
168 | |
169 | // for the rest of the bytes |
170 | u = ushort(ucs4); |
171 | } |
172 | |
173 | // second to last byte |
174 | Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f)); |
175 | } |
176 | |
177 | // last byte |
178 | Traits::appendByte(dst, 0x80 | (u & 0x3f)); |
179 | return 0; |
180 | } |
181 | |
182 | inline bool isContinuationByte(uchar b) |
183 | { |
184 | return (b & 0xc0) == 0x80; |
185 | } |
186 | |
187 | /// returns the number of characters consumed (including \a b) in case of success; |
188 | /// returns negative in case of error: Traits::Error or Traits::EndOfString |
189 | template <typename Traits, typename OutputPtr, typename InputPtr> inline |
190 | int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end) |
191 | { |
192 | int charsNeeded; |
193 | uint min_uc; |
194 | uint uc; |
195 | |
196 | if (!Traits::skipAsciiHandling && b < 0x80) { |
197 | // US-ASCII |
198 | Traits::appendUtf16(dst, b); |
199 | return 1; |
200 | } |
201 | |
202 | if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) { |
203 | // an UTF-8 first character must be at least 0xC0 |
204 | // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences |
205 | return Traits::Error; |
206 | } else if (b < 0xe0) { |
207 | charsNeeded = 2; |
208 | min_uc = 0x80; |
209 | uc = b & 0x1f; |
210 | } else if (b < 0xf0) { |
211 | charsNeeded = 3; |
212 | min_uc = 0x800; |
213 | uc = b & 0x0f; |
214 | } else if (b < 0xf5) { |
215 | charsNeeded = 4; |
216 | min_uc = 0x10000; |
217 | uc = b & 0x07; |
218 | } else { |
219 | // the last Unicode character is U+10FFFF |
220 | // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF" |
221 | // therefore, a byte higher than 0xF4 is not the UTF-8 first byte |
222 | return Traits::Error; |
223 | } |
224 | |
225 | int bytesAvailable = Traits::availableBytes(src, end); |
226 | if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) { |
227 | // it's possible that we have an error instead of just unfinished bytes |
228 | if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0))) |
229 | return Traits::Error; |
230 | if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1))) |
231 | return Traits::Error; |
232 | return Traits::EndOfString; |
233 | } |
234 | |
235 | // first continuation character |
236 | b = Traits::peekByte(src, 0); |
237 | if (!isContinuationByte(b)) |
238 | return Traits::Error; |
239 | uc <<= 6; |
240 | uc |= b & 0x3f; |
241 | |
242 | if (charsNeeded > 2) { |
243 | // second continuation character |
244 | b = Traits::peekByte(src, 1); |
245 | if (!isContinuationByte(b)) |
246 | return Traits::Error; |
247 | uc <<= 6; |
248 | uc |= b & 0x3f; |
249 | |
250 | if (charsNeeded > 3) { |
251 | // third continuation character |
252 | b = Traits::peekByte(src, 2); |
253 | if (!isContinuationByte(b)) |
254 | return Traits::Error; |
255 | uc <<= 6; |
256 | uc |= b & 0x3f; |
257 | } |
258 | } |
259 | |
260 | // we've decoded something; safety-check it |
261 | if (!Traits::isTrusted) { |
262 | if (uc < min_uc) |
263 | return Traits::Error; |
264 | if (QChar::isSurrogate(ucs4: uc) || uc > QChar::LastValidCodePoint) |
265 | return Traits::Error; |
266 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: uc)) |
267 | return Traits::Error; |
268 | } |
269 | |
270 | // write the UTF-16 sequence |
271 | if (!QChar::requiresSurrogates(ucs4: uc)) { |
272 | // UTF-8 decoded and no surrogates are required |
273 | // detach if necessary |
274 | Traits::appendUtf16(dst, ushort(uc)); |
275 | } else { |
276 | // UTF-8 decoded to something that requires a surrogate pair |
277 | Traits::appendUcs4(dst, uc); |
278 | } |
279 | |
280 | Traits::advanceByte(src, charsNeeded - 1); |
281 | return charsNeeded; |
282 | } |
283 | } |
284 | |
/// Byte-order selector accepted by the QUtf16 / QUtf32 conversion functions
/// and stored by the UTF-16 / UTF-32 codec classes.
/// DetectEndianness is the default argument of those functions; how the
/// order is detected is implemented outside this header.
enum DataEndianness
{
    DetectEndianness = 0,
    BigEndianness = 1,
    LittleEndianness = 2
};
291 | |
292 | struct QUtf8 |
293 | { |
294 | static QChar *convertToUnicode(QChar *, const char *, int) noexcept; |
295 | static QString convertToUnicode(const char *, int); |
296 | static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *); |
297 | static QByteArray convertFromUnicode(const QChar *, int); |
298 | static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *); |
299 | struct ValidUtf8Result { |
300 | bool isValidUtf8; |
301 | bool isValidAscii; |
302 | }; |
303 | static ValidUtf8Result isValidUtf8(const char *, qsizetype); |
304 | static int compareUtf8(const char *, qsizetype, const QChar *, int); |
305 | static int compareUtf8(const char *, qsizetype, QLatin1String s); |
306 | }; |
307 | |
308 | struct QUtf16 |
309 | { |
310 | static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); |
311 | static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); |
312 | }; |
313 | |
314 | struct QUtf32 |
315 | { |
316 | static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); |
317 | static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); |
318 | }; |
319 | |
320 | #if QT_CONFIG(textcodec) |
321 | |
322 | class QUtf8Codec : public QTextCodec { |
323 | public: |
324 | ~QUtf8Codec(); |
325 | |
326 | QByteArray name() const override; |
327 | int mibEnum() const override; |
328 | |
329 | QString convertToUnicode(const char *, int, ConverterState *) const override; |
330 | QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override; |
331 | void convertToUnicode(QString *target, const char *, int, ConverterState *) const; |
332 | }; |
333 | |
334 | class QUtf16Codec : public QTextCodec { |
335 | protected: |
336 | public: |
337 | QUtf16Codec() { e = DetectEndianness; } |
338 | ~QUtf16Codec(); |
339 | |
340 | QByteArray name() const override; |
341 | QList<QByteArray> aliases() const override; |
342 | int mibEnum() const override; |
343 | |
344 | QString convertToUnicode(const char *, int, ConverterState *) const override; |
345 | QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override; |
346 | |
347 | protected: |
348 | DataEndianness e; |
349 | }; |
350 | |
351 | class QUtf16BECodec : public QUtf16Codec { |
352 | public: |
353 | QUtf16BECodec() : QUtf16Codec() { e = BigEndianness; } |
354 | QByteArray name() const override; |
355 | QList<QByteArray> aliases() const override; |
356 | int mibEnum() const override; |
357 | }; |
358 | |
359 | class QUtf16LECodec : public QUtf16Codec { |
360 | public: |
361 | QUtf16LECodec() : QUtf16Codec() { e = LittleEndianness; } |
362 | QByteArray name() const override; |
363 | QList<QByteArray> aliases() const override; |
364 | int mibEnum() const override; |
365 | }; |
366 | |
367 | class QUtf32Codec : public QTextCodec { |
368 | public: |
369 | QUtf32Codec() { e = DetectEndianness; } |
370 | ~QUtf32Codec(); |
371 | |
372 | QByteArray name() const override; |
373 | QList<QByteArray> aliases() const override; |
374 | int mibEnum() const override; |
375 | |
376 | QString convertToUnicode(const char *, int, ConverterState *) const override; |
377 | QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override; |
378 | |
379 | protected: |
380 | DataEndianness e; |
381 | }; |
382 | |
383 | class QUtf32BECodec : public QUtf32Codec { |
384 | public: |
385 | QUtf32BECodec() : QUtf32Codec() { e = BigEndianness; } |
386 | QByteArray name() const override; |
387 | QList<QByteArray> aliases() const override; |
388 | int mibEnum() const override; |
389 | }; |
390 | |
391 | class QUtf32LECodec : public QUtf32Codec { |
392 | public: |
393 | QUtf32LECodec() : QUtf32Codec() { e = LittleEndianness; } |
394 | QByteArray name() const override; |
395 | QList<QByteArray> aliases() const override; |
396 | int mibEnum() const override; |
397 | }; |
398 | |
399 | |
400 | #endif // textcodec |
401 | |
402 | QT_END_NAMESPACE |
403 | |
404 | #endif // QUTFCODEC_P_H |
405 | |