qutfcodec_p.h source code [qtbase/src/corelib/codecs/qutfcodec_p.h]

1	/****************************************************************************
2	**
3	** Copyright (C) 2018 The Qt Company Ltd.
4	** Copyright (C) 2018 Intel Corporation.
5	** Contact: https://www.qt.io/licensing/
6	**
7	** This file is part of the QtCore module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial License Usage
11	** Licensees holding valid commercial Qt licenses may use this file in
12	** accordance with the commercial license agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and The Qt Company. For licensing terms
15	** and conditions see https://www.qt.io/terms-conditions. For further
16	** information use the contact form at https://www.qt.io/contact-us.
17	**
18	** GNU Lesser General Public License Usage
19	** Alternatively, this file may be used under the terms of the GNU Lesser
20	** General Public License version 3 as published by the Free Software
21	** Foundation and appearing in the file LICENSE.LGPL3 included in the
22	** packaging of this file. Please review the following information to
23	** ensure the GNU Lesser General Public License version 3 requirements
24	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25	**
26	** GNU General Public License Usage
27	** Alternatively, this file may be used under the terms of the GNU
28	** General Public License version 2.0 or (at your option) the GNU General
29	** Public license version 3 or any later version approved by the KDE Free
30	** Qt Foundation. The licenses are as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32	** included in the packaging of this file. Please review the following
33	** information to ensure the GNU General Public License requirements will
34	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35	** https://www.gnu.org/licenses/gpl-3.0.html.
36	**
37	** $QT_END_LICENSE$
38	**
39	****************************************************************************/
40
41	#ifndef QUTFCODEC_P_H
42	#define QUTFCODEC_P_H
43
44	//
45	// W A R N I N G
46	// -------------
47	//
48	// This file is not part of the Qt API. It exists purely as an
49	// implementation detail. This header file may change from version to
50	// version without notice, or even be removed.
51	//
52	// We mean it.
53	//
54
55	#include <QtCore/qstring.h>
56	#include <QtCore/qlist.h>
57
58	#if QT_CONFIG(textcodec)
59	#include "QtCore/qtextcodec.h"
60	#endif
61
62	#include "private/qtextcodec_p.h"
63
64	QT_BEGIN_NAMESPACE
65
66	struct QUtf8BaseTraits
67	{
68	static const bool isTrusted = false;
69	static const bool allowNonCharacters = true;
70	static const bool skipAsciiHandling = false;
71	static const int Error = -`1`;
72	static const int EndOfString = -`2`;
73
74	static bool isValidCharacter(uint u)
75	{ return int(u) >= `0`; }
76
77	static void appendByte(uchar *&ptr, uchar b)
78	{ *ptr++ = b; }
79
80	static uchar peekByte(const uchar ptr, int* n = `0`)
81	{ return ptr[n]; }
82
83	static qptrdiff availableBytes(const uchar ptr, const* uchar *end)
84	{ return end - ptr; }
85
86	static void advanceByte(const uchar &ptr, int* n = `1`)
87	{ ptr += n; }
88
89	static void appendUtf16(ushort *&ptr, ushort uc)
90	{ *ptr++ = uc; }
91
92	static void appendUcs4(ushort *&ptr, uint uc)
93	{
94	appendUtf16(ptr, uc: QChar::highSurrogate(ucs4: uc));
95	appendUtf16(ptr, uc: QChar::lowSurrogate(ucs4: uc));
96	}
97
98	static ushort peekUtf16(const ushort ptr, int* n = `0`)
99	{ return ptr[n]; }
100
101	static qptrdiff availableUtf16(const ushort ptr, const* ushort *end)
102	{ return end - ptr; }
103
104	static void advanceUtf16(const ushort &ptr, int* n = `1`)
105	{ ptr += n; }
106
107	// it's possible to output to UCS-4 too
108	static void appendUtf16(uint *&ptr, ushort uc)
109	{ *ptr++ = uc; }
110
111	static void appendUcs4(uint *&ptr, uint uc)
112	{ *ptr++ = uc; }
113	};
114
115	struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
116	{
117	static const bool skipAsciiHandling = true;
118	};
119
120	namespace QUtf8Functions
121	{
122	/// returns 0 on success; errors can only happen if \a u is a surrogate:
123	/// Error if \a u is a low surrogate;
124	/// if \a u is a high surrogate, Error if the next isn't a low one,
125	/// EndOfString if we run into the end of the string.
126	template <typename Traits, typename OutputPtr, typename InputPtr> inline
127	int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
128	{
129	if (!Traits::skipAsciiHandling && u < `0x80`) {
130	// U+0000 to U+007F (US-ASCII) - one byte
131	Traits::appendByte(dst, uchar(u));
132	return `0`;
133	} else if (u < `0x0800`) {
134	// U+0080 to U+07FF - two bytes
135	// first of two bytes
136	Traits::appendByte(dst, `0xc0` \| uchar(u >> `6`));
137	} else {
138	if (!QChar::isSurrogate(ucs4: u)) {
139	// U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
140	if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: u))
141	return Traits::Error;
142
143	// first of three bytes
144	Traits::appendByte(dst, `0xe0` \| uchar(u >> `12`));
145	} else {
146	// U+10000 to U+10FFFF - four bytes
147	// need to get one extra codepoint
148	if (Traits::availableUtf16(src, end) == `0`)
149	return Traits::EndOfString;
150
151	ushort low = Traits::peekUtf16(src);
152	if (!QChar::isHighSurrogate(ucs4: u))
153	return Traits::Error;
154	if (!QChar::isLowSurrogate(ucs4: low))
155	return Traits::Error;
156
157	Traits::advanceUtf16(src);
158	uint ucs4 = QChar::surrogateToUcs4(high: u, low);
159
160	if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
161	return Traits::Error;
162
163	// first byte
164	Traits::appendByte(dst, `0xf0` \| (uchar(ucs4 >> `18`) & `0xf`));
165
166	// second of four bytes
167	Traits::appendByte(dst, `0x80` \| (uchar(ucs4 >> `12`) & `0x3f`));
168
169	// for the rest of the bytes
170	u = ushort(ucs4);
171	}
172
173	// second to last byte
174	Traits::appendByte(dst, `0x80` \| (uchar(u >> `6`) & `0x3f`));
175	}
176
177	// last byte
178	Traits::appendByte(dst, `0x80` \| (u & `0x3f`));
179	return `0`;
180	}
181
182	inline bool isContinuationByte(uchar b)
183	{
184	return (b & `0xc0`) == `0x80`;
185	}
186
187	/// returns the number of characters consumed (including \a b) in case of success;
188	/// returns negative in case of error: Traits::Error or Traits::EndOfString
189	template <typename Traits, typename OutputPtr, typename InputPtr> inline
190	int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
191	{
192	int charsNeeded;
193	uint min_uc;
194	uint uc;
195
196	if (!Traits::skipAsciiHandling && b < `0x80`) {
197	// US-ASCII
198	Traits::appendUtf16(dst, b);
199	return `1`;
200	}
201
202	if (!Traits::isTrusted && Q_UNLIKELY(b <= `0xC1`)) {
203	// an UTF-8 first character must be at least 0xC0
204	// however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
205	return Traits::Error;
206	} else if (b < `0xe0`) {
207	charsNeeded = `2`;
208	min_uc = `0x80`;
209	uc = b & `0x1f`;
210	} else if (b < `0xf0`) {
211	charsNeeded = `3`;
212	min_uc = `0x800`;
213	uc = b & `0x0f`;
214	} else if (b < `0xf5`) {
215	charsNeeded = `4`;
216	min_uc = `0x10000`;
217	uc = b & `0x07`;
218	} else {
219	// the last Unicode character is U+10FFFF
220	// it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
221	// therefore, a byte higher than 0xF4 is not the UTF-8 first byte
222	return Traits::Error;
223	}
224
225	int bytesAvailable = Traits::availableBytes(src, end);
226	if (Q_UNLIKELY(bytesAvailable < charsNeeded - `1`)) {
227	// it's possible that we have an error instead of just unfinished bytes
228	if (bytesAvailable > `0` && !isContinuationByte(Traits::peekByte(src, `0`)))
229	return Traits::Error;
230	if (bytesAvailable > `1` && !isContinuationByte(Traits::peekByte(src, `1`)))
231	return Traits::Error;
232	return Traits::EndOfString;
233	}
234
235	// first continuation character
236	b = Traits::peekByte(src, `0`);
237	if (!isContinuationByte(b))
238	return Traits::Error;
239	uc <<= `6`;
240	uc \|= b & `0x3f`;
241
242	if (charsNeeded > `2`) {
243	// second continuation character
244	b = Traits::peekByte(src, `1`);
245	if (!isContinuationByte(b))
246	return Traits::Error;
247	uc <<= `6`;
248	uc \|= b & `0x3f`;
249
250	if (charsNeeded > `3`) {
251	// third continuation character
252	b = Traits::peekByte(src, `2`);
253	if (!isContinuationByte(b))
254	return Traits::Error;
255	uc <<= `6`;
256	uc \|= b & `0x3f`;
257	}
258	}
259
260	// we've decoded something; safety-check it
261	if (!Traits::isTrusted) {
262	if (uc < min_uc)
263	return Traits::Error;
264	if (QChar::isSurrogate(ucs4: uc) \|\| uc > QChar::LastValidCodePoint)
265	return Traits::Error;
266	if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: uc))
267	return Traits::Error;
268	}
269
270	// write the UTF-16 sequence
271	if (!QChar::requiresSurrogates(ucs4: uc)) {
272	// UTF-8 decoded and no surrogates are required
273	// detach if necessary
274	Traits::appendUtf16(dst, ushort(uc));
275	} else {
276	// UTF-8 decoded to something that requires a surrogate pair
277	Traits::appendUcs4(dst, uc);
278	}
279
280	Traits::advanceByte(src, charsNeeded - `1`);
281	return charsNeeded;
282	}
283	}
284
/// Byte-order selector taken by the QUtf16/QUtf32 conversion functions and
/// stored by the UTF-16/UTF-32 codec classes declared in this header.
/// NOTE(review): DetectEndianness presumably asks the converter to work the
/// byte order out by itself (e.g. from a BOM) - confirm in the implementation.
enum DataEndianness
{
    DetectEndianness,   // default argument of the QUtf16/QUtf32 functions
    BigEndianness,
    LittleEndianness
};
291
292	struct QUtf8
293	{
294	static QChar convertToUnicode(QChar , const char , int) noexcept*;
295	static QString convertToUnicode(const char , int*);
296	static QString convertToUnicode(const char , int, QTextCodec::ConverterState );
297	static QByteArray convertFromUnicode(const QChar , int*);
298	static QByteArray convertFromUnicode(const QChar , int, QTextCodec::ConverterState );
299	struct ValidUtf8Result {
300	bool isValidUtf8;
301	bool isValidAscii;
302	};
303	static ValidUtf8Result isValidUtf8(const char *, qsizetype);
304	static int compareUtf8(const char , qsizetype, const* QChar , int*);
305	static int compareUtf8(const char *, qsizetype, QLatin1String s);
306	};
307
308	struct QUtf16
309	{
310	static QString convertToUnicode(const char , int, QTextCodec::ConverterState , DataEndianness = DetectEndianness);
311	static QByteArray convertFromUnicode(const QChar , int, QTextCodec::ConverterState , DataEndianness = DetectEndianness);
312	};
313
314	struct QUtf32
315	{
316	static QString convertToUnicode(const char , int, QTextCodec::ConverterState , DataEndianness = DetectEndianness);
317	static QByteArray convertFromUnicode(const QChar , int, QTextCodec::ConverterState , DataEndianness = DetectEndianness);
318	};
319
320	#if QT_CONFIG(textcodec)
321
322	class QUtf8Codec : public QTextCodec {
323	public:
324	~QUtf8Codec();
325
326	QByteArray name() const override;
327	int mibEnum() const override;
328
329	QString convertToUnicode(const char , int, ConverterState ) const override;
330	QByteArray convertFromUnicode(const QChar , int, ConverterState ) const override;
331	void convertToUnicode(QString target, const* char , int, ConverterState ) const;
332	};
333
334	class QUtf16Codec : public QTextCodec {
335	protected:
336	public:
337	QUtf16Codec() { e = DetectEndianness; }
338	~QUtf16Codec();
339
340	QByteArray name() const override;
341	QList<QByteArray> aliases() const override;
342	int mibEnum() const override;
343
344	QString convertToUnicode(const char , int, ConverterState ) const override;
345	QByteArray convertFromUnicode(const QChar , int, ConverterState ) const override;
346
347	protected:
348	DataEndianness e;
349	};
350
351	class QUtf16BECodec : public QUtf16Codec {
352	public:
353	QUtf16BECodec() : QUtf16Codec () { e = BigEndianness; }
354	QByteArray name() const override;
355	QList<QByteArray> aliases() const override;
356	int mibEnum() const override;
357	};
358
359	class QUtf16LECodec : public QUtf16Codec {
360	public:
361	QUtf16LECodec() : QUtf16Codec () { e = LittleEndianness; }
362	QByteArray name() const override;
363	QList<QByteArray> aliases() const override;
364	int mibEnum() const override;
365	};
366
367	class QUtf32Codec : public QTextCodec {
368	public:
369	QUtf32Codec() { e = DetectEndianness; }
370	~QUtf32Codec();
371
372	QByteArray name() const override;
373	QList<QByteArray> aliases() const override;
374	int mibEnum() const override;
375
376	QString convertToUnicode(const char , int, ConverterState ) const override;
377	QByteArray convertFromUnicode(const QChar , int, ConverterState ) const override;
378
379	protected:
380	DataEndianness e;
381	};
382
383	class QUtf32BECodec : public QUtf32Codec {
384	public:
385	QUtf32BECodec() : QUtf32Codec () { e = BigEndianness; }
386	QByteArray name() const override;
387	QList<QByteArray> aliases() const override;
388	int mibEnum() const override;
389	};
390
391	class QUtf32LECodec : public QUtf32Codec {
392	public:
393	QUtf32LECodec() : QUtf32Codec () { e = LittleEndianness; }
394	QByteArray name() const override;
395	QList<QByteArray> aliases() const override;
396	int mibEnum() const override;
397	};
398
399
400	#endif // textcodec
401
402	QT_END_NAMESPACE
403
404	#endif // QUTFCODEC_P_H
405

source code of qtbase/src/corelib/codecs/qutfcodec_p.h