// tst_utf8.cpp source code [qtbase/tests/auto/corelib/codecs/utf8/tst_utf8.cpp]

/****************************************************************************
**
** Copyright (C) 2018 The Qt Company Ltd.
** Copyright (C) 2018 Intel Corporation.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the test suite of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:GPL-EXCEPT$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 3 as published by the Free Software
** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/
#include <QtTest/QtTest>

#include <qtextcodec.h>
#include <QScopedPointer>

#include <cstring>
33
// UTF-8 encoding of the byte order mark U+FEFF (three bytes plus the
// terminating NUL). charByChar() strips it from encoder output when present.
static const char utf8bom[] = "\xEF\xBB\xBF";
35
36	class tst_Utf8 : public QObject
37	{
38	Q_OBJECT
39
40	public:
41	// test data:
42	QTextCodec *codec;
43	QString (from8BitPtr)(const* char , int*);
44	static QByteArray to8Bit(const QString &);
45
46	inline QString from8Bit(const QByteArray &ba)
47	{ return from8BitPtr(ba.constData(), ba.length()); }
48	public slots:
49	void initTestCase();
50	void init();
51
52	private slots:
53	void roundTrip_data();
54	void roundTrip();
55
56	void charByChar_data();
57	void charByChar();
58
59	void invalidUtf8_data();
60	void invalidUtf8();
61
62	void nonCharacters_data();
63	void nonCharacters();
64	};
65
66	void tst_Utf8::initTestCase()
67	{
68	QTest::addColumn<bool>(name: "useLocale");
69	QTest::newRow(dataTag: "utf8codec") << false;
70
71	// is the locale UTF-8?
72	if (QString (QChar (QChar::ReplacementCharacter)).toLocal8Bit() == "\xEF\xBF\xBD") {
73	QTest::newRow(dataTag: "localecodec") << true;
74	qInfo() << "locale is utf8";
75	}
76	}
77
78	void tst_Utf8::init()
79	{
80	QFETCH_GLOBAL(bool, useLocale);
81	if (useLocale) {
82	codec = QTextCodec::codecForLocale();
83	from8BitPtr = &QString::fromLocal8Bit;
84	} else {
85	codec = QTextCodec::codecForMib(mib: `106`);
86	from8BitPtr = &QString::fromUtf8;
87	}
88	}
89
90	QByteArray tst_Utf8::to8Bit(const QString &s)
91	{
92	QFETCH_GLOBAL(bool, useLocale);
93	if (useLocale)
94	return s.toLocal8Bit();
95	else
96	return s.toUtf8();
97	}
98
99	void tst_Utf8::roundTrip_data()
100	{
101	QTest::addColumn<QByteArray>(name: "utf8");
102	QTest::addColumn<QString>(name: "utf16");
103
104	QTest::newRow(dataTag: "empty") << QByteArray () << QString ();
105	QTest::newRow(dataTag: "nul") << QByteArray ("", `1`) << QString (QChar (QChar::Null));
106
107	static const char ascii[] = "This is a standard US-ASCII message";
108	QTest::newRow(dataTag: "ascii") << QByteArray (ascii) << QString::fromLatin1(str: ascii);
109
110	static const char ascii2[] = "\1This\2is\3an\4US-ASCII\020 message interspersed with control chars";
111	QTest::newRow(dataTag: "ascii2") << QByteArray (ascii2) << QString::fromLatin1(str: ascii2);
112
113	static const char utf8_1[] = "\302\240"; // NBSP
114	QTest::newRow(dataTag: "utf8_1") << QByteArray (utf8_1) << QString (QChar (QChar::Nbsp));
115
116	static const char utf8_2[] = "\342\202\254"; // Euro symbol
117	QTest::newRow(dataTag: "utf8_2") << QByteArray (utf8_2) << QString (QChar (`0x20AC`));
118
119	#if 0
120	// Can't test this because QString::fromUtf8 consumes it
121	static const char utf8_3[] = "\357\273\277"; // byte order mark
122	QTest::newRow("utf8_3") << QByteArray(utf8_3) << QString(QChar(QChar::ByteOrderMark));
123	#endif
124
125	static const char utf8_4[] = "\357\277\275"; // replacement char
126	QTest::newRow(dataTag: "utf8_4") << QByteArray (utf8_4) << QString (QChar (QChar::ReplacementCharacter));
127
128	static const char utf8_5[] = "\360\220\210\203"; // U+010203
129	static const uint utf32_5[] = { `0x010203` };
130	QTest::newRow(dataTag: "utf8_5") << QByteArray (utf8_5) << QString::fromUcs4(utf32_5, size: `1`);
131
132	static const char utf8_6[] = "\364\217\277\275"; // U+10FFFD
133	static const uint utf32_6[] = { `0x10FFFD` };
134	QTest::newRow(dataTag: "utf8_6") << QByteArray (utf8_6) << QString::fromUcs4(utf32_6, size: `1`);
135
136	static const char utf8_7[] = "abc\302\240\303\241\303\251\307\275 \342\202\254def";
137	static const ushort utf16_7[] = { `'a'`, `'b'`, `'c'`, `0x00A0`,
138	`0x00E1`, `0x00E9`, `0x01FD`,
139	`' '`, `0x20AC`, `'d'`, `'e'`, `'f'`, `0` };
140	QTest::newRow(dataTag: "utf8_7") << QByteArray (utf8_7) << QString::fromUtf16(utf16_7);
141
142	static const char utf8_8[] = "abc\302\240\303\241\303\251\307\275 \364\217\277\275 \342\202\254def";
143	static const uint utf32_8[] = { `'a'`, `'b'`, `'c'`, `0x00A0`,
144	`0x00E1`, `0x00E9`, `0x01FD`,
145	`' '`, `0x10FFFD`, `' '`,
146	`0x20AC`, `'d'`, `'e'`, `'f'`, `0` };
147	QTest::newRow(dataTag: "utf8_8") << QByteArray (utf8_8) << QString::fromUcs4(utf32_8);
148	}
149
150	void tst_Utf8::roundTrip()
151	{
152	QFETCH(QByteArray, utf8);
153	QFETCH(QString, utf16);
154
155	QCOMPARE(to8Bit(utf16), utf8);
156	QCOMPARE(from8Bit(utf8), utf16);
157
158	QCOMPARE(to8Bit(from8Bit(utf8)), utf8);
159	QCOMPARE(from8Bit(to8Bit(utf16)), utf16);
160
161	// repeat with a longer message
162	utf8.prepend(s: "12345678901234");
163	utf16.prepend(s: QLatin1String ("12345678901234"));
164	QCOMPARE(to8Bit(utf16), utf8);
165	QCOMPARE(from8Bit(utf8), utf16);
166
167	QCOMPARE(to8Bit(from8Bit(utf8)), utf8);
168	QCOMPARE(from8Bit(to8Bit(utf16)), utf16);
169	}
170
171	void tst_Utf8::charByChar_data()
172	{
173	roundTrip_data();
174	}
175
176	void tst_Utf8::charByChar()
177	{
178	QFETCH(QByteArray, utf8);
179	QFETCH(QString, utf16);
180
181	{
182	// from utf16 to utf8 char by char:
183	const QScopedPointer<QTextEncoder> encoder(codec->makeEncoder());
184	QByteArray encoded;
185
186	for (int i = `0`; i < utf16.length(); ++i) {
187	encoded += encoder ->fromUnicode(uc: utf16.constData() + i, len: `1`);
188	QVERIFY(!encoder ->hasFailure());
189	}
190
191	if (encoded.startsWith(c: utf8bom))
192	encoded = encoded.mid(index: int(strlen(s: utf8bom)));
193	QCOMPARE(encoded, utf8);
194	}
195	{
196	// from utf8 to utf16 char by char:
197	const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder());
198	QString decoded;
199
200	for (int i = `0`; i < utf8.length(); ++i) {
201	decoded += decoder ->toUnicode(chars: utf8.constData() + i, len: `1`);
202	QVERIFY(!decoder ->hasFailure());
203	}
204
205	QCOMPARE(decoded, utf16);
206	}
207	}
208
209	void tst_Utf8::invalidUtf8_data()
210	{
211	QTest::addColumn<QByteArray>(name: "utf8");
212
213	extern void loadInvalidUtf8Rows();
214	loadInvalidUtf8Rows();
215	}
216
217	void tst_Utf8::invalidUtf8()
218	{
219	QFETCH(QByteArray, utf8);
220	QFETCH_GLOBAL(bool, useLocale);
221
222	const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder());
223	decoder ->toUnicode(ba: utf8);
224
225	// Only enforce correctness on our UTF-8 decoder
226	// The system's UTF-8 codec is sometimes buggy
227	// GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8
228	// OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF
229	if (!useLocale)
230	QVERIFY(decoder ->hasFailure() \|\| decoder ->needsMoreData());
231	else if (!decoder ->hasFailure() && !decoder ->needsMoreData())
232	qWarning(msg: "System codec does not report failure when it should. Should report bug upstream.");
233
234	// add a continuation character and test that we don't accidentally use it
235	// (buffer overrun)
236	utf8 += char(`0x80` \| `0x3f`);
237	decoder ->toUnicode(chars: utf8.constData(), len: utf8.size() - `1`);
238	if (!useLocale)
239	QVERIFY(decoder ->hasFailure());
240	else if (!decoder ->hasFailure())
241	qWarning(msg: "System codec does not report failure when it should. Should report bug upstream.");
242	}
243
244	void tst_Utf8::nonCharacters_data()
245	{
246	QTest::addColumn<QByteArray>(name: "utf8");
247	QTest::addColumn<QString>(name: "utf16");
248
249	extern void loadNonCharactersRows();
250	loadNonCharactersRows();
251	}
252
253	void tst_Utf8::nonCharacters()
254	{
255	QFETCH(QByteArray, utf8);
256	QFETCH(QString, utf16);
257	QFETCH_GLOBAL(bool, useLocale);
258
259	const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder());
260	decoder ->toUnicode(ba: utf8);
261
262	// Only enforce correctness on our UTF-8 decoder
263	if (!useLocale)
264	QVERIFY(!decoder ->hasFailure());
265	else if (decoder ->hasFailure())
266	qWarning(msg: "System codec reports failure when it shouldn't. Should report bug upstream.");
267
268	const QScopedPointer<QTextEncoder> encoder(codec->makeEncoder());
269	encoder ->fromUnicode(str: utf16);
270	if (!useLocale)
271	QVERIFY(!encoder ->hasFailure());
272	else if (encoder ->hasFailure())
273	qWarning(msg: "System codec reports failure when it shouldn't. Should report bug upstream.");
274	}
275
276	QTEST_MAIN(tst_Utf8)
277	#include "tst_utf8.moc"
278

// source code of qtbase/tests/auto/corelib/codecs/utf8/tst_utf8.cpp