1/****************************************************************************
2**
3** Copyright (C) 2018 The Qt Company Ltd.
4** Copyright (C) 2018 Intel Corporation.
5** Contact: https://www.qt.io/licensing/
6**
7** This file is part of the test suite of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:GPL-EXCEPT$
10** Commercial License Usage
11** Licensees holding valid commercial Qt licenses may use this file in
12** accordance with the commercial license agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and The Qt Company. For licensing terms
15** and conditions see https://www.qt.io/terms-conditions. For further
16** information use the contact form at https://www.qt.io/contact-us.
17**
18** GNU General Public License Usage
19** Alternatively, this file may be used under the terms of the GNU
20** General Public License version 3 as published by the Free Software
21** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
22** included in the packaging of this file. Please review the following
23** information to ensure the GNU General Public License requirements will
24** be met: https://www.gnu.org/licenses/gpl-3.0.html.
25**
26** $QT_END_LICENSE$
27**
28****************************************************************************/
29#include <QtTest/QtTest>
30
31#include <qtextcodec.h>
32#include <QScopedPointer>
33
// UTF-8 encoding of the byte order mark; charByChar() strips it from the
// encoder output when the codec emits one.
static const char utf8bom[] = "\xEF\xBB\xBF";
35
36class tst_Utf8 : public QObject
37{
38 Q_OBJECT
39
40public:
41 // test data:
42 QTextCodec *codec;
43 QString (*from8BitPtr)(const char *, int);
44 static QByteArray to8Bit(const QString &);
45
46 inline QString from8Bit(const QByteArray &ba)
47 { return from8BitPtr(ba.constData(), ba.length()); }
48public slots:
49 void initTestCase();
50 void init();
51
52private slots:
53 void roundTrip_data();
54 void roundTrip();
55
56 void charByChar_data();
57 void charByChar();
58
59 void invalidUtf8_data();
60 void invalidUtf8();
61
62 void nonCharacters_data();
63 void nonCharacters();
64};
65
66void tst_Utf8::initTestCase()
67{
68 QTest::addColumn<bool>(name: "useLocale");
69 QTest::newRow(dataTag: "utf8codec") << false;
70
71 // is the locale UTF-8?
72 if (QString(QChar(QChar::ReplacementCharacter)).toLocal8Bit() == "\xEF\xBF\xBD") {
73 QTest::newRow(dataTag: "localecodec") << true;
74 qInfo() << "locale is utf8";
75 }
76}
77
78void tst_Utf8::init()
79{
80 QFETCH_GLOBAL(bool, useLocale);
81 if (useLocale) {
82 codec = QTextCodec::codecForLocale();
83 from8BitPtr = &QString::fromLocal8Bit;
84 } else {
85 codec = QTextCodec::codecForMib(mib: 106);
86 from8BitPtr = &QString::fromUtf8;
87 }
88}
89
90QByteArray tst_Utf8::to8Bit(const QString &s)
91{
92 QFETCH_GLOBAL(bool, useLocale);
93 if (useLocale)
94 return s.toLocal8Bit();
95 else
96 return s.toUtf8();
97}
98
99void tst_Utf8::roundTrip_data()
100{
101 QTest::addColumn<QByteArray>(name: "utf8");
102 QTest::addColumn<QString>(name: "utf16");
103
104 QTest::newRow(dataTag: "empty") << QByteArray() << QString();
105 QTest::newRow(dataTag: "nul") << QByteArray("", 1) << QString(QChar(QChar::Null));
106
107 static const char ascii[] = "This is a standard US-ASCII message";
108 QTest::newRow(dataTag: "ascii") << QByteArray(ascii) << QString::fromLatin1(str: ascii);
109
110 static const char ascii2[] = "\1This\2is\3an\4US-ASCII\020 message interspersed with control chars";
111 QTest::newRow(dataTag: "ascii2") << QByteArray(ascii2) << QString::fromLatin1(str: ascii2);
112
113 static const char utf8_1[] = "\302\240"; // NBSP
114 QTest::newRow(dataTag: "utf8_1") << QByteArray(utf8_1) << QString(QChar(QChar::Nbsp));
115
116 static const char utf8_2[] = "\342\202\254"; // Euro symbol
117 QTest::newRow(dataTag: "utf8_2") << QByteArray(utf8_2) << QString(QChar(0x20AC));
118
119#if 0
120 // Can't test this because QString::fromUtf8 consumes it
121 static const char utf8_3[] = "\357\273\277"; // byte order mark
122 QTest::newRow("utf8_3") << QByteArray(utf8_3) << QString(QChar(QChar::ByteOrderMark));
123#endif
124
125 static const char utf8_4[] = "\357\277\275"; // replacement char
126 QTest::newRow(dataTag: "utf8_4") << QByteArray(utf8_4) << QString(QChar(QChar::ReplacementCharacter));
127
128 static const char utf8_5[] = "\360\220\210\203"; // U+010203
129 static const uint utf32_5[] = { 0x010203 };
130 QTest::newRow(dataTag: "utf8_5") << QByteArray(utf8_5) << QString::fromUcs4(utf32_5, size: 1);
131
132 static const char utf8_6[] = "\364\217\277\275"; // U+10FFFD
133 static const uint utf32_6[] = { 0x10FFFD };
134 QTest::newRow(dataTag: "utf8_6") << QByteArray(utf8_6) << QString::fromUcs4(utf32_6, size: 1);
135
136 static const char utf8_7[] = "abc\302\240\303\241\303\251\307\275 \342\202\254def";
137 static const ushort utf16_7[] = { 'a', 'b', 'c', 0x00A0,
138 0x00E1, 0x00E9, 0x01FD,
139 ' ', 0x20AC, 'd', 'e', 'f', 0 };
140 QTest::newRow(dataTag: "utf8_7") << QByteArray(utf8_7) << QString::fromUtf16(utf16_7);
141
142 static const char utf8_8[] = "abc\302\240\303\241\303\251\307\275 \364\217\277\275 \342\202\254def";
143 static const uint utf32_8[] = { 'a', 'b', 'c', 0x00A0,
144 0x00E1, 0x00E9, 0x01FD,
145 ' ', 0x10FFFD, ' ',
146 0x20AC, 'd', 'e', 'f', 0 };
147 QTest::newRow(dataTag: "utf8_8") << QByteArray(utf8_8) << QString::fromUcs4(utf32_8);
148}
149
150void tst_Utf8::roundTrip()
151{
152 QFETCH(QByteArray, utf8);
153 QFETCH(QString, utf16);
154
155 QCOMPARE(to8Bit(utf16), utf8);
156 QCOMPARE(from8Bit(utf8), utf16);
157
158 QCOMPARE(to8Bit(from8Bit(utf8)), utf8);
159 QCOMPARE(from8Bit(to8Bit(utf16)), utf16);
160
161 // repeat with a longer message
162 utf8.prepend(s: "12345678901234");
163 utf16.prepend(s: QLatin1String("12345678901234"));
164 QCOMPARE(to8Bit(utf16), utf8);
165 QCOMPARE(from8Bit(utf8), utf16);
166
167 QCOMPARE(to8Bit(from8Bit(utf8)), utf8);
168 QCOMPARE(from8Bit(to8Bit(utf16)), utf16);
169}
170
171void tst_Utf8::charByChar_data()
172{
173 roundTrip_data();
174}
175
176void tst_Utf8::charByChar()
177{
178 QFETCH(QByteArray, utf8);
179 QFETCH(QString, utf16);
180
181 {
182 // from utf16 to utf8 char by char:
183 const QScopedPointer<QTextEncoder> encoder(codec->makeEncoder());
184 QByteArray encoded;
185
186 for (int i = 0; i < utf16.length(); ++i) {
187 encoded += encoder->fromUnicode(uc: utf16.constData() + i, len: 1);
188 QVERIFY(!encoder->hasFailure());
189 }
190
191 if (encoded.startsWith(c: utf8bom))
192 encoded = encoded.mid(index: int(strlen(s: utf8bom)));
193 QCOMPARE(encoded, utf8);
194 }
195 {
196 // from utf8 to utf16 char by char:
197 const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder());
198 QString decoded;
199
200 for (int i = 0; i < utf8.length(); ++i) {
201 decoded += decoder->toUnicode(chars: utf8.constData() + i, len: 1);
202 QVERIFY(!decoder->hasFailure());
203 }
204
205 QCOMPARE(decoded, utf16);
206 }
207}
208
209void tst_Utf8::invalidUtf8_data()
210{
211 QTest::addColumn<QByteArray>(name: "utf8");
212
213 extern void loadInvalidUtf8Rows();
214 loadInvalidUtf8Rows();
215}
216
217void tst_Utf8::invalidUtf8()
218{
219 QFETCH(QByteArray, utf8);
220 QFETCH_GLOBAL(bool, useLocale);
221
222 const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder());
223 decoder->toUnicode(ba: utf8);
224
225 // Only enforce correctness on our UTF-8 decoder
226 // The system's UTF-8 codec is sometimes buggy
227 // GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8
228 // OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF
229 if (!useLocale)
230 QVERIFY(decoder->hasFailure() || decoder->needsMoreData());
231 else if (!decoder->hasFailure() && !decoder->needsMoreData())
232 qWarning(msg: "System codec does not report failure when it should. Should report bug upstream.");
233
234 // add a continuation character and test that we don't accidentally use it
235 // (buffer overrun)
236 utf8 += char(0x80 | 0x3f);
237 decoder->toUnicode(chars: utf8.constData(), len: utf8.size() - 1);
238 if (!useLocale)
239 QVERIFY(decoder->hasFailure());
240 else if (!decoder->hasFailure())
241 qWarning(msg: "System codec does not report failure when it should. Should report bug upstream.");
242}
243
244void tst_Utf8::nonCharacters_data()
245{
246 QTest::addColumn<QByteArray>(name: "utf8");
247 QTest::addColumn<QString>(name: "utf16");
248
249 extern void loadNonCharactersRows();
250 loadNonCharactersRows();
251}
252
253void tst_Utf8::nonCharacters()
254{
255 QFETCH(QByteArray, utf8);
256 QFETCH(QString, utf16);
257 QFETCH_GLOBAL(bool, useLocale);
258
259 const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder());
260 decoder->toUnicode(ba: utf8);
261
262 // Only enforce correctness on our UTF-8 decoder
263 if (!useLocale)
264 QVERIFY(!decoder->hasFailure());
265 else if (decoder->hasFailure())
266 qWarning(msg: "System codec reports failure when it shouldn't. Should report bug upstream.");
267
268 const QScopedPointer<QTextEncoder> encoder(codec->makeEncoder());
269 encoder->fromUnicode(str: utf16);
270 if (!useLocale)
271 QVERIFY(!encoder->hasFailure());
272 else if (encoder->hasFailure())
273 qWarning(msg: "System codec reports failure when it shouldn't. Should report bug upstream.");
274}
275
276QTEST_MAIN(tst_Utf8)
277#include "tst_utf8.moc"
278

// Source: qtbase/tests/auto/corelib/codecs/utf8/tst_utf8.cpp