1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2018 The Qt Company Ltd. |
4 | ** Copyright (C) 2018 Intel Corporation. |
5 | ** Contact: https://www.qt.io/licensing/ |
6 | ** |
7 | ** This file is part of the test suite of the Qt Toolkit. |
8 | ** |
9 | ** $QT_BEGIN_LICENSE:GPL-EXCEPT$ |
10 | ** Commercial License Usage |
11 | ** Licensees holding valid commercial Qt licenses may use this file in |
12 | ** accordance with the commercial license agreement provided with the |
13 | ** Software or, alternatively, in accordance with the terms contained in |
14 | ** a written agreement between you and The Qt Company. For licensing terms |
15 | ** and conditions see https://www.qt.io/terms-conditions. For further |
16 | ** information use the contact form at https://www.qt.io/contact-us. |
17 | ** |
18 | ** GNU General Public License Usage |
19 | ** Alternatively, this file may be used under the terms of the GNU |
20 | ** General Public License version 3 as published by the Free Software |
21 | ** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT |
22 | ** included in the packaging of this file. Please review the following |
23 | ** information to ensure the GNU General Public License requirements will |
24 | ** be met: https://www.gnu.org/licenses/gpl-3.0.html. |
25 | ** |
26 | ** $QT_END_LICENSE$ |
27 | ** |
28 | ****************************************************************************/ |
29 | #include <QtTest/QtTest> |
30 | |
31 | #include <qtextcodec.h> |
32 | #include <QScopedPointer> |
33 | |
// UTF-8 encoding of the byte order mark, as a NUL-terminated C string.
// charByChar() strips this prefix from encoder output before comparing.
static const char utf8bom[] = "\xEF\xBB\xBF";
35 | |
36 | class tst_Utf8 : public QObject |
37 | { |
38 | Q_OBJECT |
39 | |
40 | public: |
41 | // test data: |
42 | QTextCodec *codec; |
43 | QString (*from8BitPtr)(const char *, int); |
44 | static QByteArray to8Bit(const QString &); |
45 | |
46 | inline QString from8Bit(const QByteArray &ba) |
47 | { return from8BitPtr(ba.constData(), ba.length()); } |
48 | public slots: |
49 | void initTestCase(); |
50 | void init(); |
51 | |
52 | private slots: |
53 | void roundTrip_data(); |
54 | void roundTrip(); |
55 | |
56 | void charByChar_data(); |
57 | void charByChar(); |
58 | |
59 | void invalidUtf8_data(); |
60 | void invalidUtf8(); |
61 | |
62 | void nonCharacters_data(); |
63 | void nonCharacters(); |
64 | }; |
65 | |
66 | void tst_Utf8::initTestCase() |
67 | { |
68 | QTest::addColumn<bool>(name: "useLocale" ); |
69 | QTest::newRow(dataTag: "utf8codec" ) << false; |
70 | |
71 | // is the locale UTF-8? |
72 | if (QString(QChar(QChar::ReplacementCharacter)).toLocal8Bit() == "\xEF\xBF\xBD" ) { |
73 | QTest::newRow(dataTag: "localecodec" ) << true; |
74 | qInfo() << "locale is utf8" ; |
75 | } |
76 | } |
77 | |
78 | void tst_Utf8::init() |
79 | { |
80 | QFETCH_GLOBAL(bool, useLocale); |
81 | if (useLocale) { |
82 | codec = QTextCodec::codecForLocale(); |
83 | from8BitPtr = &QString::fromLocal8Bit; |
84 | } else { |
85 | codec = QTextCodec::codecForMib(mib: 106); |
86 | from8BitPtr = &QString::fromUtf8; |
87 | } |
88 | } |
89 | |
90 | QByteArray tst_Utf8::to8Bit(const QString &s) |
91 | { |
92 | QFETCH_GLOBAL(bool, useLocale); |
93 | if (useLocale) |
94 | return s.toLocal8Bit(); |
95 | else |
96 | return s.toUtf8(); |
97 | } |
98 | |
99 | void tst_Utf8::roundTrip_data() |
100 | { |
101 | QTest::addColumn<QByteArray>(name: "utf8" ); |
102 | QTest::addColumn<QString>(name: "utf16" ); |
103 | |
104 | QTest::newRow(dataTag: "empty" ) << QByteArray() << QString(); |
105 | QTest::newRow(dataTag: "nul" ) << QByteArray("" , 1) << QString(QChar(QChar::Null)); |
106 | |
107 | static const char ascii[] = "This is a standard US-ASCII message" ; |
108 | QTest::newRow(dataTag: "ascii" ) << QByteArray(ascii) << QString::fromLatin1(str: ascii); |
109 | |
110 | static const char ascii2[] = "\1This\2is\3an\4US-ASCII\020 message interspersed with control chars" ; |
111 | QTest::newRow(dataTag: "ascii2" ) << QByteArray(ascii2) << QString::fromLatin1(str: ascii2); |
112 | |
113 | static const char utf8_1[] = "\302\240" ; // NBSP |
114 | QTest::newRow(dataTag: "utf8_1" ) << QByteArray(utf8_1) << QString(QChar(QChar::Nbsp)); |
115 | |
116 | static const char utf8_2[] = "\342\202\254" ; // Euro symbol |
117 | QTest::newRow(dataTag: "utf8_2" ) << QByteArray(utf8_2) << QString(QChar(0x20AC)); |
118 | |
119 | #if 0 |
120 | // Can't test this because QString::fromUtf8 consumes it |
121 | static const char utf8_3[] = "\357\273\277" ; // byte order mark |
122 | QTest::newRow("utf8_3" ) << QByteArray(utf8_3) << QString(QChar(QChar::ByteOrderMark)); |
123 | #endif |
124 | |
125 | static const char utf8_4[] = "\357\277\275" ; // replacement char |
126 | QTest::newRow(dataTag: "utf8_4" ) << QByteArray(utf8_4) << QString(QChar(QChar::ReplacementCharacter)); |
127 | |
128 | static const char utf8_5[] = "\360\220\210\203" ; // U+010203 |
129 | static const uint utf32_5[] = { 0x010203 }; |
130 | QTest::newRow(dataTag: "utf8_5" ) << QByteArray(utf8_5) << QString::fromUcs4(utf32_5, size: 1); |
131 | |
132 | static const char utf8_6[] = "\364\217\277\275" ; // U+10FFFD |
133 | static const uint utf32_6[] = { 0x10FFFD }; |
134 | QTest::newRow(dataTag: "utf8_6" ) << QByteArray(utf8_6) << QString::fromUcs4(utf32_6, size: 1); |
135 | |
136 | static const char utf8_7[] = "abc\302\240\303\241\303\251\307\275 \342\202\254def" ; |
137 | static const ushort utf16_7[] = { 'a', 'b', 'c', 0x00A0, |
138 | 0x00E1, 0x00E9, 0x01FD, |
139 | ' ', 0x20AC, 'd', 'e', 'f', 0 }; |
140 | QTest::newRow(dataTag: "utf8_7" ) << QByteArray(utf8_7) << QString::fromUtf16(utf16_7); |
141 | |
142 | static const char utf8_8[] = "abc\302\240\303\241\303\251\307\275 \364\217\277\275 \342\202\254def" ; |
143 | static const uint utf32_8[] = { 'a', 'b', 'c', 0x00A0, |
144 | 0x00E1, 0x00E9, 0x01FD, |
145 | ' ', 0x10FFFD, ' ', |
146 | 0x20AC, 'd', 'e', 'f', 0 }; |
147 | QTest::newRow(dataTag: "utf8_8" ) << QByteArray(utf8_8) << QString::fromUcs4(utf32_8); |
148 | } |
149 | |
150 | void tst_Utf8::roundTrip() |
151 | { |
152 | QFETCH(QByteArray, utf8); |
153 | QFETCH(QString, utf16); |
154 | |
155 | QCOMPARE(to8Bit(utf16), utf8); |
156 | QCOMPARE(from8Bit(utf8), utf16); |
157 | |
158 | QCOMPARE(to8Bit(from8Bit(utf8)), utf8); |
159 | QCOMPARE(from8Bit(to8Bit(utf16)), utf16); |
160 | |
161 | // repeat with a longer message |
162 | utf8.prepend(s: "12345678901234" ); |
163 | utf16.prepend(s: QLatin1String("12345678901234" )); |
164 | QCOMPARE(to8Bit(utf16), utf8); |
165 | QCOMPARE(from8Bit(utf8), utf16); |
166 | |
167 | QCOMPARE(to8Bit(from8Bit(utf8)), utf8); |
168 | QCOMPARE(from8Bit(to8Bit(utf16)), utf16); |
169 | } |
170 | |
171 | void tst_Utf8::charByChar_data() |
172 | { |
173 | roundTrip_data(); |
174 | } |
175 | |
176 | void tst_Utf8::charByChar() |
177 | { |
178 | QFETCH(QByteArray, utf8); |
179 | QFETCH(QString, utf16); |
180 | |
181 | { |
182 | // from utf16 to utf8 char by char: |
183 | const QScopedPointer<QTextEncoder> encoder(codec->makeEncoder()); |
184 | QByteArray encoded; |
185 | |
186 | for (int i = 0; i < utf16.length(); ++i) { |
187 | encoded += encoder->fromUnicode(uc: utf16.constData() + i, len: 1); |
188 | QVERIFY(!encoder->hasFailure()); |
189 | } |
190 | |
191 | if (encoded.startsWith(c: utf8bom)) |
192 | encoded = encoded.mid(index: int(strlen(s: utf8bom))); |
193 | QCOMPARE(encoded, utf8); |
194 | } |
195 | { |
196 | // from utf8 to utf16 char by char: |
197 | const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder()); |
198 | QString decoded; |
199 | |
200 | for (int i = 0; i < utf8.length(); ++i) { |
201 | decoded += decoder->toUnicode(chars: utf8.constData() + i, len: 1); |
202 | QVERIFY(!decoder->hasFailure()); |
203 | } |
204 | |
205 | QCOMPARE(decoded, utf16); |
206 | } |
207 | } |
208 | |
209 | void tst_Utf8::invalidUtf8_data() |
210 | { |
211 | QTest::addColumn<QByteArray>(name: "utf8" ); |
212 | |
213 | extern void loadInvalidUtf8Rows(); |
214 | loadInvalidUtf8Rows(); |
215 | } |
216 | |
217 | void tst_Utf8::invalidUtf8() |
218 | { |
219 | QFETCH(QByteArray, utf8); |
220 | QFETCH_GLOBAL(bool, useLocale); |
221 | |
222 | const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder()); |
223 | decoder->toUnicode(ba: utf8); |
224 | |
225 | // Only enforce correctness on our UTF-8 decoder |
226 | // The system's UTF-8 codec is sometimes buggy |
227 | // GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8 |
228 | // OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF |
229 | if (!useLocale) |
230 | QVERIFY(decoder->hasFailure() || decoder->needsMoreData()); |
231 | else if (!decoder->hasFailure() && !decoder->needsMoreData()) |
232 | qWarning(msg: "System codec does not report failure when it should. Should report bug upstream." ); |
233 | |
234 | // add a continuation character and test that we don't accidentally use it |
235 | // (buffer overrun) |
236 | utf8 += char(0x80 | 0x3f); |
237 | decoder->toUnicode(chars: utf8.constData(), len: utf8.size() - 1); |
238 | if (!useLocale) |
239 | QVERIFY(decoder->hasFailure()); |
240 | else if (!decoder->hasFailure()) |
241 | qWarning(msg: "System codec does not report failure when it should. Should report bug upstream." ); |
242 | } |
243 | |
244 | void tst_Utf8::nonCharacters_data() |
245 | { |
246 | QTest::addColumn<QByteArray>(name: "utf8" ); |
247 | QTest::addColumn<QString>(name: "utf16" ); |
248 | |
249 | extern void loadNonCharactersRows(); |
250 | loadNonCharactersRows(); |
251 | } |
252 | |
253 | void tst_Utf8::nonCharacters() |
254 | { |
255 | QFETCH(QByteArray, utf8); |
256 | QFETCH(QString, utf16); |
257 | QFETCH_GLOBAL(bool, useLocale); |
258 | |
259 | const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder()); |
260 | decoder->toUnicode(ba: utf8); |
261 | |
262 | // Only enforce correctness on our UTF-8 decoder |
263 | if (!useLocale) |
264 | QVERIFY(!decoder->hasFailure()); |
265 | else if (decoder->hasFailure()) |
266 | qWarning(msg: "System codec reports failure when it shouldn't. Should report bug upstream." ); |
267 | |
268 | const QScopedPointer<QTextEncoder> encoder(codec->makeEncoder()); |
269 | encoder->fromUnicode(str: utf16); |
270 | if (!useLocale) |
271 | QVERIFY(!encoder->hasFailure()); |
272 | else if (encoder->hasFailure()) |
273 | qWarning(msg: "System codec reports failure when it shouldn't. Should report bug upstream." ); |
274 | } |
275 | |
276 | QTEST_MAIN(tst_Utf8) |
277 | #include "tst_utf8.moc" |
278 | |