| 1 | /**************************************************************************** |
| 2 | ** |
| 3 | ** Copyright (C) 2018 The Qt Company Ltd. |
| 4 | ** Copyright (C) 2018 Intel Corporation. |
| 5 | ** Contact: https://www.qt.io/licensing/ |
| 6 | ** |
| 7 | ** This file is part of the test suite of the Qt Toolkit. |
| 8 | ** |
| 9 | ** $QT_BEGIN_LICENSE:GPL-EXCEPT$ |
| 10 | ** Commercial License Usage |
| 11 | ** Licensees holding valid commercial Qt licenses may use this file in |
| 12 | ** accordance with the commercial license agreement provided with the |
| 13 | ** Software or, alternatively, in accordance with the terms contained in |
| 14 | ** a written agreement between you and The Qt Company. For licensing terms |
| 15 | ** and conditions see https://www.qt.io/terms-conditions. For further |
| 16 | ** information use the contact form at https://www.qt.io/contact-us. |
| 17 | ** |
| 18 | ** GNU General Public License Usage |
| 19 | ** Alternatively, this file may be used under the terms of the GNU |
| 20 | ** General Public License version 3 as published by the Free Software |
| 21 | ** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT |
| 22 | ** included in the packaging of this file. Please review the following |
| 23 | ** information to ensure the GNU General Public License requirements will |
| 24 | ** be met: https://www.gnu.org/licenses/gpl-3.0.html. |
| 25 | ** |
| 26 | ** $QT_END_LICENSE$ |
| 27 | ** |
| 28 | ****************************************************************************/ |
| 29 | #include <QtTest/QtTest> |
| 30 | |
| 31 | #include <qtextcodec.h> |
| 32 | #include <QScopedPointer> |
| 33 | |
// UTF-8 byte order mark (bytes EF BB BF), NUL-terminated.
static const char utf8bom[] = "\357\273\277";
| 35 | |
| 36 | class tst_Utf8 : public QObject |
| 37 | { |
| 38 | Q_OBJECT |
| 39 | |
| 40 | public: |
| 41 | // test data: |
| 42 | QTextCodec *codec; |
| 43 | QString (*from8BitPtr)(const char *, int); |
| 44 | static QByteArray to8Bit(const QString &); |
| 45 | |
| 46 | inline QString from8Bit(const QByteArray &ba) |
| 47 | { return from8BitPtr(ba.constData(), ba.length()); } |
| 48 | public slots: |
| 49 | void initTestCase(); |
| 50 | void init(); |
| 51 | |
| 52 | private slots: |
| 53 | void roundTrip_data(); |
| 54 | void roundTrip(); |
| 55 | |
| 56 | void charByChar_data(); |
| 57 | void charByChar(); |
| 58 | |
| 59 | void invalidUtf8_data(); |
| 60 | void invalidUtf8(); |
| 61 | |
| 62 | void nonCharacters_data(); |
| 63 | void nonCharacters(); |
| 64 | }; |
| 65 | |
| 66 | void tst_Utf8::initTestCase() |
| 67 | { |
| 68 | QTest::addColumn<bool>(name: "useLocale" ); |
| 69 | QTest::newRow(dataTag: "utf8codec" ) << false; |
| 70 | |
| 71 | // is the locale UTF-8? |
| 72 | if (QString(QChar(QChar::ReplacementCharacter)).toLocal8Bit() == "\xEF\xBF\xBD" ) { |
| 73 | QTest::newRow(dataTag: "localecodec" ) << true; |
| 74 | qInfo() << "locale is utf8" ; |
| 75 | } |
| 76 | } |
| 77 | |
| 78 | void tst_Utf8::init() |
| 79 | { |
| 80 | QFETCH_GLOBAL(bool, useLocale); |
| 81 | if (useLocale) { |
| 82 | codec = QTextCodec::codecForLocale(); |
| 83 | from8BitPtr = &QString::fromLocal8Bit; |
| 84 | } else { |
| 85 | codec = QTextCodec::codecForMib(mib: 106); |
| 86 | from8BitPtr = &QString::fromUtf8; |
| 87 | } |
| 88 | } |
| 89 | |
| 90 | QByteArray tst_Utf8::to8Bit(const QString &s) |
| 91 | { |
| 92 | QFETCH_GLOBAL(bool, useLocale); |
| 93 | if (useLocale) |
| 94 | return s.toLocal8Bit(); |
| 95 | else |
| 96 | return s.toUtf8(); |
| 97 | } |
| 98 | |
| 99 | void tst_Utf8::roundTrip_data() |
| 100 | { |
| 101 | QTest::addColumn<QByteArray>(name: "utf8" ); |
| 102 | QTest::addColumn<QString>(name: "utf16" ); |
| 103 | |
| 104 | QTest::newRow(dataTag: "empty" ) << QByteArray() << QString(); |
| 105 | QTest::newRow(dataTag: "nul" ) << QByteArray("" , 1) << QString(QChar(QChar::Null)); |
| 106 | |
| 107 | static const char ascii[] = "This is a standard US-ASCII message" ; |
| 108 | QTest::newRow(dataTag: "ascii" ) << QByteArray(ascii) << QString::fromLatin1(str: ascii); |
| 109 | |
| 110 | static const char ascii2[] = "\1This\2is\3an\4US-ASCII\020 message interspersed with control chars" ; |
| 111 | QTest::newRow(dataTag: "ascii2" ) << QByteArray(ascii2) << QString::fromLatin1(str: ascii2); |
| 112 | |
| 113 | static const char utf8_1[] = "\302\240" ; // NBSP |
| 114 | QTest::newRow(dataTag: "utf8_1" ) << QByteArray(utf8_1) << QString(QChar(QChar::Nbsp)); |
| 115 | |
| 116 | static const char utf8_2[] = "\342\202\254" ; // Euro symbol |
| 117 | QTest::newRow(dataTag: "utf8_2" ) << QByteArray(utf8_2) << QString(QChar(0x20AC)); |
| 118 | |
| 119 | #if 0 |
| 120 | // Can't test this because QString::fromUtf8 consumes it |
| 121 | static const char utf8_3[] = "\357\273\277" ; // byte order mark |
| 122 | QTest::newRow("utf8_3" ) << QByteArray(utf8_3) << QString(QChar(QChar::ByteOrderMark)); |
| 123 | #endif |
| 124 | |
| 125 | static const char utf8_4[] = "\357\277\275" ; // replacement char |
| 126 | QTest::newRow(dataTag: "utf8_4" ) << QByteArray(utf8_4) << QString(QChar(QChar::ReplacementCharacter)); |
| 127 | |
| 128 | static const char utf8_5[] = "\360\220\210\203" ; // U+010203 |
| 129 | static const uint utf32_5[] = { 0x010203 }; |
| 130 | QTest::newRow(dataTag: "utf8_5" ) << QByteArray(utf8_5) << QString::fromUcs4(utf32_5, size: 1); |
| 131 | |
| 132 | static const char utf8_6[] = "\364\217\277\275" ; // U+10FFFD |
| 133 | static const uint utf32_6[] = { 0x10FFFD }; |
| 134 | QTest::newRow(dataTag: "utf8_6" ) << QByteArray(utf8_6) << QString::fromUcs4(utf32_6, size: 1); |
| 135 | |
| 136 | static const char utf8_7[] = "abc\302\240\303\241\303\251\307\275 \342\202\254def" ; |
| 137 | static const ushort utf16_7[] = { 'a', 'b', 'c', 0x00A0, |
| 138 | 0x00E1, 0x00E9, 0x01FD, |
| 139 | ' ', 0x20AC, 'd', 'e', 'f', 0 }; |
| 140 | QTest::newRow(dataTag: "utf8_7" ) << QByteArray(utf8_7) << QString::fromUtf16(utf16_7); |
| 141 | |
| 142 | static const char utf8_8[] = "abc\302\240\303\241\303\251\307\275 \364\217\277\275 \342\202\254def" ; |
| 143 | static const uint utf32_8[] = { 'a', 'b', 'c', 0x00A0, |
| 144 | 0x00E1, 0x00E9, 0x01FD, |
| 145 | ' ', 0x10FFFD, ' ', |
| 146 | 0x20AC, 'd', 'e', 'f', 0 }; |
| 147 | QTest::newRow(dataTag: "utf8_8" ) << QByteArray(utf8_8) << QString::fromUcs4(utf32_8); |
| 148 | } |
| 149 | |
| 150 | void tst_Utf8::roundTrip() |
| 151 | { |
| 152 | QFETCH(QByteArray, utf8); |
| 153 | QFETCH(QString, utf16); |
| 154 | |
| 155 | QCOMPARE(to8Bit(utf16), utf8); |
| 156 | QCOMPARE(from8Bit(utf8), utf16); |
| 157 | |
| 158 | QCOMPARE(to8Bit(from8Bit(utf8)), utf8); |
| 159 | QCOMPARE(from8Bit(to8Bit(utf16)), utf16); |
| 160 | |
| 161 | // repeat with a longer message |
| 162 | utf8.prepend(s: "12345678901234" ); |
| 163 | utf16.prepend(s: QLatin1String("12345678901234" )); |
| 164 | QCOMPARE(to8Bit(utf16), utf8); |
| 165 | QCOMPARE(from8Bit(utf8), utf16); |
| 166 | |
| 167 | QCOMPARE(to8Bit(from8Bit(utf8)), utf8); |
| 168 | QCOMPARE(from8Bit(to8Bit(utf16)), utf16); |
| 169 | } |
| 170 | |
| 171 | void tst_Utf8::charByChar_data() |
| 172 | { |
| 173 | roundTrip_data(); |
| 174 | } |
| 175 | |
| 176 | void tst_Utf8::charByChar() |
| 177 | { |
| 178 | QFETCH(QByteArray, utf8); |
| 179 | QFETCH(QString, utf16); |
| 180 | |
| 181 | { |
| 182 | // from utf16 to utf8 char by char: |
| 183 | const QScopedPointer<QTextEncoder> encoder(codec->makeEncoder()); |
| 184 | QByteArray encoded; |
| 185 | |
| 186 | for (int i = 0; i < utf16.length(); ++i) { |
| 187 | encoded += encoder->fromUnicode(uc: utf16.constData() + i, len: 1); |
| 188 | QVERIFY(!encoder->hasFailure()); |
| 189 | } |
| 190 | |
| 191 | if (encoded.startsWith(c: utf8bom)) |
| 192 | encoded = encoded.mid(index: int(strlen(s: utf8bom))); |
| 193 | QCOMPARE(encoded, utf8); |
| 194 | } |
| 195 | { |
| 196 | // from utf8 to utf16 char by char: |
| 197 | const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder()); |
| 198 | QString decoded; |
| 199 | |
| 200 | for (int i = 0; i < utf8.length(); ++i) { |
| 201 | decoded += decoder->toUnicode(chars: utf8.constData() + i, len: 1); |
| 202 | QVERIFY(!decoder->hasFailure()); |
| 203 | } |
| 204 | |
| 205 | QCOMPARE(decoded, utf16); |
| 206 | } |
| 207 | } |
| 208 | |
| 209 | void tst_Utf8::invalidUtf8_data() |
| 210 | { |
| 211 | QTest::addColumn<QByteArray>(name: "utf8" ); |
| 212 | |
| 213 | extern void loadInvalidUtf8Rows(); |
| 214 | loadInvalidUtf8Rows(); |
| 215 | } |
| 216 | |
| 217 | void tst_Utf8::invalidUtf8() |
| 218 | { |
| 219 | QFETCH(QByteArray, utf8); |
| 220 | QFETCH_GLOBAL(bool, useLocale); |
| 221 | |
| 222 | const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder()); |
| 223 | decoder->toUnicode(ba: utf8); |
| 224 | |
| 225 | // Only enforce correctness on our UTF-8 decoder |
| 226 | // The system's UTF-8 codec is sometimes buggy |
| 227 | // GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8 |
| 228 | // OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF |
| 229 | if (!useLocale) |
| 230 | QVERIFY(decoder->hasFailure() || decoder->needsMoreData()); |
| 231 | else if (!decoder->hasFailure() && !decoder->needsMoreData()) |
| 232 | qWarning(msg: "System codec does not report failure when it should. Should report bug upstream." ); |
| 233 | |
| 234 | // add a continuation character and test that we don't accidentally use it |
| 235 | // (buffer overrun) |
| 236 | utf8 += char(0x80 | 0x3f); |
| 237 | decoder->toUnicode(chars: utf8.constData(), len: utf8.size() - 1); |
| 238 | if (!useLocale) |
| 239 | QVERIFY(decoder->hasFailure()); |
| 240 | else if (!decoder->hasFailure()) |
| 241 | qWarning(msg: "System codec does not report failure when it should. Should report bug upstream." ); |
| 242 | } |
| 243 | |
| 244 | void tst_Utf8::nonCharacters_data() |
| 245 | { |
| 246 | QTest::addColumn<QByteArray>(name: "utf8" ); |
| 247 | QTest::addColumn<QString>(name: "utf16" ); |
| 248 | |
| 249 | extern void loadNonCharactersRows(); |
| 250 | loadNonCharactersRows(); |
| 251 | } |
| 252 | |
| 253 | void tst_Utf8::nonCharacters() |
| 254 | { |
| 255 | QFETCH(QByteArray, utf8); |
| 256 | QFETCH(QString, utf16); |
| 257 | QFETCH_GLOBAL(bool, useLocale); |
| 258 | |
| 259 | const QScopedPointer<QTextDecoder> decoder(codec->makeDecoder()); |
| 260 | decoder->toUnicode(ba: utf8); |
| 261 | |
| 262 | // Only enforce correctness on our UTF-8 decoder |
| 263 | if (!useLocale) |
| 264 | QVERIFY(!decoder->hasFailure()); |
| 265 | else if (decoder->hasFailure()) |
| 266 | qWarning(msg: "System codec reports failure when it shouldn't. Should report bug upstream." ); |
| 267 | |
| 268 | const QScopedPointer<QTextEncoder> encoder(codec->makeEncoder()); |
| 269 | encoder->fromUnicode(str: utf16); |
| 270 | if (!useLocale) |
| 271 | QVERIFY(!encoder->hasFailure()); |
| 272 | else if (encoder->hasFailure()) |
| 273 | qWarning(msg: "System codec reports failure when it shouldn't. Should report bug upstream." ); |
| 274 | } |
| 275 | |
// Entry point of the test executable: QTEST_MAIN supplies main() for tst_Utf8.
QTEST_MAIN(tst_Utf8)
// moc output for the Q_OBJECT class declared in this source file.
#include "tst_utf8.moc"
| 278 | |