1#include <QtCore/QScopedPointer>
2#include <QtTest/QTest>
3
4#include <poppler-private.h>
5
6#include <cstring>
7
8#include "GlobalParams.h"
9#include "UnicodeTypeTable.h"
10#include "UTF.h"
11
12class TestUTFConversion : public QObject
13{
14 Q_OBJECT
15public:
16 explicit TestUTFConversion(QObject *parent = nullptr) : QObject(parent) { }
17private slots:
18 void testUTF_data();
19 void testUTF();
20 void testUnicodeToAscii7();
21 void testUnicodeLittleEndian();
22};
23
/**
 * Reports whether two NUL-terminated byte strings have identical contents.
 *
 * @param a first string, must be NUL-terminated
 * @param b second string, must be NUL-terminated
 * @return true when both strings hold the same bytes up to their terminator
 */
static bool compare(const char *a, const char *b)
{
    return std::strcmp(a, b) == 0;
}
28
/**
 * Reports whether two zero-terminated sequences of 16-bit units are identical.
 *
 * Both pointers are advanced in lock step until either one reaches its
 * terminator; the sequences match only if they end at the same position.
 */
static bool compare(const uint16_t *a, const uint16_t *b)
{
    for (; *a != 0 && *b != 0; ++a, ++b) {
        if (*a != *b) {
            return false;
        }
    }

    // At least one side hit its terminator: equal only if both did.
    return *a == *b;
}
38
39static bool compare(const Unicode *a, const char *b, int len)
40{
41 for (int i = 0; i < len; i++) {
42 if (a[i] != (Unicode)b[i]) {
43 return false;
44 }
45 }
46
47 return true;
48}
49
50static bool compare(const Unicode *a, const uint16_t *b, int len)
51{
52 for (int i = 0; i < len; i++) {
53 if (a[i] != b[i]) {
54 return false;
55 }
56 }
57
58 return true;
59}
60void TestUTFConversion::testUTF_data()
61{
62 QTest::addColumn<QString>(name: "s");
63
64 QTest::newRow(dataTag: "<empty>") << QString(QLatin1String(""));
65 QTest::newRow(dataTag: "a") << QStringLiteral("a");
66 QTest::newRow(dataTag: "abc") << QStringLiteral("abc");
67 QTest::newRow(dataTag: "Latin") << QStringLiteral("Vitrum edere possum; mihi non nocet");
68 QTest::newRow(dataTag: "Greek") << QStringLiteral("Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα");
69 QTest::newRow(dataTag: "Icelandic") << QStringLiteral("Ég get etið gler án þess að meiða mig");
70 QTest::newRow(dataTag: "Russian") << QStringLiteral("Я могу есть стекло, оно мне не вредит.");
71 QTest::newRow(dataTag: "Sanskrit") << QStringLiteral("काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥");
72 QTest::newRow(dataTag: "Arabic") << QStringLiteral("أنا قادر على أكل الزجاج و هذا لا يؤلمني");
73 QTest::newRow(dataTag: "Chinese") << QStringLiteral("我能吞下玻璃而不伤身体。");
74 QTest::newRow(dataTag: "Thai") << QStringLiteral("ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ");
75 QTest::newRow(dataTag: "non BMP") << QStringLiteral("𝓹𝓸𝓹𝓹𝓵𝓮𝓻");
76}
77
78void TestUTFConversion::testUTF()
79{
80 char utf8Buf[1000];
81 char *utf8String;
82 uint16_t utf16Buf[1000];
83 uint16_t *utf16String;
84 int len;
85
86 QFETCH(QString, s);
87 char *str = strdup(s: s.toUtf8().constData());
88
89 // UTF-8 to UTF-16
90
91 len = utf8CountUtf16CodeUnits(utf8: str);
92 QCOMPARE(len, s.size()); // QString size() returns number of code units, not code points
93 Q_ASSERT(len < (int)sizeof(utf16Buf)); // if this fails, make utf16Buf larger
94
95 len = utf8ToUtf16(utf8: str, utf16: utf16Buf, maxUtf16: sizeof(utf16Buf), INT_MAX);
96 QVERIFY(compare(utf16Buf, s.utf16()));
97 QCOMPARE(len, s.size());
98
99 utf16String = utf8ToUtf16(utf8: str);
100 QVERIFY(compare(utf16String, s.utf16()));
101 free(ptr: utf16String);
102
103 std::string sUtf8(str);
104 std::string gsUtf16_a(utf8ToUtf16WithBom(utf8: sUtf8));
105 std::unique_ptr<GooString> gsUtf16_b(Poppler::QStringToUnicodeGooString(s));
106 QCOMPARE(gsUtf16_b->cmp(gsUtf16_a), 0);
107
108 // UTF-16 to UTF-8
109
110 len = utf16CountUtf8Bytes(utf16: s.utf16());
111 QCOMPARE(len, (int)strlen(str));
112 Q_ASSERT(len < (int)sizeof(utf8Buf)); // if this fails, make utf8Buf larger
113
114 len = utf16ToUtf8(utf16: s.utf16(), utf8: utf8Buf);
115 QVERIFY(compare(utf8Buf, str));
116 QCOMPARE(len, (int)strlen(str));
117
118 utf8String = utf16ToUtf8(utf16: s.utf16());
119 QVERIFY(compare(utf8String, str));
120 free(ptr: utf8String);
121
122 free(ptr: str);
123}
124
125void TestUTFConversion::testUnicodeToAscii7()
126{
127 globalParams = std::make_unique<GlobalParams>();
128
129 // Test string is one 'Registered' and twenty 'Copyright' chars
130 // so it's long enough to reproduce the bug given that glibc
131 // malloc() always returns 8-byte aligned memory addresses.
132 GooString *goo = Poppler::QStringToUnicodeGooString(s: QString::fromUtf8(utf8: "®©©©©©©©©©©©©©©©©©©©©")); // clazy:exclude=qstring-allocations
133
134 const std::vector<Unicode> in = TextStringToUCS4(textStr: goo->toStr());
135
136 delete goo;
137
138 int in_norm_len;
139 int *in_norm_idx;
140 Unicode *in_norm = unicodeNormalizeNFKC(in: in.data(), len: in.size(), out_len: &in_norm_len, indices: &in_norm_idx, reverseRTL: true);
141
142 Unicode *out;
143 int out_len;
144 int *out_ascii_idx;
145
146 unicodeToAscii7(in: in_norm, len: in_norm_len, ucs4_out: &out, out_len: &out_len, in_idx: in_norm_idx, indices: &out_ascii_idx);
147
148 free(ptr: in_norm);
149 free(ptr: in_norm_idx);
150
151 // ascii7 conversion: ® -> (R) © -> (c)
152 const char *expected_ascii = (char *)"(R)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)";
153
154 QCOMPARE(out_len, (int)strlen(expected_ascii));
155 QVERIFY(compare(out, expected_ascii, out_len));
156
157 free(ptr: out);
158 free(ptr: out_ascii_idx);
159}
160
161void TestUTFConversion::testUnicodeLittleEndian()
162{
163 uint16_t UTF16LE_hi[5] { 0xFFFE, 0x4800, 0x4900, 0x2100, 0x1126 }; // UTF16-LE "HI!☑"
164 std::string GooUTF16LE(reinterpret_cast<const char *>(UTF16LE_hi), sizeof(UTF16LE_hi));
165
166 uint16_t UTF16BE_hi[5] { 0xFEFF, 0x0048, 0x0049, 0x0021, 0x2611 }; // UTF16-BE "HI!☑"
167 std::string GooUTF16BE(reinterpret_cast<const char *>(UTF16BE_hi), sizeof(UTF16BE_hi));
168
169 // Let's assert both GooString's are different
170 QVERIFY(GooUTF16LE != GooUTF16BE);
171
172 const std::vector<Unicode> UCS4fromLE = TextStringToUCS4(textStr: GooUTF16LE);
173 const std::vector<Unicode> UCS4fromBE = TextStringToUCS4(textStr: GooUTF16BE);
174
175 // len is 4 because TextStringToUCS4() removes the two leading Byte Order Mark (BOM) code points
176 QCOMPARE(UCS4fromLE.size(), UCS4fromBE.size());
177 QCOMPARE(UCS4fromLE.size(), 4);
178
179 // Check that now after conversion, UCS4fromLE and UCS4fromBE are now the same
180 for (size_t i = 0; i < UCS4fromLE.size(); i++) {
181 QCOMPARE(UCS4fromLE[i], UCS4fromBE[i]);
182 }
183
184 const QString expected = QStringLiteral("HI!☑");
185
186 // Do some final verifications, checking the strings to be "HI!"
187 QVERIFY(UCS4fromLE == UCS4fromBE);
188 QVERIFY(compare(UCS4fromLE.data(), expected.utf16(), UCS4fromLE.size()));
189 QVERIFY(compare(UCS4fromBE.data(), expected.utf16(), UCS4fromBE.size()));
190}
191
// Test program entry point: Qt Test macro that runs the TestUTFConversion slots.
QTEST_GUILESS_MAIN(TestUTFConversion)
// Generated moc output for the Q_OBJECT class declared in this .cpp file.
#include "check_utf_conversion.moc"
194

// source code of poppler/qt6/tests/check_utf_conversion.cpp