1 | #include <QtCore/QScopedPointer> |
2 | #include <QtTest/QTest> |
3 | |
4 | #include <poppler-private.h> |
5 | |
6 | #include <cstring> |
7 | |
8 | #include "GlobalParams.h" |
9 | #include "UnicodeTypeTable.h" |
10 | #include "UTF.h" |
11 | |
12 | class TestUTFConversion : public QObject |
13 | { |
14 | Q_OBJECT |
15 | public: |
16 | explicit TestUTFConversion(QObject *parent = nullptr) : QObject(parent) { } |
17 | private slots: |
18 | void testUTF_data(); |
19 | void testUTF(); |
20 | void testUnicodeToAscii7(); |
21 | void testUnicodeLittleEndian(); |
22 | }; |
23 | |
// Returns true when the two NUL-terminated byte strings have identical
// contents (same bytes, same length).
static bool compare(const char *a, const char *b)
{
    const int difference = strcmp(a, b);
    return difference == 0;
}
28 | |
// Returns true when the two zero-terminated UTF-16 strings hold the same
// sequence of code units.
static bool compare(const uint16_t *a, const uint16_t *b)
{
    // Walk both strings in lock-step until either one reaches its terminator.
    for (; *a != 0 && *b != 0; ++a, ++b) {
        if (*a != *b) {
            return false;
        }
    }

    // At least one string has ended here; they match only if both have.
    return *a == *b;
}
38 | |
39 | static bool compare(const Unicode *a, const char *b, int len) |
40 | { |
41 | for (int i = 0; i < len; i++) { |
42 | if (a[i] != (Unicode)b[i]) { |
43 | return false; |
44 | } |
45 | } |
46 | |
47 | return true; |
48 | } |
49 | |
50 | static bool compare(const Unicode *a, const uint16_t *b, int len) |
51 | { |
52 | for (int i = 0; i < len; i++) { |
53 | if (a[i] != b[i]) { |
54 | return false; |
55 | } |
56 | } |
57 | |
58 | return true; |
59 | } |
60 | void TestUTFConversion::testUTF_data() |
61 | { |
62 | QTest::addColumn<QString>(name: "s" ); |
63 | |
64 | QTest::newRow(dataTag: "<empty>" ) << QString(QLatin1String("" )); |
65 | QTest::newRow(dataTag: "a" ) << QStringLiteral("a" ); |
66 | QTest::newRow(dataTag: "abc" ) << QStringLiteral("abc" ); |
67 | QTest::newRow(dataTag: "Latin" ) << QStringLiteral("Vitrum edere possum; mihi non nocet" ); |
68 | QTest::newRow(dataTag: "Greek" ) << QStringLiteral("Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα" ); |
69 | QTest::newRow(dataTag: "Icelandic" ) << QStringLiteral("Ég get etið gler án þess að meiða mig" ); |
70 | QTest::newRow(dataTag: "Russian" ) << QStringLiteral("Я могу есть стекло, оно мне не вредит." ); |
71 | QTest::newRow(dataTag: "Sanskrit" ) << QStringLiteral("काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥" ); |
72 | QTest::newRow(dataTag: "Arabic" ) << QStringLiteral("أنا قادر على أكل الزجاج و هذا لا يؤلمني" ); |
73 | QTest::newRow(dataTag: "Chinese" ) << QStringLiteral("我能吞下玻璃而不伤身体。" ); |
74 | QTest::newRow(dataTag: "Thai" ) << QStringLiteral("ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ" ); |
75 | QTest::newRow(dataTag: "non BMP" ) << QStringLiteral("𝓹𝓸𝓹𝓹𝓵𝓮𝓻" ); |
76 | } |
77 | |
78 | void TestUTFConversion::testUTF() |
79 | { |
80 | char utf8Buf[1000]; |
81 | char *utf8String; |
82 | uint16_t utf16Buf[1000]; |
83 | uint16_t *utf16String; |
84 | int len; |
85 | |
86 | QFETCH(QString, s); |
87 | char *str = strdup(s: s.toUtf8().constData()); |
88 | |
89 | // UTF-8 to UTF-16 |
90 | |
91 | len = utf8CountUtf16CodeUnits(utf8: str); |
92 | QCOMPARE(len, s.size()); // QString size() returns number of code units, not code points |
93 | Q_ASSERT(len < (int)sizeof(utf16Buf)); // if this fails, make utf16Buf larger |
94 | |
95 | len = utf8ToUtf16(utf8: str, utf16: utf16Buf, maxUtf16: sizeof(utf16Buf), INT_MAX); |
96 | QVERIFY(compare(utf16Buf, s.utf16())); |
97 | QCOMPARE(len, s.size()); |
98 | |
99 | utf16String = utf8ToUtf16(utf8: str); |
100 | QVERIFY(compare(utf16String, s.utf16())); |
101 | free(ptr: utf16String); |
102 | |
103 | std::string sUtf8(str); |
104 | std::string gsUtf16_a(utf8ToUtf16WithBom(utf8: sUtf8)); |
105 | std::unique_ptr<GooString> gsUtf16_b(Poppler::QStringToUnicodeGooString(s)); |
106 | QCOMPARE(gsUtf16_b->cmp(gsUtf16_a), 0); |
107 | |
108 | // UTF-16 to UTF-8 |
109 | |
110 | len = utf16CountUtf8Bytes(utf16: s.utf16()); |
111 | QCOMPARE(len, (int)strlen(str)); |
112 | Q_ASSERT(len < (int)sizeof(utf8Buf)); // if this fails, make utf8Buf larger |
113 | |
114 | len = utf16ToUtf8(utf16: s.utf16(), utf8: utf8Buf); |
115 | QVERIFY(compare(utf8Buf, str)); |
116 | QCOMPARE(len, (int)strlen(str)); |
117 | |
118 | utf8String = utf16ToUtf8(utf16: s.utf16()); |
119 | QVERIFY(compare(utf8String, str)); |
120 | free(ptr: utf8String); |
121 | |
122 | free(ptr: str); |
123 | } |
124 | |
125 | void TestUTFConversion::testUnicodeToAscii7() |
126 | { |
127 | globalParams = std::make_unique<GlobalParams>(); |
128 | |
129 | // Test string is one 'Registered' and twenty 'Copyright' chars |
130 | // so it's long enough to reproduce the bug given that glibc |
131 | // malloc() always returns 8-byte aligned memory addresses. |
132 | GooString *goo = Poppler::QStringToUnicodeGooString(s: QString::fromUtf8(utf8: "®©©©©©©©©©©©©©©©©©©©©" )); // clazy:exclude=qstring-allocations |
133 | |
134 | const std::vector<Unicode> in = TextStringToUCS4(textStr: goo->toStr()); |
135 | |
136 | delete goo; |
137 | |
138 | int in_norm_len; |
139 | int *in_norm_idx; |
140 | Unicode *in_norm = unicodeNormalizeNFKC(in: in.data(), len: in.size(), out_len: &in_norm_len, indices: &in_norm_idx, reverseRTL: true); |
141 | |
142 | Unicode *out; |
143 | int out_len; |
144 | int *out_ascii_idx; |
145 | |
146 | unicodeToAscii7(in: in_norm, len: in_norm_len, ucs4_out: &out, out_len: &out_len, in_idx: in_norm_idx, indices: &out_ascii_idx); |
147 | |
148 | free(ptr: in_norm); |
149 | free(ptr: in_norm_idx); |
150 | |
151 | // ascii7 conversion: ® -> (R) © -> (c) |
152 | const char *expected_ascii = (char *)"(R)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)" ; |
153 | |
154 | QCOMPARE(out_len, (int)strlen(expected_ascii)); |
155 | QVERIFY(compare(out, expected_ascii, out_len)); |
156 | |
157 | free(ptr: out); |
158 | free(ptr: out_ascii_idx); |
159 | } |
160 | |
161 | void TestUTFConversion::testUnicodeLittleEndian() |
162 | { |
163 | uint16_t UTF16LE_hi[5] { 0xFFFE, 0x4800, 0x4900, 0x2100, 0x1126 }; // UTF16-LE "HI!☑" |
164 | std::string GooUTF16LE(reinterpret_cast<const char *>(UTF16LE_hi), sizeof(UTF16LE_hi)); |
165 | |
166 | uint16_t UTF16BE_hi[5] { 0xFEFF, 0x0048, 0x0049, 0x0021, 0x2611 }; // UTF16-BE "HI!☑" |
167 | std::string GooUTF16BE(reinterpret_cast<const char *>(UTF16BE_hi), sizeof(UTF16BE_hi)); |
168 | |
169 | // Let's assert both GooString's are different |
170 | QVERIFY(GooUTF16LE != GooUTF16BE); |
171 | |
172 | const std::vector<Unicode> UCS4fromLE = TextStringToUCS4(textStr: GooUTF16LE); |
173 | const std::vector<Unicode> UCS4fromBE = TextStringToUCS4(textStr: GooUTF16BE); |
174 | |
175 | // len is 4 because TextStringToUCS4() removes the two leading Byte Order Mark (BOM) code points |
176 | QCOMPARE(UCS4fromLE.size(), UCS4fromBE.size()); |
177 | QCOMPARE(UCS4fromLE.size(), 4); |
178 | |
179 | // Check that now after conversion, UCS4fromLE and UCS4fromBE are now the same |
180 | for (size_t i = 0; i < UCS4fromLE.size(); i++) { |
181 | QCOMPARE(UCS4fromLE[i], UCS4fromBE[i]); |
182 | } |
183 | |
184 | const QString expected = QStringLiteral("HI!☑" ); |
185 | |
186 | // Do some final verifications, checking the strings to be "HI!" |
187 | QVERIFY(UCS4fromLE == UCS4fromBE); |
188 | QVERIFY(compare(UCS4fromLE.data(), expected.utf16(), UCS4fromLE.size())); |
189 | QVERIFY(compare(UCS4fromBE.data(), expected.utf16(), UCS4fromBE.size())); |
190 | } |
191 | |
192 | QTEST_GUILESS_MAIN(TestUTFConversion) |
193 | #include "check_utf_conversion.moc" |
194 | |