1/*
2 This file is part of the KDE libraries
3
4 SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org>
5 SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
6 SPDX-FileCopyrightText: 2007 Nick Shaforostoff <shafff@ukr.net>
7
8 SPDX-License-Identifier: LGPL-2.0-or-later
9*/
10#include "kcharsets.h"
11#include "kcharsets_p.h"
12#include "kcodecs_debug.h"
13
14#include <kentities.h>
15
16#include <QHash>
17
18#include <algorithm>
19#include <assert.h>
20
21/*
22 * The encoding names (like "ISO 8859-1") in this list are user-visible,
23 * and should be mostly uppercase.
24 * Generate with generate_string_table.pl (located in kde-dev-scripts),
25 * input data:
26ISO 8859-1
27i18n:Western European
28ISO 8859-15
29i18n:Western European
30ISO 8859-14
31i18n:Western European
32cp 1252
33i18n:Western European
34IBM850
35i18n:Western European
36ISO 8859-2
37i18n:Central European
38ISO 8859-3
39i18n:Central European
40ISO 8859-4
41i18n:Baltic
42ISO 8859-13
43i18n:Baltic
44ISO 8859-16
45i18n:South-Eastern Europe
46cp 1250
47i18n:Central European
48cp 1254
49i18n:Turkish
50cp 1257
51i18n:Baltic
52KOI8-R
53i18n:Cyrillic
54ISO 8859-5
55i18n:Cyrillic
56cp 1251
57i18n:Cyrillic
58KOI8-U
59i18n:Cyrillic
60IBM866
61i18n:Cyrillic
62Big5
63i18n:Chinese Traditional
64Big5-HKSCS
65i18n:Chinese Traditional
66GB18030
67i18n:Chinese Simplified
68GBK
69i18n:Chinese Simplified
70GB2312
71i18n:Chinese Simplified
72EUC-KR
73i18n:Korean
74windows-949
75i18n:Korean
76sjis
77i18n:Japanese
78ISO-2022-JP
79i18n:Japanese
80EUC-JP
81i18n:Japanese
82ISO 8859-7
83i18n:Greek
84cp 1253
85i18n:Greek
86ISO 8859-6
87i18n:Arabic
88cp 1256
89i18n:Arabic
90ISO 8859-8
91i18n:Hebrew
92ISO 8859-8-I
93i18n:Hebrew
94cp 1255
95i18n:Hebrew
96ISO 8859-9
97i18n:Turkish
98TIS620
99i18n:Thai
100ISO 8859-11
101i18n:Thai
102UTF-8
103i18n:Unicode
104UTF-16
105i18n:Unicode
106utf7
107i18n:Unicode
108ucs2
109i18n:Unicode
110ISO 10646-UCS-2
111i18n:Unicode
112windows-1258
113i18n:Other
114IBM874
115i18n:Other
116TSCII
117i18n:Other
118 */
119/*
120 * Notes about the table:
121 *
122 * - The following entries were disabled and removed from the table:
123ibm852
124i18n:Central European
125pt 154
126i18n:Cyrillic // ### TODO "PT 154" seems to have been removed from Qt
127 *
128 * - ISO 8859-11 is the deprecated name of TIS-620
129 * - utf7 is not in Qt
130 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
131 * - windows-1258: TODO
132 * - IBM874: TODO
133 * - TSCII: TODO
134 */
135
136/*
137 * This redefines the QT_TRANSLATE_NOOP3 macro provided by Qt to indicate that
138 * statically initialised text should be translated so that it expands to just
139 * the string that should be translated, making it possible to use it in the
140 * single string construct below.
141 */
142#undef QT_TRANSLATE_NOOP3
143#define QT_TRANSLATE_NOOP3(a, b, c) b
144
145/*
146 * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
147 * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
148 * It was then edited to use QT_TRANSLATE_NOOP3 instead of I18N_NOOP.
149 */
150
/*
 * Flattened table of user-visible encoding names and the script/language
 * description each one belongs to. Every entry is NUL-terminated and the
 * table ends with an empty string.
 * QT_TRANSLATE_NOOP3 is redefined earlier in this file to expand to its second
 * argument, so each description is stored here as a plain literal; the
 * functions further down pass these literals to tr().
 * Entries are addressed by byte offset via language_for_encoding_indices.
 */
151static const char language_for_encoding_string[] =
152 "ISO 8859-1\0"
153 QT_TRANSLATE_NOOP3("KCharsets", "Western European", "@item Text character set")"\0"
154 "ISO 8859-15\0"
155 "ISO 8859-14\0"
156 "cp 1252\0"
157 "IBM850\0"
158 "ISO 8859-2\0"
159 QT_TRANSLATE_NOOP3("KCharsets", "Central European", "@item Text character set")"\0"
160 "ISO 8859-3\0"
161 "ISO 8859-4\0"
162 QT_TRANSLATE_NOOP3("KCharsets", "Baltic", "@item Text character set")"\0"
163 "ISO 8859-13\0"
164 "ISO 8859-16\0"
165 QT_TRANSLATE_NOOP3("KCharsets", "South-Eastern Europe", "@item Text character set")"\0"
166 "cp 1250\0"
167 "cp 1254\0"
168 QT_TRANSLATE_NOOP3("KCharsets", "Turkish", "@item Text character set")"\0"
169 "cp 1257\0"
170 "KOI8-R\0"
171 QT_TRANSLATE_NOOP3("KCharsets", "Cyrillic", "@item Text character set")"\0"
172 "ISO 8859-5\0"
173 "cp 1251\0"
174 "KOI8-U\0"
175 "IBM866\0"
176 "Big5\0"
177 QT_TRANSLATE_NOOP3("KCharsets", "Chinese Traditional", "@item Text character set")"\0"
178 "Big5-HKSCS\0"
179 "GB18030\0"
180 QT_TRANSLATE_NOOP3("KCharsets", "Chinese Simplified", "@item Text character set")"\0"
181 "GBK\0"
182 "GB2312\0"
183 "EUC-KR\0"
184 QT_TRANSLATE_NOOP3("KCharsets", "Korean", "@item Text character set")"\0"
185 "windows-949\0"
186 "sjis\0"
187 QT_TRANSLATE_NOOP3("KCharsets", "Japanese", "@item Text character set")"\0"
188 "ISO-2022-JP\0"
189 "EUC-JP\0"
190 "ISO 8859-7\0"
191 QT_TRANSLATE_NOOP3("KCharsets", "Greek", "@item Text character set")"\0"
192 "cp 1253\0"
193 "ISO 8859-6\0"
194 QT_TRANSLATE_NOOP3("KCharsets", "Arabic", "@item Text character set")"\0"
195 "cp 1256\0"
196 "ISO 8859-8\0"
197 QT_TRANSLATE_NOOP3("KCharsets", "Hebrew", "@item Text character set")"\0"
198 "ISO 8859-8-I\0"
199 "cp 1255\0"
200 "ISO 8859-9\0"
201 "TIS620\0"
202 QT_TRANSLATE_NOOP3("KCharsets", "Thai", "@item Text character set")"\0"
203 "ISO 8859-11\0"
204 "UTF-8\0"
205 QT_TRANSLATE_NOOP3("KCharsets", "Unicode", "@item Text character set")"\0"
206 "UTF-16\0"
207 "utf7\0"
208 "ucs2\0"
209 "ISO 10646-UCS-2\0"
210 "windows-1258\0"
211 QT_TRANSLATE_NOOP3("KCharsets", "Other", "@item Text character set")"\0"
212 "IBM874\0"
213 "TSCII\0"
214 "\0";
215
/*
 * Byte offsets into language_for_encoding_string, stored as consecutive pairs:
 * (offset of the encoding name, offset of its description). The list ends with
 * -1. Encodings of the same script share one description offset (for example
 * 11 for "Western European").
 */
216static const int language_for_encoding_indices[] = {
217 0, 11, 28, 11, 40, 11, 52, 11, 60, 11, 67, 78, 95, 78, 106, 117, 124, 117, 136, 148, 169, 78, 177, 185, 193, 117, 201, 208, 217, 208, 228,
218 208, 236, 208, 243, 208, 250, 255, 275, 255, 286, 294, 313, 294, 317, 294, 324, 331, 338, 331, 350, 355, 364, 355, 376, 355, 383, 394, 400, 394, 408, 419,
219 426, 419, 434, 445, 452, 445, 465, 445, 473, 185, 484, 491, 496, 491, 508, 514, 522, 514, 529, 514, 534, 514, 539, 514, 555, 568, 574, 568, 581, 568, -1};
220
221/*
222 * GENERATED CODE ENDS HERE
223 */
224
225/*
226 * defines some different names for codecs that are built into Qt.
227 * The names in this list must be lower-case.
228 * input data for generate_string_table.pl:
229iso-ir-111
230koi8-r
231koi unified
232koi8-r
233us-ascii
234iso 8859-1
235usascii
236iso 8859-1
237ascii
238iso 8859-1
239unicode-1-1-utf-7
240utf-7
241ucs2
242iso-10646-ucs-2
243iso10646-1
244iso-10646-ucs-2
245gb18030.2000-1
246gb18030
247gb18030.2000-0
248gb18030
249gbk-0
250gbk
251gb2312
252gbk
253gb2312.1980-0
254gbk
255big5-0
256big5
257euc-kr
258euckr
259cp 949
260windows-949
261euc-jp
262eucjp
263jisx0201.1976-0
264eucjp
265jisx0208.1983-0
266eucjp
267jisx0208.1990-0
268eucjp
269jisx0208.1997-0
270eucjp
271jisx0212.1990-0
272eucjp
273jisx0213.2000-1
274eucjp
275jisx0213.2000-2
276eucjp
277shift_jis
278sjis
279shift-jis
280sjis
281sjis
282sjis
283iso-2022-jp
284jis7
285windows850
286ibm850
287windows866
288ibm866
289windows-850
290ibm850
291windows-866
292ibm866
293cp-10000
294apple roman
295thai-tis620
296iso 8859-11
297windows-874
298ibm874
299windows874
300ibm874
301cp-874
302ibm874
303ksc5601.1987-0
304euckr
305ks_c_5601-1987
306euckr
307mac-roman
308apple roman
309macintosh
310apple roman
311mac
312apple roman
313csiso2022jp
314iso-2022-jp
315*/
316/*
317 * Notes about the table:
318 * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set)
319 * - utf7 is not in Qt
320 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
321 * - sjis: appears on the table for x-sjis
322 * - jis7: ISO-2022-JP is now the default name in Qt4
323 * - cp-874: is it really needed?
324 * - mac-roman: appears on the table for x-mac-roman
325 * - csiso2022jp: See bug #77243
326 */
327
328/*
329 * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
330 * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
331 */
332
/*
 * Flattened table of lower-case alias names and the codec names they stand
 * for. Every entry is NUL-terminated and the table ends with an empty string.
 * Entries are addressed by byte offset via builtin_indices.
 * NOTE(review): nothing in the visible part of this file reads this table -
 * confirm whether it is still needed.
 */
333static const char builtin_string[] =
334 "iso-ir-111\0"
335 "koi8-r\0"
336 "koi unified\0"
337 "us-ascii\0"
338 "iso 8859-1\0"
339 "usascii\0"
340 "ascii\0"
341 "unicode-1-1-utf-7\0"
342 "utf-7\0"
343 "ucs2\0"
344 "iso-10646-ucs-2\0"
345 "iso10646-1\0"
346 "gb18030.2000-1\0"
347 "gb18030\0"
348 "gb18030.2000-0\0"
349 "gbk-0\0"
350 "gbk\0"
351 "gb2312\0"
352 "gb2312.1980-0\0"
353 "big5-0\0"
354 "big5\0"
355 "euc-kr\0"
356 "euckr\0"
357 "cp 949\0"
358 "windows-949\0"
359 "euc-jp\0"
360 "eucjp\0"
361 "jisx0201.1976-0\0"
362 "jisx0208.1983-0\0"
363 "jisx0208.1990-0\0"
364 "jisx0208.1997-0\0"
365 "jisx0212.1990-0\0"
366 "jisx0213.2000-1\0"
367 "jisx0213.2000-2\0"
368 "shift_jis\0"
369 "sjis\0"
370 "shift-jis\0"
371 "iso-2022-jp\0"
372 "jis7\0"
373 "windows850\0"
374 "ibm850\0"
375 "windows866\0"
376 "ibm866\0"
377 "windows-850\0"
378 "windows-866\0"
379 "cp-10000\0"
380 "apple roman\0"
381 "thai-tis620\0"
382 "iso 8859-11\0"
383 "windows-874\0"
384 "ibm874\0"
385 "windows874\0"
386 "cp-874\0"
387 "ksc5601.1987-0\0"
388 "ks_c_5601-1987\0"
389 "mac-roman\0"
390 "macintosh\0"
391 "mac\0"
392 "csiso2022jp\0"
393 "\0";
394
/*
 * Byte offsets into builtin_string, stored as consecutive pairs:
 * (offset of the alias, offset of the name it maps to). The list ends with -1.
 * An alias may map to itself (the pair 368, 368 is "sjis" -> "sjis").
 */
395static const int builtin_indices[] = {0, 11, 18, 11, 30, 39, 50, 39, 58, 39, 64, 82, 88, 93, 109, 93, 120, 135, 143, 135, 158, 164,
396 168, 164, 175, 164, 189, 196, 201, 208, 214, 221, 233, 240, 246, 240, 262, 240, 278, 240, 294, 240, 310, 240,
397 326, 240, 342, 240, 358, 368, 373, 368, 368, 368, 383, 395, 400, 411, 418, 429, 436, 411, 448, 429, 460, 469,
398 481, 493, 505, 517, 524, 517, 535, 517, 542, 208, 557, 208, 572, 469, 582, 469, 592, 469, 596, 383, -1};
399
400/*
401 * GENERATED CODE ENDS HERE
402 */
403
404/*
405 * some last resort hints in case the charmap file couldn't be found.
406 * This gives at least a partial conversion and helps make things readable.
407 *
408 * the name used as input here is already converted to the more canonical
409 * name as defined in the aliases array.
410 *
411 * Input data:
412cp1250
413iso-8859-2
414koi8-r
415iso-8859-5
416koi8-u
417koi8-r
418pt 154
419windows-1251
420paratype-154
421windows-1251
422pt-154
423windows-1251
424 */
425/* Notes:
426 * - KDE had always "CP 1251" as best fallback to PT 154. As Qt does not offer this encoding anymore, the codepage 1251 is used as fallback.
427 */
428
429/*
430 * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
431 * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
432 */
433
/*
 * Flattened table of encoding names and the fallback encoding suggested for
 * each. Every entry is NUL-terminated and the table ends with an empty string.
 * Entries are addressed by byte offset via conversion_hints_indices.
 * NOTE(review): nothing in the visible part of this file reads this table -
 * confirm whether it is still needed.
 */
434static const char conversion_hints_string[] =
435 "cp1250\0"
436 "iso-8859-2\0"
437 "koi8-r\0"
438 "iso-8859-5\0"
439 "koi8-u\0"
440 "pt 154\0"
441 "windows-1251\0"
442 "paratype-154\0"
443 "pt-154\0"
444 "\0";
445
/*
 * Byte offsets into conversion_hints_string, stored as consecutive pairs:
 * (offset of the encoding name, offset of its fallback). The list ends with -1.
 */
446static const int conversion_hints_indices[] = {0, 7, 18, 25, 36, 18, 43, 50, 63, 50, 76, 50, -1};
447
448/*
449 * GENERATED CODE ENDS HERE
450 */
451
// Holder for the process-wide KCharsets object handed out by KCharsets::charsets().
452struct KCharsetsSingletonPrivate {
453 KCharsets instance;
454};
455
// Declares the globalCharsets() accessor for the holder above.
456Q_GLOBAL_STATIC(KCharsetsSingletonPrivate, globalCharsets)
457
458// search an array of items index/data, find first matching index
459// and return data, or return 0
460static inline const char *kcharsets_array_search(const char *start, const int *indices, const char *entry)
461{
462 for (int i = 0; indices[i] != -1; i += 2) {
463 if (qstrcmp(str1: start + indices[i], str2: entry) == 0) {
464 return start + indices[i + 1];
465 }
466 }
467 return nullptr;
468}
469
470// --------------------------------------------------------------------------
471
// Creates the charsets object and allocates its private data.
472KCharsets::KCharsets()
473 : d(new KCharsetsPrivate)
474{
475}
476
// NOTE(review): the destructor is defaulted, so `d` is presumably an owning
// smart pointer declared in the header - confirm.
477KCharsets::~KCharsets() = default;
478
479QChar KCharsets::fromEntity(QStringView str)
480{
481 QChar res = QChar::Null;
482
483 if (str.isEmpty()) {
484 return QChar::Null;
485 }
486
487 int pos = 0;
488 if (str[pos] == QLatin1Char('&')) {
489 pos++;
490 }
491
492 // Check for '&#000' or '&#x0000' sequence
493 if (str[pos] == QLatin1Char('#') && str.length() - pos > 1) {
494 bool ok;
495 pos++;
496 if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) {
497 pos++;
498 // '&#x0000', hexadecimal character reference
499 const auto tmp = str.mid(pos);
500 res = QChar(tmp.toInt(ok: &ok, base: 16));
501 } else {
502 // '&#0000', decimal character reference
503 const auto tmp = str.mid(pos);
504 res = QChar(tmp.toInt(ok: &ok, base: 10));
505 }
506 if (ok) {
507 return res;
508 } else {
509 return QChar::Null;
510 }
511 }
512
513 const QByteArray raw(str.toLatin1());
514 const entity *e = KCodecsEntities::kde_findEntity(str: raw.data(), len: raw.length());
515
516 if (!e) {
517 // qCDebug(KCODECS_LOG) << "unknown entity " << str <<", len = " << str.length();
518 return QChar::Null;
519 }
520 // qCDebug(KCODECS_LOG) << "got entity " << str << " = " << e->code;
521
522 return QChar(e->code);
523}
524
525QChar KCharsets::fromEntity(QStringView str, int &len)
526{
527 // entities are never longer than 8 chars... we start from
528 // that length and work backwards...
529 len = 8;
530 while (len > 0) {
531 const auto tmp = str.left(n: len);
532 QChar res = fromEntity(str: tmp);
533 if (res != QChar::Null) {
534 return res;
535 }
536 len--;
537 }
538 return QChar::Null;
539}
540
541QString KCharsets::toEntity(const QChar &ch)
542{
543 return QString::asprintf(format: "&#0x%x;", ch.unicode());
544}
545
546QString KCharsets::resolveEntities(const QString &input)
547{
548 QString text = input;
549 const QChar *p = text.unicode();
550 const QChar *end = p + text.length();
551 const QChar *ampersand = nullptr;
552 bool scanForSemicolon = false;
553
554 for (; p < end; ++p) {
555 const QChar ch = *p;
556
557 if (ch == QLatin1Char('&')) {
558 ampersand = p;
559 scanForSemicolon = true;
560 continue;
561 }
562
563 if (ch != QLatin1Char(';') || scanForSemicolon == false) {
564 continue;
565 }
566
567 assert(ampersand);
568
569 scanForSemicolon = false;
570
571 const QChar *entityBegin = ampersand + 1;
572
573 const uint entityLength = p - entityBegin;
574 if (entityLength == 0) {
575 continue;
576 }
577
578 const QChar entityValue = KCharsets::fromEntity(str: QStringView(entityBegin, entityLength));
579 if (entityValue.isNull()) {
580 continue;
581 }
582
583 const uint ampersandPos = ampersand - text.unicode();
584
585 text[(int)ampersandPos] = entityValue;
586 text.remove(i: ampersandPos + 1, len: entityLength + 1);
587 p = text.unicode() + ampersandPos;
588 end = text.unicode() + text.length();
589 ampersand = nullptr;
590 }
591
592 return text;
593}
594
595QStringList KCharsets::availableEncodingNames() const
596{
597 QStringList available;
598 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
599 available.append(t: QString::fromUtf8(utf8: language_for_encoding_string + *p));
600 }
601 available.sort();
602 return available;
603}
604
605QString KCharsets::descriptionForEncoding(QStringView encoding) const
606{
607 const char *lang = kcharsets_array_search(start: language_for_encoding_string, indices: language_for_encoding_indices, entry: encoding.toUtf8().data());
608 if (lang) {
609 return tr(sourceText: "%1 ( %2 )", disambiguation: "@item %1 character set, %2 encoding").arg(args: tr(sourceText: lang, disambiguation: "@item Text character set"), args&: encoding);
610 } else {
611 return tr(sourceText: "Other encoding (%1)", disambiguation: "@item").arg(a: encoding);
612 }
613}
614
615QString KCharsets::encodingForName(const QString &descriptiveName) const
616{
617 const int left = descriptiveName.lastIndexOf(c: QLatin1Char('('));
618
619 if (left < 0) { // No parenthesis, so assume it is a normal encoding name
620 return descriptiveName.trimmed();
621 }
622
623 QString name(descriptiveName.mid(position: left + 1));
624
625 const int right = name.lastIndexOf(c: QLatin1Char(')'));
626
627 if (right < 0) {
628 return name;
629 }
630
631 return name.left(n: right).trimmed();
632}
633
634QStringList KCharsets::descriptiveEncodingNames() const
635{
636 QStringList encodings;
637 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
638 const QString name = QString::fromUtf8(utf8: language_for_encoding_string + p[0]);
639 const QString description = tr(sourceText: language_for_encoding_string + p[1], disambiguation: "@item Text character set");
640 encodings.append(t: tr(sourceText: "%1 ( %2 )", disambiguation: "@item Text encoding: %1 character set, %2 encoding").arg(args: description, args: name));
641 }
642 encodings.sort();
643 return encodings;
644}
645
646QList<QStringList> KCharsets::encodingsByScript() const
647{
648 if (!d->encodingsByScript.isEmpty()) {
649 return d->encodingsByScript;
650 }
651 int i;
652 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
653 const QString name = QString::fromUtf8(utf8: language_for_encoding_string + p[0]);
654 const QString description = tr(sourceText: language_for_encoding_string + p[1], disambiguation: "@item Text character set");
655
656 for (i = 0; i < d->encodingsByScript.size(); ++i) {
657 if (d->encodingsByScript.at(i).at(i: 0) == description) {
658 d->encodingsByScript[i].append(t: name);
659 break;
660 }
661 }
662
663 if (i == d->encodingsByScript.size()) {
664 d->encodingsByScript.append(t: QStringList() << description << name);
665 }
666 }
667 return d->encodingsByScript;
668}
669
670KCharsets *KCharsets::charsets()
671{
672 return &globalCharsets()->instance;
673}
674

source code of kcodecs/src/kcharsets.cpp