kcharsets.cpp source code [kcodecs/src/kcharsets.cpp]

1	/*
2	This file is part of the KDE libraries
3
4	SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org>
5	SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
6	SPDX-FileCopyrightText: 2007 Nick Shaforostoff <shafff@ukr.net>
7
8	SPDX-License-Identifier: LGPL-2.0-or-later
9	*/
10	#include "kcharsets.h"
11	#include "kcharsets_p.h"
12	#include "kcodecs_debug.h"
13
14	#include <kentities.h>
15
16	#include <QHash>
17
18	#include <algorithm>
19	#include <assert.h>
20
21	/*
22	* The encoding names (like "ISO 8859-1") in this list are user-visible,
23	* and should be mostly uppercase.
24	* Generate with generate_string_table.pl (located in kde-dev-scripts),
25	* input data:
26	ISO 8859-1
27	i18n:Western European
28	ISO 8859-15
29	i18n:Western European
30	ISO 8859-14
31	i18n:Western European
32	cp 1252
33	i18n:Western European
34	IBM850
35	i18n:Western European
36	ISO 8859-2
37	i18n:Central European
38	ISO 8859-3
39	i18n:Central European
40	ISO 8859-4
41	i18n:Baltic
42	ISO 8859-13
43	i18n:Baltic
44	ISO 8859-16
45	i18n:South-Eastern Europe
46	cp 1250
47	i18n:Central European
48	cp 1254
49	i18n:Turkish
50	cp 1257
51	i18n:Baltic
52	KOI8-R
53	i18n:Cyrillic
54	ISO 8859-5
55	i18n:Cyrillic
56	cp 1251
57	i18n:Cyrillic
58	KOI8-U
59	i18n:Cyrillic
60	IBM866
61	i18n:Cyrillic
62	Big5
63	i18n:Chinese Traditional
64	Big5-HKSCS
65	i18n:Chinese Traditional
66	GB18030
67	i18n:Chinese Simplified
68	GBK
69	i18n:Chinese Simplified
70	GB2312
71	i18n:Chinese Simplified
72	EUC-KR
73	i18n:Korean
74	windows-949
75	i18n:Korean
76	sjis
77	i18n:Japanese
78	ISO-2022-JP
79	i18n:Japanese
80	EUC-JP
81	i18n:Japanese
82	ISO 8859-7
83	i18n:Greek
84	cp 1253
85	i18n:Greek
86	ISO 8859-6
87	i18n:Arabic
88	cp 1256
89	i18n:Arabic
90	ISO 8859-8
91	i18n:Hebrew
92	ISO 8859-8-I
93	i18n:Hebrew
94	cp 1255
95	i18n:Hebrew
96	ISO 8859-9
97	i18n:Turkish
98	TIS620
99	i18n:Thai
100	ISO 8859-11
101	i18n:Thai
102	UTF-8
103	i18n:Unicode
104	UTF-16
105	i18n:Unicode
106	utf7
107	i18n:Unicode
108	ucs2
109	i18n:Unicode
110	ISO 10646-UCS-2
111	i18n:Unicode
112	windows-1258
113	i18n:Other
114	IBM874
115	i18n:Other
116	TSCII
117	i18n:Other
118	*/
119	/*
120	* Notes about the table:
121	*
122	* - The following entries were disabled and removed from the table:
123	ibm852
124	i18n:Central European
125	pt 154
126	i18n:Cyrillic // ### TODO "PT 154" seems to have been removed from Qt
127	*
128	* - ISO 8859-11 is the deprecated name of TIS-620
129	* - utf7 is not in Qt
130	* - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
131	* - windows-1258: TODO
132	* - IBM874: TODO
133	* - TSCII: TODO
134	*/
135
136	/*
137	* This redefines the QT_TRANSLATE_NOOP3 macro provided by Qt to indicate that
138	* statically initialised text should be translated so that it expands to just
139	* the string that should be translated, making it possible to use it in the
140	* single string construct below.
141	*/
#undef QT_TRANSLATE_NOOP3
#define QT_TRANSLATE_NOOP3(a, b, c) b

/*
 * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
 * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
 * It was then edited to use QT_TRANSLATE_NOOP3 instead of I18N_NOOP.
 */

/*
 * All encoding names and script descriptions packed into one character array,
 * each one NUL-terminated. A description string is stored once and shared by
 * every encoding that refers to it.
 */
static const char language_for_encoding_string[] =
    "ISO 8859-1\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Western European", "@item Text character set")"\0"
    "ISO 8859-15\0"
    "ISO 8859-14\0"
    "cp 1252\0"
    "IBM850\0"
    "ISO 8859-2\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Central European", "@item Text character set")"\0"
    "ISO 8859-3\0"
    "ISO 8859-4\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Baltic", "@item Text character set")"\0"
    "ISO 8859-13\0"
    "ISO 8859-16\0"
    QT_TRANSLATE_NOOP3("KCharsets", "South-Eastern Europe", "@item Text character set")"\0"
    "cp 1250\0"
    "cp 1254\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Turkish", "@item Text character set")"\0"
    "cp 1257\0"
    "KOI8-R\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Cyrillic", "@item Text character set")"\0"
    "ISO 8859-5\0"
    "cp 1251\0"
    "KOI8-U\0"
    "IBM866\0"
    "Big5\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Chinese Traditional", "@item Text character set")"\0"
    "Big5-HKSCS\0"
    "GB18030\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Chinese Simplified", "@item Text character set")"\0"
    "GBK\0"
    "GB2312\0"
    "EUC-KR\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Korean", "@item Text character set")"\0"
    "windows-949\0"
    "sjis\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Japanese", "@item Text character set")"\0"
    "ISO-2022-JP\0"
    "EUC-JP\0"
    "ISO 8859-7\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Greek", "@item Text character set")"\0"
    "cp 1253\0"
    "ISO 8859-6\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Arabic", "@item Text character set")"\0"
    "cp 1256\0"
    "ISO 8859-8\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Hebrew", "@item Text character set")"\0"
    "ISO 8859-8-I\0"
    "cp 1255\0"
    "ISO 8859-9\0"
    "TIS620\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Thai", "@item Text character set")"\0"
    "ISO 8859-11\0"
    "UTF-8\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Unicode", "@item Text character set")"\0"
    "UTF-16\0"
    "utf7\0"
    "ucs2\0"
    "ISO 10646-UCS-2\0"
    "windows-1258\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Other", "@item Text character set")"\0"
    "IBM874\0"
    "TSCII\0"
    "\0";

/*
 * Byte offsets into language_for_encoding_string, stored as consecutive
 * (encoding name, script description) pairs and terminated by -1.
 */
static const int language_for_encoding_indices[] = {
    0,   11,  28,  11,  40,  11,  52,  11,  60,  11,  67,  78,  95,  78,  106, 117, 124, 117, 136, 148, 169, 78,  177, 185, 193, 117, 201, 208, 217, 208, 228,
    208, 236, 208, 243, 208, 250, 255, 275, 255, 286, 294, 313, 294, 317, 294, 324, 331, 338, 331, 350, 355, 364, 355, 376, 355, 383, 394, 400, 394, 408, 419,
    426, 419, 434, 445, 452, 445, 465, 445, 473, 185, 484, 491, 496, 491, 508, 514, 522, 514, 529, 514, 534, 514, 539, 514, 555, 568, 574, 568, 581, 568, -1};
220
221	/*
222	* GENERATED CODE ENDS HERE
223	*/
224
225	/*
226	* defines some different names for codecs that are built into Qt.
227	* The names in this list must be lower-case.
228	* input data for generate_string_table.pl:
229	iso-ir-111
230	koi8-r
231	koi unified
232	koi8-r
233	us-ascii
234	iso 8859-1
235	usascii
236	iso 8859-1
237	ascii
238	iso 8859-1
239	unicode-1-1-utf-7
240	utf-7
241	ucs2
242	iso-10646-ucs-2
243	iso10646-1
244	iso-10646-ucs-2
245	gb18030.2000-1
246	gb18030
247	gb18030.2000-0
248	gb18030
249	gbk-0
250	gbk
251	gb2312
252	gbk
253	gb2312.1980-0
254	gbk
255	big5-0
256	big5
257	euc-kr
258	euckr
259	cp 949
260	windows-949
261	euc-jp
262	eucjp
263	jisx0201.1976-0
264	eucjp
265	jisx0208.1983-0
266	eucjp
267	jisx0208.1990-0
268	eucjp
269	jisx0208.1997-0
270	eucjp
271	jisx0212.1990-0
272	eucjp
273	jisx0213.2000-1
274	eucjp
275	jisx0213.2000-2
276	eucjp
277	shift_jis
278	sjis
279	shift-jis
280	sjis
281	sjis
282	sjis
283	iso-2022-jp
284	jis7
285	windows850
286	ibm850
287	windows866
288	ibm866
289	windows-850
290	ibm850
291	windows-866
292	ibm866
293	cp-10000
294	apple roman
295	thai-tis620
296	iso 8859-11
297	windows-874
298	ibm874
299	windows874
300	ibm874
301	cp-874
302	ibm874
303	ksc5601.1987-0
304	euckr
305	ks_c_5601-1987
306	euckr
307	mac-roman
308	apple roman
309	macintosh
310	apple roman
311	mac
312	apple roman
313	csiso2022jp
314	iso-2022-jp
315	*/
316	/*
317	* Notes about the table:
318	* - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set)
319	* - utf7 is not in Qt
320	* - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
321	* - sjis: appears on the table for x-sjis
322	* - jis7: ISO-2022-JP is now the default name in Qt4
323	* - cp-874: is it really needed?
324	* - mac-roman: appears on the table for x-mac-roman
325	* - csiso2022jp: See bug #77243
326	*/
327
328	/*
329	* THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
330	* The script used was generate_string_table.pl which can be found in kde-dev-scripts.
331	*/
332
/*
 * Alternative (lower-case) codec names and the names they map to, packed into
 * one character array with each string NUL-terminated. A target name is stored
 * once and shared by every alias that refers to it.
 */
static const char builtin_string[] =
    "iso-ir-111\0"
    "koi8-r\0"
    "koi unified\0"
    "us-ascii\0"
    "iso 8859-1\0"
    "usascii\0"
    "ascii\0"
    "unicode-1-1-utf-7\0"
    "utf-7\0"
    "ucs2\0"
    "iso-10646-ucs-2\0"
    "iso10646-1\0"
    "gb18030.2000-1\0"
    "gb18030\0"
    "gb18030.2000-0\0"
    "gbk-0\0"
    "gbk\0"
    "gb2312\0"
    "gb2312.1980-0\0"
    "big5-0\0"
    "big5\0"
    "euc-kr\0"
    "euckr\0"
    "cp 949\0"
    "windows-949\0"
    "euc-jp\0"
    "eucjp\0"
    "jisx0201.1976-0\0"
    "jisx0208.1983-0\0"
    "jisx0208.1990-0\0"
    "jisx0208.1997-0\0"
    "jisx0212.1990-0\0"
    "jisx0213.2000-1\0"
    "jisx0213.2000-2\0"
    "shift_jis\0"
    "sjis\0"
    "shift-jis\0"
    "iso-2022-jp\0"
    "jis7\0"
    "windows850\0"
    "ibm850\0"
    "windows866\0"
    "ibm866\0"
    "windows-850\0"
    "windows-866\0"
    "cp-10000\0"
    "apple roman\0"
    "thai-tis620\0"
    "iso 8859-11\0"
    "windows-874\0"
    "ibm874\0"
    "windows874\0"
    "cp-874\0"
    "ksc5601.1987-0\0"
    "ks_c_5601-1987\0"
    "mac-roman\0"
    "macintosh\0"
    "mac\0"
    "csiso2022jp\0"
    "\0";

/*
 * Byte offsets into builtin_string, stored as consecutive
 * (alias, target name) pairs and terminated by -1.
 */
static const int builtin_indices[] = {0,   11,  18,  11,  30,  39,  50,  39,  58,  39,  64,  82,  88,  93,  109, 93,  120, 135, 143, 135, 158, 164,
                                      168, 164, 175, 164, 189, 196, 201, 208, 214, 221, 233, 240, 246, 240, 262, 240, 278, 240, 294, 240, 310, 240,
                                      326, 240, 342, 240, 358, 368, 373, 368, 368, 368, 383, 395, 400, 411, 418, 429, 436, 411, 448, 429, 460, 469,
                                      481, 493, 505, 517, 524, 517, 535, 517, 542, 208, 557, 208, 572, 469, 582, 469, 592, 469, 596, 383, -1};
399
400	/*
401	* GENERATED CODE ENDS HERE
402	*/
403
404	/*
405	* some last resort hints in case the charmap file couldn't be found.
406	* This gives at least a partial conversion and helps make things readable.
407	*
408	* the name used as input here is already converted to the more canonical
409	* name as defined in the aliases array.
410	*
411	* Input data:
412	cp1250
413	iso-8859-2
414	koi8-r
415	iso-8859-5
416	koi8-u
417	koi8-r
418	pt 154
419	windows-1251
420	paratype-154
421	windows-1251
422	pt-154
423	windows-1251
424	*/
425	/* Notes:
426	* - KDE had always "CP 1251" as best fallback to PT 154. As Qt does not offer this encoding anymore, the codepage 1251 is used as fallback.
427	*/
428
429	/*
430	* THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
431	* The script used was generate_string_table.pl which can be found in kde-dev-scripts.
432	*/
433
/*
 * Last-resort fallback names, packed into one character array with each string
 * NUL-terminated. A fallback name is stored once and shared by every entry
 * that refers to it.
 */
static const char conversion_hints_string[] =
    "cp1250\0"
    "iso-8859-2\0"
    "koi8-r\0"
    "iso-8859-5\0"
    "koi8-u\0"
    "pt 154\0"
    "windows-1251\0"
    "paratype-154\0"
    "pt-154\0"
    "\0";

/*
 * Byte offsets into conversion_hints_string, stored as consecutive
 * (name, fallback name) pairs and terminated by -1.
 */
static const int conversion_hints_indices[] = {0, 7, 18, 25, 36, 18, 43, 50, 63, 50, 76, 50, -1};
447
448	/*
449	* GENERATED CODE ENDS HERE
450	*/
451
// Holder type for the process-wide KCharsets object; its only member is that
// object. KCharsets::charsets() hands out a pointer to `instance`.
452	struct KCharsetsSingletonPrivate {
453	KCharsets instance;
454	};
455
// Defines the global-static accessor globalCharsets() for the holder struct.
456	Q_GLOBAL_STATIC(KCharsetsSingletonPrivate, globalCharsets)
457
458	// search an array of items index/data, find first matching index
459	// and return data, or return 0
460	static inline const char kcharsets_array_search(const* char start, const* int indices, const* char *entry)
461	{
462	for (int i = `0`; indices[i] != -`1`; i += `2`) {
463	if (qstrcmp(str1: start + indices[i], str2: entry) == `0`) {
464	return start + indices[i + `1`];
465	}
466	}
467	return nullptr;
468	}
469
470	// --------------------------------------------------------------------------
471
// Default constructor: allocates the KCharsetsPrivate object that `d` refers to.
472	KCharsets::KCharsets()
473	: d (new KCharsetsPrivate)
474	{
475	}
476
// Defaulted destructor.
// NOTE(review): releasing the object allocated in the constructor depends on the
// declared type of `d` (presumably a smart pointer in kcharsets.h) — confirm.
477	KCharsets::~KCharsets() = default;
478
479	QChar KCharsets::fromEntity(QStringView str)
480	{
481	QChar res = QChar::Null;
482
483	if (str.isEmpty()) {
484	return QChar::Null;
485	}
486
487	int pos = `0`;
488	if (str[pos] == QLatin1Char(`'&'`)) {
489	pos++;
490	}
491
492	// Check for '&#000' or '&#x0000' sequence
493	if (str[pos] == QLatin1Char(`'#'`) && str.length() - pos > `1`) {
494	bool ok;
495	pos++;
496	if (str[pos] == QLatin1Char(`'x'`) \|\| str[pos] == QLatin1Char(`'X'`)) {
497	pos++;
498	// '&#x0000', hexadecimal character reference
499	const auto tmp = str.mid(pos);
500	res = QChar(tmp.toInt(ok: &ok, base: `16`));
501	} else {
502	// '&#0000', decimal character reference
503	const auto tmp = str.mid(pos);
504	res = QChar(tmp.toInt(ok: &ok, base: `10`));
505	}
506	if (ok) {
507	return res;
508	} else {
509	return QChar::Null;
510	}
511	}
512
513	const QByteArray raw(str.toLatin1());
514	const entity *e = KCodecsEntities::kde_findEntity(str: raw.data(), len: raw.length());
515
516	if (!e) {
517	// qCDebug(KCODECS_LOG) << "unknown entity " << str <<", len = " << str.length();
518	return QChar::Null;
519	}
520	// qCDebug(KCODECS_LOG) << "got entity " << str << " = " << e->code;
521
522	return QChar(e->code);
523	}
524
525	QChar KCharsets::fromEntity(QStringView str, int &len)
526	{
527	// entities are never longer than 8 chars... we start from
528	// that length and work backwards...
529	len = `8`;
530	while (len > `0`) {
531	const auto tmp = str.left(n: len);
532	QChar res = fromEntity(str: tmp);
533	if (res != QChar::Null) {
534	return res;
535	}
536	len--;
537	}
538	return QChar::Null;
539	}
540
541	QString KCharsets::toEntity(const QChar &ch)
542	{
543	return QString::asprintf(format: "&#0x%x;", ch.unicode());
544	}
545
546	QString KCharsets::resolveEntities(const QString &input)
547	{
548	QString text = input;
549	const QChar *p = text.unicode();
550	const QChar *end = p + text.length();
551	const QChar ampersand = nullptr*;
552	bool scanForSemicolon = false;
553
554	for (; p < end; ++p) {
555	const QChar ch = *p;
556
557	if (ch == QLatin1Char(`'&'`)) {
558	ampersand = p;
559	scanForSemicolon = true;
560	continue;
561	}
562
563	if (ch != QLatin1Char(`';'`) \|\| scanForSemicolon == false) {
564	continue;
565	}
566
567	assert(ampersand);
568
569	scanForSemicolon = false;
570
571	const QChar *entityBegin = ampersand + `1`;
572
573	const uint entityLength = p - entityBegin;
574	if (entityLength == `0`) {
575	continue;
576	}
577
578	const QChar entityValue = KCharsets::fromEntity(str: QStringView(entityBegin, entityLength));
579	if (entityValue.isNull()) {
580	continue;
581	}
582
583	const uint ampersandPos = ampersand - text.unicode();
584
585	text[(int)ampersandPos] = entityValue;
586	text.remove(i: ampersandPos + `1`, len: entityLength + `1`);
587	p = text.unicode() + ampersandPos;
588	end = text.unicode() + text.length();
589	ampersand = nullptr;
590	}
591
592	return text;
593	}
594
595	QStringList KCharsets::availableEncodingNames() const
596	{
597	QStringList available;
598	for (const int p = language_for_encoding_indices; p != -`1`; p += `2`) {
599	available.append(t: QString::fromUtf8(utf8: language_for_encoding_string + *p));
600	}
601	available.sort();
602	return available;
603	}
604
605	QString KCharsets::descriptionForEncoding(QStringView encoding) const
606	{
607	const char *lang = kcharsets_array_search(start: language_for_encoding_string, indices: language_for_encoding_indices, entry: encoding.toUtf8().data());
608	if (lang) {
609	return tr(sourceText: "%1 ( %2 )", disambiguation: "@item %1 character set, %2 encoding").arg(args: tr(sourceText: lang, disambiguation: "@item Text character set"), args&: encoding);
610	} else {
611	return tr(sourceText: "Other encoding (%1)", disambiguation: "@item").arg(a: encoding);
612	}
613	}
614
615	QString KCharsets::encodingForName(const QString &descriptiveName) const
616	{
617	const int left = descriptiveName.lastIndexOf(c: QLatin1Char(`'('`));
618
619	if (left < `0`) { // No parenthesis, so assume it is a normal encoding name
620	return descriptiveName.trimmed();
621	}
622
623	QString name(descriptiveName.mid(position: left + `1`));
624
625	const int right = name.lastIndexOf(c: QLatin1Char(`')'`));
626
627	if (right < `0`) {
628	return name;
629	}
630
631	return name.left(n: right).trimmed();
632	}
633
634	QStringList KCharsets::descriptiveEncodingNames() const
635	{
636	QStringList encodings;
637	for (const int p = language_for_encoding_indices; p != -`1`; p += `2`) {
638	const QString name = QString::fromUtf8(utf8: language_for_encoding_string + p[`0`]);
639	const QString description = tr(sourceText: language_for_encoding_string + p[`1`], disambiguation: "@item Text character set");
640	encodings.append(t: tr(sourceText: "%1 ( %2 )", disambiguation: "@item Text encoding: %1 character set, %2 encoding").arg(args: description, args: name));
641	}
642	encodings.sort();
643	return encodings;
644	}
645
646	QList<QStringList> KCharsets::encodingsByScript() const
647	{
648	if (!d ->encodingsByScript.isEmpty()) {
649	return d ->encodingsByScript;
650	}
651	int i;
652	for (const int p = language_for_encoding_indices; p != -`1`; p += `2`) {
653	const QString name = QString::fromUtf8(utf8: language_for_encoding_string + p[`0`]);
654	const QString description = tr(sourceText: language_for_encoding_string + p[`1`], disambiguation: "@item Text character set");
655
656	for (i = `0`; i < d ->encodingsByScript.size(); ++i) {
657	if (d ->encodingsByScript.at(i).at(i: `0`) == description) {
658	d ->encodingsByScript[i].append(t: name);
659	break;
660	}
661	}
662
663	if (i == d ->encodingsByScript.size()) {
664	d ->encodingsByScript.append(t: QStringList() << description << name);
665	}
666	}
667	return d ->encodingsByScript;
668	}
669
670	KCharsets *KCharsets::charsets()
671	{
672	return &globalCharsets()->instance;
673	}
674

source code of kcodecs/src/kcharsets.cpp