qtextcodec.cpp source code [qt5compat/src/core5/codecs/qtextcodec.cpp]

1	// Copyright (C) 2018 The Qt Company Ltd.
2	// Copyright (C) 2018 Intel Corporation.
3	// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5	#include "qplatformdefs.h"
6
7	#include "qtextcodec.h"
8	#include "qtextcodec_p.h"
9
10	#include "qbytearraymatcher.h"
11	#include "qendian.h"
12	#include "qfile.h"
13	#include "qlist.h"
14	#include <private/qlocking_p.h>
15	#include "qstringlist.h"
16	#include "qvarlengtharray.h"
17
18	#include <private/qcoreapplication_p.h>
19
20	#include "qutfcodec_p.h"
21	#include "qlatincodec_p.h"
22
23	#if QT_CONFIG(codecs)
24	# include "qtsciicodec_p.h"
25	# include "qisciicodec_p.h"
26	#endif
27	#if QT_CONFIG(icu)
28	#include "qicucodec_p.h"
29	#else
30	#if QT_CONFIG(iconv)
31	# include "qiconvcodec_p.h"
32	#endif
33	#ifdef Q_OS_WIN
34	# include "qwindowscodec_p.h"
35	#endif
36	# include "qsimplecodec_p.h"
37	#if QT_CONFIG(big_codecs) && QT_CONFIG(textcodec)
38	# ifndef Q_OS_INTEGRITY
39	# include "qgb18030codec_p.h"
40	# include "qeucjpcodec_p.h"
41	# include "qjiscodec_p.h"
42	# include "qsjiscodec_p.h"
43	# include "qeuckrcodec_p.h"
44	# include "qbig5codec_p.h"
45	# endif // !Q_OS_INTEGRITY
46	#endif // big_codecs
47
48	#endif // icu
49
50	#include <mutex>
51
52	#include <stdlib.h>
53	#include <ctype.h>
54	#include <locale.h>
55	#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID)
56	# include <langinfo.h>
57	#endif
58
59	QT_BEGIN_NAMESPACE
60
// Defined in qstring.cpp. Used below by QTextDecoder::toUnicode() to fill a
// QString's UTF-16 buffer from Latin-1 input.
void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept;
63
64	typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
65	typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
66
67	Q_GLOBAL_STATIC(QRecursiveMutex, textCodecsMutex);
68
69	Q_GLOBAL_STATIC(QTextCodecData, textCodecData)
70
71	QTextCodecData::QTextCodecData()
72	: codecForLocale (nullptr)
73	{
74	}
75
76	QTextCodecData::~QTextCodecData()
77	{
78	codecForLocale.storeRelease(newValue: nullptr);
79	QList<QTextCodec *> tmp = allCodecs;
80	allCodecs.clear();
81	codecCache.clear();
82	for (QList<QTextCodec *>::const_iterator it = tmp.constBegin(); it != tmp.constEnd(); ++it)
83	delete *it;
84	}
85
86	QTextCodecData *QTextCodecData::instance()
87	{
88	return textCodecData ();
89	}
90
91	class TextCodecsMutexLocker
92	{
93	using Lock = decltype(qt_unique_lock(mutex&: std::declval<QRecursiveMutex&>()));
94	// ### FIXME: this is used when textCodecsMutex already == nullptr
95	const Lock lock = qt_unique_lock(mutex: textCodecsMutex ());
96	public:
97	TextCodecsMutexLocker() {} // required d/t an ICC 19 bug
98	};
99
100	#if !QT_CONFIG(icu)
// ASCII-only lower-casing: maps 'A'..'Z' to 'a'..'z' and returns every other
// character unchanged. Independent of the C locale.
static char qtolower(char c)
{
    if (c >= 'A' && c <= 'Z')
        return c + 0x20;
    return c;
}
// ASCII-only alphanumeric test: true for '0'..'9', 'a'..'z' and 'A'..'Z'.
// Or-ing with 0x20 folds upper-case letters onto lower-case ones, so one
// range check covers both cases.
static bool qisalnum(char c)
{
    return (c >= '0' && c <= '9') || ((c | 0x20) >= 'a' && (c | 0x20) <= 'z');
}
105
106	bool qTextCodecNameMatch(const char n, const* char *h)
107	{
108	if (qstricmp(n, h) == `0`)
109	return true;
110
111	// if the letters and numbers are the same, we have a match
112	while (*n != `'\0'`) {
113	if (qisalnum(*n)) {
114	for (;;) {
115	if (*h == `'\0'`)
116	return false;
117	if (qisalnum(*h))
118	break;
119	++h;
120	}
121	if (qtolower(n) != qtolower(h))
122	return false;
123	++h;
124	}
125	++n;
126	}
127	while (h && !qisalnum(h))
128	++h;
129	return (*h == `'\0'`);
130	}
131
132
133	#if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8)
134	static QTextCodec checkForCodec(const* QByteArray &name) {
135	QTextCodec *c = QTextCodec::codecForName(name);
136	if (!c) {
137	const int index = name.indexOf(`'@'`);
138	if (index != -`1`) {
139	c = QTextCodec::codecForName(name.left(index));
140	}
141	}
142	return c;
143	}
144	#endif
145
146	static void setup();
147
148	// \threadsafe
149	// this returns the codec the method sets up as locale codec to
150	// avoid a race condition in codecForLocale() when
151	// setCodecForLocale(nullptr) is called at the same time.
152	static QTextCodec *setupLocaleMapper()
153	{
154	QTextCodecData *globalData = QTextCodecData::instance();
155
156	QTextCodec locale = nullptr*;
157
158	{
159	const TextCodecsMutexLocker locker;
160	if (globalData->allCodecs.isEmpty())
161	setup();
162	}
163
164	QCoreApplicationPrivate::initLocale();
165
166	#if defined(QT_LOCALE_IS_UTF8)
167	locale = QTextCodec::codecForName("UTF-8");
168	#elif defined(Q_OS_WIN)
169	locale = QTextCodec::codecForName("System");
170	#else
171
172	// First try getting the codecs name from nl_langinfo and see
173	// if we have a builtin codec for it.
174	// Only fall back to using iconv if we can't find a builtin codec
175	// This is because the builtin utf8 codec is around 5 times faster
176	// then the using QIconvCodec
177
178	#if defined (_XOPEN_UNIX)
179	char *charset = nl_langinfo(CODESET);
180	if (charset)
181	locale = QTextCodec::codecForName(charset);
182	#endif
183	#if QT_CONFIG(iconv)
184	if (!locale) {
185	// no builtin codec for the locale found, let's try using iconv
186	(void) new QIconvCodec();
187	locale = QTextCodec::codecForName("System");
188	}
189	#endif
190
191	if (!locale) {
192	// Very poorly defined and followed standards causes lots of
193	// code to try to get all the cases... This logic is
194	// duplicated in QIconvCodec, so if you change it here, change
195	// it there too.
196
197	// Try to determine locale codeset from locale name assigned to
198	// LC_CTYPE category.
199
200	// First part is getting that locale name. First try setlocale() which
201	// definitely knows it, but since we cannot fully trust it, get ready
202	// to fall back to environment variables.
203	const QByteArray ctype = setlocale(LC_CTYPE, nullptr);
204
205	// Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
206	// environment variables.
207	QByteArray lang = qgetenv("LC_ALL");
208	if (lang.isEmpty() \|\| lang == "C") {
209	lang = qgetenv("LC_CTYPE");
210	}
211	if (lang.isEmpty() \|\| lang == "C") {
212	lang = qgetenv("LANG");
213	}
214
215	// Now try these in order:
216	// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
217	// 2. CODESET from lang if it contains a .CODESET part
218	// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
219	// 4. locale (ditto)
220	// 5. check for "@euro"
221	// 6. guess locale from ctype unless ctype is "C"
222	// 7. guess locale from lang
223
224	// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
225	int indexOfDot = ctype.indexOf(`'.'`);
226	if (indexOfDot != -`1`)
227	locale = checkForCodec( ctype.mid(indexOfDot + `1`) );
228
229	// 2. CODESET from lang if it contains a .CODESET part
230	if (!locale) {
231	indexOfDot = lang.indexOf(`'.'`);
232	if (indexOfDot != -`1`)
233	locale = checkForCodec( lang.mid(indexOfDot + `1`) );
234	}
235
236	// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
237	if (!locale && !ctype.isEmpty() && ctype != "C")
238	locale = checkForCodec(ctype);
239
240	// 4. locale (ditto)
241	if (!locale && !lang.isEmpty())
242	locale = checkForCodec(lang);
243
244	// 5. "@euro"
245	if ((!locale && ctype.contains("@euro")) \|\| lang.contains("@euro"))
246	locale = checkForCodec("ISO 8859-15");
247	}
248
249	#endif
250	// If everything failed, we default to 8859-1
251	if (!locale)
252	locale = QTextCodec::codecForName("ISO 8859-1");
253	globalData->codecForLocale.storeRelease(locale);
254	return locale;
255	}
256
257
258	// textCodecsMutex need to be locked to enter this function
259	static void setup()
260	{
261	static bool initialized = false;
262	if (initialized)
263	return;
264	initialized = true;
265
266	#if QT_CONFIG(codecs)
267	(void)new QTsciiCodec;
268	for (int i = `0`; i < `9`; ++i)
269	(void)new QIsciiCodec(i);
270	for (int i = `0`; i < QSimpleTextCodec::numSimpleCodecs; ++i)
271	(void)new QSimpleTextCodec(i);
272
273	# if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY)
274	(void)new QGb18030Codec;
275	(void)new QGbkCodec;
276	(void)new QGb2312Codec;
277	(void)new QEucJpCodec;
278	(void)new QJisCodec;
279	(void)new QSjisCodec;
280	(void)new QEucKrCodec;
281	(void)new QCP949Codec;
282	(void)new QBig5Codec;
283	(void)new QBig5hkscsCodec;
284	# endif // big_codecs && !Q_OS_INTEGRITY
285	#if QT_CONFIG(iconv)
286	(void) new QIconvCodec;
287	#endif
288	#if defined(Q_OS_WIN32)
289	(void) new QWindowsLocalCodec;
290	#endif // Q_OS_WIN32
291	#endif // codecs
292
293	(void)new QUtf16Codec;
294	(void)new QUtf16BECodec;
295	(void)new QUtf16LECodec;
296	(void)new QUtf32Codec;
297	(void)new QUtf32BECodec;
298	(void)new QUtf32LECodec;
299	(void)new QLatin15Codec;
300	(void)new QLatin1Codec;
301	(void)new QUtf8Codec;
302	}
303	#else
// ICU builds: no builtin codecs are instantiated here.
static void setup()
{
}
305	#endif // icu
306
/*!
308	\typealias QTextCodec::ConversionFlags
309
310	\value DefaultConversion No flag is set.
311	\value ConvertInvalidToNull If this flag is set, each invalid input
312	character is output as a null character.
313	\value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
314
315	\omitvalue FreeFunction
316	*/
317
/*!
319	\typealias QTextCodec::ConverterState
320	*/
321
/*!
323	\class QTextCodec
324	\inmodule QtCore5Compat
325	\brief The QTextCodec class provides conversions between text encodings.
326	\reentrant
327	\ingroup i18n
328
329	Qt uses Unicode to store, draw and manipulate strings. In many
330	situations you may wish to deal with data that uses a different
331	encoding. For example, most Japanese documents are still stored
332	in Shift-JIS or ISO 2022-JP, while Russian users often have their
333	documents in KOI8-R or Windows-1251.
334
335	Qt provides a set of QTextCodec classes to help with converting
336	non-Unicode formats to and from Unicode. You can also create your
337	own codec classes.
338
339	The supported encodings are:
340
341	\list
342	\li \l{Big5 Text Codec}{Big5}
343	\li \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
344	\li CP949
345	\li \l{EUC-JP Text Codec}{EUC-JP}
346	\li \l{EUC-KR Text Codec}{EUC-KR}
347	\li \l{GBK Text Codec}{GB18030}
348	\li HP-ROMAN8
349	\li IBM 850
350	\li IBM 866
351	\li IBM 874
352	\li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
353	\li ISO 8859-1 to 10
354	\li ISO 8859-13 to 16
355	\li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
356	\li KOI8-R
357	\li KOI8-U
358	\li Macintosh
359	\li \l{Shift-JIS Text Codec}{Shift-JIS}
360	\li TIS-620
361	\li \l{TSCII Text Codec}{TSCII}
362	\li UTF-8
363	\li UTF-16
364	\li UTF-16BE
365	\li UTF-16LE
366	\li UTF-32
367	\li UTF-32BE
368	\li UTF-32LE
369	\li Windows-1250 to 1258
370	\endlist
371
372	If Qt is compiled with ICU support enabled, most codecs supported by
373	ICU will also be available to the application.
374
375	\l {QTextCodec}s can be used as follows to convert some locally encoded
376	string to Unicode. Suppose you have some string encoded in Russian
377	KOI8-R encoding, and want to convert it to Unicode. The simple way
378	to do it is like this:
379
380	\snippet code/src_corelib_codecs_qtextcodec.cpp 0
381
382	After this, \c string holds the text converted to Unicode.
383	Converting a string from Unicode to the local encoding is just as
384	easy:
385
386	\snippet code/src_corelib_codecs_qtextcodec.cpp 1
387
388	Some care must be taken when trying to convert the data in chunks,
389	for example, when receiving it over a network. In such cases it is
390	possible that a multi-byte character will be split over two
391	chunks. At best this might result in the loss of a character and
392	at worst cause the entire conversion to fail.
393
394	The approach to use in these situations is to create a QTextDecoder
395	object for the codec and use this QTextDecoder for the whole
396	decoding process, as shown below:
397
398	\snippet code/src_corelib_codecs_qtextcodec.cpp 2
399
400	The QTextDecoder object maintains state between chunks and therefore
401	works correctly even if a multi-byte character is split between
402	chunks.
403
404	\section1 Creating Your Own Codec Class
405
406	Support for new text encodings can be added to Qt by creating
407	QTextCodec subclasses.
408
409	The pure virtual functions describe the encoder to the system and
410	the coder is used as required in the different text file formats
411	supported by QTextStream, and under X11, for the locale-specific
412	character input and output.
413
414	To add support for another encoding to Qt, make a subclass of
415	QTextCodec and implement the functions listed in the table below.
416
417	\table
418	\header \li Function \li Description
419
420	\row \li name()
421	\li Returns the official name for the encoding. If the
422	encoding is listed in the
423	\l{IANA character-sets encoding file}, the name
424	should be the preferred MIME name for the encoding.
425
426	\row \li aliases()
427	\li Returns a list of alternative names for the encoding.
428	QTextCodec provides a default implementation that returns
429	an empty list. For example, "ISO-8859-1" has "latin1",
430	"CP819", "IBM819", and "iso-ir-100" as aliases.
431
432	\row \li \l{QTextCodec::mibEnum()}{mibEnum()}
433	\li Return the MIB enum for the encoding if it is listed in
434	the \l{IANA character-sets encoding file}.
435
436	\row \li convertToUnicode()
437	\li Converts an 8-bit character string to Unicode.
438
439	\row \li convertFromUnicode()
440	\li Converts a Unicode string to an 8-bit character string.
441	\endtable
442
443	\sa QTextStream, QTextDecoder, QTextEncoder
444	*/
445
446	/!*
447	Constructs a QTextCodec, and gives it the highest precedence. The
448	QTextCodec should always be constructed on the heap (i.e. with \c
449	new). Qt takes ownership and will delete it when the application
450	terminates.
451	*/
452	QTextCodec::QTextCodec()
453	{
454	const TextCodecsMutexLocker locker;
455
456	QTextCodecData *globalInstance = QTextCodecData::instance();
457	if (globalInstance->allCodecs.isEmpty())
458	setup();
459
460	globalInstance->allCodecs.prepend(t: this);
461	}
462
463
464	/!*
465	\nonreentrant
466
467	Destroys the QTextCodec. Note that you should not delete codecs
468	yourself: once created they become Qt's responsibility.
469	*/
470	QTextCodec::~QTextCodec()
471	{
472	QTextCodecData *globalData = QTextCodecData::instance();
473	if (!globalData)
474	return;
475
476	globalData->codecForLocale.testAndSetRelaxed(expectedValue: this, newValue: nullptr);
477
478	const TextCodecsMutexLocker locker;
479
480	globalData->allCodecs.removeOne(t: this);
481
482	auto it = globalData->codecCache.begin();
483
484	while (it != globalData->codecCache.end()) {
485	if (it.value() == this)
486	it = globalData->codecCache.erase(it);
487	else
488	++it;
489	}
490	}
491
/*!
    \fn QTextCodec *QTextCodec::codecForName(const char *name)
494
495	Searches all installed QTextCodec objects and returns the one
496	which best matches \a name; the match is case-insensitive. Returns
497	\nullptr if no codec matching the name \a name could be found.
498	*/
499
500	/!*
501	\threadsafe
502	Searches all installed QTextCodec objects and returns the one
503	which best matches \a name; the match is case-insensitive. Returns
504	\nullptr if no codec matching the name \a name could be found.
505	*/
506	QTextCodec QTextCodec::codecForName(const* QByteArray &name)
507	{
508	if (name.isEmpty())
509	return nullptr;
510
511	const TextCodecsMutexLocker locker;
512
513	QTextCodecData *globalData = QTextCodecData::instance();
514	if (!globalData)
515	return nullptr;
516	setup();
517
518	#if !QT_CONFIG(icu)
519	QTextCodecCache *cache = &globalData->codecCache;
520	QTextCodec *codec;
521	codec = cache->value(name);
522	if (codec)
523	return codec;
524
525	for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
526	QTextCodec cursor = it;
527	if (qTextCodecNameMatch(cursor->name(), name)) {
528	if (cache)
529	cache->insert(name, cursor);
530	return cursor;
531	}
532	QList<QByteArray> aliases = cursor->aliases();
533	for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) {
534	if (qTextCodecNameMatch(*ait, name)) {
535	cache->insert(name, cursor);
536	return cursor;
537	}
538	}
539	}
540
541	return nullptr;
542	#else
543	return QIcuCodec::codecForNameUnlocked(name);
544	#endif
545	}
546
547
548	/!*
549	\threadsafe
550	Returns the QTextCodec which matches the
551	\l{QTextCodec::mibEnum()}{MIBenum} \a mib.
552	*/
553	QTextCodec* QTextCodec::codecForMib(int mib)
554	{
555	const TextCodecsMutexLocker locker;
556
557	QTextCodecData *globalData = QTextCodecData::instance();
558	if (!globalData)
559	return nullptr;
560	if (globalData->allCodecs.isEmpty())
561	setup();
562
563	QByteArray key = "MIB: " + QByteArray::number(mib);
564
565	QTextCodecCache *cache = &globalData->codecCache;
566	QTextCodec *codec;
567	if (cache) {
568	codec = cache->value(key);
569	if (codec)
570	return codec;
571	}
572
573	for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
574	QTextCodec cursor = it;
575	if (cursor->mibEnum() == mib) {
576	if (cache)
577	cache->insert(key, value: cursor);
578	return cursor;
579	}
580	}
581
582	#if QT_CONFIG(icu)
583	return QIcuCodec::codecForMibUnlocked(mib);
584	#else
585	return nullptr;
586	#endif
587	}
588
589	/!*
590	\threadsafe
591	Returns the list of all available codecs, by name. Call
592	QTextCodec::codecForName() to obtain the QTextCodec for the name.
593
594	The list may contain many mentions of the same codec
595	if the codec has aliases.
596
597	\sa availableMibs(), name(), aliases()
598	*/
599	QList<QByteArray> QTextCodec::availableCodecs()
600	{
601	const TextCodecsMutexLocker locker;
602
603	QTextCodecData *globalData = QTextCodecData::instance();
604	if (globalData->allCodecs.isEmpty())
605	setup();
606
607	QList<QByteArray> codecs;
608
609	for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
610	codecs += (*it)->name();
611	codecs += (*it)->aliases();
612	}
613
614	#if QT_CONFIG(icu)
615	codecs += QIcuCodec::availableCodecs();
616	#endif
617
618	return codecs;
619	}
620
621	/!*
622	\threadsafe
623	Returns the list of MIBs for all available codecs. Call
624	QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
625
626	\sa availableCodecs(), mibEnum()
627	*/
628	QList<int> QTextCodec::availableMibs()
629	{
630	#if QT_CONFIG(icu)
631	return QIcuCodec::availableMibs();
632	#else
633	const TextCodecsMutexLocker locker;
634
635	QTextCodecData *globalData = QTextCodecData::instance();
636	if (globalData->allCodecs.isEmpty())
637	setup();
638
639	QList<int> codecs;
640
641	for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it)
642	codecs += (*it)->mibEnum();
643
644	return codecs;
645	#endif
646	}
647
648	/!*
649	\nonreentrant
650
651	Set the codec to \a c; this will be returned by
652	codecForLocale(). If \a c is \nullptr, the codec is reset to
653	the default.
654
655	This might be needed for some applications that want to use their
656	own mechanism for setting the locale.
657
658	\sa codecForLocale()
659	*/
660	void QTextCodec::setCodecForLocale(QTextCodec *c)
661	{
662	QTextCodecData::instance()->codecForLocale.storeRelease(newValue: c);
663	}
664
665	/!*
666	\threadsafe
667	Returns a pointer to the codec most suitable for this locale.
668
669	The codec will be retrieved from ICU where that backend is in use, otherwise
670	it may be obtained from an OS-specific API. In the latter case, the codec's
671	name may be "System".
672	*/
673
674	QTextCodec* QTextCodec::codecForLocale()
675	{
676	QTextCodecData *globalData = QTextCodecData::instance();
677	if (!globalData)
678	return nullptr;
679
680	QTextCodec *codec = globalData->codecForLocale.loadAcquire();
681	if (!codec) {
682	#if QT_CONFIG(icu)
683	const TextCodecsMutexLocker locker;
684	codec = QIcuCodec::defaultCodecUnlocked();
685	#else
686	// setupLocaleMapper locks as necessary
687	codec = setupLocaleMapper();
688	#endif
689	}
690
691	return codec;
692	}
693
694
/*!
696	\fn QByteArray QTextCodec::name() const
697
698	QTextCodec subclasses must reimplement this function. It returns
699	the name of the encoding supported by the subclass.
700
701	If the codec is registered as a character set in the
702	\l{IANA character-sets encoding file} this method should
703	return the preferred mime name for the codec if defined,
704	otherwise its name.
705	*/
706
/*!
708	\fn int QTextCodec::mibEnum() const
709
710	Subclasses of QTextCodec must reimplement this function. It
711	returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file}
712	for more information). It is important that each QTextCodec
713	subclass returns the correct unique value for this function.
714	*/
715
716	/!*
717	Subclasses can return a number of aliases for the codec in question.
718
719	Standard aliases for codecs can be found in the
720	\l{IANA character-sets encoding file}.
721	*/
722	QList<QByteArray> QTextCodec::aliases() const
723	{
724	return QList<QByteArray>();
725	}
726
/*!
    \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
                                             ConverterState *state) const
730
731	QTextCodec subclasses must reimplement this function.
732
733	Converts the first \a len characters of \a chars from the
734	encoding of the subclass to Unicode, and returns the result in a
735	QString.
736
737	\a state can be \nullptr, in which case the conversion is stateless and
738	default conversion rules should be used. If \a state is not \nullptr, the
739	codec should save the state after the conversion in \a state, and
740	adjust the \c remainingChars and \c invalidChars members of the struct.
741	*/
742
/*!
    \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
                                                  ConverterState *state) const
746
747	QTextCodec subclasses must reimplement this function.
748
749	Converts the first \a number of characters from the \a input array
750	from Unicode to the encoding of the subclass, and returns the result
751	in a QByteArray.
752
753	\a state can be \nullptr in which case the conversion is stateless and
754	default conversion rules should be used. If \a state is not \nullptr, the
755	codec should save the state after the conversion in \a state, and
756	adjust the \c remainingChars and \c invalidChars members of the struct.
757	*/
758
759	/!*
760	Creates a QTextDecoder with a specified \a flags to decode chunks
761	of \c{char } data to create chunks of Unicode data.*
762
763	The caller is responsible for deleting the returned object.
764
765	\since 4.7
766	*/
767	QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const
768	{
769	return new QTextDecoder (this, flags);
770	}
771
772	/!*
773	Creates a QTextEncoder with a specified \a flags to encode chunks
774	of Unicode data as \c{char } data.*
775
776	The caller is responsible for deleting the returned object.
777
778	\since 4.7
779	*/
780	QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const
781	{
782	return new QTextEncoder (this, flags);
783	}
784
/*!
    \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
                                           ConverterState *state) const
788
789	Converts the first \a number of characters from the \a input array
790	from Unicode to the encoding of this codec, and returns the result
791	in a QByteArray.
792
793	The \a state of the convertor used is updated.
794	*/
795
796	/!*
797	Converts \a str from Unicode to the encoding of this codec, and
798	returns the result in a QByteArray.
799	*/
800	QByteArray QTextCodec::fromUnicode(const QString& str) const
801	{
802	return convertFromUnicode(in: str.constData(), length: str.size(), state: nullptr);
803	}
804
805	/!*
806	\overload
807	\since 5.10
808
809	Converts \a str from Unicode to the encoding of this codec, and
810	returns the result in a QByteArray.
811	*/
812	QByteArray QTextCodec::fromUnicode(QStringView str) const
813	{
814	return convertFromUnicode(in: str.data(), length: str.size(), state: nullptr);
815	}
816
/*!
    \fn QString QTextCodec::toUnicode(const char *input, int size,
                                      ConverterState *state) const
820
821	Converts the first \a size characters from the \a input from the
822	encoding of this codec to Unicode, and returns the result in a
823	QString.
824
825	The \a state of the convertor used is updated.
826	*/
827
828	/!*
829	Converts \a a from the encoding of this codec to Unicode, and
830	returns the result in a QString.
831	*/
832	QString QTextCodec::toUnicode(const QByteArray& a) const
833	{
834	return convertToUnicode(in: a.constData(), length: a.size(), state: nullptr);
835	}
836
837	/!*
838	Returns \c true if the Unicode character \a ch can be fully encoded
839	with this codec; otherwise returns \c false.
840	*/
841	bool QTextCodec::canEncode(QChar ch) const
842	{
843	ConverterState state;
844	state.flags = ConvertInvalidToNull;
845	convertFromUnicode(in: &ch, length: `1`, state: &state);
846	return (state.invalidChars == `0`);
847	}
848
849	/!*
850	\overload
851
852	\a s contains the string being tested for encode-ability.
853	*/
854	bool QTextCodec::canEncode(const QString& s) const
855	{
856	ConverterState state;
857	state.flags = ConvertInvalidToNull;
858	convertFromUnicode(in: s.constData(), length: s.size(), state: &state);
859	return (state.invalidChars == `0`);
860	}
861
862	/!*
863	\overload
864	\since 5.10
865
866	Returns \c true if the Unicode string \a s can be fully encoded
867	with this codec; otherwise returns \c false.
868	*/
869	bool QTextCodec::canEncode(QStringView s) const
870	{
871	ConverterState state;
872	state.flags = ConvertInvalidToNull;
873	convertFromUnicode(in: s.data(), length: s.size(), state: &state);
874	return !state.invalidChars;
875	}
876	/!*
877	\overload
878
879	\a chars contains the source characters.
880	*/
881	QString QTextCodec::toUnicode(const char chars) const*
882	{
883	const auto len = int(qstrlen(str: chars));
884	return convertToUnicode(in: chars, length: len, state: nullptr);
885	}
886
887
/*!
889	\class QTextEncoder
890	\inmodule QtCore5Compat
891	\brief The QTextEncoder class provides a state-based encoder.
892	\reentrant
893	\ingroup i18n
894
895	A text encoder converts text from Unicode into an encoded text format
896	using a specific codec.
897
898	The encoder converts Unicode into another format, remembering any
899	state that is required between calls.
900
901	\sa QTextCodec::makeEncoder(), QTextDecoder
902	*/
903
/*!
    \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
906
907	Constructs a text encoder for the given \a codec.
908	*/
909
910	/!*
911	Constructs a text encoder for the given \a codec and conversion \a flags.
912
913	\since 4.7
914	*/
915	QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
916	: c(codec), state ()
917	{
918	state.flags = flags;
919	}
920
921	/!*
922	Destroys the encoder.
923	*/
924	QTextEncoder::~QTextEncoder()
925	{
926	}
927
928	/!*
929	\internal
930	\since 4.5
931	Determines whether the encoder encountered a failure while decoding the input. If
932	an error was encountered, the produced result is undefined, and gets converted as according
933	to the conversion flags.
934	*/
935	bool QTextEncoder::hasFailure() const
936	{
937	return state.invalidChars != `0`;
938	}
939
940	/!*
941	Converts the Unicode string \a str into an encoded QByteArray.
942	*/
943	QByteArray QTextEncoder::fromUnicode(const QString& str)
944	{
945	return c->fromUnicode(in: str.constData(), length: str.size(), state: &state);
946	}
947
948	/!*
949	\overload
950	\since 5.10
951	Converts the Unicode string \a str into an encoded QByteArray.
952	*/
953	QByteArray QTextEncoder::fromUnicode(QStringView str)
954	{
955	return c->fromUnicode(in: str.data(), length: str.size(), state: &state);
956	}
957
958	/!*
959	\overload
960
961	Converts \a len characters (not bytes) from \a uc, and returns the
962	result in a QByteArray.
963	*/
964	QByteArray QTextEncoder::fromUnicode(const QChar uc, int* len)
965	{
966	return c->fromUnicode(in: uc, length: len, state: &state);
967	}
968
/*!
970	\class QTextDecoder
971	\inmodule QtCore5Compat
972	\brief The QTextDecoder class provides a state-based decoder.
973	\reentrant
974	\ingroup i18n
975
976	A text decoder converts text from an encoded text format into Unicode
977	using a specific codec.
978
979	The decoder converts text in this format into Unicode, remembering any
980	state that is required between calls.
981
982	\sa QTextCodec::makeDecoder(), QTextEncoder
983	*/
984
/*!
    \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
987
988	Constructs a text decoder for the given \a codec.
989	*/
990
991	/!*
992	Constructs a text decoder for the given \a codec and conversion \a flags.
993
994	\since 4.7
995	*/
996
997	QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
998	: c(codec), state ()
999	{
1000	state.flags = flags;
1001	}
1002
1003	/!*
1004	Destroys the decoder.
1005	*/
1006	QTextDecoder::~QTextDecoder()
1007	{
1008	}
1009
1010	/!*
1011	\fn QString QTextDecoder::toUnicode(const char chars, int len)*
1012
1013	Converts the first \a len bytes in \a chars to Unicode, returning
1014	the result.
1015
1016	If not all characters are used (e.g. if only part of a multi-byte
1017	encoding is at the end of the characters), the decoder remembers
1018	enough state to continue with the next call to this function.
1019	*/
1020	QString QTextDecoder::toUnicode(const char chars, int* len)
1021	{
1022	return c->toUnicode(in: chars, length: len, state: &state);
1023	}
1024
1025	/! \overload*
1026
1027	The converted string is returned in \a target.
1028	*/
1029	void QTextDecoder::toUnicode(QString target, const* char chars, int* len)
1030	{
1031	Q_ASSERT(target);
1032	switch (c->mibEnum()) {
1033	case `106`: // utf8
1034	static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1035	break;
1036	case `4`: // latin1
1037	target->resize(size: len);
1038	qt_from_latin1(dst: (char16_t*)target->data(), str: chars, size: len);
1039	break;
1040	default:
1041	*target = c->toUnicode(in: chars, length: len, state: &state);
1042	}
1043	}
1044
1045
1046	/!*
1047	\overload
1048
1049	Converts the bytes in the byte array specified by \a ba to Unicode
1050	and returns the result.
1051	*/
1052	QString QTextDecoder::toUnicode(const QByteArray &ba)
1053	{
1054	return c->toUnicode(in: ba.constData(), length: ba.size(), state: &state);
1055	}
1056
1057	/!*
1058	\since 4.4
1059
1060	Tries to detect the encoding of the provided snippet of HTML in
1061	the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1062	and the content-type meta header and returns a QTextCodec instance
1063	that is capable of decoding the html to unicode. If the codec
1064	cannot be detected from the content provided, \a defaultCodec is
1065	returned.
1066
1067	\sa codecForUtfText()
1068	*/
1069	QTextCodec QTextCodec::codecForHtml(const* QByteArray &ba, QTextCodec *defaultCodec)
1070	{
1071	// determine charset
1072	QTextCodec c = QTextCodec::codecForUtfText(ba, defaultCodec: nullptr*);
1073	if (!c) {
1074	static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "meta ");
1075	QByteArray header = ba.left(len: `1024`).toLower();
1076	qsizetype pos = matcher.indexIn(haystack: header);
1077	if (pos != -`1`) {
1078	static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "charset=");
1079	pos = matcher.indexIn(haystack: header, from: pos);
1080	if (pos != -`1`) {
1081	pos += qstrlen(str: "charset=");
1082
1083	qsizetype pos2 = pos;
1084	// The attribute can be closed with either """, "'", ">" or "/",
1085	// none of which are valid charset characters.
1086	while (++pos2 < header.size()) {
1087	char ch = header.at(i: pos2);
1088	if (ch == `'\"'` \|\| ch == `'\''` \|\| ch == `'>'`) {
1089	QByteArray name = header.mid(index: pos, len: pos2 - pos);
1090	if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
1091	name = QByteArrayLiteral("UTF-8");
1092	c = QTextCodec::codecForName(name);
1093	return c ? c : defaultCodec;
1094	}
1095	}
1096	}
1097	}
1098	}
1099	if (!c)
1100	c = defaultCodec;
1101
1102	return c;
1103	}
1104
1105	/!*
1106	\overload
1107
1108	Tries to detect the encoding of the provided snippet of HTML in
1109	the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1110	and the content-type meta header and returns a QTextCodec instance
1111	that is capable of decoding the html to unicode. If the codec cannot
1112	be detected, this overload returns a Latin-1 QTextCodec.
1113	*/
1114	QTextCodec QTextCodec::codecForHtml(const* QByteArray &ba)
1115	{
1116	return codecForHtml(ba, defaultCodec: QTextCodec::codecForName(name: "ISO-8859-1"));
1117	}
1118
1119	/!*
1120	\since 4.6
1121
1122	Tries to detect the encoding of the provided snippet \a ba by
1123	using the BOM (Byte Order Mark) and returns a QTextCodec instance
1124	that is capable of decoding the text to unicode. This function can
1125	detect one of the following codecs:
1126
1127	\list
1128	\li UTF-32 Little Endian
1129	\li UTF-32 Big Endian
1130	\li UTF-16 Little Endian
1131	\li UTF-16 Big Endian
1132	\li UTF-8
1133	\endlist
1134
1135	If the codec cannot be detected from the content provided, \a defaultCodec
1136	is returned.
1137
1138	\sa codecForHtml()
1139	*/
1140	QTextCodec QTextCodec::codecForUtfText(const* QByteArray &ba, QTextCodec *defaultCodec)
1141	{
1142	const int arraySize = ba.size();
1143	const uchar buf = reinterpret_cast<const* uchar *>(ba.constData());
1144	const uint bom = `0xfeff`;
1145
1146	if (arraySize > `3`) {
1147	uint uc = qFromUnaligned<uint>(src: buf);
1148	if (uc == qToBigEndian(source: bom))
1149	return QTextCodec::codecForMib(mib: `1018`); // utf-32 be
1150	else if (uc == qToLittleEndian(source: bom))
1151	return QTextCodec::codecForMib(mib: `1019`); // utf-32 le
1152	}
1153
1154	if (arraySize < `2`)
1155	return defaultCodec;
1156
1157	ushort uc = qFromUnaligned<ushort>(src: buf);
1158	if (uc == qToBigEndian(source: ushort(bom)))
1159	return QTextCodec::codecForMib(mib: `1013`); // utf16 be
1160	else if (uc == qToLittleEndian(source: ushort(bom)))
1161	return QTextCodec::codecForMib(mib: `1014`); // utf16 le
1162
1163	if (arraySize < `3`)
1164	return defaultCodec;
1165
1166	static const char utf8bom[] = "\xef\xbb\xbf";
1167	if (memcmp(s1: buf, s2: utf8bom, n: sizeof(utf8bom) - `1`) == `0`)
1168	return QTextCodec::codecForMib(mib: `106`); // utf-8
1169
1170	return defaultCodec;
1171	}
1172
1173	/!*
1174	\overload
1175
1176	Tries to detect the encoding of the provided snippet \a ba by
1177	using the BOM (Byte Order Mark) and returns a QTextCodec instance
1178	that is capable of decoding the text to unicode. This function can
1179	detect one of the following codecs:
1180
1181	\list
1182	\li UTF-32 Little Endian
1183	\li UTF-32 Big Endian
1184	\li UTF-16 Little Endian
1185	\li UTF-16 Big Endian
1186	\li UTF-8
1187	\endlist
1188
1189	If the codec cannot be detected from the content provided, this overload
1190	returns a Latin-1 QTextCodec.
1191
1192	\sa codecForHtml()
1193	*/
1194	QTextCodec QTextCodec::codecForUtfText(const* QByteArray &ba)
1195	{
1196	return codecForUtfText(ba, defaultCodec: QTextCodec::codecForMib(/Latin 1/ mib: `4`));
1197	}
1198
/*!
    \fn QTextCodec *QTextCodec::codecForTr()
    \deprecated

    Returns the codec used by QObject::tr() on its argument. If this
    function returns \nullptr (the default), tr() assumes Latin-1.
*/
1206
1207	/!*
1208	\internal
1209	\since 4.3
1210	Determines whether the decoder encountered a failure while decoding the
1211	input. If an error was encountered, the produced result is undefined, and
1212	gets converted as according to the conversion flags.
1213	*/
1214	bool QTextDecoder::hasFailure() const
1215	{
1216	return state.invalidChars != `0`;
1217	}
1218
1219	/!*
1220	\internal
1221	\since 5.12
1222
1223	Determines whether the decoder needs more bytes to continue decoding. That
1224	is, this signifies that the input string ended in the middle of a
1225	multi-byte sequence. Note that it's possible some codecs do not report this.
1226	*/
1227	bool QTextDecoder::needsMoreData() const
1228	{
1229	return state.remainingChars;
1230	}
1231
1232	/!*
1233	\fn QTextCodec Qt::codecForHtml(const QByteArray &ba)*
1234	\internal
1235
1236	This function is defined in the \c <QTextCodec> header file.
1237	*/
1238	QTextCodec Qt::codecForHtml(const* QByteArray &ba)
1239	{
1240	return QTextCodec::codecForHtml(ba);
1241	}
1242
1243	QT_END_NAMESPACE
1244

source code of qt5compat/src/core5/codecs/qtextcodec.cpp