1// Copyright (C) 2018 The Qt Company Ltd.
2// Copyright (C) 2018 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#include "qplatformdefs.h"
6
7#include "qtextcodec.h"
8#include "qtextcodec_p.h"
9
10#include "qbytearraymatcher.h"
11#include "qendian.h"
12#include "qfile.h"
13#include "qlist.h"
14#include <private/qlocking_p.h>
15#include "qstringlist.h"
16#include "qvarlengtharray.h"
17
18#include <private/qcoreapplication_p.h>
19
20#include "qutfcodec_p.h"
21#include "qlatincodec_p.h"
22
23#if QT_CONFIG(codecs)
24# include "qtsciicodec_p.h"
25# include "qisciicodec_p.h"
26#endif
27#if QT_CONFIG(icu)
28#include "qicucodec_p.h"
29#else
30#if QT_CONFIG(iconv)
31# include "qiconvcodec_p.h"
32#endif
33#ifdef Q_OS_WIN
34# include "qwindowscodec_p.h"
35#endif
36# include "qsimplecodec_p.h"
37#if QT_CONFIG(big_codecs) && QT_CONFIG(textcodec)
38# ifndef Q_OS_INTEGRITY
39# include "qgb18030codec_p.h"
40# include "qeucjpcodec_p.h"
41# include "qjiscodec_p.h"
42# include "qsjiscodec_p.h"
43# include "qeuckrcodec_p.h"
44# include "qbig5codec_p.h"
45# endif // !Q_OS_INTEGRITY
46#endif // big_codecs
47
48#endif // icu
49
50#include <mutex>
51
52#include <stdlib.h>
53#include <ctype.h>
54#include <locale.h>
55#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID)
56# include <langinfo.h>
57#endif
58
59QT_BEGIN_NAMESPACE
60
61// in qstring.cpp:
62void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept;
63
64typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
65typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
66
67Q_GLOBAL_STATIC(QRecursiveMutex, textCodecsMutex);
68
69Q_GLOBAL_STATIC(QTextCodecData, textCodecData)
70
71QTextCodecData::QTextCodecData()
72 : codecForLocale(nullptr)
73{
74}
75
76QTextCodecData::~QTextCodecData()
77{
78 codecForLocale.storeRelease(newValue: nullptr);
79 QList<QTextCodec *> tmp = allCodecs;
80 allCodecs.clear();
81 codecCache.clear();
82 for (QList<QTextCodec *>::const_iterator it = tmp.constBegin(); it != tmp.constEnd(); ++it)
83 delete *it;
84}
85
86QTextCodecData *QTextCodecData::instance()
87{
88 return textCodecData();
89}
90
91class TextCodecsMutexLocker
92{
93 using Lock = decltype(qt_unique_lock(mutex&: std::declval<QRecursiveMutex&>()));
94 // ### FIXME: this is used when textCodecsMutex already == nullptr
95 const Lock lock = qt_unique_lock(mutex: textCodecsMutex());
96public:
97 TextCodecsMutexLocker() {} // required d/t an ICC 19 bug
98};
99
100#if !QT_CONFIG(icu)
// ASCII-only lowercasing: maps 'A'..'Z' to 'a'..'z' and returns every other
// character unchanged.
static char qtolower(char c)
{
    const bool isUpper = ('A' <= c) && (c <= 'Z');
    return isUpper ? static_cast<char>(c + 0x20) : c;
}
// ASCII-only test for a decimal digit or a letter.
static bool qisalnum(char c)
{
    if ('0' <= c && c <= '9')
        return true;

    // Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z', so one range check
    // covers both cases.
    const int folded = c | 0x20;
    return 'a' <= folded && folded <= 'z';
}
105
106bool qTextCodecNameMatch(const char *n, const char *h)
107{
108 if (qstricmp(n, h) == 0)
109 return true;
110
111 // if the letters and numbers are the same, we have a match
112 while (*n != '\0') {
113 if (qisalnum(*n)) {
114 for (;;) {
115 if (*h == '\0')
116 return false;
117 if (qisalnum(*h))
118 break;
119 ++h;
120 }
121 if (qtolower(*n) != qtolower(*h))
122 return false;
123 ++h;
124 }
125 ++n;
126 }
127 while (*h && !qisalnum(*h))
128 ++h;
129 return (*h == '\0');
130}
131
132
133#if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8)
134static QTextCodec *checkForCodec(const QByteArray &name) {
135 QTextCodec *c = QTextCodec::codecForName(name);
136 if (!c) {
137 const int index = name.indexOf('@');
138 if (index != -1) {
139 c = QTextCodec::codecForName(name.left(index));
140 }
141 }
142 return c;
143}
144#endif
145
146static void setup();
147
148// \threadsafe
149// this returns the codec the method sets up as locale codec to
150// avoid a race condition in codecForLocale() when
151// setCodecForLocale(nullptr) is called at the same time.
152static QTextCodec *setupLocaleMapper()
153{
154 QTextCodecData *globalData = QTextCodecData::instance();
155
156 QTextCodec *locale = nullptr;
157
158 {
159 const TextCodecsMutexLocker locker;
160 if (globalData->allCodecs.isEmpty())
161 setup();
162 }
163
164 QCoreApplicationPrivate::initLocale();
165
166#if defined(QT_LOCALE_IS_UTF8)
167 locale = QTextCodec::codecForName("UTF-8");
168#elif defined(Q_OS_WIN)
169 locale = QTextCodec::codecForName("System");
170#else
171
172 // First try getting the codecs name from nl_langinfo and see
173 // if we have a builtin codec for it.
174 // Only fall back to using iconv if we can't find a builtin codec
175 // This is because the builtin utf8 codec is around 5 times faster
176 // then the using QIconvCodec
177
178#if defined (_XOPEN_UNIX)
179 char *charset = nl_langinfo(CODESET);
180 if (charset)
181 locale = QTextCodec::codecForName(charset);
182#endif
183#if QT_CONFIG(iconv)
184 if (!locale) {
185 // no builtin codec for the locale found, let's try using iconv
186 (void) new QIconvCodec();
187 locale = QTextCodec::codecForName("System");
188 }
189#endif
190
191 if (!locale) {
192 // Very poorly defined and followed standards causes lots of
193 // code to try to get all the cases... This logic is
194 // duplicated in QIconvCodec, so if you change it here, change
195 // it there too.
196
197 // Try to determine locale codeset from locale name assigned to
198 // LC_CTYPE category.
199
200 // First part is getting that locale name. First try setlocale() which
201 // definitely knows it, but since we cannot fully trust it, get ready
202 // to fall back to environment variables.
203 const QByteArray ctype = setlocale(LC_CTYPE, nullptr);
204
205 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
206 // environment variables.
207 QByteArray lang = qgetenv("LC_ALL");
208 if (lang.isEmpty() || lang == "C") {
209 lang = qgetenv("LC_CTYPE");
210 }
211 if (lang.isEmpty() || lang == "C") {
212 lang = qgetenv("LANG");
213 }
214
215 // Now try these in order:
216 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
217 // 2. CODESET from lang if it contains a .CODESET part
218 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
219 // 4. locale (ditto)
220 // 5. check for "@euro"
221 // 6. guess locale from ctype unless ctype is "C"
222 // 7. guess locale from lang
223
224 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
225 int indexOfDot = ctype.indexOf('.');
226 if (indexOfDot != -1)
227 locale = checkForCodec( ctype.mid(indexOfDot + 1) );
228
229 // 2. CODESET from lang if it contains a .CODESET part
230 if (!locale) {
231 indexOfDot = lang.indexOf('.');
232 if (indexOfDot != -1)
233 locale = checkForCodec( lang.mid(indexOfDot + 1) );
234 }
235
236 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
237 if (!locale && !ctype.isEmpty() && ctype != "C")
238 locale = checkForCodec(ctype);
239
240 // 4. locale (ditto)
241 if (!locale && !lang.isEmpty())
242 locale = checkForCodec(lang);
243
244 // 5. "@euro"
245 if ((!locale && ctype.contains("@euro")) || lang.contains("@euro"))
246 locale = checkForCodec("ISO 8859-15");
247 }
248
249#endif
250 // If everything failed, we default to 8859-1
251 if (!locale)
252 locale = QTextCodec::codecForName("ISO 8859-1");
253 globalData->codecForLocale.storeRelease(locale);
254 return locale;
255}
256
257
258// textCodecsMutex need to be locked to enter this function
259static void setup()
260{
261 static bool initialized = false;
262 if (initialized)
263 return;
264 initialized = true;
265
266#if QT_CONFIG(codecs)
267 (void)new QTsciiCodec;
268 for (int i = 0; i < 9; ++i)
269 (void)new QIsciiCodec(i);
270 for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
271 (void)new QSimpleTextCodec(i);
272
273# if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY)
274 (void)new QGb18030Codec;
275 (void)new QGbkCodec;
276 (void)new QGb2312Codec;
277 (void)new QEucJpCodec;
278 (void)new QJisCodec;
279 (void)new QSjisCodec;
280 (void)new QEucKrCodec;
281 (void)new QCP949Codec;
282 (void)new QBig5Codec;
283 (void)new QBig5hkscsCodec;
284# endif // big_codecs && !Q_OS_INTEGRITY
285#if QT_CONFIG(iconv)
286 (void) new QIconvCodec;
287#endif
288#if defined(Q_OS_WIN32)
289 (void) new QWindowsLocalCodec;
290#endif // Q_OS_WIN32
291#endif // codecs
292
293 (void)new QUtf16Codec;
294 (void)new QUtf16BECodec;
295 (void)new QUtf16LECodec;
296 (void)new QUtf32Codec;
297 (void)new QUtf32BECodec;
298 (void)new QUtf32LECodec;
299 (void)new QLatin15Codec;
300 (void)new QLatin1Codec;
301 (void)new QUtf8Codec;
302}
303#else
// ICU build: no built-in codecs are registered here; name lookups are
// forwarded to QIcuCodec instead.
static void setup()
{
}
305#endif // icu
306
307/*!
308 \typealias QTextCodec::ConversionFlags
309
310 \value DefaultConversion No flag is set.
311 \value ConvertInvalidToNull If this flag is set, each invalid input
312 character is output as a null character.
313 \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
314
315 \omitvalue FreeFunction
316*/
317
318/*!
319 \typealias QTextCodec::ConverterState
320*/
321
322/*!
323 \class QTextCodec
324 \inmodule QtCore5Compat
325 \brief The QTextCodec class provides conversions between text encodings.
326 \reentrant
327 \ingroup i18n
328
329 Qt uses Unicode to store, draw and manipulate strings. In many
330 situations you may wish to deal with data that uses a different
331 encoding. For example, most Japanese documents are still stored
332 in Shift-JIS or ISO 2022-JP, while Russian users often have their
333 documents in KOI8-R or Windows-1251.
334
335 Qt provides a set of QTextCodec classes to help with converting
336 non-Unicode formats to and from Unicode. You can also create your
337 own codec classes.
338
339 The supported encodings are:
340
341 \list
342 \li \l{Big5 Text Codec}{Big5}
343 \li \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
344 \li CP949
345 \li \l{EUC-JP Text Codec}{EUC-JP}
346 \li \l{EUC-KR Text Codec}{EUC-KR}
347 \li \l{GBK Text Codec}{GB18030}
348 \li HP-ROMAN8
349 \li IBM 850
350 \li IBM 866
351 \li IBM 874
352 \li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
353 \li ISO 8859-1 to 10
354 \li ISO 8859-13 to 16
355 \li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
356 \li KOI8-R
357 \li KOI8-U
358 \li Macintosh
359 \li \l{Shift-JIS Text Codec}{Shift-JIS}
360 \li TIS-620
361 \li \l{TSCII Text Codec}{TSCII}
362 \li UTF-8
363 \li UTF-16
364 \li UTF-16BE
365 \li UTF-16LE
366 \li UTF-32
367 \li UTF-32BE
368 \li UTF-32LE
369 \li Windows-1250 to 1258
370 \endlist
371
372 If Qt is compiled with ICU support enabled, most codecs supported by
373 ICU will also be available to the application.
374
375 \l {QTextCodec}s can be used as follows to convert some locally encoded
376 string to Unicode. Suppose you have some string encoded in Russian
377 KOI8-R encoding, and want to convert it to Unicode. The simple way
378 to do it is like this:
379
380 \snippet code/src_corelib_codecs_qtextcodec.cpp 0
381
382 After this, \c string holds the text converted to Unicode.
383 Converting a string from Unicode to the local encoding is just as
384 easy:
385
386 \snippet code/src_corelib_codecs_qtextcodec.cpp 1
387
388 Some care must be taken when trying to convert the data in chunks,
389 for example, when receiving it over a network. In such cases it is
390 possible that a multi-byte character will be split over two
391 chunks. At best this might result in the loss of a character and
392 at worst cause the entire conversion to fail.
393
394 The approach to use in these situations is to create a QTextDecoder
395 object for the codec and use this QTextDecoder for the whole
396 decoding process, as shown below:
397
398 \snippet code/src_corelib_codecs_qtextcodec.cpp 2
399
400 The QTextDecoder object maintains state between chunks and therefore
401 works correctly even if a multi-byte character is split between
402 chunks.
403
404 \section1 Creating Your Own Codec Class
405
406 Support for new text encodings can be added to Qt by creating
407 QTextCodec subclasses.
408
409 The pure virtual functions describe the encoder to the system and
410 the coder is used as required in the different text file formats
411 supported by QTextStream, and under X11, for the locale-specific
412 character input and output.
413
414 To add support for another encoding to Qt, make a subclass of
415 QTextCodec and implement the functions listed in the table below.
416
417 \table
418 \header \li Function \li Description
419
420 \row \li name()
421 \li Returns the official name for the encoding. If the
422 encoding is listed in the
423 \l{IANA character-sets encoding file}, the name
424 should be the preferred MIME name for the encoding.
425
426 \row \li aliases()
427 \li Returns a list of alternative names for the encoding.
428 QTextCodec provides a default implementation that returns
429 an empty list. For example, "ISO-8859-1" has "latin1",
430 "CP819", "IBM819", and "iso-ir-100" as aliases.
431
432 \row \li \l{QTextCodec::mibEnum()}{mibEnum()}
433 \li Return the MIB enum for the encoding if it is listed in
434 the \l{IANA character-sets encoding file}.
435
436 \row \li convertToUnicode()
437 \li Converts an 8-bit character string to Unicode.
438
439 \row \li convertFromUnicode()
440 \li Converts a Unicode string to an 8-bit character string.
441 \endtable
442
443 \sa QTextStream, QTextDecoder, QTextEncoder
444*/
445
446/*!
447 Constructs a QTextCodec, and gives it the highest precedence. The
448 QTextCodec should always be constructed on the heap (i.e. with \c
449 new). Qt takes ownership and will delete it when the application
450 terminates.
451*/
452QTextCodec::QTextCodec()
453{
454 const TextCodecsMutexLocker locker;
455
456 QTextCodecData *globalInstance = QTextCodecData::instance();
457 if (globalInstance->allCodecs.isEmpty())
458 setup();
459
460 globalInstance->allCodecs.prepend(t: this);
461}
462
463
464/*!
465 \nonreentrant
466
467 Destroys the QTextCodec. Note that you should not delete codecs
468 yourself: once created they become Qt's responsibility.
469*/
470QTextCodec::~QTextCodec()
471{
472 QTextCodecData *globalData = QTextCodecData::instance();
473 if (!globalData)
474 return;
475
476 globalData->codecForLocale.testAndSetRelaxed(expectedValue: this, newValue: nullptr);
477
478 const TextCodecsMutexLocker locker;
479
480 globalData->allCodecs.removeOne(t: this);
481
482 auto it = globalData->codecCache.begin();
483
484 while (it != globalData->codecCache.end()) {
485 if (it.value() == this)
486 it = globalData->codecCache.erase(it);
487 else
488 ++it;
489 }
490}
491
492/*!
493 \fn QTextCodec *QTextCodec::codecForName(const char *name)
494
495 Searches all installed QTextCodec objects and returns the one
496 which best matches \a name; the match is case-insensitive. Returns
497 \nullptr if no codec matching the name \a name could be found.
498*/
499
500/*!
501 \threadsafe
502 Searches all installed QTextCodec objects and returns the one
503 which best matches \a name; the match is case-insensitive. Returns
504 \nullptr if no codec matching the name \a name could be found.
505*/
506QTextCodec *QTextCodec::codecForName(const QByteArray &name)
507{
508 if (name.isEmpty())
509 return nullptr;
510
511 const TextCodecsMutexLocker locker;
512
513 QTextCodecData *globalData = QTextCodecData::instance();
514 if (!globalData)
515 return nullptr;
516 setup();
517
518#if !QT_CONFIG(icu)
519 QTextCodecCache *cache = &globalData->codecCache;
520 QTextCodec *codec;
521 codec = cache->value(name);
522 if (codec)
523 return codec;
524
525 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
526 QTextCodec *cursor = *it;
527 if (qTextCodecNameMatch(cursor->name(), name)) {
528 if (cache)
529 cache->insert(name, cursor);
530 return cursor;
531 }
532 QList<QByteArray> aliases = cursor->aliases();
533 for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) {
534 if (qTextCodecNameMatch(*ait, name)) {
535 cache->insert(name, cursor);
536 return cursor;
537 }
538 }
539 }
540
541 return nullptr;
542#else
543 return QIcuCodec::codecForNameUnlocked(name);
544#endif
545}
546
547
548/*!
549 \threadsafe
550 Returns the QTextCodec which matches the
551 \l{QTextCodec::mibEnum()}{MIBenum} \a mib.
552*/
553QTextCodec* QTextCodec::codecForMib(int mib)
554{
555 const TextCodecsMutexLocker locker;
556
557 QTextCodecData *globalData = QTextCodecData::instance();
558 if (!globalData)
559 return nullptr;
560 if (globalData->allCodecs.isEmpty())
561 setup();
562
563 QByteArray key = "MIB: " + QByteArray::number(mib);
564
565 QTextCodecCache *cache = &globalData->codecCache;
566 QTextCodec *codec;
567 if (cache) {
568 codec = cache->value(key);
569 if (codec)
570 return codec;
571 }
572
573 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
574 QTextCodec *cursor = *it;
575 if (cursor->mibEnum() == mib) {
576 if (cache)
577 cache->insert(key, value: cursor);
578 return cursor;
579 }
580 }
581
582#if QT_CONFIG(icu)
583 return QIcuCodec::codecForMibUnlocked(mib);
584#else
585 return nullptr;
586#endif
587}
588
589/*!
590 \threadsafe
591 Returns the list of all available codecs, by name. Call
592 QTextCodec::codecForName() to obtain the QTextCodec for the name.
593
594 The list may contain many mentions of the same codec
595 if the codec has aliases.
596
597 \sa availableMibs(), name(), aliases()
598*/
599QList<QByteArray> QTextCodec::availableCodecs()
600{
601 const TextCodecsMutexLocker locker;
602
603 QTextCodecData *globalData = QTextCodecData::instance();
604 if (globalData->allCodecs.isEmpty())
605 setup();
606
607 QList<QByteArray> codecs;
608
609 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
610 codecs += (*it)->name();
611 codecs += (*it)->aliases();
612 }
613
614#if QT_CONFIG(icu)
615 codecs += QIcuCodec::availableCodecs();
616#endif
617
618 return codecs;
619}
620
621/*!
622 \threadsafe
623 Returns the list of MIBs for all available codecs. Call
624 QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
625
626 \sa availableCodecs(), mibEnum()
627*/
628QList<int> QTextCodec::availableMibs()
629{
630#if QT_CONFIG(icu)
631 return QIcuCodec::availableMibs();
632#else
633 const TextCodecsMutexLocker locker;
634
635 QTextCodecData *globalData = QTextCodecData::instance();
636 if (globalData->allCodecs.isEmpty())
637 setup();
638
639 QList<int> codecs;
640
641 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it)
642 codecs += (*it)->mibEnum();
643
644 return codecs;
645#endif
646}
647
648/*!
649 \nonreentrant
650
651 Set the codec to \a c; this will be returned by
652 codecForLocale(). If \a c is \nullptr, the codec is reset to
653 the default.
654
655 This might be needed for some applications that want to use their
656 own mechanism for setting the locale.
657
658 \sa codecForLocale()
659*/
660void QTextCodec::setCodecForLocale(QTextCodec *c)
661{
662 QTextCodecData::instance()->codecForLocale.storeRelease(newValue: c);
663}
664
665/*!
666 \threadsafe
667 Returns a pointer to the codec most suitable for this locale.
668
669 The codec will be retrieved from ICU where that backend is in use, otherwise
670 it may be obtained from an OS-specific API. In the latter case, the codec's
671 name may be "System".
672*/
673
674QTextCodec* QTextCodec::codecForLocale()
675{
676 QTextCodecData *globalData = QTextCodecData::instance();
677 if (!globalData)
678 return nullptr;
679
680 QTextCodec *codec = globalData->codecForLocale.loadAcquire();
681 if (!codec) {
682#if QT_CONFIG(icu)
683 const TextCodecsMutexLocker locker;
684 codec = QIcuCodec::defaultCodecUnlocked();
685#else
686 // setupLocaleMapper locks as necessary
687 codec = setupLocaleMapper();
688#endif
689 }
690
691 return codec;
692}
693
694
695/*!
696 \fn QByteArray QTextCodec::name() const
697
698 QTextCodec subclasses must reimplement this function. It returns
699 the name of the encoding supported by the subclass.
700
701 If the codec is registered as a character set in the
702 \l{IANA character-sets encoding file} this method should
703 return the preferred mime name for the codec if defined,
704 otherwise its name.
705*/
706
707/*!
708 \fn int QTextCodec::mibEnum() const
709
710 Subclasses of QTextCodec must reimplement this function. It
711 returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file}
712 for more information). It is important that each QTextCodec
713 subclass returns the correct unique value for this function.
714*/
715
716/*!
717 Subclasses can return a number of aliases for the codec in question.
718
719 Standard aliases for codecs can be found in the
720 \l{IANA character-sets encoding file}.
721*/
722QList<QByteArray> QTextCodec::aliases() const
723{
724 return QList<QByteArray>();
725}
726
727/*!
728 \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
729 ConverterState *state) const
730
731 QTextCodec subclasses must reimplement this function.
732
733 Converts the first \a len characters of \a chars from the
734 encoding of the subclass to Unicode, and returns the result in a
735 QString.
736
737 \a state can be \nullptr, in which case the conversion is stateless and
738 default conversion rules should be used. If \a state is not \nullptr, the
739 codec should save the state after the conversion in \a state, and
740 adjust the \c remainingChars and \c invalidChars members of the struct.
741*/
742
743/*!
744 \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
745 ConverterState *state) const
746
747 QTextCodec subclasses must reimplement this function.
748
749 Converts the first \a number of characters from the \a input array
750 from Unicode to the encoding of the subclass, and returns the result
751 in a QByteArray.
752
753 \a state can be \nullptr in which case the conversion is stateless and
754 default conversion rules should be used. If \a state is not \nullptr, the
755 codec should save the state after the conversion in \a state, and
756 adjust the \c remainingChars and \c invalidChars members of the struct.
757*/
758
759/*!
    Creates a QTextDecoder with the specified \a flags to decode chunks
    of \c{char *} data into chunks of Unicode data.
762
763 The caller is responsible for deleting the returned object.
764
765 \since 4.7
766*/
767QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const
768{
769 return new QTextDecoder(this, flags);
770}
771
772/*!
    Creates a QTextEncoder with the specified \a flags to encode chunks
    of Unicode data as \c{char *} data.
775
776 The caller is responsible for deleting the returned object.
777
778 \since 4.7
779*/
780QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const
781{
782 return new QTextEncoder(this, flags);
783}
784
785/*!
786 \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
787 ConverterState *state) const
788
789 Converts the first \a number of characters from the \a input array
790 from Unicode to the encoding of this codec, and returns the result
791 in a QByteArray.
792
793 The \a state of the convertor used is updated.
794*/
795
796/*!
797 Converts \a str from Unicode to the encoding of this codec, and
798 returns the result in a QByteArray.
799*/
800QByteArray QTextCodec::fromUnicode(const QString& str) const
801{
802 return convertFromUnicode(in: str.constData(), length: str.size(), state: nullptr);
803}
804
805/*!
806 \overload
807 \since 5.10
808
809 Converts \a str from Unicode to the encoding of this codec, and
810 returns the result in a QByteArray.
811*/
812QByteArray QTextCodec::fromUnicode(QStringView str) const
813{
814 return convertFromUnicode(in: str.data(), length: str.size(), state: nullptr);
815}
816
817/*!
818 \fn QString QTextCodec::toUnicode(const char *input, int size,
819 ConverterState *state) const
820
821 Converts the first \a size characters from the \a input from the
822 encoding of this codec to Unicode, and returns the result in a
823 QString.
824
825 The \a state of the convertor used is updated.
826*/
827
828/*!
829 Converts \a a from the encoding of this codec to Unicode, and
830 returns the result in a QString.
831*/
832QString QTextCodec::toUnicode(const QByteArray& a) const
833{
834 return convertToUnicode(in: a.constData(), length: a.size(), state: nullptr);
835}
836
837/*!
838 Returns \c true if the Unicode character \a ch can be fully encoded
839 with this codec; otherwise returns \c false.
840*/
841bool QTextCodec::canEncode(QChar ch) const
842{
843 ConverterState state;
844 state.flags = ConvertInvalidToNull;
845 convertFromUnicode(in: &ch, length: 1, state: &state);
846 return (state.invalidChars == 0);
847}
848
849/*!
850 \overload
851
852 \a s contains the string being tested for encode-ability.
853*/
854bool QTextCodec::canEncode(const QString& s) const
855{
856 ConverterState state;
857 state.flags = ConvertInvalidToNull;
858 convertFromUnicode(in: s.constData(), length: s.size(), state: &state);
859 return (state.invalidChars == 0);
860}
861
862/*!
863 \overload
864 \since 5.10
865
866 Returns \c true if the Unicode string \a s can be fully encoded
867 with this codec; otherwise returns \c false.
868*/
869bool QTextCodec::canEncode(QStringView s) const
870{
871 ConverterState state;
872 state.flags = ConvertInvalidToNull;
873 convertFromUnicode(in: s.data(), length: s.size(), state: &state);
874 return !state.invalidChars;
875}
876/*!
877 \overload
878
879 \a chars contains the source characters.
880*/
881QString QTextCodec::toUnicode(const char *chars) const
882{
883 const auto len = int(qstrlen(str: chars));
884 return convertToUnicode(in: chars, length: len, state: nullptr);
885}
886
887
888/*!
889 \class QTextEncoder
890 \inmodule QtCore5Compat
891 \brief The QTextEncoder class provides a state-based encoder.
892 \reentrant
893 \ingroup i18n
894
895 A text encoder converts text from Unicode into an encoded text format
896 using a specific codec.
897
898 The encoder converts Unicode into another format, remembering any
899 state that is required between calls.
900
901 \sa QTextCodec::makeEncoder(), QTextDecoder
902*/
903
904/*!
905 \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
906
907 Constructs a text encoder for the given \a codec.
908*/
909
910/*!
911 Constructs a text encoder for the given \a codec and conversion \a flags.
912
913 \since 4.7
914*/
915QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
916 : c(codec), state()
917{
918 state.flags = flags;
919}
920
921/*!
922 Destroys the encoder.
923*/
924QTextEncoder::~QTextEncoder()
925{
926}
927
928/*!
929 \internal
930 \since 4.5
931 Determines whether the encoder encountered a failure while decoding the input. If
932 an error was encountered, the produced result is undefined, and gets converted as according
933 to the conversion flags.
934 */
935bool QTextEncoder::hasFailure() const
936{
937 return state.invalidChars != 0;
938}
939
940/*!
941 Converts the Unicode string \a str into an encoded QByteArray.
942*/
943QByteArray QTextEncoder::fromUnicode(const QString& str)
944{
945 return c->fromUnicode(in: str.constData(), length: str.size(), state: &state);
946}
947
948/*!
949 \overload
950 \since 5.10
951 Converts the Unicode string \a str into an encoded QByteArray.
952*/
953QByteArray QTextEncoder::fromUnicode(QStringView str)
954{
955 return c->fromUnicode(in: str.data(), length: str.size(), state: &state);
956}
957
958/*!
959 \overload
960
961 Converts \a len characters (not bytes) from \a uc, and returns the
962 result in a QByteArray.
963*/
964QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
965{
966 return c->fromUnicode(in: uc, length: len, state: &state);
967}
968
969/*!
970 \class QTextDecoder
971 \inmodule QtCore5Compat
972 \brief The QTextDecoder class provides a state-based decoder.
973 \reentrant
974 \ingroup i18n
975
976 A text decoder converts text from an encoded text format into Unicode
977 using a specific codec.
978
979 The decoder converts text in this format into Unicode, remembering any
980 state that is required between calls.
981
982 \sa QTextCodec::makeDecoder(), QTextEncoder
983*/
984
985/*!
986 \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
987
988 Constructs a text decoder for the given \a codec.
989*/
990
991/*!
992 Constructs a text decoder for the given \a codec and conversion \a flags.
993
994 \since 4.7
995*/
996
997QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
998 : c(codec), state()
999{
1000 state.flags = flags;
1001}
1002
1003/*!
1004 Destroys the decoder.
1005*/
1006QTextDecoder::~QTextDecoder()
1007{
1008}
1009
1010/*!
1011 \fn QString QTextDecoder::toUnicode(const char *chars, int len)
1012
1013 Converts the first \a len bytes in \a chars to Unicode, returning
1014 the result.
1015
1016 If not all characters are used (e.g. if only part of a multi-byte
1017 encoding is at the end of the characters), the decoder remembers
1018 enough state to continue with the next call to this function.
1019*/
1020QString QTextDecoder::toUnicode(const char *chars, int len)
1021{
1022 return c->toUnicode(in: chars, length: len, state: &state);
1023}
1024
1025/*! \overload
1026
1027 The converted string is returned in \a target.
1028 */
1029void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
1030{
1031 Q_ASSERT(target);
1032 switch (c->mibEnum()) {
1033 case 106: // utf8
1034 static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1035 break;
1036 case 4: // latin1
1037 target->resize(size: len);
1038 qt_from_latin1(dst: (char16_t*)target->data(), str: chars, size: len);
1039 break;
1040 default:
1041 *target = c->toUnicode(in: chars, length: len, state: &state);
1042 }
1043}
1044
1045
1046/*!
1047 \overload
1048
1049 Converts the bytes in the byte array specified by \a ba to Unicode
1050 and returns the result.
1051*/
1052QString QTextDecoder::toUnicode(const QByteArray &ba)
1053{
1054 return c->toUnicode(in: ba.constData(), length: ba.size(), state: &state);
1055}
1056
1057/*!
1058 \since 4.4
1059
1060 Tries to detect the encoding of the provided snippet of HTML in
1061 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1062 and the content-type meta header and returns a QTextCodec instance
1063 that is capable of decoding the html to unicode. If the codec
1064 cannot be detected from the content provided, \a defaultCodec is
1065 returned.
1066
1067 \sa codecForUtfText()
1068*/
1069QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
1070{
1071 // determine charset
1072 QTextCodec *c = QTextCodec::codecForUtfText(ba, defaultCodec: nullptr);
1073 if (!c) {
1074 static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "meta ");
1075 QByteArray header = ba.left(len: 1024).toLower();
1076 qsizetype pos = matcher.indexIn(haystack: header);
1077 if (pos != -1) {
1078 static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "charset=");
1079 pos = matcher.indexIn(haystack: header, from: pos);
1080 if (pos != -1) {
1081 pos += qstrlen(str: "charset=");
1082
1083 qsizetype pos2 = pos;
1084 // The attribute can be closed with either """, "'", ">" or "/",
1085 // none of which are valid charset characters.
1086 while (++pos2 < header.size()) {
1087 char ch = header.at(i: pos2);
1088 if (ch == '\"' || ch == '\'' || ch == '>') {
1089 QByteArray name = header.mid(index: pos, len: pos2 - pos);
1090 if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
1091 name = QByteArrayLiteral("UTF-8");
1092 c = QTextCodec::codecForName(name);
1093 return c ? c : defaultCodec;
1094 }
1095 }
1096 }
1097 }
1098 }
1099 if (!c)
1100 c = defaultCodec;
1101
1102 return c;
1103}
1104
1105/*!
1106 \overload
1107
1108 Tries to detect the encoding of the provided snippet of HTML in
1109 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1110 and the content-type meta header and returns a QTextCodec instance
1111 that is capable of decoding the html to unicode. If the codec cannot
1112 be detected, this overload returns a Latin-1 QTextCodec.
1113*/
1114QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1115{
1116 return codecForHtml(ba, defaultCodec: QTextCodec::codecForName(name: "ISO-8859-1"));
1117}
1118
1119/*!
1120 \since 4.6
1121
1122 Tries to detect the encoding of the provided snippet \a ba by
1123 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1124 that is capable of decoding the text to unicode. This function can
1125 detect one of the following codecs:
1126
1127 \list
1128 \li UTF-32 Little Endian
1129 \li UTF-32 Big Endian
1130 \li UTF-16 Little Endian
1131 \li UTF-16 Big Endian
1132 \li UTF-8
1133 \endlist
1134
1135 If the codec cannot be detected from the content provided, \a defaultCodec
1136 is returned.
1137
1138 \sa codecForHtml()
1139*/
1140QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
1141{
1142 const int arraySize = ba.size();
1143 const uchar *buf = reinterpret_cast<const uchar *>(ba.constData());
1144 const uint bom = 0xfeff;
1145
1146 if (arraySize > 3) {
1147 uint uc = qFromUnaligned<uint>(src: buf);
1148 if (uc == qToBigEndian(source: bom))
1149 return QTextCodec::codecForMib(mib: 1018); // utf-32 be
1150 else if (uc == qToLittleEndian(source: bom))
1151 return QTextCodec::codecForMib(mib: 1019); // utf-32 le
1152 }
1153
1154 if (arraySize < 2)
1155 return defaultCodec;
1156
1157 ushort uc = qFromUnaligned<ushort>(src: buf);
1158 if (uc == qToBigEndian(source: ushort(bom)))
1159 return QTextCodec::codecForMib(mib: 1013); // utf16 be
1160 else if (uc == qToLittleEndian(source: ushort(bom)))
1161 return QTextCodec::codecForMib(mib: 1014); // utf16 le
1162
1163 if (arraySize < 3)
1164 return defaultCodec;
1165
1166 static const char utf8bom[] = "\xef\xbb\xbf";
1167 if (memcmp(s1: buf, s2: utf8bom, n: sizeof(utf8bom) - 1) == 0)
1168 return QTextCodec::codecForMib(mib: 106); // utf-8
1169
1170 return defaultCodec;
1171}
1172
1173/*!
1174 \overload
1175
1176 Tries to detect the encoding of the provided snippet \a ba by
1177 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1178 that is capable of decoding the text to unicode. This function can
1179 detect one of the following codecs:
1180
1181 \list
1182 \li UTF-32 Little Endian
1183 \li UTF-32 Big Endian
1184 \li UTF-16 Little Endian
1185 \li UTF-16 Big Endian
1186 \li UTF-8
1187 \endlist
1188
1189 If the codec cannot be detected from the content provided, this overload
1190 returns a Latin-1 QTextCodec.
1191
1192 \sa codecForHtml()
1193*/
1194QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
1195{
1196 return codecForUtfText(ba, defaultCodec: QTextCodec::codecForMib(/*Latin 1*/ mib: 4));
1197}
1198
1199/*!
1200 \fn QTextCodec *QTextCodec::codecForTr ()
1201 \deprecated
1202
1203 Returns the codec used by QObject::tr() on its argument. If this
1204 function returns \nullptr (the default), tr() assumes Latin-1.
1205*/
1206
1207/*!
1208 \internal
1209 \since 4.3
1210 Determines whether the decoder encountered a failure while decoding the
1211 input. If an error was encountered, the produced result is undefined, and
1212 gets converted as according to the conversion flags.
1213 */
1214bool QTextDecoder::hasFailure() const
1215{
1216 return state.invalidChars != 0;
1217}
1218
1219/*!
1220 \internal
1221 \since 5.12
1222
1223 Determines whether the decoder needs more bytes to continue decoding. That
1224 is, this signifies that the input string ended in the middle of a
1225 multi-byte sequence. Note that it's possible some codecs do not report this.
1226 */
1227bool QTextDecoder::needsMoreData() const
1228{
1229 return state.remainingChars;
1230}
1231
1232/*!
1233 \fn QTextCodec * Qt::codecForHtml(const QByteArray &ba)
1234 \internal
1235
1236 This function is defined in the \c <QTextCodec> header file.
1237*/
1238QTextCodec *Qt::codecForHtml(const QByteArray &ba)
1239{
1240 return QTextCodec::codecForHtml(ba);
1241}
1242
1243QT_END_NAMESPACE
1244

source code of qt5compat/src/core5/codecs/qtextcodec.cpp