1// Copyright (C) 2018 The Qt Company Ltd.
2// Copyright (C) 2018 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#include "qplatformdefs.h"
6
7#include "qtextcodec.h"
8#include "qtextcodec_p.h"
9
10#include "qbytearraymatcher.h"
11#include "qendian.h"
12#include "qfile.h"
13#include "qlist.h"
14#include <private/qlocking_p.h>
15#include "qstringlist.h"
16#include "qvarlengtharray.h"
17
18#include <private/qcoreapplication_p.h>
19
20#include "qutfcodec_p.h"
21#include "qlatincodec_p.h"
22
23#if QT_CONFIG(codecs)
24# include "qtsciicodec_p.h"
25# include "qisciicodec_p.h"
26#endif
27#if QT_CONFIG(icu)
28#include "qicucodec_p.h"
29#else
30#if QT_CONFIG(iconv)
31# include "qiconvcodec_p.h"
32#endif
33#ifdef Q_OS_WIN
34# include "qwindowscodec_p.h"
35#endif
36# include "qsimplecodec_p.h"
37#if QT_CONFIG(big_codecs) && QT_CONFIG(textcodec)
38# ifndef Q_OS_INTEGRITY
39# include "qgb18030codec_p.h"
40# include "qeucjpcodec_p.h"
41# include "qjiscodec_p.h"
42# include "qsjiscodec_p.h"
43# include "qeuckrcodec_p.h"
44# include "qbig5codec_p.h"
45# endif // !Q_OS_INTEGRITY
46#endif // big_codecs
47
48#endif // icu
49
50#include <mutex>
51
52#include <stdlib.h>
53#include <ctype.h>
54#include <locale.h>
55#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID)
56# include <langinfo.h>
57#endif
58
59QT_BEGIN_NAMESPACE
60
61// in qstring.cpp:
62void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept;
63
64typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
65typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
66
67Q_GLOBAL_STATIC(QRecursiveMutex, textCodecsMutex);
68
69Q_GLOBAL_STATIC(QTextCodecData, textCodecData)
70
71QTextCodecData::QTextCodecData()
72 : codecForLocale(nullptr)
73{
74}
75
76QTextCodecData::~QTextCodecData()
77{
78 codecForLocale.storeRelease(newValue: nullptr);
79 QList<QTextCodec *> tmp = allCodecs;
80 allCodecs.clear();
81 codecCache.clear();
82 for (QList<QTextCodec *>::const_iterator it = tmp.constBegin(); it != tmp.constEnd(); ++it)
83 delete *it;
84}
85
86QTextCodecData *QTextCodecData::instance()
87{
88 return textCodecData();
89}
90
91class TextCodecsMutexLocker
92{
93 using Lock = decltype(qt_unique_lock(mutex&: std::declval<QRecursiveMutex&>()));
94 // ### FIXME: this is used when textCodecsMutex already == nullptr
95 const Lock lock = qt_unique_lock(mutex: textCodecsMutex());
96public:
97 TextCodecsMutexLocker() {} // required d/t an ICC 19 bug
98};
99
100#if !QT_CONFIG(icu)
// Locale-independent lower-casing: only 'A'..'Z' are mapped (by adding 0x20),
// every other character is returned unchanged.
static char qtolower(char c)
{
    const bool isUpper = (c >= 'A') && (c <= 'Z');
    return isUpper ? char(c + 0x20) : c;
}
// Locale-independent test for '0'..'9', 'a'..'z' and 'A'..'Z'.
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;

    // Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z', so one range check covers both cases.
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
105
106bool qTextCodecNameMatch(const char *n, const char *h)
107{
108 if (qstricmp(n, h) == 0)
109 return true;
110
111 // if the letters and numbers are the same, we have a match
112 while (*n != '\0') {
113 if (qisalnum(*n)) {
114 for (;;) {
115 if (*h == '\0')
116 return false;
117 if (qisalnum(*h))
118 break;
119 ++h;
120 }
121 if (qtolower(*n) != qtolower(*h))
122 return false;
123 ++h;
124 }
125 ++n;
126 }
127 while (*h && !qisalnum(*h))
128 ++h;
129 return (*h == '\0');
130}
131
132
133#if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8)
134static QTextCodec *checkForCodec(const QByteArray &name) {
135 QTextCodec *c = QTextCodec::codecForName(name);
136 if (!c) {
137 const int index = name.indexOf('@');
138 if (index != -1) {
139 c = QTextCodec::codecForName(name.left(index));
140 }
141 }
142 return c;
143}
144#endif
145
146static void setup();
147
148// \threadsafe
149// this returns the codec the method sets up as locale codec to
150// avoid a race condition in codecForLocale() when
151// setCodecForLocale(nullptr) is called at the same time.
152static QTextCodec *setupLocaleMapper()
153{
154 QTextCodecData *globalData = QTextCodecData::instance();
155
156 QTextCodec *locale = nullptr;
157
158 {
159 const TextCodecsMutexLocker locker;
160 if (globalData->allCodecs.isEmpty())
161 setup();
162 }
163
164 QCoreApplicationPrivate::initLocale();
165
166#if defined(QT_LOCALE_IS_UTF8)
167 locale = QTextCodec::codecForName("UTF-8");
168#elif defined(Q_OS_WIN)
169 locale = QTextCodec::codecForName("System");
170#else
171
172 // First try getting the codecs name from nl_langinfo and see
173 // if we have a builtin codec for it.
174 // Only fall back to using iconv if we can't find a builtin codec
175 // This is because the builtin utf8 codec is around 5 times faster
176 // then the using QIconvCodec
177
178#if defined (_XOPEN_UNIX)
179 char *charset = nl_langinfo(CODESET);
180 if (charset)
181 locale = QTextCodec::codecForName(charset);
182#endif
183#if QT_CONFIG(iconv)
184 if (!locale) {
185 // no builtin codec for the locale found, let's try using iconv
186 (void) new QIconvCodec();
187 locale = QTextCodec::codecForName("System");
188 }
189#endif
190
191 if (!locale) {
192 // Very poorly defined and followed standards causes lots of
193 // code to try to get all the cases... This logic is
194 // duplicated in QIconvCodec, so if you change it here, change
195 // it there too.
196
197 // Try to determine locale codeset from locale name assigned to
198 // LC_CTYPE category.
199
200 // First part is getting that locale name. First try setlocale() which
201 // definitely knows it, but since we cannot fully trust it, get ready
202 // to fall back to environment variables.
203 const QByteArray ctype = setlocale(LC_CTYPE, nullptr);
204
205 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
206 // environment variables.
207 QByteArray lang = qgetenv("LC_ALL");
208 if (lang.isEmpty() || lang == "C") {
209 lang = qgetenv("LC_CTYPE");
210 }
211 if (lang.isEmpty() || lang == "C") {
212 lang = qgetenv("LANG");
213 }
214
215 // Now try these in order:
216 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
217 // 2. CODESET from lang if it contains a .CODESET part
218 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
219 // 4. locale (ditto)
220 // 5. check for "@euro"
221 // 6. guess locale from ctype unless ctype is "C"
222 // 7. guess locale from lang
223
224 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
225 int indexOfDot = ctype.indexOf('.');
226 if (indexOfDot != -1)
227 locale = checkForCodec( ctype.mid(indexOfDot + 1) );
228
229 // 2. CODESET from lang if it contains a .CODESET part
230 if (!locale) {
231 indexOfDot = lang.indexOf('.');
232 if (indexOfDot != -1)
233 locale = checkForCodec( lang.mid(indexOfDot + 1) );
234 }
235
236 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
237 if (!locale && !ctype.isEmpty() && ctype != "C")
238 locale = checkForCodec(ctype);
239
240 // 4. locale (ditto)
241 if (!locale && !lang.isEmpty())
242 locale = checkForCodec(lang);
243
244 // 5. "@euro"
245 if ((!locale && ctype.contains("@euro")) || lang.contains("@euro"))
246 locale = checkForCodec("ISO 8859-15");
247 }
248
249#endif
250 // If everything failed, we default to 8859-1
251 if (!locale)
252 locale = QTextCodec::codecForName("ISO 8859-1");
253 globalData->codecForLocale.storeRelease(locale);
254 return locale;
255}
256
257
258// textCodecsMutex need to be locked to enter this function
259static void setup()
260{
261 static bool initialized = false;
262 if (initialized)
263 return;
264 initialized = true;
265
266#if QT_CONFIG(codecs)
267 (void)new QTsciiCodec;
268 for (int i = 0; i < 9; ++i)
269 (void)new QIsciiCodec(i);
270 for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
271 (void)new QSimpleTextCodec(i);
272
273# if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY)
274 (void)new QGb18030Codec;
275 (void)new QGbkCodec;
276 (void)new QGb2312Codec;
277 (void)new QEucJpCodec;
278 (void)new QJisCodec;
279 (void)new QSjisCodec;
280 (void)new QEucKrCodec;
281 (void)new QCP949Codec;
282 (void)new QBig5Codec;
283 (void)new QBig5hkscsCodec;
284# endif // big_codecs && !Q_OS_INTEGRITY
285#if QT_CONFIG(iconv)
286 (void) new QIconvCodec;
287#endif
288#if defined(Q_OS_WIN32)
289 (void) new QWindowsLocalCodec;
290#endif // Q_OS_WIN32
291#endif // codecs
292
293 (void)new QUtf16Codec;
294 (void)new QUtf16BECodec;
295 (void)new QUtf16LECodec;
296 (void)new QUtf32Codec;
297 (void)new QUtf32BECodec;
298 (void)new QUtf32LECodec;
299 (void)new QLatin15Codec;
300 (void)new QLatin1Codec;
301 (void)new QUtf8Codec;
302}
303#else
// ICU build: nothing is registered up front; this empty stub keeps the
// callers of setup() compiling.
static void setup()
{
}
305#endif // icu
306
307/*!
308 \class QTextCodec
309 \inmodule QtCore5Compat
310 \brief The QTextCodec class provides conversions between text encodings.
311 \reentrant
312 \ingroup i18n
313
314 Qt uses Unicode to store, draw and manipulate strings. In many
315 situations you may wish to deal with data that uses a different
316 encoding. For example, most Japanese documents are still stored
317 in Shift-JIS or ISO 2022-JP, while Russian users often have their
318 documents in KOI8-R or Windows-1251.
319
320 Qt provides a set of QTextCodec classes to help with converting
321 non-Unicode formats to and from Unicode. You can also create your
322 own codec classes.
323
324 The supported encodings are:
325
326 \list
327 \li \l{Big5 Text Codec}{Big5}
328 \li \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
329 \li CP949
330 \li \l{EUC-JP Text Codec}{EUC-JP}
331 \li \l{EUC-KR Text Codec}{EUC-KR}
332 \li \l{GBK Text Codec}{GB18030}
333 \li HP-ROMAN8
334 \li IBM 850
335 \li IBM 866
336 \li IBM 874
337 \li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
338 \li ISO 8859-1 to 10
339 \li ISO 8859-13 to 16
340 \li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
341 \li KOI8-R
342 \li KOI8-U
343 \li Macintosh
344 \li \l{Shift-JIS Text Codec}{Shift-JIS}
345 \li TIS-620
346 \li \l{TSCII Text Codec}{TSCII}
347 \li UTF-8
348 \li UTF-16
349 \li UTF-16BE
350 \li UTF-16LE
351 \li UTF-32
352 \li UTF-32BE
353 \li UTF-32LE
354 \li Windows-1250 to 1258
355 \endlist
356
357 If Qt is compiled with ICU support enabled, most codecs supported by
358 ICU will also be available to the application.
359
360 \l {QTextCodec}s can be used as follows to convert some locally encoded
361 string to Unicode. Suppose you have some string encoded in Russian
362 KOI8-R encoding, and want to convert it to Unicode. The simple way
363 to do it is like this:
364
365 \snippet code/src_corelib_codecs_qtextcodec.cpp 0
366
367 After this, \c string holds the text converted to Unicode.
368 Converting a string from Unicode to the local encoding is just as
369 easy:
370
371 \snippet code/src_corelib_codecs_qtextcodec.cpp 1
372
373 Some care must be taken when trying to convert the data in chunks,
374 for example, when receiving it over a network. In such cases it is
375 possible that a multi-byte character will be split over two
376 chunks. At best this might result in the loss of a character and
377 at worst cause the entire conversion to fail.
378
379 The approach to use in these situations is to create a QTextDecoder
380 object for the codec and use this QTextDecoder for the whole
381 decoding process, as shown below:
382
383 \snippet code/src_corelib_codecs_qtextcodec.cpp 2
384
385 The QTextDecoder object maintains state between chunks and therefore
386 works correctly even if a multi-byte character is split between
387 chunks.
388
389 \section1 Creating Your Own Codec Class
390
391 Support for new text encodings can be added to Qt by creating
392 QTextCodec subclasses.
393
394 The pure virtual functions describe the encoder to the system and
395 the coder is used as required in the different text file formats
396 supported by QTextStream, and under X11, for the locale-specific
397 character input and output.
398
399 To add support for another encoding to Qt, make a subclass of
400 QTextCodec and implement the functions listed in the table below.
401
402 \table
403 \header \li Function \li Description
404
405 \row \li name()
406 \li Returns the official name for the encoding. If the
407 encoding is listed in the
408 \l{IANA character-sets encoding file}, the name
409 should be the preferred MIME name for the encoding.
410
411 \row \li aliases()
412 \li Returns a list of alternative names for the encoding.
413 QTextCodec provides a default implementation that returns
414 an empty list. For example, "ISO-8859-1" has "latin1",
415 "CP819", "IBM819", and "iso-ir-100" as aliases.
416
417 \row \li \l{QTextCodec::mibEnum()}{mibEnum()}
418 \li Return the MIB enum for the encoding if it is listed in
419 the \l{IANA character-sets encoding file}.
420
421 \row \li convertToUnicode()
422 \li Converts an 8-bit character string to Unicode.
423
424 \row \li convertFromUnicode()
425 \li Converts a Unicode string to an 8-bit character string.
426 \endtable
427
428 \sa QTextStream, QTextDecoder, QTextEncoder
429*/
430
431/*!
432 Constructs a QTextCodec, and gives it the highest precedence. The
433 QTextCodec should always be constructed on the heap (i.e. with \c
434 new). Qt takes ownership and will delete it when the application
435 terminates.
436*/
437QTextCodec::QTextCodec()
438{
439 const TextCodecsMutexLocker locker;
440
441 QTextCodecData *globalInstance = QTextCodecData::instance();
442 if (globalInstance->allCodecs.isEmpty())
443 setup();
444
445 globalInstance->allCodecs.prepend(t: this);
446}
447
448
449/*!
450 \nonreentrant
451
452 Destroys the QTextCodec. Note that you should not delete codecs
453 yourself: once created they become Qt's responsibility.
454*/
455QTextCodec::~QTextCodec()
456{
457 QTextCodecData *globalData = QTextCodecData::instance();
458 if (!globalData)
459 return;
460
461 globalData->codecForLocale.testAndSetRelaxed(expectedValue: this, newValue: nullptr);
462
463 const TextCodecsMutexLocker locker;
464
465 globalData->allCodecs.removeOne(t: this);
466
467 auto it = globalData->codecCache.begin();
468
469 while (it != globalData->codecCache.end()) {
470 if (it.value() == this)
471 it = globalData->codecCache.erase(it);
472 else
473 ++it;
474 }
475}
476
477/*!
478 \fn QTextCodec *QTextCodec::codecForName(const char *name)
479
480 Searches all installed QTextCodec objects and returns the one
481 which best matches \a name; the match is case-insensitive. Returns
482 \nullptr if no codec matching the name \a name could be found.
483*/
484
485/*!
486 \threadsafe
487 Searches all installed QTextCodec objects and returns the one
488 which best matches \a name; the match is case-insensitive. Returns
489 \nullptr if no codec matching the name \a name could be found.
490*/
491QTextCodec *QTextCodec::codecForName(const QByteArray &name)
492{
493 if (name.isEmpty())
494 return nullptr;
495
496 const TextCodecsMutexLocker locker;
497
498 QTextCodecData *globalData = QTextCodecData::instance();
499 if (!globalData)
500 return nullptr;
501 setup();
502
503#if !QT_CONFIG(icu)
504 QTextCodecCache *cache = &globalData->codecCache;
505 QTextCodec *codec;
506 codec = cache->value(name);
507 if (codec)
508 return codec;
509
510 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
511 QTextCodec *cursor = *it;
512 if (qTextCodecNameMatch(cursor->name(), name)) {
513 if (cache)
514 cache->insert(name, cursor);
515 return cursor;
516 }
517 QList<QByteArray> aliases = cursor->aliases();
518 for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) {
519 if (qTextCodecNameMatch(*ait, name)) {
520 cache->insert(name, cursor);
521 return cursor;
522 }
523 }
524 }
525
526 return nullptr;
527#else
528 return QIcuCodec::codecForNameUnlocked(name);
529#endif
530}
531
532
533/*!
534 \threadsafe
535 Returns the QTextCodec which matches the
536 \l{QTextCodec::mibEnum()}{MIBenum} \a mib.
537*/
538QTextCodec* QTextCodec::codecForMib(int mib)
539{
540 const TextCodecsMutexLocker locker;
541
542 QTextCodecData *globalData = QTextCodecData::instance();
543 if (!globalData)
544 return nullptr;
545 if (globalData->allCodecs.isEmpty())
546 setup();
547
548 QByteArray key = "MIB: " + QByteArray::number(mib);
549
550 QTextCodecCache *cache = &globalData->codecCache;
551 QTextCodec *codec;
552 if (cache) {
553 codec = cache->value(key);
554 if (codec)
555 return codec;
556 }
557
558 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
559 QTextCodec *cursor = *it;
560 if (cursor->mibEnum() == mib) {
561 if (cache)
562 cache->insert(key, value: cursor);
563 return cursor;
564 }
565 }
566
567#if QT_CONFIG(icu)
568 return QIcuCodec::codecForMibUnlocked(mib);
569#else
570 return nullptr;
571#endif
572}
573
574/*!
575 \threadsafe
576 Returns the list of all available codecs, by name. Call
577 QTextCodec::codecForName() to obtain the QTextCodec for the name.
578
579 The list may contain many mentions of the same codec
580 if the codec has aliases.
581
582 \sa availableMibs(), name(), aliases()
583*/
584QList<QByteArray> QTextCodec::availableCodecs()
585{
586 const TextCodecsMutexLocker locker;
587
588 QTextCodecData *globalData = QTextCodecData::instance();
589 if (globalData->allCodecs.isEmpty())
590 setup();
591
592 QList<QByteArray> codecs;
593
594 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
595 codecs += (*it)->name();
596 codecs += (*it)->aliases();
597 }
598
599#if QT_CONFIG(icu)
600 codecs += QIcuCodec::availableCodecs();
601#endif
602
603 return codecs;
604}
605
606/*!
607 \threadsafe
608 Returns the list of MIBs for all available codecs. Call
609 QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
610
611 \sa availableCodecs(), mibEnum()
612*/
613QList<int> QTextCodec::availableMibs()
614{
615#if QT_CONFIG(icu)
616 return QIcuCodec::availableMibs();
617#else
618 const TextCodecsMutexLocker locker;
619
620 QTextCodecData *globalData = QTextCodecData::instance();
621 if (globalData->allCodecs.isEmpty())
622 setup();
623
624 QList<int> codecs;
625
626 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it)
627 codecs += (*it)->mibEnum();
628
629 return codecs;
630#endif
631}
632
633/*!
634 \nonreentrant
635
636 Set the codec to \a c; this will be returned by
637 codecForLocale(). If \a c is \nullptr, the codec is reset to
638 the default.
639
640 This might be needed for some applications that want to use their
641 own mechanism for setting the locale.
642
643 \sa codecForLocale()
644*/
645void QTextCodec::setCodecForLocale(QTextCodec *c)
646{
647 QTextCodecData::instance()->codecForLocale.storeRelease(newValue: c);
648}
649
650/*!
651 \threadsafe
652 Returns a pointer to the codec most suitable for this locale.
653
654 The codec will be retrieved from ICU where that backend is in use, otherwise
655 it may be obtained from an OS-specific API. In the latter case, the codec's
656 name may be "System".
657*/
658
659QTextCodec* QTextCodec::codecForLocale()
660{
661 QTextCodecData *globalData = QTextCodecData::instance();
662 if (!globalData)
663 return nullptr;
664
665 QTextCodec *codec = globalData->codecForLocale.loadAcquire();
666 if (!codec) {
667#if QT_CONFIG(icu)
668 const TextCodecsMutexLocker locker;
669 codec = QIcuCodec::defaultCodecUnlocked();
670#else
671 // setupLocaleMapper locks as necessary
672 codec = setupLocaleMapper();
673#endif
674 }
675
676 return codec;
677}
678
679
680/*!
681 \fn QByteArray QTextCodec::name() const
682
683 QTextCodec subclasses must reimplement this function. It returns
684 the name of the encoding supported by the subclass.
685
686 If the codec is registered as a character set in the
687 \l{IANA character-sets encoding file} this method should
688 return the preferred mime name for the codec if defined,
689 otherwise its name.
690*/
691
692/*!
693 \fn int QTextCodec::mibEnum() const
694
695 Subclasses of QTextCodec must reimplement this function. It
696 returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file}
697 for more information). It is important that each QTextCodec
698 subclass returns the correct unique value for this function.
699*/
700
701/*!
702 Subclasses can return a number of aliases for the codec in question.
703
704 Standard aliases for codecs can be found in the
705 \l{IANA character-sets encoding file}.
706*/
707QList<QByteArray> QTextCodec::aliases() const
708{
709 return QList<QByteArray>();
710}
711
712/*!
713 \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
714 ConverterState *state) const
715
716 QTextCodec subclasses must reimplement this function.
717
718 Converts the first \a len characters of \a chars from the
719 encoding of the subclass to Unicode, and returns the result in a
720 QString.
721
722 \a state can be \nullptr, in which case the conversion is stateless and
723 default conversion rules should be used. If \a state is not \nullptr, the
724 codec should save the state after the conversion in \a state, and
725 adjust the \c remainingChars and \c invalidChars members of the struct.
726*/
727
728/*!
729 \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
730 ConverterState *state) const
731
732 QTextCodec subclasses must reimplement this function.
733
734 Converts the first \a number of characters from the \a input array
735 from Unicode to the encoding of the subclass, and returns the result
736 in a QByteArray.
737
738 \a state can be \nullptr in which case the conversion is stateless and
739 default conversion rules should be used. If \a state is not \nullptr, the
740 codec should save the state after the conversion in \a state, and
741 adjust the \c remainingChars and \c invalidChars members of the struct.
742*/
743
744/*!
    Creates a QTextDecoder with the specified \a flags, used to decode
    chunks of \c{char *} data into chunks of Unicode data.
747
748 The caller is responsible for deleting the returned object.
749
750 \since 4.7
751*/
752QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const
753{
754 return new QTextDecoder(this, flags);
755}
756
757/*!
    Creates a QTextEncoder with the specified \a flags, used to encode
    chunks of Unicode data as \c{char *} data.
760
761 The caller is responsible for deleting the returned object.
762
763 \since 4.7
764*/
765QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const
766{
767 return new QTextEncoder(this, flags);
768}
769
770/*!
771 \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
772 ConverterState *state) const
773
774 Converts the first \a number of characters from the \a input array
775 from Unicode to the encoding of this codec, and returns the result
776 in a QByteArray.
777
    The \a state of the converter used is updated.
779*/
780
781/*!
782 Converts \a str from Unicode to the encoding of this codec, and
783 returns the result in a QByteArray.
784*/
785QByteArray QTextCodec::fromUnicode(const QString& str) const
786{
787 ConverterState state = DefaultConversion | Flag::Stateless;
788 return convertFromUnicode(in: str.constData(), length: str.size(), state: &state);
789}
790
791/*!
792 \overload
793 \since 5.10
794
795 Converts \a str from Unicode to the encoding of this codec, and
796 returns the result in a QByteArray.
797*/
798QByteArray QTextCodec::fromUnicode(QStringView str) const
799{
800 ConverterState state = DefaultConversion | Flag::Stateless;
801 return convertFromUnicode(in: str.data(), length: str.size(), state: &state);
802}
803
804/*!
805 \fn QString QTextCodec::toUnicode(const char *input, int size,
806 ConverterState *state) const
807
808 Converts the first \a size characters from the \a input from the
809 encoding of this codec to Unicode, and returns the result in a
810 QString.
811
    The \a state of the converter used is updated.
813*/
814
815/*!
816 Converts \a a from the encoding of this codec to Unicode, and
817 returns the result in a QString.
818*/
819QString QTextCodec::toUnicode(const QByteArray& a) const
820{
821 ConverterState state = DefaultConversion | Flag::Stateless;
822 return convertToUnicode(in: a.constData(), length: a.size(), state: &state);
823}
824
825/*!
826 Returns \c true if the Unicode character \a ch can be fully encoded
827 with this codec; otherwise returns \c false.
828*/
829bool QTextCodec::canEncode(QChar ch) const
830{
831 ConverterState state;
832 state.flags = ConvertInvalidToNull;
833 convertFromUnicode(in: &ch, length: 1, state: &state);
834 return (state.invalidChars == 0);
835}
836
837/*!
838 \overload
839
840 \a s contains the string being tested for encode-ability.
841*/
842bool QTextCodec::canEncode(const QString& s) const
843{
844 ConverterState state;
845 state.flags = ConvertInvalidToNull;
846 convertFromUnicode(in: s.constData(), length: s.size(), state: &state);
847 return (state.invalidChars == 0);
848}
849
850/*!
851 \overload
852 \since 5.10
853
854 Returns \c true if the Unicode string \a s can be fully encoded
855 with this codec; otherwise returns \c false.
856*/
857bool QTextCodec::canEncode(QStringView s) const
858{
859 ConverterState state;
860 state.flags = ConvertInvalidToNull;
861 convertFromUnicode(in: s.data(), length: s.size(), state: &state);
862 return !state.invalidChars;
863}
864/*!
865 \overload
866
867 \a chars contains the source characters.
868*/
869QString QTextCodec::toUnicode(const char *chars) const
870{
871 const auto len = int(qstrlen(str: chars));
872 return convertToUnicode(in: chars, length: len, state: nullptr);
873}
874
875
876/*!
877 \class QTextEncoder
878 \inmodule QtCore5Compat
879 \brief The QTextEncoder class provides a state-based encoder.
880 \reentrant
881 \ingroup i18n
882
883 A text encoder converts text from Unicode into an encoded text format
884 using a specific codec.
885
886 The encoder converts Unicode into another format, remembering any
887 state that is required between calls.
888
889 \sa QTextCodec::makeEncoder(), QTextDecoder
890*/
891
892/*!
893 \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
894
895 Constructs a text encoder for the given \a codec.
896*/
897
898/*!
899 Constructs a text encoder for the given \a codec and conversion \a flags.
900
901 \since 4.7
902*/
903QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
904 : c(codec), state()
905{
906 state.flags = flags;
907}
908
909/*!
910 Destroys the encoder.
911*/
912QTextEncoder::~QTextEncoder()
913{
914}
915
916/*!
917 \internal
918 \since 4.5
    Determines whether the encoder encountered a failure while encoding the input. If
    an error was encountered, the produced result is undefined, and gets converted
    according to the conversion flags.
922 */
923bool QTextEncoder::hasFailure() const
924{
925 return state.invalidChars != 0;
926}
927
928/*!
929 Converts the Unicode string \a str into an encoded QByteArray.
930*/
931QByteArray QTextEncoder::fromUnicode(const QString& str)
932{
933 return c->fromUnicode(in: str.constData(), length: str.size(), state: &state);
934}
935
936/*!
937 \overload
938 \since 5.10
939 Converts the Unicode string \a str into an encoded QByteArray.
940*/
941QByteArray QTextEncoder::fromUnicode(QStringView str)
942{
943 return c->fromUnicode(in: str.data(), length: str.size(), state: &state);
944}
945
946/*!
947 \overload
948
949 Converts \a len characters (not bytes) from \a uc, and returns the
950 result in a QByteArray.
951*/
952QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
953{
954 return c->fromUnicode(in: uc, length: len, state: &state);
955}
956
957/*!
958 \class QTextDecoder
959 \inmodule QtCore5Compat
960 \brief The QTextDecoder class provides a state-based decoder.
961 \reentrant
962 \ingroup i18n
963
964 A text decoder converts text from an encoded text format into Unicode
965 using a specific codec.
966
967 The decoder converts text in this format into Unicode, remembering any
968 state that is required between calls.
969
970 \sa QTextCodec::makeDecoder(), QTextEncoder
971*/
972
973/*!
974 \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
975
976 Constructs a text decoder for the given \a codec.
977*/
978
979/*!
980 Constructs a text decoder for the given \a codec and conversion \a flags.
981
982 \since 4.7
983*/
984
985QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
986 : c(codec), state()
987{
988 state.flags = flags;
989}
990
991/*!
992 Destroys the decoder.
993*/
994QTextDecoder::~QTextDecoder()
995{
996}
997
998/*!
999 \fn QString QTextDecoder::toUnicode(const char *chars, int len)
1000
1001 Converts the first \a len bytes in \a chars to Unicode, returning
1002 the result.
1003
1004 If not all characters are used (e.g. if only part of a multi-byte
1005 encoding is at the end of the characters), the decoder remembers
1006 enough state to continue with the next call to this function.
1007*/
1008QString QTextDecoder::toUnicode(const char *chars, int len)
1009{
1010 return c->toUnicode(in: chars, length: len, state: &state);
1011}
1012
1013/*! \overload
1014
1015 The converted string is returned in \a target.
1016 */
1017void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
1018{
1019 Q_ASSERT(target);
1020 switch (c->mibEnum()) {
1021 case 106: // utf8
1022 static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1023 break;
1024 case 4: // latin1
1025 target->resize(size: len);
1026 qt_from_latin1(dst: (char16_t*)target->data(), str: chars, size: len);
1027 break;
1028 default:
1029 *target = c->toUnicode(in: chars, length: len, state: &state);
1030 }
1031}
1032
1033
1034/*!
1035 \overload
1036
1037 Converts the bytes in the byte array specified by \a ba to Unicode
1038 and returns the result.
1039*/
1040QString QTextDecoder::toUnicode(const QByteArray &ba)
1041{
1042 return c->toUnicode(in: ba.constData(), length: ba.size(), state: &state);
1043}
1044
1045/*!
1046 \since 4.4
1047
1048 Tries to detect the encoding of the provided snippet of HTML in
1049 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1050 and the content-type meta header and returns a QTextCodec instance
1051 that is capable of decoding the html to unicode. If the codec
1052 cannot be detected from the content provided, \a defaultCodec is
1053 returned.
1054
1055 \sa codecForUtfText()
1056*/
1057QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
1058{
1059 // determine charset
1060 QTextCodec *c = QTextCodec::codecForUtfText(ba, defaultCodec: nullptr);
1061 if (!c) {
1062 static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "meta ");
1063 QByteArray header = ba.left(n: 1024).toLower();
1064 qsizetype pos = matcher.indexIn(haystack: header);
1065 if (pos != -1) {
1066 static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "charset=");
1067 pos = matcher.indexIn(haystack: header, from: pos);
1068 if (pos != -1) {
1069 pos += qstrlen(str: "charset=");
1070
1071 qsizetype pos2 = pos;
1072 // The attribute can be closed with either """, "'", ">" or "/",
1073 // none of which are valid charset characters.
1074 while (++pos2 < header.size()) {
1075 char ch = header.at(i: pos2);
1076 if (ch == '\"' || ch == '\'' || ch == '>') {
1077 QByteArray name = header.mid(index: pos, len: pos2 - pos);
1078 if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
1079 name = QByteArrayLiteral("UTF-8");
1080 c = QTextCodec::codecForName(name);
1081 return c ? c : defaultCodec;
1082 }
1083 }
1084 }
1085 }
1086 }
1087 if (!c)
1088 c = defaultCodec;
1089
1090 return c;
1091}
1092
1093/*!
1094 \overload
1095
1096 Tries to detect the encoding of the provided snippet of HTML in
1097 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1098 and the content-type meta header and returns a QTextCodec instance
1099 that is capable of decoding the html to unicode. If the codec cannot
1100 be detected, this overload returns a Latin-1 QTextCodec.
1101*/
1102QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1103{
1104 return codecForHtml(ba, defaultCodec: QTextCodec::codecForName(name: "ISO-8859-1"));
1105}
1106
1107/*!
1108 \since 4.6
1109
1110 Tries to detect the encoding of the provided snippet \a ba by
1111 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1112 that is capable of decoding the text to unicode. This function can
1113 detect one of the following codecs:
1114
1115 \list
1116 \li UTF-32 Little Endian
1117 \li UTF-32 Big Endian
1118 \li UTF-16 Little Endian
1119 \li UTF-16 Big Endian
1120 \li UTF-8
1121 \endlist
1122
1123 If the codec cannot be detected from the content provided, \a defaultCodec
1124 is returned.
1125
1126 \sa codecForHtml()
1127*/
1128QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
1129{
1130 const int arraySize = ba.size();
1131 const uchar *buf = reinterpret_cast<const uchar *>(ba.constData());
1132 const uint bom = 0xfeff;
1133
1134 if (arraySize > 3) {
1135 uint uc = qFromUnaligned<uint>(src: buf);
1136 if (uc == qToBigEndian(source: bom))
1137 return QTextCodec::codecForMib(mib: 1018); // utf-32 be
1138 else if (uc == qToLittleEndian(source: bom))
1139 return QTextCodec::codecForMib(mib: 1019); // utf-32 le
1140 }
1141
1142 if (arraySize < 2)
1143 return defaultCodec;
1144
1145 ushort uc = qFromUnaligned<ushort>(src: buf);
1146 if (uc == qToBigEndian(source: ushort(bom)))
1147 return QTextCodec::codecForMib(mib: 1013); // utf16 be
1148 else if (uc == qToLittleEndian(source: ushort(bom)))
1149 return QTextCodec::codecForMib(mib: 1014); // utf16 le
1150
1151 if (arraySize < 3)
1152 return defaultCodec;
1153
1154 static const char utf8bom[] = "\xef\xbb\xbf";
1155 if (memcmp(s1: buf, s2: utf8bom, n: sizeof(utf8bom) - 1) == 0)
1156 return QTextCodec::codecForMib(mib: 106); // utf-8
1157
1158 return defaultCodec;
1159}
1160
1161/*!
1162 \overload
1163
1164 Tries to detect the encoding of the provided snippet \a ba by
1165 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1166 that is capable of decoding the text to unicode. This function can
1167 detect one of the following codecs:
1168
1169 \list
1170 \li UTF-32 Little Endian
1171 \li UTF-32 Big Endian
1172 \li UTF-16 Little Endian
1173 \li UTF-16 Big Endian
1174 \li UTF-8
1175 \endlist
1176
1177 If the codec cannot be detected from the content provided, this overload
1178 returns a Latin-1 QTextCodec.
1179
1180 \sa codecForHtml()
1181*/
1182QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
1183{
1184 return codecForUtfText(ba, defaultCodec: QTextCodec::codecForMib(/*Latin 1*/ mib: 4));
1185}
1186
1187/*!
1188 \fn QTextCodec *QTextCodec::codecForTr ()
1189 \deprecated
1190
1191 Returns the codec used by QObject::tr() on its argument. If this
1192 function returns \nullptr (the default), tr() assumes Latin-1.
1193*/
1194
1195/*!
1196 \internal
1197 \since 4.3
1198 Determines whether the decoder encountered a failure while decoding the
1199 input. If an error was encountered, the produced result is undefined, and
1200 gets converted as according to the conversion flags.
1201 */
1202bool QTextDecoder::hasFailure() const
1203{
1204 return state.invalidChars != 0;
1205}
1206
1207/*!
1208 \internal
1209 \since 5.12
1210
1211 Determines whether the decoder needs more bytes to continue decoding. That
1212 is, this signifies that the input string ended in the middle of a
1213 multi-byte sequence. Note that it's possible some codecs do not report this.
1214 */
1215bool QTextDecoder::needsMoreData() const
1216{
1217 return state.remainingChars;
1218}
1219
1220/*!
1221 \fn QTextCodec * Qt::codecForHtml(const QByteArray &ba)
1222 \internal
1223
1224 This function is defined in the \c <QTextCodec> header file.
1225*/
1226QTextCodec *Qt::codecForHtml(const QByteArray &ba)
1227{
1228 return QTextCodec::codecForHtml(ba);
1229}
1230
1231QT_END_NAMESPACE
1232

source code of qt5compat/src/core5/codecs/qtextcodec.cpp