| 1 | // Copyright (C) 2018 The Qt Company Ltd. | 
| 2 | // Copyright (C) 2018 Intel Corporation. | 
| 3 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only | 
| 4 |  | 
| 5 | #include "qplatformdefs.h" | 
| 6 |  | 
| 7 | #include "qtextcodec.h" | 
| 8 | #include "qtextcodec_p.h" | 
| 9 |  | 
| 10 | #include "qbytearraymatcher.h" | 
| 11 | #include "qendian.h" | 
| 12 | #include "qfile.h" | 
| 13 | #include "qlist.h" | 
| 14 | #include <private/qlocking_p.h> | 
| 15 | #include "qstringlist.h" | 
| 16 | #include "qvarlengtharray.h" | 
| 17 |  | 
| 18 | #include <private/qcoreapplication_p.h> | 
| 19 |  | 
| 20 | #include "qutfcodec_p.h" | 
| 21 | #include "qlatincodec_p.h" | 
| 22 |  | 
| 23 | #if QT_CONFIG(codecs) | 
| 24 | #  include "qtsciicodec_p.h" | 
| 25 | #  include "qisciicodec_p.h" | 
| 26 | #endif | 
| 27 | #if QT_CONFIG(icu) | 
| 28 | #include "qicucodec_p.h" | 
| 29 | #else | 
| 30 | #if QT_CONFIG(iconv) | 
| 31 | #  include "qiconvcodec_p.h" | 
| 32 | #endif | 
| 33 | #ifdef Q_OS_WIN | 
| 34 | #  include "qwindowscodec_p.h" | 
| 35 | #endif | 
| 36 | #  include "qsimplecodec_p.h" | 
| 37 | #if QT_CONFIG(big_codecs) && QT_CONFIG(textcodec) | 
| 38 | #  ifndef Q_OS_INTEGRITY | 
| 39 | #    include "qgb18030codec_p.h" | 
| 40 | #    include "qeucjpcodec_p.h" | 
| 41 | #    include "qjiscodec_p.h" | 
| 42 | #    include "qsjiscodec_p.h" | 
| 43 | #    include "qeuckrcodec_p.h" | 
| 44 | #    include "qbig5codec_p.h" | 
| 45 | #  endif // !Q_OS_INTEGRITY | 
| 46 | #endif // big_codecs | 
| 47 |  | 
| 48 | #endif // icu | 
| 49 |  | 
| 50 | #include <mutex> | 
| 51 |  | 
| 52 | #include <stdlib.h> | 
| 53 | #include <ctype.h> | 
| 54 | #include <locale.h> | 
| 55 | #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID) | 
| 56 | # include <langinfo.h> | 
| 57 | #endif | 
| 58 |  | 
| 59 | QT_BEGIN_NAMESPACE | 
| 60 |  | 
| 61 | // in qstring.cpp: | 
| 62 | void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept; | 
| 63 |  | 
| 64 | typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt; | 
| 65 | typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt; | 
| 66 |  | 
| 67 | Q_GLOBAL_STATIC(QRecursiveMutex, textCodecsMutex); | 
| 68 |  | 
| 69 | Q_GLOBAL_STATIC(QTextCodecData, textCodecData) | 
| 70 |  | 
| 71 | QTextCodecData::QTextCodecData() | 
| 72 |     : codecForLocale(nullptr) | 
| 73 | { | 
| 74 | } | 
| 75 |  | 
| 76 | QTextCodecData::~QTextCodecData() | 
| 77 | { | 
| 78 |     codecForLocale.storeRelease(newValue: nullptr); | 
| 79 |     QList<QTextCodec *> tmp = allCodecs; | 
| 80 |     allCodecs.clear(); | 
| 81 |     codecCache.clear(); | 
| 82 |     for (QList<QTextCodec *>::const_iterator it = tmp.constBegin(); it != tmp.constEnd(); ++it) | 
| 83 |         delete *it; | 
| 84 | } | 
| 85 |  | 
| 86 | QTextCodecData *QTextCodecData::instance() | 
| 87 | { | 
| 88 |     return textCodecData(); | 
| 89 | } | 
| 90 |  | 
| 91 | class TextCodecsMutexLocker | 
| 92 | { | 
| 93 |     using Lock = decltype(qt_unique_lock(mutex&: std::declval<QRecursiveMutex&>())); | 
| 94 |     // ### FIXME: this is used when textCodecsMutex already == nullptr | 
| 95 |     const Lock lock = qt_unique_lock(mutex: textCodecsMutex()); | 
| 96 | public: | 
| 97 |     TextCodecsMutexLocker() {} // required d/t an ICC 19 bug | 
| 98 | }; | 
| 99 |  | 
| 100 | #if !QT_CONFIG(icu) | 
// ASCII-only lowercase: maps 'A'..'Z' onto 'a'..'z' and returns every other
// character unchanged (no locale involved).
static char qtolower(char c)
{
    const bool isUpper = c >= 'A' && c <= 'Z';
    return isUpper ? static_cast<char>(c + 0x20) : c;
}
// ASCII-only test for a decimal digit or a letter (either case).
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;
    // Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z', so one range check
    // covers both cases.
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
| 105 |  | 
| 106 | bool qTextCodecNameMatch(const char *n, const char *h) | 
| 107 | { | 
| 108 |     if (qstricmp(n, h) == 0) | 
| 109 |         return true; | 
| 110 |  | 
| 111 |     // if the letters and numbers are the same, we have a match | 
| 112 |     while (*n != '\0') { | 
| 113 |         if (qisalnum(*n)) { | 
| 114 |             for (;;) { | 
| 115 |                 if (*h == '\0') | 
| 116 |                     return false; | 
| 117 |                 if (qisalnum(*h)) | 
| 118 |                     break; | 
| 119 |                 ++h; | 
| 120 |             } | 
| 121 |             if (qtolower(*n) != qtolower(*h)) | 
| 122 |                 return false; | 
| 123 |             ++h; | 
| 124 |         } | 
| 125 |         ++n; | 
| 126 |     } | 
| 127 |     while (*h && !qisalnum(*h)) | 
| 128 |            ++h; | 
| 129 |     return (*h == '\0'); | 
| 130 | } | 
| 131 |  | 
| 132 |  | 
| 133 | #if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8) | 
| 134 | static QTextCodec *checkForCodec(const QByteArray &name) { | 
| 135 |     QTextCodec *c = QTextCodec::codecForName(name); | 
| 136 |     if (!c) { | 
| 137 |         const int index = name.indexOf('@'); | 
| 138 |         if (index != -1) { | 
| 139 |             c = QTextCodec::codecForName(name.left(index)); | 
| 140 |         } | 
| 141 |     } | 
| 142 |     return c; | 
| 143 | } | 
| 144 | #endif | 
| 145 |  | 
| 146 | static void setup(); | 
| 147 |  | 
| 148 | // \threadsafe | 
| 149 | // this returns the codec the method sets up as locale codec to | 
| 150 | // avoid a race condition in codecForLocale() when | 
| 151 | // setCodecForLocale(nullptr) is called at the same time. | 
| 152 | static QTextCodec *setupLocaleMapper() | 
| 153 | { | 
| 154 |     QTextCodecData *globalData = QTextCodecData::instance(); | 
| 155 |  | 
| 156 |     QTextCodec *locale = nullptr; | 
| 157 |  | 
| 158 |     { | 
| 159 |         const TextCodecsMutexLocker locker; | 
| 160 |         if (globalData->allCodecs.isEmpty()) | 
| 161 |             setup(); | 
| 162 |     } | 
| 163 |  | 
| 164 |     QCoreApplicationPrivate::initLocale(); | 
| 165 |  | 
| 166 | #if defined(QT_LOCALE_IS_UTF8) | 
| 167 |     locale = QTextCodec::codecForName("UTF-8" ); | 
| 168 | #elif defined(Q_OS_WIN) | 
| 169 |     locale = QTextCodec::codecForName("System" ); | 
| 170 | #else | 
| 171 |  | 
| 172 |     // First try getting the codecs name from nl_langinfo and see | 
| 173 |     // if we have a builtin codec for it. | 
| 174 |     // Only fall back to using iconv if we can't find a builtin codec | 
| 175 |     // This is because the builtin utf8 codec is around 5 times faster | 
| 176 |     // then the using QIconvCodec | 
| 177 |  | 
| 178 | #if defined (_XOPEN_UNIX) | 
| 179 |     char *charset = nl_langinfo(CODESET); | 
| 180 |     if (charset) | 
| 181 |         locale = QTextCodec::codecForName(charset); | 
| 182 | #endif | 
| 183 | #if QT_CONFIG(iconv) | 
| 184 |     if (!locale) { | 
| 185 |         // no builtin codec for the locale found, let's try using iconv | 
| 186 |         (void) new QIconvCodec(); | 
| 187 |         locale = QTextCodec::codecForName("System" ); | 
| 188 |     } | 
| 189 | #endif | 
| 190 |  | 
| 191 |     if (!locale) { | 
| 192 |         // Very poorly defined and followed standards causes lots of | 
| 193 |         // code to try to get all the cases... This logic is | 
| 194 |         // duplicated in QIconvCodec, so if you change it here, change | 
| 195 |         // it there too. | 
| 196 |  | 
| 197 |         // Try to determine locale codeset from locale name assigned to | 
| 198 |         // LC_CTYPE category. | 
| 199 |  | 
| 200 |         // First part is getting that locale name.  First try setlocale() which | 
| 201 |         // definitely knows it, but since we cannot fully trust it, get ready | 
| 202 |         // to fall back to environment variables. | 
| 203 |         const QByteArray ctype = setlocale(LC_CTYPE, nullptr); | 
| 204 |  | 
| 205 |         // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG | 
| 206 |         // environment variables. | 
| 207 |         QByteArray lang = qgetenv("LC_ALL" ); | 
| 208 |         if (lang.isEmpty() || lang == "C" ) { | 
| 209 |             lang = qgetenv("LC_CTYPE" ); | 
| 210 |         } | 
| 211 |         if (lang.isEmpty() || lang == "C" ) { | 
| 212 |             lang = qgetenv("LANG" ); | 
| 213 |         } | 
| 214 |  | 
| 215 |         // Now try these in order: | 
| 216 |         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) | 
| 217 |         // 2. CODESET from lang if it contains a .CODESET part | 
| 218 |         // 3. ctype (maybe the locale is named "ISO-8859-1" or something) | 
| 219 |         // 4. locale (ditto) | 
| 220 |         // 5. check for "@euro" | 
| 221 |         // 6. guess locale from ctype unless ctype is "C" | 
| 222 |         // 7. guess locale from lang | 
| 223 |  | 
| 224 |         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) | 
| 225 |         int indexOfDot = ctype.indexOf('.'); | 
| 226 |         if (indexOfDot != -1) | 
| 227 |             locale = checkForCodec( ctype.mid(indexOfDot + 1) ); | 
| 228 |  | 
| 229 |         // 2. CODESET from lang if it contains a .CODESET part | 
| 230 |         if (!locale) { | 
| 231 |             indexOfDot = lang.indexOf('.'); | 
| 232 |             if (indexOfDot != -1) | 
| 233 |                 locale = checkForCodec( lang.mid(indexOfDot + 1) ); | 
| 234 |         } | 
| 235 |  | 
| 236 |         // 3. ctype (maybe the locale is named "ISO-8859-1" or something) | 
| 237 |         if (!locale && !ctype.isEmpty() && ctype != "C" ) | 
| 238 |             locale = checkForCodec(ctype); | 
| 239 |  | 
| 240 |         // 4. locale (ditto) | 
| 241 |         if (!locale && !lang.isEmpty()) | 
| 242 |             locale = checkForCodec(lang); | 
| 243 |  | 
| 244 |         // 5. "@euro" | 
| 245 |         if ((!locale && ctype.contains("@euro" )) || lang.contains("@euro" )) | 
| 246 |             locale = checkForCodec("ISO 8859-15" ); | 
| 247 |     } | 
| 248 |  | 
| 249 | #endif | 
| 250 |     // If everything failed, we default to 8859-1 | 
| 251 |     if (!locale) | 
| 252 |         locale = QTextCodec::codecForName("ISO 8859-1" ); | 
| 253 |     globalData->codecForLocale.storeRelease(locale); | 
| 254 |     return locale; | 
| 255 | } | 
| 256 |  | 
| 257 |  | 
| 258 | // textCodecsMutex need to be locked to enter this function | 
| 259 | static void setup() | 
| 260 | { | 
| 261 |     static bool initialized = false; | 
| 262 |     if (initialized) | 
| 263 |         return; | 
| 264 |     initialized = true; | 
| 265 |  | 
| 266 | #if QT_CONFIG(codecs) | 
| 267 |     (void)new QTsciiCodec; | 
| 268 |     for (int i = 0; i < 9; ++i) | 
| 269 |         (void)new QIsciiCodec(i); | 
| 270 |     for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i) | 
| 271 |         (void)new QSimpleTextCodec(i); | 
| 272 |  | 
| 273 | #  if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY) | 
| 274 |     (void)new QGb18030Codec; | 
| 275 |     (void)new QGbkCodec; | 
| 276 |     (void)new QGb2312Codec; | 
| 277 |     (void)new QEucJpCodec; | 
| 278 |     (void)new QJisCodec; | 
| 279 |     (void)new QSjisCodec; | 
| 280 |     (void)new QEucKrCodec; | 
| 281 |     (void)new QCP949Codec; | 
| 282 |     (void)new QBig5Codec; | 
| 283 |     (void)new QBig5hkscsCodec; | 
| 284 | #  endif // big_codecs && !Q_OS_INTEGRITY | 
| 285 | #if QT_CONFIG(iconv) | 
| 286 |     (void) new QIconvCodec; | 
| 287 | #endif | 
| 288 | #if defined(Q_OS_WIN32) | 
| 289 |     (void) new QWindowsLocalCodec; | 
| 290 | #endif // Q_OS_WIN32 | 
| 291 | #endif // codecs | 
| 292 |  | 
| 293 |     (void)new QUtf16Codec; | 
| 294 |     (void)new QUtf16BECodec; | 
| 295 |     (void)new QUtf16LECodec; | 
| 296 |     (void)new QUtf32Codec; | 
| 297 |     (void)new QUtf32BECodec; | 
| 298 |     (void)new QUtf32LECodec; | 
| 299 |     (void)new QLatin15Codec; | 
| 300 |     (void)new QLatin1Codec; | 
| 301 |     (void)new QUtf8Codec; | 
| 302 | } | 
| 303 | #else | 
// ICU build: setup() is a no-op, kept so callers need no #if of their own.
static void setup()
{
}
| 305 | #endif // icu | 
| 306 |  | 
| 307 | /*! | 
| 308 |     \class QTextCodec | 
| 309 |     \inmodule QtCore5Compat | 
| 310 |     \brief The QTextCodec class provides conversions between text encodings. | 
| 311 |     \reentrant | 
| 312 |     \ingroup i18n | 
| 313 |  | 
| 314 |     Qt uses Unicode to store, draw and manipulate strings. In many | 
| 315 |     situations you may wish to deal with data that uses a different | 
| 316 |     encoding. For example, most Japanese documents are still stored | 
| 317 |     in Shift-JIS or ISO 2022-JP, while Russian users often have their | 
| 318 |     documents in KOI8-R or Windows-1251. | 
| 319 |  | 
| 320 |     Qt provides a set of QTextCodec classes to help with converting | 
| 321 |     non-Unicode formats to and from Unicode. You can also create your | 
| 322 |     own codec classes. | 
| 323 |  | 
| 324 |     The supported encodings are: | 
| 325 |  | 
| 326 |     \list | 
| 327 |     \li \l{Big5 Text Codec}{Big5} | 
| 328 |     \li \l{Big5-HKSCS Text Codec}{Big5-HKSCS} | 
| 329 |     \li CP949 | 
| 330 |     \li \l{EUC-JP Text Codec}{EUC-JP} | 
| 331 |     \li \l{EUC-KR Text Codec}{EUC-KR} | 
| 332 |     \li \l{GBK Text Codec}{GB18030} | 
| 333 |     \li HP-ROMAN8 | 
| 334 |     \li IBM 850 | 
| 335 |     \li IBM 866 | 
| 336 |     \li IBM 874 | 
| 337 |     \li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP} | 
| 338 |     \li ISO 8859-1 to 10 | 
| 339 |     \li ISO 8859-13 to 16 | 
| 340 |     \li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml | 
| 341 |     \li KOI8-R | 
| 342 |     \li KOI8-U | 
| 343 |     \li Macintosh | 
| 344 |     \li \l{Shift-JIS Text Codec}{Shift-JIS} | 
| 345 |     \li TIS-620 | 
| 346 |     \li \l{TSCII Text Codec}{TSCII} | 
| 347 |     \li UTF-8 | 
| 348 |     \li UTF-16 | 
| 349 |     \li UTF-16BE | 
| 350 |     \li UTF-16LE | 
| 351 |     \li UTF-32 | 
| 352 |     \li UTF-32BE | 
| 353 |     \li UTF-32LE | 
| 354 |     \li Windows-1250 to 1258 | 
| 355 |     \endlist | 
| 356 |  | 
| 357 |     If Qt is compiled with ICU support enabled, most codecs supported by | 
| 358 |     ICU will also be available to the application. | 
| 359 |  | 
| 360 |     \l {QTextCodec}s can be used as follows to convert some locally encoded | 
| 361 |     string to Unicode. Suppose you have some string encoded in Russian | 
| 362 |     KOI8-R encoding, and want to convert it to Unicode. The simple way | 
| 363 |     to do it is like this: | 
| 364 |  | 
| 365 |     \snippet code/src_corelib_codecs_qtextcodec.cpp 0 | 
| 366 |  | 
| 367 |     After this, \c string holds the text converted to Unicode. | 
| 368 |     Converting a string from Unicode to the local encoding is just as | 
| 369 |     easy: | 
| 370 |  | 
| 371 |     \snippet code/src_corelib_codecs_qtextcodec.cpp 1 | 
| 372 |  | 
| 373 |     Some care must be taken when trying to convert the data in chunks, | 
| 374 |     for example, when receiving it over a network. In such cases it is | 
| 375 |     possible that a multi-byte character will be split over two | 
| 376 |     chunks. At best this might result in the loss of a character and | 
| 377 |     at worst cause the entire conversion to fail. | 
| 378 |  | 
| 379 |     The approach to use in these situations is to create a QTextDecoder | 
| 380 |     object for the codec and use this QTextDecoder for the whole | 
| 381 |     decoding process, as shown below: | 
| 382 |  | 
| 383 |     \snippet code/src_corelib_codecs_qtextcodec.cpp 2 | 
| 384 |  | 
| 385 |     The QTextDecoder object maintains state between chunks and therefore | 
| 386 |     works correctly even if a multi-byte character is split between | 
| 387 |     chunks. | 
| 388 |  | 
| 389 |     \section1 Creating Your Own Codec Class | 
| 390 |  | 
| 391 |     Support for new text encodings can be added to Qt by creating | 
| 392 |     QTextCodec subclasses. | 
| 393 |  | 
| 394 |     The pure virtual functions describe the encoder to the system and | 
| 395 |     the coder is used as required in the different text file formats | 
| 396 |     supported by QTextStream, and under X11, for the locale-specific | 
| 397 |     character input and output. | 
| 398 |  | 
| 399 |     To add support for another encoding to Qt, make a subclass of | 
| 400 |     QTextCodec and implement the functions listed in the table below. | 
| 401 |  | 
| 402 |     \table | 
| 403 |     \header \li Function \li Description | 
| 404 |  | 
| 405 |     \row \li name() | 
| 406 |          \li Returns the official name for the encoding. If the | 
| 407 |             encoding is listed in the | 
| 408 |             \l{IANA character-sets encoding file}, the name | 
| 409 |             should be the preferred MIME name for the encoding. | 
| 410 |  | 
| 411 |     \row \li aliases() | 
| 412 |          \li Returns a list of alternative names for the encoding. | 
| 413 |             QTextCodec provides a default implementation that returns | 
| 414 |             an empty list. For example, "ISO-8859-1" has "latin1", | 
| 415 |             "CP819", "IBM819", and "iso-ir-100" as aliases. | 
| 416 |  | 
| 417 |     \row \li \l{QTextCodec::mibEnum()}{mibEnum()} | 
         \li Returns the MIB enum for the encoding if it is listed in
| 419 |             the \l{IANA character-sets encoding file}. | 
| 420 |  | 
| 421 |     \row \li convertToUnicode() | 
| 422 |          \li Converts an 8-bit character string to Unicode. | 
| 423 |  | 
| 424 |     \row \li convertFromUnicode() | 
| 425 |          \li Converts a Unicode string to an 8-bit character string. | 
| 426 |     \endtable | 
| 427 |  | 
| 428 |     \sa QTextStream, QTextDecoder, QTextEncoder | 
| 429 | */ | 
| 430 |  | 
| 431 | /*! | 
| 432 |     Constructs a QTextCodec, and gives it the highest precedence. The | 
| 433 |     QTextCodec should always be constructed on the heap (i.e. with \c | 
| 434 |     new). Qt takes ownership and will delete it when the application | 
| 435 |     terminates. | 
| 436 | */ | 
| 437 | QTextCodec::QTextCodec() | 
| 438 | { | 
| 439 |     const TextCodecsMutexLocker locker; | 
| 440 |  | 
| 441 |     QTextCodecData *globalInstance = QTextCodecData::instance(); | 
| 442 |     if (globalInstance->allCodecs.isEmpty()) | 
| 443 |         setup(); | 
| 444 |  | 
| 445 |     globalInstance->allCodecs.prepend(t: this); | 
| 446 | } | 
| 447 |  | 
| 448 |  | 
| 449 | /*! | 
| 450 |     \nonreentrant | 
| 451 |  | 
| 452 |     Destroys the QTextCodec. Note that you should not delete codecs | 
| 453 |     yourself: once created they become Qt's responsibility. | 
| 454 | */ | 
| 455 | QTextCodec::~QTextCodec() | 
| 456 | { | 
| 457 |     QTextCodecData *globalData = QTextCodecData::instance(); | 
| 458 |     if (!globalData) | 
| 459 |         return; | 
| 460 |  | 
| 461 |     globalData->codecForLocale.testAndSetRelaxed(expectedValue: this, newValue: nullptr); | 
| 462 |  | 
| 463 |     const TextCodecsMutexLocker locker; | 
| 464 |  | 
| 465 |     globalData->allCodecs.removeOne(t: this); | 
| 466 |  | 
| 467 |     auto it = globalData->codecCache.begin(); | 
| 468 |  | 
| 469 |     while (it != globalData->codecCache.end()) { | 
| 470 |         if (it.value() == this) | 
| 471 |             it = globalData->codecCache.erase(it); | 
| 472 |         else | 
| 473 |             ++it; | 
| 474 |     } | 
| 475 | } | 
| 476 |  | 
| 477 | /*! | 
| 478 |     \fn QTextCodec *QTextCodec::codecForName(const char *name) | 
| 479 |  | 
| 480 |     Searches all installed QTextCodec objects and returns the one | 
| 481 |     which best matches \a name; the match is case-insensitive. Returns | 
| 482 |     \nullptr if no codec matching the name \a name could be found. | 
| 483 | */ | 
| 484 |  | 
| 485 | /*! | 
| 486 |     \threadsafe | 
| 487 |     Searches all installed QTextCodec objects and returns the one | 
| 488 |     which best matches \a name; the match is case-insensitive. Returns | 
| 489 |     \nullptr if no codec matching the name \a name could be found. | 
| 490 | */ | 
| 491 | QTextCodec *QTextCodec::codecForName(const QByteArray &name) | 
| 492 | { | 
| 493 |     if (name.isEmpty()) | 
| 494 |         return nullptr; | 
| 495 |  | 
| 496 |     const TextCodecsMutexLocker locker; | 
| 497 |  | 
| 498 |     QTextCodecData *globalData = QTextCodecData::instance(); | 
| 499 |     if (!globalData) | 
| 500 |         return nullptr; | 
| 501 |     setup(); | 
| 502 |  | 
| 503 | #if !QT_CONFIG(icu) | 
| 504 |     QTextCodecCache *cache = &globalData->codecCache; | 
| 505 |     QTextCodec *codec; | 
| 506 |     codec = cache->value(name); | 
| 507 |     if (codec) | 
| 508 |         return codec; | 
| 509 |  | 
| 510 |     for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { | 
| 511 |         QTextCodec *cursor = *it; | 
| 512 |         if (qTextCodecNameMatch(cursor->name(), name)) { | 
| 513 |             if (cache) | 
| 514 |                 cache->insert(name, cursor); | 
| 515 |             return cursor; | 
| 516 |         } | 
| 517 |         QList<QByteArray> aliases = cursor->aliases(); | 
| 518 |         for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) { | 
| 519 |             if (qTextCodecNameMatch(*ait, name)) { | 
| 520 |                 cache->insert(name, cursor); | 
| 521 |                 return cursor; | 
| 522 |             } | 
| 523 |         } | 
| 524 |     } | 
| 525 |  | 
| 526 |     return nullptr; | 
| 527 | #else | 
| 528 |     return QIcuCodec::codecForNameUnlocked(name); | 
| 529 | #endif | 
| 530 | } | 
| 531 |  | 
| 532 |  | 
| 533 | /*! | 
| 534 |     \threadsafe | 
| 535 |     Returns the QTextCodec which matches the | 
| 536 |     \l{QTextCodec::mibEnum()}{MIBenum} \a mib. | 
| 537 | */ | 
| 538 | QTextCodec* QTextCodec::codecForMib(int mib) | 
| 539 | { | 
| 540 |     const TextCodecsMutexLocker locker; | 
| 541 |  | 
| 542 |     QTextCodecData *globalData = QTextCodecData::instance(); | 
| 543 |     if (!globalData) | 
| 544 |         return nullptr; | 
| 545 |     if (globalData->allCodecs.isEmpty()) | 
| 546 |         setup(); | 
| 547 |  | 
| 548 |     QByteArray key = "MIB: "  + QByteArray::number(mib); | 
| 549 |  | 
| 550 |     QTextCodecCache *cache = &globalData->codecCache; | 
| 551 |     QTextCodec *codec; | 
| 552 |     if (cache) { | 
| 553 |         codec = cache->value(key); | 
| 554 |         if (codec) | 
| 555 |             return codec; | 
| 556 |     } | 
| 557 |  | 
| 558 |     for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { | 
| 559 |         QTextCodec *cursor = *it; | 
| 560 |         if (cursor->mibEnum() == mib) { | 
| 561 |             if (cache) | 
| 562 |                 cache->insert(key, value: cursor); | 
| 563 |             return cursor; | 
| 564 |         } | 
| 565 |     } | 
| 566 |  | 
| 567 | #if QT_CONFIG(icu) | 
| 568 |     return QIcuCodec::codecForMibUnlocked(mib); | 
| 569 | #else | 
| 570 |     return nullptr; | 
| 571 | #endif | 
| 572 | } | 
| 573 |  | 
| 574 | /*! | 
| 575 |     \threadsafe | 
| 576 |     Returns the list of all available codecs, by name. Call | 
| 577 |     QTextCodec::codecForName() to obtain the QTextCodec for the name. | 
| 578 |  | 
| 579 |     The list may contain many mentions of the same codec | 
| 580 |     if the codec has aliases. | 
| 581 |  | 
| 582 |     \sa availableMibs(), name(), aliases() | 
| 583 | */ | 
| 584 | QList<QByteArray> QTextCodec::availableCodecs() | 
| 585 | { | 
| 586 |     const TextCodecsMutexLocker locker; | 
| 587 |  | 
| 588 |     QTextCodecData *globalData = QTextCodecData::instance(); | 
| 589 |     if (globalData->allCodecs.isEmpty()) | 
| 590 |         setup(); | 
| 591 |  | 
| 592 |     QList<QByteArray> codecs; | 
| 593 |  | 
| 594 |     for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { | 
| 595 |         codecs += (*it)->name(); | 
| 596 |         codecs += (*it)->aliases(); | 
| 597 |     } | 
| 598 |  | 
| 599 | #if QT_CONFIG(icu) | 
| 600 |     codecs += QIcuCodec::availableCodecs(); | 
| 601 | #endif | 
| 602 |  | 
| 603 |     return codecs; | 
| 604 | } | 
| 605 |  | 
| 606 | /*! | 
| 607 |     \threadsafe | 
| 608 |     Returns the list of MIBs for all available codecs. Call | 
| 609 |     QTextCodec::codecForMib() to obtain the QTextCodec for the MIB. | 
| 610 |  | 
| 611 |     \sa availableCodecs(), mibEnum() | 
| 612 | */ | 
| 613 | QList<int> QTextCodec::availableMibs() | 
| 614 | { | 
| 615 | #if QT_CONFIG(icu) | 
| 616 |     return QIcuCodec::availableMibs(); | 
| 617 | #else | 
| 618 |     const TextCodecsMutexLocker locker; | 
| 619 |  | 
| 620 |     QTextCodecData *globalData = QTextCodecData::instance(); | 
| 621 |     if (globalData->allCodecs.isEmpty()) | 
| 622 |         setup(); | 
| 623 |  | 
| 624 |     QList<int> codecs; | 
| 625 |  | 
| 626 |     for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) | 
| 627 |         codecs += (*it)->mibEnum(); | 
| 628 |  | 
| 629 |     return codecs; | 
| 630 | #endif | 
| 631 | } | 
| 632 |  | 
| 633 | /*! | 
| 634 |     \nonreentrant | 
| 635 |  | 
| 636 |     Set the codec to \a c; this will be returned by | 
| 637 |     codecForLocale(). If \a c is \nullptr, the codec is reset to | 
| 638 |     the default. | 
| 639 |  | 
| 640 |     This might be needed for some applications that want to use their | 
| 641 |     own mechanism for setting the locale. | 
| 642 |  | 
| 643 |     \sa codecForLocale() | 
| 644 | */ | 
| 645 | void QTextCodec::setCodecForLocale(QTextCodec *c) | 
| 646 | { | 
| 647 |     QTextCodecData::instance()->codecForLocale.storeRelease(newValue: c); | 
| 648 | } | 
| 649 |  | 
| 650 | /*! | 
| 651 |     \threadsafe | 
| 652 |     Returns a pointer to the codec most suitable for this locale. | 
| 653 |  | 
| 654 |     The codec will be retrieved from ICU where that backend is in use, otherwise | 
| 655 |     it may be obtained from an OS-specific API.  In the latter case, the codec's | 
| 656 |     name may be "System". | 
| 657 | */ | 
| 658 |  | 
| 659 | QTextCodec* QTextCodec::codecForLocale() | 
| 660 | { | 
| 661 |     QTextCodecData *globalData = QTextCodecData::instance(); | 
| 662 |     if (!globalData) | 
| 663 |         return nullptr; | 
| 664 |  | 
| 665 |     QTextCodec *codec = globalData->codecForLocale.loadAcquire(); | 
| 666 |     if (!codec) { | 
| 667 | #if QT_CONFIG(icu) | 
| 668 |         const TextCodecsMutexLocker locker; | 
| 669 |         codec = QIcuCodec::defaultCodecUnlocked(); | 
| 670 | #else | 
| 671 |         // setupLocaleMapper locks as necessary | 
| 672 |         codec = setupLocaleMapper(); | 
| 673 | #endif | 
| 674 |     } | 
| 675 |  | 
| 676 |     return codec; | 
| 677 | } | 
| 678 |  | 
| 679 |  | 
| 680 | /*! | 
| 681 |     \fn QByteArray QTextCodec::name() const | 
| 682 |  | 
| 683 |     QTextCodec subclasses must reimplement this function. It returns | 
| 684 |     the name of the encoding supported by the subclass. | 
| 685 |  | 
| 686 |     If the codec is registered as a character set in the | 
| 687 |     \l{IANA character-sets encoding file} this method should | 
| 688 |     return the preferred mime name for the codec if defined, | 
| 689 |     otherwise its name. | 
| 690 | */ | 
| 691 |  | 
| 692 | /*! | 
| 693 |     \fn int QTextCodec::mibEnum() const | 
| 694 |  | 
| 695 |     Subclasses of QTextCodec must reimplement this function. It | 
| 696 |     returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file} | 
| 697 |     for more information). It is important that each QTextCodec | 
| 698 |     subclass returns the correct unique value for this function. | 
| 699 | */ | 
| 700 |  | 
| 701 | /*! | 
| 702 |   Subclasses can return a number of aliases for the codec in question. | 
| 703 |  | 
| 704 |   Standard aliases for codecs can be found in the | 
| 705 |   \l{IANA character-sets encoding file}. | 
| 706 | */ | 
| 707 | QList<QByteArray> QTextCodec::aliases() const | 
| 708 | { | 
| 709 |     return QList<QByteArray>(); | 
| 710 | } | 
| 711 |  | 
| 712 | /*! | 
| 713 |     \fn QString QTextCodec::convertToUnicode(const char *chars, int len, | 
| 714 |                                              ConverterState *state) const | 
| 715 |  | 
| 716 |     QTextCodec subclasses must reimplement this function. | 
| 717 |  | 
| 718 |     Converts the first \a len characters of \a chars from the | 
| 719 |     encoding of the subclass to Unicode, and returns the result in a | 
| 720 |     QString. | 
| 721 |  | 
| 722 |     \a state can be \nullptr, in which case the conversion is stateless and | 
| 723 |     default conversion rules should be used. If \a state is not \nullptr, the | 
| 724 |     codec should save the state after the conversion in \a state, and | 
| 725 |     adjust the \c remainingChars and \c invalidChars members of the struct. | 
| 726 | */ | 
| 727 |  | 
| 728 | /*! | 
| 729 |     \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number, | 
| 730 |                                                   ConverterState *state) const | 
| 731 |  | 
| 732 |     QTextCodec subclasses must reimplement this function. | 
| 733 |  | 
| 734 |     Converts the first \a number of characters from the \a input array | 
| 735 |     from Unicode to the encoding of the subclass, and returns the result | 
| 736 |     in a QByteArray. | 
| 737 |  | 
| 738 |     \a state can be \nullptr in which case the conversion is stateless and | 
| 739 |     default conversion rules should be used. If \a state is not \nullptr, the | 
| 740 |     codec should save the state after the conversion in \a state, and | 
| 741 |     adjust the \c remainingChars and \c invalidChars members of the struct. | 
| 742 | */ | 
| 743 |  | 
| 744 | /*! | 
    Creates a QTextDecoder with the specified \a flags to decode chunks
| 746 |     of \c{char *} data to create chunks of Unicode data. | 
| 747 |  | 
| 748 |     The caller is responsible for deleting the returned object. | 
| 749 |  | 
| 750 |     \since 4.7 | 
| 751 | */ | 
| 752 | QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const | 
| 753 | { | 
| 754 |     return new QTextDecoder(this, flags); | 
| 755 | } | 
| 756 |  | 
| 757 | /*! | 
    Creates a QTextEncoder with the specified \a flags to encode chunks
| 759 |     of Unicode data as \c{char *} data. | 
| 760 |  | 
| 761 |     The caller is responsible for deleting the returned object. | 
| 762 |  | 
| 763 |     \since 4.7 | 
| 764 | */ | 
| 765 | QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const | 
| 766 | { | 
| 767 |     return new QTextEncoder(this, flags); | 
| 768 | } | 
| 769 |  | 
| 770 | /*! | 
| 771 |     \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number, | 
| 772 |                                            ConverterState *state) const | 
| 773 |  | 
| 774 |     Converts the first \a number of characters from the \a input array | 
| 775 |     from Unicode to the encoding of this codec, and returns the result | 
| 776 |     in a QByteArray. | 
| 777 |  | 
    The \a state of the converter used is updated.
| 779 | */ | 
| 780 |  | 
| 781 | /*! | 
| 782 |     Converts \a str from Unicode to the encoding of this codec, and | 
| 783 |     returns the result in a QByteArray. | 
| 784 | */ | 
| 785 | QByteArray QTextCodec::fromUnicode(const QString& str) const | 
| 786 | { | 
| 787 |     ConverterState state = DefaultConversion | Flag::Stateless; | 
| 788 |     return convertFromUnicode(in: str.constData(), length: str.size(), state: &state); | 
| 789 | } | 
| 790 |  | 
| 791 | /*! | 
| 792 |     \overload | 
| 793 |     \since 5.10 | 
| 794 |  | 
| 795 |     Converts \a str from Unicode to the encoding of this codec, and | 
| 796 |     returns the result in a QByteArray. | 
| 797 | */ | 
| 798 | QByteArray QTextCodec::fromUnicode(QStringView str) const | 
| 799 | { | 
| 800 |     ConverterState state = DefaultConversion | Flag::Stateless; | 
| 801 |     return convertFromUnicode(in: str.data(), length: str.size(), state: &state); | 
| 802 | } | 
| 803 |  | 
| 804 | /*! | 
| 805 |     \fn QString QTextCodec::toUnicode(const char *input, int size, | 
| 806 |                                       ConverterState *state) const | 
| 807 |  | 
| 808 |     Converts the first \a size characters from the \a input from the | 
| 809 |     encoding of this codec to Unicode, and returns the result in a | 
| 810 |     QString. | 
| 811 |  | 
| 812 |     The \a state of the convertor used is updated. | 
| 813 | */ | 
| 814 |  | 
| 815 | /*! | 
| 816 |     Converts \a a from the encoding of this codec to Unicode, and | 
| 817 |     returns the result in a QString. | 
| 818 | */ | 
| 819 | QString QTextCodec::toUnicode(const QByteArray& a) const | 
| 820 | { | 
| 821 |     ConverterState state = DefaultConversion | Flag::Stateless; | 
| 822 |     return convertToUnicode(in: a.constData(), length: a.size(), state: &state); | 
| 823 | } | 
| 824 |  | 
| 825 | /*! | 
| 826 |     Returns \c true if the Unicode character \a ch can be fully encoded | 
| 827 |     with this codec; otherwise returns \c false. | 
| 828 | */ | 
| 829 | bool QTextCodec::canEncode(QChar ch) const | 
| 830 | { | 
| 831 |     ConverterState state; | 
| 832 |     state.flags = ConvertInvalidToNull; | 
| 833 |     convertFromUnicode(in: &ch, length: 1, state: &state); | 
| 834 |     return (state.invalidChars == 0); | 
| 835 | } | 
| 836 |  | 
| 837 | /*! | 
| 838 |     \overload | 
| 839 |  | 
| 840 |     \a s contains the string being tested for encode-ability. | 
| 841 | */ | 
| 842 | bool QTextCodec::canEncode(const QString& s) const | 
| 843 | { | 
| 844 |     ConverterState state; | 
| 845 |     state.flags = ConvertInvalidToNull; | 
| 846 |     convertFromUnicode(in: s.constData(), length: s.size(), state: &state); | 
| 847 |     return (state.invalidChars == 0); | 
| 848 | } | 
| 849 |  | 
| 850 | /*! | 
| 851 |     \overload | 
| 852 |     \since 5.10 | 
| 853 |  | 
| 854 |     Returns \c true if the Unicode string \a s can be fully encoded | 
| 855 |     with this codec; otherwise returns \c false. | 
| 856 | */ | 
| 857 | bool QTextCodec::canEncode(QStringView s) const | 
| 858 | { | 
| 859 |     ConverterState state; | 
| 860 |     state.flags = ConvertInvalidToNull; | 
| 861 |     convertFromUnicode(in: s.data(), length: s.size(), state: &state); | 
| 862 |     return !state.invalidChars; | 
| 863 | } | 
| 864 | /*! | 
| 865 |     \overload | 
| 866 |  | 
| 867 |     \a chars contains the source characters. | 
| 868 | */ | 
| 869 | QString QTextCodec::toUnicode(const char *chars) const | 
| 870 | { | 
| 871 |     const auto len = int(qstrlen(str: chars)); | 
| 872 |     return convertToUnicode(in: chars, length: len, state: nullptr); | 
| 873 | } | 
| 874 |  | 
| 875 |  | 
| 876 | /*! | 
| 877 |     \class QTextEncoder | 
| 878 |     \inmodule QtCore5Compat | 
| 879 |     \brief The QTextEncoder class provides a state-based encoder. | 
| 880 |     \reentrant | 
| 881 |     \ingroup i18n | 
| 882 |  | 
| 883 |     A text encoder converts text from Unicode into an encoded text format | 
| 884 |     using a specific codec. | 
| 885 |  | 
| 886 |     The encoder converts Unicode into another format, remembering any | 
| 887 |     state that is required between calls. | 
| 888 |  | 
| 889 |     \sa QTextCodec::makeEncoder(), QTextDecoder | 
| 890 | */ | 
| 891 |  | 
| 892 | /*! | 
| 893 |     \fn QTextEncoder::QTextEncoder(const QTextCodec *codec) | 
| 894 |  | 
| 895 |     Constructs a text encoder for the given \a codec. | 
| 896 | */ | 
| 897 |  | 
| 898 | /*! | 
| 899 |     Constructs a text encoder for the given \a codec and conversion \a flags. | 
| 900 |  | 
| 901 |     \since 4.7 | 
| 902 | */ | 
| 903 | QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags) | 
| 904 |     : c(codec), state() | 
| 905 | { | 
| 906 |     state.flags = flags; | 
| 907 | } | 
| 908 |  | 
| 909 | /*! | 
| 910 |     Destroys the encoder. | 
| 911 | */ | 
| 912 | QTextEncoder::~QTextEncoder() | 
| 913 | { | 
| 914 | } | 
| 915 |  | 
/*!
    \internal
    \since 4.5
    Determines whether the encoder encountered a failure while encoding the input. If
    an error was encountered, the produced result is undefined, and gets converted
    according to the conversion flags.
 */
| 923 | bool QTextEncoder::hasFailure() const | 
| 924 | { | 
| 925 |     return state.invalidChars != 0; | 
| 926 | } | 
| 927 |  | 
| 928 | /*! | 
| 929 |     Converts the Unicode string \a str into an encoded QByteArray. | 
| 930 | */ | 
| 931 | QByteArray QTextEncoder::fromUnicode(const QString& str) | 
| 932 | { | 
| 933 |     return c->fromUnicode(in: str.constData(), length: str.size(), state: &state); | 
| 934 | } | 
| 935 |  | 
| 936 | /*! | 
| 937 |     \overload | 
| 938 |     \since 5.10 | 
| 939 |     Converts the Unicode string \a str into an encoded QByteArray. | 
| 940 | */ | 
| 941 | QByteArray QTextEncoder::fromUnicode(QStringView str) | 
| 942 | { | 
| 943 |     return c->fromUnicode(in: str.data(), length: str.size(), state: &state); | 
| 944 | } | 
| 945 |  | 
| 946 | /*! | 
| 947 |     \overload | 
| 948 |  | 
| 949 |     Converts \a len characters (not bytes) from \a uc, and returns the | 
| 950 |     result in a QByteArray. | 
| 951 | */ | 
| 952 | QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len) | 
| 953 | { | 
| 954 |     return c->fromUnicode(in: uc, length: len, state: &state); | 
| 955 | } | 
| 956 |  | 
| 957 | /*! | 
| 958 |     \class QTextDecoder | 
| 959 |     \inmodule QtCore5Compat | 
| 960 |     \brief The QTextDecoder class provides a state-based decoder. | 
| 961 |     \reentrant | 
| 962 |     \ingroup i18n | 
| 963 |  | 
| 964 |     A text decoder converts text from an encoded text format into Unicode | 
| 965 |     using a specific codec. | 
| 966 |  | 
| 967 |     The decoder converts text in this format into Unicode, remembering any | 
| 968 |     state that is required between calls. | 
| 969 |  | 
| 970 |     \sa QTextCodec::makeDecoder(), QTextEncoder | 
| 971 | */ | 
| 972 |  | 
| 973 | /*! | 
| 974 |     \fn  QTextDecoder::QTextDecoder(const QTextCodec *codec) | 
| 975 |  | 
| 976 |     Constructs a text decoder for the given \a codec. | 
| 977 | */ | 
| 978 |  | 
| 979 | /*! | 
| 980 |     Constructs a text decoder for the given \a codec and conversion \a flags. | 
| 981 |  | 
| 982 |     \since 4.7 | 
| 983 | */ | 
| 984 |  | 
| 985 | QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags) | 
| 986 |     : c(codec), state() | 
| 987 | { | 
| 988 |     state.flags = flags; | 
| 989 | } | 
| 990 |  | 
| 991 | /*! | 
| 992 |     Destroys the decoder. | 
| 993 | */ | 
| 994 | QTextDecoder::~QTextDecoder() | 
| 995 | { | 
| 996 | } | 
| 997 |  | 
| 998 | /*! | 
| 999 |     \fn QString QTextDecoder::toUnicode(const char *chars, int len) | 
| 1000 |  | 
| 1001 |     Converts the first \a len bytes in \a chars to Unicode, returning | 
| 1002 |     the result. | 
| 1003 |  | 
| 1004 |     If not all characters are used (e.g. if only part of a multi-byte | 
| 1005 |     encoding is at the end of the characters), the decoder remembers | 
| 1006 |     enough state to continue with the next call to this function. | 
| 1007 | */ | 
| 1008 | QString QTextDecoder::toUnicode(const char *chars, int len) | 
| 1009 | { | 
| 1010 |     return c->toUnicode(in: chars, length: len, state: &state); | 
| 1011 | } | 
| 1012 |  | 
| 1013 | /*! \overload | 
| 1014 |  | 
| 1015 |     The converted string is returned in \a target. | 
| 1016 |  */ | 
| 1017 | void QTextDecoder::toUnicode(QString *target, const char *chars, int len) | 
| 1018 | { | 
| 1019 |     Q_ASSERT(target); | 
| 1020 |     switch (c->mibEnum()) { | 
| 1021 |     case 106: // utf8 | 
| 1022 |         static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state); | 
| 1023 |         break; | 
| 1024 |     case 4: // latin1 | 
| 1025 |         target->resize(size: len); | 
| 1026 |         qt_from_latin1(dst: (char16_t*)target->data(), str: chars, size: len); | 
| 1027 |         break; | 
| 1028 |     default: | 
| 1029 |         *target = c->toUnicode(in: chars, length: len, state: &state); | 
| 1030 |     } | 
| 1031 | } | 
| 1032 |  | 
| 1033 |  | 
| 1034 | /*! | 
| 1035 |     \overload | 
| 1036 |  | 
| 1037 |     Converts the bytes in the byte array specified by \a ba to Unicode | 
| 1038 |     and returns the result. | 
| 1039 | */ | 
| 1040 | QString QTextDecoder::toUnicode(const QByteArray &ba) | 
| 1041 | { | 
| 1042 |     return c->toUnicode(in: ba.constData(), length: ba.size(), state: &state); | 
| 1043 | } | 
| 1044 |  | 
| 1045 | /*! | 
| 1046 |     \since 4.4 | 
| 1047 |  | 
| 1048 |     Tries to detect the encoding of the provided snippet of HTML in | 
| 1049 |     the given byte array, \a ba, by checking the BOM (Byte Order Mark) | 
| 1050 |     and the content-type meta header and returns a QTextCodec instance | 
| 1051 |     that is capable of decoding the html to unicode.  If the codec | 
| 1052 |     cannot be detected from the content provided, \a defaultCodec is | 
| 1053 |     returned. | 
| 1054 |  | 
| 1055 |     \sa codecForUtfText() | 
| 1056 | */ | 
| 1057 | QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec) | 
| 1058 | { | 
| 1059 |     // determine charset | 
| 1060 |     QTextCodec *c = QTextCodec::codecForUtfText(ba, defaultCodec: nullptr); | 
| 1061 |     if (!c) { | 
| 1062 |         static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "meta " ); | 
| 1063 |         QByteArray  = ba.left(n: 1024).toLower(); | 
| 1064 |         qsizetype pos = matcher.indexIn(haystack: header); | 
| 1065 |         if (pos != -1) { | 
| 1066 |             static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "charset=" ); | 
| 1067 |             pos = matcher.indexIn(haystack: header, from: pos); | 
| 1068 |             if (pos != -1) { | 
| 1069 |                 pos += qstrlen(str: "charset=" ); | 
| 1070 |  | 
| 1071 |                 qsizetype pos2 = pos; | 
| 1072 |                 // The attribute can be closed with either """, "'", ">" or "/", | 
| 1073 |                 // none of which are valid charset characters. | 
| 1074 |                 while (++pos2 < header.size()) { | 
| 1075 |                     char ch = header.at(i: pos2); | 
| 1076 |                     if (ch == '\"' || ch == '\'' || ch == '>') { | 
| 1077 |                         QByteArray name = header.mid(index: pos, len: pos2 - pos); | 
| 1078 |                         if (name == "unicode" ) // QTBUG-41998, ICU will return UTF-16. | 
| 1079 |                             name = QByteArrayLiteral("UTF-8" ); | 
| 1080 |                         c = QTextCodec::codecForName(name); | 
| 1081 |                         return c ? c : defaultCodec; | 
| 1082 |                     } | 
| 1083 |                 } | 
| 1084 |             } | 
| 1085 |         } | 
| 1086 |     } | 
| 1087 |     if (!c) | 
| 1088 |         c = defaultCodec; | 
| 1089 |  | 
| 1090 |     return c; | 
| 1091 | } | 
| 1092 |  | 
| 1093 | /*! | 
| 1094 |     \overload | 
| 1095 |  | 
| 1096 |     Tries to detect the encoding of the provided snippet of HTML in | 
| 1097 |     the given byte array, \a ba, by checking the BOM (Byte Order Mark) | 
| 1098 |     and the content-type meta header and returns a QTextCodec instance | 
| 1099 |     that is capable of decoding the html to unicode. If the codec cannot | 
| 1100 |     be detected, this overload returns a Latin-1 QTextCodec. | 
| 1101 | */ | 
| 1102 | QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba) | 
| 1103 | { | 
| 1104 |     return codecForHtml(ba, defaultCodec: QTextCodec::codecForName(name: "ISO-8859-1" )); | 
| 1105 | } | 
| 1106 |  | 
| 1107 | /*! | 
| 1108 |     \since 4.6 | 
| 1109 |  | 
| 1110 |     Tries to detect the encoding of the provided snippet \a ba by | 
| 1111 |     using the BOM (Byte Order Mark) and returns a QTextCodec instance | 
| 1112 |     that is capable of decoding the text to unicode. This function can | 
| 1113 |     detect one of the following codecs: | 
| 1114 |  | 
| 1115 |     \list | 
| 1116 |       \li UTF-32 Little Endian | 
| 1117 |       \li UTF-32 Big Endian | 
| 1118 |       \li UTF-16 Little Endian | 
| 1119 |       \li UTF-16 Big Endian | 
| 1120 |       \li UTF-8 | 
| 1121 |     \endlist | 
| 1122 |  | 
| 1123 |     If the codec cannot be detected from the content provided, \a defaultCodec | 
| 1124 |     is returned. | 
| 1125 |  | 
| 1126 |     \sa codecForHtml() | 
| 1127 | */ | 
| 1128 | QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec) | 
| 1129 | { | 
| 1130 |     const int arraySize = ba.size(); | 
| 1131 |     const uchar *buf = reinterpret_cast<const uchar *>(ba.constData()); | 
| 1132 |     const uint bom = 0xfeff; | 
| 1133 |  | 
| 1134 |     if (arraySize > 3) { | 
| 1135 |         uint uc = qFromUnaligned<uint>(src: buf); | 
| 1136 |         if (uc == qToBigEndian(source: bom)) | 
| 1137 |             return QTextCodec::codecForMib(mib: 1018); // utf-32 be | 
| 1138 |         else if (uc == qToLittleEndian(source: bom)) | 
| 1139 |             return QTextCodec::codecForMib(mib: 1019); // utf-32 le | 
| 1140 |     } | 
| 1141 |  | 
| 1142 |     if (arraySize < 2) | 
| 1143 |         return defaultCodec; | 
| 1144 |  | 
| 1145 |     ushort uc = qFromUnaligned<ushort>(src: buf); | 
| 1146 |     if (uc == qToBigEndian(source: ushort(bom))) | 
| 1147 |         return QTextCodec::codecForMib(mib: 1013); // utf16 be | 
| 1148 |     else if (uc == qToLittleEndian(source: ushort(bom))) | 
| 1149 |         return QTextCodec::codecForMib(mib: 1014); // utf16 le | 
| 1150 |  | 
| 1151 |     if (arraySize < 3) | 
| 1152 |         return defaultCodec; | 
| 1153 |  | 
| 1154 |     static const char utf8bom[] = "\xef\xbb\xbf" ; | 
| 1155 |     if (memcmp(s1: buf, s2: utf8bom, n: sizeof(utf8bom) - 1) == 0) | 
| 1156 |         return QTextCodec::codecForMib(mib: 106); // utf-8 | 
| 1157 |  | 
| 1158 |     return defaultCodec; | 
| 1159 | } | 
| 1160 |  | 
| 1161 | /*! | 
| 1162 |     \overload | 
| 1163 |  | 
| 1164 |     Tries to detect the encoding of the provided snippet \a ba by | 
| 1165 |     using the BOM (Byte Order Mark) and returns a QTextCodec instance | 
| 1166 |     that is capable of decoding the text to unicode. This function can | 
| 1167 |     detect one of the following codecs: | 
| 1168 |  | 
| 1169 |     \list | 
| 1170 |       \li UTF-32 Little Endian | 
| 1171 |       \li UTF-32 Big Endian | 
| 1172 |       \li UTF-16 Little Endian | 
| 1173 |       \li UTF-16 Big Endian | 
| 1174 |       \li UTF-8 | 
| 1175 |     \endlist | 
| 1176 |  | 
| 1177 |     If the codec cannot be detected from the content provided, this overload | 
| 1178 |     returns a Latin-1 QTextCodec. | 
| 1179 |  | 
| 1180 |     \sa codecForHtml() | 
| 1181 | */ | 
| 1182 | QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba) | 
| 1183 | { | 
| 1184 |     return codecForUtfText(ba, defaultCodec: QTextCodec::codecForMib(/*Latin 1*/ mib: 4)); | 
| 1185 | } | 
| 1186 |  | 
| 1187 | /*! | 
| 1188 |     \fn QTextCodec *QTextCodec::codecForTr () | 
| 1189 |     \deprecated | 
| 1190 |  | 
| 1191 |     Returns the codec used by QObject::tr() on its argument. If this | 
| 1192 |     function returns \nullptr (the default), tr() assumes Latin-1. | 
| 1193 | */ | 
| 1194 |  | 
/*!
    \internal
    \since 4.3
    Determines whether the decoder encountered a failure while decoding the
    input. If an error was encountered, the produced result is undefined, and
    gets converted according to the conversion flags.
 */
| 1202 | bool QTextDecoder::hasFailure() const | 
| 1203 | { | 
| 1204 |     return state.invalidChars != 0; | 
| 1205 | } | 
| 1206 |  | 
| 1207 | /*! | 
| 1208 |     \internal | 
| 1209 |     \since 5.12 | 
| 1210 |  | 
| 1211 |     Determines whether the decoder needs more bytes to continue decoding. That | 
| 1212 |     is, this signifies that the input string ended in the middle of a | 
| 1213 |     multi-byte sequence. Note that it's possible some codecs do not report this. | 
| 1214 |  */ | 
| 1215 | bool QTextDecoder::needsMoreData() const | 
| 1216 | { | 
| 1217 |     return state.remainingChars; | 
| 1218 | } | 
| 1219 |  | 
| 1220 | /*! | 
| 1221 |     \fn QTextCodec * Qt::codecForHtml(const QByteArray &ba) | 
| 1222 |     \internal | 
| 1223 |  | 
| 1224 |     This function is defined in the \c <QTextCodec> header file. | 
| 1225 | */ | 
| 1226 | QTextCodec *Qt::codecForHtml(const QByteArray &ba) | 
| 1227 | { | 
| 1228 |     return QTextCodec::codecForHtml(ba); | 
| 1229 | } | 
| 1230 |  | 
| 1231 | QT_END_NAMESPACE | 
| 1232 |  |