1 | // Copyright (C) 2018 The Qt Company Ltd. |
2 | // Copyright (C) 2018 Intel Corporation. |
3 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
4 | |
5 | #include "qplatformdefs.h" |
6 | |
7 | #include "qtextcodec.h" |
8 | #include "qtextcodec_p.h" |
9 | |
10 | #include "qbytearraymatcher.h" |
11 | #include "qendian.h" |
12 | #include "qfile.h" |
13 | #include "qlist.h" |
14 | #include <private/qlocking_p.h> |
15 | #include "qstringlist.h" |
16 | #include "qvarlengtharray.h" |
17 | |
18 | #include <private/qcoreapplication_p.h> |
19 | |
20 | #include "qutfcodec_p.h" |
21 | #include "qlatincodec_p.h" |
22 | |
23 | #if QT_CONFIG(codecs) |
24 | # include "qtsciicodec_p.h" |
25 | # include "qisciicodec_p.h" |
26 | #endif |
27 | #if QT_CONFIG(icu) |
28 | #include "qicucodec_p.h" |
29 | #else |
30 | #if QT_CONFIG(iconv) |
31 | # include "qiconvcodec_p.h" |
32 | #endif |
33 | #ifdef Q_OS_WIN |
34 | # include "qwindowscodec_p.h" |
35 | #endif |
36 | # include "qsimplecodec_p.h" |
37 | #if QT_CONFIG(big_codecs) && QT_CONFIG(textcodec) |
38 | # ifndef Q_OS_INTEGRITY |
39 | # include "qgb18030codec_p.h" |
40 | # include "qeucjpcodec_p.h" |
41 | # include "qjiscodec_p.h" |
42 | # include "qsjiscodec_p.h" |
43 | # include "qeuckrcodec_p.h" |
44 | # include "qbig5codec_p.h" |
45 | # endif // !Q_OS_INTEGRITY |
46 | #endif // big_codecs |
47 | |
48 | #endif // icu |
49 | |
50 | #include <mutex> |
51 | |
52 | #include <stdlib.h> |
53 | #include <ctype.h> |
54 | #include <locale.h> |
55 | #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID) |
56 | # include <langinfo.h> |
57 | #endif |
58 | |
59 | QT_BEGIN_NAMESPACE |
60 | |
61 | // in qstring.cpp: |
62 | void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept; |
63 | |
64 | typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt; |
65 | typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt; |
66 | |
67 | Q_GLOBAL_STATIC(QRecursiveMutex, textCodecsMutex); |
68 | |
69 | Q_GLOBAL_STATIC(QTextCodecData, textCodecData) |
70 | |
71 | QTextCodecData::QTextCodecData() |
72 | : codecForLocale(nullptr) |
73 | { |
74 | } |
75 | |
76 | QTextCodecData::~QTextCodecData() |
77 | { |
78 | codecForLocale.storeRelease(newValue: nullptr); |
79 | QList<QTextCodec *> tmp = allCodecs; |
80 | allCodecs.clear(); |
81 | codecCache.clear(); |
82 | for (QList<QTextCodec *>::const_iterator it = tmp.constBegin(); it != tmp.constEnd(); ++it) |
83 | delete *it; |
84 | } |
85 | |
86 | QTextCodecData *QTextCodecData::instance() |
87 | { |
88 | return textCodecData(); |
89 | } |
90 | |
91 | class TextCodecsMutexLocker |
92 | { |
93 | using Lock = decltype(qt_unique_lock(mutex&: std::declval<QRecursiveMutex&>())); |
94 | // ### FIXME: this is used when textCodecsMutex already == nullptr |
95 | const Lock lock = qt_unique_lock(mutex: textCodecsMutex()); |
96 | public: |
97 | TextCodecsMutexLocker() {} // required d/t an ICC 19 bug |
98 | }; |
99 | |
100 | #if !QT_CONFIG(icu) |
// ASCII-only lower-casing: 'A'..'Z' become 'a'..'z'; every other value is
// returned unchanged (no locale involvement).
static char qtolower(char c)
{
    const bool isUpper = (c >= 'A' && c <= 'Z');
    return isUpper ? c + 0x20 : c;
}
// ASCII-only test for a decimal digit or a letter (either case).
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;

    // Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z'.
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
105 | |
106 | bool qTextCodecNameMatch(const char *n, const char *h) |
107 | { |
108 | if (qstricmp(n, h) == 0) |
109 | return true; |
110 | |
111 | // if the letters and numbers are the same, we have a match |
112 | while (*n != '\0') { |
113 | if (qisalnum(*n)) { |
114 | for (;;) { |
115 | if (*h == '\0') |
116 | return false; |
117 | if (qisalnum(*h)) |
118 | break; |
119 | ++h; |
120 | } |
121 | if (qtolower(*n) != qtolower(*h)) |
122 | return false; |
123 | ++h; |
124 | } |
125 | ++n; |
126 | } |
127 | while (*h && !qisalnum(*h)) |
128 | ++h; |
129 | return (*h == '\0'); |
130 | } |
131 | |
132 | |
133 | #if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8) |
134 | static QTextCodec *checkForCodec(const QByteArray &name) { |
135 | QTextCodec *c = QTextCodec::codecForName(name); |
136 | if (!c) { |
137 | const int index = name.indexOf('@'); |
138 | if (index != -1) { |
139 | c = QTextCodec::codecForName(name.left(index)); |
140 | } |
141 | } |
142 | return c; |
143 | } |
144 | #endif |
145 | |
146 | static void setup(); |
147 | |
148 | // \threadsafe |
149 | // this returns the codec the method sets up as locale codec to |
150 | // avoid a race condition in codecForLocale() when |
151 | // setCodecForLocale(nullptr) is called at the same time. |
152 | static QTextCodec *setupLocaleMapper() |
153 | { |
154 | QTextCodecData *globalData = QTextCodecData::instance(); |
155 | |
156 | QTextCodec *locale = nullptr; |
157 | |
158 | { |
159 | const TextCodecsMutexLocker locker; |
160 | if (globalData->allCodecs.isEmpty()) |
161 | setup(); |
162 | } |
163 | |
164 | QCoreApplicationPrivate::initLocale(); |
165 | |
166 | #if defined(QT_LOCALE_IS_UTF8) |
167 | locale = QTextCodec::codecForName("UTF-8" ); |
168 | #elif defined(Q_OS_WIN) |
169 | locale = QTextCodec::codecForName("System" ); |
170 | #else |
171 | |
172 | // First try getting the codecs name from nl_langinfo and see |
173 | // if we have a builtin codec for it. |
174 | // Only fall back to using iconv if we can't find a builtin codec |
175 | // This is because the builtin utf8 codec is around 5 times faster |
176 | // then the using QIconvCodec |
177 | |
178 | #if defined (_XOPEN_UNIX) |
179 | char *charset = nl_langinfo(CODESET); |
180 | if (charset) |
181 | locale = QTextCodec::codecForName(charset); |
182 | #endif |
183 | #if QT_CONFIG(iconv) |
184 | if (!locale) { |
185 | // no builtin codec for the locale found, let's try using iconv |
186 | (void) new QIconvCodec(); |
187 | locale = QTextCodec::codecForName("System" ); |
188 | } |
189 | #endif |
190 | |
191 | if (!locale) { |
192 | // Very poorly defined and followed standards causes lots of |
193 | // code to try to get all the cases... This logic is |
194 | // duplicated in QIconvCodec, so if you change it here, change |
195 | // it there too. |
196 | |
197 | // Try to determine locale codeset from locale name assigned to |
198 | // LC_CTYPE category. |
199 | |
200 | // First part is getting that locale name. First try setlocale() which |
201 | // definitely knows it, but since we cannot fully trust it, get ready |
202 | // to fall back to environment variables. |
203 | const QByteArray ctype = setlocale(LC_CTYPE, nullptr); |
204 | |
205 | // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG |
206 | // environment variables. |
207 | QByteArray lang = qgetenv("LC_ALL" ); |
208 | if (lang.isEmpty() || lang == "C" ) { |
209 | lang = qgetenv("LC_CTYPE" ); |
210 | } |
211 | if (lang.isEmpty() || lang == "C" ) { |
212 | lang = qgetenv("LANG" ); |
213 | } |
214 | |
215 | // Now try these in order: |
216 | // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) |
217 | // 2. CODESET from lang if it contains a .CODESET part |
218 | // 3. ctype (maybe the locale is named "ISO-8859-1" or something) |
219 | // 4. locale (ditto) |
220 | // 5. check for "@euro" |
221 | // 6. guess locale from ctype unless ctype is "C" |
222 | // 7. guess locale from lang |
223 | |
224 | // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) |
225 | int indexOfDot = ctype.indexOf('.'); |
226 | if (indexOfDot != -1) |
227 | locale = checkForCodec( ctype.mid(indexOfDot + 1) ); |
228 | |
229 | // 2. CODESET from lang if it contains a .CODESET part |
230 | if (!locale) { |
231 | indexOfDot = lang.indexOf('.'); |
232 | if (indexOfDot != -1) |
233 | locale = checkForCodec( lang.mid(indexOfDot + 1) ); |
234 | } |
235 | |
236 | // 3. ctype (maybe the locale is named "ISO-8859-1" or something) |
237 | if (!locale && !ctype.isEmpty() && ctype != "C" ) |
238 | locale = checkForCodec(ctype); |
239 | |
240 | // 4. locale (ditto) |
241 | if (!locale && !lang.isEmpty()) |
242 | locale = checkForCodec(lang); |
243 | |
244 | // 5. "@euro" |
245 | if ((!locale && ctype.contains("@euro" )) || lang.contains("@euro" )) |
246 | locale = checkForCodec("ISO 8859-15" ); |
247 | } |
248 | |
249 | #endif |
250 | // If everything failed, we default to 8859-1 |
251 | if (!locale) |
252 | locale = QTextCodec::codecForName("ISO 8859-1" ); |
253 | globalData->codecForLocale.storeRelease(locale); |
254 | return locale; |
255 | } |
256 | |
257 | |
258 | // textCodecsMutex needs to be locked to enter this function |
259 | static void setup() |
260 | { |
261 | static bool initialized = false; |
262 | if (initialized) |
263 | return; |
264 | initialized = true; |
265 | |
266 | #if QT_CONFIG(codecs) |
267 | (void)new QTsciiCodec; |
268 | for (int i = 0; i < 9; ++i) |
269 | (void)new QIsciiCodec(i); |
270 | for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i) |
271 | (void)new QSimpleTextCodec(i); |
272 | |
273 | # if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY) |
274 | (void)new QGb18030Codec; |
275 | (void)new QGbkCodec; |
276 | (void)new QGb2312Codec; |
277 | (void)new QEucJpCodec; |
278 | (void)new QJisCodec; |
279 | (void)new QSjisCodec; |
280 | (void)new QEucKrCodec; |
281 | (void)new QCP949Codec; |
282 | (void)new QBig5Codec; |
283 | (void)new QBig5hkscsCodec; |
284 | # endif // big_codecs && !Q_OS_INTEGRITY |
285 | #if QT_CONFIG(iconv) |
286 | (void) new QIconvCodec; |
287 | #endif |
288 | #if defined(Q_OS_WIN32) |
289 | (void) new QWindowsLocalCodec; |
290 | #endif // Q_OS_WIN32 |
291 | #endif // codecs |
292 | |
293 | (void)new QUtf16Codec; |
294 | (void)new QUtf16BECodec; |
295 | (void)new QUtf16LECodec; |
296 | (void)new QUtf32Codec; |
297 | (void)new QUtf32BECodec; |
298 | (void)new QUtf32LECodec; |
299 | (void)new QLatin15Codec; |
300 | (void)new QLatin1Codec; |
301 | (void)new QUtf8Codec; |
302 | } |
303 | #else |
// ICU configuration: no codecs are created here.
static void setup()
{
}
305 | #endif // icu |
306 | |
307 | /*! |
308 | \class QTextCodec |
309 | \inmodule QtCore5Compat |
310 | \brief The QTextCodec class provides conversions between text encodings. |
311 | \reentrant |
312 | \ingroup i18n |
313 | |
314 | Qt uses Unicode to store, draw and manipulate strings. In many |
315 | situations you may wish to deal with data that uses a different |
316 | encoding. For example, most Japanese documents are still stored |
317 | in Shift-JIS or ISO 2022-JP, while Russian users often have their |
318 | documents in KOI8-R or Windows-1251. |
319 | |
320 | Qt provides a set of QTextCodec classes to help with converting |
321 | non-Unicode formats to and from Unicode. You can also create your |
322 | own codec classes. |
323 | |
324 | The supported encodings are: |
325 | |
326 | \list |
327 | \li \l{Big5 Text Codec}{Big5} |
328 | \li \l{Big5-HKSCS Text Codec}{Big5-HKSCS} |
329 | \li CP949 |
330 | \li \l{EUC-JP Text Codec}{EUC-JP} |
331 | \li \l{EUC-KR Text Codec}{EUC-KR} |
332 | \li \l{GBK Text Codec}{GB18030} |
333 | \li HP-ROMAN8 |
334 | \li IBM 850 |
335 | \li IBM 866 |
336 | \li IBM 874 |
337 | \li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP} |
338 | \li ISO 8859-1 to 10 |
339 | \li ISO 8859-13 to 16 |
340 | \li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml |
341 | \li KOI8-R |
342 | \li KOI8-U |
343 | \li Macintosh |
344 | \li \l{Shift-JIS Text Codec}{Shift-JIS} |
345 | \li TIS-620 |
346 | \li \l{TSCII Text Codec}{TSCII} |
347 | \li UTF-8 |
348 | \li UTF-16 |
349 | \li UTF-16BE |
350 | \li UTF-16LE |
351 | \li UTF-32 |
352 | \li UTF-32BE |
353 | \li UTF-32LE |
354 | \li Windows-1250 to 1258 |
355 | \endlist |
356 | |
357 | If Qt is compiled with ICU support enabled, most codecs supported by |
358 | ICU will also be available to the application. |
359 | |
360 | \l {QTextCodec}s can be used as follows to convert some locally encoded |
361 | string to Unicode. Suppose you have some string encoded in Russian |
362 | KOI8-R encoding, and want to convert it to Unicode. The simple way |
363 | to do it is like this: |
364 | |
365 | \snippet code/src_corelib_codecs_qtextcodec.cpp 0 |
366 | |
367 | After this, \c string holds the text converted to Unicode. |
368 | Converting a string from Unicode to the local encoding is just as |
369 | easy: |
370 | |
371 | \snippet code/src_corelib_codecs_qtextcodec.cpp 1 |
372 | |
373 | Some care must be taken when trying to convert the data in chunks, |
374 | for example, when receiving it over a network. In such cases it is |
375 | possible that a multi-byte character will be split over two |
376 | chunks. At best this might result in the loss of a character and |
377 | at worst cause the entire conversion to fail. |
378 | |
379 | The approach to use in these situations is to create a QTextDecoder |
380 | object for the codec and use this QTextDecoder for the whole |
381 | decoding process, as shown below: |
382 | |
383 | \snippet code/src_corelib_codecs_qtextcodec.cpp 2 |
384 | |
385 | The QTextDecoder object maintains state between chunks and therefore |
386 | works correctly even if a multi-byte character is split between |
387 | chunks. |
388 | |
389 | \section1 Creating Your Own Codec Class |
390 | |
391 | Support for new text encodings can be added to Qt by creating |
392 | QTextCodec subclasses. |
393 | |
394 | The pure virtual functions describe the encoder to the system and |
395 | the coder is used as required in the different text file formats |
396 | supported by QTextStream, and under X11, for the locale-specific |
397 | character input and output. |
398 | |
399 | To add support for another encoding to Qt, make a subclass of |
400 | QTextCodec and implement the functions listed in the table below. |
401 | |
402 | \table |
403 | \header \li Function \li Description |
404 | |
405 | \row \li name() |
406 | \li Returns the official name for the encoding. If the |
407 | encoding is listed in the |
408 | \l{IANA character-sets encoding file}, the name |
409 | should be the preferred MIME name for the encoding. |
410 | |
411 | \row \li aliases() |
412 | \li Returns a list of alternative names for the encoding. |
413 | QTextCodec provides a default implementation that returns |
414 | an empty list. For example, "ISO-8859-1" has "latin1", |
415 | "CP819", "IBM819", and "iso-ir-100" as aliases. |
416 | |
417 | \row \li \l{QTextCodec::mibEnum()}{mibEnum()} |
418 | \li Return the MIB enum for the encoding if it is listed in |
419 | the \l{IANA character-sets encoding file}. |
420 | |
421 | \row \li convertToUnicode() |
422 | \li Converts an 8-bit character string to Unicode. |
423 | |
424 | \row \li convertFromUnicode() |
425 | \li Converts a Unicode string to an 8-bit character string. |
426 | \endtable |
427 | |
428 | \sa QTextStream, QTextDecoder, QTextEncoder |
429 | */ |
430 | |
431 | /*! |
432 | Constructs a QTextCodec, and gives it the highest precedence. The |
433 | QTextCodec should always be constructed on the heap (i.e. with \c |
434 | new). Qt takes ownership and will delete it when the application |
435 | terminates. |
436 | */ |
437 | QTextCodec::QTextCodec() |
438 | { |
439 | const TextCodecsMutexLocker locker; |
440 | |
441 | QTextCodecData *globalInstance = QTextCodecData::instance(); |
442 | if (globalInstance->allCodecs.isEmpty()) |
443 | setup(); |
444 | |
445 | globalInstance->allCodecs.prepend(t: this); |
446 | } |
447 | |
448 | |
449 | /*! |
450 | \nonreentrant |
451 | |
452 | Destroys the QTextCodec. Note that you should not delete codecs |
453 | yourself: once created they become Qt's responsibility. |
454 | */ |
455 | QTextCodec::~QTextCodec() |
456 | { |
457 | QTextCodecData *globalData = QTextCodecData::instance(); |
458 | if (!globalData) |
459 | return; |
460 | |
461 | globalData->codecForLocale.testAndSetRelaxed(expectedValue: this, newValue: nullptr); |
462 | |
463 | const TextCodecsMutexLocker locker; |
464 | |
465 | globalData->allCodecs.removeOne(t: this); |
466 | |
467 | auto it = globalData->codecCache.begin(); |
468 | |
469 | while (it != globalData->codecCache.end()) { |
470 | if (it.value() == this) |
471 | it = globalData->codecCache.erase(it); |
472 | else |
473 | ++it; |
474 | } |
475 | } |
476 | |
477 | /*! |
478 | \fn QTextCodec *QTextCodec::codecForName(const char *name) |
479 | |
480 | Searches all installed QTextCodec objects and returns the one |
481 | which best matches \a name; the match is case-insensitive. Returns |
482 | \nullptr if no codec matching the name \a name could be found. |
483 | */ |
484 | |
485 | /*! |
486 | \threadsafe |
487 | Searches all installed QTextCodec objects and returns the one |
488 | which best matches \a name; the match is case-insensitive. Returns |
489 | \nullptr if no codec matching the name \a name could be found. |
490 | */ |
491 | QTextCodec *QTextCodec::codecForName(const QByteArray &name) |
492 | { |
493 | if (name.isEmpty()) |
494 | return nullptr; |
495 | |
496 | const TextCodecsMutexLocker locker; |
497 | |
498 | QTextCodecData *globalData = QTextCodecData::instance(); |
499 | if (!globalData) |
500 | return nullptr; |
501 | setup(); |
502 | |
503 | #if !QT_CONFIG(icu) |
504 | QTextCodecCache *cache = &globalData->codecCache; |
505 | QTextCodec *codec; |
506 | codec = cache->value(name); |
507 | if (codec) |
508 | return codec; |
509 | |
510 | for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { |
511 | QTextCodec *cursor = *it; |
512 | if (qTextCodecNameMatch(cursor->name(), name)) { |
513 | if (cache) |
514 | cache->insert(name, cursor); |
515 | return cursor; |
516 | } |
517 | QList<QByteArray> aliases = cursor->aliases(); |
518 | for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) { |
519 | if (qTextCodecNameMatch(*ait, name)) { |
520 | cache->insert(name, cursor); |
521 | return cursor; |
522 | } |
523 | } |
524 | } |
525 | |
526 | return nullptr; |
527 | #else |
528 | return QIcuCodec::codecForNameUnlocked(name); |
529 | #endif |
530 | } |
531 | |
532 | |
533 | /*! |
534 | \threadsafe |
535 | Returns the QTextCodec which matches the |
536 | \l{QTextCodec::mibEnum()}{MIBenum} \a mib. |
537 | */ |
538 | QTextCodec* QTextCodec::codecForMib(int mib) |
539 | { |
540 | const TextCodecsMutexLocker locker; |
541 | |
542 | QTextCodecData *globalData = QTextCodecData::instance(); |
543 | if (!globalData) |
544 | return nullptr; |
545 | if (globalData->allCodecs.isEmpty()) |
546 | setup(); |
547 | |
548 | QByteArray key = "MIB: " + QByteArray::number(mib); |
549 | |
550 | QTextCodecCache *cache = &globalData->codecCache; |
551 | QTextCodec *codec; |
552 | if (cache) { |
553 | codec = cache->value(key); |
554 | if (codec) |
555 | return codec; |
556 | } |
557 | |
558 | for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { |
559 | QTextCodec *cursor = *it; |
560 | if (cursor->mibEnum() == mib) { |
561 | if (cache) |
562 | cache->insert(key, value: cursor); |
563 | return cursor; |
564 | } |
565 | } |
566 | |
567 | #if QT_CONFIG(icu) |
568 | return QIcuCodec::codecForMibUnlocked(mib); |
569 | #else |
570 | return nullptr; |
571 | #endif |
572 | } |
573 | |
574 | /*! |
575 | \threadsafe |
576 | Returns the list of all available codecs, by name. Call |
577 | QTextCodec::codecForName() to obtain the QTextCodec for the name. |
578 | |
579 | The list may contain many mentions of the same codec |
580 | if the codec has aliases. |
581 | |
582 | \sa availableMibs(), name(), aliases() |
583 | */ |
584 | QList<QByteArray> QTextCodec::availableCodecs() |
585 | { |
586 | const TextCodecsMutexLocker locker; |
587 | |
588 | QTextCodecData *globalData = QTextCodecData::instance(); |
589 | if (globalData->allCodecs.isEmpty()) |
590 | setup(); |
591 | |
592 | QList<QByteArray> codecs; |
593 | |
594 | for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { |
595 | codecs += (*it)->name(); |
596 | codecs += (*it)->aliases(); |
597 | } |
598 | |
599 | #if QT_CONFIG(icu) |
600 | codecs += QIcuCodec::availableCodecs(); |
601 | #endif |
602 | |
603 | return codecs; |
604 | } |
605 | |
606 | /*! |
607 | \threadsafe |
608 | Returns the list of MIBs for all available codecs. Call |
609 | QTextCodec::codecForMib() to obtain the QTextCodec for the MIB. |
610 | |
611 | \sa availableCodecs(), mibEnum() |
612 | */ |
613 | QList<int> QTextCodec::availableMibs() |
614 | { |
615 | #if QT_CONFIG(icu) |
616 | return QIcuCodec::availableMibs(); |
617 | #else |
618 | const TextCodecsMutexLocker locker; |
619 | |
620 | QTextCodecData *globalData = QTextCodecData::instance(); |
621 | if (globalData->allCodecs.isEmpty()) |
622 | setup(); |
623 | |
624 | QList<int> codecs; |
625 | |
626 | for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) |
627 | codecs += (*it)->mibEnum(); |
628 | |
629 | return codecs; |
630 | #endif |
631 | } |
632 | |
633 | /*! |
634 | \nonreentrant |
635 | |
636 | Set the codec to \a c; this will be returned by |
637 | codecForLocale(). If \a c is \nullptr, the codec is reset to |
638 | the default. |
639 | |
640 | This might be needed for some applications that want to use their |
641 | own mechanism for setting the locale. |
642 | |
643 | \sa codecForLocale() |
644 | */ |
645 | void QTextCodec::setCodecForLocale(QTextCodec *c) |
646 | { |
647 | QTextCodecData::instance()->codecForLocale.storeRelease(newValue: c); |
648 | } |
649 | |
650 | /*! |
651 | \threadsafe |
652 | Returns a pointer to the codec most suitable for this locale. |
653 | |
654 | The codec will be retrieved from ICU where that backend is in use, otherwise |
655 | it may be obtained from an OS-specific API. In the latter case, the codec's |
656 | name may be "System". |
657 | */ |
658 | |
659 | QTextCodec* QTextCodec::codecForLocale() |
660 | { |
661 | QTextCodecData *globalData = QTextCodecData::instance(); |
662 | if (!globalData) |
663 | return nullptr; |
664 | |
665 | QTextCodec *codec = globalData->codecForLocale.loadAcquire(); |
666 | if (!codec) { |
667 | #if QT_CONFIG(icu) |
668 | const TextCodecsMutexLocker locker; |
669 | codec = QIcuCodec::defaultCodecUnlocked(); |
670 | #else |
671 | // setupLocaleMapper locks as necessary |
672 | codec = setupLocaleMapper(); |
673 | #endif |
674 | } |
675 | |
676 | return codec; |
677 | } |
678 | |
679 | |
680 | /*! |
681 | \fn QByteArray QTextCodec::name() const |
682 | |
683 | QTextCodec subclasses must reimplement this function. It returns |
684 | the name of the encoding supported by the subclass. |
685 | |
686 | If the codec is registered as a character set in the |
687 | \l{IANA character-sets encoding file} this method should |
688 | return the preferred mime name for the codec if defined, |
689 | otherwise its name. |
690 | */ |
691 | |
692 | /*! |
693 | \fn int QTextCodec::mibEnum() const |
694 | |
695 | Subclasses of QTextCodec must reimplement this function. It |
696 | returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file} |
697 | for more information). It is important that each QTextCodec |
698 | subclass returns the correct unique value for this function. |
699 | */ |
700 | |
701 | /*! |
702 | Subclasses can return a number of aliases for the codec in question. |
703 | |
704 | Standard aliases for codecs can be found in the |
705 | \l{IANA character-sets encoding file}. |
706 | */ |
707 | QList<QByteArray> QTextCodec::aliases() const |
708 | { |
709 | return QList<QByteArray>(); |
710 | } |
711 | |
712 | /*! |
713 | \fn QString QTextCodec::convertToUnicode(const char *chars, int len, |
714 | ConverterState *state) const |
715 | |
716 | QTextCodec subclasses must reimplement this function. |
717 | |
718 | Converts the first \a len characters of \a chars from the |
719 | encoding of the subclass to Unicode, and returns the result in a |
720 | QString. |
721 | |
722 | \a state can be \nullptr, in which case the conversion is stateless and |
723 | default conversion rules should be used. If \a state is not \nullptr, the |
724 | codec should save the state after the conversion in \a state, and |
725 | adjust the \c remainingChars and \c invalidChars members of the struct. |
726 | */ |
727 | |
728 | /*! |
729 | \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number, |
730 | ConverterState *state) const |
731 | |
732 | QTextCodec subclasses must reimplement this function. |
733 | |
734 | Converts the first \a number of characters from the \a input array |
735 | from Unicode to the encoding of the subclass, and returns the result |
736 | in a QByteArray. |
737 | |
738 | \a state can be \nullptr in which case the conversion is stateless and |
739 | default conversion rules should be used. If \a state is not \nullptr, the |
740 | codec should save the state after the conversion in \a state, and |
741 | adjust the \c remainingChars and \c invalidChars members of the struct. |
742 | */ |
743 | |
744 | /*! |
745 | Creates a QTextDecoder with a specified \a flags to decode chunks |
746 | of \c{char *} data to create chunks of Unicode data. |
747 | |
748 | The caller is responsible for deleting the returned object. |
749 | |
750 | \since 4.7 |
751 | */ |
752 | QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const |
753 | { |
754 | return new QTextDecoder(this, flags); |
755 | } |
756 | |
757 | /*! |
758 | Creates a QTextEncoder with a specified \a flags to encode chunks |
759 | of Unicode data as \c{char *} data. |
760 | |
761 | The caller is responsible for deleting the returned object. |
762 | |
763 | \since 4.7 |
764 | */ |
765 | QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const |
766 | { |
767 | return new QTextEncoder(this, flags); |
768 | } |
769 | |
770 | /*! |
771 | \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number, |
772 | ConverterState *state) const |
773 | |
774 | Converts the first \a number of characters from the \a input array |
775 | from Unicode to the encoding of this codec, and returns the result |
776 | in a QByteArray. |
777 | |
778 | The \a state of the convertor used is updated. |
779 | */ |
780 | |
781 | /*! |
782 | Converts \a str from Unicode to the encoding of this codec, and |
783 | returns the result in a QByteArray. |
784 | */ |
785 | QByteArray QTextCodec::fromUnicode(const QString& str) const |
786 | { |
787 | ConverterState state = DefaultConversion | Flag::Stateless; |
788 | return convertFromUnicode(in: str.constData(), length: str.size(), state: &state); |
789 | } |
790 | |
791 | /*! |
792 | \overload |
793 | \since 5.10 |
794 | |
795 | Converts \a str from Unicode to the encoding of this codec, and |
796 | returns the result in a QByteArray. |
797 | */ |
798 | QByteArray QTextCodec::fromUnicode(QStringView str) const |
799 | { |
800 | ConverterState state = DefaultConversion | Flag::Stateless; |
801 | return convertFromUnicode(in: str.data(), length: str.size(), state: &state); |
802 | } |
803 | |
804 | /*! |
805 | \fn QString QTextCodec::toUnicode(const char *input, int size, |
806 | ConverterState *state) const |
807 | |
808 | Converts the first \a size characters from the \a input from the |
809 | encoding of this codec to Unicode, and returns the result in a |
810 | QString. |
811 | |
812 | The \a state of the convertor used is updated. |
813 | */ |
814 | |
815 | /*! |
816 | Converts \a a from the encoding of this codec to Unicode, and |
817 | returns the result in a QString. |
818 | */ |
819 | QString QTextCodec::toUnicode(const QByteArray& a) const |
820 | { |
821 | ConverterState state = DefaultConversion | Flag::Stateless; |
822 | return convertToUnicode(in: a.constData(), length: a.size(), state: &state); |
823 | } |
824 | |
825 | /*! |
826 | Returns \c true if the Unicode character \a ch can be fully encoded |
827 | with this codec; otherwise returns \c false. |
828 | */ |
829 | bool QTextCodec::canEncode(QChar ch) const |
830 | { |
831 | ConverterState state; |
832 | state.flags = ConvertInvalidToNull; |
833 | convertFromUnicode(in: &ch, length: 1, state: &state); |
834 | return (state.invalidChars == 0); |
835 | } |
836 | |
837 | /*! |
838 | \overload |
839 | |
840 | \a s contains the string being tested for encode-ability. |
841 | */ |
842 | bool QTextCodec::canEncode(const QString& s) const |
843 | { |
844 | ConverterState state; |
845 | state.flags = ConvertInvalidToNull; |
846 | convertFromUnicode(in: s.constData(), length: s.size(), state: &state); |
847 | return (state.invalidChars == 0); |
848 | } |
849 | |
850 | /*! |
851 | \overload |
852 | \since 5.10 |
853 | |
854 | Returns \c true if the Unicode string \a s can be fully encoded |
855 | with this codec; otherwise returns \c false. |
856 | */ |
857 | bool QTextCodec::canEncode(QStringView s) const |
858 | { |
859 | ConverterState state; |
860 | state.flags = ConvertInvalidToNull; |
861 | convertFromUnicode(in: s.data(), length: s.size(), state: &state); |
862 | return !state.invalidChars; |
863 | } |
864 | /*! |
865 | \overload |
866 | |
867 | \a chars contains the source characters. |
868 | */ |
869 | QString QTextCodec::toUnicode(const char *chars) const |
870 | { |
871 | const auto len = int(qstrlen(str: chars)); |
872 | return convertToUnicode(in: chars, length: len, state: nullptr); |
873 | } |
874 | |
875 | |
876 | /*! |
877 | \class QTextEncoder |
878 | \inmodule QtCore5Compat |
879 | \brief The QTextEncoder class provides a state-based encoder. |
880 | \reentrant |
881 | \ingroup i18n |
882 | |
883 | A text encoder converts text from Unicode into an encoded text format |
884 | using a specific codec. |
885 | |
886 | The encoder converts Unicode into another format, remembering any |
887 | state that is required between calls. |
888 | |
889 | \sa QTextCodec::makeEncoder(), QTextDecoder |
890 | */ |
891 | |
892 | /*! |
893 | \fn QTextEncoder::QTextEncoder(const QTextCodec *codec) |
894 | |
895 | Constructs a text encoder for the given \a codec. |
896 | */ |
897 | |
898 | /*! |
899 | Constructs a text encoder for the given \a codec and conversion \a flags. |
900 | |
901 | \since 4.7 |
902 | */ |
903 | QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags) |
904 | : c(codec), state() |
905 | { |
906 | state.flags = flags; |
907 | } |
908 | |
909 | /*! |
910 | Destroys the encoder. |
911 | */ |
912 | QTextEncoder::~QTextEncoder() |
913 | { |
914 | } |
915 | |
916 | /*! |
917 | \internal |
918 | \since 4.5 |
919 | Determines whether the encoder encountered a failure while encoding the input. If |
920 | an error was encountered, the produced result is undefined, and gets converted according |
921 | to the conversion flags. |
922 | */ |
923 | bool QTextEncoder::hasFailure() const |
924 | { |
925 | return state.invalidChars != 0; |
926 | } |
927 | |
928 | /*! |
929 | Converts the Unicode string \a str into an encoded QByteArray. |
930 | */ |
931 | QByteArray QTextEncoder::fromUnicode(const QString& str) |
932 | { |
933 | return c->fromUnicode(in: str.constData(), length: str.size(), state: &state); |
934 | } |
935 | |
936 | /*! |
937 | \overload |
938 | \since 5.10 |
939 | Converts the Unicode string \a str into an encoded QByteArray. |
940 | */ |
941 | QByteArray QTextEncoder::fromUnicode(QStringView str) |
942 | { |
943 | return c->fromUnicode(in: str.data(), length: str.size(), state: &state); |
944 | } |
945 | |
946 | /*! |
947 | \overload |
948 | |
949 | Converts \a len characters (not bytes) from \a uc, and returns the |
950 | result in a QByteArray. |
951 | */ |
952 | QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len) |
953 | { |
954 | return c->fromUnicode(in: uc, length: len, state: &state); |
955 | } |
956 | |
957 | /*! |
958 | \class QTextDecoder |
959 | \inmodule QtCore5Compat |
960 | \brief The QTextDecoder class provides a state-based decoder. |
961 | \reentrant |
962 | \ingroup i18n |
963 | |
964 | A text decoder converts text from an encoded text format into Unicode |
965 | using a specific codec. |
966 | |
967 | The decoder converts text in this format into Unicode, remembering any |
968 | state that is required between calls. |
969 | |
970 | \sa QTextCodec::makeDecoder(), QTextEncoder |
971 | */ |
972 | |
973 | /*! |
974 | \fn QTextDecoder::QTextDecoder(const QTextCodec *codec) |
975 | |
976 | Constructs a text decoder for the given \a codec. |
977 | */ |
978 | |
979 | /*! |
980 | Constructs a text decoder for the given \a codec and conversion \a flags. |
981 | |
982 | \since 4.7 |
983 | */ |
984 | |
985 | QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags) |
986 | : c(codec), state() |
987 | { |
988 | state.flags = flags; |
989 | } |
990 | |
991 | /*! |
992 | Destroys the decoder. |
993 | */ |
994 | QTextDecoder::~QTextDecoder() |
995 | { |
996 | } |
997 | |
998 | /*! |
999 | \fn QString QTextDecoder::toUnicode(const char *chars, int len) |
1000 | |
1001 | Converts the first \a len bytes in \a chars to Unicode, returning |
1002 | the result. |
1003 | |
1004 | If not all characters are used (e.g. if only part of a multi-byte |
1005 | encoding is at the end of the characters), the decoder remembers |
1006 | enough state to continue with the next call to this function. |
1007 | */ |
1008 | QString QTextDecoder::toUnicode(const char *chars, int len) |
1009 | { |
1010 | return c->toUnicode(in: chars, length: len, state: &state); |
1011 | } |
1012 | |
1013 | /*! \overload |
1014 | |
1015 | The converted string is returned in \a target. |
1016 | */ |
1017 | void QTextDecoder::toUnicode(QString *target, const char *chars, int len) |
1018 | { |
1019 | Q_ASSERT(target); |
1020 | switch (c->mibEnum()) { |
1021 | case 106: // utf8 |
1022 | static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state); |
1023 | break; |
1024 | case 4: // latin1 |
1025 | target->resize(size: len); |
1026 | qt_from_latin1(dst: (char16_t*)target->data(), str: chars, size: len); |
1027 | break; |
1028 | default: |
1029 | *target = c->toUnicode(in: chars, length: len, state: &state); |
1030 | } |
1031 | } |
1032 | |
1033 | |
1034 | /*! |
1035 | \overload |
1036 | |
1037 | Converts the bytes in the byte array specified by \a ba to Unicode |
1038 | and returns the result. |
1039 | */ |
1040 | QString QTextDecoder::toUnicode(const QByteArray &ba) |
1041 | { |
1042 | return c->toUnicode(in: ba.constData(), length: ba.size(), state: &state); |
1043 | } |
1044 | |
1045 | /*! |
1046 | \since 4.4 |
1047 | |
1048 | Tries to detect the encoding of the provided snippet of HTML in |
1049 | the given byte array, \a ba, by checking the BOM (Byte Order Mark) |
1050 | and the content-type meta header and returns a QTextCodec instance |
1051 | that is capable of decoding the html to unicode. If the codec |
1052 | cannot be detected from the content provided, \a defaultCodec is |
1053 | returned. |
1054 | |
1055 | \sa codecForUtfText() |
1056 | */ |
1057 | QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec) |
1058 | { |
1059 | // determine charset |
1060 | QTextCodec *c = QTextCodec::codecForUtfText(ba, defaultCodec: nullptr); |
1061 | if (!c) { |
1062 | static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "meta " ); |
1063 | QByteArray = ba.left(n: 1024).toLower(); |
1064 | qsizetype pos = matcher.indexIn(haystack: header); |
1065 | if (pos != -1) { |
1066 | static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "charset=" ); |
1067 | pos = matcher.indexIn(haystack: header, from: pos); |
1068 | if (pos != -1) { |
1069 | pos += qstrlen(str: "charset=" ); |
1070 | |
1071 | qsizetype pos2 = pos; |
1072 | // The attribute can be closed with either """, "'", ">" or "/", |
1073 | // none of which are valid charset characters. |
1074 | while (++pos2 < header.size()) { |
1075 | char ch = header.at(i: pos2); |
1076 | if (ch == '\"' || ch == '\'' || ch == '>') { |
1077 | QByteArray name = header.mid(index: pos, len: pos2 - pos); |
1078 | if (name == "unicode" ) // QTBUG-41998, ICU will return UTF-16. |
1079 | name = QByteArrayLiteral("UTF-8" ); |
1080 | c = QTextCodec::codecForName(name); |
1081 | return c ? c : defaultCodec; |
1082 | } |
1083 | } |
1084 | } |
1085 | } |
1086 | } |
1087 | if (!c) |
1088 | c = defaultCodec; |
1089 | |
1090 | return c; |
1091 | } |
1092 | |
1093 | /*! |
1094 | \overload |
1095 | |
1096 | Tries to detect the encoding of the provided snippet of HTML in |
1097 | the given byte array, \a ba, by checking the BOM (Byte Order Mark) |
1098 | and the content-type meta header and returns a QTextCodec instance |
1099 | that is capable of decoding the html to unicode. If the codec cannot |
1100 | be detected, this overload returns a Latin-1 QTextCodec. |
1101 | */ |
1102 | QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba) |
1103 | { |
1104 | return codecForHtml(ba, defaultCodec: QTextCodec::codecForName(name: "ISO-8859-1" )); |
1105 | } |
1106 | |
1107 | /*! |
1108 | \since 4.6 |
1109 | |
1110 | Tries to detect the encoding of the provided snippet \a ba by |
1111 | using the BOM (Byte Order Mark) and returns a QTextCodec instance |
1112 | that is capable of decoding the text to unicode. This function can |
1113 | detect one of the following codecs: |
1114 | |
1115 | \list |
1116 | \li UTF-32 Little Endian |
1117 | \li UTF-32 Big Endian |
1118 | \li UTF-16 Little Endian |
1119 | \li UTF-16 Big Endian |
1120 | \li UTF-8 |
1121 | \endlist |
1122 | |
1123 | If the codec cannot be detected from the content provided, \a defaultCodec |
1124 | is returned. |
1125 | |
1126 | \sa codecForHtml() |
1127 | */ |
1128 | QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec) |
1129 | { |
1130 | const int arraySize = ba.size(); |
1131 | const uchar *buf = reinterpret_cast<const uchar *>(ba.constData()); |
1132 | const uint bom = 0xfeff; |
1133 | |
1134 | if (arraySize > 3) { |
1135 | uint uc = qFromUnaligned<uint>(src: buf); |
1136 | if (uc == qToBigEndian(source: bom)) |
1137 | return QTextCodec::codecForMib(mib: 1018); // utf-32 be |
1138 | else if (uc == qToLittleEndian(source: bom)) |
1139 | return QTextCodec::codecForMib(mib: 1019); // utf-32 le |
1140 | } |
1141 | |
1142 | if (arraySize < 2) |
1143 | return defaultCodec; |
1144 | |
1145 | ushort uc = qFromUnaligned<ushort>(src: buf); |
1146 | if (uc == qToBigEndian(source: ushort(bom))) |
1147 | return QTextCodec::codecForMib(mib: 1013); // utf16 be |
1148 | else if (uc == qToLittleEndian(source: ushort(bom))) |
1149 | return QTextCodec::codecForMib(mib: 1014); // utf16 le |
1150 | |
1151 | if (arraySize < 3) |
1152 | return defaultCodec; |
1153 | |
1154 | static const char utf8bom[] = "\xef\xbb\xbf" ; |
1155 | if (memcmp(s1: buf, s2: utf8bom, n: sizeof(utf8bom) - 1) == 0) |
1156 | return QTextCodec::codecForMib(mib: 106); // utf-8 |
1157 | |
1158 | return defaultCodec; |
1159 | } |
1160 | |
1161 | /*! |
1162 | \overload |
1163 | |
1164 | Tries to detect the encoding of the provided snippet \a ba by |
1165 | using the BOM (Byte Order Mark) and returns a QTextCodec instance |
1166 | that is capable of decoding the text to unicode. This function can |
1167 | detect one of the following codecs: |
1168 | |
1169 | \list |
1170 | \li UTF-32 Little Endian |
1171 | \li UTF-32 Big Endian |
1172 | \li UTF-16 Little Endian |
1173 | \li UTF-16 Big Endian |
1174 | \li UTF-8 |
1175 | \endlist |
1176 | |
1177 | If the codec cannot be detected from the content provided, this overload |
1178 | returns a Latin-1 QTextCodec. |
1179 | |
1180 | \sa codecForHtml() |
1181 | */ |
1182 | QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba) |
1183 | { |
1184 | return codecForUtfText(ba, defaultCodec: QTextCodec::codecForMib(/*Latin 1*/ mib: 4)); |
1185 | } |
1186 | |
1187 | /*! |
1188 | \fn QTextCodec *QTextCodec::codecForTr () |
1189 | \deprecated |
1190 | |
1191 | Returns the codec used by QObject::tr() on its argument. If this |
1192 | function returns \nullptr (the default), tr() assumes Latin-1. |
1193 | */ |
1194 | |
1195 | /*! |
1196 | \internal |
1197 | \since 4.3 |
1198 | Determines whether the decoder encountered a failure while decoding the |
1199 | input. If an error was encountered, the produced result is undefined, and |
1200 | gets converted as according to the conversion flags. |
1201 | */ |
1202 | bool QTextDecoder::hasFailure() const |
1203 | { |
1204 | return state.invalidChars != 0; |
1205 | } |
1206 | |
1207 | /*! |
1208 | \internal |
1209 | \since 5.12 |
1210 | |
1211 | Determines whether the decoder needs more bytes to continue decoding. That |
1212 | is, this signifies that the input string ended in the middle of a |
1213 | multi-byte sequence. Note that it's possible some codecs do not report this. |
1214 | */ |
1215 | bool QTextDecoder::needsMoreData() const |
1216 | { |
1217 | return state.remainingChars; |
1218 | } |
1219 | |
1220 | /*! |
1221 | \fn QTextCodec * Qt::codecForHtml(const QByteArray &ba) |
1222 | \internal |
1223 | |
1224 | This function is defined in the \c <QTextCodec> header file. |
1225 | */ |
1226 | QTextCodec *Qt::codecForHtml(const QByteArray &ba) |
1227 | { |
1228 | return QTextCodec::codecForHtml(ba); |
1229 | } |
1230 | |
1231 | QT_END_NAMESPACE |
1232 | |