1 | // Copyright (C) 2018 The Qt Company Ltd. |
2 | // Copyright (C) 2018 Intel Corporation. |
3 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
4 | |
5 | #include "qplatformdefs.h" |
6 | |
7 | #include "qtextcodec.h" |
8 | #include "qtextcodec_p.h" |
9 | |
10 | #include "qbytearraymatcher.h" |
11 | #include "qendian.h" |
12 | #include "qfile.h" |
13 | #include "qlist.h" |
14 | #include <private/qlocking_p.h> |
15 | #include "qstringlist.h" |
16 | #include "qvarlengtharray.h" |
17 | |
18 | #include <private/qcoreapplication_p.h> |
19 | |
20 | #include "qutfcodec_p.h" |
21 | #include "qlatincodec_p.h" |
22 | |
23 | #if QT_CONFIG(codecs) |
24 | # include "qtsciicodec_p.h" |
25 | # include "qisciicodec_p.h" |
26 | #endif |
27 | #if QT_CONFIG(icu) |
28 | #include "qicucodec_p.h" |
29 | #else |
30 | #if QT_CONFIG(iconv) |
31 | # include "qiconvcodec_p.h" |
32 | #endif |
33 | #ifdef Q_OS_WIN |
34 | # include "qwindowscodec_p.h" |
35 | #endif |
36 | # include "qsimplecodec_p.h" |
37 | #if QT_CONFIG(big_codecs) && QT_CONFIG(textcodec) |
38 | # ifndef Q_OS_INTEGRITY |
39 | # include "qgb18030codec_p.h" |
40 | # include "qeucjpcodec_p.h" |
41 | # include "qjiscodec_p.h" |
42 | # include "qsjiscodec_p.h" |
43 | # include "qeuckrcodec_p.h" |
44 | # include "qbig5codec_p.h" |
45 | # endif // !Q_OS_INTEGRITY |
46 | #endif // big_codecs |
47 | |
48 | #endif // icu |
49 | |
50 | #include <mutex> |
51 | |
52 | #include <stdlib.h> |
53 | #include <ctype.h> |
54 | #include <locale.h> |
55 | #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID) |
56 | # include <langinfo.h> |
57 | #endif |
58 | |
59 | QT_BEGIN_NAMESPACE |
60 | |
61 | // in qstring.cpp: |
62 | void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept; |
63 | |
64 | typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt; |
65 | typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt; |
66 | |
67 | Q_GLOBAL_STATIC(QRecursiveMutex, textCodecsMutex); |
68 | |
69 | Q_GLOBAL_STATIC(QTextCodecData, textCodecData) |
70 | |
71 | QTextCodecData::QTextCodecData() |
72 | : codecForLocale(nullptr) |
73 | { |
74 | } |
75 | |
76 | QTextCodecData::~QTextCodecData() |
77 | { |
78 | codecForLocale.storeRelease(newValue: nullptr); |
79 | QList<QTextCodec *> tmp = allCodecs; |
80 | allCodecs.clear(); |
81 | codecCache.clear(); |
82 | for (QList<QTextCodec *>::const_iterator it = tmp.constBegin(); it != tmp.constEnd(); ++it) |
83 | delete *it; |
84 | } |
85 | |
86 | QTextCodecData *QTextCodecData::instance() |
87 | { |
88 | return textCodecData(); |
89 | } |
90 | |
// Scope guard for the global codec mutex: constructing an instance locks
// textCodecsMutex, and the lock is held by the `lock` member for the lifetime
// of the object.
class TextCodecsMutexLocker
{
    // The concrete lock type is whatever qt_unique_lock() returns for a
    // QRecursiveMutex.
    using Lock = decltype(qt_unique_lock(std::declval<QRecursiveMutex&>()));
    // ### FIXME: this is used when textCodecsMutex already == nullptr
    const Lock lock = qt_unique_lock(textCodecsMutex());
public:
    // Deliberately user-provided rather than defaulted.
    TextCodecsMutexLocker() {} // required d/t an ICC 19 bug
};
99 | |
100 | #if !QT_CONFIG(icu) |
// Locale-independent ASCII lower-casing: 'A'..'Z' map to 'a'..'z', every other
// byte is returned unchanged.
static char qtolower(char c)
{
    const bool isUpper = (c >= 'A' && c <= 'Z');
    return isUpper ? static_cast<char>(c + 0x20) : c;
}
// Locale-independent test for ASCII digits and letters.
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;

    // Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z'.
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
105 | |
106 | bool qTextCodecNameMatch(const char *n, const char *h) |
107 | { |
108 | if (qstricmp(n, h) == 0) |
109 | return true; |
110 | |
111 | // if the letters and numbers are the same, we have a match |
112 | while (*n != '\0') { |
113 | if (qisalnum(*n)) { |
114 | for (;;) { |
115 | if (*h == '\0') |
116 | return false; |
117 | if (qisalnum(*h)) |
118 | break; |
119 | ++h; |
120 | } |
121 | if (qtolower(*n) != qtolower(*h)) |
122 | return false; |
123 | ++h; |
124 | } |
125 | ++n; |
126 | } |
127 | while (*h && !qisalnum(*h)) |
128 | ++h; |
129 | return (*h == '\0'); |
130 | } |
131 | |
132 | |
133 | #if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8) |
134 | static QTextCodec *checkForCodec(const QByteArray &name) { |
135 | QTextCodec *c = QTextCodec::codecForName(name); |
136 | if (!c) { |
137 | const int index = name.indexOf('@'); |
138 | if (index != -1) { |
139 | c = QTextCodec::codecForName(name.left(index)); |
140 | } |
141 | } |
142 | return c; |
143 | } |
144 | #endif |
145 | |
146 | static void setup(); |
147 | |
148 | // \threadsafe |
149 | // this returns the codec the method sets up as locale codec to |
150 | // avoid a race condition in codecForLocale() when |
151 | // setCodecForLocale(nullptr) is called at the same time. |
152 | static QTextCodec *setupLocaleMapper() |
153 | { |
154 | QTextCodecData *globalData = QTextCodecData::instance(); |
155 | |
156 | QTextCodec *locale = nullptr; |
157 | |
158 | { |
159 | const TextCodecsMutexLocker locker; |
160 | if (globalData->allCodecs.isEmpty()) |
161 | setup(); |
162 | } |
163 | |
164 | QCoreApplicationPrivate::initLocale(); |
165 | |
166 | #if defined(QT_LOCALE_IS_UTF8) |
167 | locale = QTextCodec::codecForName("UTF-8" ); |
168 | #elif defined(Q_OS_WIN) |
169 | locale = QTextCodec::codecForName("System" ); |
170 | #else |
171 | |
172 | // First try getting the codecs name from nl_langinfo and see |
173 | // if we have a builtin codec for it. |
174 | // Only fall back to using iconv if we can't find a builtin codec |
175 | // This is because the builtin utf8 codec is around 5 times faster |
176 | // then the using QIconvCodec |
177 | |
178 | #if defined (_XOPEN_UNIX) |
179 | char *charset = nl_langinfo(CODESET); |
180 | if (charset) |
181 | locale = QTextCodec::codecForName(charset); |
182 | #endif |
183 | #if QT_CONFIG(iconv) |
184 | if (!locale) { |
185 | // no builtin codec for the locale found, let's try using iconv |
186 | (void) new QIconvCodec(); |
187 | locale = QTextCodec::codecForName("System" ); |
188 | } |
189 | #endif |
190 | |
191 | if (!locale) { |
192 | // Very poorly defined and followed standards causes lots of |
193 | // code to try to get all the cases... This logic is |
194 | // duplicated in QIconvCodec, so if you change it here, change |
195 | // it there too. |
196 | |
197 | // Try to determine locale codeset from locale name assigned to |
198 | // LC_CTYPE category. |
199 | |
200 | // First part is getting that locale name. First try setlocale() which |
201 | // definitely knows it, but since we cannot fully trust it, get ready |
202 | // to fall back to environment variables. |
203 | const QByteArray ctype = setlocale(LC_CTYPE, nullptr); |
204 | |
205 | // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG |
206 | // environment variables. |
207 | QByteArray lang = qgetenv("LC_ALL" ); |
208 | if (lang.isEmpty() || lang == "C" ) { |
209 | lang = qgetenv("LC_CTYPE" ); |
210 | } |
211 | if (lang.isEmpty() || lang == "C" ) { |
212 | lang = qgetenv("LANG" ); |
213 | } |
214 | |
215 | // Now try these in order: |
216 | // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) |
217 | // 2. CODESET from lang if it contains a .CODESET part |
218 | // 3. ctype (maybe the locale is named "ISO-8859-1" or something) |
219 | // 4. locale (ditto) |
220 | // 5. check for "@euro" |
221 | // 6. guess locale from ctype unless ctype is "C" |
222 | // 7. guess locale from lang |
223 | |
224 | // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) |
225 | int indexOfDot = ctype.indexOf('.'); |
226 | if (indexOfDot != -1) |
227 | locale = checkForCodec( ctype.mid(indexOfDot + 1) ); |
228 | |
229 | // 2. CODESET from lang if it contains a .CODESET part |
230 | if (!locale) { |
231 | indexOfDot = lang.indexOf('.'); |
232 | if (indexOfDot != -1) |
233 | locale = checkForCodec( lang.mid(indexOfDot + 1) ); |
234 | } |
235 | |
236 | // 3. ctype (maybe the locale is named "ISO-8859-1" or something) |
237 | if (!locale && !ctype.isEmpty() && ctype != "C" ) |
238 | locale = checkForCodec(ctype); |
239 | |
240 | // 4. locale (ditto) |
241 | if (!locale && !lang.isEmpty()) |
242 | locale = checkForCodec(lang); |
243 | |
244 | // 5. "@euro" |
245 | if ((!locale && ctype.contains("@euro" )) || lang.contains("@euro" )) |
246 | locale = checkForCodec("ISO 8859-15" ); |
247 | } |
248 | |
249 | #endif |
250 | // If everything failed, we default to 8859-1 |
251 | if (!locale) |
252 | locale = QTextCodec::codecForName("ISO 8859-1" ); |
253 | globalData->codecForLocale.storeRelease(locale); |
254 | return locale; |
255 | } |
256 | |
257 | |
// Registers the built-in codecs exactly once.
// textCodecsMutex needs to be locked to enter this function.
//
// Every codec object registers itself from the QTextCodec constructor, which
// prepends it to the global codec list; codecs created later in this function
// therefore come first in that list, and the creation order below must not be
// changed casually.
static void setup()
{
    // The flag is set before any codec is created because the QTextCodec
    // constructor calls setup() again while the codec list is still empty.
    static bool initialized = false;
    if (initialized)
        return;
    initialized = true;

#if QT_CONFIG(codecs)
    (void)new QTsciiCodec;
    // One codec object per index 0..8.
    for (int i = 0; i < 9; ++i)
        (void)new QIsciiCodec(i);
    for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
        (void)new QSimpleTextCodec(i);

#  if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY)
    (void)new QGb18030Codec;
    (void)new QGbkCodec;
    (void)new QGb2312Codec;
    (void)new QEucJpCodec;
    (void)new QJisCodec;
    (void)new QSjisCodec;
    (void)new QEucKrCodec;
    (void)new QCP949Codec;
    (void)new QBig5Codec;
    (void)new QBig5hkscsCodec;
#  endif // big_codecs && !Q_OS_INTEGRITY
#if QT_CONFIG(iconv)
    (void) new QIconvCodec;
#endif
#if defined(Q_OS_WIN32)
    (void) new QWindowsLocalCodec;
#endif // Q_OS_WIN32
#endif // codecs

    // The Unicode and Latin codecs are created regardless of the "codecs"
    // feature. Being created last, they end up at the front of the list.
    (void)new QUtf16Codec;
    (void)new QUtf16BECodec;
    (void)new QUtf16LECodec;
    (void)new QUtf32Codec;
    (void)new QUtf32BECodec;
    (void)new QUtf32LECodec;
    (void)new QLatin15Codec;
    (void)new QLatin1Codec;
    (void)new QUtf8Codec;
}
303 | #else |
// ICU build: this file registers no built-in codecs, so setup() is a no-op
// that only exists to keep the callers free of #ifdefs.
static void setup()
{
}
305 | #endif // icu |
306 | |
307 | /*! |
308 | \typealias QTextCodec::ConversionFlags |
309 | |
310 | \value DefaultConversion No flag is set. |
311 | \value ConvertInvalidToNull If this flag is set, each invalid input |
312 | character is output as a null character. |
313 | \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any. |
314 | |
315 | \omitvalue FreeFunction |
316 | */ |
317 | |
318 | /*! |
319 | \typealias QTextCodec::ConverterState |
320 | */ |
321 | |
322 | /*! |
323 | \class QTextCodec |
324 | \inmodule QtCore5Compat |
325 | \brief The QTextCodec class provides conversions between text encodings. |
326 | \reentrant |
327 | \ingroup i18n |
328 | |
329 | Qt uses Unicode to store, draw and manipulate strings. In many |
330 | situations you may wish to deal with data that uses a different |
331 | encoding. For example, most Japanese documents are still stored |
332 | in Shift-JIS or ISO 2022-JP, while Russian users often have their |
333 | documents in KOI8-R or Windows-1251. |
334 | |
335 | Qt provides a set of QTextCodec classes to help with converting |
336 | non-Unicode formats to and from Unicode. You can also create your |
337 | own codec classes. |
338 | |
339 | The supported encodings are: |
340 | |
341 | \list |
342 | \li \l{Big5 Text Codec}{Big5} |
343 | \li \l{Big5-HKSCS Text Codec}{Big5-HKSCS} |
344 | \li CP949 |
345 | \li \l{EUC-JP Text Codec}{EUC-JP} |
346 | \li \l{EUC-KR Text Codec}{EUC-KR} |
347 | \li \l{GBK Text Codec}{GB18030} |
348 | \li HP-ROMAN8 |
349 | \li IBM 850 |
350 | \li IBM 866 |
351 | \li IBM 874 |
352 | \li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP} |
353 | \li ISO 8859-1 to 10 |
354 | \li ISO 8859-13 to 16 |
355 | \li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml |
356 | \li KOI8-R |
357 | \li KOI8-U |
358 | \li Macintosh |
359 | \li \l{Shift-JIS Text Codec}{Shift-JIS} |
360 | \li TIS-620 |
361 | \li \l{TSCII Text Codec}{TSCII} |
362 | \li UTF-8 |
363 | \li UTF-16 |
364 | \li UTF-16BE |
365 | \li UTF-16LE |
366 | \li UTF-32 |
367 | \li UTF-32BE |
368 | \li UTF-32LE |
369 | \li Windows-1250 to 1258 |
370 | \endlist |
371 | |
372 | If Qt is compiled with ICU support enabled, most codecs supported by |
373 | ICU will also be available to the application. |
374 | |
375 | \l {QTextCodec}s can be used as follows to convert some locally encoded |
376 | string to Unicode. Suppose you have some string encoded in Russian |
377 | KOI8-R encoding, and want to convert it to Unicode. The simple way |
378 | to do it is like this: |
379 | |
380 | \snippet code/src_corelib_codecs_qtextcodec.cpp 0 |
381 | |
382 | After this, \c string holds the text converted to Unicode. |
383 | Converting a string from Unicode to the local encoding is just as |
384 | easy: |
385 | |
386 | \snippet code/src_corelib_codecs_qtextcodec.cpp 1 |
387 | |
388 | Some care must be taken when trying to convert the data in chunks, |
389 | for example, when receiving it over a network. In such cases it is |
390 | possible that a multi-byte character will be split over two |
391 | chunks. At best this might result in the loss of a character and |
392 | at worst cause the entire conversion to fail. |
393 | |
394 | The approach to use in these situations is to create a QTextDecoder |
395 | object for the codec and use this QTextDecoder for the whole |
396 | decoding process, as shown below: |
397 | |
398 | \snippet code/src_corelib_codecs_qtextcodec.cpp 2 |
399 | |
400 | The QTextDecoder object maintains state between chunks and therefore |
401 | works correctly even if a multi-byte character is split between |
402 | chunks. |
403 | |
404 | \section1 Creating Your Own Codec Class |
405 | |
406 | Support for new text encodings can be added to Qt by creating |
407 | QTextCodec subclasses. |
408 | |
409 | The pure virtual functions describe the encoder to the system and |
410 | the coder is used as required in the different text file formats |
411 | supported by QTextStream, and under X11, for the locale-specific |
412 | character input and output. |
413 | |
414 | To add support for another encoding to Qt, make a subclass of |
415 | QTextCodec and implement the functions listed in the table below. |
416 | |
417 | \table |
418 | \header \li Function \li Description |
419 | |
420 | \row \li name() |
421 | \li Returns the official name for the encoding. If the |
422 | encoding is listed in the |
423 | \l{IANA character-sets encoding file}, the name |
424 | should be the preferred MIME name for the encoding. |
425 | |
426 | \row \li aliases() |
427 | \li Returns a list of alternative names for the encoding. |
428 | QTextCodec provides a default implementation that returns |
429 | an empty list. For example, "ISO-8859-1" has "latin1", |
430 | "CP819", "IBM819", and "iso-ir-100" as aliases. |
431 | |
432 | \row \li \l{QTextCodec::mibEnum()}{mibEnum()} |
433 | \li Return the MIB enum for the encoding if it is listed in |
434 | the \l{IANA character-sets encoding file}. |
435 | |
436 | \row \li convertToUnicode() |
437 | \li Converts an 8-bit character string to Unicode. |
438 | |
439 | \row \li convertFromUnicode() |
440 | \li Converts a Unicode string to an 8-bit character string. |
441 | \endtable |
442 | |
443 | \sa QTextStream, QTextDecoder, QTextEncoder |
444 | */ |
445 | |
446 | /*! |
447 | Constructs a QTextCodec, and gives it the highest precedence. The |
448 | QTextCodec should always be constructed on the heap (i.e. with \c |
449 | new). Qt takes ownership and will delete it when the application |
450 | terminates. |
451 | */ |
452 | QTextCodec::QTextCodec() |
453 | { |
454 | const TextCodecsMutexLocker locker; |
455 | |
456 | QTextCodecData *globalInstance = QTextCodecData::instance(); |
457 | if (globalInstance->allCodecs.isEmpty()) |
458 | setup(); |
459 | |
460 | globalInstance->allCodecs.prepend(t: this); |
461 | } |
462 | |
463 | |
464 | /*! |
465 | \nonreentrant |
466 | |
467 | Destroys the QTextCodec. Note that you should not delete codecs |
468 | yourself: once created they become Qt's responsibility. |
469 | */ |
470 | QTextCodec::~QTextCodec() |
471 | { |
472 | QTextCodecData *globalData = QTextCodecData::instance(); |
473 | if (!globalData) |
474 | return; |
475 | |
476 | globalData->codecForLocale.testAndSetRelaxed(expectedValue: this, newValue: nullptr); |
477 | |
478 | const TextCodecsMutexLocker locker; |
479 | |
480 | globalData->allCodecs.removeOne(t: this); |
481 | |
482 | auto it = globalData->codecCache.begin(); |
483 | |
484 | while (it != globalData->codecCache.end()) { |
485 | if (it.value() == this) |
486 | it = globalData->codecCache.erase(it); |
487 | else |
488 | ++it; |
489 | } |
490 | } |
491 | |
492 | /*! |
493 | \fn QTextCodec *QTextCodec::codecForName(const char *name) |
494 | |
495 | Searches all installed QTextCodec objects and returns the one |
496 | which best matches \a name; the match is case-insensitive. Returns |
497 | \nullptr if no codec matching the name \a name could be found. |
498 | */ |
499 | |
500 | /*! |
501 | \threadsafe |
502 | Searches all installed QTextCodec objects and returns the one |
503 | which best matches \a name; the match is case-insensitive. Returns |
504 | \nullptr if no codec matching the name \a name could be found. |
505 | */ |
506 | QTextCodec *QTextCodec::codecForName(const QByteArray &name) |
507 | { |
508 | if (name.isEmpty()) |
509 | return nullptr; |
510 | |
511 | const TextCodecsMutexLocker locker; |
512 | |
513 | QTextCodecData *globalData = QTextCodecData::instance(); |
514 | if (!globalData) |
515 | return nullptr; |
516 | setup(); |
517 | |
518 | #if !QT_CONFIG(icu) |
519 | QTextCodecCache *cache = &globalData->codecCache; |
520 | QTextCodec *codec; |
521 | codec = cache->value(name); |
522 | if (codec) |
523 | return codec; |
524 | |
525 | for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { |
526 | QTextCodec *cursor = *it; |
527 | if (qTextCodecNameMatch(cursor->name(), name)) { |
528 | if (cache) |
529 | cache->insert(name, cursor); |
530 | return cursor; |
531 | } |
532 | QList<QByteArray> aliases = cursor->aliases(); |
533 | for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) { |
534 | if (qTextCodecNameMatch(*ait, name)) { |
535 | cache->insert(name, cursor); |
536 | return cursor; |
537 | } |
538 | } |
539 | } |
540 | |
541 | return nullptr; |
542 | #else |
543 | return QIcuCodec::codecForNameUnlocked(name); |
544 | #endif |
545 | } |
546 | |
547 | |
548 | /*! |
549 | \threadsafe |
550 | Returns the QTextCodec which matches the |
551 | \l{QTextCodec::mibEnum()}{MIBenum} \a mib. |
552 | */ |
553 | QTextCodec* QTextCodec::codecForMib(int mib) |
554 | { |
555 | const TextCodecsMutexLocker locker; |
556 | |
557 | QTextCodecData *globalData = QTextCodecData::instance(); |
558 | if (!globalData) |
559 | return nullptr; |
560 | if (globalData->allCodecs.isEmpty()) |
561 | setup(); |
562 | |
563 | QByteArray key = "MIB: " + QByteArray::number(mib); |
564 | |
565 | QTextCodecCache *cache = &globalData->codecCache; |
566 | QTextCodec *codec; |
567 | if (cache) { |
568 | codec = cache->value(key); |
569 | if (codec) |
570 | return codec; |
571 | } |
572 | |
573 | for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { |
574 | QTextCodec *cursor = *it; |
575 | if (cursor->mibEnum() == mib) { |
576 | if (cache) |
577 | cache->insert(key, value: cursor); |
578 | return cursor; |
579 | } |
580 | } |
581 | |
582 | #if QT_CONFIG(icu) |
583 | return QIcuCodec::codecForMibUnlocked(mib); |
584 | #else |
585 | return nullptr; |
586 | #endif |
587 | } |
588 | |
589 | /*! |
590 | \threadsafe |
591 | Returns the list of all available codecs, by name. Call |
592 | QTextCodec::codecForName() to obtain the QTextCodec for the name. |
593 | |
594 | The list may contain many mentions of the same codec |
595 | if the codec has aliases. |
596 | |
597 | \sa availableMibs(), name(), aliases() |
598 | */ |
599 | QList<QByteArray> QTextCodec::availableCodecs() |
600 | { |
601 | const TextCodecsMutexLocker locker; |
602 | |
603 | QTextCodecData *globalData = QTextCodecData::instance(); |
604 | if (globalData->allCodecs.isEmpty()) |
605 | setup(); |
606 | |
607 | QList<QByteArray> codecs; |
608 | |
609 | for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { |
610 | codecs += (*it)->name(); |
611 | codecs += (*it)->aliases(); |
612 | } |
613 | |
614 | #if QT_CONFIG(icu) |
615 | codecs += QIcuCodec::availableCodecs(); |
616 | #endif |
617 | |
618 | return codecs; |
619 | } |
620 | |
621 | /*! |
622 | \threadsafe |
623 | Returns the list of MIBs for all available codecs. Call |
624 | QTextCodec::codecForMib() to obtain the QTextCodec for the MIB. |
625 | |
626 | \sa availableCodecs(), mibEnum() |
627 | */ |
628 | QList<int> QTextCodec::availableMibs() |
629 | { |
630 | #if QT_CONFIG(icu) |
631 | return QIcuCodec::availableMibs(); |
632 | #else |
633 | const TextCodecsMutexLocker locker; |
634 | |
635 | QTextCodecData *globalData = QTextCodecData::instance(); |
636 | if (globalData->allCodecs.isEmpty()) |
637 | setup(); |
638 | |
639 | QList<int> codecs; |
640 | |
641 | for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) |
642 | codecs += (*it)->mibEnum(); |
643 | |
644 | return codecs; |
645 | #endif |
646 | } |
647 | |
648 | /*! |
649 | \nonreentrant |
650 | |
651 | Set the codec to \a c; this will be returned by |
652 | codecForLocale(). If \a c is \nullptr, the codec is reset to |
653 | the default. |
654 | |
655 | This might be needed for some applications that want to use their |
656 | own mechanism for setting the locale. |
657 | |
658 | \sa codecForLocale() |
659 | */ |
660 | void QTextCodec::setCodecForLocale(QTextCodec *c) |
661 | { |
662 | QTextCodecData::instance()->codecForLocale.storeRelease(newValue: c); |
663 | } |
664 | |
665 | /*! |
666 | \threadsafe |
667 | Returns a pointer to the codec most suitable for this locale. |
668 | |
669 | The codec will be retrieved from ICU where that backend is in use, otherwise |
670 | it may be obtained from an OS-specific API. In the latter case, the codec's |
671 | name may be "System". |
672 | */ |
673 | |
674 | QTextCodec* QTextCodec::codecForLocale() |
675 | { |
676 | QTextCodecData *globalData = QTextCodecData::instance(); |
677 | if (!globalData) |
678 | return nullptr; |
679 | |
680 | QTextCodec *codec = globalData->codecForLocale.loadAcquire(); |
681 | if (!codec) { |
682 | #if QT_CONFIG(icu) |
683 | const TextCodecsMutexLocker locker; |
684 | codec = QIcuCodec::defaultCodecUnlocked(); |
685 | #else |
686 | // setupLocaleMapper locks as necessary |
687 | codec = setupLocaleMapper(); |
688 | #endif |
689 | } |
690 | |
691 | return codec; |
692 | } |
693 | |
694 | |
695 | /*! |
696 | \fn QByteArray QTextCodec::name() const |
697 | |
698 | QTextCodec subclasses must reimplement this function. It returns |
699 | the name of the encoding supported by the subclass. |
700 | |
701 | If the codec is registered as a character set in the |
702 | \l{IANA character-sets encoding file} this method should |
703 | return the preferred mime name for the codec if defined, |
704 | otherwise its name. |
705 | */ |
706 | |
707 | /*! |
708 | \fn int QTextCodec::mibEnum() const |
709 | |
710 | Subclasses of QTextCodec must reimplement this function. It |
711 | returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file} |
712 | for more information). It is important that each QTextCodec |
713 | subclass returns the correct unique value for this function. |
714 | */ |
715 | |
716 | /*! |
717 | Subclasses can return a number of aliases for the codec in question. |
718 | |
719 | Standard aliases for codecs can be found in the |
720 | \l{IANA character-sets encoding file}. |
721 | */ |
722 | QList<QByteArray> QTextCodec::aliases() const |
723 | { |
724 | return QList<QByteArray>(); |
725 | } |
726 | |
727 | /*! |
728 | \fn QString QTextCodec::convertToUnicode(const char *chars, int len, |
729 | ConverterState *state) const |
730 | |
731 | QTextCodec subclasses must reimplement this function. |
732 | |
733 | Converts the first \a len characters of \a chars from the |
734 | encoding of the subclass to Unicode, and returns the result in a |
735 | QString. |
736 | |
737 | \a state can be \nullptr, in which case the conversion is stateless and |
738 | default conversion rules should be used. If \a state is not \nullptr, the |
739 | codec should save the state after the conversion in \a state, and |
740 | adjust the \c remainingChars and \c invalidChars members of the struct. |
741 | */ |
742 | |
743 | /*! |
744 | \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number, |
745 | ConverterState *state) const |
746 | |
747 | QTextCodec subclasses must reimplement this function. |
748 | |
749 | Converts the first \a number of characters from the \a input array |
750 | from Unicode to the encoding of the subclass, and returns the result |
751 | in a QByteArray. |
752 | |
753 | \a state can be \nullptr in which case the conversion is stateless and |
754 | default conversion rules should be used. If \a state is not \nullptr, the |
755 | codec should save the state after the conversion in \a state, and |
756 | adjust the \c remainingChars and \c invalidChars members of the struct. |
757 | */ |
758 | |
759 | /*! |
    Creates a QTextDecoder with the specified \a flags to decode chunks
761 | of \c{char *} data to create chunks of Unicode data. |
762 | |
763 | The caller is responsible for deleting the returned object. |
764 | |
765 | \since 4.7 |
766 | */ |
767 | QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const |
768 | { |
769 | return new QTextDecoder(this, flags); |
770 | } |
771 | |
772 | /*! |
    Creates a QTextEncoder with the specified \a flags to encode chunks
774 | of Unicode data as \c{char *} data. |
775 | |
776 | The caller is responsible for deleting the returned object. |
777 | |
778 | \since 4.7 |
779 | */ |
780 | QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const |
781 | { |
782 | return new QTextEncoder(this, flags); |
783 | } |
784 | |
785 | /*! |
786 | \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number, |
787 | ConverterState *state) const |
788 | |
789 | Converts the first \a number of characters from the \a input array |
790 | from Unicode to the encoding of this codec, and returns the result |
791 | in a QByteArray. |
792 | |
    The \a state of the converter used is updated.
794 | */ |
795 | |
796 | /*! |
797 | Converts \a str from Unicode to the encoding of this codec, and |
798 | returns the result in a QByteArray. |
799 | */ |
800 | QByteArray QTextCodec::fromUnicode(const QString& str) const |
801 | { |
802 | return convertFromUnicode(in: str.constData(), length: str.size(), state: nullptr); |
803 | } |
804 | |
805 | /*! |
806 | \overload |
807 | \since 5.10 |
808 | |
809 | Converts \a str from Unicode to the encoding of this codec, and |
810 | returns the result in a QByteArray. |
811 | */ |
812 | QByteArray QTextCodec::fromUnicode(QStringView str) const |
813 | { |
814 | return convertFromUnicode(in: str.data(), length: str.size(), state: nullptr); |
815 | } |
816 | |
817 | /*! |
818 | \fn QString QTextCodec::toUnicode(const char *input, int size, |
819 | ConverterState *state) const |
820 | |
821 | Converts the first \a size characters from the \a input from the |
822 | encoding of this codec to Unicode, and returns the result in a |
823 | QString. |
824 | |
    The \a state of the converter used is updated.
826 | */ |
827 | |
828 | /*! |
829 | Converts \a a from the encoding of this codec to Unicode, and |
830 | returns the result in a QString. |
831 | */ |
832 | QString QTextCodec::toUnicode(const QByteArray& a) const |
833 | { |
834 | return convertToUnicode(in: a.constData(), length: a.size(), state: nullptr); |
835 | } |
836 | |
837 | /*! |
838 | Returns \c true if the Unicode character \a ch can be fully encoded |
839 | with this codec; otherwise returns \c false. |
840 | */ |
841 | bool QTextCodec::canEncode(QChar ch) const |
842 | { |
843 | ConverterState state; |
844 | state.flags = ConvertInvalidToNull; |
845 | convertFromUnicode(in: &ch, length: 1, state: &state); |
846 | return (state.invalidChars == 0); |
847 | } |
848 | |
849 | /*! |
850 | \overload |
851 | |
852 | \a s contains the string being tested for encode-ability. |
853 | */ |
854 | bool QTextCodec::canEncode(const QString& s) const |
855 | { |
856 | ConverterState state; |
857 | state.flags = ConvertInvalidToNull; |
858 | convertFromUnicode(in: s.constData(), length: s.size(), state: &state); |
859 | return (state.invalidChars == 0); |
860 | } |
861 | |
862 | /*! |
863 | \overload |
864 | \since 5.10 |
865 | |
866 | Returns \c true if the Unicode string \a s can be fully encoded |
867 | with this codec; otherwise returns \c false. |
868 | */ |
869 | bool QTextCodec::canEncode(QStringView s) const |
870 | { |
871 | ConverterState state; |
872 | state.flags = ConvertInvalidToNull; |
873 | convertFromUnicode(in: s.data(), length: s.size(), state: &state); |
874 | return !state.invalidChars; |
875 | } |
876 | /*! |
877 | \overload |
878 | |
879 | \a chars contains the source characters. |
880 | */ |
881 | QString QTextCodec::toUnicode(const char *chars) const |
882 | { |
883 | const auto len = int(qstrlen(str: chars)); |
884 | return convertToUnicode(in: chars, length: len, state: nullptr); |
885 | } |
886 | |
887 | |
888 | /*! |
889 | \class QTextEncoder |
890 | \inmodule QtCore5Compat |
891 | \brief The QTextEncoder class provides a state-based encoder. |
892 | \reentrant |
893 | \ingroup i18n |
894 | |
895 | A text encoder converts text from Unicode into an encoded text format |
896 | using a specific codec. |
897 | |
898 | The encoder converts Unicode into another format, remembering any |
899 | state that is required between calls. |
900 | |
901 | \sa QTextCodec::makeEncoder(), QTextDecoder |
902 | */ |
903 | |
904 | /*! |
905 | \fn QTextEncoder::QTextEncoder(const QTextCodec *codec) |
906 | |
907 | Constructs a text encoder for the given \a codec. |
908 | */ |
909 | |
910 | /*! |
911 | Constructs a text encoder for the given \a codec and conversion \a flags. |
912 | |
913 | \since 4.7 |
914 | */ |
915 | QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags) |
916 | : c(codec), state() |
917 | { |
918 | state.flags = flags; |
919 | } |
920 | |
921 | /*! |
922 | Destroys the encoder. |
923 | */ |
924 | QTextEncoder::~QTextEncoder() |
925 | { |
926 | } |
927 | |
928 | /*! |
929 | \internal |
930 | \since 4.5 |
    Determines whether the encoder encountered a failure while encoding the input. If
    an error was encountered, the produced result is undefined, and gets converted according
    to the conversion flags.
934 | */ |
935 | bool QTextEncoder::hasFailure() const |
936 | { |
937 | return state.invalidChars != 0; |
938 | } |
939 | |
940 | /*! |
941 | Converts the Unicode string \a str into an encoded QByteArray. |
942 | */ |
943 | QByteArray QTextEncoder::fromUnicode(const QString& str) |
944 | { |
945 | return c->fromUnicode(in: str.constData(), length: str.size(), state: &state); |
946 | } |
947 | |
948 | /*! |
949 | \overload |
950 | \since 5.10 |
951 | Converts the Unicode string \a str into an encoded QByteArray. |
952 | */ |
953 | QByteArray QTextEncoder::fromUnicode(QStringView str) |
954 | { |
955 | return c->fromUnicode(in: str.data(), length: str.size(), state: &state); |
956 | } |
957 | |
958 | /*! |
959 | \overload |
960 | |
961 | Converts \a len characters (not bytes) from \a uc, and returns the |
962 | result in a QByteArray. |
963 | */ |
964 | QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len) |
965 | { |
966 | return c->fromUnicode(in: uc, length: len, state: &state); |
967 | } |
968 | |
969 | /*! |
970 | \class QTextDecoder |
971 | \inmodule QtCore5Compat |
972 | \brief The QTextDecoder class provides a state-based decoder. |
973 | \reentrant |
974 | \ingroup i18n |
975 | |
976 | A text decoder converts text from an encoded text format into Unicode |
977 | using a specific codec. |
978 | |
979 | The decoder converts text in this format into Unicode, remembering any |
980 | state that is required between calls. |
981 | |
982 | \sa QTextCodec::makeDecoder(), QTextEncoder |
983 | */ |
984 | |
985 | /*! |
986 | \fn QTextDecoder::QTextDecoder(const QTextCodec *codec) |
987 | |
988 | Constructs a text decoder for the given \a codec. |
989 | */ |
990 | |
991 | /*! |
992 | Constructs a text decoder for the given \a codec and conversion \a flags. |
993 | |
994 | \since 4.7 |
995 | */ |
996 | |
997 | QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags) |
998 | : c(codec), state() |
999 | { |
1000 | state.flags = flags; |
1001 | } |
1002 | |
1003 | /*! |
1004 | Destroys the decoder. |
1005 | */ |
1006 | QTextDecoder::~QTextDecoder() |
1007 | { |
1008 | } |
1009 | |
1010 | /*! |
1011 | \fn QString QTextDecoder::toUnicode(const char *chars, int len) |
1012 | |
1013 | Converts the first \a len bytes in \a chars to Unicode, returning |
1014 | the result. |
1015 | |
1016 | If not all characters are used (e.g. if only part of a multi-byte |
1017 | encoding is at the end of the characters), the decoder remembers |
1018 | enough state to continue with the next call to this function. |
1019 | */ |
1020 | QString QTextDecoder::toUnicode(const char *chars, int len) |
1021 | { |
1022 | return c->toUnicode(in: chars, length: len, state: &state); |
1023 | } |
1024 | |
1025 | /*! \overload |
1026 | |
1027 | The converted string is returned in \a target. |
1028 | */ |
1029 | void QTextDecoder::toUnicode(QString *target, const char *chars, int len) |
1030 | { |
1031 | Q_ASSERT(target); |
1032 | switch (c->mibEnum()) { |
1033 | case 106: // utf8 |
1034 | static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state); |
1035 | break; |
1036 | case 4: // latin1 |
1037 | target->resize(size: len); |
1038 | qt_from_latin1(dst: (char16_t*)target->data(), str: chars, size: len); |
1039 | break; |
1040 | default: |
1041 | *target = c->toUnicode(in: chars, length: len, state: &state); |
1042 | } |
1043 | } |
1044 | |
1045 | |
1046 | /*! |
1047 | \overload |
1048 | |
1049 | Converts the bytes in the byte array specified by \a ba to Unicode |
1050 | and returns the result. |
1051 | */ |
1052 | QString QTextDecoder::toUnicode(const QByteArray &ba) |
1053 | { |
1054 | return c->toUnicode(in: ba.constData(), length: ba.size(), state: &state); |
1055 | } |
1056 | |
1057 | /*! |
1058 | \since 4.4 |
1059 | |
1060 | Tries to detect the encoding of the provided snippet of HTML in |
1061 | the given byte array, \a ba, by checking the BOM (Byte Order Mark) |
1062 | and the content-type meta header and returns a QTextCodec instance |
1063 | that is capable of decoding the html to unicode. If the codec |
1064 | cannot be detected from the content provided, \a defaultCodec is |
1065 | returned. |
1066 | |
1067 | \sa codecForUtfText() |
1068 | */ |
1069 | QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec) |
1070 | { |
1071 | // determine charset |
1072 | QTextCodec *c = QTextCodec::codecForUtfText(ba, defaultCodec: nullptr); |
1073 | if (!c) { |
1074 | static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "meta " ); |
1075 | QByteArray = ba.left(len: 1024).toLower(); |
1076 | qsizetype pos = matcher.indexIn(haystack: header); |
1077 | if (pos != -1) { |
1078 | static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "charset=" ); |
1079 | pos = matcher.indexIn(haystack: header, from: pos); |
1080 | if (pos != -1) { |
1081 | pos += qstrlen(str: "charset=" ); |
1082 | |
1083 | qsizetype pos2 = pos; |
1084 | // The attribute can be closed with either """, "'", ">" or "/", |
1085 | // none of which are valid charset characters. |
1086 | while (++pos2 < header.size()) { |
1087 | char ch = header.at(i: pos2); |
1088 | if (ch == '\"' || ch == '\'' || ch == '>') { |
1089 | QByteArray name = header.mid(index: pos, len: pos2 - pos); |
1090 | if (name == "unicode" ) // QTBUG-41998, ICU will return UTF-16. |
1091 | name = QByteArrayLiteral("UTF-8" ); |
1092 | c = QTextCodec::codecForName(name); |
1093 | return c ? c : defaultCodec; |
1094 | } |
1095 | } |
1096 | } |
1097 | } |
1098 | } |
1099 | if (!c) |
1100 | c = defaultCodec; |
1101 | |
1102 | return c; |
1103 | } |
1104 | |
1105 | /*! |
1106 | \overload |
1107 | |
1108 | Tries to detect the encoding of the provided snippet of HTML in |
1109 | the given byte array, \a ba, by checking the BOM (Byte Order Mark) |
1110 | and the content-type meta header and returns a QTextCodec instance |
1111 | that is capable of decoding the html to unicode. If the codec cannot |
1112 | be detected, this overload returns a Latin-1 QTextCodec. |
1113 | */ |
1114 | QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba) |
1115 | { |
1116 | return codecForHtml(ba, defaultCodec: QTextCodec::codecForName(name: "ISO-8859-1" )); |
1117 | } |
1118 | |
1119 | /*! |
1120 | \since 4.6 |
1121 | |
1122 | Tries to detect the encoding of the provided snippet \a ba by |
1123 | using the BOM (Byte Order Mark) and returns a QTextCodec instance |
1124 | that is capable of decoding the text to unicode. This function can |
1125 | detect one of the following codecs: |
1126 | |
1127 | \list |
1128 | \li UTF-32 Little Endian |
1129 | \li UTF-32 Big Endian |
1130 | \li UTF-16 Little Endian |
1131 | \li UTF-16 Big Endian |
1132 | \li UTF-8 |
1133 | \endlist |
1134 | |
1135 | If the codec cannot be detected from the content provided, \a defaultCodec |
1136 | is returned. |
1137 | |
1138 | \sa codecForHtml() |
1139 | */ |
1140 | QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec) |
1141 | { |
1142 | const int arraySize = ba.size(); |
1143 | const uchar *buf = reinterpret_cast<const uchar *>(ba.constData()); |
1144 | const uint bom = 0xfeff; |
1145 | |
1146 | if (arraySize > 3) { |
1147 | uint uc = qFromUnaligned<uint>(src: buf); |
1148 | if (uc == qToBigEndian(source: bom)) |
1149 | return QTextCodec::codecForMib(mib: 1018); // utf-32 be |
1150 | else if (uc == qToLittleEndian(source: bom)) |
1151 | return QTextCodec::codecForMib(mib: 1019); // utf-32 le |
1152 | } |
1153 | |
1154 | if (arraySize < 2) |
1155 | return defaultCodec; |
1156 | |
1157 | ushort uc = qFromUnaligned<ushort>(src: buf); |
1158 | if (uc == qToBigEndian(source: ushort(bom))) |
1159 | return QTextCodec::codecForMib(mib: 1013); // utf16 be |
1160 | else if (uc == qToLittleEndian(source: ushort(bom))) |
1161 | return QTextCodec::codecForMib(mib: 1014); // utf16 le |
1162 | |
1163 | if (arraySize < 3) |
1164 | return defaultCodec; |
1165 | |
1166 | static const char utf8bom[] = "\xef\xbb\xbf" ; |
1167 | if (memcmp(s1: buf, s2: utf8bom, n: sizeof(utf8bom) - 1) == 0) |
1168 | return QTextCodec::codecForMib(mib: 106); // utf-8 |
1169 | |
1170 | return defaultCodec; |
1171 | } |
1172 | |
1173 | /*! |
1174 | \overload |
1175 | |
1176 | Tries to detect the encoding of the provided snippet \a ba by |
1177 | using the BOM (Byte Order Mark) and returns a QTextCodec instance |
1178 | that is capable of decoding the text to unicode. This function can |
1179 | detect one of the following codecs: |
1180 | |
1181 | \list |
1182 | \li UTF-32 Little Endian |
1183 | \li UTF-32 Big Endian |
1184 | \li UTF-16 Little Endian |
1185 | \li UTF-16 Big Endian |
1186 | \li UTF-8 |
1187 | \endlist |
1188 | |
1189 | If the codec cannot be detected from the content provided, this overload |
1190 | returns a Latin-1 QTextCodec. |
1191 | |
1192 | \sa codecForHtml() |
1193 | */ |
1194 | QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba) |
1195 | { |
1196 | return codecForUtfText(ba, defaultCodec: QTextCodec::codecForMib(/*Latin 1*/ mib: 4)); |
1197 | } |
1198 | |
1199 | /*! |
1200 | \fn QTextCodec *QTextCodec::codecForTr () |
1201 | \deprecated |
1202 | |
1203 | Returns the codec used by QObject::tr() on its argument. If this |
1204 | function returns \nullptr (the default), tr() assumes Latin-1. |
1205 | */ |
1206 | |
1207 | /*! |
1208 | \internal |
1209 | \since 4.3 |
1210 | Determines whether the decoder encountered a failure while decoding the |
1211 | input. If an error was encountered, the produced result is undefined, and |
1212 | gets converted as according to the conversion flags. |
1213 | */ |
1214 | bool QTextDecoder::hasFailure() const |
1215 | { |
1216 | return state.invalidChars != 0; |
1217 | } |
1218 | |
1219 | /*! |
1220 | \internal |
1221 | \since 5.12 |
1222 | |
1223 | Determines whether the decoder needs more bytes to continue decoding. That |
1224 | is, this signifies that the input string ended in the middle of a |
1225 | multi-byte sequence. Note that it's possible some codecs do not report this. |
1226 | */ |
1227 | bool QTextDecoder::needsMoreData() const |
1228 | { |
1229 | return state.remainingChars; |
1230 | } |
1231 | |
1232 | /*! |
1233 | \fn QTextCodec * Qt::codecForHtml(const QByteArray &ba) |
1234 | \internal |
1235 | |
1236 | This function is defined in the \c <QTextCodec> header file. |
1237 | */ |
1238 | QTextCodec *Qt::codecForHtml(const QByteArray &ba) |
1239 | { |
1240 | return QTextCodec::codecForHtml(ba); |
1241 | } |
1242 | |
1243 | QT_END_NAMESPACE |
1244 | |