1/*
2 SPDX-FileCopyrightText: 2000-2001 Dawit Alemayehu <adawit@kde.org>
3 SPDX-FileCopyrightText: 2001 Rik Hemsley (rikkus) <rik@kde.org>
4 SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org>
5
6 SPDX-License-Identifier: LGPL-2.0-only
7
8 The encoding and decoding utilities in KCodecs with the exception of
9 quoted-printable are based on the java implementation in HTTPClient
10 package by Ronald Tschalär Copyright (C) 1996-1999. // krazy:exclude=copyright
11
12 The quoted-printable codec as described in RFC 2045, section 6.7. is by
13 Rik Hemsley (C) 2001.
14*/
15
16#include "kcodecs.h"
17#include "kcharsets.h"
18#include "kcharsets_p.h"
19#include "kcodecs_debug.h"
20#include "kcodecs_p.h"
21#include "kcodecsbase64.h"
22#include "kcodecsqp.h"
23#include "kcodecsuuencode.h"
24
25#include <array>
26#include <cassert>
27#include <cstring>
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31
32#include <QDebug>
33#include <QStringDecoder>
34#include <QStringEncoder>
35
36#if defined(Q_OS_WIN)
37#define strncasecmp _strnicmp
38#endif
39
40namespace KCodecs
41{
42static QList<QByteArray> charsetCache;
43
44QByteArray cachedCharset(const QByteArray &name)
45{
46 auto it = std::find_if(first: charsetCache.cbegin(), last: charsetCache.cend(), pred: [&name](const QByteArray &charset) {
47 return qstricmp(name.data(), charset.data()) == 0;
48 });
49 if (it != charsetCache.cend()) {
50 return *it;
51 }
52
53 charsetCache.append(t: name.toUpper());
54 return charsetCache.last();
55}
56
57namespace CodecNames
58{
59QByteArray utf8()
60{
61 return QByteArrayLiteral("UTF-8");
62}
63}
64
65Q_REQUIRED_RESULT
66QByteArray updateEncodingCharset(const QByteArray &currentCharset, const QByteArray &nextCharset)
67{
68 if (!nextCharset.isEmpty()) {
69 if (currentCharset.isEmpty()) {
70 return nextCharset;
71 }
72 if (currentCharset != nextCharset) {
73 // only one charset per string supported, so change to superset charset UTF-8,
74 // which should cover any possible chars
75 return CodecNames::utf8();
76 }
77 }
78 return currentCharset;
79}
80
81} // namespace KCodecs
82
83/******************************** KCodecs ********************************/
84
85QByteArray KCodecs::quotedPrintableEncode(QByteArrayView in, bool useCRLF)
86{
87 Codec *codec = Codec::codecForName(name: "quoted-printable");
88 return codec->encode(src: in, newline: useCRLF ? Codec::NewlineCRLF : Codec::NewlineLF);
89}
90
91void KCodecs::quotedPrintableEncode(QByteArrayView in, QByteArray &out, bool useCRLF)
92{
93 out = quotedPrintableEncode(in, useCRLF: useCRLF ? Codec::NewlineCRLF : Codec::NewlineLF);
94}
95
96QByteArray KCodecs::quotedPrintableDecode(QByteArrayView in)
97{
98 Codec *codec = Codec::codecForName(name: "quoted-printable");
99 return codec->decode(src: in);
100}
101
102void KCodecs::quotedPrintableDecode(QByteArrayView in, QByteArray &out)
103{
104 out = quotedPrintableDecode(in);
105}
106
107QByteArray KCodecs::base64Encode(QByteArrayView in)
108{
109 Codec *codec = Codec::codecForName(name: "base64");
110 return codec->encode(src: in);
111}
112
113void KCodecs::base64Encode(QByteArrayView in, QByteArray &out, bool insertLFs)
114{
115 Q_UNUSED(insertLFs);
116 out = base64Encode(in);
117}
118
119QByteArray KCodecs::base64Decode(QByteArrayView in)
120{
121 Codec *codec = Codec::codecForName(name: "base64");
122 return codec->decode(src: in);
123}
124
125void KCodecs::base64Decode(const QByteArrayView in, QByteArray &out)
126{
127 out = base64Decode(in);
128}
129
130QByteArray KCodecs::uudecode(QByteArrayView in)
131{
132 Codec *codec = Codec::codecForName(name: "x-uuencode");
133 return codec->decode(src: in);
134}
135
136void KCodecs::uudecode(QByteArrayView in, QByteArray &out)
137{
138 out = uudecode(in);
139}
140
141//@cond PRIVATE
142
143namespace KCodecs
144{
145// parse the encoded-word (scursor points to after the initial '=')
146bool parseEncodedWord(const char *&scursor,
147 const char *const send,
148 QString *result,
149 QByteArray *language,
150 QByteArray *usedCS,
151 const QByteArray &defaultCS,
152 CharsetOption charsetOption)
153{
154 assert(result);
155 assert(language);
156
157 // make sure the caller already did a bit of the work.
158 assert(*(scursor - 1) == '=');
159
160 //
161 // STEP 1:
162 // scan for the charset/language portion of the encoded-word
163 //
164
165 char ch = *scursor++;
166
167 if (ch != '?') {
168 // qCDebug(KCODECS_LOG) << "first";
169 // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
170 return false;
171 }
172
173 // remember start of charset (i.e. just after the initial "=?") and
174 // language (just after the first '*') fields:
175 const char *charsetStart = scursor;
176 const char *languageStart = nullptr;
177
178 // find delimiting '?' (and the '*' separating charset and language
179 // tags, if any):
180 for (; scursor != send; scursor++) {
181 if (*scursor == '?') {
182 break;
183 } else if (*scursor == '*' && languageStart == nullptr) {
184 languageStart = scursor + 1;
185 }
186 }
187
188 // not found? can't be an encoded-word!
189 if (scursor == send || *scursor != '?') {
190 // qCDebug(KCODECS_LOG) << "second";
191 // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
192 return false;
193 }
194
195 // extract the language information, if any (if languageStart is 0,
196 // language will be null, too):
197 QByteArray maybeLanguage(languageStart, scursor - languageStart);
198 // extract charset information (keep in mind: the size given to the
199 // ctor is one off due to the \0 terminator):
200 QByteArray maybeCharset(charsetStart, (languageStart ? languageStart - 1 : scursor) - charsetStart);
201
202 //
203 // STEP 2:
204 // scan for the encoding portion of the encoded-word
205 //
206
207 // remember start of encoding (just _after_ the second '?'):
208 scursor++;
209 const char *encodingStart = scursor;
210
211 // find next '?' (ending the encoding tag):
212 for (; scursor != send; scursor++) {
213 if (*scursor == '?') {
214 break;
215 }
216 }
217
218 // not found? Can't be an encoded-word!
219 if (scursor == send || *scursor != '?') {
220 // qCDebug(KCODECS_LOG) << "third";
221 // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
222 return false;
223 }
224
225 // extract the encoding information:
226 QByteArray maybeEncoding(encodingStart, scursor - encodingStart);
227
228 // qCDebug(KCODECS_LOG) << "parseEncodedWord: found charset == \"" << maybeCharset
229 // << "\"; language == \"" << maybeLanguage
230 // << "\"; encoding == \"" << maybeEncoding << "\"";
231
232 //
233 // STEP 3:
234 // scan for encoded-text portion of encoded-word
235 //
236
237 // remember start of encoded-text (just after the third '?'):
238 scursor++;
239 const char *encodedTextStart = scursor;
240
241 // find the '?=' sequence (ending the encoded-text):
242 for (; scursor != send; scursor++) {
243 if (*scursor == '?') {
244 if (scursor + 1 != send) {
245 if (*(scursor + 1) != '=') { // We expect a '=' after the '?', but we got something else; ignore
246 // qCDebug(KCODECS_LOG) << "Stray '?' in q-encoded word, ignoring this.";
247 continue;
248 } else { // yep, found a '?=' sequence
249 scursor += 2;
250 break;
251 }
252 } else { // The '?' is the last char, but we need a '=' after it!
253 // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
254 return false;
255 }
256 }
257 }
258
259 if (*(scursor - 2) != '?' || *(scursor - 1) != '=' || scursor < encodedTextStart + 2) {
260 // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
261 return false;
262 }
263
264 // set end sentinel for encoded-text:
265 const char *const encodedTextEnd = scursor - 2;
266
267 //
268 // STEP 4:
269 // setup decoders for the transfer encoding and the charset
270 //
271
272 // try if there's a codec for the encoding found:
273 Codec *codec = Codec::codecForName(name: maybeEncoding);
274 if (!codec) {
275 // qCDebug(KCODECS_LOG) << "Unknown encoding" << maybeEncoding;
276 return false;
277 }
278
279 // get an instance of a corresponding decoder:
280 Decoder *dec = codec->makeDecoder();
281 assert(dec);
282
283 // try if there's a (text)codec for the charset found:
284 QByteArray cs;
285 QStringDecoder textCodec;
286 if (charsetOption == KCodecs::ForceDefaultCharset || maybeCharset.isEmpty()) {
287 textCodec = QStringDecoder(defaultCS.constData());
288 cs = cachedCharset(name: defaultCS);
289 } else {
290 textCodec = QStringDecoder(maybeCharset.constData());
291 if (!textCodec.isValid()) { // no suitable codec found => use default charset
292 textCodec = QStringDecoder(defaultCS.constData());
293 cs = cachedCharset(name: defaultCS);
294 } else {
295 cs = cachedCharset(name: maybeCharset);
296 }
297 }
298 if (usedCS) {
299 *usedCS = updateEncodingCharset(currentCharset: *usedCS, nextCharset: cs);
300 }
301
302 if (!textCodec.isValid()) {
303 // qCDebug(KCODECS_LOG) << "Unknown charset" << maybeCharset;
304 delete dec;
305 return false;
306 };
307
308 // qCDebug(KCODECS_LOG) << "mimeName(): \"" << textCodec->name() << "\"";
309
310 // allocate a temporary buffer to store the 8bit text:
311 int encodedTextLength = encodedTextEnd - encodedTextStart;
312 QByteArray buffer;
313 buffer.resize(size: codec->maxDecodedSizeFor(insize: encodedTextLength));
314 char *bbegin = buffer.data();
315 char *bend = bbegin + buffer.length();
316
317 //
318 // STEP 5:
319 // do the actual decoding
320 //
321
322 if (!dec->decode(scursor&: encodedTextStart, send: encodedTextEnd, dcursor&: bbegin, dend: bend)) {
323 qWarning() << codec->name() << "codec lies about its maxDecodedSizeFor(" << encodedTextLength << ")\nresult may be truncated";
324 }
325
326 *result = textCodec.decode(ba: QByteArrayView(buffer.data(), bbegin - buffer.data()));
327
328 // qCDebug(KCODECS_LOG) << "result now: \"" << result << "\"";
329 // cleanup:
330 delete dec;
331 *language = maybeLanguage;
332
333 return true;
334}
335
336} // namespace KCodecs
337
338//@endcond
339
340QString KCodecs::decodeRFC2047String(QStringView msg)
341{
342 QByteArray usedCS;
343 return decodeRFC2047String(src: msg.toUtf8(), usedCS: &usedCS, defaultCS: CodecNames::utf8(), option: NoOption);
344}
345
346QString KCodecs::decodeRFC2047String(QByteArrayView src, QByteArray *usedCS, const QByteArray &defaultCS, CharsetOption charsetOption)
347{
348 QByteArray result;
349 QByteArray spaceBuffer;
350 const char *scursor = src.constData();
351 const char *send = scursor + src.length();
352 bool onlySpacesSinceLastWord = false;
353 if (usedCS) {
354 usedCS->clear();
355 }
356
357 while (scursor != send) {
358 // space
359 if (isspace(*scursor) && onlySpacesSinceLastWord) {
360 spaceBuffer += *scursor++;
361 continue;
362 }
363
364 // possible start of an encoded word
365 if (*scursor == '=') {
366 QByteArray language;
367 QString decoded;
368 ++scursor;
369 const char *start = scursor;
370 if (parseEncodedWord(scursor, send, result: &decoded, language: &language, usedCS, defaultCS, charsetOption)) {
371 result += decoded.toUtf8();
372 onlySpacesSinceLastWord = true;
373 spaceBuffer.clear();
374 } else {
375 if (onlySpacesSinceLastWord) {
376 result += spaceBuffer;
377 onlySpacesSinceLastWord = false;
378 }
379 result += '=';
380 scursor = start; // reset cursor after parsing failure
381 }
382 continue;
383 } else {
384 // unencoded data
385 if (onlySpacesSinceLastWord) {
386 result += spaceBuffer;
387 onlySpacesSinceLastWord = false;
388 }
389 result += *scursor;
390 ++scursor;
391 }
392 }
393 // If there are any chars that couldn't be decoded in UTF-8,
394 // fallback to local codec
395 const QString tryUtf8 = QString::fromUtf8(ba: result);
396 if (tryUtf8.contains(c: QChar(0xFFFD))) {
397 QStringDecoder codec(QStringDecoder::System);
398 if (usedCS) {
399 *usedCS = updateEncodingCharset(currentCharset: *usedCS, nextCharset: cachedCharset(name: codec.name()));
400 }
401 return codec.decode(ba: result);
402 } else {
403 return tryUtf8;
404 }
405}
406
407QByteArray KCodecs::encodeRFC2047String(QStringView src, const QByteArray &charset)
408{
409 QByteArray result;
410 int start = 0;
411 int end = 0;
412 bool nonAscii = false;
413 bool useQEncoding = false;
414
415 QStringEncoder codec(charset.constData());
416
417 QByteArray usedCS;
418 if (!codec.isValid()) {
419 // no codec available => try local8Bit and hope the best ;-)
420 codec = QStringEncoder(QStringEncoder::System);
421 usedCS = codec.name();
422 } else {
423 Q_ASSERT(codec.isValid());
424 if (charset.isEmpty()) {
425 usedCS = codec.name();
426 } else {
427 usedCS = charset;
428 }
429 }
430
431 QByteArray encoded8Bit = codec.encode(in: src);
432 if (codec.hasError()) {
433 usedCS = CodecNames::utf8();
434 codec = QStringEncoder(QStringEncoder::Utf8);
435 encoded8Bit = codec.encode(in: src);
436 }
437
438 if (usedCS.contains(bv: "8859-")) { // use "B"-Encoding for non iso-8859-x charsets
439 useQEncoding = true;
440 }
441
442 uint encoded8BitLength = encoded8Bit.length();
443 for (unsigned int i = 0; i < encoded8BitLength; i++) {
444 if (encoded8Bit[i] == ' ') { // encoding starts at word boundaries
445 start = i + 1;
446 }
447
448 // encode escape character, for japanese encodings...
449 if (((signed char)encoded8Bit[i] < 0) || (encoded8Bit[i] == '\033')) {
450 end = start; // non us-ascii char found, now we determine where to stop encoding
451 nonAscii = true;
452 break;
453 }
454 }
455
456 if (nonAscii) {
457 while ((end < encoded8Bit.length()) && (encoded8Bit[end] != ' ')) {
458 // we encode complete words
459 end++;
460 }
461
462 for (int x = end; x < encoded8Bit.length(); x++) {
463 if (((signed char)encoded8Bit[x] < 0) || (encoded8Bit[x] == '\033')) {
464 end = x; // we found another non-ascii word
465
466 while ((end < encoded8Bit.length()) && (encoded8Bit[end] != ' ')) {
467 // we encode complete words
468 end++;
469 }
470 }
471 }
472
473 result = encoded8Bit.left(n: start) + "=?" + usedCS;
474
475 if (useQEncoding) {
476 result += "?Q?";
477
478 char hexcode; // "Q"-encoding implementation described in RFC 2047
479 for (int i = start; i < end; i++) {
480 const char c = encoded8Bit[i];
481 if (c == ' ') { // make the result readable with not MIME-capable readers
482 result += '_';
483 } else {
484 if (((c >= 'a') && (c <= 'z')) || // paranoid mode, encode *all* special chars to avoid problems
485 ((c >= 'A') && (c <= 'Z')) || // with "From" & "To" headers
486 ((c >= '0') && (c <= '9'))) {
487 result += c;
488 } else {
489 result += '='; // "stolen" from KMail ;-)
490 hexcode = ((c & 0xF0) >> 4) + 48;
491 if (hexcode >= 58) {
492 hexcode += 7;
493 }
494 result += hexcode;
495 hexcode = (c & 0x0F) + 48;
496 if (hexcode >= 58) {
497 hexcode += 7;
498 }
499 result += hexcode;
500 }
501 }
502 }
503 } else {
504 result += "?B?" + encoded8Bit.mid(index: start, len: end - start).toBase64();
505 }
506
507 result += "?=";
508 result += encoded8Bit.right(n: encoded8Bit.length() - end);
509 } else {
510 result = encoded8Bit;
511 }
512
513 return result;
514}
515
516/******************************************************************************/
517/* KCodecs::Codec */
518
519KCodecs::Codec *KCodecs::Codec::codecForName(QByteArrayView name)
520{
521 struct CodecEntry {
522 const char *name;
523 std::unique_ptr<KCodecs::Codec> codec;
524 };
525 // ### has to be sorted by name!
526 static const std::array<CodecEntry, 6> s_codecs{._M_elems: {
527 {.name: "b", .codec: std::make_unique<KCodecs::Rfc2047BEncodingCodec>()},
528 {.name: "base64", .codec: std::make_unique<KCodecs::Base64Codec>()},
529 {.name: "q", .codec: std::make_unique<KCodecs::Rfc2047QEncodingCodec>()},
530 {.name: "quoted-printable", .codec: std::make_unique<KCodecs::QuotedPrintableCodec>()},
531 {.name: "x-kmime-rfc2231", .codec: std::make_unique<KCodecs::Rfc2231EncodingCodec>()},
532 {.name: "x-uuencode", .codec: std::make_unique<KCodecs::UUCodec>()},
533 }};
534
535 const auto it = std::lower_bound(first: s_codecs.begin(), last: s_codecs.end(), val: name, comp: [](const auto &lhs, auto rhs) {
536 return rhs.compare(lhs.name, Qt::CaseInsensitive) > 0;
537 });
538 if (it == s_codecs.end() || name.compare(a: (*it).name, cs: Qt::CaseInsensitive) != 0) {
539 qWarning() << "Unknown codec \"" << name << "\" requested!";
540 return nullptr;
541 }
542 return (*it).codec.get();
543}
544
545bool KCodecs::Codec::encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend, NewlineType newline) const
546{
547 // get an encoder:
548 std::unique_ptr<Encoder> enc(makeEncoder(newline));
549 if (!enc) {
550 qWarning() << "makeEncoder failed for" << name();
551 return false;
552 }
553
554 // encode and check for output buffer overflow:
555 while (!enc->encode(scursor, send, dcursor, dend)) {
556 if (dcursor == dend) {
557 return false; // not enough space in output buffer
558 }
559 }
560
561 // finish and check for output buffer overflow:
562 while (!enc->finish(dcursor, dend)) {
563 if (dcursor == dend) {
564 return false; // not enough space in output buffer
565 }
566 }
567
568 return true; // successfully encoded.
569}
570
571QByteArray KCodecs::Codec::encode(QByteArrayView src, NewlineType newline) const
572{
573 // allocate buffer for the worst case:
574 QByteArray result;
575 result.resize(size: maxEncodedSizeFor(insize: src.size(), newline));
576
577 // set up iterators:
578 QByteArray::ConstIterator iit = src.begin();
579 QByteArray::ConstIterator iend = src.end();
580 QByteArray::Iterator oit = result.begin();
581 QByteArray::ConstIterator oend = result.end();
582
583 // encode
584 if (!encode(scursor&: iit, send: iend, dcursor&: oit, dend: oend, newline)) {
585 qCritical() << name() << "codec lies about it's mEncodedSizeFor()";
586 }
587
588 // shrink result to actual size:
589 result.truncate(pos: oit - result.begin());
590
591 return result;
592}
593
594QByteArray KCodecs::Codec::decode(QByteArrayView src, NewlineType newline) const
595{
596 // allocate buffer for the worst case:
597 QByteArray result;
598 result.resize(size: maxDecodedSizeFor(insize: src.size(), newline));
599
600 // set up iterators:
601 QByteArray::ConstIterator iit = src.begin();
602 QByteArray::ConstIterator iend = src.end();
603 QByteArray::Iterator oit = result.begin();
604 QByteArray::ConstIterator oend = result.end();
605
606 // decode
607 if (!decode(scursor&: iit, send: iend, dcursor&: oit, dend: oend, newline)) {
608 qCritical() << name() << "codec lies about it's maxDecodedSizeFor()";
609 }
610
611 // shrink result to actual size:
612 result.truncate(pos: oit - result.begin());
613
614 return result;
615}
616
617bool KCodecs::Codec::decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend, NewlineType newline) const
618{
619 // get a decoder:
620 std::unique_ptr<Decoder> dec(makeDecoder(newline));
621 assert(dec);
622
623 // decode and check for output buffer overflow:
624 while (!dec->decode(scursor, send, dcursor, dend)) {
625 if (dcursor == dend) {
626 return false; // not enough space in output buffer
627 }
628 }
629
630 // finish and check for output buffer overflow:
631 while (!dec->finish(dcursor, dend)) {
632 if (dcursor == dend) {
633 return false; // not enough space in output buffer
634 }
635 }
636
637 return true; // successfully encoded.
638}
639
640/******************************************************************************/
641/* KCodecs::Encoder */
642
643KCodecs::EncoderPrivate::EncoderPrivate(Codec::NewlineType newline)
644 : outputBufferCursor(0)
645 , newline(newline)
646{
647}
648
649KCodecs::Encoder::Encoder(Codec::NewlineType newline)
650 : d(new KCodecs::EncoderPrivate(newline))
651{
652}
653
654KCodecs::Encoder::~Encoder() = default;
655
656bool KCodecs::Encoder::write(char ch, char *&dcursor, const char *const dend)
657{
658 if (dcursor != dend) {
659 // if there's space in the output stream, write there:
660 *dcursor++ = ch;
661 return true;
662 } else {
663 // else buffer the output:
664 if (d->outputBufferCursor >= maxBufferedChars) {
665 qCritical() << "KCodecs::Encoder: internal buffer overflow!";
666 } else {
667 d->outputBuffer[d->outputBufferCursor++] = ch;
668 }
669 return false;
670 }
671}
672
673// write as much as possible off the output buffer. Return true if
674// flushing was complete, false if some chars could not be flushed.
675bool KCodecs::Encoder::flushOutputBuffer(char *&dcursor, const char *const dend)
676{
677 int i;
678 // copy output buffer to output stream:
679 for (i = 0; dcursor != dend && i < d->outputBufferCursor; ++i) {
680 *dcursor++ = d->outputBuffer[i];
681 }
682
683 // calculate the number of missing chars:
684 int numCharsLeft = d->outputBufferCursor - i;
685 // push the remaining chars to the beginning of the buffer:
686 if (numCharsLeft) {
687 ::memmove(dest: d->outputBuffer, src: d->outputBuffer + i, n: numCharsLeft);
688 }
689 // adjust cursor:
690 d->outputBufferCursor = numCharsLeft;
691
692 return !numCharsLeft;
693}
694
695bool KCodecs::Encoder::writeCRLF(char *&dcursor, const char *const dend)
696{
697 if (d->newline == Codec::NewlineCRLF) {
698 write(ch: '\r', dcursor, dend);
699 }
700 return write(ch: '\n', dcursor, dend);
701}
702
703/******************************************************************************/
704/* KCodecs::Decoder */
705
706KCodecs::DecoderPrivate::DecoderPrivate(Codec::NewlineType newline)
707 : newline(newline)
708{
709}
710
711KCodecs::Decoder::Decoder(Codec::NewlineType newline)
712 : d(new KCodecs::DecoderPrivate(newline))
713{
714}
715
716KCodecs::Decoder::~Decoder() = default;
717

source code of kcodecs/src/kcodecs.cpp