1/*
2 SPDX-FileCopyrightText: 2000-2001 Dawit Alemayehu <adawit@kde.org>
3 SPDX-FileCopyrightText: 2001 Rik Hemsley (rikkus) <rik@kde.org>
4 SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org>
5
6 SPDX-License-Identifier: LGPL-2.0-only
7
8 The encoding and decoding utilities in KCodecs with the exception of
9 quoted-printable are based on the java implementation in HTTPClient
10 package by Ronald Tschalär Copyright (C) 1996-1999. // krazy:exclude=copyright
11
12 The quoted-printable codec as described in RFC 2045, section 6.7. is by
13 Rik Hemsley (C) 2001.
14*/
15
16#include "kcodecs.h"
17#include "kcharsets.h"
18#include "kcharsets_p.h"
19#include "kcodecs_debug.h"
20#include "kcodecs_p.h"
21#include "kcodecsbase64.h"
22#include "kcodecsqp.h"
23#include "kcodecsuuencode.h"
24
25#include <array>
26#include <cassert>
27#include <cstring>
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31
32#include <QDebug>
33#include <QStringDecoder>
34#include <QStringEncoder>
35
36#if defined(Q_OS_WIN)
37#define strncasecmp _strnicmp
38#endif
39
40namespace KCodecs
41{
42static QList<QByteArray> charsetCache;
43
44QByteArray cachedCharset(const QByteArray &name)
45{
46 auto it = std::find_if(first: charsetCache.cbegin(), last: charsetCache.cend(), pred: [&name](const QByteArray &charset) {
47 return qstricmp(name.data(), charset.data()) == 0;
48 });
49 if (it != charsetCache.cend()) {
50 return *it;
51 }
52
53 charsetCache.append(t: name.toUpper());
54 return charsetCache.last();
55}
56
57namespace CodecNames
58{
59QByteArray utf8()
60{
61 return QByteArrayLiteral("UTF-8");
62}
63}
64
65Q_REQUIRED_RESULT
66QByteArray updateEncodingCharset(const QByteArray &currentCharset, const QByteArray &nextCharset)
67{
68 if (!nextCharset.isEmpty()) {
69 if (currentCharset.isEmpty()) {
70 return nextCharset;
71 }
72 if (currentCharset != nextCharset) {
73 // only one charset per string supported, so change to superset charset UTF-8,
74 // which should cover any possible chars
75 return CodecNames::utf8();
76 }
77 }
78 return currentCharset;
79}
80
81} // namespace KCodecs
82
83/******************************** KCodecs ********************************/
84
85QByteArray KCodecs::quotedPrintableEncode(QByteArrayView in, bool useCRLF)
86{
87 Codec *codec = Codec::codecForName(name: "quoted-printable");
88 return codec->encode(src: in, newline: useCRLF ? Codec::NewlineCRLF : Codec::NewlineLF);
89}
90
91void KCodecs::quotedPrintableEncode(QByteArrayView in, QByteArray &out, bool useCRLF)
92{
93 out = quotedPrintableEncode(in, useCRLF: useCRLF ? Codec::NewlineCRLF : Codec::NewlineLF);
94}
95
96QByteArray KCodecs::quotedPrintableDecode(QByteArrayView in)
97{
98 Codec *codec = Codec::codecForName(name: "quoted-printable");
99 return codec->decode(src: in);
100}
101
102void KCodecs::quotedPrintableDecode(QByteArrayView in, QByteArray &out)
103{
104 out = quotedPrintableDecode(in);
105}
106
107QByteArray KCodecs::base64Encode(QByteArrayView in)
108{
109 Codec *codec = Codec::codecForName(name: "base64");
110 return codec->encode(src: in);
111}
112
113void KCodecs::base64Encode(QByteArrayView in, QByteArray &out, bool insertLFs)
114{
115 Q_UNUSED(insertLFs);
116 out = base64Encode(in);
117}
118
119QByteArray KCodecs::base64Decode(QByteArrayView in)
120{
121 Codec *codec = Codec::codecForName(name: "base64");
122 return codec->decode(src: in);
123}
124
125void KCodecs::base64Decode(const QByteArrayView in, QByteArray &out)
126{
127 out = base64Decode(in);
128}
129
130QByteArray KCodecs::uudecode(QByteArrayView in)
131{
132 Codec *codec = Codec::codecForName(name: "x-uuencode");
133 return codec->decode(src: in);
134}
135
136void KCodecs::uudecode(QByteArrayView in, QByteArray &out)
137{
138 out = uudecode(in);
139}
140
141//@cond PRIVATE
142
143namespace KCodecs
144{
145// parse the encoded-word (scursor points to after the initial '=')
146bool parseEncodedWord(const char *&scursor,
147 const char *const send,
148 QString *result,
149 QByteArray *language,
150 QByteArray *usedCS,
151 const QByteArray &defaultCS,
152 CharsetOption charsetOption)
153{
154 assert(result);
155 assert(language);
156
157 // make sure the caller already did a bit of the work.
158 assert(*(scursor - 1) == '=');
159
160 //
161 // STEP 1:
162 // scan for the charset/language portion of the encoded-word
163 //
164
165 char ch = *scursor++;
166
167 if (ch != '?') {
168 // qCDebug(KCODECS_LOG) << "first";
169 // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
170 return false;
171 }
172
173 // remember start of charset (i.e. just after the initial "=?") and
174 // language (just after the first '*') fields:
175 const char *charsetStart = scursor;
176 const char *languageStart = nullptr;
177
178 // find delimiting '?' (and the '*' separating charset and language
179 // tags, if any):
180 for (; scursor != send; scursor++) {
181 if (*scursor == '?') {
182 break;
183 } else if (*scursor == '*' && languageStart == nullptr) {
184 languageStart = scursor + 1;
185 }
186 }
187
188 // not found? can't be an encoded-word!
189 if (scursor == send || *scursor != '?') {
190 // qCDebug(KCODECS_LOG) << "second";
191 // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
192 return false;
193 }
194
195 // extract the language information, if any (if languageStart is 0,
196 // language will be null, too):
197 QByteArray maybeLanguage(languageStart, scursor - languageStart);
198 // extract charset information (keep in mind: the size given to the
199 // ctor is one off due to the \0 terminator):
200 QByteArray maybeCharset(charsetStart, (languageStart ? languageStart - 1 : scursor) - charsetStart);
201
202 //
203 // STEP 2:
204 // scan for the encoding portion of the encoded-word
205 //
206
207 // remember start of encoding (just _after_ the second '?'):
208 scursor++;
209 const char *encodingStart = scursor;
210
211 // find next '?' (ending the encoding tag):
212 for (; scursor != send; scursor++) {
213 if (*scursor == '?') {
214 break;
215 }
216 }
217
218 // not found? Can't be an encoded-word!
219 if (scursor == send || *scursor != '?') {
220 // qCDebug(KCODECS_LOG) << "third";
221 // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
222 return false;
223 }
224
225 // extract the encoding information:
226 QByteArray maybeEncoding(encodingStart, scursor - encodingStart);
227
228 // qCDebug(KCODECS_LOG) << "parseEncodedWord: found charset == \"" << maybeCharset
229 // << "\"; language == \"" << maybeLanguage
230 // << "\"; encoding == \"" << maybeEncoding << "\"";
231
232 //
233 // STEP 3:
234 // scan for encoded-text portion of encoded-word
235 //
236
237 // remember start of encoded-text (just after the third '?'):
238 scursor++;
239 const char *encodedTextStart = scursor;
240
241 // find the '?=' sequence (ending the encoded-text):
242 for (; scursor != send; scursor++) {
243 if (*scursor == '?') {
244 if (scursor + 1 != send) {
245 if (*(scursor + 1) != '=') { // We expect a '=' after the '?', but we got something else; ignore
246 // qCDebug(KCODECS_LOG) << "Stray '?' in q-encoded word, ignoring this.";
247 continue;
248 } else { // yep, found a '?=' sequence
249 scursor += 2;
250 break;
251 }
252 } else { // The '?' is the last char, but we need a '=' after it!
253 // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
254 return false;
255 }
256 }
257 }
258
259 if (*(scursor - 2) != '?' || *(scursor - 1) != '=' || scursor < encodedTextStart + 2) {
260 // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
261 return false;
262 }
263
264 // set end sentinel for encoded-text:
265 const char *const encodedTextEnd = scursor - 2;
266
267 //
268 // STEP 4:
269 // setup decoders for the transfer encoding and the charset
270 //
271
272 // try if there's a codec for the encoding found:
273 Codec *codec = Codec::codecForName(name: maybeEncoding);
274 if (!codec) {
275 // qCDebug(KCODECS_LOG) << "Unknown encoding" << maybeEncoding;
276 return false;
277 }
278
279 // get an instance of a corresponding decoder:
280 Decoder *dec = codec->makeDecoder();
281 assert(dec);
282
283 // try if there's a (text)codec for the charset found:
284 QByteArray cs;
285 QStringDecoder textCodec;
286 if (charsetOption == KCodecs::ForceDefaultCharset || maybeCharset.isEmpty()) {
287 textCodec = QStringDecoder(defaultCS.constData());
288 cs = cachedCharset(name: defaultCS);
289 } else {
290 textCodec = QStringDecoder(maybeCharset.constData());
291 if (!textCodec.isValid()) { // no suitable codec found => use default charset
292 textCodec = QStringDecoder(defaultCS.constData());
293 cs = cachedCharset(name: defaultCS);
294 } else {
295 cs = cachedCharset(name: maybeCharset);
296 }
297 }
298 if (usedCS) {
299 *usedCS = updateEncodingCharset(currentCharset: *usedCS, nextCharset: cs);
300 }
301
302 if (!textCodec.isValid()) {
303 // qCDebug(KCODECS_LOG) << "Unknown charset" << maybeCharset;
304 delete dec;
305 return false;
306 };
307
308 // qCDebug(KCODECS_LOG) << "mimeName(): \"" << textCodec->name() << "\"";
309
310 // allocate a temporary buffer to store the 8bit text:
311 int encodedTextLength = encodedTextEnd - encodedTextStart;
312 QByteArray buffer;
313 buffer.resize(size: codec->maxDecodedSizeFor(insize: encodedTextLength));
314 char *bbegin = buffer.data();
315 char *bend = bbegin + buffer.length();
316
317 //
318 // STEP 5:
319 // do the actual decoding
320 //
321
322 if (!dec->decode(scursor&: encodedTextStart, send: encodedTextEnd, dcursor&: bbegin, dend: bend)) {
323 qWarning() << codec->name() << "codec lies about its maxDecodedSizeFor(" << encodedTextLength << ")\nresult may be truncated";
324 }
325
326 *result = textCodec.decode(ba: QByteArrayView(buffer.data(), bbegin - buffer.data()));
327
328 // qCDebug(KCODECS_LOG) << "result now: \"" << result << "\"";
329 // cleanup:
330 delete dec;
331 *language = maybeLanguage;
332
333 return true;
334}
335
336} // namespace KCodecs
337
338//@endcond
339
340QString KCodecs::decodeRFC2047String(QStringView msg)
341{
342 QByteArray usedCS;
343 return decodeRFC2047String(src: msg.toUtf8(), usedCS: &usedCS, defaultCS: CodecNames::utf8(), option: NoOption);
344}
345
346QString KCodecs::decodeRFC2047String(QByteArrayView src, QByteArray *usedCS, const QByteArray &defaultCS, CharsetOption charsetOption)
347{
348 QByteArray result;
349 QByteArray spaceBuffer;
350 const char *scursor = src.constData();
351 const char *send = scursor + src.length();
352 bool onlySpacesSinceLastWord = false;
353 if (usedCS) {
354 usedCS->clear();
355 }
356
357 while (scursor != send) {
358 // space
359 if (isspace(*scursor) && onlySpacesSinceLastWord) {
360 spaceBuffer += *scursor++;
361 continue;
362 }
363
364 // possible start of an encoded word
365 if (*scursor == '=') {
366 QByteArray language;
367 QString decoded;
368 ++scursor;
369 const char *start = scursor;
370 if (parseEncodedWord(scursor, send, result: &decoded, language: &language, usedCS, defaultCS, charsetOption)) {
371 result += decoded.toUtf8();
372 onlySpacesSinceLastWord = true;
373 spaceBuffer.clear();
374 } else {
375 if (onlySpacesSinceLastWord) {
376 result += spaceBuffer;
377 onlySpacesSinceLastWord = false;
378 }
379 result += '=';
380 scursor = start; // reset cursor after parsing failure
381 }
382 continue;
383 } else {
384 // unencoded data
385 if (onlySpacesSinceLastWord) {
386 result += spaceBuffer;
387 onlySpacesSinceLastWord = false;
388 }
389 result += *scursor;
390 ++scursor;
391 }
392 }
393 // If there are any chars that couldn't be decoded in UTF-8,
394 // fallback to local codec
395 const QString tryUtf8 = QString::fromUtf8(ba: result);
396 if (tryUtf8.contains(c: QChar(0xFFFD))) {
397 QStringDecoder codec(QStringDecoder::System);
398 if (usedCS) {
399 *usedCS = updateEncodingCharset(currentCharset: *usedCS, nextCharset: cachedCharset(name: codec.name()));
400 }
401 return codec.decode(ba: result);
402 } else {
403 return tryUtf8;
404 }
405}
406
407QByteArray KCodecs::encodeRFC2047String(QStringView src, const QByteArray &charset)
408{
409 QByteArray result;
410 int start = 0;
411 int end = 0;
412 bool nonAscii = false;
413 bool useQEncoding = false;
414
415 QStringEncoder codec(charset.constData());
416
417 QByteArray usedCS;
418 if (!codec.isValid()) {
419 // no codec available => try local8Bit and hope the best ;-)
420 codec = QStringEncoder(QStringEncoder::System);
421 usedCS = codec.name();
422 } else {
423 Q_ASSERT(codec.isValid());
424 if (charset.isEmpty()) {
425 usedCS = codec.name();
426 } else {
427 usedCS = charset;
428 }
429 }
430
431 QByteArray encoded8Bit = codec.encode(in: src);
432 if (codec.hasError()) {
433 usedCS = CodecNames::utf8();
434 codec = QStringEncoder(QStringEncoder::Utf8);
435 encoded8Bit = codec.encode(in: src);
436 }
437
438 if (usedCS.contains(bv: "8859-")) { // use "B"-Encoding for non iso-8859-x charsets
439 useQEncoding = true;
440 }
441
442 uint encoded8BitLength = encoded8Bit.length();
443 for (unsigned int i = 0; i < encoded8BitLength; i++) {
444 if (encoded8Bit[i] == ' ') { // encoding starts at word boundaries
445 start = i + 1;
446 }
447
448 // encode escape character, for japanese encodings...
449 if (((signed char)encoded8Bit[i] < 0) || (encoded8Bit[i] == '\033')) {
450 end = start; // non us-ascii char found, now we determine where to stop encoding
451 nonAscii = true;
452 break;
453 }
454 }
455
456 if (nonAscii) {
457 while ((end < encoded8Bit.length()) && (encoded8Bit[end] != ' ')) {
458 // we encode complete words
459 end++;
460 }
461
462 for (int x = end; x < encoded8Bit.length(); x++) {
463 if (((signed char)encoded8Bit[x] < 0) || (encoded8Bit[x] == '\033')) {
464 end = x; // we found another non-ascii word
465
466 while ((end < encoded8Bit.length()) && (encoded8Bit[end] != ' ')) {
467 // we encode complete words
468 end++;
469 }
470 }
471 }
472
473 result = encoded8Bit.left(len: start) + "=?" + usedCS;
474
475 if (useQEncoding) {
476 result += "?Q?";
477
478 char hexcode; // "Q"-encoding implementation described in RFC 2047
479 for (int i = start; i < end; i++) {
480 const char c = encoded8Bit[i];
481 if (c == ' ') { // make the result readable with not MIME-capable readers
482 result += '_';
483 } else {
484 if (((c >= 'a') && (c <= 'z')) || // paranoid mode, encode *all* special chars to avoid problems
485 ((c >= 'A') && (c <= 'Z')) || // with "From" & "To" headers
486 ((c >= '0') && (c <= '9'))) {
487 result += c;
488 } else {
489 result += '='; // "stolen" from KMail ;-)
490 hexcode = ((c & 0xF0) >> 4) + 48;
491 if (hexcode >= 58) {
492 hexcode += 7;
493 }
494 result += hexcode;
495 hexcode = (c & 0x0F) + 48;
496 if (hexcode >= 58) {
497 hexcode += 7;
498 }
499 result += hexcode;
500 }
501 }
502 }
503 } else {
504 result += "?B?" + encoded8Bit.mid(index: start, len: end - start).toBase64();
505 }
506
507 result += "?=";
508 result += encoded8Bit.right(len: encoded8Bit.length() - end);
509 } else {
510 result = encoded8Bit;
511 }
512
513 return result;
514}
515
516/******************************************************************************/
517/* KCodecs::Codec */
518
519KCodecs::Codec *KCodecs::Codec::codecForName(QByteArrayView name)
520{
521 struct CodecEntry {
522 const char *name;
523 std::unique_ptr<KCodecs::Codec> codec;
524 };
525 // ### has to be sorted by name!
526 static const std::array<CodecEntry, 6> s_codecs{._M_elems: {
527 {.name: "b", .codec: std::make_unique<KCodecs::Rfc2047BEncodingCodec>()},
528 {.name: "base64", .codec: std::make_unique<KCodecs::Base64Codec>()},
529 {.name: "q", .codec: std::make_unique<KCodecs::Rfc2047QEncodingCodec>()},
530 {.name: "quoted-printable", .codec: std::make_unique<KCodecs::QuotedPrintableCodec>()},
531 {.name: "x-kmime-rfc2231", .codec: std::make_unique<KCodecs::Rfc2231EncodingCodec>()},
532 {.name: "x-uuencode", .codec: std::make_unique<KCodecs::UUCodec>()},
533 }};
534
535 const auto it = std::lower_bound(first: s_codecs.begin(), last: s_codecs.end(), val: name, comp: [](const auto &lhs, auto rhs) {
536 return rhs.compare(lhs.name, Qt::CaseInsensitive) > 0;
537 });
538 if (it == s_codecs.end() || name.compare(a: (*it).name, cs: Qt::CaseInsensitive) != 0) {
539 qWarning() << "Unknown codec \"" << name << "\" requested!";
540 }
541 return (*it).codec.get();
542}
543
544bool KCodecs::Codec::encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend, NewlineType newline) const
545{
546 // get an encoder:
547 std::unique_ptr<Encoder> enc(makeEncoder(newline));
548 if (!enc) {
549 qWarning() << "makeEncoder failed for" << name();
550 return false;
551 }
552
553 // encode and check for output buffer overflow:
554 while (!enc->encode(scursor, send, dcursor, dend)) {
555 if (dcursor == dend) {
556 return false; // not enough space in output buffer
557 }
558 }
559
560 // finish and check for output buffer overflow:
561 while (!enc->finish(dcursor, dend)) {
562 if (dcursor == dend) {
563 return false; // not enough space in output buffer
564 }
565 }
566
567 return true; // successfully encoded.
568}
569
570QByteArray KCodecs::Codec::encode(QByteArrayView src, NewlineType newline) const
571{
572 // allocate buffer for the worst case:
573 QByteArray result;
574 result.resize(size: maxEncodedSizeFor(insize: src.size(), newline));
575
576 // set up iterators:
577 QByteArray::ConstIterator iit = src.begin();
578 QByteArray::ConstIterator iend = src.end();
579 QByteArray::Iterator oit = result.begin();
580 QByteArray::ConstIterator oend = result.end();
581
582 // encode
583 if (!encode(scursor&: iit, send: iend, dcursor&: oit, dend: oend, newline)) {
584 qCritical() << name() << "codec lies about it's mEncodedSizeFor()";
585 }
586
587 // shrink result to actual size:
588 result.truncate(pos: oit - result.begin());
589
590 return result;
591}
592
593QByteArray KCodecs::Codec::decode(QByteArrayView src, NewlineType newline) const
594{
595 // allocate buffer for the worst case:
596 QByteArray result;
597 result.resize(size: maxDecodedSizeFor(insize: src.size(), newline));
598
599 // set up iterators:
600 QByteArray::ConstIterator iit = src.begin();
601 QByteArray::ConstIterator iend = src.end();
602 QByteArray::Iterator oit = result.begin();
603 QByteArray::ConstIterator oend = result.end();
604
605 // decode
606 if (!decode(scursor&: iit, send: iend, dcursor&: oit, dend: oend, newline)) {
607 qCritical() << name() << "codec lies about it's maxDecodedSizeFor()";
608 }
609
610 // shrink result to actual size:
611 result.truncate(pos: oit - result.begin());
612
613 return result;
614}
615
616bool KCodecs::Codec::decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend, NewlineType newline) const
617{
618 // get a decoder:
619 std::unique_ptr<Decoder> dec(makeDecoder(newline));
620 assert(dec);
621
622 // decode and check for output buffer overflow:
623 while (!dec->decode(scursor, send, dcursor, dend)) {
624 if (dcursor == dend) {
625 return false; // not enough space in output buffer
626 }
627 }
628
629 // finish and check for output buffer overflow:
630 while (!dec->finish(dcursor, dend)) {
631 if (dcursor == dend) {
632 return false; // not enough space in output buffer
633 }
634 }
635
636 return true; // successfully encoded.
637}
638
639/******************************************************************************/
640/* KCodecs::Encoder */
641
642KCodecs::EncoderPrivate::EncoderPrivate(Codec::NewlineType newline)
643 : outputBufferCursor(0)
644 , newline(newline)
645{
646}
647
648KCodecs::Encoder::Encoder(Codec::NewlineType newline)
649 : d(new KCodecs::EncoderPrivate(newline))
650{
651}
652
653KCodecs::Encoder::~Encoder() = default;
654
655bool KCodecs::Encoder::write(char ch, char *&dcursor, const char *const dend)
656{
657 if (dcursor != dend) {
658 // if there's space in the output stream, write there:
659 *dcursor++ = ch;
660 return true;
661 } else {
662 // else buffer the output:
663 if (d->outputBufferCursor >= maxBufferedChars) {
664 qCritical() << "KCodecs::Encoder: internal buffer overflow!";
665 } else {
666 d->outputBuffer[d->outputBufferCursor++] = ch;
667 }
668 return false;
669 }
670}
671
672// write as much as possible off the output buffer. Return true if
673// flushing was complete, false if some chars could not be flushed.
674bool KCodecs::Encoder::flushOutputBuffer(char *&dcursor, const char *const dend)
675{
676 int i;
677 // copy output buffer to output stream:
678 for (i = 0; dcursor != dend && i < d->outputBufferCursor; ++i) {
679 *dcursor++ = d->outputBuffer[i];
680 }
681
682 // calculate the number of missing chars:
683 int numCharsLeft = d->outputBufferCursor - i;
684 // push the remaining chars to the beginning of the buffer:
685 if (numCharsLeft) {
686 ::memmove(dest: d->outputBuffer, src: d->outputBuffer + i, n: numCharsLeft);
687 }
688 // adjust cursor:
689 d->outputBufferCursor = numCharsLeft;
690
691 return !numCharsLeft;
692}
693
694bool KCodecs::Encoder::writeCRLF(char *&dcursor, const char *const dend)
695{
696 if (d->newline == Codec::NewlineCRLF) {
697 write(ch: '\r', dcursor, dend);
698 }
699 return write(ch: '\n', dcursor, dend);
700}
701
702/******************************************************************************/
703/* KCodecs::Decoder */
704
705KCodecs::DecoderPrivate::DecoderPrivate(Codec::NewlineType newline)
706 : newline(newline)
707{
708}
709
710KCodecs::Decoder::Decoder(Codec::NewlineType newline)
711 : d(new KCodecs::DecoderPrivate(newline))
712{
713}
714
715KCodecs::Decoder::~Decoder() = default;
716

source code of kcodecs/src/kcodecs.cpp