1/* -*- c++ -*-
2 SPDX-FileCopyrightText: 2002 Marc Mutz <mutz@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6/**
7 @file
8 This file is part of the API for handling @ref MIME data and
9 defines the @ref QuotedPrintable, @ref RFC2047Q, and
10 @ref RFC2231 @ref Codec classes.
11
12 @brief
13 Defines the classes QuotedPrintableCodec, Rfc2047QEncodingCodec, and
14 Rfc2231EncodingCodec.
15
16 @authors Marc Mutz \<mutz@kde.org\>
17*/
18
19#include "kcodecsqp.h"
20#include "kcodecs_p.h"
21
22#include <QDebug>
23
24#include <cassert>
25
26using namespace KCodecs;
27
28namespace KCodecs
29{
// Bitmap over the 128 US-ASCII characters, one bit per character with the
// most significant bit of each byte standing for the lowest character code.
// A bit is set only for a-z, A-Z, 0-9 and the five characters ! * + - /
const uchar eTextMap[16] = {
    0x00, 0x00, 0x00, 0x00, // 0x00..0x1F: nothing
    0x40, 0x35,             // 0x20..0x2F: ! * + - /
    0xFF, 0xC0,             // 0x30..0x3F: 0-9
    0x7F, 0xFF, 0xFF, 0xE0, // 0x40..0x5F: A-Z
    0x7F, 0xFF, 0xFF, 0xE0, // 0x60..0x7F: a-z
};
32
33// some helpful functions:
34
/**
  Maps a 4-bit @p value to its upper-case hexadecimal digit, i.e.
  0..9 yield '0'..'9' and 10..15 yield 'A'..'F'. Values above 15 are
  not checked and yield characters that are not hexadecimal digits.
  @param value is an unsigned character containing the 4-bit input value.
*/
static inline char binToHex(uchar value)
{
    const int digit = (value < 10) ? (value + '0') : (value + 'A' - 10);
    return digit;
}
49
/**
  Extracts the upper four bits of @p ch and returns them shifted down
  into the range 0..15.
  @param ch is an unsigned character containing the 8-bit input value.
*/
static inline uchar highNibble(uchar ch)
{
    const int upper = (ch & 0xF0) >> 4;
    return upper;
}
58
/**
  Extracts the lower four bits of @p ch, i.e. a value in the range 0..15.
  @param ch is an unsigned character containing the 8-bit input value.
*/
static inline uchar lowNibble(uchar ch)
{
    const int lower = ch & 0x0F;
    return lower;
}
67
/**
  Returns false if @p ch is a question mark or a control character
  (code below 0x20) other than horizontal tab; returns true for every
  other value.
  @param ch is an unsigned character containing the 8-bit input value.
*/
static inline bool keep(uchar ch)
{
    if (ch == '?') {
        return false;
    }
    // of the characters below SPACE only HT is kept
    return ch >= ' ' || ch == '\t';
}
78
79//
80// QuotedPrintableCodec
81//
82
83class QuotedPrintableEncoder : public Encoder
84{
85 char mInputBuffer[16];
86 uchar mCurrentLineLength; // 0..76
87 uchar mAccu;
88 uint mInputBufferReadCursor : 4; // 0..15
89 uint mInputBufferWriteCursor : 4; // 0..15
90 enum {
91 Never,
92 AtBOL,
93 Definitely,
94 } mAccuNeedsEncoding : 2;
95 bool mSawLineEnd : 1;
96 bool mSawCR : 1;
97 bool mFinishing : 1;
98 bool mFinished : 1;
99
100protected:
101 friend class QuotedPrintableCodec;
102 QuotedPrintableEncoder(Codec::NewlineType newline = Codec::NewlineLF)
103 : Encoder(newline)
104 , mCurrentLineLength(0)
105 , mAccu(0)
106 , mInputBufferReadCursor(0)
107 , mInputBufferWriteCursor(0)
108 , mAccuNeedsEncoding(Never)
109 , mSawLineEnd(false)
110 , mSawCR(false)
111 , mFinishing(false)
112 , mFinished(false)
113 {
114 }
115
116 bool needsEncoding(uchar ch)
117 {
118 return ch > '~' || (ch < ' ' && ch != '\t') || ch == '=';
119 }
120 bool needsEncodingAtEOL(uchar ch)
121 {
122 return ch == ' ' || ch == '\t';
123 }
124 bool needsEncodingAtBOL(uchar ch)
125 {
126 return ch == 'F' || ch == '.' || ch == '-';
127 }
128 bool fillInputBuffer(const char *&scursor, const char *const send);
129 bool processNextChar();
130 void createOutputBuffer(char *&dcursor, const char *const dend);
131
132public:
133 ~QuotedPrintableEncoder() override
134 {
135 }
136
137 bool encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) override;
138
139 bool finish(char *&dcursor, const char *const dend) override;
140};
141
142class QuotedPrintableDecoder : public Decoder
143{
144 const char mEscapeChar;
145 char mBadChar;
146 /** @p accu holds the msb nibble of the hexchar or zero. */
147 uchar mAccu;
148 /** @p insideHexChar is true iff we're inside an hexchar (=XY).
149 Together with @ref mAccu, we can build this states:
150 @li @p insideHexChar == @p false:
151 normal text
152 @li @p insideHexChar == @p true, @p mAccu == 0:
153 saw the leading '='
154 @li @p insideHexChar == @p true, @p mAccu != 0:
155 saw the first nibble '=X'
156 */
157 const bool mQEncoding;
158 bool mInsideHexChar;
159 bool mFlushing;
160 bool mExpectLF;
161 bool mHaveAccu;
162 /** @p mLastChar holds the first char of an encoded char, so that
163 we are able to keep the first char if the second char is invalid. */
164 char mLastChar;
165
166protected:
167 friend class QuotedPrintableCodec;
168 friend class Rfc2047QEncodingCodec;
169 friend class Rfc2231EncodingCodec;
170 QuotedPrintableDecoder(Codec::NewlineType newline = Codec::NewlineLF, bool aQEncoding = false, char aEscapeChar = '=')
171 : Decoder(newline)
172 , mEscapeChar(aEscapeChar)
173 , mBadChar(0)
174 , mAccu(0)
175 , mQEncoding(aQEncoding)
176 , mInsideHexChar(false)
177 , mFlushing(false)
178 , mExpectLF(false)
179 , mHaveAccu(false)
180 , mLastChar(0)
181 {
182 }
183
184public:
185 ~QuotedPrintableDecoder() override
186 {
187 }
188
189 bool decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) override;
190 bool finish(char *&dcursor, const char *const dend) override;
191};
192
193class Rfc2047QEncodingEncoder : public Encoder
194{
195 uchar mAccu;
196 uchar mStepNo;
197 const char mEscapeChar;
198 bool mInsideFinishing : 1;
199
200protected:
201 friend class Rfc2047QEncodingCodec;
202 friend class Rfc2231EncodingCodec;
203 Rfc2047QEncodingEncoder(Codec::NewlineType newline = Codec::NewlineLF, char aEscapeChar = '=')
204 : Encoder(newline)
205 , mAccu(0)
206 , mStepNo(0)
207 , mEscapeChar(aEscapeChar)
208 , mInsideFinishing(false)
209 {
210 // else an optimization in ::encode might break.
211 assert(aEscapeChar == '=' || aEscapeChar == '%');
212 }
213
214 bool isEText(uchar ch)
215 {
216 return (ch < 128) && (eTextMap[ch / 8] & 0x80 >> ch % 8);
217 }
218
219 // this code assumes that isEText( mEscapeChar ) == false!
220 bool needsEncoding(uchar ch)
221 {
222 if (ch > 'z') {
223 return true; // {|}~ DEL and 8bit chars need
224 }
225 if (!isEText(ch)) {
226 return true; // all but a-zA-Z0-9!/*+- need, too
227 }
228 if (mEscapeChar == '%' && (ch == '*' || ch == '/')) {
229 return true; // not allowed in rfc2231 encoding
230 }
231 return false;
232 }
233
234public:
235 ~Rfc2047QEncodingEncoder() override
236 {
237 }
238
239 bool encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) override;
240 bool finish(char *&dcursor, const char *const dend) override;
241};
242
243// this doesn't access any member variables, so it can be defined static
244// but then we can't call it from virtual functions
245static qsizetype QuotedPrintableDecoder_maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline)
246{
247 // all chars unencoded:
248 qsizetype result = insize;
249 // but maybe all of them are \n and we need to make them \r\n :-o
250 if (newline == Codec::NewlineCRLF) {
251 result += insize;
252 }
253
254 // there might be an accu plus escape
255 result += 2;
256
257 return result;
258}
259
// Returns a newly allocated QuotedPrintableEncoder configured with
// the given newline type.
Encoder *QuotedPrintableCodec::makeEncoder(Codec::NewlineType newline) const
{
    return new QuotedPrintableEncoder(newline);
}
264
// Returns a newly allocated QuotedPrintableDecoder with its default
// settings: no Q-encoding handling, escape char '='.
Decoder *QuotedPrintableCodec::makeDecoder(Codec::NewlineType newline) const
{
    return new QuotedPrintableDecoder(newline);
}
269
// Forwards to QuotedPrintableDecoder_maxDecodedSizeFor(), the bound
// shared by all codecs of this file.
qsizetype QuotedPrintableCodec::maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline) const
{
    return QuotedPrintableDecoder_maxDecodedSizeFor(insize, newline);
}
274
// Returns a newly allocated Rfc2047QEncodingEncoder using its default
// escape char '='.
Encoder *Rfc2047QEncodingCodec::makeEncoder(Codec::NewlineType newline) const
{
    return new Rfc2047QEncodingEncoder(newline);
}
279
// Returns a newly allocated QuotedPrintableDecoder with Q-encoding
// handling enabled ('_' decodes to a space) and escape char '='.
Decoder *Rfc2047QEncodingCodec::makeDecoder(Codec::NewlineType newline) const
{
    return new QuotedPrintableDecoder(newline, true);
}
284
// Forwards to QuotedPrintableDecoder_maxDecodedSizeFor(), the bound
// shared by all codecs of this file.
qsizetype Rfc2047QEncodingCodec::maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline) const
{
    return QuotedPrintableDecoder_maxDecodedSizeFor(insize, newline);
}
289
// Returns a newly allocated Rfc2047QEncodingEncoder that uses '%'
// instead of '=' as its escape char.
Encoder *Rfc2231EncodingCodec::makeEncoder(Codec::NewlineType newline) const
{
    return new Rfc2047QEncodingEncoder(newline, '%');
}
294
// Returns a newly allocated QuotedPrintableDecoder with Q-encoding
// handling enabled and '%' as the escape char.
Decoder *Rfc2231EncodingCodec::makeDecoder(Codec::NewlineType newline) const
{
    return new QuotedPrintableDecoder(newline, true, '%');
}
299
// Forwards to QuotedPrintableDecoder_maxDecodedSizeFor(), the bound
// shared by all codecs of this file.
qsizetype Rfc2231EncodingCodec::maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline) const
{
    return QuotedPrintableDecoder_maxDecodedSizeFor(insize, newline);
}
304
305/********************************************************/
306/********************************************************/
307/********************************************************/
308
// Decodes input from [scursor, send) into [dcursor, dend), advancing both
// cursors, until either the input is used up or the output range is full.
// Decoder state is kept in members, so the input may be fed in chunks
// over several calls.
//
// Returns true if all input of this call was consumed, false otherwise.
// Note that false is also returned early, regardless of the cursors, when
// a CR is followed by something other than LF while mAccu is non-zero.
bool QuotedPrintableDecoder::decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend)
{
    // Only a warning: decoding proceeds as for LF output.
    if (d->newline == Codec::NewlineCRLF) {
        qWarning() << "CRLF output for decoders isn't yet supported!";
    }

    while (scursor != send && dcursor != dend) {
        if (mFlushing) {
            // we have to flush chars in the aftermath of a decoding
            // error. The way to request a flush is to
            // - store the offending character in mBadChar and
            // - set mFlushing to true.
            // The supported cases are (H: hexchar, X: bad char):
            // =X, =HX, CR
            // mBadChar is only written out if it is not by itself illegal in
            // quoted-printable (e.g. CTLs, 8Bits).
            // A fast way to suppress mBadChar output is to set it to NUL.
            //
            // Each pass through this branch writes at most one char, so
            // the loop condition re-checks the output space in between.
            if (mInsideHexChar) {
                // output the escape char that started the broken sequence
                *dcursor++ = mEscapeChar;
                mInsideHexChar = false;
            } else if (mHaveAccu) {
                // output the first hex digit verbatim (as it was read):
                *dcursor++ = mLastChar;
                mHaveAccu = false;
                mAccu = 0;
            } else {
                // output mBadChar
                assert(mAccu == 0);
                if (mBadChar) {
                    if (mBadChar == '=') {
                        // do not emit it; treat it as the start of a
                        // new encoded char instead
                        mInsideHexChar = true;
                    } else {
                        *dcursor++ = mBadChar;
                    }
                    mBadChar = 0;
                }
                mFlushing = false;
            }
            continue;
        }
        assert(mBadChar == 0);

        uchar ch = *scursor++;

        if (mExpectLF && ch != '\n') {
            // qWarning() << "QuotedPrintableDecoder:"
            //   "illegally formed soft linebreak or lonely CR!";
            mInsideHexChar = false;
            mExpectLF = false;
            // NOTE(review): this tests mAccu, not mHaveAccu, so a pending
            // first hex digit '0' (mAccu == 0) does not take this exit —
            // confirm that this is intended.
            if (mAccu != 0) {
                return false;
            }
        }

        if (mInsideHexChar) {
            // 255 marks "no hex digit"; every path that reaches the code
            // below the digit classification has set a value below 16.
            uchar value = 255;
            // next char(s) represent nibble instead of itself:
            if (ch <= '9') {
                if (ch >= '0') {
                    value = ch - '0';
                } else {
                    switch (ch) {
                    case '\r':
                        mExpectLF = true;
                        break;
                    case '\n':
                        // soft line break, but only if no first hex
                        // digit has been read yet.
                        if (!mHaveAccu) {
                            mExpectLF = false;
                            mInsideHexChar = false;
                            break;
                        }
                        // else fall through
                    default:
                        // qWarning() << "QuotedPrintableDecoder:"
                        //   "illegally formed hex char! Outputting verbatim.";
                        mBadChar = ch;
                        mFlushing = true;
                    }
                    continue;
                }
            } else { // ch > '9'
                if (ch <= 'F') {
                    if (ch >= 'A') {
                        value = 10 + ch - 'A';
                    } else { // [:-@]
                        mBadChar = ch;
                        mFlushing = true;
                        continue;
                    }
                } else { // ch > 'F'
                    // lower-case hex digits are accepted as well
                    if (ch <= 'f' && ch >= 'a') {
                        value = 10 + ch - 'a';
                    } else {
                        mBadChar = ch;
                        mFlushing = true;
                        continue;
                    }
                }
            }

            assert(value < 16);
            assert(mBadChar == 0);
            assert(!mExpectLF);

            if (mHaveAccu) {
                // second hex digit: the encoded char is complete
                *dcursor++ = char(mAccu | value);
                mAccu = 0;
                mHaveAccu = false;
                mInsideHexChar = false;
            } else {
                // first hex digit: remember its value and the char
                // itself, in case it has to be written verbatim later
                mHaveAccu = true;
                mAccu = value << 4;
                mLastChar = ch;
            }
        } else { // not mInsideHexChar
            if ((ch <= '~' && ch >= ' ') || ch == '\t') {
                if (ch == mEscapeChar) {
                    mInsideHexChar = true;
                } else if (mQEncoding && ch == '_') {
                    *dcursor++ = char(0x20);
                } else {
                    *dcursor++ = char(ch);
                }
            } else if (ch == '\n') {
                *dcursor++ = '\n';
                mExpectLF = false;
            } else if (ch == '\r') {
                // the CR itself is not written; only a following LF is
                mExpectLF = true;
            } else {
                // qWarning() << "QuotedPrintableDecoder:" << ch <<
                //   "illegal character in input stream!";
                // illegal chars are passed through unchanged
                *dcursor++ = char(ch);
            }
        }
    }

    return scursor == send;
}
449
450bool QuotedPrintableDecoder::finish(char *&dcursor, const char *const dend)
451{
452 while ((mInsideHexChar || mHaveAccu || mFlushing) && dcursor != dend) {
453 // we have to flush chars
454 if (mInsideHexChar) {
455 // output '='
456 *dcursor++ = mEscapeChar;
457 mInsideHexChar = false;
458 } else if (mHaveAccu) {
459 // output the high nibble of the accumulator:
460 *dcursor++ = mLastChar;
461 mHaveAccu = false;
462 mAccu = 0;
463 } else {
464 // output mBadChar
465 assert(mAccu == 0);
466 if (mBadChar) {
467 *dcursor++ = mBadChar;
468 mBadChar = 0;
469 }
470 mFlushing = false;
471 }
472 }
473
474 // return false if we are not finished yet; note that mInsideHexChar is always false
475 return !(mHaveAccu || mFlushing);
476}
477
478bool QuotedPrintableEncoder::fillInputBuffer(const char *&scursor, const char *const send)
479{
480 // Don't read more if there's still a tail of a line in the buffer:
481 if (mSawLineEnd) {
482 return true;
483 }
484
485 // Read until the buffer is full or we have found CRLF or LF (which
486 // don't end up in the input buffer):
487 for (; (mInputBufferWriteCursor + 1) % 16 != mInputBufferReadCursor && scursor != send; mInputBufferWriteCursor++) {
488 char ch = *scursor++;
489 if (ch == '\r') {
490 mSawCR = true;
491 } else if (ch == '\n') {
492 // remove the CR from the input buffer (if any) and return that
493 // we found a line ending:
494 if (mSawCR) {
495 mSawCR = false;
496 assert(mInputBufferWriteCursor != mInputBufferReadCursor);
497 mInputBufferWriteCursor--;
498 }
499 mSawLineEnd = true;
500 return true; // saw CRLF or LF
501 } else {
502 mSawCR = false;
503 }
504 mInputBuffer[mInputBufferWriteCursor] = ch;
505 }
506 mSawLineEnd = false;
507 return false; // didn't see a line ending...
508}
509
510bool QuotedPrintableEncoder::processNextChar()
511{
512 // If we process a buffer which doesn't end in a line break, we
513 // can't process all of it, since the next chars that will be read
514 // could be a line break. So we empty the buffer only until a fixed
515 // number of chars is left (except when mFinishing, which means that
516 // the data doesn't end in newline):
517 const int minBufferFillWithoutLineEnd = 4;
518
519 assert(d->outputBufferCursor == 0);
520
521 int bufferFill = int(mInputBufferWriteCursor) - int(mInputBufferReadCursor);
522 if (bufferFill < 0) {
523 bufferFill += 16;
524 }
525
526 assert(bufferFill >= 0 && bufferFill <= 15);
527
528 if (!mFinishing //
529 && !mSawLineEnd //
530 && bufferFill < minBufferFillWithoutLineEnd) {
531 return false;
532 }
533
534 // buffer is empty, return false:
535 if (mInputBufferReadCursor == mInputBufferWriteCursor) {
536 return false;
537 }
538
539 // Real processing goes here:
540 mAccu = mInputBuffer[mInputBufferReadCursor++];
541 if (needsEncoding(ch: mAccu)) { // always needs encoding or
542 mAccuNeedsEncoding = Definitely;
543 } else if ((mSawLineEnd || mFinishing) // needs encoding at end of line
544 && bufferFill == 1 // or end of buffer
545 && needsEncodingAtEOL(ch: mAccu)) {
546 mAccuNeedsEncoding = Definitely;
547 } else if (needsEncodingAtBOL(ch: mAccu)) {
548 mAccuNeedsEncoding = AtBOL;
549 } else {
550 // never needs encoding
551 mAccuNeedsEncoding = Never;
552 }
553
554 return true;
555}
556
557// Outputs processed (verbatim or hex-encoded) chars and inserts soft
558// line breaks as necessary. Depends on processNextChar's directions
559// on whether to encode the current char, and whether
560// the current char is the last one in it's input line:
561void QuotedPrintableEncoder::createOutputBuffer(char *&dcursor, const char *const dend)
562{
563 const int maxLineLength = 76; // rfc 2045
564
565 assert(d->outputBufferCursor == 0);
566
567 /* clang-format off */
568 bool lastOneOnThisLine = mSawLineEnd
569 && mInputBufferReadCursor == mInputBufferWriteCursor;
570 /* clang-format on */
571
572 int neededSpace = 1;
573 if (mAccuNeedsEncoding == Definitely) {
574 neededSpace = 3;
575 }
576
577 // reserve space for the soft hyphen (=)
578 if (!lastOneOnThisLine) {
579 neededSpace++;
580 }
581
582 if (mCurrentLineLength > maxLineLength - neededSpace) {
583 // current line too short, insert soft line break:
584 write(ch: '=', dcursor, dend);
585 writeCRLF(dcursor, dend);
586 mCurrentLineLength = 0;
587 }
588
589 if (Never == mAccuNeedsEncoding //
590 || (AtBOL == mAccuNeedsEncoding && mCurrentLineLength != 0)) {
591 write(ch: mAccu, dcursor, dend);
592 mCurrentLineLength++;
593 } else {
594 write(ch: '=', dcursor, dend);
595 write(ch: binToHex(value: highNibble(ch: mAccu)), dcursor, dend);
596 write(ch: binToHex(value: lowNibble(ch: mAccu)), dcursor, dend);
597 mCurrentLineLength += 3;
598 }
599}
600
601bool QuotedPrintableEncoder::encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend)
602{
603 // support probing by the caller:
604 if (mFinishing) {
605 return true;
606 }
607
608 while (scursor != send && dcursor != dend) {
609 if (d->outputBufferCursor && !flushOutputBuffer(dcursor, dend)) {
610 return scursor == send;
611 }
612
613 assert(d->outputBufferCursor == 0);
614
615 // fill input buffer until eol has been reached or until the
616 // buffer is full, whatever comes first:
617 fillInputBuffer(scursor, send);
618
619 if (processNextChar()) {
620 // there was one...
621 createOutputBuffer(dcursor, dend);
622 } else if (mSawLineEnd && mInputBufferWriteCursor == mInputBufferReadCursor) {
623 // load a hard line break into output buffer:
624 writeCRLF(dcursor, dend);
625 // signal fillInputBuffer() we are ready for the next line:
626 mSawLineEnd = false;
627 mCurrentLineLength = 0;
628 } else {
629 // we are supposedly finished with this input block:
630 break;
631 }
632 }
633
634 // make sure we write as much as possible and don't stop _writing_
635 // just because we have no more _input_:
636 if (d->outputBufferCursor) {
637 flushOutputBuffer(dcursor, dend);
638 }
639
640 return scursor == send;
641
642} // encode
643
644bool QuotedPrintableEncoder::finish(char *&dcursor, const char *const dend)
645{
646 mFinishing = true;
647
648 if (mFinished) {
649 return flushOutputBuffer(dcursor, dend);
650 }
651
652 while (dcursor != dend) {
653 if (d->outputBufferCursor && !flushOutputBuffer(dcursor, dend)) {
654 return false;
655 }
656
657 assert(d->outputBufferCursor == 0);
658
659 if (processNextChar()) {
660 // there was one...
661 createOutputBuffer(dcursor, dend);
662 } else if (mSawLineEnd && mInputBufferWriteCursor == mInputBufferReadCursor) {
663 // load a hard line break into output buffer:
664 writeCRLF(dcursor, dend);
665 mSawLineEnd = false;
666 mCurrentLineLength = 0;
667 } else {
668 mFinished = true;
669 return flushOutputBuffer(dcursor, dend);
670 }
671 }
672
673 return mFinished && !d->outputBufferCursor;
674
675} // finish
676
677bool Rfc2047QEncodingEncoder::encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend)
678{
679 if (mInsideFinishing) {
680 return true;
681 }
682
683 while (scursor != send && dcursor != dend) {
684 uchar value = 0;
685 switch (mStepNo) {
686 case 0:
687 // read the next char and decide if and how do encode:
688 mAccu = *scursor++;
689 if (!needsEncoding(ch: mAccu)) {
690 *dcursor++ = char(mAccu);
691 } else if (mEscapeChar == '=' && mAccu == 0x20) {
692 // shortcut encoding for 0x20 (latin-1/us-ascii SPACE)
693 // (not for rfc2231 encoding)
694 *dcursor++ = '_';
695 } else {
696 // needs =XY encoding - write escape char:
697 *dcursor++ = mEscapeChar;
698 mStepNo = 1;
699 }
700 continue;
701 case 1:
702 // extract hi-nibble:
703 value = highNibble(ch: mAccu);
704 mStepNo = 2;
705 break;
706 case 2:
707 // extract lo-nibble:
708 value = lowNibble(ch: mAccu);
709 mStepNo = 0;
710 break;
711 default:
712 assert(0);
713 }
714
715 // and write:
716 *dcursor++ = binToHex(value);
717 }
718
719 return scursor == send;
720} // encode
721
722bool Rfc2047QEncodingEncoder::finish(char *&dcursor, const char *const dend)
723{
724 mInsideFinishing = true;
725
726 // write the last bits of mAccu, if any:
727 while (mStepNo != 0 && dcursor != dend) {
728 uchar value = 0;
729 switch (mStepNo) {
730 case 1:
731 // extract hi-nibble:
732 value = highNibble(ch: mAccu);
733 mStepNo = 2;
734 break;
735 case 2:
736 // extract lo-nibble:
737 value = lowNibble(ch: mAccu);
738 mStepNo = 0;
739 break;
740 default:
741 assert(0);
742 }
743
744 // and write:
745 *dcursor++ = binToHex(value);
746 }
747
748 return mStepNo == 0;
749}
750
751} // namespace KCodecs
752

// source code of kcodecs/src/kcodecsqp.cpp