1/* -*- c++ -*-
2 SPDX-FileCopyrightText: 2002 Marc Mutz <mutz@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "kcodecsqp.h"
8#include "kcodecs_p.h"
9
10#include <QDebug>
11
12#include <cassert>
13
14using namespace KCodecs;
15
16namespace KCodecs
17{
// Bitmap over the US-ASCII range, eight code points per byte with the most
// significant bit first. The only bits set are those of a-zA-Z0-9!*+-/
const uchar eTextMap[16] = {
    0x00, 0x00, 0x00, 0x00, // 0x00..0x1F: none
    0x40, 0x35, 0xFF, 0xC0, // 0x20..0x3F: ! * + - / 0-9
    0x7F, 0xFF, 0xFF, 0xE0, // 0x40..0x5F: A-Z
    0x7F, 0xFF, 0xFF, 0xE0, // 0x60..0x7F: a-z
};
20
21// some helpful functions:
22
/**
  Maps a 4-bit @p value to the upper-case hexadecimal digit representing it:
  0..9 yield '0'..'9' and 10..15 yield 'A'..'F'. The input is not range
  checked; values greater than 15 do not yield a hexadecimal digit.
  @param value is an unsigned character containing the 4-bit input value.
  @return the hexadecimal character for @p value.
*/
static inline char binToHex(uchar value)
{
    return (value <= 9) ? value + '0' : value + 'A' - 10;
}
37
/**
  Extracts the upper four bits of @p ch, moved down into the lower four bits
  of the result (so the result is in the range 0..15).
  @param ch is an unsigned character containing the 8-bit input value.
*/
static inline uchar highNibble(uchar ch)
{
    // dividing an unsigned value by 16 drops its four low-order bits
    const uchar upperFourBits = ch / 16;
    return upperFourBits;
}
46
/**
  Extracts the lower four bits of @p ch (so the result is in the range 0..15).
  @param ch is an unsigned character containing the 8-bit input value.
*/
static inline uchar lowNibble(uchar ch)
{
    // the remainder modulo 16 of an unsigned value is its four low-order bits
    const uchar lowerFourBits = ch % 16;
    return lowerFourBits;
}
55
/**
  Returns true if the specified value is neither a question mark nor a
  control character other than horizontal tab; else false.
  @param ch is an unsigned character containing the 8-bit input value.
*/
static inline bool keep(uchar ch)
{
    if (ch == '?') {
        return false;
    }
    // no CTLs (anything below SPACE), except HT
    return ch >= ' ' || ch == '\t';
}
66
67//
68// QuotedPrintableCodec
69//
70
71class QuotedPrintableEncoder : public Encoder
72{
73 char mInputBuffer[16];
74 uchar mCurrentLineLength; // 0..76
75 uchar mAccu;
76 uint mInputBufferReadCursor : 4; // 0..15
77 uint mInputBufferWriteCursor : 4; // 0..15
78 enum {
79 Never,
80 AtBOL,
81 Definitely,
82 } mAccuNeedsEncoding;
83 bool mSawLineEnd : 1;
84 bool mSawCR : 1;
85 bool mFinishing : 1;
86 bool mFinished : 1;
87
88protected:
89 friend class QuotedPrintableCodec;
90 QuotedPrintableEncoder(Codec::NewlineType newline = Codec::NewlineLF)
91 : Encoder(newline)
92 , mCurrentLineLength(0)
93 , mAccu(0)
94 , mInputBufferReadCursor(0)
95 , mInputBufferWriteCursor(0)
96 , mAccuNeedsEncoding(Never)
97 , mSawLineEnd(false)
98 , mSawCR(false)
99 , mFinishing(false)
100 , mFinished(false)
101 {
102 }
103
104 bool needsEncoding(uchar ch)
105 {
106 return ch > '~' || (ch < ' ' && ch != '\t') || ch == '=';
107 }
108 bool needsEncodingAtEOL(uchar ch)
109 {
110 return ch == ' ' || ch == '\t';
111 }
112 bool needsEncodingAtBOL(uchar ch)
113 {
114 return ch == 'F' || ch == '.' || ch == '-';
115 }
116 bool fillInputBuffer(const char *&scursor, const char *const send);
117 bool processNextChar();
118 void createOutputBuffer(char *&dcursor, const char *const dend);
119
120public:
121 ~QuotedPrintableEncoder() override
122 {
123 }
124
125 bool encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) override;
126
127 bool finish(char *&dcursor, const char *const dend) override;
128};
129
130class QuotedPrintableDecoder : public Decoder
131{
132 const char mEscapeChar;
133 char mBadChar;
134 /** @p accu holds the msb nibble of the hexchar or zero. */
135 uchar mAccu;
136 /** @p insideHexChar is true iff we're inside an hexchar (=XY).
137 Together with @ref mAccu, we can build this states:
138 @li @p insideHexChar == @p false:
139 normal text
140 @li @p insideHexChar == @p true, @p mAccu == 0:
141 saw the leading '='
142 @li @p insideHexChar == @p true, @p mAccu != 0:
143 saw the first nibble '=X'
144 */
145 const bool mQEncoding;
146 bool mInsideHexChar;
147 bool mFlushing;
148 bool mExpectLF;
149 bool mHaveAccu;
150 /** @p mLastChar holds the first char of an encoded char, so that
151 we are able to keep the first char if the second char is invalid. */
152 char mLastChar;
153
154protected:
155 friend class QuotedPrintableCodec;
156 friend class Rfc2047QEncodingCodec;
157 friend class Rfc2231EncodingCodec;
158 QuotedPrintableDecoder(Codec::NewlineType newline = Codec::NewlineLF, bool aQEncoding = false, char aEscapeChar = '=')
159 : Decoder(newline)
160 , mEscapeChar(aEscapeChar)
161 , mBadChar(0)
162 , mAccu(0)
163 , mQEncoding(aQEncoding)
164 , mInsideHexChar(false)
165 , mFlushing(false)
166 , mExpectLF(false)
167 , mHaveAccu(false)
168 , mLastChar(0)
169 {
170 }
171
172public:
173 ~QuotedPrintableDecoder() override
174 {
175 }
176
177 bool decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) override;
178 bool finish(char *&dcursor, const char *const dend) override;
179};
180
181class Rfc2047QEncodingEncoder : public Encoder
182{
183 uchar mAccu;
184 uchar mStepNo;
185 const char mEscapeChar;
186 bool mInsideFinishing : 1;
187
188protected:
189 friend class Rfc2047QEncodingCodec;
190 friend class Rfc2231EncodingCodec;
191 Rfc2047QEncodingEncoder(Codec::NewlineType newline = Codec::NewlineLF, char aEscapeChar = '=')
192 : Encoder(newline)
193 , mAccu(0)
194 , mStepNo(0)
195 , mEscapeChar(aEscapeChar)
196 , mInsideFinishing(false)
197 {
198 // else an optimization in ::encode might break.
199 assert(aEscapeChar == '=' || aEscapeChar == '%');
200 }
201
202 bool isEText(uchar ch)
203 {
204 return (ch < 128) && (eTextMap[ch / 8] & 0x80 >> ch % 8);
205 }
206
207 // this code assumes that isEText( mEscapeChar ) == false!
208 bool needsEncoding(uchar ch)
209 {
210 if (ch > 'z') {
211 return true; // {|}~ DEL and 8bit chars need
212 }
213 if (!isEText(ch)) {
214 return true; // all but a-zA-Z0-9!/*+- need, too
215 }
216 if (mEscapeChar == '%' && (ch == '*' || ch == '/')) {
217 return true; // not allowed in rfc2231 encoding
218 }
219 return false;
220 }
221
222public:
223 ~Rfc2047QEncodingEncoder() override
224 {
225 }
226
227 bool encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) override;
228 bool finish(char *&dcursor, const char *const dend) override;
229};
230
231// this doesn't access any member variables, so it can be defined static
232// but then we can't call it from virtual functions
233static qsizetype QuotedPrintableDecoder_maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline)
234{
235 // all chars unencoded:
236 qsizetype result = insize;
237 // but maybe all of them are \n and we need to make them \r\n :-o
238 if (newline == Codec::NewlineCRLF) {
239 result += insize;
240 }
241
242 // there might be an accu plus escape
243 result += 2;
244
245 return result;
246}
247
248Encoder *QuotedPrintableCodec::makeEncoder(Codec::NewlineType newline) const
249{
250 return new QuotedPrintableEncoder(newline);
251}
252
253Decoder *QuotedPrintableCodec::makeDecoder(Codec::NewlineType newline) const
254{
255 return new QuotedPrintableDecoder(newline);
256}
257
258qsizetype QuotedPrintableCodec::maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline) const
259{
260 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, newline);
261}
262
263Encoder *Rfc2047QEncodingCodec::makeEncoder(Codec::NewlineType newline) const
264{
265 return new Rfc2047QEncodingEncoder(newline);
266}
267
268Decoder *Rfc2047QEncodingCodec::makeDecoder(Codec::NewlineType newline) const
269{
270 return new QuotedPrintableDecoder(newline, true);
271}
272
273qsizetype Rfc2047QEncodingCodec::maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline) const
274{
275 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, newline);
276}
277
278Encoder *Rfc2231EncodingCodec::makeEncoder(Codec::NewlineType newline) const
279{
280 return new Rfc2047QEncodingEncoder(newline, '%');
281}
282
283Decoder *Rfc2231EncodingCodec::makeDecoder(Codec::NewlineType newline) const
284{
285 return new QuotedPrintableDecoder(newline, true, '%');
286}
287
288qsizetype Rfc2231EncodingCodec::maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline) const
289{
290 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, newline);
291}
292
293/********************************************************/
294/********************************************************/
295/********************************************************/
296
297bool QuotedPrintableDecoder::decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend)
298{
299 if (d->newline == Codec::NewlineCRLF) {
300 qWarning() << "CRLF output for decoders isn't yet supported!";
301 }
302
303 while (scursor != send && dcursor != dend) {
304 if (mFlushing) {
305 // we have to flush chars in the aftermath of a decoding
306 // error. The way to request a flush is to
307 // - store the offending character in mBadChar and
308 // - set mFlushing to true.
309 // The supported cases are (H: hexchar, X: bad char):
310 // =X, =HX, CR
311 // mBadChar is only written out if it is not by itself illegal in
312 // quoted-printable (e.g. CTLs, 8Bits).
313 // A fast way to suppress mBadChar output is to set it to NUL.
314 if (mInsideHexChar) {
315 // output '='
316 *dcursor++ = mEscapeChar;
317 mInsideHexChar = false;
318 } else if (mHaveAccu) {
319 // output the high nibble of the accumulator:
320 *dcursor++ = mLastChar;
321 mHaveAccu = false;
322 mAccu = 0;
323 } else {
324 // output mBadChar
325 assert(mAccu == 0);
326 if (mBadChar) {
327 if (mBadChar == '=') {
328 mInsideHexChar = true;
329 } else {
330 *dcursor++ = mBadChar;
331 }
332 mBadChar = 0;
333 }
334 mFlushing = false;
335 }
336 continue;
337 }
338 assert(mBadChar == 0);
339
340 uchar ch = *scursor++;
341
342 if (mExpectLF && ch != '\n') {
343 // qWarning() << "QuotedPrintableDecoder:"
344 // "illegally formed soft linebreak or lonely CR!";
345 mInsideHexChar = false;
346 mExpectLF = false;
347 if (mAccu != 0) {
348 return false;
349 }
350 }
351
352 if (mInsideHexChar) {
353 uchar value = 255;
354 // next char(s) represent nibble instead of itself:
355 if (ch <= '9') {
356 if (ch >= '0') {
357 value = ch - '0';
358 } else {
359 switch (ch) {
360 case '\r':
361 mExpectLF = true;
362 break;
363 case '\n':
364 // soft line break, but only if mAccu is NUL.
365 if (!mHaveAccu) {
366 mExpectLF = false;
367 mInsideHexChar = false;
368 break;
369 }
370 // else fall through
371 default:
372 // qWarning() << "QuotedPrintableDecoder:"
373 // "illegally formed hex char! Outputting verbatim.";
374 mBadChar = ch;
375 mFlushing = true;
376 }
377 continue;
378 }
379 } else { // ch > '9'
380 if (ch <= 'F') {
381 if (ch >= 'A') {
382 value = 10 + ch - 'A';
383 } else { // [:-@]
384 mBadChar = ch;
385 mFlushing = true;
386 continue;
387 }
388 } else { // ch > 'F'
389 if (ch <= 'f' && ch >= 'a') {
390 value = 10 + ch - 'a';
391 } else {
392 mBadChar = ch;
393 mFlushing = true;
394 continue;
395 }
396 }
397 }
398
399 assert(value < 16);
400 assert(mBadChar == 0);
401 assert(!mExpectLF);
402
403 if (mHaveAccu) {
404 *dcursor++ = char(mAccu | value);
405 mAccu = 0;
406 mHaveAccu = false;
407 mInsideHexChar = false;
408 } else {
409 mHaveAccu = true;
410 mAccu = value << 4;
411 mLastChar = ch;
412 }
413 } else { // not mInsideHexChar
414 if ((ch <= '~' && ch >= ' ') || ch == '\t') {
415 if (ch == mEscapeChar) {
416 mInsideHexChar = true;
417 } else if (mQEncoding && ch == '_') {
418 *dcursor++ = char(0x20);
419 } else {
420 *dcursor++ = char(ch);
421 }
422 } else if (ch == '\n') {
423 *dcursor++ = '\n';
424 mExpectLF = false;
425 } else if (ch == '\r') {
426 mExpectLF = true;
427 } else {
428 // qWarning() << "QuotedPrintableDecoder:" << ch <<
429 // "illegal character in input stream!";
430 *dcursor++ = char(ch);
431 }
432 }
433 }
434
435 return scursor == send;
436}
437
438bool QuotedPrintableDecoder::finish(char *&dcursor, const char *const dend)
439{
440 while ((mInsideHexChar || mHaveAccu || mFlushing) && dcursor != dend) {
441 // we have to flush chars
442 if (mInsideHexChar) {
443 // output '='
444 *dcursor++ = mEscapeChar;
445 mInsideHexChar = false;
446 } else if (mHaveAccu) {
447 // output the high nibble of the accumulator:
448 *dcursor++ = mLastChar;
449 mHaveAccu = false;
450 mAccu = 0;
451 } else {
452 // output mBadChar
453 assert(mAccu == 0);
454 if (mBadChar) {
455 *dcursor++ = mBadChar;
456 mBadChar = 0;
457 }
458 mFlushing = false;
459 }
460 }
461
462 // return false if we are not finished yet; note that mInsideHexChar is always false
463 return !(mHaveAccu || mFlushing);
464}
465
466bool QuotedPrintableEncoder::fillInputBuffer(const char *&scursor, const char *const send)
467{
468 // Don't read more if there's still a tail of a line in the buffer:
469 if (mSawLineEnd) {
470 return true;
471 }
472
473 // Read until the buffer is full or we have found CRLF or LF (which
474 // don't end up in the input buffer):
475 for (; (mInputBufferWriteCursor + 1) % 16 != mInputBufferReadCursor && scursor != send; mInputBufferWriteCursor++) {
476 char ch = *scursor++;
477 if (ch == '\r') {
478 mSawCR = true;
479 } else if (ch == '\n') {
480 // remove the CR from the input buffer (if any) and return that
481 // we found a line ending:
482 if (mSawCR) {
483 mSawCR = false;
484 assert(mInputBufferWriteCursor != mInputBufferReadCursor);
485 mInputBufferWriteCursor--;
486 }
487 mSawLineEnd = true;
488 return true; // saw CRLF or LF
489 } else {
490 mSawCR = false;
491 }
492 mInputBuffer[mInputBufferWriteCursor] = ch;
493 }
494 mSawLineEnd = false;
495 return false; // didn't see a line ending...
496}
497
498bool QuotedPrintableEncoder::processNextChar()
499{
500 // If we process a buffer which doesn't end in a line break, we
501 // can't process all of it, since the next chars that will be read
502 // could be a line break. So we empty the buffer only until a fixed
503 // number of chars is left (except when mFinishing, which means that
504 // the data doesn't end in newline):
505 const int minBufferFillWithoutLineEnd = 4;
506
507 assert(d->outputBufferCursor == 0);
508
509 int bufferFill = int(mInputBufferWriteCursor) - int(mInputBufferReadCursor);
510 if (bufferFill < 0) {
511 bufferFill += 16;
512 }
513
514 assert(bufferFill >= 0 && bufferFill <= 15);
515
516 if (!mFinishing //
517 && !mSawLineEnd //
518 && bufferFill < minBufferFillWithoutLineEnd) {
519 return false;
520 }
521
522 // buffer is empty, return false:
523 if (mInputBufferReadCursor == mInputBufferWriteCursor) {
524 return false;
525 }
526
527 // Real processing goes here:
528 mAccu = mInputBuffer[mInputBufferReadCursor++];
529 if (needsEncoding(ch: mAccu)) { // always needs encoding or
530 mAccuNeedsEncoding = Definitely;
531 } else if ((mSawLineEnd || mFinishing) // needs encoding at end of line
532 && bufferFill == 1 // or end of buffer
533 && needsEncodingAtEOL(ch: mAccu)) {
534 mAccuNeedsEncoding = Definitely;
535 } else if (needsEncodingAtBOL(ch: mAccu)) {
536 mAccuNeedsEncoding = AtBOL;
537 } else {
538 // never needs encoding
539 mAccuNeedsEncoding = Never;
540 }
541
542 return true;
543}
544
545// Outputs processed (verbatim or hex-encoded) chars and inserts soft
546// line breaks as necessary. Depends on processNextChar's directions
547// on whether to encode the current char, and whether
548// the current char is the last one in it's input line:
549void QuotedPrintableEncoder::createOutputBuffer(char *&dcursor, const char *const dend)
550{
551 const int maxLineLength = 76; // rfc 2045
552
553 assert(d->outputBufferCursor == 0);
554
555 /* clang-format off */
556 bool lastOneOnThisLine = mSawLineEnd
557 && mInputBufferReadCursor == mInputBufferWriteCursor;
558 /* clang-format on */
559
560 int neededSpace = 1;
561 if (mAccuNeedsEncoding == Definitely) {
562 neededSpace = 3;
563 }
564
565 // reserve space for the soft hyphen (=)
566 if (!lastOneOnThisLine) {
567 neededSpace++;
568 }
569
570 if (mCurrentLineLength > maxLineLength - neededSpace) {
571 // current line too short, insert soft line break:
572 write(ch: '=', dcursor, dend);
573 writeCRLF(dcursor, dend);
574 mCurrentLineLength = 0;
575 }
576
577 if (Never == mAccuNeedsEncoding //
578 || (AtBOL == mAccuNeedsEncoding && mCurrentLineLength != 0)) {
579 write(ch: mAccu, dcursor, dend);
580 mCurrentLineLength++;
581 } else {
582 write(ch: '=', dcursor, dend);
583 write(ch: binToHex(value: highNibble(ch: mAccu)), dcursor, dend);
584 write(ch: binToHex(value: lowNibble(ch: mAccu)), dcursor, dend);
585 mCurrentLineLength += 3;
586 }
587}
588
589bool QuotedPrintableEncoder::encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend)
590{
591 // support probing by the caller:
592 if (mFinishing) {
593 return true;
594 }
595
596 while (scursor != send && dcursor != dend) {
597 if (d->outputBufferCursor && !flushOutputBuffer(dcursor, dend)) {
598 return scursor == send;
599 }
600
601 assert(d->outputBufferCursor == 0);
602
603 // fill input buffer until eol has been reached or until the
604 // buffer is full, whatever comes first:
605 fillInputBuffer(scursor, send);
606
607 if (processNextChar()) {
608 // there was one...
609 createOutputBuffer(dcursor, dend);
610 } else if (mSawLineEnd && mInputBufferWriteCursor == mInputBufferReadCursor) {
611 // load a hard line break into output buffer:
612 writeCRLF(dcursor, dend);
613 // signal fillInputBuffer() we are ready for the next line:
614 mSawLineEnd = false;
615 mCurrentLineLength = 0;
616 } else {
617 // we are supposedly finished with this input block:
618 break;
619 }
620 }
621
622 // make sure we write as much as possible and don't stop _writing_
623 // just because we have no more _input_:
624 if (d->outputBufferCursor) {
625 flushOutputBuffer(dcursor, dend);
626 }
627
628 return scursor == send;
629
630} // encode
631
632bool QuotedPrintableEncoder::finish(char *&dcursor, const char *const dend)
633{
634 mFinishing = true;
635
636 if (mFinished) {
637 return flushOutputBuffer(dcursor, dend);
638 }
639
640 while (dcursor != dend) {
641 if (d->outputBufferCursor && !flushOutputBuffer(dcursor, dend)) {
642 return false;
643 }
644
645 assert(d->outputBufferCursor == 0);
646
647 if (processNextChar()) {
648 // there was one...
649 createOutputBuffer(dcursor, dend);
650 } else if (mSawLineEnd && mInputBufferWriteCursor == mInputBufferReadCursor) {
651 // load a hard line break into output buffer:
652 writeCRLF(dcursor, dend);
653 mSawLineEnd = false;
654 mCurrentLineLength = 0;
655 } else {
656 mFinished = true;
657 return flushOutputBuffer(dcursor, dend);
658 }
659 }
660
661 return mFinished && !d->outputBufferCursor;
662
663} // finish
664
665bool Rfc2047QEncodingEncoder::encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend)
666{
667 if (mInsideFinishing) {
668 return true;
669 }
670
671 while (scursor != send && dcursor != dend) {
672 uchar value = 0;
673 switch (mStepNo) {
674 case 0:
675 // read the next char and decide if and how do encode:
676 mAccu = *scursor++;
677 if (!needsEncoding(ch: mAccu)) {
678 *dcursor++ = char(mAccu);
679 } else if (mEscapeChar == '=' && mAccu == 0x20) {
680 // shortcut encoding for 0x20 (latin-1/us-ascii SPACE)
681 // (not for rfc2231 encoding)
682 *dcursor++ = '_';
683 } else {
684 // needs =XY encoding - write escape char:
685 *dcursor++ = mEscapeChar;
686 mStepNo = 1;
687 }
688 continue;
689 case 1:
690 // extract hi-nibble:
691 value = highNibble(ch: mAccu);
692 mStepNo = 2;
693 break;
694 case 2:
695 // extract lo-nibble:
696 value = lowNibble(ch: mAccu);
697 mStepNo = 0;
698 break;
699 default:
700 assert(0);
701 }
702
703 // and write:
704 *dcursor++ = binToHex(value);
705 }
706
707 return scursor == send;
708} // encode
709
710bool Rfc2047QEncodingEncoder::finish(char *&dcursor, const char *const dend)
711{
712 mInsideFinishing = true;
713
714 // write the last bits of mAccu, if any:
715 while (mStepNo != 0 && dcursor != dend) {
716 uchar value = 0;
717 switch (mStepNo) {
718 case 1:
719 // extract hi-nibble:
720 value = highNibble(ch: mAccu);
721 mStepNo = 2;
722 break;
723 case 2:
724 // extract lo-nibble:
725 value = lowNibble(ch: mAccu);
726 mStepNo = 0;
727 break;
728 default:
729 assert(0);
730 }
731
732 // and write:
733 *dcursor++ = binToHex(value);
734 }
735
736 return mStepNo == 0;
737}
738
739} // namespace KCodecs
740

source code of kcodecs/src/kcodecsqp.cpp