1/* -*- c++ -*-
2 SPDX-FileCopyrightText: 2002 Marc Mutz <mutz@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "kcodecsqp.h"
8#include "kcodecs_p.h"
9
10#include <QDebug>
11
12#include <cassert>
13
14using namespace KCodecs;
15
16namespace KCodecs
17{
18// none except a-zA-Z0-9!*+-/
19const uchar eTextMap[16] = {0x00, 0x00, 0x00, 0x00, 0x40, 0x35, 0xFF, 0xC0, 0x7F, 0xFF, 0xFF, 0xE0, 0x7F, 0xFF, 0xFF, 0xE0};
20
21// some helpful functions:
22
23/**
24 Converts a 4-bit @p value into its hexadecimal characater representation.
25 So input of value [0,15] returns ['0','1',... 'F']. Input values
26 greater than 15 will produce undesired results.
27 @param value is an unsigned character containing the 4-bit input value.
28*/
29static inline char binToHex(uchar value)
30{
31 if (value > 9) {
32 return value + 'A' - 10;
33 } else {
34 return value + '0';
35 }
36}
37
38/**
39 Returns the high-order 4 bits of an 8-bit value in another 8-bit value.
40 @param ch is an unsigned character containing the 8-bit input value.
41*/
42static inline uchar highNibble(uchar ch)
43{
44 return ch >> 4;
45}
46
47/**
48 Returns the low-order 4 bits of an 8-bit value in another 8-bit value.
49 @param ch is an unsigned character containing the 8-bit input value.
50*/
51static inline uchar lowNibble(uchar ch)
52{
53 return ch & 0xF;
54}
55
56//
57// QuotedPrintableCodec
58//
59
60class QuotedPrintableEncoder : public Encoder
61{
62 char mInputBuffer[16];
63 uchar mCurrentLineLength; // 0..76
64 uchar mAccu;
65 uint mInputBufferReadCursor : 4; // 0..15
66 uint mInputBufferWriteCursor : 4; // 0..15
67 enum {
68 Never,
69 AtBOL,
70 Definitely,
71 } mAccuNeedsEncoding;
72 bool mSawLineEnd : 1;
73 bool mSawCR : 1;
74 bool mFinishing : 1;
75 bool mFinished : 1;
76
77protected:
78 friend class QuotedPrintableCodec;
79 QuotedPrintableEncoder(Codec::NewlineType newline = Codec::NewlineLF)
80 : Encoder(newline)
81 , mCurrentLineLength(0)
82 , mAccu(0)
83 , mInputBufferReadCursor(0)
84 , mInputBufferWriteCursor(0)
85 , mAccuNeedsEncoding(Never)
86 , mSawLineEnd(false)
87 , mSawCR(false)
88 , mFinishing(false)
89 , mFinished(false)
90 {
91 }
92
93 bool needsEncoding(uchar ch)
94 {
95 return ch > '~' || (ch < ' ' && ch != '\t') || ch == '=';
96 }
97 bool needsEncodingAtEOL(uchar ch)
98 {
99 return ch == ' ' || ch == '\t';
100 }
101 bool needsEncodingAtBOL(uchar ch)
102 {
103 return ch == 'F' || ch == '.' || ch == '-';
104 }
105 bool fillInputBuffer(const char *&scursor, const char *const send);
106 bool processNextChar();
107 void createOutputBuffer(char *&dcursor, const char *const dend);
108
109public:
110 ~QuotedPrintableEncoder() override
111 {
112 }
113
114 bool encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) override;
115
116 bool finish(char *&dcursor, const char *const dend) override;
117};
118
119class QuotedPrintableDecoder : public Decoder
120{
121 const char mEscapeChar;
122 char mBadChar;
123 /** @p accu holds the msb nibble of the hexchar or zero. */
124 uchar mAccu;
125 /** @p insideHexChar is true iff we're inside an hexchar (=XY).
126 Together with @ref mAccu, we can build this states:
127 @li @p insideHexChar == @p false:
128 normal text
129 @li @p insideHexChar == @p true, @p mAccu == 0:
130 saw the leading '='
131 @li @p insideHexChar == @p true, @p mAccu != 0:
132 saw the first nibble '=X'
133 */
134 const bool mQEncoding;
135 bool mInsideHexChar;
136 bool mFlushing;
137 bool mExpectLF;
138 bool mHaveAccu;
139 /** @p mLastChar holds the first char of an encoded char, so that
140 we are able to keep the first char if the second char is invalid. */
141 char mLastChar;
142
143protected:
144 friend class QuotedPrintableCodec;
145 friend class Rfc2047QEncodingCodec;
146 friend class Rfc2231EncodingCodec;
147 QuotedPrintableDecoder(Codec::NewlineType newline = Codec::NewlineLF, bool aQEncoding = false, char aEscapeChar = '=')
148 : Decoder(newline)
149 , mEscapeChar(aEscapeChar)
150 , mBadChar(0)
151 , mAccu(0)
152 , mQEncoding(aQEncoding)
153 , mInsideHexChar(false)
154 , mFlushing(false)
155 , mExpectLF(false)
156 , mHaveAccu(false)
157 , mLastChar(0)
158 {
159 }
160
161public:
162 ~QuotedPrintableDecoder() override
163 {
164 }
165
166 bool decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) override;
167 bool finish(char *&dcursor, const char *const dend) override;
168};
169
170class Rfc2047QEncodingEncoder : public Encoder
171{
172 uchar mAccu;
173 uchar mStepNo;
174 const char mEscapeChar;
175 bool mInsideFinishing : 1;
176
177protected:
178 friend class Rfc2047QEncodingCodec;
179 friend class Rfc2231EncodingCodec;
180 Rfc2047QEncodingEncoder(Codec::NewlineType newline = Codec::NewlineLF, char aEscapeChar = '=')
181 : Encoder(newline)
182 , mAccu(0)
183 , mStepNo(0)
184 , mEscapeChar(aEscapeChar)
185 , mInsideFinishing(false)
186 {
187 // else an optimization in ::encode might break.
188 assert(aEscapeChar == '=' || aEscapeChar == '%');
189 }
190
191 bool isEText(uchar ch)
192 {
193 return (ch < 128) && (eTextMap[ch / 8] & 0x80 >> ch % 8);
194 }
195
196 // this code assumes that isEText( mEscapeChar ) == false!
197 bool needsEncoding(uchar ch)
198 {
199 if (ch > 'z') {
200 return true; // {|}~ DEL and 8bit chars need
201 }
202 if (!isEText(ch)) {
203 return true; // all but a-zA-Z0-9!/*+- need, too
204 }
205 if (mEscapeChar == '%' && (ch == '*' || ch == '/')) {
206 return true; // not allowed in rfc2231 encoding
207 }
208 return false;
209 }
210
211public:
212 ~Rfc2047QEncodingEncoder() override
213 {
214 }
215
216 bool encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) override;
217 bool finish(char *&dcursor, const char *const dend) override;
218};
219
220// this doesn't access any member variables, so it can be defined static
221// but then we can't call it from virtual functions
222static qsizetype QuotedPrintableDecoder_maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline)
223{
224 // all chars unencoded:
225 qsizetype result = insize;
226 // but maybe all of them are \n and we need to make them \r\n :-o
227 if (newline == Codec::NewlineCRLF) {
228 result += insize;
229 }
230
231 // there might be an accu plus escape
232 result += 2;
233
234 return result;
235}
236
237Encoder *QuotedPrintableCodec::makeEncoder(Codec::NewlineType newline) const
238{
239 return new QuotedPrintableEncoder(newline);
240}
241
242Decoder *QuotedPrintableCodec::makeDecoder(Codec::NewlineType newline) const
243{
244 return new QuotedPrintableDecoder(newline);
245}
246
247qsizetype QuotedPrintableCodec::maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline) const
248{
249 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, newline);
250}
251
252Encoder *Rfc2047QEncodingCodec::makeEncoder(Codec::NewlineType newline) const
253{
254 return new Rfc2047QEncodingEncoder(newline);
255}
256
257Decoder *Rfc2047QEncodingCodec::makeDecoder(Codec::NewlineType newline) const
258{
259 return new QuotedPrintableDecoder(newline, true);
260}
261
262qsizetype Rfc2047QEncodingCodec::maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline) const
263{
264 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, newline);
265}
266
267Encoder *Rfc2231EncodingCodec::makeEncoder(Codec::NewlineType newline) const
268{
269 return new Rfc2047QEncodingEncoder(newline, '%');
270}
271
272Decoder *Rfc2231EncodingCodec::makeDecoder(Codec::NewlineType newline) const
273{
274 return new QuotedPrintableDecoder(newline, true, '%');
275}
276
277qsizetype Rfc2231EncodingCodec::maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline) const
278{
279 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, newline);
280}
281
282/********************************************************/
283/********************************************************/
284/********************************************************/
285
286bool QuotedPrintableDecoder::decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend)
287{
288 if (d->newline == Codec::NewlineCRLF) {
289 qWarning() << "CRLF output for decoders isn't yet supported!";
290 }
291
292 while (scursor != send && dcursor != dend) {
293 if (mFlushing) {
294 // we have to flush chars in the aftermath of a decoding
295 // error. The way to request a flush is to
296 // - store the offending character in mBadChar and
297 // - set mFlushing to true.
298 // The supported cases are (H: hexchar, X: bad char):
299 // =X, =HX, CR
300 // mBadChar is only written out if it is not by itself illegal in
301 // quoted-printable (e.g. CTLs, 8Bits).
302 // A fast way to suppress mBadChar output is to set it to NUL.
303 if (mInsideHexChar) {
304 // output '='
305 *dcursor++ = mEscapeChar;
306 mInsideHexChar = false;
307 } else if (mHaveAccu) {
308 // output the high nibble of the accumulator:
309 *dcursor++ = mLastChar;
310 mHaveAccu = false;
311 mAccu = 0;
312 } else {
313 // output mBadChar
314 assert(mAccu == 0);
315 if (mBadChar) {
316 if (mBadChar == '=') {
317 mInsideHexChar = true;
318 } else {
319 *dcursor++ = mBadChar;
320 }
321 mBadChar = 0;
322 }
323 mFlushing = false;
324 }
325 continue;
326 }
327 assert(mBadChar == 0);
328
329 uchar ch = *scursor++;
330
331 if (mExpectLF && ch != '\n') {
332 // qWarning() << "QuotedPrintableDecoder:"
333 // "illegally formed soft linebreak or lonely CR!";
334 mInsideHexChar = false;
335 mExpectLF = false;
336 if (mAccu != 0) {
337 return false;
338 }
339 }
340
341 if (mInsideHexChar) {
342 uchar value = 255;
343 // next char(s) represent nibble instead of itself:
344 if (ch <= '9') {
345 if (ch >= '0') {
346 value = ch - '0';
347 } else {
348 switch (ch) {
349 case '\r':
350 mExpectLF = true;
351 break;
352 case '\n':
353 // soft line break, but only if mAccu is NUL.
354 if (!mHaveAccu) {
355 mExpectLF = false;
356 mInsideHexChar = false;
357 break;
358 }
359 // else fall through
360 default:
361 // qWarning() << "QuotedPrintableDecoder:"
362 // "illegally formed hex char! Outputting verbatim.";
363 mBadChar = ch;
364 mFlushing = true;
365 }
366 continue;
367 }
368 } else { // ch > '9'
369 if (ch <= 'F') {
370 if (ch >= 'A') {
371 value = 10 + ch - 'A';
372 } else { // [:-@]
373 mBadChar = ch;
374 mFlushing = true;
375 continue;
376 }
377 } else { // ch > 'F'
378 if (ch <= 'f' && ch >= 'a') {
379 value = 10 + ch - 'a';
380 } else {
381 mBadChar = ch;
382 mFlushing = true;
383 continue;
384 }
385 }
386 }
387
388 assert(value < 16);
389 assert(mBadChar == 0);
390 assert(!mExpectLF);
391
392 if (mHaveAccu) {
393 *dcursor++ = char(mAccu | value);
394 mAccu = 0;
395 mHaveAccu = false;
396 mInsideHexChar = false;
397 } else {
398 mHaveAccu = true;
399 mAccu = value << 4;
400 mLastChar = ch;
401 }
402 } else { // not mInsideHexChar
403 if ((ch <= '~' && ch >= ' ') || ch == '\t') {
404 if (ch == mEscapeChar) {
405 mInsideHexChar = true;
406 } else if (mQEncoding && ch == '_') {
407 *dcursor++ = char(0x20);
408 } else {
409 *dcursor++ = char(ch);
410 }
411 } else if (ch == '\n') {
412 *dcursor++ = '\n';
413 mExpectLF = false;
414 } else if (ch == '\r') {
415 mExpectLF = true;
416 } else {
417 // qWarning() << "QuotedPrintableDecoder:" << ch <<
418 // "illegal character in input stream!";
419 *dcursor++ = char(ch);
420 }
421 }
422 }
423
424 return scursor == send;
425}
426
427bool QuotedPrintableDecoder::finish(char *&dcursor, const char *const dend)
428{
429 while ((mInsideHexChar || mHaveAccu || mFlushing) && dcursor != dend) {
430 // we have to flush chars
431 if (mInsideHexChar) {
432 // output '='
433 *dcursor++ = mEscapeChar;
434 mInsideHexChar = false;
435 } else if (mHaveAccu) {
436 // output the high nibble of the accumulator:
437 *dcursor++ = mLastChar;
438 mHaveAccu = false;
439 mAccu = 0;
440 } else {
441 // output mBadChar
442 assert(mAccu == 0);
443 if (mBadChar) {
444 *dcursor++ = mBadChar;
445 mBadChar = 0;
446 }
447 mFlushing = false;
448 }
449 }
450
451 // return false if we are not finished yet; note that mInsideHexChar is always false
452 return !(mHaveAccu || mFlushing);
453}
454
455bool QuotedPrintableEncoder::fillInputBuffer(const char *&scursor, const char *const send)
456{
457 // Don't read more if there's still a tail of a line in the buffer:
458 if (mSawLineEnd) {
459 return true;
460 }
461
462 // Read until the buffer is full or we have found CRLF or LF (which
463 // don't end up in the input buffer):
464 for (; (mInputBufferWriteCursor + 1) % 16 != mInputBufferReadCursor && scursor != send; mInputBufferWriteCursor++) {
465 char ch = *scursor++;
466 if (ch == '\r') {
467 mSawCR = true;
468 } else if (ch == '\n') {
469 // remove the CR from the input buffer (if any) and return that
470 // we found a line ending:
471 if (mSawCR) {
472 mSawCR = false;
473 assert(mInputBufferWriteCursor != mInputBufferReadCursor);
474 mInputBufferWriteCursor--;
475 }
476 mSawLineEnd = true;
477 return true; // saw CRLF or LF
478 } else {
479 mSawCR = false;
480 }
481 mInputBuffer[mInputBufferWriteCursor] = ch;
482 }
483 mSawLineEnd = false;
484 return false; // didn't see a line ending...
485}
486
487bool QuotedPrintableEncoder::processNextChar()
488{
489 // If we process a buffer which doesn't end in a line break, we
490 // can't process all of it, since the next chars that will be read
491 // could be a line break. So we empty the buffer only until a fixed
492 // number of chars is left (except when mFinishing, which means that
493 // the data doesn't end in newline):
494 const int minBufferFillWithoutLineEnd = 4;
495
496 assert(d->outputBufferCursor == 0);
497
498 int bufferFill = int(mInputBufferWriteCursor) - int(mInputBufferReadCursor);
499 if (bufferFill < 0) {
500 bufferFill += 16;
501 }
502
503 assert(bufferFill >= 0 && bufferFill <= 15);
504
505 if (!mFinishing //
506 && !mSawLineEnd //
507 && bufferFill < minBufferFillWithoutLineEnd) {
508 return false;
509 }
510
511 // buffer is empty, return false:
512 if (mInputBufferReadCursor == mInputBufferWriteCursor) {
513 return false;
514 }
515
516 // Real processing goes here:
517 mAccu = mInputBuffer[mInputBufferReadCursor++];
518 if (needsEncoding(ch: mAccu)) { // always needs encoding or
519 mAccuNeedsEncoding = Definitely;
520 } else if ((mSawLineEnd || mFinishing) // needs encoding at end of line
521 && bufferFill == 1 // or end of buffer
522 && needsEncodingAtEOL(ch: mAccu)) {
523 mAccuNeedsEncoding = Definitely;
524 } else if (needsEncodingAtBOL(ch: mAccu)) {
525 mAccuNeedsEncoding = AtBOL;
526 } else {
527 // never needs encoding
528 mAccuNeedsEncoding = Never;
529 }
530
531 return true;
532}
533
534// Outputs processed (verbatim or hex-encoded) chars and inserts soft
535// line breaks as necessary. Depends on processNextChar's directions
536// on whether to encode the current char, and whether
537// the current char is the last one in it's input line:
538void QuotedPrintableEncoder::createOutputBuffer(char *&dcursor, const char *const dend)
539{
540 const int maxLineLength = 76; // rfc 2045
541
542 assert(d->outputBufferCursor == 0);
543
544 /* clang-format off */
545 bool lastOneOnThisLine = mSawLineEnd
546 && mInputBufferReadCursor == mInputBufferWriteCursor;
547 /* clang-format on */
548
549 int neededSpace = 1;
550 if (mAccuNeedsEncoding == Definitely) {
551 neededSpace = 3;
552 }
553
554 // reserve space for the soft hyphen (=)
555 if (!lastOneOnThisLine) {
556 neededSpace++;
557 }
558
559 if (mCurrentLineLength > maxLineLength - neededSpace) {
560 // current line too short, insert soft line break:
561 write(ch: '=', dcursor, dend);
562 writeCRLF(dcursor, dend);
563 mCurrentLineLength = 0;
564 }
565
566 if (Never == mAccuNeedsEncoding //
567 || (AtBOL == mAccuNeedsEncoding && mCurrentLineLength != 0)) {
568 write(ch: mAccu, dcursor, dend);
569 mCurrentLineLength++;
570 } else {
571 write(ch: '=', dcursor, dend);
572 write(ch: binToHex(value: highNibble(ch: mAccu)), dcursor, dend);
573 write(ch: binToHex(value: lowNibble(ch: mAccu)), dcursor, dend);
574 mCurrentLineLength += 3;
575 }
576}
577
578bool QuotedPrintableEncoder::encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend)
579{
580 // support probing by the caller:
581 if (mFinishing) {
582 return true;
583 }
584
585 while (scursor != send && dcursor != dend) {
586 if (d->outputBufferCursor && !flushOutputBuffer(dcursor, dend)) {
587 return scursor == send;
588 }
589
590 assert(d->outputBufferCursor == 0);
591
592 // fill input buffer until eol has been reached or until the
593 // buffer is full, whatever comes first:
594 fillInputBuffer(scursor, send);
595
596 if (processNextChar()) {
597 // there was one...
598 createOutputBuffer(dcursor, dend);
599 } else if (mSawLineEnd && mInputBufferWriteCursor == mInputBufferReadCursor) {
600 // load a hard line break into output buffer:
601 writeCRLF(dcursor, dend);
602 // signal fillInputBuffer() we are ready for the next line:
603 mSawLineEnd = false;
604 mCurrentLineLength = 0;
605 } else {
606 // we are supposedly finished with this input block:
607 break;
608 }
609 }
610
611 // make sure we write as much as possible and don't stop _writing_
612 // just because we have no more _input_:
613 if (d->outputBufferCursor) {
614 flushOutputBuffer(dcursor, dend);
615 }
616
617 return scursor == send;
618
619} // encode
620
621bool QuotedPrintableEncoder::finish(char *&dcursor, const char *const dend)
622{
623 mFinishing = true;
624
625 if (mFinished) {
626 return flushOutputBuffer(dcursor, dend);
627 }
628
629 while (dcursor != dend) {
630 if (d->outputBufferCursor && !flushOutputBuffer(dcursor, dend)) {
631 return false;
632 }
633
634 assert(d->outputBufferCursor == 0);
635
636 if (processNextChar()) {
637 // there was one...
638 createOutputBuffer(dcursor, dend);
639 } else if (mSawLineEnd && mInputBufferWriteCursor == mInputBufferReadCursor) {
640 // load a hard line break into output buffer:
641 writeCRLF(dcursor, dend);
642 mSawLineEnd = false;
643 mCurrentLineLength = 0;
644 } else {
645 mFinished = true;
646 return flushOutputBuffer(dcursor, dend);
647 }
648 }
649
650 return mFinished && !d->outputBufferCursor;
651
652} // finish
653
654bool Rfc2047QEncodingEncoder::encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend)
655{
656 if (mInsideFinishing) {
657 return true;
658 }
659
660 while (scursor != send && dcursor != dend) {
661 uchar value = 0;
662 switch (mStepNo) {
663 case 0:
664 // read the next char and decide if and how do encode:
665 mAccu = *scursor++;
666 if (!needsEncoding(ch: mAccu)) {
667 *dcursor++ = char(mAccu);
668 } else if (mEscapeChar == '=' && mAccu == 0x20) {
669 // shortcut encoding for 0x20 (latin-1/us-ascii SPACE)
670 // (not for rfc2231 encoding)
671 *dcursor++ = '_';
672 } else {
673 // needs =XY encoding - write escape char:
674 *dcursor++ = mEscapeChar;
675 mStepNo = 1;
676 }
677 continue;
678 case 1:
679 // extract hi-nibble:
680 value = highNibble(ch: mAccu);
681 mStepNo = 2;
682 break;
683 case 2:
684 // extract lo-nibble:
685 value = lowNibble(ch: mAccu);
686 mStepNo = 0;
687 break;
688 default:
689 assert(0);
690 }
691
692 // and write:
693 *dcursor++ = binToHex(value);
694 }
695
696 return scursor == send;
697} // encode
698
699bool Rfc2047QEncodingEncoder::finish(char *&dcursor, const char *const dend)
700{
701 mInsideFinishing = true;
702
703 // write the last bits of mAccu, if any:
704 while (mStepNo != 0 && dcursor != dend) {
705 uchar value = 0;
706 switch (mStepNo) {
707 case 1:
708 // extract hi-nibble:
709 value = highNibble(ch: mAccu);
710 mStepNo = 2;
711 break;
712 case 2:
713 // extract lo-nibble:
714 value = lowNibble(ch: mAccu);
715 mStepNo = 0;
716 break;
717 default:
718 assert(0);
719 }
720
721 // and write:
722 *dcursor++ = binToHex(value);
723 }
724
725 return mStepNo == 0;
726}
727
728} // namespace KCodecs
729

source code of kcodecs/src/kcodecsqp.cpp