1/*
2 SPDX-FileCopyrightText: 2002 Dave Corrie <kde@davecorrie.com>
3 SPDX-FileCopyrightText: 2014 Daniel Vrátil <dvratil@redhat.com>
4
5 SPDX-License-Identifier: LGPL-2.0-or-later
6*/
7
8#include "ktexttohtml.h"
9#include "kemoticonsparser_p.h"
10#include "ktexttohtml_p.h"
11
12#include <QCoreApplication>
13#include <QFile>
14#include <QRegularExpression>
15#include <QStringList>
16
17#include <limits.h>
18
19KTextToHTMLHelper::KTextToHTMLHelper(const QString &plainText, int pos, int maxUrlLen, int maxAddressLen)
20 : mText(plainText)
21 , mMaxUrlLen(maxUrlLen)
22 , mMaxAddressLen(maxAddressLen)
23 , mPos(pos)
24{
25}
26
27QString KTextToHTMLHelper::getEmailAddress()
28{
29 QString address;
30
31 if (mPos < mText.length() && mText.at(i: mPos) == QLatin1Char('@')) {
32 // the following characters are allowed in a dot-atom (RFC 2822):
33 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
34 const QString allowedSpecialChars = QStringLiteral(".!#$%&'*+-/=?^_`{|}~");
35
36 // determine the local part of the email address
37 int start = mPos - 1;
38 while (start >= 0 && mText.at(i: start).unicode() < 128
39 && (mText.at(i: start).isLetterOrNumber() //
40 || mText.at(i: start) == QLatin1Char('@') // allow @ to find invalid email addresses
41 || allowedSpecialChars.indexOf(c: mText.at(i: start)) != -1)) {
42 if (mText.at(i: start) == QLatin1Char('@')) {
43 return QString(); // local part contains '@' -> no email address
44 }
45 --start;
46 }
47 ++start;
48 // we assume that an email address starts with a letter or a digit
49 while ((start < mPos) && !mText.at(i: start).isLetterOrNumber()) {
50 ++start;
51 }
52 if (start == mPos) {
53 return QString(); // local part is empty -> no email address
54 }
55
56 // determine the domain part of the email address
57 int dotPos = INT_MAX;
58 int end = mPos + 1;
59 while (end < mText.length()
60 && (mText.at(i: end).isLetterOrNumber() //
61 || mText.at(i: end) == QLatin1Char('@') // allow @ to find invalid email addresses
62 || mText.at(i: end) == QLatin1Char('.') //
63 || mText.at(i: end) == QLatin1Char('-'))) {
64 if (mText.at(i: end) == QLatin1Char('@')) {
65 return QString(); // domain part contains '@' -> no email address
66 }
67 if (mText.at(i: end) == QLatin1Char('.')) {
68 dotPos = qMin(a: dotPos, b: end); // remember index of first dot in domain
69 }
70 ++end;
71 }
72 // we assume that an email address ends with a letter or a digit
73 while ((end > mPos) && !mText.at(i: end - 1).isLetterOrNumber()) {
74 --end;
75 }
76 if (end == mPos) {
77 return QString(); // domain part is empty -> no email address
78 }
79 if (dotPos >= end) {
80 return QString(); // domain part doesn't contain a dot
81 }
82
83 if (end - start > mMaxAddressLen) {
84 return QString(); // too long -> most likely no email address
85 }
86 address = mText.mid(position: start, n: end - start);
87
88 mPos = end - 1;
89 }
90 return address;
91}
92
93QString KTextToHTMLHelper::getPhoneNumber()
94{
95 if (!mText.at(i: mPos).isDigit() && mText.at(i: mPos) != QLatin1Char('+')) {
96 return {};
97 }
98
99 const QString allowedBeginSeparators = QStringLiteral(" \r\t\n:");
100 if (mPos > 0 && !allowedBeginSeparators.contains(c: mText.at(i: mPos - 1))) {
101 return {};
102 }
103
104 // this isn't 100% accurate, we filter stuff below that is too hard to capture with a regexp
105 static const QRegularExpression telPattern(QStringLiteral(R"([+0](( |( ?[/-] ?)?)\(?\d+\)?+){6,30})"));
106 const auto match = telPattern.match(subject: mText, offset: mPos, matchType: QRegularExpression::NormalMatch, matchOptions: QRegularExpression::AnchorAtOffsetMatchOption);
107 if (match.hasMatch()) {
108 QStringView matchedText = match.capturedView();
109 // check for maximum number of digits (15), see https://en.wikipedia.org/wiki/Telephone_numbering_plan
110 const int digitsCount = std::count_if(first: matchedText.cbegin(), last: matchedText.cend(), pred: [](const QChar c) {
111 return c.isDigit();
112 });
113
114 if (digitsCount > 15) {
115 return {};
116 }
117
118 // only one / is allowed, otherwise we trigger on dates
119 if (matchedText.count(c: QLatin1Char('/')) > 1) {
120 return {};
121 }
122
123 // parenthesis need to be balanced, and must not be nested
124 int openIdx = -1;
125 for (int i = 0, size = matchedText.size(); i < size; ++i) {
126 const QChar ch = matchedText.at(n: i);
127 if ((ch == QLatin1Char('(') && openIdx >= 0) || (ch == QLatin1Char(')') && openIdx < 0)) {
128 return {};
129 }
130
131 if (ch == QLatin1Char('(')) {
132 openIdx = i;
133 } else if (ch == QLatin1Char(')')) {
134 openIdx = -1;
135 }
136 }
137
138 if (openIdx > 0) {
139 matchedText.truncate(n: openIdx - 1);
140 matchedText = matchedText.trimmed();
141 }
142
143 // check if there's a plausible separator at the end
144 const int matchedTextLength = matchedText.size();
145 const int endIdx = mPos + matchedTextLength;
146 if (endIdx < mText.size() && !QStringView(u" \r\t\n,.").contains(c: mText.at(i: endIdx))) {
147 return {};
148 }
149
150 mPos += matchedTextLength - 1;
151 return matchedText.toString();
152 }
153 return {};
154}
155
156static QString normalizePhoneNumber(const QString &str)
157{
158 QString res;
159 res.reserve(asize: str.size());
160 for (const auto c : str) {
161 if (c.isDigit() || c == QLatin1Char('+')) {
162 res.push_back(c);
163 }
164 }
165 return res;
166}
167
// Non-alphanumeric characters that RFC 2822 permits in a dot-atom
// (besides a-z, A-Z and 0-9):
// . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
static constexpr char s_allowedSpecialChars[] = ".!#$%&'*+-/=?^_`{|}~";
171
172bool KTextToHTMLHelper::atUrl() const
173{
174 // The character directly before the URL must not be a letter, a number or
175 // any other character allowed in a dot-atom (RFC 2822).
176 if (mPos > 0) {
177 const auto chBefore = mText.at(i: mPos - 1);
178 if (chBefore.isLetterOrNumber() || QLatin1String(s_allowedSpecialChars).contains(c: chBefore)) {
179 return false;
180 }
181 }
182
183 const auto segment = QStringView(mText).mid(pos: mPos);
184 /* clang-format off */
185 return segment.startsWith(s: QLatin1String("http://"))
186 || segment.startsWith(s: QLatin1String("https://"))
187 || segment.startsWith(s: QLatin1String("vnc://"))
188 || segment.startsWith(s: QLatin1String("fish://"))
189 || segment.startsWith(s: QLatin1String("ftp://"))
190 || segment.startsWith(s: QLatin1String("ftps://"))
191 || segment.startsWith(s: QLatin1String("sftp://"))
192 || segment.startsWith(s: QLatin1String("smb://"))
193 || segment.startsWith(s: QLatin1String("irc://"))
194 || segment.startsWith(s: QLatin1String("ircs://"))
195 || segment.startsWith(s: QLatin1String("mailto:"))
196 || segment.startsWith(s: QLatin1String("www."))
197 || segment.startsWith(s: QLatin1String("ftp."))
198 || segment.startsWith(s: QLatin1String("file://"))
199 || segment.startsWith(s: QLatin1String("news:"))
200 || segment.startsWith(s: QLatin1String("tel:"))
201 || segment.startsWith(s: QLatin1String("xmpp:"));
202 /* clang-format on */
203}
204
205bool KTextToHTMLHelper::isEmptyUrl(const QString &url) const
206{
207 /* clang-format off */
208 return url.isEmpty()
209 || url == QLatin1String("http://")
210 || url == QLatin1String("https://")
211 || url == QLatin1String("fish://")
212 || url == QLatin1String("ftp://")
213 || url == QLatin1String("ftps://")
214 || url == QLatin1String("sftp://")
215 || url == QLatin1String("smb://")
216 || url == QLatin1String("vnc://")
217 || url == QLatin1String("irc://")
218 || url == QLatin1String("ircs://")
219 || url == QLatin1String("mailto")
220 || url == QLatin1String("mailto:")
221 || url == QLatin1String("www")
222 || url == QLatin1String("ftp")
223 || url == QLatin1String("news:")
224 || url == QLatin1String("news://")
225 || url == QLatin1String("tel")
226 || url == QLatin1String("tel:")
227 || url == QLatin1String("xmpp:");
228 /* clang-format on */
229}
230
231QString KTextToHTMLHelper::getUrl(bool *badurl)
232{
233 QString url;
234 if (atUrl()) {
235 // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C
236 // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall
237 // be allowed and should be ignored when the URI is extracted.
238
239 // This implementation follows this recommendation and
240 // allows the URL to be enclosed within different kind of brackets/quotes
241 // If an URL is enclosed, whitespace characters are allowed and removed, otherwise
242 // the URL ends with the first whitespace
243 // Also, if the URL is enclosed in brackets, the URL itself is not allowed
244 // to contain the closing bracket, as this would be detected as the end of the URL
245
246 QChar beforeUrl;
247 QChar afterUrl;
248
249 // detect if the url has been surrounded by brackets or quotes
250 if (mPos > 0) {
251 beforeUrl = mText.at(i: mPos - 1);
252
253 /*if ( beforeUrl == '(' ) {
254 afterUrl = ')';
255 } else */
256 if (beforeUrl == QLatin1Char('[')) {
257 afterUrl = QLatin1Char(']');
258 } else if (beforeUrl == QLatin1Char('<')) {
259 afterUrl = QLatin1Char('>');
260 } else if (beforeUrl == QLatin1Char('>')) { // for e.g. <link>http://.....</link>
261 afterUrl = QLatin1Char('<');
262 } else if (beforeUrl == QLatin1Char('"')) {
263 afterUrl = QLatin1Char('"');
264 }
265 }
266 url.reserve(asize: mMaxUrlLen); // avoid allocs
267 int start = mPos;
268 bool previousCharIsSpace = false;
269 bool previousCharIsADoubleQuote = false;
270 bool previousIsAnAnchor = false;
271 /* clang-format off */
272 while (mPos < mText.length() //
273 && (mText.at(i: mPos).isPrint() || mText.at(i: mPos).isSpace())
274 && ((afterUrl.isNull() && !mText.at(i: mPos).isSpace())
275 || (!afterUrl.isNull() && mText.at(i: mPos) != afterUrl))) {
276 if (!previousCharIsSpace
277 && mText.at(i: mPos) == QLatin1Char('<')
278 && (mPos + 1) < mText.length()) { /* clang-format on */
279 // Fix Bug #346132: allow "http://www.foo.bar<http://foo.bar/>"
280 // < inside a URL is not allowed, however there is a test which
281 // checks that "http://some<Host>/path" should be allowed
282 // Therefore: check if what follows is another URL and if so, stop here
283 mPos++;
284 if (atUrl()) {
285 mPos--;
286 break;
287 }
288 mPos--;
289 }
290 if (!previousCharIsSpace && (mText.at(i: mPos) == QLatin1Char(' ')) && ((mPos + 1) < mText.length())) {
291 // Fix kmail bug: allow "http://www.foo.bar http://foo.bar/"
292 // Therefore: check if what follows is another URL and if so, stop here
293 mPos++;
294 if (atUrl()) {
295 mPos--;
296 break;
297 }
298 mPos--;
299 }
300 if (mText.at(i: mPos).isSpace()) {
301 previousCharIsSpace = true;
302 } else if (!previousIsAnAnchor && mText.at(i: mPos) == QLatin1Char('[')) {
303 break;
304 } else if (!previousIsAnAnchor && mText.at(i: mPos) == QLatin1Char(']')) {
305 break;
306 } else { // skip whitespace
307 if (previousCharIsSpace && mText.at(i: mPos) == QLatin1Char('<')) {
308 url.append(c: QLatin1Char(' '));
309 break;
310 }
311 previousCharIsSpace = false;
312 if (mText.at(i: mPos) == QLatin1Char('>') && previousCharIsADoubleQuote) {
313 // it's an invalid url
314 if (badurl) {
315 *badurl = true;
316 }
317 return QString();
318 }
319 if (mText.at(i: mPos) == QLatin1Char('"')) {
320 previousCharIsADoubleQuote = true;
321 } else {
322 previousCharIsADoubleQuote = false;
323 }
324 if (mText.at(i: mPos) == QLatin1Char('#')) {
325 previousIsAnAnchor = true;
326 }
327 url.append(c: mText.at(i: mPos));
328 if (url.length() > mMaxUrlLen) {
329 break;
330 }
331 }
332
333 ++mPos;
334 }
335
336 if (isEmptyUrl(url) || (url.length() > mMaxUrlLen)) {
337 mPos = start;
338 url.clear();
339 return url;
340 } else {
341 --mPos;
342 }
343 }
344
345 // HACK: This is actually against the RFC. However, most people don't properly escape the URL in
346 // their text with "" or <>. That leads to people writing an url, followed immediately by
347 // a dot to finish the sentence. That would lead the parser to include the dot in the url,
348 // even though that is not wanted. So work around that here.
349 // Most real-life URLs hopefully don't end with dots or commas.
350 QString wordBoundaries = QStringLiteral(".,:!?>");
351 bool hasOpenParenthese = url.contains(c: QLatin1Char('('));
352 if (!hasOpenParenthese) {
353 wordBoundaries += QLatin1Char(')');
354 }
355
356 if (url.length() > 1) {
357 do {
358 const QChar charact{url.at(i: url.length() - 1)};
359 if (wordBoundaries.contains(c: charact)) {
360 url.chop(n: 1);
361 --mPos;
362 } else if (hasOpenParenthese && (charact == QLatin1Char(')'))) {
363 if (url.length() > 2) {
364 if (url.at(i: url.length() - 2) == QLatin1Char(')')) {
365 url.chop(n: 1);
366 --mPos;
367 hasOpenParenthese = false;
368 } else {
369 break;
370 }
371 } else {
372 break;
373 }
374 } else {
375 break;
376 }
377 } while (url.length() > 1);
378 }
379 return url;
380}
381
382QString KTextToHTMLHelper::highlightedText()
383{
384 // formating symbols must be prepended with a whitespace
385 if ((mPos > 0) && !mText.at(i: mPos - 1).isSpace()) {
386 return QString();
387 }
388
389 const QChar ch = mText.at(i: mPos);
390 if (ch != QLatin1Char('/') && ch != QLatin1Char('*') && ch != QLatin1Char('_') && ch != QLatin1Char('-')) {
391 return QString();
392 }
393
394 const QRegularExpression re(QStringLiteral("\\%1([^\\s|^\\%1].*[^\\s|^\\%1])\\%1").arg(a: ch), QRegularExpression::InvertedGreedinessOption);
395 const auto match =
396 re.match(subject: mText, offset: mPos, matchType: QRegularExpression::NormalMatch, matchOptions: QRegularExpression::AnchorAtOffsetMatchOption); // clazy:exclude=use-static-qregularexpression
397
398 if (match.hasMatch()) {
399 if (match.capturedStart() == mPos) {
400 int length = match.capturedLength();
401 // there must be a whitespace after the closing formating symbol
402 if (mPos + length < mText.length() && !mText.at(i: mPos + length).isSpace()) {
403 return QString();
404 }
405 mPos += length - 1;
406 switch (ch.toLatin1()) {
407 case '*':
408 return QLatin1String("<b>*") + match.capturedView(nth: 1) + QLatin1String("*</b>");
409 case '_':
410 return QLatin1String("<u>_") + match.capturedView(nth: 1) + QLatin1String("_</u>");
411 case '/':
412 return QLatin1String("<i>/") + match.capturedView(nth: 1) + QLatin1String("/</i>");
413 case '-':
414 return QLatin1String("<s>-") + match.capturedView(nth: 1) + QLatin1String("-</s>");
415 }
416 }
417 }
418 return QString();
419}
420
421QString KTextToHTML::convertToHtml(const QString &plainText, const KTextToHTML::Options &flags, int maxUrlLen, int maxAddressLen)
422{
423 KTextToHTMLHelper helper(plainText, 0, maxUrlLen, maxAddressLen);
424
425 QString str;
426 QString result(static_cast<QChar *>(nullptr), helper.mText.length() * 2);
427 QChar ch;
428 int x;
429 bool startOfLine = true;
430
431 for (helper.mPos = 0, x = 0; helper.mPos < helper.mText.length(); ++helper.mPos, ++x) {
432 ch = helper.mText.at(i: helper.mPos);
433 if (flags & PreserveSpaces) {
434 if (ch == QLatin1Char(' ')) {
435 if (helper.mPos + 1 < helper.mText.length()) {
436 if (helper.mText.at(i: helper.mPos + 1) != QLatin1Char(' ')) {
437 // A single space, make it breaking if not at the start or end of the line
438 const bool endOfLine = helper.mText.at(i: helper.mPos + 1) == QLatin1Char('\n');
439 if (!startOfLine && !endOfLine) {
440 result += QLatin1Char(' ');
441 } else {
442 result += QLatin1String("&nbsp;");
443 }
444 } else {
445 // Whitespace of more than one space, make it all non-breaking
446 while (helper.mPos < helper.mText.length() && helper.mText.at(i: helper.mPos) == QLatin1Char(' ')) {
447 result += QLatin1String("&nbsp;");
448 ++helper.mPos;
449 ++x;
450 }
451
452 // We incremented once to often, undo that
453 --helper.mPos;
454 --x;
455 }
456 } else {
457 // Last space in the text, it is non-breaking
458 result += QLatin1String("&nbsp;");
459 }
460
461 if (startOfLine) {
462 startOfLine = false;
463 }
464 continue;
465 } else if (ch == QLatin1Char('\t')) {
466 do {
467 result += QLatin1String("&nbsp;");
468 ++x;
469 } while ((x & 7) != 0);
470 --x;
471 startOfLine = false;
472 continue;
473 }
474 }
475 if (ch == QLatin1Char('\n')) {
476 result += QLatin1String("<br />\n"); // Keep the \n, so apps can figure out the quoting levels correctly.
477 startOfLine = true;
478 x = -1;
479 continue;
480 }
481
482 startOfLine = false;
483 if (ch == QLatin1Char('&')) {
484 result += QLatin1String("&amp;");
485 } else if (ch == QLatin1Char('"')) {
486 result += QLatin1String("&quot;");
487 } else if (ch == QLatin1Char('<')) {
488 result += QLatin1String("&lt;");
489 } else if (ch == QLatin1Char('>')) {
490 result += QLatin1String("&gt;");
491 } else {
492 const int start = helper.mPos;
493 if (!(flags & IgnoreUrls)) {
494 bool badUrl = false;
495 str = helper.getUrl(badurl: &badUrl);
496 if (badUrl) {
497 QString resultBadUrl;
498 for (const QChar chBadUrl : std::as_const(t&: helper.mText)) {
499 if (chBadUrl == QLatin1Char('&')) {
500 resultBadUrl += QLatin1String("&amp;");
501 } else if (chBadUrl == QLatin1Char('"')) {
502 resultBadUrl += QLatin1String("&quot;");
503 } else if (chBadUrl == QLatin1Char('<')) {
504 resultBadUrl += QLatin1String("&lt;");
505 } else if (chBadUrl == QLatin1Char('>')) {
506 resultBadUrl += QLatin1String("&gt;");
507 } else {
508 resultBadUrl += chBadUrl;
509 }
510 }
511 return resultBadUrl;
512 }
513 if (!str.isEmpty()) {
514 QString hyperlink;
515 if (str.startsWith(s: QLatin1String("www."))) {
516 hyperlink = QLatin1String("http://") + str;
517 } else if (str.startsWith(s: QLatin1String("ftp."))) {
518 hyperlink = QLatin1String("ftp://") + str;
519 } else {
520 hyperlink = str;
521 }
522 result += QLatin1String("<a href=\"") + hyperlink + QLatin1String("\">") + str.toHtmlEscaped() + QLatin1String("</a>");
523 x += helper.mPos - start;
524 continue;
525 }
526 str = helper.getEmailAddress();
527 if (!str.isEmpty()) {
528 // len is the length of the local part
529 int len = str.indexOf(c: QLatin1Char('@'));
530 QString localPart = str.left(n: len);
531
532 // remove the local part from the result (as '&'s have been expanded to
533 // &amp; we have to take care of the 4 additional characters per '&')
534 result.truncate(pos: result.length() - len - (localPart.count(c: QLatin1Char('&')) * 4));
535 x -= len;
536
537 result += QLatin1String("<a href=\"mailto:") + str + QLatin1String("\">") + str + QLatin1String("</a>");
538 x += str.length() - 1;
539 continue;
540 }
541 if (flags & ConvertPhoneNumbers) {
542 str = helper.getPhoneNumber();
543 if (!str.isEmpty()) {
544 result += QLatin1String("<a href=\"tel:") + normalizePhoneNumber(str) + QLatin1String("\">") + str + QLatin1String("</a>");
545 x += str.length() - 1;
546 continue;
547 }
548 }
549 }
550 if (flags & HighlightText) {
551 str = helper.highlightedText();
552 if (!str.isEmpty()) {
553 result += str;
554 x += helper.mPos - start;
555 continue;
556 }
557 }
558 result += ch;
559 }
560 }
561
562 if (flags & ReplaceSmileys) {
563 result = KEmoticonsParser::parseEmoticons(text: result);
564 }
565
566 return result;
567}
568

source code of kcoreaddons/src/lib/text/ktexttohtml.cpp