ktexttohtml.cpp source code [kcoreaddons/src/lib/text/ktexttohtml.cpp]

1	/*
2	SPDX-FileCopyrightText: 2002 Dave Corrie <kde@davecorrie.com>
3	SPDX-FileCopyrightText: 2014 Daniel Vrátil <dvratil@redhat.com>
4
5	SPDX-License-Identifier: LGPL-2.0-or-later
6	*/
7
8	#include "ktexttohtml.h"
9	#include "kemoticonsparser_p.h"
10	#include "ktexttohtml_p.h"
11
12	#include <QCoreApplication>
13	#include <QFile>
14	#include <QRegularExpression>
15	#include <QStringList>
16
17	#include <limits.h>
18
19	KTextToHTMLHelper::KTextToHTMLHelper(const QString &plainText, int pos, int maxUrlLen, int maxAddressLen)
20	: mText(plainText)
21	, mMaxUrlLen(maxUrlLen)
22	, mMaxAddressLen(maxAddressLen)
23	, mPos(pos)
24	{
25	}
26
27	QString KTextToHTMLHelper::getEmailAddress()
28	{
29	QString address;
30
31	if (mPos < mText.length() && mText.at(i: mPos) == QLatin1Char(`'@'`)) {
32	// the following characters are allowed in a dot-atom (RFC 2822):
33	// a-z A-Z 0-9 . ! # $ % & ' + - / = ? ^ _ ` { \| } ~*
34	const QString allowedSpecialChars = QStringLiteral(".!#$%&'*+-/=?^_`{\|}~");
35
36	// determine the local part of the email address
37	int start = mPos - `1`;
38	while (start >= `0` && mText.at(i: start).unicode() < `128`
39	&& (mText.at(i: start).isLetterOrNumber() //
40	\|\| mText.at(i: start) == QLatin1Char(`'@'`) // allow @ to find invalid email addresses
41	\|\| allowedSpecialChars.indexOf(c: mText.at(i: start)) != -`1`)) {
42	if (mText.at(i: start) == QLatin1Char(`'@'`)) {
43	return QString(); // local part contains '@' -> no email address
44	}
45	--start;
46	}
47	++start;
48	// we assume that an email address starts with a letter or a digit
49	while ((start < mPos) && !mText.at(i: start).isLetterOrNumber()) {
50	++start;
51	}
52	if (start == mPos) {
53	return QString(); // local part is empty -> no email address
54	}
55
56	// determine the domain part of the email address
57	int dotPos = INT_MAX;
58	int end = mPos + `1`;
59	while (end < mText.length()
60	&& (mText.at(i: end).isLetterOrNumber() //
61	\|\| mText.at(i: end) == QLatin1Char(`'@'`) // allow @ to find invalid email addresses
62	\|\| mText.at(i: end) == QLatin1Char(`'.'`) //
63	\|\| mText.at(i: end) == QLatin1Char(`'-'`))) {
64	if (mText.at(i: end) == QLatin1Char(`'@'`)) {
65	return QString(); // domain part contains '@' -> no email address
66	}
67	if (mText.at(i: end) == QLatin1Char(`'.'`)) {
68	dotPos = qMin(a: dotPos, b: end); // remember index of first dot in domain
69	}
70	++end;
71	}
72	// we assume that an email address ends with a letter or a digit
73	while ((end > mPos) && !mText.at(i: end - `1`).isLetterOrNumber()) {
74	--end;
75	}
76	if (end == mPos) {
77	return QString(); // domain part is empty -> no email address
78	}
79	if (dotPos >= end) {
80	return QString(); // domain part doesn't contain a dot
81	}
82
83	if (end - start > mMaxAddressLen) {
84	return QString(); // too long -> most likely no email address
85	}
86	address = mText.mid(position: start, n: end - start);
87
88	mPos = end - `1`;
89	}
90	return address;
91	}
92
93	QString KTextToHTMLHelper::getPhoneNumber()
94	{
95	if (!mText.at(i: mPos).isDigit() && mText.at(i: mPos) != QLatin1Char(`'+'`)) {
96	return {};
97	}
98
99	const QString allowedBeginSeparators = QStringLiteral(" \r\t\n:");
100	if (mPos > `0` && !allowedBeginSeparators.contains(c: mText.at(i: mPos - `1`))) {
101	return {};
102	}
103
104	// this isn't 100% accurate, we filter stuff below that is too hard to capture with a regexp
105	static const QRegularExpression telPattern(QStringLiteral(R"([+0](( \|( ?[/-] ?)?)$?\d+$?+){6,30})"));
106	const auto match = telPattern.match(subject: mText, offset: mPos, matchType: QRegularExpression::NormalMatch, matchOptions: QRegularExpression::AnchorAtOffsetMatchOption);
107	if (match.hasMatch()) {
108	QStringView matchedText = match.capturedView();
109	// check for maximum number of digits (15), see https://en.wikipedia.org/wiki/Telephone_numbering_plan
110	const int digitsCount = std::count_if(first: matchedText.cbegin(), last: matchedText.cend(), pred: [](const QChar c) {
111	return c.isDigit();
112	});
113
114	if (digitsCount > `15`) {
115	return {};
116	}
117
118	// only one / is allowed, otherwise we trigger on dates
119	if (matchedText.count(c: QLatin1Char(`'/'`)) > `1`) {
120	return {};
121	}
122
123	// parenthesis need to be balanced, and must not be nested
124	int openIdx = -`1`;
125	for (int i = `0`, size = matchedText.size(); i < size; ++i) {
126	const QChar ch = matchedText.at(n: i);
127	if ((ch == QLatin1Char(`'('`) && openIdx >= `0`) \|\| (ch == QLatin1Char(`')'`) && openIdx < `0`)) {
128	return {};
129	}
130
131	if (ch == QLatin1Char(`'('`)) {
132	openIdx = i;
133	} else if (ch == QLatin1Char(`')'`)) {
134	openIdx = -`1`;
135	}
136	}
137
138	if (openIdx > `0`) {
139	matchedText.truncate(n: openIdx - `1`);
140	matchedText = matchedText.trimmed();
141	}
142
143	// check if there's a plausible separator at the end
144	const int matchedTextLength = matchedText.size();
145	const int endIdx = mPos + matchedTextLength;
146	if (endIdx < mText.size() && !QStringView(u" \r\t\n,.").contains(c: mText.at(i: endIdx))) {
147	return {};
148	}
149
150	mPos += matchedTextLength - `1`;
151	return matchedText.toString();
152	}
153	return {};
154	}
155
156	static QString normalizePhoneNumber(const QString &str)
157	{
158	QString res;
159	res.reserve(asize: str.size());
160	for (const auto c : str) {
161	if (c.isDigit() \|\| c == QLatin1Char(`'+'`)) {
162	res.push_back(c);
163	}
164	}
165	return res;
166	}
167
// The following characters are allowed in a dot-atom (RFC 2822):
// a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
// Only the non-alphanumeric ones are listed here.
static const char s_allowedSpecialChars[] = ".!#$%&'*+-/=?^_`{|}~";
171
172	bool KTextToHTMLHelper::atUrl() const
173	{
174	// The character directly before the URL must not be a letter, a number or
175	// any other character allowed in a dot-atom (RFC 2822).
176	if (mPos > `0`) {
177	const auto chBefore = mText.at(i: mPos - `1`);
178	if (chBefore.isLetterOrNumber() \|\| QLatin1String(s_allowedSpecialChars).contains(c: chBefore)) {
179	return false;
180	}
181	}
182
183	const auto segment = QStringView(mText).mid(pos: mPos);
184	/ clang-format off /
185	return segment.startsWith(s: QLatin1String("http://"))
186	\|\| segment.startsWith(s: QLatin1String("https://"))
187	\|\| segment.startsWith(s: QLatin1String("vnc://"))
188	\|\| segment.startsWith(s: QLatin1String("fish://"))
189	\|\| segment.startsWith(s: QLatin1String("ftp://"))
190	\|\| segment.startsWith(s: QLatin1String("ftps://"))
191	\|\| segment.startsWith(s: QLatin1String("sftp://"))
192	\|\| segment.startsWith(s: QLatin1String("smb://"))
193	\|\| segment.startsWith(s: QLatin1String("irc://"))
194	\|\| segment.startsWith(s: QLatin1String("ircs://"))
195	\|\| segment.startsWith(s: QLatin1String("mailto:"))
196	\|\| segment.startsWith(s: QLatin1String("www."))
197	\|\| segment.startsWith(s: QLatin1String("ftp."))
198	\|\| segment.startsWith(s: QLatin1String("file://"))
199	\|\| segment.startsWith(s: QLatin1String("news:"))
200	\|\| segment.startsWith(s: QLatin1String("tel:"))
201	\|\| segment.startsWith(s: QLatin1String("xmpp:"));
202	/ clang-format on /
203	}
204
205	bool KTextToHTMLHelper::isEmptyUrl(const QString &url) const
206	{
207	/ clang-format off /
208	return url.isEmpty()
209	\|\| url == QLatin1String("http://")
210	\|\| url == QLatin1String("https://")
211	\|\| url == QLatin1String("fish://")
212	\|\| url == QLatin1String("ftp://")
213	\|\| url == QLatin1String("ftps://")
214	\|\| url == QLatin1String("sftp://")
215	\|\| url == QLatin1String("smb://")
216	\|\| url == QLatin1String("vnc://")
217	\|\| url == QLatin1String("irc://")
218	\|\| url == QLatin1String("ircs://")
219	\|\| url == QLatin1String("mailto")
220	\|\| url == QLatin1String("mailto:")
221	\|\| url == QLatin1String("www")
222	\|\| url == QLatin1String("ftp")
223	\|\| url == QLatin1String("news:")
224	\|\| url == QLatin1String("news://")
225	\|\| url == QLatin1String("tel")
226	\|\| url == QLatin1String("tel:")
227	\|\| url == QLatin1String("xmpp:");
228	/ clang-format on /
229	}
230
231	QString KTextToHTMLHelper::getUrl(bool *badurl)
232	{
233	QString url;
234	if (atUrl()) {
235	// NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C
236	// Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall
237	// be allowed and should be ignored when the URI is extracted.
238
239	// This implementation follows this recommendation and
240	// allows the URL to be enclosed within different kind of brackets/quotes
241	// If an URL is enclosed, whitespace characters are allowed and removed, otherwise
242	// the URL ends with the first whitespace
243	// Also, if the URL is enclosed in brackets, the URL itself is not allowed
244	// to contain the closing bracket, as this would be detected as the end of the URL
245
246	QChar beforeUrl;
247	QChar afterUrl;
248
249	// detect if the url has been surrounded by brackets or quotes
250	if (mPos > `0`) {
251	beforeUrl = mText.at(i: mPos - `1`);
252
253	/if ( beforeUrl == '(' ) {*
254	afterUrl = ')';
255	} else /*
256	if (beforeUrl == QLatin1Char(`'['`)) {
257	afterUrl = QLatin1Char(`']'`);
258	} else if (beforeUrl == QLatin1Char(`'<'`)) {
259	afterUrl = QLatin1Char(`'>'`);
260	} else if (beforeUrl == QLatin1Char(`'>'`)) { // for e.g. <link>http://.....</link>
261	afterUrl = QLatin1Char(`'<'`);
262	} else if (beforeUrl == QLatin1Char(`'"'`)) {
263	afterUrl = QLatin1Char(`'"'`);
264	}
265	}
266	url.reserve(asize: mMaxUrlLen); // avoid allocs
267	int start = mPos;
268	bool previousCharIsSpace = false;
269	bool previousCharIsADoubleQuote = false;
270	bool previousIsAnAnchor = false;
271	/ clang-format off /
272	while (mPos < mText.length() //
273	&& (mText.at(i: mPos).isPrint() \|\| mText.at(i: mPos).isSpace())
274	&& ((afterUrl.isNull() && !mText.at(i: mPos).isSpace())
275	\|\| (!afterUrl.isNull() && mText.at(i: mPos) != afterUrl))) {
276	if (!previousCharIsSpace
277	&& mText.at(i: mPos) == QLatin1Char(`'<'`)
278	&& (mPos + `1`) < mText.length()) { / clang-format on /
279	// Fix Bug #346132: allow "http://www.foo.bar<http://foo.bar/>"
280	// < inside a URL is not allowed, however there is a test which
281	// checks that "http://some<Host>/path" should be allowed
282	// Therefore: check if what follows is another URL and if so, stop here
283	mPos++;
284	if (atUrl()) {
285	mPos--;
286	break;
287	}
288	mPos--;
289	}
290	if (!previousCharIsSpace && (mText.at(i: mPos) == QLatin1Char(`' '`)) && ((mPos + `1`) < mText.length())) {
291	// Fix kmail bug: allow "http://www.foo.bar http://foo.bar/"
292	// Therefore: check if what follows is another URL and if so, stop here
293	mPos++;
294	if (atUrl()) {
295	mPos--;
296	break;
297	}
298	mPos--;
299	}
300	if (mText.at(i: mPos).isSpace()) {
301	previousCharIsSpace = true;
302	} else if (!previousIsAnAnchor && mText.at(i: mPos) == QLatin1Char(`'['`)) {
303	break;
304	} else if (!previousIsAnAnchor && mText.at(i: mPos) == QLatin1Char(`']'`)) {
305	break;
306	} else { // skip whitespace
307	if (previousCharIsSpace && mText.at(i: mPos) == QLatin1Char(`'<'`)) {
308	url.append(c: QLatin1Char(`' '`));
309	break;
310	}
311	previousCharIsSpace = false;
312	if (mText.at(i: mPos) == QLatin1Char(`'>'`) && previousCharIsADoubleQuote) {
313	// it's an invalid url
314	if (badurl) {
315	badurl = true*;
316	}
317	return QString();
318	}
319	if (mText.at(i: mPos) == QLatin1Char(`'"'`)) {
320	previousCharIsADoubleQuote = true;
321	} else {
322	previousCharIsADoubleQuote = false;
323	}
324	if (mText.at(i: mPos) == QLatin1Char(`'#'`)) {
325	previousIsAnAnchor = true;
326	}
327	url.append(c: mText.at(i: mPos));
328	if (url.length() > mMaxUrlLen) {
329	break;
330	}
331	}
332
333	++mPos;
334	}
335
336	if (isEmptyUrl(url) \|\| (url.length() > mMaxUrlLen)) {
337	mPos = start;
338	url.clear();
339	return url;
340	} else {
341	--mPos;
342	}
343	}
344
345	// HACK: This is actually against the RFC. However, most people don't properly escape the URL in
346	// their text with "" or <>. That leads to people writing an url, followed immediately by
347	// a dot to finish the sentence. That would lead the parser to include the dot in the url,
348	// even though that is not wanted. So work around that here.
349	// Most real-life URLs hopefully don't end with dots or commas.
350	QString wordBoundaries = QStringLiteral(".,:!?>");
351	bool hasOpenParenthese = url.contains(c: QLatin1Char(`'('`));
352	if (!hasOpenParenthese) {
353	wordBoundaries += QLatin1Char(`')'`);
354	}
355
356	if (url.length() > `1`) {
357	do {
358	const QChar charact{url.at(i: url.length() - `1`)};
359	if (wordBoundaries.contains(c: charact)) {
360	url.chop(n: `1`);
361	--mPos;
362	} else if (hasOpenParenthese && (charact == QLatin1Char(`')'`))) {
363	if (url.length() > `2`) {
364	if (url.at(i: url.length() - `2`) == QLatin1Char(`')'`)) {
365	url.chop(n: `1`);
366	--mPos;
367	hasOpenParenthese = false;
368	} else {
369	break;
370	}
371	} else {
372	break;
373	}
374	} else {
375	break;
376	}
377	} while (url.length() > `1`);
378	}
379	return url;
380	}
381
382	QString KTextToHTMLHelper::highlightedText()
383	{
384	// formating symbols must be prepended with a whitespace
385	if ((mPos > `0`) && !mText.at(i: mPos - `1`).isSpace()) {
386	return QString();
387	}
388
389	const QChar ch = mText.at(i: mPos);
390	if (ch != QLatin1Char(`'/'`) && ch != QLatin1Char(`'*'`) && ch != QLatin1Char(`'_'`) && ch != QLatin1Char(`'-'`)) {
391	return QString();
392	}
393
394	const QRegularExpression re(QStringLiteral("\\%1([^\\s\|^\\%1].*[^\\s\|^\\%1])\\%1").arg(a: ch), QRegularExpression::InvertedGreedinessOption);
395	const auto match =
396	re.match(subject: mText, offset: mPos, matchType: QRegularExpression::NormalMatch, matchOptions: QRegularExpression::AnchorAtOffsetMatchOption); // clazy:exclude=use-static-qregularexpression
397
398	if (match.hasMatch()) {
399	if (match.capturedStart() == mPos) {
400	int length = match.capturedLength();
401	// there must be a whitespace after the closing formating symbol
402	if (mPos + length < mText.length() && !mText.at(i: mPos + length).isSpace()) {
403	return QString();
404	}
405	mPos += length - `1`;
406	switch (ch.toLatin1()) {
407	case `'*'`:
408	return QLatin1String("<b>") + match.capturedView(nth: `1`) + QLatin1String("</b>");
409	case `'_'`:
410	return QLatin1String("<u>_") + match.capturedView(nth: `1`) + QLatin1String("_</u>");
411	case `'/'`:
412	return QLatin1String("<i>/") + match.capturedView(nth: `1`) + QLatin1String("/</i>");
413	case `'-'`:
414	return QLatin1String("<s>-") + match.capturedView(nth: `1`) + QLatin1String("-</s>");
415	}
416	}
417	}
418	return QString();
419	}
420
421	QString KTextToHTML::convertToHtml(const QString &plainText, const KTextToHTML::Options &flags, int maxUrlLen, int maxAddressLen)
422	{
423	KTextToHTMLHelper helper(plainText, `0`, maxUrlLen, maxAddressLen);
424
425	QString str;
426	QString result(static_cast<QChar >(nullptr), helper.mText.length() `2`);
427	QChar ch;
428	int x;
429	bool startOfLine = true;
430
431	for (helper.mPos = `0`, x = `0`; helper.mPos < helper.mText.length(); ++helper.mPos, ++x) {
432	ch = helper.mText.at(i: helper.mPos);
433	if (flags & PreserveSpaces) {
434	if (ch == QLatin1Char(`' '`)) {
435	if (helper.mPos + `1` < helper.mText.length()) {
436	if (helper.mText.at(i: helper.mPos + `1`) != QLatin1Char(`' '`)) {
437	// A single space, make it breaking if not at the start or end of the line
438	const bool endOfLine = helper.mText.at(i: helper.mPos + `1`) == QLatin1Char(`'\n'`);
439	if (!startOfLine && !endOfLine) {
440	result += QLatin1Char(`' '`);
441	} else {
442	result += QLatin1String(" ");
443	}
444	} else {
445	// Whitespace of more than one space, make it all non-breaking
446	while (helper.mPos < helper.mText.length() && helper.mText.at(i: helper.mPos) == QLatin1Char(`' '`)) {
447	result += QLatin1String(" ");
448	++helper.mPos;
449	++x;
450	}
451
452	// We incremented once to often, undo that
453	--helper.mPos;
454	--x;
455	}
456	} else {
457	// Last space in the text, it is non-breaking
458	result += QLatin1String(" ");
459	}
460
461	if (startOfLine) {
462	startOfLine = false;
463	}
464	continue;
465	} else if (ch == QLatin1Char(`'\t'`)) {
466	do {
467	result += QLatin1String(" ");
468	++x;
469	} while ((x & `7`) != `0`);
470	--x;
471	startOfLine = false;
472	continue;
473	}
474	}
475	if (ch == QLatin1Char(`'\n'`)) {
476	result += QLatin1String("<br />\n"); // Keep the \n, so apps can figure out the quoting levels correctly.
477	startOfLine = true;
478	x = -`1`;
479	continue;
480	}
481
482	startOfLine = false;
483	if (ch == QLatin1Char(`'&'`)) {
484	result += QLatin1String("&");
485	} else if (ch == QLatin1Char(`'"'`)) {
486	result += QLatin1String(""");
487	} else if (ch == QLatin1Char(`'<'`)) {
488	result += QLatin1String("<");
489	} else if (ch == QLatin1Char(`'>'`)) {
490	result += QLatin1String(">");
491	} else {
492	const int start = helper.mPos;
493	if (!(flags & IgnoreUrls)) {
494	bool badUrl = false;
495	str = helper.getUrl(badurl: &badUrl);
496	if (badUrl) {
497	QString resultBadUrl;
498	for (const QChar chBadUrl : std::as_const(t&: helper.mText)) {
499	if (chBadUrl == QLatin1Char(`'&'`)) {
500	resultBadUrl += QLatin1String("&");
501	} else if (chBadUrl == QLatin1Char(`'"'`)) {
502	resultBadUrl += QLatin1String(""");
503	} else if (chBadUrl == QLatin1Char(`'<'`)) {
504	resultBadUrl += QLatin1String("<");
505	} else if (chBadUrl == QLatin1Char(`'>'`)) {
506	resultBadUrl += QLatin1String(">");
507	} else {
508	resultBadUrl += chBadUrl;
509	}
510	}
511	return resultBadUrl;
512	}
513	if (!str.isEmpty()) {
514	QString hyperlink;
515	if (str.startsWith(s: QLatin1String("www."))) {
516	hyperlink = QLatin1String("http://") + str;
517	} else if (str.startsWith(s: QLatin1String("ftp."))) {
518	hyperlink = QLatin1String("ftp://") + str;
519	} else {
520	hyperlink = str;
521	}
522	result += QLatin1String("<a href=\"") + hyperlink + QLatin1String("\">") + str.toHtmlEscaped() + QLatin1String("</a>");
523	x += helper.mPos - start;
524	continue;
525	}
526	str = helper.getEmailAddress();
527	if (!str.isEmpty()) {
528	// len is the length of the local part
529	int len = str.indexOf(c: QLatin1Char(`'@'`));
530	QString localPart = str.left(n: len);
531
532	// remove the local part from the result (as '&'s have been expanded to
533	// & we have to take care of the 4 additional characters per '&')
534	result.truncate(pos: result.length() - len - (localPart.count(c: QLatin1Char(`'&'`)) * `4`));
535	x -= len;
536
537	result += QLatin1String("<a href=\"mailto:") + str + QLatin1String("\">") + str + QLatin1String("</a>");
538	x += str.length() - `1`;
539	continue;
540	}
541	if (flags & ConvertPhoneNumbers) {
542	str = helper.getPhoneNumber();
543	if (!str.isEmpty()) {
544	result += QLatin1String("<a href=\"tel:") + normalizePhoneNumber(str) + QLatin1String("\">") + str + QLatin1String("</a>");
545	x += str.length() - `1`;
546	continue;
547	}
548	}
549	}
550	if (flags & HighlightText) {
551	str = helper.highlightedText();
552	if (!str.isEmpty()) {
553	result += str;
554	x += helper.mPos - start;
555	continue;
556	}
557	}
558	result += ch;
559	}
560	}
561
562	if (flags & ReplaceSmileys) {
563	result = KEmoticonsParser::parseEmoticons(text: result);
564	}
565
566	return result;
567	}
568

source code of kcoreaddons/src/lib/text/ktexttohtml.cpp