1 | /* |
2 | This file is part of the syndication library |
3 | SPDX-FileCopyrightText: 2019 Laurent Montel <montel@kde.org> |
4 | |
5 | SPDX-License-Identifier: LGPL-2.0-or-later |
6 | */ |
7 | |
8 | #include "loaderutil_p.h" |
9 | #include <QDebug> |
10 | #include <QRegularExpression> |
11 | |
12 | // #define DEBUG_PARSING_FEED |
13 | #ifdef DEBUG_PARSING_FEED |
14 | #include <QFile> |
15 | #include <QTextStream> |
16 | #endif |
17 | QUrl Syndication::LoaderUtil::parseFeed(const QByteArray &data, const QUrl &url) |
18 | { |
19 | #ifdef DEBUG_PARSING_FEED |
20 | qDebug() << " QUrl Syndication::LoaderUtil::parseFeed(const QByteArray &data, const QUrl &url)" ; |
21 | QFile headerFile(QStringLiteral("/tmp/bb.txt" )); |
22 | headerFile.open(QIODevice::WriteOnly | QIODevice::Text); |
23 | QTextStream outHeaderStream(&headerFile); |
24 | outHeaderStream << data; |
25 | headerFile.close(); |
26 | #endif |
27 | QUrl discoveredFeedURL; |
28 | QString str = QString::fromLatin1(ba: data.constData()).simplified(); |
29 | QString s2; |
30 | // QTextStream ts( &str, QIODevice::WriteOnly ); |
31 | // ts << data.data(); |
32 | |
33 | // "<[\\s]link[^>]*rel[\\s]=[\\s]\\\"[\\s]alternate[\\s]\\\"[^>]*>" |
34 | // "type[\\s]=[\\s]\\\"application/rss+xml\\\"" |
35 | // "href[\\s]=[\\s]\\\"application/rss+xml\\\"" |
36 | |
37 | // test regexp: https://www.regexplanet.com/advanced/perl/index.html |
38 | |
39 | const static QRegularExpression rx0( |
40 | QStringLiteral( |
41 | R"((?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)[^sAa]*[\s]*type[^=]*="application/rss\+xml"[^s]*[\s]*[^s]*(?:HREF)[^=]*?=[^A-Z0-9-_~,./$]*([^'\">\s]*))" ), |
42 | QRegularExpression::CaseInsensitiveOption); |
43 | |
44 | QRegularExpressionMatch match; |
45 | if ((match = rx0.match(subject: str)).hasMatch()) { |
46 | s2 = match.captured(nth: 1); |
47 | } else { |
48 | const static QRegularExpression rx( |
49 | QStringLiteral( |
50 | R"((?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)[^sAa]*[\s]*type[^=]*=\"application/rss\+xml\"[^s][^s](?:[^>]*)[\s]*[^s]*(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\s]*))" ), |
51 | QRegularExpression::CaseInsensitiveOption); |
52 | if ((match = rx.match(subject: str)).hasMatch()) { |
53 | s2 = match.captured(nth: 1); |
54 | } else { |
55 | static const QRegularExpression rx2( |
56 | QStringLiteral(R"((?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)[\s]*[^s][^s](?:[^>]*)(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\s]*))" ), |
57 | QRegularExpression::CaseInsensitiveOption); |
58 | if ((match = rx2.match(subject: str)).hasMatch()) { |
59 | s2 = match.captured(nth: 1); |
60 | } else { |
61 | // does not support Atom/RSS autodiscovery.. try finding feeds by brute force.... |
62 | QStringList feeds; |
63 | QString host = url.host(); |
64 | static const QRegularExpression rx3(QStringLiteral(R"((?:<A )[^H]*(?:HREF)[^=]*=[^A-Z0-9-_~,./]*([^'\">\s]*))" )); |
65 | QRegularExpressionMatchIterator iter = rx3.globalMatch(subject: str); |
66 | while (iter.hasNext()) { |
67 | match = iter.next(); |
68 | s2 = match.captured(nth: 1); |
69 | if (s2.endsWith(s: QLatin1String(".rdf" )) // |
70 | || s2.endsWith(s: QLatin1String(".rss" )) // |
71 | || s2.endsWith(s: QLatin1String(".xml" ))) { |
72 | feeds.append(t: s2); |
73 | } |
74 | } |
75 | |
76 | // Prefer feeds on same host |
77 | auto it = std::find_if(first: feeds.cbegin(), last: feeds.cend(), pred: [&host](const QString &s) { |
78 | return QUrl(s).host() == host; |
79 | }); |
80 | if (it != feeds.cend()) { |
81 | s2 = *it; |
82 | } |
83 | } |
84 | } |
85 | } |
86 | |
87 | if (s2.isNull()) { |
88 | return discoveredFeedURL; |
89 | } |
90 | |
91 | if (QUrl(s2).isRelative()) { |
92 | if (s2.startsWith(s: QLatin1String("//" ))) { |
93 | s2.prepend(s: url.scheme() + QLatin1Char(':')); |
94 | discoveredFeedURL = QUrl(s2); |
95 | } else if (s2.startsWith(c: QLatin1Char('/'))) { |
96 | discoveredFeedURL = url; |
97 | discoveredFeedURL.setPath(path: s2); |
98 | } else { |
99 | discoveredFeedURL = url; |
100 | discoveredFeedURL.setPath(path: discoveredFeedURL.path() + QLatin1Char('/') + s2); |
101 | } |
102 | } else { |
103 | discoveredFeedURL = QUrl(s2); |
104 | } |
105 | |
106 | return discoveredFeedURL; |
107 | } |
108 | |