1 | /* |
2 | This file is part of the syndication library |
3 | SPDX-FileCopyrightText: 2019 Laurent Montel <montel@kde.org> |
4 | |
5 | SPDX-License-Identifier: LGPL-2.0-or-later |
6 | */ |
7 | |
8 | #include "loaderutil_p.h" |
9 | #include <QDebug> |
10 | #include <QRegularExpression> |
11 | |
12 | //#define DEBUG_PARSING_FEED |
13 | #ifdef DEBUG_PARSING_FEED |
14 | #include <QFile> |
15 | #include <QTextStream> |
16 | #endif |
17 | QUrl Syndication::LoaderUtil::parseFeed(const QByteArray &data, const QUrl &url) |
18 | { |
19 | #ifdef DEBUG_PARSING_FEED |
20 | qDebug() << " QUrl Syndication::LoaderUtil::parseFeed(const QByteArray &data, const QUrl &url)" ; |
21 | QFile headerFile(QStringLiteral("/tmp/bb.txt" )); |
22 | headerFile.open(QIODevice::WriteOnly | QIODevice::Text); |
23 | QTextStream outHeaderStream(&headerFile); |
24 | outHeaderStream << data; |
25 | headerFile.close(); |
26 | #endif |
27 | QUrl discoveredFeedURL; |
28 | QString str = QString::fromLatin1(data.constData()).simplified(); |
29 | QString s2; |
30 | // QTextStream ts( &str, QIODevice::WriteOnly ); |
31 | // ts << data.data(); |
32 | |
33 | // "<[\\s]link[^>]*rel[\\s]=[\\s]\\\"[\\s]alternate[\\s]\\\"[^>]*>" |
34 | // "type[\\s]=[\\s]\\\"application/rss+xml\\\"" |
35 | // "href[\\s]=[\\s]\\\"application/rss+xml\\\"" |
36 | |
37 | QRegularExpression rx(QStringLiteral("(?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)[^sAa]*" |
38 | "[\\s]*type[^=]*=\"application/rss\\+xml\"[^s][^s](?:[^>]*)" |
39 | "[\\s]*[\\s]*[^s]*(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\\s]*)" ), |
40 | QRegularExpression::CaseInsensitiveOption); |
41 | QRegularExpressionMatch match; |
42 | if ((match = rx.match(str)).hasMatch()) { |
43 | s2 = match.captured(1); |
44 | } else { |
45 | const QRegularExpression rx2(QStringLiteral("(?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)" |
46 | "[\\s]*[^s][^s](?:[^>]*)(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\\s]*)" ), |
47 | QRegularExpression::CaseInsensitiveOption); |
48 | if ((match = rx2.match(str)).hasMatch()) { |
49 | s2 = match.captured(1); |
50 | } else { |
51 | // does not support Atom/RSS autodiscovery.. try finding feeds by brute force.... |
52 | QStringList feeds; |
53 | QString host = url.host(); |
54 | rx.setPattern(QStringLiteral("(?:<A )[^H]*(?:HREF)[^=]*=[^A-Z0-9-_~,./]*([^'\">\\s]*)" )); |
55 | QRegularExpressionMatchIterator iter = rx.globalMatch(str); |
56 | while (iter.hasNext()) { |
57 | match = iter.next(); |
58 | s2 = match.captured(1); |
59 | if (s2.endsWith(QLatin1String(".rdf" )) // |
60 | || s2.endsWith(QLatin1String(".rss" )) // |
61 | || s2.endsWith(QLatin1String(".xml" ))) { |
62 | feeds.append(s2); |
63 | } |
64 | } |
65 | |
66 | // Prefer feeds on same host |
67 | auto it = std::find_if(feeds.cbegin(), feeds.cend(), [&host](const QString &s) { |
68 | return QUrl(s).host() == host; |
69 | }); |
70 | if (it != feeds.cend()) { |
71 | s2 = *it; |
72 | } |
73 | } |
74 | } |
75 | |
76 | if (s2.isNull()) { |
77 | return discoveredFeedURL; |
78 | } |
79 | |
80 | if (QUrl(s2).isRelative()) { |
81 | if (s2.startsWith(QLatin1String("//" ))) { |
82 | s2.prepend(url.scheme() + QLatin1Char(':')); |
83 | discoveredFeedURL = QUrl(s2); |
84 | } else if (s2.startsWith(QLatin1Char('/'))) { |
85 | discoveredFeedURL = url; |
86 | discoveredFeedURL.setPath(s2); |
87 | } else { |
88 | discoveredFeedURL = url; |
89 | discoveredFeedURL.setPath(discoveredFeedURL.path() + QLatin1Char('/') + s2); |
90 | } |
91 | } else { |
92 | discoveredFeedURL = QUrl(s2); |
93 | } |
94 | |
95 | return discoveredFeedURL; |
96 | } |
97 | |