1 | /* |
2 | This file is part of the KDE Baloo Project |
3 | SPDX-FileCopyrightText: 2013-2015 Vishesh Handa <vhanda@kde.org> |
4 | |
5 | SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL |
6 | */ |
7 | |
8 | #include "app.h" |
9 | #include "basicindexingjob.h" |
10 | #include "result.h" |
11 | #include "idutils.h" |
12 | #include "transaction.h" |
13 | #include "baloodebug.h" |
14 | #include "global.h" |
15 | |
16 | #include <QCoreApplication> |
17 | |
18 | #include <QTimer> |
19 | #include <QFileInfo> |
20 | |
21 | #include <KFileMetaData/Extractor> |
22 | #include <KFileMetaData/MimeUtils> |
23 | #include <KIdleTime> |
24 | |
25 | #include <unistd.h> //for STDIN_FILENO |
26 | #include <iostream> |
27 | |
28 | using namespace Baloo; |
29 | |
30 | App::App(QObject* parent) |
31 | : QObject(parent) |
32 | , m_notifyNewData(STDIN_FILENO, QSocketNotifier::Read) |
33 | , m_input() |
34 | , m_output() |
35 | , m_workerPipe(&m_input, &m_output) |
36 | , m_tr(nullptr) |
37 | { |
38 | m_input.open(STDIN_FILENO, ioFlags: QIODevice::ReadOnly | QIODevice::Unbuffered ); |
39 | m_output.open(STDOUT_FILENO, ioFlags: QIODevice::WriteOnly | QIODevice::Unbuffered ); |
40 | |
41 | static int s_idleTimeout = 1000 * 60 * 1; // 1 min |
42 | m_idleTime = KIdleTime::instance(); |
43 | m_idleTime->addIdleTimeout(msec: s_idleTimeout); |
44 | connect(sender: m_idleTime, signal: &KIdleTime::resumingFromIdle, context: this, slot: [this]() { |
45 | qCInfo(BALOO) << "Busy, paced indexing" ; |
46 | m_isBusy = true; |
47 | }); |
48 | connect(sender: m_idleTime, signal: qOverload<int, int>(&KIdleTime::timeoutReached), context: this, slot: [this]() { |
49 | qCInfo(BALOO) << "Not busy, fast indexing" ; |
50 | m_isBusy = false; |
51 | }); |
52 | |
53 | using WorkerPipe = Baloo::Private::WorkerPipe; |
54 | connect(sender: &m_notifyNewData, signal: &QSocketNotifier::activated, context: &m_workerPipe, slot: &WorkerPipe::processIdData); |
55 | connect(sender: &m_workerPipe, signal: &WorkerPipe::newDocumentIds, context: this, slot: &App::slotNewBatch); |
56 | connect(sender: &m_workerPipe, signal: &WorkerPipe::inputEnd, context: this, slot: &QCoreApplication::quit); |
57 | } |
58 | |
59 | App::~App() |
60 | { |
61 | if (m_tr) { |
62 | // Abort the transaction in case the parent process exited |
63 | m_tr->abort(); |
64 | m_tr.reset(); |
65 | } |
66 | } |
67 | |
68 | void App::slotNewBatch(const QVector<quint64>& ids) |
69 | { |
70 | m_ids = ids; |
71 | |
72 | Database *db = globalDatabaseInstance(); |
73 | if (!db->open(mode: Database::ReadWriteDatabase)) { |
74 | qCCritical(BALOO) << "Failed to open the database" ; |
75 | exit(status: 1); |
76 | } |
77 | |
78 | Q_ASSERT(m_tr == nullptr); |
79 | |
80 | if (!m_isBusy) { |
81 | m_idleTime->catchNextResumeEvent(); |
82 | } |
83 | |
84 | QTimer::singleShot(interval: (m_isBusy ? 500 : 0), receiver: this, slot: [this, db] () { |
85 | // FIXME: The transaction is open for way too long. We should just open it for when we're |
86 | // committing the data not during the extraction. |
87 | m_tr = std::make_unique<Transaction>(args: db, args: Transaction::ReadWrite); |
88 | processNextFile(); |
89 | }); |
90 | |
91 | /** |
92 | * A Single Batch seems to be triggering the SocketNotifier more than once |
93 | * so we disable it till the batch is done. |
94 | */ |
95 | m_notifyNewData.setEnabled(false); |
96 | } |
97 | |
98 | void App::processNextFile() |
99 | { |
100 | if (!m_ids.isEmpty()) { |
101 | quint64 id = m_ids.takeFirst(); |
102 | |
103 | QString url = QFile::decodeName(localFileName: m_tr->documentUrl(id)); |
104 | if (url.isEmpty() || !QFile::exists(fileName: url)) { |
105 | m_tr->removeDocument(id); |
106 | QTimer::singleShot(interval: 0, receiver: this, slot: &App::processNextFile); |
107 | return; |
108 | } |
109 | |
110 | bool indexed = index(tr: m_tr.get(), filePath: url, id); |
111 | |
112 | int delay = (m_isBusy && indexed) ? 10 : 0; |
113 | QTimer::singleShot(interval: delay, receiver: this, slot: &App::processNextFile); |
114 | |
115 | } else { |
116 | bool ok = m_tr->commit(); |
117 | if (!ok) { |
118 | exit(status: 2); |
119 | } |
120 | m_tr.reset(); |
121 | |
122 | // Enable the SocketNotifier for the next batch |
123 | m_notifyNewData.setEnabled(true); |
124 | m_workerPipe.batchFinished(); |
125 | } |
126 | } |
127 | |
128 | bool App::index(Transaction* tr, const QString& url, quint64 id) |
129 | { |
130 | if (!m_config.shouldBeIndexed(path: url)) { |
131 | // This apparently happens when the config has changed after the document |
132 | // was added to the content indexing db |
133 | qCDebug(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped" ; |
134 | tr->removeDocument(id); |
135 | m_workerPipe.urlFailed(url); |
136 | return false; |
137 | } |
138 | |
139 | // The initial BasicIndexingJob run has been supplied with the file extension |
140 | // mimetype only, skip based on the "real" mimetype |
141 | QString mimetype = KFileMetaData::MimeUtils::strictMimeType(filePath: url, db: m_mimeDb).name(); |
142 | if (!m_config.shouldMimeTypeBeIndexed(mimeType: mimetype)) { |
143 | qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype; |
144 | // FIXME: in case the extension based and content based mimetype differ |
145 | // we should update it. |
146 | tr->removePhaseOne(id); |
147 | m_workerPipe.urlFailed(url); |
148 | return false; |
149 | } |
150 | |
151 | // HACK: Also, we're ignoring ttext files which are greater tha 10 Mb as we |
152 | // have trouble processing them |
153 | // |
154 | if (mimetype.startsWith(s: QLatin1String("text/" ))) { |
155 | QFileInfo fileInfo(url); |
156 | if (fileInfo.size() >= 10 * 1024 * 1024) { |
157 | qCDebug(BALOO) << "Skipping large " << url << "- mimetype:" << mimetype; |
158 | tr->removePhaseOne(id); |
159 | m_workerPipe.urlFailed(url); |
160 | return false; |
161 | } |
162 | } |
163 | qCDebug(BALOO) << "Indexing" << id << url << mimetype; |
164 | m_workerPipe.urlStarted(url); |
165 | |
166 | // We always run the basic indexing again. This is mostly so that the proper |
167 | // mimetype is set and we get proper type information. |
168 | // The mimetype fetched in the BasicIndexingJob is fast but not accurate |
169 | BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel); |
170 | basicIndexer.index(); |
171 | |
172 | Baloo::Document doc = basicIndexer.document(); |
173 | |
174 | Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractMetaData | KFileMetaData::ExtractionResult::ExtractPlainText); |
175 | result.setDocument(doc); |
176 | |
177 | const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype); |
178 | |
179 | for (KFileMetaData::Extractor* ex : exList) { |
180 | ex->extract(result: &result); |
181 | } |
182 | |
183 | result.finish(); |
184 | if (doc.id() != id) { |
185 | qCWarning(BALOO) << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created" ; |
186 | tr->removeDocument(id); |
187 | if (!tr->hasDocument(id: doc.id())) { |
188 | tr->addDocument(doc: result.document()); |
189 | } else { |
190 | tr->replaceDocument(doc: result.document(), operations: DocumentTerms | DocumentData); |
191 | } |
192 | } else { |
193 | tr->replaceDocument(doc: result.document(), operations: DocumentTerms | DocumentData); |
194 | } |
195 | tr->removePhaseOne(id: doc.id()); |
196 | m_workerPipe.urlFinished(url); |
197 | return true; |
198 | } |
199 | |
200 | #include "moc_app.cpp" |
201 | |