1 | /* |
2 | This file is part of the KDE Baloo Project |
3 | SPDX-FileCopyrightText: 2013-2015 Vishesh Handa <vhanda@kde.org> |
4 | |
5 | SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL |
6 | */ |
7 | |
8 | #include "app.h" |
9 | #include "basicindexingjob.h" |
10 | #include "result.h" |
11 | #include "idutils.h" |
12 | #include "transaction.h" |
13 | #include "baloodebug.h" |
14 | #include "global.h" |
15 | |
16 | #include <QCoreApplication> |
17 | |
18 | #include <QTimer> |
19 | #include <QFileInfo> |
20 | |
21 | #include <KFileMetaData/Extractor> |
22 | #include <KFileMetaData/MimeUtils> |
23 | #include <KIdleTime> |
24 | |
25 | #include <unistd.h> //for STDIN_FILENO |
26 | #include <iostream> |
27 | |
28 | using namespace Baloo; |
29 | |
30 | App::App(QObject* parent) |
31 | : QObject(parent) |
32 | , m_notifyNewData(STDIN_FILENO, QSocketNotifier::Read) |
33 | , m_input() |
34 | , m_output() |
35 | , m_workerPipe(&m_input, &m_output) |
36 | , m_tr(nullptr) |
37 | { |
38 | m_input.open(STDIN_FILENO, ioFlags: QIODevice::ReadOnly | QIODevice::Unbuffered ); |
39 | m_output.open(STDOUT_FILENO, ioFlags: QIODevice::WriteOnly | QIODevice::Unbuffered ); |
40 | |
41 | static int s_idleTimeout = 1000 * 60 * 1; // 1 min |
42 | m_idleTime = KIdleTime::instance(); |
43 | m_idleTime->addIdleTimeout(msec: s_idleTimeout); |
44 | connect(sender: m_idleTime, signal: &KIdleTime::resumingFromIdle, context: this, slot: [this]() { |
45 | qCInfo(BALOO) << "Busy, paced indexing" ; |
46 | m_isBusy = true; |
47 | }); |
48 | connect(sender: m_idleTime, signal: &KIdleTime::timeoutReached, context: this, slot: [this]() { |
49 | qCInfo(BALOO) << "Not busy, fast indexing" ; |
50 | m_isBusy = false; |
51 | }); |
52 | |
53 | using WorkerPipe = Baloo::Private::WorkerPipe; |
54 | connect(sender: &m_notifyNewData, signal: &QSocketNotifier::activated, context: &m_workerPipe, slot: &WorkerPipe::processIdData); |
55 | connect(sender: &m_workerPipe, signal: &WorkerPipe::newDocumentIds, context: this, slot: &App::slotNewBatch); |
56 | connect(sender: &m_workerPipe, signal: &WorkerPipe::inputEnd, context: this, slot: &QCoreApplication::quit); |
57 | } |
58 | |
59 | App::~App() |
60 | { |
61 | if (m_tr) { |
62 | // Abort the transaction in case the parent process exited |
63 | m_tr->abort(); |
64 | m_tr.reset(); |
65 | } |
66 | } |
67 | |
68 | void App::slotNewBatch(const QVector<quint64>& ids) |
69 | { |
70 | m_ids = ids; |
71 | |
72 | Database *db = globalDatabaseInstance(); |
73 | if (db->open(mode: Database::CreateDatabase) != Database::OpenResult::Success) { |
74 | qCCritical(BALOO) << "Failed to open the database" ; |
75 | exit(status: 1); |
76 | } |
77 | |
78 | Q_ASSERT(m_tr == nullptr); |
79 | |
80 | if (!m_isBusy) { |
81 | m_idleTime->catchNextResumeEvent(); |
82 | } |
83 | |
84 | QTimer::singleShot(interval: (m_isBusy ? 500 : 0), receiver: this, slot: [this, db]() { |
85 | // FIXME: The transaction is open for way too long. We should just open it for when we're |
86 | // committing the data not during the extraction. |
87 | m_tr = std::make_unique<Transaction>(args: db, args: Transaction::ReadWrite); |
88 | processNextFile(); |
89 | }); |
90 | } |
91 | |
92 | void App::processNextFile() |
93 | { |
94 | if (!m_ids.isEmpty()) { |
95 | quint64 id = m_ids.takeFirst(); |
96 | |
97 | QString url = QFile::decodeName(localFileName: m_tr->documentUrl(id)); |
98 | if (url.isEmpty() || !QFile::exists(fileName: url)) { |
99 | m_tr->removeDocument(id); |
100 | QTimer::singleShot(interval: 0, receiver: this, slot: &App::processNextFile); |
101 | return; |
102 | } |
103 | |
104 | bool indexed = index(tr: m_tr.get(), filePath: url, id); |
105 | |
106 | int delay = (m_isBusy && indexed) ? 10 : 0; |
107 | QTimer::singleShot(interval: delay, receiver: this, slot: &App::processNextFile); |
108 | |
109 | } else { |
110 | bool ok = m_tr->commit(); |
111 | if (!ok) { |
112 | exit(status: 2); |
113 | } |
114 | m_tr.reset(); |
115 | |
116 | m_workerPipe.batchFinished(); |
117 | } |
118 | } |
119 | |
120 | bool App::index(Transaction* tr, const QString& url, quint64 id) |
121 | { |
122 | if (!m_config.shouldBeIndexed(path: url)) { |
123 | // This apparently happens when the config has changed after the document |
124 | // was added to the content indexing db |
125 | qCDebug(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped" ; |
126 | tr->removeDocument(id); |
127 | m_workerPipe.urlFailed(url); |
128 | return false; |
129 | } |
130 | |
131 | // The initial BasicIndexingJob run has been supplied with the file extension |
132 | // mimetype only, skip based on the "real" mimetype |
133 | QString mimetype = KFileMetaData::MimeUtils::strictMimeType(filePath: url, db: m_mimeDb).name(); |
134 | if (!m_config.shouldMimeTypeBeIndexed(mimeType: mimetype)) { |
135 | qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype; |
136 | // FIXME: in case the extension based and content based mimetype differ |
137 | // we should update it. |
138 | tr->removePhaseOne(id); |
139 | m_workerPipe.urlFailed(url); |
140 | return false; |
141 | } |
142 | |
143 | // HACK: Also, we're ignoring ttext files which are greater tha 10 Mb as we |
144 | // have trouble processing them |
145 | // |
146 | if (mimetype.startsWith(s: QLatin1String("text/" ))) { |
147 | QFileInfo fileInfo(url); |
148 | if (fileInfo.size() >= 10 * 1024 * 1024) { |
149 | qCDebug(BALOO) << "Skipping large " << url << "- mimetype:" << mimetype; |
150 | tr->removePhaseOne(id); |
151 | m_workerPipe.urlFailed(url); |
152 | return false; |
153 | } |
154 | } |
155 | qCDebug(BALOO) << "Indexing" << id << url << mimetype; |
156 | m_workerPipe.urlStarted(url); |
157 | |
158 | // We always run the basic indexing again. This is mostly so that the proper |
159 | // mimetype is set and we get proper type information. |
160 | // The mimetype fetched in the BasicIndexingJob is fast but not accurate |
161 | BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel); |
162 | if (!basicIndexer.index()) { |
163 | qCDebug(BALOO) << "Skipping non-existing file " << url << "- mimetype:" << mimetype; |
164 | tr->removePhaseOne(id); |
165 | m_workerPipe.urlFailed(url); |
166 | return false; |
167 | } |
168 | |
169 | Baloo::Document doc = basicIndexer.document(); |
170 | |
171 | Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractMetaData | KFileMetaData::ExtractionResult::ExtractPlainText); |
172 | result.setDocument(doc); |
173 | |
174 | const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype); |
175 | |
176 | for (KFileMetaData::Extractor* ex : exList) { |
177 | ex->extract(result: &result); |
178 | } |
179 | |
180 | result.finish(); |
181 | if (doc.id() != id) { |
182 | qCWarning(BALOO) << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created" ; |
183 | tr->removeDocument(id); |
184 | if (!tr->hasDocument(id: doc.id())) { |
185 | tr->addDocument(doc: result.document()); |
186 | } else { |
187 | tr->replaceDocument(doc: result.document(), operations: DocumentTerms | DocumentData); |
188 | } |
189 | } else { |
190 | tr->replaceDocument(doc: result.document(), operations: DocumentTerms | DocumentData); |
191 | } |
192 | tr->removePhaseOne(id: doc.id()); |
193 | m_workerPipe.urlFinished(url); |
194 | return true; |
195 | } |
196 | |
197 | #include "moc_app.cpp" |
198 | |