/*
    This file is part of the KDE Baloo Project
    SPDX-FileCopyrightText: 2013-2015 Vishesh Handa <vhanda@kde.org>

    SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL
*/
7
8#include "app.h"
9#include "basicindexingjob.h"
10#include "result.h"
11#include "idutils.h"
12#include "transaction.h"
13#include "baloodebug.h"
14#include "global.h"
15
16#include <QCoreApplication>
17
18#include <QTimer>
19#include <QFileInfo>
20
21#include <KFileMetaData/Extractor>
22#include <KFileMetaData/MimeUtils>
23#include <KIdleTime>
24
25#include <unistd.h> //for STDIN_FILENO
26#include <iostream>
27
28using namespace Baloo;
29
30App::App(QObject* parent)
31 : QObject(parent)
32 , m_notifyNewData(STDIN_FILENO, QSocketNotifier::Read)
33 , m_input()
34 , m_output()
35 , m_workerPipe(&m_input, &m_output)
36 , m_tr(nullptr)
37{
38 m_input.open(STDIN_FILENO, ioFlags: QIODevice::ReadOnly | QIODevice::Unbuffered );
39 m_output.open(STDOUT_FILENO, ioFlags: QIODevice::WriteOnly | QIODevice::Unbuffered );
40
41 static int s_idleTimeout = 1000 * 60 * 1; // 1 min
42 m_idleTime = KIdleTime::instance();
43 m_idleTime->addIdleTimeout(msec: s_idleTimeout);
44 connect(sender: m_idleTime, signal: &KIdleTime::resumingFromIdle, context: this, slot: [this]() {
45 qCInfo(BALOO) << "Busy, paced indexing";
46 m_isBusy = true;
47 });
48 connect(sender: m_idleTime, signal: &KIdleTime::timeoutReached, context: this, slot: [this]() {
49 qCInfo(BALOO) << "Not busy, fast indexing";
50 m_isBusy = false;
51 });
52
53 using WorkerPipe = Baloo::Private::WorkerPipe;
54 connect(sender: &m_notifyNewData, signal: &QSocketNotifier::activated, context: &m_workerPipe, slot: &WorkerPipe::processIdData);
55 connect(sender: &m_workerPipe, signal: &WorkerPipe::newDocumentIds, context: this, slot: &App::slotNewBatch);
56 connect(sender: &m_workerPipe, signal: &WorkerPipe::inputEnd, context: this, slot: &QCoreApplication::quit);
57}
58
59App::~App()
60{
61 if (m_tr) {
62 // Abort the transaction in case the parent process exited
63 m_tr->abort();
64 m_tr.reset();
65 }
66}
67
68void App::slotNewBatch(const QVector<quint64>& ids)
69{
70 m_ids = ids;
71
72 Database *db = globalDatabaseInstance();
73 if (db->open(mode: Database::CreateDatabase) != Database::OpenResult::Success) {
74 qCCritical(BALOO) << "Failed to open the database";
75 exit(status: 1);
76 }
77
78 Q_ASSERT(m_tr == nullptr);
79
80 if (!m_isBusy) {
81 m_idleTime->catchNextResumeEvent();
82 }
83
84 QTimer::singleShot(interval: (m_isBusy ? 500 : 0), receiver: this, slot: [this, db]() {
85 // FIXME: The transaction is open for way too long. We should just open it for when we're
86 // committing the data not during the extraction.
87 m_tr = std::make_unique<Transaction>(args: db, args: Transaction::ReadWrite);
88 processNextFile();
89 });
90}
91
92void App::processNextFile()
93{
94 if (!m_ids.isEmpty()) {
95 quint64 id = m_ids.takeFirst();
96
97 QString url = QFile::decodeName(localFileName: m_tr->documentUrl(id));
98 if (url.isEmpty() || !QFile::exists(fileName: url)) {
99 m_tr->removeDocument(id);
100 QTimer::singleShot(interval: 0, receiver: this, slot: &App::processNextFile);
101 return;
102 }
103
104 bool indexed = index(tr: m_tr.get(), filePath: url, id);
105
106 int delay = (m_isBusy && indexed) ? 10 : 0;
107 QTimer::singleShot(interval: delay, receiver: this, slot: &App::processNextFile);
108
109 } else {
110 bool ok = m_tr->commit();
111 if (!ok) {
112 exit(status: 2);
113 }
114 m_tr.reset();
115
116 m_workerPipe.batchFinished();
117 }
118}
119
120bool App::index(Transaction* tr, const QString& url, quint64 id)
121{
122 if (!m_config.shouldBeIndexed(path: url)) {
123 // This apparently happens when the config has changed after the document
124 // was added to the content indexing db
125 qCDebug(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped";
126 tr->removeDocument(id);
127 m_workerPipe.urlFailed(url);
128 return false;
129 }
130
131 // The initial BasicIndexingJob run has been supplied with the file extension
132 // mimetype only, skip based on the "real" mimetype
133 QString mimetype = KFileMetaData::MimeUtils::strictMimeType(filePath: url, db: m_mimeDb).name();
134 if (!m_config.shouldMimeTypeBeIndexed(mimeType: mimetype)) {
135 qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype;
136 // FIXME: in case the extension based and content based mimetype differ
137 // we should update it.
138 tr->removePhaseOne(id);
139 m_workerPipe.urlFailed(url);
140 return false;
141 }
142
143 // HACK: Also, we're ignoring ttext files which are greater tha 10 Mb as we
144 // have trouble processing them
145 //
146 if (mimetype.startsWith(s: QLatin1String("text/"))) {
147 QFileInfo fileInfo(url);
148 if (fileInfo.size() >= 10 * 1024 * 1024) {
149 qCDebug(BALOO) << "Skipping large " << url << "- mimetype:" << mimetype;
150 tr->removePhaseOne(id);
151 m_workerPipe.urlFailed(url);
152 return false;
153 }
154 }
155 qCDebug(BALOO) << "Indexing" << id << url << mimetype;
156 m_workerPipe.urlStarted(url);
157
158 // We always run the basic indexing again. This is mostly so that the proper
159 // mimetype is set and we get proper type information.
160 // The mimetype fetched in the BasicIndexingJob is fast but not accurate
161 BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel);
162 if (!basicIndexer.index()) {
163 qCDebug(BALOO) << "Skipping non-existing file " << url << "- mimetype:" << mimetype;
164 tr->removePhaseOne(id);
165 m_workerPipe.urlFailed(url);
166 return false;
167 }
168
169 Baloo::Document doc = basicIndexer.document();
170
171 Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractMetaData | KFileMetaData::ExtractionResult::ExtractPlainText);
172 result.setDocument(doc);
173
174 const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype);
175
176 for (KFileMetaData::Extractor* ex : exList) {
177 ex->extract(result: &result);
178 }
179
180 result.finish();
181 if (doc.id() != id) {
182 qCWarning(BALOO) << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created";
183 tr->removeDocument(id);
184 if (!tr->hasDocument(id: doc.id())) {
185 tr->addDocument(doc: result.document());
186 } else {
187 tr->replaceDocument(doc: result.document(), operations: DocumentTerms | DocumentData);
188 }
189 } else {
190 tr->replaceDocument(doc: result.document(), operations: DocumentTerms | DocumentData);
191 }
192 tr->removePhaseOne(id: doc.id());
193 m_workerPipe.urlFinished(url);
194 return true;
195}
196
197#include "moc_app.cpp"
198

// source code of baloo/src/file/extractor/app.cpp