1/*
2 This file is part of the KDE Baloo Project
3 SPDX-FileCopyrightText: 2013-2015 Vishesh Handa <vhanda@kde.org>
4
5 SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL
6*/
7
8#include "app.h"
9#include "basicindexingjob.h"
10#include "result.h"
11#include "idutils.h"
12#include "transaction.h"
13#include "baloodebug.h"
14#include "global.h"
15
16#include <QCoreApplication>
17
18#include <QTimer>
19#include <QFileInfo>
20
21#include <KFileMetaData/Extractor>
22#include <KFileMetaData/MimeUtils>
23#include <KIdleTime>
24
25#include <unistd.h> //for STDIN_FILENO
26#include <iostream>
27
28using namespace Baloo;
29
30App::App(QObject* parent)
31 : QObject(parent)
32 , m_notifyNewData(STDIN_FILENO, QSocketNotifier::Read)
33 , m_input()
34 , m_output()
35 , m_workerPipe(&m_input, &m_output)
36 , m_tr(nullptr)
37{
38 m_input.open(STDIN_FILENO, ioFlags: QIODevice::ReadOnly | QIODevice::Unbuffered );
39 m_output.open(STDOUT_FILENO, ioFlags: QIODevice::WriteOnly | QIODevice::Unbuffered );
40
41 static int s_idleTimeout = 1000 * 60 * 1; // 1 min
42 m_idleTime = KIdleTime::instance();
43 m_idleTime->addIdleTimeout(msec: s_idleTimeout);
44 connect(sender: m_idleTime, signal: &KIdleTime::resumingFromIdle, context: this, slot: [this]() {
45 qCInfo(BALOO) << "Busy, paced indexing";
46 m_isBusy = true;
47 });
48 connect(sender: m_idleTime, signal: qOverload<int, int>(&KIdleTime::timeoutReached), context: this, slot: [this]() {
49 qCInfo(BALOO) << "Not busy, fast indexing";
50 m_isBusy = false;
51 });
52
53 using WorkerPipe = Baloo::Private::WorkerPipe;
54 connect(sender: &m_notifyNewData, signal: &QSocketNotifier::activated, context: &m_workerPipe, slot: &WorkerPipe::processIdData);
55 connect(sender: &m_workerPipe, signal: &WorkerPipe::newDocumentIds, context: this, slot: &App::slotNewBatch);
56 connect(sender: &m_workerPipe, signal: &WorkerPipe::inputEnd, context: this, slot: &QCoreApplication::quit);
57}
58
59App::~App()
60{
61 if (m_tr) {
62 // Abort the transaction in case the parent process exited
63 m_tr->abort();
64 m_tr.reset();
65 }
66}
67
68void App::slotNewBatch(const QVector<quint64>& ids)
69{
70 m_ids = ids;
71
72 Database *db = globalDatabaseInstance();
73 if (!db->open(mode: Database::ReadWriteDatabase)) {
74 qCCritical(BALOO) << "Failed to open the database";
75 exit(status: 1);
76 }
77
78 Q_ASSERT(m_tr == nullptr);
79
80 if (!m_isBusy) {
81 m_idleTime->catchNextResumeEvent();
82 }
83
84 QTimer::singleShot(interval: (m_isBusy ? 500 : 0), receiver: this, slot: [this, db] () {
85 // FIXME: The transaction is open for way too long. We should just open it for when we're
86 // committing the data not during the extraction.
87 m_tr = std::make_unique<Transaction>(args: db, args: Transaction::ReadWrite);
88 processNextFile();
89 });
90
91 /**
92 * A Single Batch seems to be triggering the SocketNotifier more than once
93 * so we disable it till the batch is done.
94 */
95 m_notifyNewData.setEnabled(false);
96}
97
98void App::processNextFile()
99{
100 if (!m_ids.isEmpty()) {
101 quint64 id = m_ids.takeFirst();
102
103 QString url = QFile::decodeName(localFileName: m_tr->documentUrl(id));
104 if (url.isEmpty() || !QFile::exists(fileName: url)) {
105 m_tr->removeDocument(id);
106 QTimer::singleShot(interval: 0, receiver: this, slot: &App::processNextFile);
107 return;
108 }
109
110 bool indexed = index(tr: m_tr.get(), filePath: url, id);
111
112 int delay = (m_isBusy && indexed) ? 10 : 0;
113 QTimer::singleShot(interval: delay, receiver: this, slot: &App::processNextFile);
114
115 } else {
116 bool ok = m_tr->commit();
117 if (!ok) {
118 exit(status: 2);
119 }
120 m_tr.reset();
121
122 // Enable the SocketNotifier for the next batch
123 m_notifyNewData.setEnabled(true);
124 m_workerPipe.batchFinished();
125 }
126}
127
128bool App::index(Transaction* tr, const QString& url, quint64 id)
129{
130 if (!m_config.shouldBeIndexed(path: url)) {
131 // This apparently happens when the config has changed after the document
132 // was added to the content indexing db
133 qCDebug(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped";
134 tr->removeDocument(id);
135 m_workerPipe.urlFailed(url);
136 return false;
137 }
138
139 // The initial BasicIndexingJob run has been supplied with the file extension
140 // mimetype only, skip based on the "real" mimetype
141 QString mimetype = KFileMetaData::MimeUtils::strictMimeType(filePath: url, db: m_mimeDb).name();
142 if (!m_config.shouldMimeTypeBeIndexed(mimeType: mimetype)) {
143 qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype;
144 // FIXME: in case the extension based and content based mimetype differ
145 // we should update it.
146 tr->removePhaseOne(id);
147 m_workerPipe.urlFailed(url);
148 return false;
149 }
150
151 // HACK: Also, we're ignoring ttext files which are greater tha 10 Mb as we
152 // have trouble processing them
153 //
154 if (mimetype.startsWith(s: QLatin1String("text/"))) {
155 QFileInfo fileInfo(url);
156 if (fileInfo.size() >= 10 * 1024 * 1024) {
157 qCDebug(BALOO) << "Skipping large " << url << "- mimetype:" << mimetype;
158 tr->removePhaseOne(id);
159 m_workerPipe.urlFailed(url);
160 return false;
161 }
162 }
163 qCDebug(BALOO) << "Indexing" << id << url << mimetype;
164 m_workerPipe.urlStarted(url);
165
166 // We always run the basic indexing again. This is mostly so that the proper
167 // mimetype is set and we get proper type information.
168 // The mimetype fetched in the BasicIndexingJob is fast but not accurate
169 BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel);
170 basicIndexer.index();
171
172 Baloo::Document doc = basicIndexer.document();
173
174 Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractMetaData | KFileMetaData::ExtractionResult::ExtractPlainText);
175 result.setDocument(doc);
176
177 const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype);
178
179 for (KFileMetaData::Extractor* ex : exList) {
180 ex->extract(result: &result);
181 }
182
183 result.finish();
184 if (doc.id() != id) {
185 qCWarning(BALOO) << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created";
186 tr->removeDocument(id);
187 if (!tr->hasDocument(id: doc.id())) {
188 tr->addDocument(doc: result.document());
189 } else {
190 tr->replaceDocument(doc: result.document(), operations: DocumentTerms | DocumentData);
191 }
192 } else {
193 tr->replaceDocument(doc: result.document(), operations: DocumentTerms | DocumentData);
194 }
195 tr->removePhaseOne(id: doc.id());
196 m_workerPipe.urlFinished(url);
197 return true;
198}
199
200#include "moc_app.cpp"
201

// source code of baloo/src/file/extractor/app.cpp