| 1 | /* |
| 2 | This file is part of the KDE Baloo Project |
| 3 | SPDX-FileCopyrightText: 2013-2015 Vishesh Handa <vhanda@kde.org> |
| 4 | |
| 5 | SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL |
| 6 | */ |
| 7 | |
| 8 | #include "app.h" |
| 9 | #include "basicindexingjob.h" |
| 10 | #include "result.h" |
| 11 | #include "idutils.h" |
| 12 | #include "transaction.h" |
| 13 | #include "baloodebug.h" |
| 14 | #include "global.h" |
| 15 | |
| 16 | #include <QCoreApplication> |
| 17 | |
| 18 | #include <QTimer> |
| 19 | #include <QFileInfo> |
| 20 | |
| 21 | #include <KFileMetaData/Extractor> |
| 22 | #include <KFileMetaData/MimeUtils> |
| 23 | #include <KIdleTime> |
| 24 | |
| 25 | #include <unistd.h> //for STDIN_FILENO |
| 26 | #include <iostream> |
| 27 | |
| 28 | using namespace Baloo; |
| 29 | |
| 30 | App::App(QObject* parent) |
| 31 | : QObject(parent) |
| 32 | , m_notifyNewData(STDIN_FILENO, QSocketNotifier::Read) |
| 33 | , m_input() |
| 34 | , m_output() |
| 35 | , m_workerPipe(&m_input, &m_output) |
| 36 | , m_tr(nullptr) |
| 37 | { |
| 38 | m_input.open(STDIN_FILENO, ioFlags: QIODevice::ReadOnly | QIODevice::Unbuffered ); |
| 39 | m_output.open(STDOUT_FILENO, ioFlags: QIODevice::WriteOnly | QIODevice::Unbuffered ); |
| 40 | |
| 41 | static int s_idleTimeout = 1000 * 60 * 1; // 1 min |
| 42 | m_idleTime = KIdleTime::instance(); |
| 43 | m_idleTime->addIdleTimeout(msec: s_idleTimeout); |
| 44 | connect(sender: m_idleTime, signal: &KIdleTime::resumingFromIdle, context: this, slot: [this]() { |
| 45 | qCInfo(BALOO) << "Busy, paced indexing" ; |
| 46 | m_isBusy = true; |
| 47 | }); |
| 48 | connect(sender: m_idleTime, signal: &KIdleTime::timeoutReached, context: this, slot: [this]() { |
| 49 | qCInfo(BALOO) << "Not busy, fast indexing" ; |
| 50 | m_isBusy = false; |
| 51 | }); |
| 52 | |
| 53 | using WorkerPipe = Baloo::Private::WorkerPipe; |
| 54 | connect(sender: &m_notifyNewData, signal: &QSocketNotifier::activated, context: &m_workerPipe, slot: &WorkerPipe::processIdData); |
| 55 | connect(sender: &m_workerPipe, signal: &WorkerPipe::newDocumentIds, context: this, slot: &App::slotNewBatch); |
| 56 | connect(sender: &m_workerPipe, signal: &WorkerPipe::inputEnd, context: this, slot: &QCoreApplication::quit); |
| 57 | } |
| 58 | |
| 59 | App::~App() |
| 60 | { |
| 61 | if (m_tr) { |
| 62 | // Abort the transaction in case the parent process exited |
| 63 | m_tr->abort(); |
| 64 | m_tr.reset(); |
| 65 | } |
| 66 | } |
| 67 | |
| 68 | void App::slotNewBatch(const QVector<quint64>& ids) |
| 69 | { |
| 70 | m_ids = ids; |
| 71 | |
| 72 | Database *db = globalDatabaseInstance(); |
| 73 | if (db->open(mode: Database::CreateDatabase) != Database::OpenResult::Success) { |
| 74 | qCCritical(BALOO) << "Failed to open the database" ; |
| 75 | exit(status: 1); |
| 76 | } |
| 77 | |
| 78 | Q_ASSERT(m_tr == nullptr); |
| 79 | |
| 80 | if (!m_isBusy) { |
| 81 | m_idleTime->catchNextResumeEvent(); |
| 82 | } |
| 83 | |
| 84 | QTimer::singleShot(interval: (m_isBusy ? 500 : 0), receiver: this, slot: [this, db]() { |
| 85 | // FIXME: The transaction is open for way too long. We should just open it for when we're |
| 86 | // committing the data not during the extraction. |
| 87 | m_tr = std::make_unique<Transaction>(args: db, args: Transaction::ReadWrite); |
| 88 | processNextFile(); |
| 89 | }); |
| 90 | } |
| 91 | |
| 92 | void App::processNextFile() |
| 93 | { |
| 94 | if (!m_ids.isEmpty()) { |
| 95 | quint64 id = m_ids.takeFirst(); |
| 96 | |
| 97 | QString url = QFile::decodeName(localFileName: m_tr->documentUrl(id)); |
| 98 | if (url.isEmpty() || !QFile::exists(fileName: url)) { |
| 99 | m_tr->removeDocument(id); |
| 100 | QTimer::singleShot(interval: 0, receiver: this, slot: &App::processNextFile); |
| 101 | return; |
| 102 | } |
| 103 | |
| 104 | bool indexed = index(tr: m_tr.get(), filePath: url, id); |
| 105 | |
| 106 | int delay = (m_isBusy && indexed) ? 10 : 0; |
| 107 | QTimer::singleShot(interval: delay, receiver: this, slot: &App::processNextFile); |
| 108 | |
| 109 | } else { |
| 110 | bool ok = m_tr->commit(); |
| 111 | if (!ok) { |
| 112 | exit(status: 2); |
| 113 | } |
| 114 | m_tr.reset(); |
| 115 | |
| 116 | m_workerPipe.batchFinished(); |
| 117 | } |
| 118 | } |
| 119 | |
| 120 | bool App::index(Transaction* tr, const QString& url, quint64 id) |
| 121 | { |
| 122 | if (!m_config.shouldBeIndexed(path: url)) { |
| 123 | // This apparently happens when the config has changed after the document |
| 124 | // was added to the content indexing db |
| 125 | qCDebug(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped" ; |
| 126 | tr->removeDocument(id); |
| 127 | m_workerPipe.urlFailed(url); |
| 128 | return false; |
| 129 | } |
| 130 | |
| 131 | // The initial BasicIndexingJob run has been supplied with the file extension |
| 132 | // mimetype only, skip based on the "real" mimetype |
| 133 | QString mimetype = KFileMetaData::MimeUtils::strictMimeType(filePath: url, db: m_mimeDb).name(); |
| 134 | if (!m_config.shouldMimeTypeBeIndexed(mimeType: mimetype)) { |
| 135 | qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype; |
| 136 | // FIXME: in case the extension based and content based mimetype differ |
| 137 | // we should update it. |
| 138 | tr->removePhaseOne(id); |
| 139 | m_workerPipe.urlFailed(url); |
| 140 | return false; |
| 141 | } |
| 142 | |
| 143 | // HACK: Also, we're ignoring ttext files which are greater tha 10 Mb as we |
| 144 | // have trouble processing them |
| 145 | // |
| 146 | if (mimetype.startsWith(s: QLatin1String("text/" ))) { |
| 147 | QFileInfo fileInfo(url); |
| 148 | if (fileInfo.size() >= 10 * 1024 * 1024) { |
| 149 | qCDebug(BALOO) << "Skipping large " << url << "- mimetype:" << mimetype; |
| 150 | tr->removePhaseOne(id); |
| 151 | m_workerPipe.urlFailed(url); |
| 152 | return false; |
| 153 | } |
| 154 | } |
| 155 | qCDebug(BALOO) << "Indexing" << id << url << mimetype; |
| 156 | m_workerPipe.urlStarted(url); |
| 157 | |
| 158 | // We always run the basic indexing again. This is mostly so that the proper |
| 159 | // mimetype is set and we get proper type information. |
| 160 | // The mimetype fetched in the BasicIndexingJob is fast but not accurate |
| 161 | BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel); |
| 162 | if (!basicIndexer.index()) { |
| 163 | qCDebug(BALOO) << "Skipping non-existing file " << url << "- mimetype:" << mimetype; |
| 164 | tr->removePhaseOne(id); |
| 165 | m_workerPipe.urlFailed(url); |
| 166 | return false; |
| 167 | } |
| 168 | |
| 169 | Baloo::Document doc = basicIndexer.document(); |
| 170 | |
| 171 | Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractMetaData | KFileMetaData::ExtractionResult::ExtractPlainText); |
| 172 | result.setDocument(doc); |
| 173 | |
| 174 | const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype); |
| 175 | |
| 176 | for (KFileMetaData::Extractor* ex : exList) { |
| 177 | ex->extract(result: &result); |
| 178 | } |
| 179 | |
| 180 | result.finish(); |
| 181 | if (doc.id() != id) { |
| 182 | qCWarning(BALOO) << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created" ; |
| 183 | tr->removeDocument(id); |
| 184 | if (!tr->hasDocument(id: doc.id())) { |
| 185 | tr->addDocument(doc: result.document()); |
| 186 | } else { |
| 187 | tr->replaceDocument(doc: result.document(), operations: DocumentTerms | DocumentData); |
| 188 | } |
| 189 | } else { |
| 190 | tr->replaceDocument(doc: result.document(), operations: DocumentTerms | DocumentData); |
| 191 | } |
| 192 | tr->removePhaseOne(id: doc.id()); |
| 193 | m_workerPipe.urlFinished(url); |
| 194 | return true; |
| 195 | } |
| 196 | |
| 197 | #include "moc_app.cpp" |
| 198 | |