1/*
2 This file is part of the KDE Baloo project.
3 SPDX-FileCopyrightText: 2015 Vishesh Handa <vhanda@kde.org>
4
5 SPDX-License-Identifier: LGPL-2.1-or-later
6*/
7
#include "transaction.h"
#include "documentdb.h"
#include "documenturldb.h"
#include "documentiddb.h"
#include "positiondb.h"
#include "documentdatadb.h"

#include "document.h"
#include "enginequery.h"

#include "andpostingiterator.h"
#include "orpostingiterator.h"
#include "phraseanditerator.h"
#include "vectorpositioninfoiterator.h"

#include "idutils.h"
#include "database.h"
#include "databasesize.h"

#include "enginedebug.h"

#include <QFile>
#include <QFileInfo>
#include <QtAlgorithms>

#include <iostream>
32
33using namespace Baloo;
34
35Transaction::Transaction(const Database& db, Transaction::TransactionType type)
36 : m_dbis(db.m_dbis)
37 , m_env(db.m_env)
38{
39 init(type);
40}
41
42void Transaction::reset(TransactionType type)
43{
44 if (m_txn) {
45 qWarning(catFunc: ENGINE) << "Resetting a Transaction without calling abort/commit";
46 abort();
47 }
48 init(type);
49}
50
51void Transaction::init(TransactionType type)
52{
53 uint flags = type == ReadOnly ? MDB_RDONLY : 0;
54 int rc = mdb_txn_begin(env: m_env, parent: nullptr, flags, txn: &m_txn);
55 if (rc) {
56 qCDebug(ENGINE) << "Transaction" << mdb_strerror(err: rc);
57 return;
58 }
59
60 if (type == ReadWrite) {
61 m_writeTrans = std::make_unique<WriteTransaction>(args: m_dbis, args&: m_txn);
62 }
63}
64
65Transaction::Transaction(Database* db, Transaction::TransactionType type)
66 : Transaction(*db, type)
67{
68}
69
70Transaction::~Transaction()
71{
72 if (m_writeTrans) {
73 qWarning(catFunc: ENGINE) << "Closing an active WriteTransaction without calling abort/commit";
74 }
75
76 if (m_txn) {
77 abort();
78 }
79}
80
81bool Transaction::hasDocument(quint64 id) const
82{
83 Q_ASSERT(id > 0);
84
85 DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
86 return docUrlDb.contains(docId: id);
87}
88
89bool Transaction::inPhaseOne(quint64 id) const
90{
91 Q_ASSERT(id > 0);
92 DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
93 return contentIndexingDb.contains(docId: id);
94}
95
96bool Transaction::hasFailed(quint64 id) const
97{
98 Q_ASSERT(id > 0);
99 DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn);
100 return failedIdDb.contains(docId: id);
101}
102
103QVector<quint64> Transaction::failedIds(quint64 limit) const
104{
105 DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn);
106 return failedIdDb.fetchItems(size: limit);
107}
108
109QByteArray Transaction::documentUrl(quint64 id) const
110{
111 Q_ASSERT(m_txn);
112 Q_ASSERT(id > 0);
113
114 DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
115 return docUrlDb.get(docId: id);
116}
117
118quint64 Transaction::documentId(const QByteArray& path) const
119{
120 Q_ASSERT(m_txn);
121 Q_ASSERT(!path.isEmpty());
122
123 DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
124 QList<QByteArray> li = path.split(sep: '/');
125
126 quint64 parentId = 0;
127 for (const QByteArray& fileName : li) {
128 if (fileName.isEmpty()) {
129 continue;
130 }
131
132 parentId = docUrlDb.getId(docId: parentId, fileName);
133 if (!parentId) {
134 return 0;
135 }
136 }
137
138 return parentId;
139}
140
141DocumentTimeDB::TimeInfo Transaction::documentTimeInfo(quint64 id) const
142{
143 Q_ASSERT(m_txn);
144
145 DocumentTimeDB docTimeDb(m_dbis.docTimeDbi, m_txn);
146 return docTimeDb.get(docId: id);
147}
148
149QByteArray Transaction::documentData(quint64 id) const
150{
151 Q_ASSERT(m_txn);
152 Q_ASSERT(id > 0);
153
154 DocumentDataDB docDataDb(m_dbis.docDataDbi, m_txn);
155 return docDataDb.get(docId: id);
156}
157
158QVector<quint64> Transaction::fetchPhaseOneIds(int size) const
159{
160 Q_ASSERT(m_txn);
161 Q_ASSERT(size > 0);
162
163 DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
164 return contentIndexingDb.fetchItems(size);
165}
166
167QVector<QByteArray> Transaction::fetchTermsStartingWith(const QByteArray& term) const
168{
169 Q_ASSERT(term.size() > 0);
170
171 PostingDB postingDb(m_dbis.postingDbi, m_txn);
172 return postingDb.fetchTermsStartingWith(term);
173}
174
175uint Transaction::phaseOneSize() const
176{
177 Q_ASSERT(m_txn);
178
179 DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
180 return contentIndexingDb.size();
181}
182
183uint Transaction::size() const
184{
185 Q_ASSERT(m_txn);
186
187 DocumentDB docTermsDb(m_dbis.docTermsDbi, m_txn);
188 return docTermsDb.size();
189}
190
191//
192// Write Operations
193//
194void Transaction::setPhaseOne(quint64 id)
195{
196 Q_ASSERT(m_txn);
197 Q_ASSERT(id > 0);
198 Q_ASSERT(m_writeTrans);
199
200 DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
201 contentIndexingDb.put(docId: id);
202}
203
204void Transaction::removePhaseOne(quint64 id)
205{
206 Q_ASSERT(m_txn);
207 Q_ASSERT(id > 0);
208 Q_ASSERT(m_writeTrans);
209
210 DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
211 contentIndexingDb.del(docID: id);
212}
213
214void Transaction::addFailed(quint64 id)
215{
216 Q_ASSERT(m_txn);
217 Q_ASSERT(id > 0);
218 Q_ASSERT(m_writeTrans);
219
220 DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn);
221 failedIdDb.put(docId: id);
222}
223
224void Transaction::addDocument(const Document& doc)
225{
226 Q_ASSERT(m_txn);
227 Q_ASSERT(doc.id() > 0);
228 if (!m_writeTrans) {
229 qCWarning(ENGINE) << "m_writeTrans is null";
230 return;
231 }
232
233 m_writeTrans->addDocument(doc);
234}
235
236void Transaction::removeDocument(quint64 id)
237{
238 Q_ASSERT(m_txn);
239 Q_ASSERT(id > 0);
240 if (!m_writeTrans) {
241 qCWarning(ENGINE) << "m_writeTrans is null";
242 return;
243 }
244
245 m_writeTrans->removeDocument(id);
246}
247
248void Transaction::removeRecursively(quint64 id)
249{
250 Q_ASSERT(m_txn);
251 Q_ASSERT(id > 0);
252 if (!m_writeTrans) {
253 qCWarning(ENGINE) << "m_writeTrans is null";
254 return;
255 }
256
257 m_writeTrans->removeRecursively(parentId: id);
258}
259
260void Transaction::replaceDocument(const Document& doc, DocumentOperations operations)
261{
262 Q_ASSERT(m_txn);
263 Q_ASSERT(doc.id() > 0);
264 Q_ASSERT(m_writeTrans);
265 if (!hasDocument(id: doc.id())) {
266 qCDebug(ENGINE) << "Transaction::replaceDocument" << "Document does not exist";
267 }
268
269 if (!m_writeTrans) {
270 qCWarning(ENGINE) << "m_writeTrans is null";
271 return;
272 }
273
274 m_writeTrans->replaceDocument(doc, operations);
275}
276
277bool Transaction::commit()
278{
279 Q_ASSERT(m_txn);
280 if (!m_writeTrans) {
281 qCWarning(ENGINE) << "m_writeTrans is null";
282 return false;
283 }
284
285 m_writeTrans->commit();
286 m_writeTrans.reset();
287
288 int rc = mdb_txn_commit(txn: m_txn);
289 m_txn = nullptr;
290
291 if (rc) {
292 qCWarning(ENGINE) << "Transaction::commit" << mdb_strerror(err: rc);
293 return false;
294 }
295
296 return true;
297}
298
299void Transaction::abort()
300{
301 Q_ASSERT(m_txn);
302
303 mdb_txn_abort(txn: m_txn);
304 m_txn = nullptr;
305
306 m_writeTrans.reset();
307}
308
309//
310// Queries
311//
312
313PostingIterator* Transaction::postingIterator(const EngineQuery& query) const
314{
315 PostingDB postingDb(m_dbis.postingDbi, m_txn);
316 PositionDB positionDb(m_dbis.positionDBi, m_txn);
317
318 if (query.leaf()) {
319 if (query.op() == EngineQuery::Equal) {
320 return postingDb.iter(term: query.term());
321 } else if (query.op() == EngineQuery::StartsWith) {
322 return postingDb.prefixIter(term: query.term());
323 } else {
324 Q_ASSERT(0);
325 }
326 }
327
328 const auto subQueries = query.subQueries();
329 if (subQueries.isEmpty()) {
330 return nullptr;
331 }
332
333 Q_ASSERT(query.op() == EngineQuery::Phrase);
334 if (query.op() == EngineQuery::Phrase) {
335 if (subQueries.size() == 1) {
336 qCDebug(ENGINE) << "Degenerated Phrase with 1 Term:" << query;
337 return postingIterator(query: subQueries[0]);
338 }
339 QVector<VectorPositionInfoIterator*> vec;
340 vec.reserve(asize: subQueries.size());
341 for (const EngineQuery& q : subQueries) {
342 if (!q.leaf()) {
343 qCDebug(ENGINE) << "Transaction::toPostingIterator" << "Phrase subqueries must be leafs";
344 continue;
345 }
346 auto termMatch = positionDb.iter(term: q.term());
347 if (!termMatch) {
348 return nullptr;
349 }
350 vec << termMatch;
351 }
352
353 return new PhraseAndIterator(vec);
354 }
355
356 return nullptr;
357}
358
359PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, qlonglong value, PostingDB::Comparator com) const
360{
361 PostingDB postingDb(m_dbis.postingDbi, m_txn);
362 return postingDb.compIter(prefix, val: value, com);
363}
364
365PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, double value, PostingDB::Comparator com) const
366{
367 PostingDB postingDb(m_dbis.postingDbi, m_txn);
368 return postingDb.compIter(prefix, val: value, com);
369}
370
371PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, const QByteArray& value, PostingDB::Comparator com) const
372{
373 PostingDB postingDb(m_dbis.postingDbi, m_txn);
374 return postingDb.compIter(prefix, val: value, com);
375}
376
377PostingIterator* Transaction::mTimeRangeIter(quint32 beginTime, quint32 endTime) const
378{
379 MTimeDB mTimeDb(m_dbis.mtimeDbi, m_txn);
380 return mTimeDb.iterRange(beginTime, endTime);
381}
382
383PostingIterator* Transaction::docUrlIter(quint64 id) const
384{
385 DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
386 return docUrlDb.iter(docId: id);
387}
388
389//
390// Introspection
391//
392
393QVector<QByteArray> Transaction::documentTerms(quint64 docId) const
394{
395 Q_ASSERT(docId);
396
397 DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
398 return documentTermsDB.get(docId);
399}
400
401QVector<QByteArray> Transaction::documentFileNameTerms(quint64 docId) const
402{
403 Q_ASSERT(docId);
404
405 DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
406 return documentFileNameTermsDB.get(docId);
407}
408
409QVector<QByteArray> Transaction::documentXattrTerms(quint64 docId) const
410{
411 Q_ASSERT(docId);
412
413 DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
414 return documentXattrTermsDB.get(docId);
415}
416
417//
418// File Size
419//
420static size_t dbiSize(MDB_txn* txn, MDB_dbi dbi)
421{
422 MDB_stat stat;
423 mdb_stat(txn, dbi, stat: &stat);
424
425 return (stat.ms_branch_pages + stat.ms_leaf_pages + stat.ms_overflow_pages) * stat.ms_psize;
426}
427
428DatabaseSize Transaction::dbSize()
429{
430 DatabaseSize dbSize;
431 dbSize.postingDb = dbiSize(txn: m_txn, dbi: m_dbis.postingDbi);
432 dbSize.positionDb = dbiSize(txn: m_txn, dbi: m_dbis.positionDBi);
433 dbSize.docTerms = dbiSize(txn: m_txn, dbi: m_dbis.docTermsDbi);
434 dbSize.docFilenameTerms = dbiSize(txn: m_txn, dbi: m_dbis.docFilenameTermsDbi);
435 dbSize.docXattrTerms = dbiSize(txn: m_txn, dbi: m_dbis.docXattrTermsDbi);
436
437 dbSize.idTree = dbiSize(txn: m_txn, dbi: m_dbis.idTreeDbi);
438 dbSize.idFilename = dbiSize(txn: m_txn, dbi: m_dbis.idFilenameDbi);
439
440 dbSize.docTime = dbiSize(txn: m_txn, dbi: m_dbis.docTimeDbi);
441 dbSize.docData = dbiSize(txn: m_txn, dbi: m_dbis.docDataDbi);
442
443 dbSize.contentIndexingIds = dbiSize(txn: m_txn, dbi: m_dbis.contentIndexingDbi);
444 dbSize.failedIds = dbiSize(txn: m_txn, dbi: m_dbis.failedIdDbi);
445
446 dbSize.mtimeDb = dbiSize(txn: m_txn, dbi: m_dbis.mtimeDbi);
447
448 dbSize.expectedSize = dbSize.postingDb + dbSize.positionDb + dbSize.docTerms + dbSize.docFilenameTerms
449 + dbSize.docXattrTerms + dbSize.idTree + dbSize.idFilename + dbSize.docTime
450 + dbSize.docData + dbSize.contentIndexingIds + dbSize.failedIds + dbSize.mtimeDb;
451
452 MDB_envinfo info;
453 mdb_env_info(env: m_env, stat: &info);
454 dbSize.actualSize = info.me_last_pgno * 4096; // TODO: separate page size
455
456 return dbSize;
457}
458
459//
460// Debugging
461//
462void Transaction::checkFsTree()
463{
464 DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
465 DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
466 DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
467 DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
468 PostingDB postingDb(m_dbis.postingDbi, m_txn);
469
470 const auto map = postingDb.toTestMap();
471
472 QSet<quint64> allIds;
473 for (const auto& list : map) {
474 for (quint64 id : list) {
475 allIds << id;
476 }
477 }
478
479 std::cout << "Total Document IDs: " << allIds.size() << std::endl;
480
481 int count = 0;
482 for (quint64 id: std::as_const(t&: allIds)) {
483 QByteArray url = docUrlDb.get(docId: id);
484 if (url.isEmpty()) {
485 auto terms = documentTermsDB.get(docId: id);
486 auto fileNameTerms = documentFileNameTermsDB.get(docId: id);
487 auto xAttrTerms = documentXattrTermsDB.get(docId: id);
488
489 // Lets reverse engineer the terms
490 QList<QByteArray> newTerms;
491 QMapIterator<QByteArray, PostingList> it(map);
492 while (it.hasNext()) {
493 it.next();
494 if (it.value().contains(t: id)) {
495 newTerms << it.key();
496 }
497 }
498
499 std::cout << "Missing filePath for " << id << std::endl;
500 std::cout << "\tPostingDB Terms: ";
501 for (const QByteArray& term : std::as_const(t&: newTerms)) {
502 std::cout << qPrintable(QString::fromUtf8(term)) << " ";
503 }
504 std::cout << std::endl;
505
506 std::cout << "\tDocumentTermsDB: ";
507 for (const QByteArray& term : terms) {
508 std::cout << qPrintable(QString::fromUtf8(term)) << " ";
509 }
510 std::cout << std::endl;
511
512 std::cout << "\tFileNameTermsDB: ";
513 for (const QByteArray& term : fileNameTerms) {
514 std::cout << qPrintable(QString::fromUtf8(term)) << " ";
515 }
516 std::cout << std::endl;
517
518 std::cout << "\tXAttrTermsDB: ";
519 for (const QByteArray& term : xAttrTerms) {
520 std::cout << qPrintable(QString::fromUtf8(term)) << " ";
521 }
522 std::cout << std::endl;
523
524 count++;
525 } else if (!QFileInfo::exists(file: QString::fromUtf8(ba: url))) {
526 std::cout << "FilePath " << qPrintable(QString::fromUtf8(url)) << " for " << id << " does not exist"<< std::endl;
527 count++;
528 }
529 }
530
531 std::cout << "Invalid Entries: " << count << " (" << count * 100.0 / allIds.size() << "%)" << std::endl;
532}
533
534void Transaction::checkTermsDbinPostingDb()
535{
536 DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
537 DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
538 DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
539 PostingDB postingDb(m_dbis.postingDbi, m_txn);
540
541 // Iterate over each document, and fetch all terms
542 // check if each term maps to its own id in the posting db
543
544 const auto map = postingDb.toTestMap();
545
546 QSet<quint64> allIds;
547 for (const auto& list : map) {
548 for (quint64 id : list) {
549 allIds << id;
550 }
551 }
552
553 std::cout << "PostingDB check .." << std::endl;
554 for (quint64 id : std::as_const(t&: allIds)) {
555 QVector<QByteArray> terms = documentTermsDB.get(docId: id);
556 terms += documentXattrTermsDB.get(docId: id);
557 terms += documentFileNameTermsDB.get(docId: id);
558
559 for (const QByteArray& term : std::as_const(t&: terms)) {
560 PostingList plist = postingDb.get(term);
561 if (!plist.contains(t: id)) {
562 std::cout << id << " is missing term " << qPrintable(QString::fromUtf8(term)) << std::endl;
563 }
564 }
565 }
566}
567
568void Transaction::checkPostingDbinTermsDb()
569{
570 DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
571 DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
572 DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
573 PostingDB postingDb(m_dbis.postingDbi, m_txn);
574
575 QMap<QByteArray, PostingList> map = postingDb.toTestMap();
576 QMapIterator<QByteArray, PostingList> it(map);
577
578 std::cout << "DocumentTermsDB check .." << std::endl;
579 while (it.hasNext()) {
580 it.next();
581
582 const QByteArray& term = it.key();
583 const PostingList& list = it.value();
584 for (quint64 id : list) {
585 if (documentTermsDB.get(docId: id).contains(t: term)) {
586 continue;
587 }
588 if (documentFileNameTermsDB.get(docId: id).contains(t: term)) {
589 continue;
590 }
591 if (documentXattrTermsDB.get(docId: id).contains(t: term)) {
592 continue;
593 }
594 std::cout << id << " is missing " << qPrintable(QString::fromUtf8(term)) << " from document terms db" << std::endl;
595 }
596 }
597}
598

// Source: baloo/src/engine/transaction.cpp