1 | /* |
2 | This file is part of the KDE Baloo project. |
3 | SPDX-FileCopyrightText: 2015 Vishesh Handa <vhanda@kde.org> |
4 | |
5 | SPDX-License-Identifier: LGPL-2.1-or-later |
6 | */ |
7 | |
8 | #include "transaction.h" |
9 | #include "documentdb.h" |
10 | #include "documenturldb.h" |
11 | #include "documentiddb.h" |
12 | #include "positiondb.h" |
13 | #include "documentdatadb.h" |
14 | |
15 | #include "document.h" |
16 | #include "enginequery.h" |
17 | |
18 | #include "andpostingiterator.h" |
19 | #include "orpostingiterator.h" |
20 | #include "phraseanditerator.h" |
21 | |
22 | #include "idutils.h" |
23 | #include "database.h" |
24 | #include "databasesize.h" |
25 | |
26 | #include "enginedebug.h" |
27 | |
28 | #include <QFile> |
29 | #include <QFileInfo> |
30 | |
31 | #include <iostream> |
32 | |
33 | using namespace Baloo; |
34 | |
35 | Transaction::Transaction(const Database& db, Transaction::TransactionType type) |
36 | : m_dbis(db.m_dbis) |
37 | , m_env(db.m_env) |
38 | { |
39 | init(type); |
40 | } |
41 | |
42 | void Transaction::reset(TransactionType type) |
43 | { |
44 | if (m_txn) { |
45 | qWarning(catFunc: ENGINE) << "Resetting a Transaction without calling abort/commit" ; |
46 | abort(); |
47 | } |
48 | init(type); |
49 | } |
50 | |
51 | void Transaction::init(TransactionType type) |
52 | { |
53 | uint flags = type == ReadOnly ? MDB_RDONLY : 0; |
54 | int rc = mdb_txn_begin(env: m_env, parent: nullptr, flags, txn: &m_txn); |
55 | if (rc) { |
56 | qCDebug(ENGINE) << "Transaction" << mdb_strerror(err: rc); |
57 | return; |
58 | } |
59 | |
60 | if (type == ReadWrite) { |
61 | m_writeTrans = std::make_unique<WriteTransaction>(args: m_dbis, args&: m_txn); |
62 | } |
63 | } |
64 | |
65 | Transaction::Transaction(Database* db, Transaction::TransactionType type) |
66 | : Transaction(*db, type) |
67 | { |
68 | } |
69 | |
70 | Transaction::~Transaction() |
71 | { |
72 | if (m_writeTrans) { |
73 | qWarning(catFunc: ENGINE) << "Closing an active WriteTransaction without calling abort/commit" ; |
74 | } |
75 | |
76 | if (m_txn) { |
77 | abort(); |
78 | } |
79 | } |
80 | |
81 | bool Transaction::hasDocument(quint64 id) const |
82 | { |
83 | Q_ASSERT(id > 0); |
84 | |
85 | DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); |
86 | return docUrlDb.contains(docId: id); |
87 | } |
88 | |
89 | bool Transaction::inPhaseOne(quint64 id) const |
90 | { |
91 | Q_ASSERT(id > 0); |
92 | DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); |
93 | return contentIndexingDb.contains(docId: id); |
94 | } |
95 | |
96 | bool Transaction::hasFailed(quint64 id) const |
97 | { |
98 | Q_ASSERT(id > 0); |
99 | DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn); |
100 | return failedIdDb.contains(docId: id); |
101 | } |
102 | |
103 | QVector<quint64> Transaction::failedIds(quint64 limit) const |
104 | { |
105 | DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn); |
106 | return failedIdDb.fetchItems(size: limit); |
107 | } |
108 | |
109 | QByteArray Transaction::documentUrl(quint64 id) const |
110 | { |
111 | Q_ASSERT(m_txn); |
112 | Q_ASSERT(id > 0); |
113 | |
114 | DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); |
115 | return docUrlDb.get(docId: id); |
116 | } |
117 | |
118 | quint64 Transaction::documentId(const QByteArray& path) const |
119 | { |
120 | Q_ASSERT(m_txn); |
121 | Q_ASSERT(!path.isEmpty()); |
122 | |
123 | DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); |
124 | QList<QByteArray> li = path.split(sep: '/'); |
125 | |
126 | quint64 parentId = 0; |
127 | for (const QByteArray& fileName : li) { |
128 | if (fileName.isEmpty()) { |
129 | continue; |
130 | } |
131 | |
132 | parentId = docUrlDb.getId(docId: parentId, fileName); |
133 | if (!parentId) { |
134 | return 0; |
135 | } |
136 | } |
137 | |
138 | return parentId; |
139 | } |
140 | |
141 | DocumentTimeDB::TimeInfo Transaction::documentTimeInfo(quint64 id) const |
142 | { |
143 | Q_ASSERT(m_txn); |
144 | |
145 | DocumentTimeDB docTimeDb(m_dbis.docTimeDbi, m_txn); |
146 | return docTimeDb.get(docId: id); |
147 | } |
148 | |
149 | QByteArray Transaction::documentData(quint64 id) const |
150 | { |
151 | Q_ASSERT(m_txn); |
152 | Q_ASSERT(id > 0); |
153 | |
154 | DocumentDataDB docDataDb(m_dbis.docDataDbi, m_txn); |
155 | return docDataDb.get(docId: id); |
156 | } |
157 | |
158 | QVector<quint64> Transaction::fetchPhaseOneIds(int size) const |
159 | { |
160 | Q_ASSERT(m_txn); |
161 | Q_ASSERT(size > 0); |
162 | |
163 | DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); |
164 | return contentIndexingDb.fetchItems(size); |
165 | } |
166 | |
167 | QVector<QByteArray> Transaction::fetchTermsStartingWith(const QByteArray& term) const |
168 | { |
169 | Q_ASSERT(term.size() > 0); |
170 | |
171 | PostingDB postingDb(m_dbis.postingDbi, m_txn); |
172 | return postingDb.fetchTermsStartingWith(term); |
173 | } |
174 | |
175 | uint Transaction::phaseOneSize() const |
176 | { |
177 | Q_ASSERT(m_txn); |
178 | |
179 | DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); |
180 | return contentIndexingDb.size(); |
181 | } |
182 | |
183 | uint Transaction::size() const |
184 | { |
185 | Q_ASSERT(m_txn); |
186 | |
187 | DocumentDB docTermsDb(m_dbis.docTermsDbi, m_txn); |
188 | return docTermsDb.size(); |
189 | } |
190 | |
191 | // |
192 | // Write Operations |
193 | // |
194 | void Transaction::setPhaseOne(quint64 id) |
195 | { |
196 | Q_ASSERT(m_txn); |
197 | Q_ASSERT(id > 0); |
198 | Q_ASSERT(m_writeTrans); |
199 | |
200 | DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); |
201 | contentIndexingDb.put(docId: id); |
202 | } |
203 | |
204 | void Transaction::removePhaseOne(quint64 id) |
205 | { |
206 | Q_ASSERT(m_txn); |
207 | Q_ASSERT(id > 0); |
208 | Q_ASSERT(m_writeTrans); |
209 | |
210 | DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); |
211 | contentIndexingDb.del(docID: id); |
212 | } |
213 | |
214 | void Transaction::addFailed(quint64 id) |
215 | { |
216 | Q_ASSERT(m_txn); |
217 | Q_ASSERT(id > 0); |
218 | Q_ASSERT(m_writeTrans); |
219 | |
220 | DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn); |
221 | failedIdDb.put(docId: id); |
222 | } |
223 | |
224 | void Transaction::addDocument(const Document& doc) |
225 | { |
226 | Q_ASSERT(m_txn); |
227 | Q_ASSERT(doc.id() > 0); |
228 | if (!m_writeTrans) { |
229 | qCWarning(ENGINE) << "m_writeTrans is null" ; |
230 | return; |
231 | } |
232 | |
233 | m_writeTrans->addDocument(doc); |
234 | } |
235 | |
236 | void Transaction::removeDocument(quint64 id) |
237 | { |
238 | Q_ASSERT(m_txn); |
239 | Q_ASSERT(id > 0); |
240 | if (!m_writeTrans) { |
241 | qCWarning(ENGINE) << "m_writeTrans is null" ; |
242 | return; |
243 | } |
244 | |
245 | m_writeTrans->removeDocument(id); |
246 | } |
247 | |
248 | void Transaction::removeRecursively(quint64 id) |
249 | { |
250 | Q_ASSERT(m_txn); |
251 | Q_ASSERT(id > 0); |
252 | if (!m_writeTrans) { |
253 | qCWarning(ENGINE) << "m_writeTrans is null" ; |
254 | return; |
255 | } |
256 | |
257 | m_writeTrans->removeRecursively(parentId: id); |
258 | } |
259 | |
260 | void Transaction::replaceDocument(const Document& doc, DocumentOperations operations) |
261 | { |
262 | Q_ASSERT(m_txn); |
263 | Q_ASSERT(doc.id() > 0); |
264 | Q_ASSERT(m_writeTrans); |
265 | if (!hasDocument(id: doc.id())) { |
266 | qCDebug(ENGINE) << "Transaction::replaceDocument" << "Document does not exist" ; |
267 | } |
268 | |
269 | if (!m_writeTrans) { |
270 | qCWarning(ENGINE) << "m_writeTrans is null" ; |
271 | return; |
272 | } |
273 | |
274 | m_writeTrans->replaceDocument(doc, operations); |
275 | } |
276 | |
277 | bool Transaction::commit() |
278 | { |
279 | Q_ASSERT(m_txn); |
280 | if (!m_writeTrans) { |
281 | qCWarning(ENGINE) << "m_writeTrans is null" ; |
282 | return false; |
283 | } |
284 | |
285 | m_writeTrans->commit(); |
286 | m_writeTrans.reset(); |
287 | |
288 | int rc = mdb_txn_commit(txn: m_txn); |
289 | m_txn = nullptr; |
290 | |
291 | if (rc) { |
292 | qCWarning(ENGINE) << "Transaction::commit" << mdb_strerror(err: rc); |
293 | return false; |
294 | } |
295 | |
296 | return true; |
297 | } |
298 | |
299 | void Transaction::abort() |
300 | { |
301 | Q_ASSERT(m_txn); |
302 | |
303 | mdb_txn_abort(txn: m_txn); |
304 | m_txn = nullptr; |
305 | |
306 | m_writeTrans.reset(); |
307 | } |
308 | |
309 | // |
310 | // Queries |
311 | // |
312 | |
313 | PostingIterator* Transaction::postingIterator(const EngineQuery& query) const |
314 | { |
315 | PostingDB postingDb(m_dbis.postingDbi, m_txn); |
316 | PositionDB positionDb(m_dbis.positionDBi, m_txn); |
317 | |
318 | if (query.leaf()) { |
319 | if (query.op() == EngineQuery::Equal) { |
320 | return postingDb.iter(term: query.term()); |
321 | } else if (query.op() == EngineQuery::StartsWith) { |
322 | return postingDb.prefixIter(term: query.term()); |
323 | } else { |
324 | Q_ASSERT(0); |
325 | } |
326 | } |
327 | |
328 | const auto subQueries = query.subQueries(); |
329 | if (subQueries.isEmpty()) { |
330 | return nullptr; |
331 | } |
332 | |
333 | Q_ASSERT(query.op() == EngineQuery::Phrase); |
334 | if (query.op() == EngineQuery::Phrase) { |
335 | if (subQueries.size() == 1) { |
336 | qCDebug(ENGINE) << "Degenerated Phrase with 1 Term:" << query; |
337 | return postingIterator(query: subQueries[0]); |
338 | } |
339 | QVector<VectorPositionInfoIterator*> vec; |
340 | vec.reserve(asize: subQueries.size()); |
341 | for (const EngineQuery& q : subQueries) { |
342 | if (!q.leaf()) { |
343 | qCDebug(ENGINE) << "Transaction::toPostingIterator" << "Phrase subqueries must be leafs" ; |
344 | continue; |
345 | } |
346 | auto termMatch = positionDb.iter(term: q.term()); |
347 | if (!termMatch) { |
348 | return nullptr; |
349 | } |
350 | vec << termMatch; |
351 | } |
352 | |
353 | return new PhraseAndIterator(vec); |
354 | } |
355 | |
356 | return nullptr; |
357 | } |
358 | |
359 | PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, qlonglong value, PostingDB::Comparator com) const |
360 | { |
361 | PostingDB postingDb(m_dbis.postingDbi, m_txn); |
362 | return postingDb.compIter(prefix, val: value, com); |
363 | } |
364 | |
365 | PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, double value, PostingDB::Comparator com) const |
366 | { |
367 | PostingDB postingDb(m_dbis.postingDbi, m_txn); |
368 | return postingDb.compIter(prefix, val: value, com); |
369 | } |
370 | |
371 | PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, const QByteArray& value, PostingDB::Comparator com) const |
372 | { |
373 | PostingDB postingDb(m_dbis.postingDbi, m_txn); |
374 | return postingDb.compIter(prefix, val: value, com); |
375 | } |
376 | |
377 | PostingIterator* Transaction::mTimeRangeIter(quint32 beginTime, quint32 endTime) const |
378 | { |
379 | MTimeDB mTimeDb(m_dbis.mtimeDbi, m_txn); |
380 | return mTimeDb.iterRange(beginTime, endTime); |
381 | } |
382 | |
383 | PostingIterator* Transaction::docUrlIter(quint64 id) const |
384 | { |
385 | DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); |
386 | return docUrlDb.iter(docId: id); |
387 | } |
388 | |
389 | // |
390 | // Introspection |
391 | // |
392 | |
393 | QVector<QByteArray> Transaction::documentTerms(quint64 docId) const |
394 | { |
395 | Q_ASSERT(docId); |
396 | |
397 | DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); |
398 | return documentTermsDB.get(docId); |
399 | } |
400 | |
401 | QVector<QByteArray> Transaction::documentFileNameTerms(quint64 docId) const |
402 | { |
403 | Q_ASSERT(docId); |
404 | |
405 | DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); |
406 | return documentFileNameTermsDB.get(docId); |
407 | } |
408 | |
409 | QVector<QByteArray> Transaction::documentXattrTerms(quint64 docId) const |
410 | { |
411 | Q_ASSERT(docId); |
412 | |
413 | DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); |
414 | return documentXattrTermsDB.get(docId); |
415 | } |
416 | |
417 | // |
418 | // File Size |
419 | // |
420 | static size_t dbiSize(MDB_txn* txn, MDB_dbi dbi) |
421 | { |
422 | MDB_stat stat; |
423 | mdb_stat(txn, dbi, stat: &stat); |
424 | |
425 | return (stat.ms_branch_pages + stat.ms_leaf_pages + stat.ms_overflow_pages) * stat.ms_psize; |
426 | } |
427 | |
428 | DatabaseSize Transaction::dbSize() |
429 | { |
430 | DatabaseSize dbSize; |
431 | dbSize.postingDb = dbiSize(txn: m_txn, dbi: m_dbis.postingDbi); |
432 | dbSize.positionDb = dbiSize(txn: m_txn, dbi: m_dbis.positionDBi); |
433 | dbSize.docTerms = dbiSize(txn: m_txn, dbi: m_dbis.docTermsDbi); |
434 | dbSize.docFilenameTerms = dbiSize(txn: m_txn, dbi: m_dbis.docFilenameTermsDbi); |
435 | dbSize.docXattrTerms = dbiSize(txn: m_txn, dbi: m_dbis.docXattrTermsDbi); |
436 | |
437 | dbSize.idTree = dbiSize(txn: m_txn, dbi: m_dbis.idTreeDbi); |
438 | dbSize.idFilename = dbiSize(txn: m_txn, dbi: m_dbis.idFilenameDbi); |
439 | |
440 | dbSize.docTime = dbiSize(txn: m_txn, dbi: m_dbis.docTimeDbi); |
441 | dbSize.docData = dbiSize(txn: m_txn, dbi: m_dbis.docDataDbi); |
442 | |
443 | dbSize.contentIndexingIds = dbiSize(txn: m_txn, dbi: m_dbis.contentIndexingDbi); |
444 | dbSize.failedIds = dbiSize(txn: m_txn, dbi: m_dbis.failedIdDbi); |
445 | |
446 | dbSize.mtimeDb = dbiSize(txn: m_txn, dbi: m_dbis.mtimeDbi); |
447 | |
448 | dbSize.expectedSize = dbSize.postingDb + dbSize.positionDb + dbSize.docTerms + dbSize.docFilenameTerms |
449 | + dbSize.docXattrTerms + dbSize.idTree + dbSize.idFilename + dbSize.docTime |
450 | + dbSize.docData + dbSize.contentIndexingIds + dbSize.failedIds + dbSize.mtimeDb; |
451 | |
452 | MDB_envinfo info; |
453 | mdb_env_info(env: m_env, stat: &info); |
454 | dbSize.actualSize = info.me_last_pgno * 4096; // TODO: separate page size |
455 | |
456 | return dbSize; |
457 | } |
458 | |
459 | // |
460 | // Debugging |
461 | // |
462 | void Transaction::checkFsTree() |
463 | { |
464 | DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); |
465 | DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); |
466 | DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); |
467 | DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); |
468 | PostingDB postingDb(m_dbis.postingDbi, m_txn); |
469 | |
470 | const auto map = postingDb.toTestMap(); |
471 | |
472 | QSet<quint64> allIds; |
473 | for (const auto& list : map) { |
474 | for (quint64 id : list) { |
475 | allIds << id; |
476 | } |
477 | } |
478 | |
479 | std::cout << "Total Document IDs: " << allIds.size() << std::endl; |
480 | |
481 | int count = 0; |
482 | for (quint64 id: std::as_const(t&: allIds)) { |
483 | QByteArray url = docUrlDb.get(docId: id); |
484 | if (url.isEmpty()) { |
485 | auto terms = documentTermsDB.get(docId: id); |
486 | auto fileNameTerms = documentFileNameTermsDB.get(docId: id); |
487 | auto xAttrTerms = documentXattrTermsDB.get(docId: id); |
488 | |
489 | // Lets reverse engineer the terms |
490 | QList<QByteArray> newTerms; |
491 | QMapIterator<QByteArray, PostingList> it(map); |
492 | while (it.hasNext()) { |
493 | it.next(); |
494 | if (it.value().contains(t: id)) { |
495 | newTerms << it.key(); |
496 | } |
497 | } |
498 | |
499 | std::cout << "Missing filePath for " << id << std::endl; |
500 | std::cout << "\tPostingDB Terms: " ; |
501 | for (const QByteArray& term : std::as_const(t&: newTerms)) { |
502 | std::cout << qPrintable(QString::fromUtf8(term)) << " " ; |
503 | } |
504 | std::cout << std::endl; |
505 | |
506 | std::cout << "\tDocumentTermsDB: " ; |
507 | for (const QByteArray& term : terms) { |
508 | std::cout << qPrintable(QString::fromUtf8(term)) << " " ; |
509 | } |
510 | std::cout << std::endl; |
511 | |
512 | std::cout << "\tFileNameTermsDB: " ; |
513 | for (const QByteArray& term : fileNameTerms) { |
514 | std::cout << qPrintable(QString::fromUtf8(term)) << " " ; |
515 | } |
516 | std::cout << std::endl; |
517 | |
518 | std::cout << "\tXAttrTermsDB: " ; |
519 | for (const QByteArray& term : xAttrTerms) { |
520 | std::cout << qPrintable(QString::fromUtf8(term)) << " " ; |
521 | } |
522 | std::cout << std::endl; |
523 | |
524 | count++; |
525 | } else if (!QFileInfo::exists(file: QString::fromUtf8(ba: url))) { |
526 | std::cout << "FilePath " << qPrintable(QString::fromUtf8(url)) << " for " << id << " does not exist" << std::endl; |
527 | count++; |
528 | } |
529 | } |
530 | |
531 | std::cout << "Invalid Entries: " << count << " (" << count * 100.0 / allIds.size() << "%)" << std::endl; |
532 | } |
533 | |
534 | void Transaction::checkTermsDbinPostingDb() |
535 | { |
536 | DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); |
537 | DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); |
538 | DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); |
539 | PostingDB postingDb(m_dbis.postingDbi, m_txn); |
540 | |
541 | // Iterate over each document, and fetch all terms |
542 | // check if each term maps to its own id in the posting db |
543 | |
544 | const auto map = postingDb.toTestMap(); |
545 | |
546 | QSet<quint64> allIds; |
547 | for (const auto& list : map) { |
548 | for (quint64 id : list) { |
549 | allIds << id; |
550 | } |
551 | } |
552 | |
553 | std::cout << "PostingDB check .." << std::endl; |
554 | for (quint64 id : std::as_const(t&: allIds)) { |
555 | QVector<QByteArray> terms = documentTermsDB.get(docId: id); |
556 | terms += documentXattrTermsDB.get(docId: id); |
557 | terms += documentFileNameTermsDB.get(docId: id); |
558 | |
559 | for (const QByteArray& term : std::as_const(t&: terms)) { |
560 | PostingList plist = postingDb.get(term); |
561 | if (!plist.contains(t: id)) { |
562 | std::cout << id << " is missing term " << qPrintable(QString::fromUtf8(term)) << std::endl; |
563 | } |
564 | } |
565 | } |
566 | } |
567 | |
568 | void Transaction::checkPostingDbinTermsDb() |
569 | { |
570 | DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); |
571 | DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); |
572 | DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); |
573 | PostingDB postingDb(m_dbis.postingDbi, m_txn); |
574 | |
575 | QMap<QByteArray, PostingList> map = postingDb.toTestMap(); |
576 | QMapIterator<QByteArray, PostingList> it(map); |
577 | |
578 | std::cout << "DocumentTermsDB check .." << std::endl; |
579 | while (it.hasNext()) { |
580 | it.next(); |
581 | |
582 | const QByteArray& term = it.key(); |
583 | const PostingList& list = it.value(); |
584 | for (quint64 id : list) { |
585 | if (documentTermsDB.get(docId: id).contains(t: term)) { |
586 | continue; |
587 | } |
588 | if (documentFileNameTermsDB.get(docId: id).contains(t: term)) { |
589 | continue; |
590 | } |
591 | if (documentXattrTermsDB.get(docId: id).contains(t: term)) { |
592 | continue; |
593 | } |
594 | std::cout << id << " is missing " << qPrintable(QString::fromUtf8(term)) << " from document terms db" << std::endl; |
595 | } |
596 | } |
597 | } |
598 | |