1/*
2 This file is part of the KDE Baloo project.
3 SPDX-FileCopyrightText: 2014-2015 Vishesh Handa <vhanda@kde.org>
4
5 SPDX-License-Identifier: LGPL-2.1-or-later
6*/
7
8#include "termgenerator.h"
9
10#include <QTextBoundaryFinder>
11
12using namespace Baloo;
13
14namespace {
15
16QString normalizeTerm(const QString &str)
17{
18 // Remove all accents. It is important to call toLower after normalization,
19 // since some exotic unicode symbols can remain uppercase
20 const QString denormalized = str.normalized(mode: QString::NormalizationForm_KD).toLower();
21
22 QString cleanString;
23 cleanString.reserve(asize: denormalized.size());
24 for (const auto& c : denormalized) {
25 if (!c.isMark()) {
26 cleanString.append(c);
27 }
28 }
29
30 return cleanString.normalized(mode: QString::NormalizationForm_KC);
31}
32
33void appendTerm(QByteArrayList &list, const QString &term)
34{
35 if (!term.isEmpty()) {
36 // Truncate the string to avoid arbitrarily long terms
37 list << QStringView(term).left(n: TermGenerator::maxTermSize).toUtf8();
38 }
39}
40
41}
42
43TermGenerator::TermGenerator(Document& doc)
44 : m_doc(doc)
45 , m_position(1)
46{
47}
48
49void TermGenerator::indexText(const QString& text)
50{
51 indexText(text, prefix: QByteArray());
52}
53
54QByteArrayList TermGenerator::termList(const QString& text_)
55{
56 QString text(text_);
57 text.replace(before: QLatin1Char('_'), after: QLatin1Char(' '));
58
59 int start = 0;
60
61 auto isSkipChar = [] (const QChar& c) {
62 return c.isPunct() || c.isMark() || c.isSpace() || (!c.isPrint() && !c.isSurrogate());
63 };
64
65 QByteArrayList list;
66 QTextBoundaryFinder bf(QTextBoundaryFinder::Word, text);
67 for (; bf.position() != -1; bf.toNextBoundary()) {
68 int end = bf.position();
69 while (start < end && isSkipChar(text[start])) {
70 start++;
71 }
72 if (end == start) {
73 continue;
74 }
75
76 // Typically we commit a term when we have an EndOfItem, starting
77 // from the last StartOfItem, everything between last EndOfItem and
78 // StartOfItem is just whitespace and punctuation. Unfortunately,
79 // most CJK characters do not trigger a StartOfItem and thus no
80 // EndOfItem, so everything in front of a StartOfItem has to be
81 // committed as well
82 bool commit = bf.boundaryReasons() & (QTextBoundaryFinder::EndOfItem | QTextBoundaryFinder::StartOfItem);
83
84 // Also commit term if end-of-text is reached or when we find
85 // any punctuation
86 if (!commit & (end == text.size() || isSkipChar(text[end]))) {
87 commit = true;
88 }
89
90 if (commit) {
91 const QString term = normalizeTerm(str: text.mid(position: start, n: end - start));
92 appendTerm(list, term);
93 start = end;
94 }
95 }
96 return list;
97}
98
99void TermGenerator::indexText(const QString& text, const QByteArray& prefix)
100{
101 const QByteArrayList terms = termList(text_: text);
102 if (terms.size() == 1) {
103 QByteArray finalArr = prefix + terms[0];
104 m_doc.addTerm(term: finalArr);
105 return;
106 }
107 for (const QByteArray& term : terms) {
108 QByteArray finalArr = prefix + term;
109
110 m_doc.addPositionTerm(term: finalArr, position: m_position);
111 m_position++;
112 }
113 m_position++;
114}
115
116void TermGenerator::indexFileNameText(const QString& text)
117{
118 const QByteArray prefix = QByteArrayLiteral("F");
119 const QByteArrayList terms = termList(text_: text);
120 if (terms.size() == 1) {
121 QByteArray finalArr = prefix + terms[0];
122 m_doc.addFileNameTerm(term: finalArr);
123 return;
124 }
125 for (const QByteArray& term : terms) {
126 QByteArray finalArr = prefix + term;
127
128 m_doc.addFileNamePositionTerm(term: finalArr, position: m_position);
129 m_position++;
130 }
131 m_position++;
132}
133
134void TermGenerator::indexXattrText(const QString& text, const QByteArray& prefix)
135{
136 const QByteArrayList terms = termList(text_: text);
137 if (terms.size() == 1) {
138 QByteArray finalArr = prefix + terms[0];
139 m_doc.addXattrTerm(term: finalArr);
140 return;
141 }
142 for (const QByteArray& term : terms) {
143 QByteArray finalArr = prefix + term;
144
145 m_doc.addXattrPositionTerm(term: finalArr, position: m_position);
146 m_position++;
147 }
148 m_position++;
149}
150
151int TermGenerator::position() const
152{
153 return m_position;
154}
155
156void TermGenerator::setPosition(int position)
157{
158 m_position = position;
159}
160

source code of baloo/src/engine/termgenerator.cpp