1/*
2 This file is part of the KDE Baloo project.
3 SPDX-FileCopyrightText: 2014-2015 Vishesh Handa <vhanda@kde.org>
4
5 SPDX-License-Identifier: LGPL-2.1-or-later
6*/
7
8#include "termgenerator.h"
9#include "termgenerator_p.h"
10
11#include <QTextBoundaryFinder>
12
13using namespace Baloo;
14
15namespace {
16
17QString normalizeTerm(const QString &str)
18{
19 // Remove all accents. It is important to call toLower after normalization,
20 // since some exotic unicode symbols can remain uppercase
21 const QString denormalized = str.normalized(mode: QString::NormalizationForm_KD).toLower();
22
23 QString cleanString;
24 cleanString.reserve(asize: denormalized.size());
25 for (const auto& c : denormalized) {
26 if (!c.isMark()) {
27 cleanString.append(c);
28 }
29 }
30
31 return cleanString.normalized(mode: QString::NormalizationForm_KC);
32}
33
34void appendTerm(QByteArrayList &list, const QString &term)
35{
36 if (term.isEmpty()) {
37 return;
38 }
39 // Truncate the string to avoid arbitrarily long terms
40 auto utf8 = QStringView(term).left(n: TermGenerator::maxTermSize).toUtf8();
41 if (!utf8.isEmpty()) {
42 list.append(t: utf8);
43 }
44}
45
46}
47
48TermGenerator::TermGenerator(Document& doc)
49 : m_doc(doc)
50 , m_position(1)
51{
52}
53
54void TermGenerator::indexText(const QString& text)
55{
56 indexText(text, prefix: QByteArray());
57}
58
59QByteArrayList TermGenerator::termList(const QString& text_)
60{
61 if (!Baloo::detail::verifySurrogates(text: text_)) {
62 return {};
63 }
64
65 QString text(text_);
66 text.replace(before: QLatin1Char('_'), after: QLatin1Char(' '));
67
68 int start = 0;
69
70 auto isSkipChar = [] (const QChar& c) {
71 return c.isPunct() || c.isMark() || c.isSpace() || (!c.isPrint() && !c.isSurrogate());
72 };
73
74 QByteArrayList list;
75 QTextBoundaryFinder bf(QTextBoundaryFinder::Word, text);
76 for (; bf.position() != -1; bf.toNextBoundary()) {
77 int end = bf.position();
78 while (start < end && isSkipChar(text.at(i: start))) {
79 start++;
80 }
81 if (end == start) {
82 continue;
83 }
84
85 // Typically we commit a term when we have an EndOfItem, starting
86 // from the last StartOfItem, everything between last EndOfItem and
87 // StartOfItem is just whitespace and punctuation. Unfortunately,
88 // most CJK characters do not trigger a StartOfItem and thus no
89 // EndOfItem, so everything in front of a StartOfItem has to be
90 // committed as well
91 bool commit = bf.boundaryReasons() & (QTextBoundaryFinder::EndOfItem | QTextBoundaryFinder::StartOfItem);
92
93 // Also commit term if end-of-text is reached or when we find
94 // any punctuation
95 if (!commit & (end == text.size() || isSkipChar(text.at(i: end)))) {
96 commit = true;
97 }
98
99 if (commit) {
100 const QString term = normalizeTerm(str: text.mid(position: start, n: end - start));
101 appendTerm(list, term);
102 start = end;
103 }
104 }
105 return list;
106}
107
108void TermGenerator::indexText(const QString& text, const QByteArray& prefix)
109{
110 const QByteArrayList terms = termList(text_: text);
111 if (terms.size() == 1) {
112 QByteArray finalArr = prefix + terms[0];
113 m_doc.addTerm(term: finalArr);
114 return;
115 }
116 for (const QByteArray& term : terms) {
117 QByteArray finalArr = prefix + term;
118
119 m_doc.addPositionTerm(term: finalArr, position: m_position);
120 m_position++;
121 }
122 m_position++;
123}
124
125void TermGenerator::indexFileNameText(const QString& text)
126{
127 const QByteArray prefix = QByteArrayLiteral("F");
128 const QByteArrayList terms = termList(text_: text);
129 if (terms.size() == 1) {
130 QByteArray finalArr = prefix + terms[0];
131 m_doc.addFileNameTerm(term: finalArr);
132 return;
133 }
134 for (const QByteArray& term : terms) {
135 QByteArray finalArr = prefix + term;
136
137 m_doc.addFileNamePositionTerm(term: finalArr, position: m_position);
138 m_position++;
139 }
140 m_position++;
141}
142
143void TermGenerator::indexXattrText(const QString& text, const QByteArray& prefix)
144{
145 const QByteArrayList terms = termList(text_: text);
146 if (terms.size() == 1) {
147 QByteArray finalArr = prefix + terms[0];
148 m_doc.addXattrTerm(term: finalArr);
149 return;
150 }
151 for (const QByteArray& term : terms) {
152 QByteArray finalArr = prefix + term;
153
154 m_doc.addXattrPositionTerm(term: finalArr, position: m_position);
155 m_position++;
156 }
157 m_position++;
158}
159
160int TermGenerator::position() const
161{
162 return m_position;
163}
164
165void TermGenerator::setPosition(int position)
166{
167 m_position = position;
168}
169

// Source: baloo/src/engine/termgenerator.cpp