1/*
2 This file is part of the KDE libraries
3
4 SPDX-FileCopyrightText: 2017 Forrest Smith <forrestthewoods@gmail.com>
5 SPDX-FileCopyrightText: 2021 Waqar Ahmed <waqar.17a@gmail.com>
6
7 SPDX-License-Identifier: LGPL-2.0-or-later
8*/
9#include "kfuzzymatcher.h"
10
11#include <QList>
12#include <QString>
13#include <QStringView>
14
15/*
16 * Custom toLower function which is much faster than
17 * c.toLower() directly
18 */
19static inline constexpr QChar toLower(QChar c)
20{
21 return c.isLower() ? c : c.toLower();
22}
23
24// internal
25// clang-format off
26static bool match_recursive(QStringView::const_iterator pattern,
27 QStringView::const_iterator str,
28 int &outScore,
29 const QStringView::const_iterator strBegin,
30 const QStringView::const_iterator strEnd,
31 const QStringView::const_iterator patternEnd,
32 const uint8_t *srcMatches,
33 uint8_t *matches,
34 int nextMatch,
35 int &totalMatches,
36 int &recursionCount)
37{
38 static constexpr int recursionLimit = 10;
39 // max number of matches allowed, this should be enough
40 static constexpr int maxMatches = 256;
41
42 // Count recursions
43 ++recursionCount;
44 if (recursionCount >= recursionLimit) {
45 return false;
46 }
47
48 // Detect end of strings
49 if (pattern == patternEnd || str == strEnd) {
50 return false;
51 }
52
53 // Recursion params
54 bool recursiveMatch = false;
55 uint8_t bestRecursiveMatches[maxMatches];
56 int bestRecursiveScore = 0;
57
58 // Loop through pattern and str looking for a match
59 bool firstMatch = true;
60 QChar currentPatternChar = toLower(c: *pattern);
61 while (pattern != patternEnd && str != strEnd) {
62 // Found match
63 if (currentPatternChar == toLower(c: *str)) {
64 // Supplied matches buffer was too short
65 if (nextMatch >= maxMatches) {
66 return false;
67 }
68
69 // "Copy-on-Write" srcMatches into matches
70 if (firstMatch && srcMatches) {
71 memcpy(dest: matches, src: srcMatches, n: nextMatch);
72 firstMatch = false;
73 }
74
75 // Recursive call that "skips" this match
76 uint8_t recursiveMatches[maxMatches];
77 int recursiveScore = 0;
78 const auto strNextChar = std::next(x: str);
79 if (match_recursive(pattern, str: strNextChar, outScore&: recursiveScore, strBegin,
80 strEnd, patternEnd, srcMatches: matches, matches: recursiveMatches,
81 nextMatch, totalMatches, recursionCount)) {
82 // Pick best recursive score
83 if (!recursiveMatch || recursiveScore > bestRecursiveScore) {
84 memcpy(dest: bestRecursiveMatches, src: recursiveMatches, n: maxMatches);
85 bestRecursiveScore = recursiveScore;
86 }
87 recursiveMatch = true;
88 }
89
90 // Advance
91 matches[nextMatch++] = (uint8_t)(std::distance(first: strBegin, last: str));
92 ++pattern;
93 currentPatternChar = toLower(c: *pattern);
94 }
95 ++str;
96 }
97
98 // Determine if full pattern was matched
99 const bool matched = pattern == patternEnd;
100
101 // Calculate score
102 if (matched) {
103 static constexpr int firstSepScoreDiff = 3;
104
105 static constexpr int sequentialBonus = 20;
106 static constexpr int separatorBonus = 25; // bonus if match occurs after a separator
107 static constexpr int firstLetterBonus = 15; // bonus if the first letter is matched
108 static constexpr int firstLetterSepMatchBonus = firstLetterBonus - firstSepScoreDiff; // bonus if the first matched letter is camel or separator
109
110 static constexpr int unmatchedLetterPenalty = -1; // penalty for every letter that doesn't matter
111
112 int nonBeginSequenceBonus = 10;
113 // points by which nonBeginSequenceBonus is increment on every matched letter
114 static constexpr int nonBeginSequenceIncrement = 4;
115
116 // Initialize score
117 outScore = 100;
118
119#define debug_algo 0
120#if debug_algo
121#define dbg(...) qDebug(__VA_ARGS__)
122#else
123#define dbg(...)
124#endif
125
126 // Apply unmatched penalty
127 const int unmatched = (int)(std::distance(first: strBegin, last: strEnd)) - nextMatch;
128 outScore += unmatchedLetterPenalty * unmatched;
129 dbg("unmatchedLetterPenalty, unmatched count: %d, outScore: %d", unmatched, outScore);
130
131 bool inSeparatorSeq = false;
132 int i = 0;
133 if (matches[i] == 0) {
134 // First letter match has the highest score
135 outScore += firstLetterBonus + separatorBonus;
136 dbg("firstLetterBonus, outScore: %d", outScore);
137 inSeparatorSeq = true;
138 } else {
139 const QChar neighbor = *(strBegin + matches[i] - 1);
140 const QChar curr = *(strBegin + matches[i]);
141 const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' ');
142 if (neighborSeparator || (neighbor.isLower() && curr.isUpper())) {
143 // the first letter that got matched was a special char .e., camel or at a separator
144 outScore += firstLetterSepMatchBonus + separatorBonus;
145 dbg("firstLetterSepMatchBonus at %d, letter: %c, outScore: %d", matches[i], curr.toLatin1(), outScore);
146 inSeparatorSeq = true;
147 } else {
148 // nothing
149 nonBeginSequenceBonus += nonBeginSequenceIncrement;
150 }
151 // We didn't match any special positions, apply leading penalty
152 outScore += -(matches[i]);
153 dbg("LeadingPenalty because no first letter match, outScore: %d", outScore);
154 }
155 i++;
156
157 bool allConsecutive = true;
158 // Apply ordering bonuses
159 for (; i < nextMatch; ++i) {
160 const uint8_t currIdx = matches[i];
161 const uint8_t prevIdx = matches[i - 1];
162 // Sequential
163 if (currIdx == (prevIdx + 1)) {
164 if (i == matches[i]) {
165 // We are in a sequence beginning from first letter
166 outScore += sequentialBonus;
167 dbg("sequentialBonus at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore);
168 } else if (inSeparatorSeq) {
169 // we are in a sequence beginning from a separator like camelHump or underscore
170 outScore += sequentialBonus - firstSepScoreDiff;
171 dbg("in separator seq, [sequentialBonus - 5] at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore);
172 } else {
173 // We are in a random sequence
174 outScore += nonBeginSequenceBonus;
175 nonBeginSequenceBonus += nonBeginSequenceIncrement;
176 dbg("nonBeginSequenceBonus at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore);
177 }
178 } else {
179 allConsecutive = false;
180
181 // there is a gap between matching chars, apply penalty
182 int penalty = -((currIdx - prevIdx)) - 2;
183 outScore += penalty;
184 inSeparatorSeq = false;
185 nonBeginSequenceBonus = 10;
186 dbg("gap penalty[%d] at %d, letter: %c, outScore: %d", penalty, matches[i], (strBegin + currIdx)->toLatin1(), outScore);
187 }
188
189 // Check for bonuses based on neighbor character value
190 // Camel case
191 const QChar neighbor = *(strBegin + currIdx - 1);
192 const QChar curr = *(strBegin + currIdx);
193 // if camel case bonus, then not snake / separator.
194 // This prevents double bonuses
195 const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' ');
196 if (neighborSeparator || (neighbor.isLower() && curr.isUpper())) {
197 outScore += separatorBonus;
198 dbg("separatorBonus at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore);
199 inSeparatorSeq = true;
200 continue;
201 }
202 }
203
204 if (allConsecutive && nextMatch >= 4) {
205 outScore *= 2;
206 dbg("allConsecutive double the score, outScore: %d", outScore);
207 }
208 }
209
210 totalMatches = nextMatch;
211
212 // Return best result
213 if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) {
214 // Recursive score is better than "this"
215 memcpy(dest: matches, src: bestRecursiveMatches, n: maxMatches);
216 outScore = bestRecursiveScore;
217 return true;
218 } else if (matched) {
219 // "this" score is better than recursive
220 return true;
221 } else {
222 // no match
223 return false;
224 }
225}
226// clang-format on
227
228static bool match_internal(QStringView pattern, QStringView str, int &outScore, unsigned char *matches)
229{
230 if (pattern.isEmpty()) {
231 return true;
232 }
233
234 int recursionCount = 0;
235
236 auto strIt = str.cbegin();
237 auto patternIt = pattern.cbegin();
238 const auto patternEnd = pattern.cend();
239 const auto strEnd = str.cend();
240
241 int total = 0;
242 return match_recursive(pattern: patternIt, str: strIt, outScore, strBegin: strIt, strEnd, patternEnd, srcMatches: nullptr, matches, nextMatch: 0, totalMatches&: total, recursionCount);
243}
244
245/**************************************************************/
246
247bool KFuzzyMatcher::matchSimple(QStringView pattern, QStringView str)
248{
249 auto patternIt = pattern.cbegin();
250 /*
251 * Instead of doing
252 *
253 * strIt.toLower() == patternIt.toLower()
254 *
255 * we convert patternIt to Upper / Lower as needed and compare with strIt. This
256 * saves us from calling toLower() on both strings, making things a little bit faster
257 */
258 bool lower = patternIt->isLower();
259 QChar cUp = lower ? patternIt->toUpper() : *patternIt;
260 QChar cLow = lower ? *patternIt : patternIt->toLower();
261 for (auto strIt = str.cbegin(); strIt != str.cend() && patternIt != pattern.cend(); ++strIt) {
262 if (*strIt == cLow || *strIt == cUp) {
263 ++patternIt;
264 lower = patternIt->isLower();
265 cUp = lower ? patternIt->toUpper() : *patternIt;
266 cLow = lower ? *patternIt : patternIt->toLower();
267 }
268 }
269
270 return patternIt == pattern.cend();
271}
272
273KFuzzyMatcher::Result KFuzzyMatcher::match(QStringView pattern, QStringView str)
274{
275 /*
276 * Simple substring matching to flush out non-matching strings
277 */
278 const bool simpleMatch = matchSimple(pattern, str);
279
280 KFuzzyMatcher::Result result;
281 result.matched = false;
282 result.score = 0;
283
284 if (!simpleMatch) {
285 return result;
286 }
287
288 // actual algorithm
289 int score = 0;
290 uint8_t matches[256];
291 const bool matched = match_internal(pattern, str, outScore&: score, matches);
292 result.matched = matched;
293 result.score = score;
294 return result;
295}
296
297QList<KFuzzyMatcher::Range> KFuzzyMatcher::matchedRanges(QStringView pattern, QStringView str, RangeType type)
298{
299 QList<KFuzzyMatcher::Range> ranges;
300 if (pattern.isEmpty()) {
301 return ranges;
302 }
303
304 int totalMatches = 0;
305 int score = 0;
306 int recursionCount = 0;
307
308 auto strIt = str.cbegin();
309 auto patternIt = pattern.cbegin();
310 const auto patternEnd = pattern.cend();
311 const auto strEnd = str.cend();
312
313 uint8_t matches[256];
314 auto res = match_recursive(pattern: patternIt, str: strIt, outScore&: score, strBegin: strIt, strEnd, patternEnd, srcMatches: nullptr, matches, nextMatch: 0, totalMatches, recursionCount);
315 // didn't match? => We don't care about results
316 if (!res && type == RangeType::FullyMatched) {
317 return {};
318 }
319
320 int previousMatch = 0;
321 for (int i = 0; i < totalMatches; ++i) {
322 auto matchPos = matches[i];
323 /*
324 * Check if this match is part of the previous
325 * match. If it is, we increase the length of
326 * the last range.
327 */
328 if (!ranges.isEmpty() && matchPos == previousMatch + 1) {
329 ranges.last().length++;
330 } else {
331 /*
332 * This is a new match inside the string
333 */
334 ranges.push_back(t: {/* start: */ .start: matchPos, /* length: */ .length: 1});
335 }
336 previousMatch = matchPos;
337 }
338
339 return ranges;
340}
341

source code of kcoreaddons/src/lib/text/kfuzzymatcher.cpp