1 | /* |
2 | This file is part of the KDE libraries |
3 | |
4 | SPDX-FileCopyrightText: 2017 Forrest Smith <forrestthewoods@gmail.com> |
5 | SPDX-FileCopyrightText: 2021 Waqar Ahmed <waqar.17a@gmail.com> |
6 | |
7 | SPDX-License-Identifier: LGPL-2.0-or-later |
8 | */ |
9 | #include "kfuzzymatcher.h" |
10 | |
11 | #include <QList> |
12 | #include <QString> |
13 | #include <QStringView> |
14 | |
15 | /** |
16 | * Custom toLower function which is much faster than |
17 | * c.toLower() directly |
18 | */ |
19 | static inline constexpr QChar toLower(QChar c) |
20 | { |
21 | return c.isLower() ? c : c.toLower(); |
22 | } |
23 | |
24 | // internal |
25 | // clang-format off |
26 | static bool match_recursive(QStringView::const_iterator pattern, |
27 | QStringView::const_iterator str, |
28 | int &outScore, |
29 | const QStringView::const_iterator strBegin, |
30 | const QStringView::const_iterator strEnd, |
31 | const QStringView::const_iterator patternEnd, |
32 | const uint8_t *srcMatches, |
33 | uint8_t *matches, |
34 | int nextMatch, |
35 | int &totalMatches, |
36 | int &recursionCount) |
37 | { |
38 | static constexpr int recursionLimit = 10; |
39 | // max number of matches allowed, this should be enough |
40 | static constexpr int maxMatches = 256; |
41 | |
42 | // Count recursions |
43 | ++recursionCount; |
44 | if (recursionCount >= recursionLimit) { |
45 | return false; |
46 | } |
47 | |
48 | // Detect end of strings |
49 | if (pattern == patternEnd || str == strEnd) { |
50 | return false; |
51 | } |
52 | |
53 | // Recursion params |
54 | bool recursiveMatch = false; |
55 | uint8_t bestRecursiveMatches[maxMatches]; |
56 | int bestRecursiveScore = 0; |
57 | |
58 | // Loop through pattern and str looking for a match |
59 | bool firstMatch = true; |
60 | QChar currentPatternChar = toLower(c: *pattern); |
61 | // Are we matching in sequence start from start? |
62 | bool matchingInSequence = true; |
63 | while (pattern != patternEnd && str != strEnd) { |
64 | // Found match |
65 | if (currentPatternChar == toLower(c: *str)) { |
66 | // Supplied matches buffer was too short |
67 | if (nextMatch >= maxMatches) { |
68 | return false; |
69 | } |
70 | |
71 | // "Copy-on-Write" srcMatches into matches |
72 | if (firstMatch && srcMatches) { |
73 | memcpy(dest: matches, src: srcMatches, n: nextMatch); |
74 | firstMatch = false; |
75 | } |
76 | |
77 | // Recursive call that "skips" this match |
78 | uint8_t recursiveMatches[maxMatches]; |
79 | int recursiveScore = 0; |
80 | const auto strNextChar = std::next(x: str); |
81 | if (!matchingInSequence && match_recursive(pattern, str: strNextChar, outScore&: recursiveScore, strBegin, |
82 | strEnd, patternEnd, srcMatches: matches, matches: recursiveMatches, |
83 | nextMatch, totalMatches, recursionCount)) { |
84 | // Pick best recursive score |
85 | if (!recursiveMatch || recursiveScore > bestRecursiveScore) { |
86 | memcpy(dest: bestRecursiveMatches, src: recursiveMatches, n: maxMatches); |
87 | bestRecursiveScore = recursiveScore; |
88 | } |
89 | recursiveMatch = true; |
90 | } |
91 | |
92 | // Advance |
93 | matches[nextMatch++] = (uint8_t)(std::distance(first: strBegin, last: str)); |
94 | ++pattern; |
95 | currentPatternChar = toLower(c: *pattern); |
96 | } else { |
97 | matchingInSequence = false; |
98 | } |
99 | ++str; |
100 | } |
101 | |
102 | // Determine if full pattern was matched |
103 | const bool matched = pattern == patternEnd; |
104 | |
105 | // Calculate score |
106 | if (matched) { |
107 | static constexpr int sequentialBonus = 25; |
108 | static constexpr int separatorBonus = 25; // bonus if match occurs after a separator |
109 | static constexpr int camelBonus = 25; // bonus if match is uppercase and prev is lower |
110 | static constexpr int firstLetterBonus = 15; // bonus if the first letter is matched |
111 | |
112 | static constexpr int leadingLetterPenalty = -5; // penalty applied for every letter in str before the first match |
113 | static constexpr int maxLeadingLetterPenalty = -15; // maximum penalty for leading letters |
114 | static constexpr int unmatchedLetterPenalty = -1; // penalty for every letter that doesn't matter |
115 | |
116 | static constexpr int nonBeginSequenceBonus = 10; |
117 | |
118 | // Initialize score |
119 | outScore = 100; |
120 | |
121 | // Apply leading letter penalty |
122 | const int penalty = std::max(a: leadingLetterPenalty * matches[0], b: maxLeadingLetterPenalty); |
123 | |
124 | outScore += penalty; |
125 | |
126 | // Apply unmatched penalty |
127 | const int unmatched = (int)(std::distance(first: strBegin, last: strEnd)) - nextMatch; |
128 | outScore += unmatchedLetterPenalty * unmatched; |
129 | |
130 | uint8_t seqs[maxMatches] = {0}; |
131 | |
132 | // Apply ordering bonuses |
133 | int j = 0; |
134 | for (int i = 0; i < nextMatch; ++i) { |
135 | const uint8_t currIdx = matches[i]; |
136 | |
137 | if (i > 0) { |
138 | const uint8_t prevIdx = matches[i - 1]; |
139 | |
140 | // Sequential |
141 | if (currIdx == (prevIdx + 1)) { |
142 | if (j > 0 && seqs[j - 1] == i - 1){ |
143 | outScore += sequentialBonus; |
144 | seqs[j++] = i; |
145 | } else { |
146 | // In sequence, but from first char |
147 | outScore += nonBeginSequenceBonus; |
148 | } |
149 | } |
150 | } |
151 | |
152 | // Check for bonuses based on neighbor character value |
153 | if (currIdx > 0) { |
154 | // Camel case |
155 | const QChar neighbor = *(strBegin + currIdx - 1); |
156 | const QChar curr = *(strBegin + currIdx); |
157 | if (neighbor.isLower() && curr.isUpper()) { |
158 | outScore += camelBonus; |
159 | } |
160 | |
161 | // Separator |
162 | const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' '); |
163 | if (neighborSeparator) { |
164 | outScore += separatorBonus; |
165 | } |
166 | } else { |
167 | // First letter |
168 | outScore += firstLetterBonus; |
169 | seqs[j++] = i; |
170 | } |
171 | } |
172 | } |
173 | |
174 | totalMatches = nextMatch; |
175 | |
176 | // Return best result |
177 | if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) { |
178 | // Recursive score is better than "this" |
179 | memcpy(dest: matches, src: bestRecursiveMatches, n: maxMatches); |
180 | outScore = bestRecursiveScore; |
181 | return true; |
182 | } else if (matched) { |
183 | // "this" score is better than recursive |
184 | return true; |
185 | } else { |
186 | // no match |
187 | return false; |
188 | } |
189 | } |
190 | // clang-format on |
191 | |
192 | static bool match_internal(QStringView pattern, QStringView str, int &outScore, unsigned char *matches) |
193 | { |
194 | if (pattern.isEmpty()) { |
195 | return true; |
196 | } |
197 | |
198 | int recursionCount = 0; |
199 | |
200 | auto strIt = str.cbegin(); |
201 | auto patternIt = pattern.cbegin(); |
202 | const auto patternEnd = pattern.cend(); |
203 | const auto strEnd = str.cend(); |
204 | |
205 | int total = 0; |
206 | return match_recursive(pattern: patternIt, str: strIt, outScore, strBegin: strIt, strEnd, patternEnd, srcMatches: nullptr, matches, nextMatch: 0, totalMatches&: total, recursionCount); |
207 | } |
208 | |
209 | /**************************************************************/ |
210 | |
211 | bool KFuzzyMatcher::matchSimple(QStringView pattern, QStringView str) |
212 | { |
213 | auto patternIt = pattern.cbegin(); |
214 | /** |
215 | * Instead of doing |
216 | * |
217 | * strIt.toLower() == patternIt.toLower() |
218 | * |
219 | * we convert patternIt to Upper / Lower as needed and compare with strIt. This |
220 | * saves us from calling toLower() on both strings, making things a little bit faster |
221 | */ |
222 | bool lower = patternIt->isLower(); |
223 | QChar cUp = lower ? patternIt->toUpper() : *patternIt; |
224 | QChar cLow = lower ? *patternIt : patternIt->toLower(); |
225 | for (auto strIt = str.cbegin(); strIt != str.cend() && patternIt != pattern.cend(); ++strIt) { |
226 | if (*strIt == cLow || *strIt == cUp) { |
227 | ++patternIt; |
228 | lower = patternIt->isLower(); |
229 | cUp = lower ? patternIt->toUpper() : *patternIt; |
230 | cLow = lower ? *patternIt : patternIt->toLower(); |
231 | } |
232 | } |
233 | |
234 | return patternIt == pattern.cend(); |
235 | } |
236 | |
237 | KFuzzyMatcher::Result KFuzzyMatcher::match(QStringView pattern, QStringView str) |
238 | { |
239 | /** |
240 | * Simple substring matching to flush out non-matching strings |
241 | */ |
242 | const bool simpleMatch = matchSimple(pattern, str); |
243 | |
244 | KFuzzyMatcher::Result result; |
245 | result.matched = false; |
246 | result.score = 0; |
247 | |
248 | if (!simpleMatch) { |
249 | return result; |
250 | } |
251 | |
252 | // actual algorithm |
253 | int score = 0; |
254 | uint8_t matches[256]; |
255 | const bool matched = match_internal(pattern, str, outScore&: score, matches); |
256 | result.matched = matched; |
257 | result.score = score; |
258 | return result; |
259 | } |
260 | |
261 | QList<KFuzzyMatcher::Range> KFuzzyMatcher::matchedRanges(QStringView pattern, QStringView str, RangeType type) |
262 | { |
263 | QList<KFuzzyMatcher::Range> ranges; |
264 | if (pattern.isEmpty()) { |
265 | return ranges; |
266 | } |
267 | |
268 | int totalMatches = 0; |
269 | int score = 0; |
270 | int recursionCount = 0; |
271 | |
272 | auto strIt = str.cbegin(); |
273 | auto patternIt = pattern.cbegin(); |
274 | const auto patternEnd = pattern.cend(); |
275 | const auto strEnd = str.cend(); |
276 | |
277 | uint8_t matches[256]; |
278 | auto res = match_recursive(pattern: patternIt, str: strIt, outScore&: score, strBegin: strIt, strEnd, patternEnd, srcMatches: nullptr, matches, nextMatch: 0, totalMatches, recursionCount); |
279 | // didn't match? => We don't care about results |
280 | if (!res && type == RangeType::FullyMatched) { |
281 | return {}; |
282 | } |
283 | |
284 | int previousMatch = 0; |
285 | for (int i = 0; i < totalMatches; ++i) { |
286 | auto matchPos = matches[i]; |
287 | /** |
288 | * Check if this match is part of the previous |
289 | * match. If it is, we increase the length of |
290 | * the last range. |
291 | */ |
292 | if (!ranges.isEmpty() && matchPos == previousMatch + 1) { |
293 | ranges.last().length++; |
294 | } else { |
295 | /** |
296 | * This is a new match inside the string |
297 | */ |
298 | ranges.push_back(t: {/* start: */ .start: matchPos, /* length: */ .length: 1}); |
299 | } |
300 | previousMatch = matchPos; |
301 | } |
302 | |
303 | return ranges; |
304 | } |
305 | |