| 1 | /* |
| 2 | This file is part of the KDE libraries |
| 3 | |
| 4 | SPDX-FileCopyrightText: 2017 Forrest Smith <forrestthewoods@gmail.com> |
| 5 | SPDX-FileCopyrightText: 2021 Waqar Ahmed <waqar.17a@gmail.com> |
| 6 | |
| 7 | SPDX-License-Identifier: LGPL-2.0-or-later |
| 8 | */ |
| 9 | #include "kfuzzymatcher.h" |
| 10 | |
| 11 | #include <QList> |
| 12 | #include <QString> |
| 13 | #include <QStringView> |
| 14 | |
| 15 | /* |
| 16 | * Custom toLower function which is much faster than |
| 17 | * c.toLower() directly |
| 18 | */ |
| 19 | static inline constexpr QChar toLower(QChar c) |
| 20 | { |
| 21 | return c.isLower() ? c : c.toLower(); |
| 22 | } |
| 23 | |
| 24 | // internal |
| 25 | // clang-format off |
| 26 | static bool match_recursive(QStringView::const_iterator pattern, |
| 27 | QStringView::const_iterator str, |
| 28 | int &outScore, |
| 29 | const QStringView::const_iterator strBegin, |
| 30 | const QStringView::const_iterator strEnd, |
| 31 | const QStringView::const_iterator patternEnd, |
| 32 | const uint8_t *srcMatches, |
| 33 | uint8_t *matches, |
| 34 | int nextMatch, |
| 35 | int &totalMatches, |
| 36 | int &recursionCount) |
| 37 | { |
| 38 | static constexpr int recursionLimit = 10; |
| 39 | // max number of matches allowed, this should be enough |
| 40 | static constexpr int maxMatches = 256; |
| 41 | |
| 42 | // Count recursions |
| 43 | ++recursionCount; |
| 44 | if (recursionCount >= recursionLimit) { |
| 45 | return false; |
| 46 | } |
| 47 | |
| 48 | // Detect end of strings |
| 49 | if (pattern == patternEnd || str == strEnd) { |
| 50 | return false; |
| 51 | } |
| 52 | |
| 53 | // Recursion params |
| 54 | bool recursiveMatch = false; |
| 55 | uint8_t bestRecursiveMatches[maxMatches]; |
| 56 | int bestRecursiveScore = 0; |
| 57 | |
| 58 | // Loop through pattern and str looking for a match |
| 59 | bool firstMatch = true; |
| 60 | QChar currentPatternChar = toLower(c: *pattern); |
| 61 | while (pattern != patternEnd && str != strEnd) { |
| 62 | // Found match |
| 63 | if (currentPatternChar == toLower(c: *str)) { |
| 64 | // Supplied matches buffer was too short |
| 65 | if (nextMatch >= maxMatches) { |
| 66 | return false; |
| 67 | } |
| 68 | |
| 69 | // "Copy-on-Write" srcMatches into matches |
| 70 | if (firstMatch && srcMatches) { |
| 71 | memcpy(dest: matches, src: srcMatches, n: nextMatch); |
| 72 | firstMatch = false; |
| 73 | } |
| 74 | |
| 75 | // Recursive call that "skips" this match |
| 76 | uint8_t recursiveMatches[maxMatches]; |
| 77 | int recursiveScore = 0; |
| 78 | const auto strNextChar = std::next(x: str); |
| 79 | if (match_recursive(pattern, str: strNextChar, outScore&: recursiveScore, strBegin, |
| 80 | strEnd, patternEnd, srcMatches: matches, matches: recursiveMatches, |
| 81 | nextMatch, totalMatches, recursionCount)) { |
| 82 | // Pick best recursive score |
| 83 | if (!recursiveMatch || recursiveScore > bestRecursiveScore) { |
| 84 | memcpy(dest: bestRecursiveMatches, src: recursiveMatches, n: maxMatches); |
| 85 | bestRecursiveScore = recursiveScore; |
| 86 | } |
| 87 | recursiveMatch = true; |
| 88 | } |
| 89 | |
| 90 | // Advance |
| 91 | matches[nextMatch++] = (uint8_t)(std::distance(first: strBegin, last: str)); |
| 92 | ++pattern; |
| 93 | currentPatternChar = toLower(c: *pattern); |
| 94 | } |
| 95 | ++str; |
| 96 | } |
| 97 | |
| 98 | // Determine if full pattern was matched |
| 99 | const bool matched = pattern == patternEnd; |
| 100 | |
| 101 | // Calculate score |
| 102 | if (matched) { |
| 103 | static constexpr int firstSepScoreDiff = 3; |
| 104 | |
| 105 | static constexpr int sequentialBonus = 20; |
| 106 | static constexpr int separatorBonus = 25; // bonus if match occurs after a separator |
| 107 | static constexpr int firstLetterBonus = 15; // bonus if the first letter is matched |
| 108 | static constexpr int firstLetterSepMatchBonus = firstLetterBonus - firstSepScoreDiff; // bonus if the first matched letter is camel or separator |
| 109 | |
| 110 | static constexpr int unmatchedLetterPenalty = -1; // penalty for every letter that doesn't matter |
| 111 | |
| 112 | int nonBeginSequenceBonus = 10; |
| 113 | // points by which nonBeginSequenceBonus is increment on every matched letter |
| 114 | static constexpr int nonBeginSequenceIncrement = 4; |
| 115 | |
| 116 | // Initialize score |
| 117 | outScore = 100; |
| 118 | |
| 119 | #define debug_algo 0 |
| 120 | #if debug_algo |
| 121 | #define dbg(...) qDebug(__VA_ARGS__) |
| 122 | #else |
| 123 | #define dbg(...) |
| 124 | #endif |
| 125 | |
| 126 | // Apply unmatched penalty |
| 127 | const int unmatched = (int)(std::distance(first: strBegin, last: strEnd)) - nextMatch; |
| 128 | outScore += unmatchedLetterPenalty * unmatched; |
| 129 | dbg("unmatchedLetterPenalty, unmatched count: %d, outScore: %d" , unmatched, outScore); |
| 130 | |
| 131 | bool inSeparatorSeq = false; |
| 132 | int i = 0; |
| 133 | if (matches[i] == 0) { |
| 134 | // First letter match has the highest score |
| 135 | outScore += firstLetterBonus + separatorBonus; |
| 136 | dbg("firstLetterBonus, outScore: %d" , outScore); |
| 137 | inSeparatorSeq = true; |
| 138 | } else { |
| 139 | const QChar neighbor = *(strBegin + matches[i] - 1); |
| 140 | const QChar curr = *(strBegin + matches[i]); |
| 141 | const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' '); |
| 142 | if (neighborSeparator || (neighbor.isLower() && curr.isUpper())) { |
| 143 | // the first letter that got matched was a special char .e., camel or at a separator |
| 144 | outScore += firstLetterSepMatchBonus + separatorBonus; |
| 145 | dbg("firstLetterSepMatchBonus at %d, letter: %c, outScore: %d" , matches[i], curr.toLatin1(), outScore); |
| 146 | inSeparatorSeq = true; |
| 147 | } else { |
| 148 | // nothing |
| 149 | nonBeginSequenceBonus += nonBeginSequenceIncrement; |
| 150 | } |
| 151 | // We didn't match any special positions, apply leading penalty |
| 152 | outScore += -(matches[i]); |
| 153 | dbg("LeadingPenalty because no first letter match, outScore: %d" , outScore); |
| 154 | } |
| 155 | i++; |
| 156 | |
| 157 | bool allConsecutive = true; |
| 158 | // Apply ordering bonuses |
| 159 | for (; i < nextMatch; ++i) { |
| 160 | const uint8_t currIdx = matches[i]; |
| 161 | const uint8_t prevIdx = matches[i - 1]; |
| 162 | // Sequential |
| 163 | if (currIdx == (prevIdx + 1)) { |
| 164 | if (i == matches[i]) { |
| 165 | // We are in a sequence beginning from first letter |
| 166 | outScore += sequentialBonus; |
| 167 | dbg("sequentialBonus at %d, letter: %c, outScore: %d" , matches[i], (strBegin + currIdx)->toLatin1(), outScore); |
| 168 | } else if (inSeparatorSeq) { |
| 169 | // we are in a sequence beginning from a separator like camelHump or underscore |
| 170 | outScore += sequentialBonus - firstSepScoreDiff; |
| 171 | dbg("in separator seq, [sequentialBonus - 5] at %d, letter: %c, outScore: %d" , matches[i], (strBegin + currIdx)->toLatin1(), outScore); |
| 172 | } else { |
| 173 | // We are in a random sequence |
| 174 | outScore += nonBeginSequenceBonus; |
| 175 | nonBeginSequenceBonus += nonBeginSequenceIncrement; |
| 176 | dbg("nonBeginSequenceBonus at %d, letter: %c, outScore: %d" , matches[i], (strBegin + currIdx)->toLatin1(), outScore); |
| 177 | } |
| 178 | } else { |
| 179 | allConsecutive = false; |
| 180 | |
| 181 | // there is a gap between matching chars, apply penalty |
| 182 | int penalty = -((currIdx - prevIdx)) - 2; |
| 183 | outScore += penalty; |
| 184 | inSeparatorSeq = false; |
| 185 | nonBeginSequenceBonus = 10; |
| 186 | dbg("gap penalty[%d] at %d, letter: %c, outScore: %d" , penalty, matches[i], (strBegin + currIdx)->toLatin1(), outScore); |
| 187 | } |
| 188 | |
| 189 | // Check for bonuses based on neighbor character value |
| 190 | // Camel case |
| 191 | const QChar neighbor = *(strBegin + currIdx - 1); |
| 192 | const QChar curr = *(strBegin + currIdx); |
| 193 | // if camel case bonus, then not snake / separator. |
| 194 | // This prevents double bonuses |
| 195 | const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' '); |
| 196 | if (neighborSeparator || (neighbor.isLower() && curr.isUpper())) { |
| 197 | outScore += separatorBonus; |
| 198 | dbg("separatorBonus at %d, letter: %c, outScore: %d" , matches[i], (strBegin + currIdx)->toLatin1(), outScore); |
| 199 | inSeparatorSeq = true; |
| 200 | continue; |
| 201 | } |
| 202 | } |
| 203 | |
| 204 | if (allConsecutive && nextMatch >= 4) { |
| 205 | outScore *= 2; |
| 206 | dbg("allConsecutive double the score, outScore: %d" , outScore); |
| 207 | } |
| 208 | } |
| 209 | |
| 210 | totalMatches = nextMatch; |
| 211 | |
| 212 | // Return best result |
| 213 | if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) { |
| 214 | // Recursive score is better than "this" |
| 215 | memcpy(dest: matches, src: bestRecursiveMatches, n: maxMatches); |
| 216 | outScore = bestRecursiveScore; |
| 217 | return true; |
| 218 | } else if (matched) { |
| 219 | // "this" score is better than recursive |
| 220 | return true; |
| 221 | } else { |
| 222 | // no match |
| 223 | return false; |
| 224 | } |
| 225 | } |
| 226 | // clang-format on |
| 227 | |
| 228 | static bool match_internal(QStringView pattern, QStringView str, int &outScore, unsigned char *matches) |
| 229 | { |
| 230 | if (pattern.isEmpty()) { |
| 231 | return true; |
| 232 | } |
| 233 | |
| 234 | int recursionCount = 0; |
| 235 | |
| 236 | auto strIt = str.cbegin(); |
| 237 | auto patternIt = pattern.cbegin(); |
| 238 | const auto patternEnd = pattern.cend(); |
| 239 | const auto strEnd = str.cend(); |
| 240 | |
| 241 | int total = 0; |
| 242 | return match_recursive(pattern: patternIt, str: strIt, outScore, strBegin: strIt, strEnd, patternEnd, srcMatches: nullptr, matches, nextMatch: 0, totalMatches&: total, recursionCount); |
| 243 | } |
| 244 | |
| 245 | /**************************************************************/ |
| 246 | |
| 247 | bool KFuzzyMatcher::matchSimple(QStringView pattern, QStringView str) |
| 248 | { |
| 249 | auto patternIt = pattern.cbegin(); |
| 250 | /* |
| 251 | * Instead of doing |
| 252 | * |
| 253 | * strIt.toLower() == patternIt.toLower() |
| 254 | * |
| 255 | * we convert patternIt to Upper / Lower as needed and compare with strIt. This |
| 256 | * saves us from calling toLower() on both strings, making things a little bit faster |
| 257 | */ |
| 258 | bool lower = patternIt->isLower(); |
| 259 | QChar cUp = lower ? patternIt->toUpper() : *patternIt; |
| 260 | QChar cLow = lower ? *patternIt : patternIt->toLower(); |
| 261 | for (auto strIt = str.cbegin(); strIt != str.cend() && patternIt != pattern.cend(); ++strIt) { |
| 262 | if (*strIt == cLow || *strIt == cUp) { |
| 263 | ++patternIt; |
| 264 | lower = patternIt->isLower(); |
| 265 | cUp = lower ? patternIt->toUpper() : *patternIt; |
| 266 | cLow = lower ? *patternIt : patternIt->toLower(); |
| 267 | } |
| 268 | } |
| 269 | |
| 270 | return patternIt == pattern.cend(); |
| 271 | } |
| 272 | |
| 273 | KFuzzyMatcher::Result KFuzzyMatcher::match(QStringView pattern, QStringView str) |
| 274 | { |
| 275 | /* |
| 276 | * Simple substring matching to flush out non-matching strings |
| 277 | */ |
| 278 | const bool simpleMatch = matchSimple(pattern, str); |
| 279 | |
| 280 | KFuzzyMatcher::Result result; |
| 281 | result.matched = false; |
| 282 | result.score = 0; |
| 283 | |
| 284 | if (!simpleMatch) { |
| 285 | return result; |
| 286 | } |
| 287 | |
| 288 | // actual algorithm |
| 289 | int score = 0; |
| 290 | uint8_t matches[256]; |
| 291 | const bool matched = match_internal(pattern, str, outScore&: score, matches); |
| 292 | result.matched = matched; |
| 293 | result.score = score; |
| 294 | return result; |
| 295 | } |
| 296 | |
| 297 | QList<KFuzzyMatcher::Range> KFuzzyMatcher::matchedRanges(QStringView pattern, QStringView str, RangeType type) |
| 298 | { |
| 299 | QList<KFuzzyMatcher::Range> ranges; |
| 300 | if (pattern.isEmpty()) { |
| 301 | return ranges; |
| 302 | } |
| 303 | |
| 304 | int totalMatches = 0; |
| 305 | int score = 0; |
| 306 | int recursionCount = 0; |
| 307 | |
| 308 | auto strIt = str.cbegin(); |
| 309 | auto patternIt = pattern.cbegin(); |
| 310 | const auto patternEnd = pattern.cend(); |
| 311 | const auto strEnd = str.cend(); |
| 312 | |
| 313 | uint8_t matches[256]; |
| 314 | auto res = match_recursive(pattern: patternIt, str: strIt, outScore&: score, strBegin: strIt, strEnd, patternEnd, srcMatches: nullptr, matches, nextMatch: 0, totalMatches, recursionCount); |
| 315 | // didn't match? => We don't care about results |
| 316 | if (!res && type == RangeType::FullyMatched) { |
| 317 | return {}; |
| 318 | } |
| 319 | |
| 320 | int previousMatch = 0; |
| 321 | for (int i = 0; i < totalMatches; ++i) { |
| 322 | auto matchPos = matches[i]; |
| 323 | /* |
| 324 | * Check if this match is part of the previous |
| 325 | * match. If it is, we increase the length of |
| 326 | * the last range. |
| 327 | */ |
| 328 | if (!ranges.isEmpty() && matchPos == previousMatch + 1) { |
| 329 | ranges.last().length++; |
| 330 | } else { |
| 331 | /* |
| 332 | * This is a new match inside the string |
| 333 | */ |
| 334 | ranges.push_back(t: {/* start: */ .start: matchPos, /* length: */ .length: 1}); |
| 335 | } |
| 336 | previousMatch = matchPos; |
| 337 | } |
| 338 | |
| 339 | return ranges; |
| 340 | } |
| 341 | |