| 1 | /* |
| 2 | This file is part of the KDE libraries |
| 3 | |
| 4 | SPDX-FileCopyrightText: 2017 Forrest Smith <forrestthewoods@gmail.com> |
| 5 | SPDX-FileCopyrightText: 2021 Waqar Ahmed <waqar.17a@gmail.com> |
| 6 | |
| 7 | SPDX-License-Identifier: LGPL-2.0-or-later |
| 8 | */ |
| 9 | #include "kfuzzymatcher.h" |
| 10 | |
| 11 | #include <QList> |
| 12 | #include <QString> |
| 13 | #include <QStringView> |
| 14 | |
| 15 | /* |
| 16 | * Custom toLower function which is much faster than |
| 17 | * c.toLower() directly |
| 18 | */ |
| 19 | static inline constexpr QChar toLower(QChar c) |
| 20 | { |
| 21 | return c.isLower() ? c : c.toLower(); |
| 22 | } |
| 23 | |
| 24 | // internal |
| 25 | // clang-format off |
| 26 | static bool match_recursive(QStringView::const_iterator pattern, |
| 27 | QStringView::const_iterator str, |
| 28 | int &outScore, |
| 29 | const QStringView::const_iterator strBegin, |
| 30 | const QStringView::const_iterator strEnd, |
| 31 | const QStringView::const_iterator patternEnd, |
| 32 | const uint8_t *srcMatches, |
| 33 | uint8_t *matches, |
| 34 | int nextMatch, |
| 35 | int &totalMatches, |
| 36 | int &recursionCount) |
| 37 | { |
| 38 | static constexpr int recursionLimit = 10; |
| 39 | // max number of matches allowed, this should be enough |
| 40 | static constexpr int maxMatches = 256; |
| 41 | |
| 42 | // Count recursions |
| 43 | ++recursionCount; |
| 44 | if (recursionCount >= recursionLimit) { |
| 45 | return false; |
| 46 | } |
| 47 | |
| 48 | // Detect end of strings |
| 49 | if (pattern == patternEnd || str == strEnd) { |
| 50 | return false; |
| 51 | } |
| 52 | |
| 53 | // Recursion params |
| 54 | bool recursiveMatch = false; |
| 55 | uint8_t bestRecursiveMatches[maxMatches]; |
| 56 | int bestRecursiveScore = 0; |
| 57 | |
| 58 | // Loop through pattern and str looking for a match |
| 59 | bool firstMatch = true; |
| 60 | QChar currentPatternChar = toLower(c: *pattern); |
| 61 | // Are we matching in sequence start from start? |
| 62 | bool matchingInSequence = true; |
| 63 | while (pattern != patternEnd && str != strEnd) { |
| 64 | // Found match |
| 65 | if (currentPatternChar == toLower(c: *str)) { |
| 66 | // Supplied matches buffer was too short |
| 67 | if (nextMatch >= maxMatches) { |
| 68 | return false; |
| 69 | } |
| 70 | |
| 71 | // "Copy-on-Write" srcMatches into matches |
| 72 | if (firstMatch && srcMatches) { |
| 73 | memcpy(dest: matches, src: srcMatches, n: nextMatch); |
| 74 | firstMatch = false; |
| 75 | } |
| 76 | |
| 77 | // Recursive call that "skips" this match |
| 78 | uint8_t recursiveMatches[maxMatches]; |
| 79 | int recursiveScore = 0; |
| 80 | const auto strNextChar = std::next(x: str); |
| 81 | if (!matchingInSequence && match_recursive(pattern, str: strNextChar, outScore&: recursiveScore, strBegin, |
| 82 | strEnd, patternEnd, srcMatches: matches, matches: recursiveMatches, |
| 83 | nextMatch, totalMatches, recursionCount)) { |
| 84 | // Pick best recursive score |
| 85 | if (!recursiveMatch || recursiveScore > bestRecursiveScore) { |
| 86 | memcpy(dest: bestRecursiveMatches, src: recursiveMatches, n: maxMatches); |
| 87 | bestRecursiveScore = recursiveScore; |
| 88 | } |
| 89 | recursiveMatch = true; |
| 90 | } |
| 91 | |
| 92 | // Advance |
| 93 | matches[nextMatch++] = (uint8_t)(std::distance(first: strBegin, last: str)); |
| 94 | ++pattern; |
| 95 | currentPatternChar = toLower(c: *pattern); |
| 96 | } else { |
| 97 | matchingInSequence = false; |
| 98 | } |
| 99 | ++str; |
| 100 | } |
| 101 | |
| 102 | // Determine if full pattern was matched |
| 103 | const bool matched = pattern == patternEnd; |
| 104 | |
| 105 | // Calculate score |
| 106 | if (matched) { |
| 107 | static constexpr int sequentialBonus = 25; |
| 108 | static constexpr int separatorBonus = 25; // bonus if match occurs after a separator |
| 109 | static constexpr int camelBonus = 25; // bonus if match is uppercase and prev is lower |
| 110 | static constexpr int firstLetterBonus = 15; // bonus if the first letter is matched |
| 111 | |
| 112 | static constexpr int leadingLetterPenalty = -5; // penalty applied for every letter in str before the first match |
| 113 | static constexpr int maxLeadingLetterPenalty = -15; // maximum penalty for leading letters |
| 114 | static constexpr int unmatchedLetterPenalty = -1; // penalty for every letter that doesn't matter |
| 115 | |
| 116 | static constexpr int nonBeginSequenceBonus = 10; |
| 117 | |
| 118 | // Initialize score |
| 119 | outScore = 100; |
| 120 | |
| 121 | // Apply leading letter penalty |
| 122 | const int penalty = std::max(a: leadingLetterPenalty * matches[0], b: maxLeadingLetterPenalty); |
| 123 | |
| 124 | outScore += penalty; |
| 125 | |
| 126 | // Apply unmatched penalty |
| 127 | const int unmatched = (int)(std::distance(first: strBegin, last: strEnd)) - nextMatch; |
| 128 | outScore += unmatchedLetterPenalty * unmatched; |
| 129 | |
| 130 | uint8_t seqs[maxMatches] = {0}; |
| 131 | |
| 132 | // Apply ordering bonuses |
| 133 | int j = 0; |
| 134 | for (int i = 0; i < nextMatch; ++i) { |
| 135 | const uint8_t currIdx = matches[i]; |
| 136 | |
| 137 | if (i > 0) { |
| 138 | const uint8_t prevIdx = matches[i - 1]; |
| 139 | |
| 140 | // Sequential |
| 141 | if (currIdx == (prevIdx + 1)) { |
| 142 | if (j > 0 && seqs[j - 1] == i - 1){ |
| 143 | outScore += sequentialBonus; |
| 144 | seqs[j++] = i; |
| 145 | } else { |
| 146 | // In sequence, but from first char |
| 147 | outScore += nonBeginSequenceBonus; |
| 148 | } |
| 149 | } |
| 150 | } |
| 151 | |
| 152 | // Check for bonuses based on neighbor character value |
| 153 | if (currIdx > 0) { |
| 154 | // Camel case |
| 155 | const QChar neighbor = *(strBegin + currIdx - 1); |
| 156 | const QChar curr = *(strBegin + currIdx); |
| 157 | if (neighbor.isLower() && curr.isUpper()) { |
| 158 | outScore += camelBonus; |
| 159 | } |
| 160 | |
| 161 | // Separator |
| 162 | const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' '); |
| 163 | if (neighborSeparator) { |
| 164 | outScore += separatorBonus; |
| 165 | } |
| 166 | } else { |
| 167 | // First letter |
| 168 | outScore += firstLetterBonus; |
| 169 | seqs[j++] = i; |
| 170 | } |
| 171 | } |
| 172 | } |
| 173 | |
| 174 | totalMatches = nextMatch; |
| 175 | |
| 176 | // Return best result |
| 177 | if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) { |
| 178 | // Recursive score is better than "this" |
| 179 | memcpy(dest: matches, src: bestRecursiveMatches, n: maxMatches); |
| 180 | outScore = bestRecursiveScore; |
| 181 | return true; |
| 182 | } else if (matched) { |
| 183 | // "this" score is better than recursive |
| 184 | return true; |
| 185 | } else { |
| 186 | // no match |
| 187 | return false; |
| 188 | } |
| 189 | } |
| 190 | // clang-format on |
| 191 | |
| 192 | static bool match_internal(QStringView pattern, QStringView str, int &outScore, unsigned char *matches) |
| 193 | { |
| 194 | if (pattern.isEmpty()) { |
| 195 | return true; |
| 196 | } |
| 197 | |
| 198 | int recursionCount = 0; |
| 199 | |
| 200 | auto strIt = str.cbegin(); |
| 201 | auto patternIt = pattern.cbegin(); |
| 202 | const auto patternEnd = pattern.cend(); |
| 203 | const auto strEnd = str.cend(); |
| 204 | |
| 205 | int total = 0; |
| 206 | return match_recursive(pattern: patternIt, str: strIt, outScore, strBegin: strIt, strEnd, patternEnd, srcMatches: nullptr, matches, nextMatch: 0, totalMatches&: total, recursionCount); |
| 207 | } |
| 208 | |
| 209 | /**************************************************************/ |
| 210 | |
| 211 | bool KFuzzyMatcher::matchSimple(QStringView pattern, QStringView str) |
| 212 | { |
| 213 | auto patternIt = pattern.cbegin(); |
| 214 | /* |
| 215 | * Instead of doing |
| 216 | * |
| 217 | * strIt.toLower() == patternIt.toLower() |
| 218 | * |
| 219 | * we convert patternIt to Upper / Lower as needed and compare with strIt. This |
| 220 | * saves us from calling toLower() on both strings, making things a little bit faster |
| 221 | */ |
| 222 | bool lower = patternIt->isLower(); |
| 223 | QChar cUp = lower ? patternIt->toUpper() : *patternIt; |
| 224 | QChar cLow = lower ? *patternIt : patternIt->toLower(); |
| 225 | for (auto strIt = str.cbegin(); strIt != str.cend() && patternIt != pattern.cend(); ++strIt) { |
| 226 | if (*strIt == cLow || *strIt == cUp) { |
| 227 | ++patternIt; |
| 228 | lower = patternIt->isLower(); |
| 229 | cUp = lower ? patternIt->toUpper() : *patternIt; |
| 230 | cLow = lower ? *patternIt : patternIt->toLower(); |
| 231 | } |
| 232 | } |
| 233 | |
| 234 | return patternIt == pattern.cend(); |
| 235 | } |
| 236 | |
| 237 | KFuzzyMatcher::Result KFuzzyMatcher::match(QStringView pattern, QStringView str) |
| 238 | { |
| 239 | /* |
| 240 | * Simple substring matching to flush out non-matching strings |
| 241 | */ |
| 242 | const bool simpleMatch = matchSimple(pattern, str); |
| 243 | |
| 244 | KFuzzyMatcher::Result result; |
| 245 | result.matched = false; |
| 246 | result.score = 0; |
| 247 | |
| 248 | if (!simpleMatch) { |
| 249 | return result; |
| 250 | } |
| 251 | |
| 252 | // actual algorithm |
| 253 | int score = 0; |
| 254 | uint8_t matches[256]; |
| 255 | const bool matched = match_internal(pattern, str, outScore&: score, matches); |
| 256 | result.matched = matched; |
| 257 | result.score = score; |
| 258 | return result; |
| 259 | } |
| 260 | |
| 261 | QList<KFuzzyMatcher::Range> KFuzzyMatcher::matchedRanges(QStringView pattern, QStringView str, RangeType type) |
| 262 | { |
| 263 | QList<KFuzzyMatcher::Range> ranges; |
| 264 | if (pattern.isEmpty()) { |
| 265 | return ranges; |
| 266 | } |
| 267 | |
| 268 | int totalMatches = 0; |
| 269 | int score = 0; |
| 270 | int recursionCount = 0; |
| 271 | |
| 272 | auto strIt = str.cbegin(); |
| 273 | auto patternIt = pattern.cbegin(); |
| 274 | const auto patternEnd = pattern.cend(); |
| 275 | const auto strEnd = str.cend(); |
| 276 | |
| 277 | uint8_t matches[256]; |
| 278 | auto res = match_recursive(pattern: patternIt, str: strIt, outScore&: score, strBegin: strIt, strEnd, patternEnd, srcMatches: nullptr, matches, nextMatch: 0, totalMatches, recursionCount); |
| 279 | // didn't match? => We don't care about results |
| 280 | if (!res && type == RangeType::FullyMatched) { |
| 281 | return {}; |
| 282 | } |
| 283 | |
| 284 | int previousMatch = 0; |
| 285 | for (int i = 0; i < totalMatches; ++i) { |
| 286 | auto matchPos = matches[i]; |
| 287 | /* |
| 288 | * Check if this match is part of the previous |
| 289 | * match. If it is, we increase the length of |
| 290 | * the last range. |
| 291 | */ |
| 292 | if (!ranges.isEmpty() && matchPos == previousMatch + 1) { |
| 293 | ranges.last().length++; |
| 294 | } else { |
| 295 | /* |
| 296 | * This is a new match inside the string |
| 297 | */ |
| 298 | ranges.push_back(t: {/* start: */ .start: matchPos, /* length: */ .length: 1}); |
| 299 | } |
| 300 | previousMatch = matchPos; |
| 301 | } |
| 302 | |
| 303 | return ranges; |
| 304 | } |
| 305 | |