1 | //===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'), |
10 | // we consider the possible partial match states: |
11 | // |
12 | // u n i q u e _ p t r |
13 | // +--------------------- |
14 | // |A . . . . . . . . . . |
15 | // u| |
16 | // |. . . . . . . . . . . |
17 | // _| |
18 | // |. . . . . . . O . . . |
19 | // p| |
20 | // |. . . . . . . . . . B |
21 | // |
22 | // Each dot represents some prefix of the pattern being matched against some |
23 | // prefix of the word. |
24 | // - A is the initial state: '' matched against '' |
25 | // - O is an intermediate state: 'u_' matched against 'unique_' |
26 | // - B is the target state: 'u_p' matched against 'unique_ptr' |
27 | // |
28 | // We aim to find the best path from A->B. |
29 | // - Moving right (consuming a word character) |
30 | // Always legal: not all word characters must match. |
31 | // - Moving diagonally (consuming both a word and pattern character) |
32 | // Legal if the characters match. |
33 | // - Moving down (consuming a pattern character) is never legal. |
34 | // Never legal: all pattern characters must match something. |
35 | // Characters are matched case-insensitively. |
36 | // The first pattern character may only match the start of a word segment. |
37 | // |
38 | // The scoring is based on heuristics: |
39 | // - when matching a character, apply a bonus or penalty depending on the |
40 | // match quality (does case match, do word segments align, etc) |
41 | // - when skipping a character, apply a penalty if it hurts the match |
42 | // (it starts a word segment, or splits the matched region, etc) |
43 | // |
44 | // These heuristics require the ability to "look backward" one character, to |
45 | // see whether it was matched or not. Therefore the dynamic-programming matrix |
46 | // has an extra dimension (last character matched). |
47 | // Each entry also has an additional flag indicating whether the last-but-one |
48 | // character matched, which is needed to trace back through the scoring table |
49 | // and reconstruct the match. |
50 | // |
51 | // We treat strings as byte-sequences, so only ASCII has first-class support. |
52 | // |
53 | // This algorithm was inspired by VS code's client-side filtering, and aims |
54 | // to be mostly-compatible. |
55 | // |
56 | //===----------------------------------------------------------------------===// |
57 | |
58 | #include "FuzzyMatch.h" |
59 | #include "llvm/Support/Format.h" |
60 | #include <optional> |
61 | |
62 | namespace clang { |
63 | namespace clangd { |
64 | |
65 | constexpr int FuzzyMatcher::MaxPat; |
66 | constexpr int FuzzyMatcher::MaxWord; |
67 | |
68 | static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; } |
69 | // A "negative infinity" score that won't overflow. |
70 | // We use this to mark unreachable states and forbidden solutions. |
71 | // Score field is 15 bits wide, min value is -2^14, we use half of that. |
72 | static constexpr int AwfulScore = -(1 << 13); |
73 | static bool isAwful(int S) { return S < AwfulScore / 2; } |
74 | static constexpr int PerfectBonus = 4; // Perfect per-pattern-char score. |
75 | |
76 | FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern) |
77 | : PatN(std::min<int>(a: MaxPat, b: Pattern.size())), |
78 | ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) { |
79 | std::copy(first: Pattern.begin(), last: Pattern.begin() + PatN, result: Pat); |
80 | for (int I = 0; I < PatN; ++I) |
81 | LowPat[I] = lower(C: Pat[I]); |
82 | Scores[0][0][Miss] = {.Score: 0, .Prev: Miss}; |
83 | Scores[0][0][Match] = {.Score: AwfulScore, .Prev: Miss}; |
84 | for (int P = 0; P <= PatN; ++P) |
85 | for (int W = 0; W < P; ++W) |
86 | for (Action A : {Miss, Match}) |
87 | Scores[P][W][A] = {.Score: AwfulScore, .Prev: Miss}; |
88 | PatTypeSet = calculateRoles(Text: llvm::StringRef(Pat, PatN), |
89 | Roles: llvm::MutableArrayRef(PatRole, PatN)); |
90 | } |
91 | |
92 | std::optional<float> FuzzyMatcher::match(llvm::StringRef Word) { |
93 | if (!(WordContainsPattern = init(Word))) |
94 | return std::nullopt; |
95 | if (!PatN) |
96 | return 1; |
97 | buildGraph(); |
98 | auto Best = std::max(a: Scores[PatN][WordN][Miss].Score, |
99 | b: Scores[PatN][WordN][Match].Score); |
100 | if (isAwful(S: Best)) |
101 | return std::nullopt; |
102 | float Score = |
103 | ScoreScale * std::min(a: PerfectBonus * PatN, b: std::max<int>(a: 0, b: Best)); |
104 | // If the pattern is as long as the word, we have an exact string match, |
105 | // since every pattern character must match something. |
106 | if (WordN == PatN) |
107 | Score *= 2; // May not be perfect 2 if case differs in a significant way. |
108 | return Score; |
109 | } |
110 | |
111 | // We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte. |
112 | // The top 6 bits of the char select the byte, the bottom 2 select the offset. |
113 | // e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower. |
114 | constexpr static uint8_t CharTypes[] = { |
115 | 0x00, 0x00, 0x00, 0x00, // Control characters |
116 | 0x00, 0x00, 0x00, 0x00, // Control characters |
117 | 0xff, 0xff, 0xff, 0xff, // Punctuation |
118 | 0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation. |
119 | 0xab, 0xaa, 0xaa, 0xaa, // @ and A-O |
120 | 0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation. |
121 | 0x57, 0x55, 0x55, 0x55, // ` and a-o |
122 | 0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL. |
123 | 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower. |
124 | 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8). |
125 | 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, |
126 | 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, |
127 | }; |
128 | |
129 | // The Role can be determined from the Type of a character and its neighbors: |
130 | // |
131 | // Example | Chars | Type | Role |
132 | // ---------+--------------+----- |
133 | // F(o)oBar | Foo | Ull | Tail |
134 | // Foo(B)ar | oBa | lUl | Head |
135 | // (f)oo | ^fo | Ell | Head |
136 | // H(T)TP | HTT | UUU | Tail |
137 | // |
138 | // Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role. |
139 | // A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset. |
140 | // e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head. |
141 | constexpr static uint8_t CharRoles[] = { |
142 | // clang-format off |
143 | // Curr= Empty Lower Upper Separ |
144 | /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head |
145 | /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail |
146 | /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail |
147 | /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start |
148 | // clang-format on |
149 | }; |
150 | |
151 | template <typename T> static T packedLookup(const uint8_t *Data, int I) { |
152 | return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3); |
153 | } |
154 | CharTypeSet calculateRoles(llvm::StringRef Text, |
155 | llvm::MutableArrayRef<CharRole> Roles) { |
156 | assert(Text.size() == Roles.size()); |
157 | if (Text.size() == 0) |
158 | return 0; |
159 | CharType Type = packedLookup<CharType>(Data: CharTypes, I: Text[0]); |
160 | CharTypeSet TypeSet = 1 << Type; |
161 | // Types holds a sliding window of (Prev, Curr, Next) types. |
162 | // Initial value is (Empty, Empty, type of Text[0]). |
163 | int Types = Type; |
164 | // Rotate slides in the type of the next character. |
165 | auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; }; |
166 | for (unsigned I = 0; I < Text.size() - 1; ++I) { |
167 | // For each character, rotate in the next, and look up the role. |
168 | Type = packedLookup<CharType>(Data: CharTypes, I: Text[I + 1]); |
169 | TypeSet |= 1 << Type; |
170 | Rotate(Type); |
171 | Roles[I] = packedLookup<CharRole>(Data: CharRoles, I: Types); |
172 | } |
173 | // For the last character, the "next character" is Empty. |
174 | Rotate(Empty); |
175 | Roles[Text.size() - 1] = packedLookup<CharRole>(Data: CharRoles, I: Types); |
176 | return TypeSet; |
177 | } |
178 | |
179 | // Sets up the data structures matching Word. |
180 | // Returns false if we can cheaply determine that no match is possible. |
181 | bool FuzzyMatcher::init(llvm::StringRef NewWord) { |
182 | WordN = std::min<int>(a: MaxWord, b: NewWord.size()); |
183 | if (PatN > WordN) |
184 | return false; |
185 | std::copy(first: NewWord.begin(), last: NewWord.begin() + WordN, result: Word); |
186 | if (PatN == 0) |
187 | return true; |
188 | for (int I = 0; I < WordN; ++I) |
189 | LowWord[I] = lower(C: Word[I]); |
190 | |
191 | // Cheap subsequence check. |
192 | for (int W = 0, P = 0; P != PatN; ++W) { |
193 | if (W == WordN) |
194 | return false; |
195 | if (LowWord[W] == LowPat[P]) |
196 | ++P; |
197 | } |
198 | |
199 | // FIXME: some words are hard to tokenize algorithmically. |
200 | // e.g. vsprintf is V S Print F, and should match [pri] but not [int]. |
201 | // We could add a tokenization dictionary for common stdlib names. |
202 | WordTypeSet = calculateRoles(Text: llvm::StringRef(Word, WordN), |
203 | Roles: llvm::MutableArrayRef(WordRole, WordN)); |
204 | return true; |
205 | } |
206 | |
207 | // The forwards pass finds the mappings of Pattern onto Word. |
208 | // Score = best score achieved matching Word[..W] against Pat[..P]. |
209 | // Unlike other tables, indices range from 0 to N *inclusive* |
210 | // Matched = whether we chose to match Word[W] with Pat[P] or not. |
211 | // |
212 | // Points are mostly assigned to matched characters, with 1 being a good score |
213 | // and 3 being a great one. So we treat the score range as [0, 3 * PatN]. |
214 | // This range is not strict: we can apply larger bonuses/penalties, or penalize |
215 | // non-matched characters. |
216 | void FuzzyMatcher::buildGraph() { |
217 | for (int W = 0; W < WordN; ++W) { |
218 | Scores[0][W + 1][Miss] = {.Score: Scores[0][W][Miss].Score - skipPenalty(W, Last: Miss), |
219 | .Prev: Miss}; |
220 | Scores[0][W + 1][Match] = {.Score: AwfulScore, .Prev: Miss}; |
221 | } |
222 | for (int P = 0; P < PatN; ++P) { |
223 | for (int W = P; W < WordN; ++W) { |
224 | auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W]; |
225 | |
226 | auto MatchMissScore = PreMiss[Match].Score; |
227 | auto MissMissScore = PreMiss[Miss].Score; |
228 | if (P < PatN - 1) { // Skipping trailing characters is always free. |
229 | MatchMissScore -= skipPenalty(W, Last: Match); |
230 | MissMissScore -= skipPenalty(W, Last: Miss); |
231 | } |
232 | Score[Miss] = (MatchMissScore > MissMissScore) |
233 | ? ScoreInfo{.Score: MatchMissScore, .Prev: Match} |
234 | : ScoreInfo{.Score: MissMissScore, .Prev: Miss}; |
235 | |
236 | auto &PreMatch = Scores[P][W]; |
237 | auto MatchMatchScore = |
238 | allowMatch(P, W, Last: Match) |
239 | ? PreMatch[Match].Score + matchBonus(P, W, Last: Match) |
240 | : AwfulScore; |
241 | auto MissMatchScore = allowMatch(P, W, Last: Miss) |
242 | ? PreMatch[Miss].Score + matchBonus(P, W, Last: Miss) |
243 | : AwfulScore; |
244 | Score[Match] = (MatchMatchScore > MissMatchScore) |
245 | ? ScoreInfo{.Score: MatchMatchScore, .Prev: Match} |
246 | : ScoreInfo{.Score: MissMatchScore, .Prev: Miss}; |
247 | } |
248 | } |
249 | } |
250 | |
251 | bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const { |
252 | if (LowPat[P] != LowWord[W]) |
253 | return false; |
254 | // We require a "strong" match: |
255 | // - for the first pattern character. [foo] !~ "barefoot" |
256 | // - after a gap. [pat] !~ "patnther" |
257 | if (Last == Miss) { |
258 | // We're banning matches outright, so conservatively accept some other cases |
259 | // where our segmentation might be wrong: |
260 | // - allow matching B in ABCDef (but not in NDEBUG) |
261 | // - we'd like to accept print in sprintf, but too many false positives |
262 | if (WordRole[W] == Tail && |
263 | (Word[W] == LowWord[W] || !(WordTypeSet & 1 << Lower))) |
264 | return false; |
265 | } |
266 | return true; |
267 | } |
268 | |
269 | int FuzzyMatcher::skipPenalty(int W, Action Last) const { |
270 | if (W == 0) // Skipping the first character. |
271 | return 3; |
272 | if (WordRole[W] == Head) // Skipping a segment. |
273 | return 1; // We want to keep this lower than a consecutive match bonus. |
274 | // Instead of penalizing non-consecutive matches, we give a bonus to a |
275 | // consecutive match in matchBonus. This produces a better score distribution |
276 | // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'. |
277 | return 0; |
278 | } |
279 | |
280 | int FuzzyMatcher::matchBonus(int P, int W, Action Last) const { |
281 | assert(LowPat[P] == LowWord[W]); |
282 | int S = 1; |
283 | bool IsPatSingleCase = |
284 | (PatTypeSet == 1 << Lower) || (PatTypeSet == 1 << Upper); |
285 | // Bonus: case matches, or a Head in the pattern aligns with one in the word. |
286 | // Single-case patterns lack segmentation signals and we assume any character |
287 | // can be a head of a segment. |
288 | if (Pat[P] == Word[W] || |
289 | (WordRole[W] == Head && (IsPatSingleCase || PatRole[P] == Head))) |
290 | ++S; |
291 | // Bonus: a consecutive match. First character match also gets a bonus to |
292 | // ensure prefix final match score normalizes to 1.0. |
293 | if (W == 0 || Last == Match) |
294 | S += 2; |
295 | // Penalty: matching inside a segment (and previous char wasn't matched). |
296 | if (WordRole[W] == Tail && P && Last == Miss) |
297 | S -= 3; |
298 | // Penalty: a Head in the pattern matches in the middle of a word segment. |
299 | if (PatRole[P] == Head && WordRole[W] == Tail) |
300 | --S; |
301 | // Penalty: matching the first pattern character in the middle of a segment. |
302 | if (P == 0 && WordRole[W] == Tail) |
303 | S -= 4; |
304 | assert(S <= PerfectBonus); |
305 | return S; |
306 | } |
307 | |
308 | llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const { |
309 | llvm::SmallString<256> Result; |
310 | OS << "=== Match \"" << llvm::StringRef(Word, WordN) << "\" against [" |
311 | << llvm::StringRef(Pat, PatN) << "] ===\n" ; |
312 | if (PatN == 0) { |
313 | OS << "Pattern is empty: perfect match.\n" ; |
314 | return Result = llvm::StringRef(Word, WordN); |
315 | } |
316 | if (WordN == 0) { |
317 | OS << "Word is empty: no match.\n" ; |
318 | return Result; |
319 | } |
320 | if (!WordContainsPattern) { |
321 | OS << "Substring check failed.\n" ; |
322 | return Result; |
323 | } |
324 | if (isAwful(S: std::max(a: Scores[PatN][WordN][Match].Score, |
325 | b: Scores[PatN][WordN][Miss].Score))) { |
326 | OS << "Substring check passed, but all matches are forbidden\n" ; |
327 | } |
328 | if (!(PatTypeSet & 1 << Upper)) |
329 | OS << "Lowercase query, so scoring ignores case\n" ; |
330 | |
331 | // Traverse Matched table backwards to reconstruct the Pattern/Word mapping. |
332 | // The Score table has cumulative scores, subtracting along this path gives |
333 | // us the per-letter scores. |
334 | Action Last = |
335 | (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score) |
336 | ? Match |
337 | : Miss; |
338 | int S[MaxWord]; |
339 | Action A[MaxWord]; |
340 | for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) { |
341 | A[W] = Last; |
342 | const auto &Cell = Scores[P + 1][W + 1][Last]; |
343 | if (Last == Match) |
344 | --P; |
345 | const auto &Prev = Scores[P + 1][W][Cell.Prev]; |
346 | S[W] = Cell.Score - Prev.Score; |
347 | Last = Cell.Prev; |
348 | } |
349 | for (int I = 0; I < WordN; ++I) { |
350 | if (A[I] == Match && (I == 0 || A[I - 1] == Miss)) |
351 | Result.push_back(Elt: '['); |
352 | if (A[I] == Miss && I > 0 && A[I - 1] == Match) |
353 | Result.push_back(Elt: ']'); |
354 | Result.push_back(Elt: Word[I]); |
355 | } |
356 | if (A[WordN - 1] == Match) |
357 | Result.push_back(Elt: ']'); |
358 | |
359 | for (char C : llvm::StringRef(Word, WordN)) |
360 | OS << " " << C << " " ; |
361 | OS << "\n" ; |
362 | for (int I = 0, J = 0; I < WordN; I++) |
363 | OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " " ; |
364 | OS << "\n" ; |
365 | for (int I = 0; I < WordN; I++) |
366 | OS << llvm::format(Fmt: "%2d " , Vals: S[I]); |
367 | OS << "\n" ; |
368 | |
369 | OS << "\nSegmentation:" ; |
370 | OS << "\n'" << llvm::StringRef(Word, WordN) << "'\n " ; |
371 | for (int I = 0; I < WordN; ++I) |
372 | OS << "?-+ " [static_cast<int>(WordRole[I])]; |
373 | OS << "\n[" << llvm::StringRef(Pat, PatN) << "]\n " ; |
374 | for (int I = 0; I < PatN; ++I) |
375 | OS << "?-+ " [static_cast<int>(PatRole[I])]; |
376 | OS << "\n" ; |
377 | |
378 | OS << "\nScoring table (last-Miss, last-Match):\n" ; |
379 | OS << " | " ; |
380 | for (char C : llvm::StringRef(Word, WordN)) |
381 | OS << " " << C << " " ; |
382 | OS << "\n" ; |
383 | OS << "-+----" << std::string(WordN * 4, '-') << "\n" ; |
384 | for (int I = 0; I <= PatN; ++I) { |
385 | for (Action A : {Miss, Match}) { |
386 | OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|" ; |
387 | for (int J = 0; J <= WordN; ++J) { |
388 | if (!isAwful(S: Scores[I][J][A].Score)) |
389 | OS << llvm::format(Fmt: "%3d%c" , Vals: Scores[I][J][A].Score, |
390 | Vals: Scores[I][J][A].Prev == Match ? '*' : ' '); |
391 | else |
392 | OS << " " ; |
393 | } |
394 | OS << "\n" ; |
395 | } |
396 | } |
397 | |
398 | return Result; |
399 | } |
400 | |
401 | } // namespace clangd |
402 | } // namespace clang |
403 | |