//===--- FuzzyMatch.cpp - Approximate identifier matching -------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
// we consider the possible partial match states:
//
//         u n i q u e _ p t r
//       +---------------------
//       |A . . . . . . . . . .
//      u|
//       |. . . . . . . . . . .
//      _|
//       |. . . . . . . O . . .
//      p|
//       |. . . . . . . . . . B
//
// Each dot represents some prefix of the pattern being matched against some
// prefix of the word.
//   - A is the initial state: '' matched against ''
//   - O is an intermediate state: 'u_' matched against 'unique_'
//   - B is the target state: 'u_p' matched against 'unique_ptr'
//
// We aim to find the best path from A->B.
//  - Moving right (consuming a word character)
//    Always legal: not all word characters must match.
//  - Moving diagonally (consuming both a word and pattern character)
//    Legal if the characters match.
//  - Moving down (consuming a pattern character)
//    Never legal: all pattern characters must match something.
// Characters are matched case-insensitively.
// The first pattern character may only match the start of a word segment.
//
// The scoring is based on heuristics:
//  - when matching a character, apply a bonus or penalty depending on the
//    match quality (does case match, do word segments align, etc)
//  - when skipping a character, apply a penalty if it hurts the match
//    (it starts a word segment, or splits the matched region, etc)
//
// These heuristics require the ability to "look backward" one character, to
// see whether it was matched or not. Therefore the dynamic-programming matrix
// has an extra dimension (last character matched).
// Each entry also has an additional flag indicating whether the last-but-one
// character matched, which is needed to trace back through the scoring table
// and reconstruct the match.
//
// We treat strings as byte-sequences, so only ASCII has first-class support.
//
// This algorithm was inspired by VS code's client-side filtering, and aims
// to be mostly-compatible.
//
//===----------------------------------------------------------------------===//
57
58#include "FuzzyMatch.h"
59#include "llvm/Support/Format.h"
60#include <optional>
61
62namespace clang {
63namespace clangd {
64
65constexpr int FuzzyMatcher::MaxPat;
66constexpr int FuzzyMatcher::MaxWord;
67
// ASCII-only lowercasing: 'A'..'Z' map to 'a'..'z', every other byte is
// returned unchanged.
static char lower(char C) {
  if (C < 'A' || C > 'Z')
    return C;
  return C + ('a' - 'A');
}
// Stand-in for "negative infinity": marks unreachable states and forbidden
// solutions without risking overflow when bonuses/penalties are added to it.
// Score field is 15 bits wide, min value is -2^14, we use half of that.
static constexpr int AwfulScore = -(1 << 13);

// True for any score that can only have been derived from AwfulScore.
// The threshold sits halfway between 0 and AwfulScore.
static bool isAwful(int S) {
  constexpr int Threshold = AwfulScore / 2;
  return S < Threshold;
}

// Perfect per-pattern-char score.
static constexpr int PerfectBonus = 4;
75
76FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern)
77 : PatN(std::min<int>(a: MaxPat, b: Pattern.size())),
78 ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
79 std::copy(first: Pattern.begin(), last: Pattern.begin() + PatN, result: Pat);
80 for (int I = 0; I < PatN; ++I)
81 LowPat[I] = lower(C: Pat[I]);
82 Scores[0][0][Miss] = {.Score: 0, .Prev: Miss};
83 Scores[0][0][Match] = {.Score: AwfulScore, .Prev: Miss};
84 for (int P = 0; P <= PatN; ++P)
85 for (int W = 0; W < P; ++W)
86 for (Action A : {Miss, Match})
87 Scores[P][W][A] = {.Score: AwfulScore, .Prev: Miss};
88 PatTypeSet = calculateRoles(Text: llvm::StringRef(Pat, PatN),
89 Roles: llvm::MutableArrayRef(PatRole, PatN));
90}
91
92std::optional<float> FuzzyMatcher::match(llvm::StringRef Word) {
93 if (!(WordContainsPattern = init(Word)))
94 return std::nullopt;
95 if (!PatN)
96 return 1;
97 buildGraph();
98 auto Best = std::max(a: Scores[PatN][WordN][Miss].Score,
99 b: Scores[PatN][WordN][Match].Score);
100 if (isAwful(S: Best))
101 return std::nullopt;
102 float Score =
103 ScoreScale * std::min(a: PerfectBonus * PatN, b: std::max<int>(a: 0, b: Best));
104 // If the pattern is as long as the word, we have an exact string match,
105 // since every pattern character must match something.
106 if (WordN == PatN)
107 Score *= 2; // May not be perfect 2 if case differs in a significant way.
108 return Score;
109}
110
// We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
// The top 6 bits of the char select the byte, the bottom 2 select the offset.
// e.g. 'q' = 011100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
// The index must be the unsigned byte value (0-255) of the character.
constexpr static uint8_t CharTypes[] = {
    0x00, 0x00, 0x00, 0x00, // Control characters
    0x00, 0x00, 0x00, 0x00, // Control characters
    0xff, 0xff, 0xff, 0xff, // Punctuation
    0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
    0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
    0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
    0x57, 0x55, 0x55, 0x55, // ` and a-o
    0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
};
// Four entries per table byte, and every byte value needs an entry.
static_assert(sizeof(CharTypes) == 256 / 4,
              "CharTypes must classify all 256 byte values");
128
// A character's Role follows from its own Type and the Types of its neighbors:
//
//   Example  | Chars | Type | Role
//   ---------+-------+------+-----
//   F(o)oBar | Foo   | Ull  | Tail
//   Foo(B)ar | oBa   | lUl  | Head
//   (f)oo    | ^fo   | Ell  | Head
//   H(T)TP   | HTT   | UUU  | Tail
//
// The table maps a 6-bit key (Prev, Curr, Next) to a 2-bit Role, with four
// Roles packed per byte: (Prev, Curr) selects the byte, Next selects the slot.
// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
// Within each row below, the bytes are for Curr = Empty, Lower, Upper, Separ.
constexpr static uint8_t CharRoles[] = {
    // clang-format off
    // Prev=Empty: at start, Lower|Upper->Head.
    0x00, 0xaa, 0xaa, 0xff,
    // Prev=Lower: in word, Upper->Head; Lower->Tail.
    0x00, 0x55, 0xaa, 0xff,
    // Prev=Upper: ditto, but U(U)U->Tail.
    0x00, 0x55, 0x59, 0xff,
    // Prev=Separ: after separator, like at start.
    0x00, 0xaa, 0xaa, 0xff,
    // clang-format on
};
150
// Reads entry I from a table of 2-bit values packed four per byte, lowest
// bits first, and converts it to T. I must be a valid non-negative index.
template <typename T> static T packedLookup(const uint8_t *Data, int I) {
  const int ByteIndex = I >> 2;
  const int BitOffset = (I & 3) * 2;
  return static_cast<T>((Data[ByteIndex] >> BitOffset) & 3);
}
154CharTypeSet calculateRoles(llvm::StringRef Text,
155 llvm::MutableArrayRef<CharRole> Roles) {
156 assert(Text.size() == Roles.size());
157 if (Text.size() == 0)
158 return 0;
159 CharType Type = packedLookup<CharType>(Data: CharTypes, I: Text[0]);
160 CharTypeSet TypeSet = 1 << Type;
161 // Types holds a sliding window of (Prev, Curr, Next) types.
162 // Initial value is (Empty, Empty, type of Text[0]).
163 int Types = Type;
164 // Rotate slides in the type of the next character.
165 auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
166 for (unsigned I = 0; I < Text.size() - 1; ++I) {
167 // For each character, rotate in the next, and look up the role.
168 Type = packedLookup<CharType>(Data: CharTypes, I: Text[I + 1]);
169 TypeSet |= 1 << Type;
170 Rotate(Type);
171 Roles[I] = packedLookup<CharRole>(Data: CharRoles, I: Types);
172 }
173 // For the last character, the "next character" is Empty.
174 Rotate(Empty);
175 Roles[Text.size() - 1] = packedLookup<CharRole>(Data: CharRoles, I: Types);
176 return TypeSet;
177}
178
179// Sets up the data structures matching Word.
180// Returns false if we can cheaply determine that no match is possible.
181bool FuzzyMatcher::init(llvm::StringRef NewWord) {
182 WordN = std::min<int>(a: MaxWord, b: NewWord.size());
183 if (PatN > WordN)
184 return false;
185 std::copy(first: NewWord.begin(), last: NewWord.begin() + WordN, result: Word);
186 if (PatN == 0)
187 return true;
188 for (int I = 0; I < WordN; ++I)
189 LowWord[I] = lower(C: Word[I]);
190
191 // Cheap subsequence check.
192 for (int W = 0, P = 0; P != PatN; ++W) {
193 if (W == WordN)
194 return false;
195 if (LowWord[W] == LowPat[P])
196 ++P;
197 }
198
199 // FIXME: some words are hard to tokenize algorithmically.
200 // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
201 // We could add a tokenization dictionary for common stdlib names.
202 WordTypeSet = calculateRoles(Text: llvm::StringRef(Word, WordN),
203 Roles: llvm::MutableArrayRef(WordRole, WordN));
204 return true;
205}
206
207// The forwards pass finds the mappings of Pattern onto Word.
208// Score = best score achieved matching Word[..W] against Pat[..P].
209// Unlike other tables, indices range from 0 to N *inclusive*
210// Matched = whether we chose to match Word[W] with Pat[P] or not.
211//
212// Points are mostly assigned to matched characters, with 1 being a good score
213// and 3 being a great one. So we treat the score range as [0, 3 * PatN].
214// This range is not strict: we can apply larger bonuses/penalties, or penalize
215// non-matched characters.
216void FuzzyMatcher::buildGraph() {
217 for (int W = 0; W < WordN; ++W) {
218 Scores[0][W + 1][Miss] = {.Score: Scores[0][W][Miss].Score - skipPenalty(W, Last: Miss),
219 .Prev: Miss};
220 Scores[0][W + 1][Match] = {.Score: AwfulScore, .Prev: Miss};
221 }
222 for (int P = 0; P < PatN; ++P) {
223 for (int W = P; W < WordN; ++W) {
224 auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
225
226 auto MatchMissScore = PreMiss[Match].Score;
227 auto MissMissScore = PreMiss[Miss].Score;
228 if (P < PatN - 1) { // Skipping trailing characters is always free.
229 MatchMissScore -= skipPenalty(W, Last: Match);
230 MissMissScore -= skipPenalty(W, Last: Miss);
231 }
232 Score[Miss] = (MatchMissScore > MissMissScore)
233 ? ScoreInfo{.Score: MatchMissScore, .Prev: Match}
234 : ScoreInfo{.Score: MissMissScore, .Prev: Miss};
235
236 auto &PreMatch = Scores[P][W];
237 auto MatchMatchScore =
238 allowMatch(P, W, Last: Match)
239 ? PreMatch[Match].Score + matchBonus(P, W, Last: Match)
240 : AwfulScore;
241 auto MissMatchScore = allowMatch(P, W, Last: Miss)
242 ? PreMatch[Miss].Score + matchBonus(P, W, Last: Miss)
243 : AwfulScore;
244 Score[Match] = (MatchMatchScore > MissMatchScore)
245 ? ScoreInfo{.Score: MatchMatchScore, .Prev: Match}
246 : ScoreInfo{.Score: MissMatchScore, .Prev: Miss};
247 }
248 }
249}
250
251bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
252 if (LowPat[P] != LowWord[W])
253 return false;
254 // We require a "strong" match:
255 // - for the first pattern character. [foo] !~ "barefoot"
256 // - after a gap. [pat] !~ "patnther"
257 if (Last == Miss) {
258 // We're banning matches outright, so conservatively accept some other cases
259 // where our segmentation might be wrong:
260 // - allow matching B in ABCDef (but not in NDEBUG)
261 // - we'd like to accept print in sprintf, but too many false positives
262 if (WordRole[W] == Tail &&
263 (Word[W] == LowWord[W] || !(WordTypeSet & 1 << Lower)))
264 return false;
265 }
266 return true;
267}
268
269int FuzzyMatcher::skipPenalty(int W, Action Last) const {
270 if (W == 0) // Skipping the first character.
271 return 3;
272 if (WordRole[W] == Head) // Skipping a segment.
273 return 1; // We want to keep this lower than a consecutive match bonus.
274 // Instead of penalizing non-consecutive matches, we give a bonus to a
275 // consecutive match in matchBonus. This produces a better score distribution
276 // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'.
277 return 0;
278}
279
280int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
281 assert(LowPat[P] == LowWord[W]);
282 int S = 1;
283 bool IsPatSingleCase =
284 (PatTypeSet == 1 << Lower) || (PatTypeSet == 1 << Upper);
285 // Bonus: case matches, or a Head in the pattern aligns with one in the word.
286 // Single-case patterns lack segmentation signals and we assume any character
287 // can be a head of a segment.
288 if (Pat[P] == Word[W] ||
289 (WordRole[W] == Head && (IsPatSingleCase || PatRole[P] == Head)))
290 ++S;
291 // Bonus: a consecutive match. First character match also gets a bonus to
292 // ensure prefix final match score normalizes to 1.0.
293 if (W == 0 || Last == Match)
294 S += 2;
295 // Penalty: matching inside a segment (and previous char wasn't matched).
296 if (WordRole[W] == Tail && P && Last == Miss)
297 S -= 3;
298 // Penalty: a Head in the pattern matches in the middle of a word segment.
299 if (PatRole[P] == Head && WordRole[W] == Tail)
300 --S;
301 // Penalty: matching the first pattern character in the middle of a segment.
302 if (P == 0 && WordRole[W] == Tail)
303 S -= 4;
304 assert(S <= PerfectBonus);
305 return S;
306}
307
308llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
309 llvm::SmallString<256> Result;
310 OS << "=== Match \"" << llvm::StringRef(Word, WordN) << "\" against ["
311 << llvm::StringRef(Pat, PatN) << "] ===\n";
312 if (PatN == 0) {
313 OS << "Pattern is empty: perfect match.\n";
314 return Result = llvm::StringRef(Word, WordN);
315 }
316 if (WordN == 0) {
317 OS << "Word is empty: no match.\n";
318 return Result;
319 }
320 if (!WordContainsPattern) {
321 OS << "Substring check failed.\n";
322 return Result;
323 }
324 if (isAwful(S: std::max(a: Scores[PatN][WordN][Match].Score,
325 b: Scores[PatN][WordN][Miss].Score))) {
326 OS << "Substring check passed, but all matches are forbidden\n";
327 }
328 if (!(PatTypeSet & 1 << Upper))
329 OS << "Lowercase query, so scoring ignores case\n";
330
331 // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
332 // The Score table has cumulative scores, subtracting along this path gives
333 // us the per-letter scores.
334 Action Last =
335 (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
336 ? Match
337 : Miss;
338 int S[MaxWord];
339 Action A[MaxWord];
340 for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
341 A[W] = Last;
342 const auto &Cell = Scores[P + 1][W + 1][Last];
343 if (Last == Match)
344 --P;
345 const auto &Prev = Scores[P + 1][W][Cell.Prev];
346 S[W] = Cell.Score - Prev.Score;
347 Last = Cell.Prev;
348 }
349 for (int I = 0; I < WordN; ++I) {
350 if (A[I] == Match && (I == 0 || A[I - 1] == Miss))
351 Result.push_back(Elt: '[');
352 if (A[I] == Miss && I > 0 && A[I - 1] == Match)
353 Result.push_back(Elt: ']');
354 Result.push_back(Elt: Word[I]);
355 }
356 if (A[WordN - 1] == Match)
357 Result.push_back(Elt: ']');
358
359 for (char C : llvm::StringRef(Word, WordN))
360 OS << " " << C << " ";
361 OS << "\n";
362 for (int I = 0, J = 0; I < WordN; I++)
363 OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
364 OS << "\n";
365 for (int I = 0; I < WordN; I++)
366 OS << llvm::format(Fmt: "%2d ", Vals: S[I]);
367 OS << "\n";
368
369 OS << "\nSegmentation:";
370 OS << "\n'" << llvm::StringRef(Word, WordN) << "'\n ";
371 for (int I = 0; I < WordN; ++I)
372 OS << "?-+ "[static_cast<int>(WordRole[I])];
373 OS << "\n[" << llvm::StringRef(Pat, PatN) << "]\n ";
374 for (int I = 0; I < PatN; ++I)
375 OS << "?-+ "[static_cast<int>(PatRole[I])];
376 OS << "\n";
377
378 OS << "\nScoring table (last-Miss, last-Match):\n";
379 OS << " | ";
380 for (char C : llvm::StringRef(Word, WordN))
381 OS << " " << C << " ";
382 OS << "\n";
383 OS << "-+----" << std::string(WordN * 4, '-') << "\n";
384 for (int I = 0; I <= PatN; ++I) {
385 for (Action A : {Miss, Match}) {
386 OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|";
387 for (int J = 0; J <= WordN; ++J) {
388 if (!isAwful(S: Scores[I][J][A].Score))
389 OS << llvm::format(Fmt: "%3d%c", Vals: Scores[I][J][A].Score,
390 Vals: Scores[I][J][A].Prev == Match ? '*' : ' ');
391 else
392 OS << " ";
393 }
394 OS << "\n";
395 }
396 }
397
398 return Result;
399}
400
401} // namespace clangd
402} // namespace clang
403

source code of clang-tools-extra/clangd/FuzzyMatch.cpp