FuzzyMatch.cpp source code [clang-tools-extra/clangd/FuzzyMatch.cpp]

1	//===--- FuzzyMatch.h - Approximate identifier matching ---------- C++--===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
10	// we consider the possible partial match states:
11	//
12	// u n i q u e _ p t r
13	// +---------------------
14	// \|A . . . . . . . . . .
15	// u\|
16	// \|. . . . . . . . . . .
17	// _\|
18	// \|. . . . . . . O . . .
19	// p\|
20	// \|. . . . . . . . . . B
21	//
22	// Each dot represents some prefix of the pattern being matched against some
23	// prefix of the word.
24	// - A is the initial state: '' matched against ''
25	// - O is an intermediate state: 'u_' matched against 'unique_'
26	// - B is the target state: 'u_p' matched against 'unique_ptr'
27	//
28	// We aim to find the best path from A->B.
29	// - Moving right (consuming a word character)
30	// Always legal: not all word characters must match.
31	// - Moving diagonally (consuming both a word and pattern character)
32	// Legal if the characters match.
33	// - Moving down (consuming a pattern character) is never legal.
34	// Never legal: all pattern characters must match something.
35	// Characters are matched case-insensitively.
36	// The first pattern character may only match the start of a word segment.
37	//
38	// The scoring is based on heuristics:
39	// - when matching a character, apply a bonus or penalty depending on the
40	// match quality (does case match, do word segments align, etc)
41	// - when skipping a character, apply a penalty if it hurts the match
42	// (it starts a word segment, or splits the matched region, etc)
43	//
44	// These heuristics require the ability to "look backward" one character, to
45	// see whether it was matched or not. Therefore the dynamic-programming matrix
46	// has an extra dimension (last character matched).
47	// Each entry also has an additional flag indicating whether the last-but-one
48	// character matched, which is needed to trace back through the scoring table
49	// and reconstruct the match.
50	//
51	// We treat strings as byte-sequences, so only ASCII has first-class support.
52	//
53	// This algorithm was inspired by VS code's client-side filtering, and aims
54	// to be mostly-compatible.
55	//
56	//===----------------------------------------------------------------------===//
57
58	#include "FuzzyMatch.h"
59	#include "llvm/Support/Format.h"
60	#include <optional>
61
62	namespace clang {
63	namespace clangd {
64
65	constexpr int FuzzyMatcher::MaxPat;
66	constexpr int FuzzyMatcher::MaxWord;
67
68	static char lower(char C) { return C >= `'A'` && C <= `'Z'` ? C + (`'a'` - `'A'`) : C; }
69	// A "negative infinity" score that won't overflow.
70	// We use this to mark unreachable states and forbidden solutions.
71	// Score field is 15 bits wide, min value is -2^14, we use half of that.
72	static constexpr int AwfulScore = -(`1` << `13`);
73	static bool isAwful(int S) { return S < AwfulScore / `2`; }
74	static constexpr int PerfectBonus = `4`; // Perfect per-pattern-char score.
75
76	FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern)
77	: PatN(std::min<int>(a: MaxPat, b: Pattern.size())),
78	ScoreScale(PatN ? float{`1`} / (PerfectBonus * PatN) : `0`), WordN(`0`) {
79	std::copy(first: Pattern.begin(), last: Pattern.begin() + PatN, result: Pat);
80	for (int I = `0`; I < PatN; ++I)
81	LowPat[I] = lower(C: Pat[I]);
82	Scores[`0`][`0`][Miss] = {.Score: `0`, .Prev: Miss};
83	Scores[`0`][`0`][Match] = {.Score: AwfulScore, .Prev: Miss};
84	for (int P = `0`; P <= PatN; ++P)
85	for (int W = `0`; W < P; ++W)
86	for (Action A : {Miss, Match})
87	Scores[P][W][A] = {.Score: AwfulScore, .Prev: Miss};
88	PatTypeSet = calculateRoles(Text: llvm::StringRef (Pat, PatN),
89	Roles: llvm::MutableArrayRef(PatRole, PatN));
90	}
91
92	std::optional<float> FuzzyMatcher::match(llvm::StringRef Word) {
93	if (!(WordContainsPattern = init(Word)))
94	return std::nullopt;
95	if (!PatN)
96	return `1`;
97	buildGraph();
98	auto Best = std::max(a: Scores[PatN][WordN][Miss].Score,
99	b: Scores[PatN][WordN][Match].Score);
100	if (isAwful(S: Best))
101	return std::nullopt;
102	float Score =
103	ScoreScale * std::min(a: PerfectBonus * PatN, b: std::max<int>(a: `0`, b: Best));
104	// If the pattern is as long as the word, we have an exact string match,
105	// since every pattern character must match something.
106	if (WordN == PatN)
107	Score = `2`; // May not be perfect 2 if case differs in a significant way.*
108	return Score;
109	}
110
111	// We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
112	// The top 6 bits of the char select the byte, the bottom 2 select the offset.
113	// e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
114	constexpr static uint8_t CharTypes[] = {
115	`0x00`, `0x00`, `0x00`, `0x00`, // Control characters
116	`0x00`, `0x00`, `0x00`, `0x00`, // Control characters
117	`0xff`, `0xff`, `0xff`, `0xff`, // Punctuation
118	`0x55`, `0x55`, `0xf5`, `0xff`, // Numbers->Lower, more Punctuation.
119	`0xab`, `0xaa`, `0xaa`, `0xaa`, // @ and A-O
120	`0xaa`, `0xaa`, `0xea`, `0xff`, // P-Z, more Punctuation.
121	`0x57`, `0x55`, `0x55`, `0x55`, // ` and a-o
122	`0x55`, `0x55`, `0xd5`, `0x3f`, // p-z, Punctuation, DEL.
123	`0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`, // Bytes over 127 -> Lower.
124	`0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`, // (probably UTF-8).
125	`0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`,
126	`0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`, `0x55`,
127	};
128
129	// The Role can be determined from the Type of a character and its neighbors:
130	//
131	// Example \| Chars \| Type \| Role
132	// ---------+--------------+-----
133	// F(o)oBar \| Foo \| Ull \| Tail
134	// Foo(B)ar \| oBa \| lUl \| Head
135	// (f)oo \| ^fo \| Ell \| Head
136	// H(T)TP \| HTT \| UUU \| Tail
137	//
138	// Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
139	// A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
140	// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
141	constexpr static uint8_t CharRoles[] = {
142	// clang-format off
143	// Curr= Empty Lower Upper Separ
144	/ Prev=Empty / `0x00`, `0xaa`, `0xaa`, `0xff`, // At start, Lower\|Upper->Head
145	/ Prev=Lower / `0x00`, `0x55`, `0xaa`, `0xff`, // In word, Upper->Head;Lower->Tail
146	/ Prev=Upper / `0x00`, `0x55`, `0x59`, `0xff`, // Ditto, but U(U)U->Tail
147	/ Prev=Separ / `0x00`, `0xaa`, `0xaa`, `0xff`, // After separator, like at start
148	// clang-format on
149	};
150
151	template <typename T> static T packedLookup(const uint8_t Data, int* I) {
152	return static_cast<T>((Data[I >> `2`] >> ((I & `3`) * `2`)) & `3`);
153	}
154	CharTypeSet calculateRoles(llvm::StringRef Text,
155	llvm::MutableArrayRef<CharRole> Roles) {
156	assert(Text.size() == Roles.size());
157	if (Text.size() == `0`)
158	return `0`;
159	CharType Type = packedLookup<CharType>(Data: CharTypes, I: Text [`0`]);
160	CharTypeSet TypeSet = `1` << Type;
161	// Types holds a sliding window of (Prev, Curr, Next) types.
162	// Initial value is (Empty, Empty, type of Text[0]).
163	int Types = Type;
164	// Rotate slides in the type of the next character.
165	auto Rotate = [&](CharType T) { Types = ((Types << `2`) \| T) & `0x3f`; };
166	for (unsigned I = `0`; I < Text.size() - `1`; ++I) {
167	// For each character, rotate in the next, and look up the role.
168	Type = packedLookup<CharType>(Data: CharTypes, I: Text [I + `1`]);
169	TypeSet \|= `1` << Type;
170	Rotate (Type);
171	Roles [I] = packedLookup<CharRole>(Data: CharRoles, I: Types);
172	}
173	// For the last character, the "next character" is Empty.
174	Rotate (Empty);
175	Roles [Text.size() - `1`] = packedLookup<CharRole>(Data: CharRoles, I: Types);
176	return TypeSet;
177	}
178
179	// Sets up the data structures matching Word.
180	// Returns false if we can cheaply determine that no match is possible.
181	bool FuzzyMatcher::init(llvm::StringRef NewWord) {
182	WordN = std::min<int>(a: MaxWord, b: NewWord.size());
183	if (PatN > WordN)
184	return false;
185	std::copy(first: NewWord.begin(), last: NewWord.begin() + WordN, result: Word);
186	if (PatN == `0`)
187	return true;
188	for (int I = `0`; I < WordN; ++I)
189	LowWord[I] = lower(C: Word[I]);
190
191	// Cheap subsequence check.
192	for (int W = `0`, P = `0`; P != PatN; ++W) {
193	if (W == WordN)
194	return false;
195	if (LowWord[W] == LowPat[P])
196	++P;
197	}
198
199	// FIXME: some words are hard to tokenize algorithmically.
200	// e.g. vsprintf is V S Print F, and should match [pri] but not [int].
201	// We could add a tokenization dictionary for common stdlib names.
202	WordTypeSet = calculateRoles(Text: llvm::StringRef (Word, WordN),
203	Roles: llvm::MutableArrayRef(WordRole, WordN));
204	return true;
205	}
206
207	// The forwards pass finds the mappings of Pattern onto Word.
208	// Score = best score achieved matching Word[..W] against Pat[..P].
209	// Unlike other tables, indices range from 0 to N inclusive
210	// Matched = whether we chose to match Word[W] with Pat[P] or not.
211	//
212	// Points are mostly assigned to matched characters, with 1 being a good score
213	// and 3 being a great one. So we treat the score range as [0, 3 PatN].*
214	// This range is not strict: we can apply larger bonuses/penalties, or penalize
215	// non-matched characters.
216	void FuzzyMatcher::buildGraph() {
217	for (int W = `0`; W < WordN; ++W) {
218	Scores[`0`][W + `1`][Miss] = {.Score: Scores[`0`][W][Miss].Score - skipPenalty(W, Last: Miss),
219	.Prev: Miss};
220	Scores[`0`][W + `1`][Match] = {.Score: AwfulScore, .Prev: Miss};
221	}
222	for (int P = `0`; P < PatN; ++P) {
223	for (int W = P; W < WordN; ++W) {
224	auto &Score = Scores[P + `1`][W + `1`], &PreMiss = Scores[P + `1`][W];
225
226	auto MatchMissScore = PreMiss[Match].Score;
227	auto MissMissScore = PreMiss[Miss].Score;
228	if (P < PatN - `1`) { // Skipping trailing characters is always free.
229	MatchMissScore -= skipPenalty(W, Last: Match);
230	MissMissScore -= skipPenalty(W, Last: Miss);
231	}
232	Score[Miss] = (MatchMissScore > MissMissScore)
233	? ScoreInfo{.Score: MatchMissScore, .Prev: Match}
234	: ScoreInfo{.Score: MissMissScore, .Prev: Miss};
235
236	auto &PreMatch = Scores[P][W];
237	auto MatchMatchScore =
238	allowMatch(P, W, Last: Match)
239	? PreMatch[Match].Score + matchBonus(P, W, Last: Match)
240	: AwfulScore;
241	auto MissMatchScore = allowMatch(P, W, Last: Miss)
242	? PreMatch[Miss].Score + matchBonus(P, W, Last: Miss)
243	: AwfulScore;
244	Score[Match] = (MatchMatchScore > MissMatchScore)
245	? ScoreInfo{.Score: MatchMatchScore, .Prev: Match}
246	: ScoreInfo{.Score: MissMatchScore, .Prev: Miss};
247	}
248	}
249	}
250
251	bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
252	if (LowPat[P] != LowWord[W])
253	return false;
254	// We require a "strong" match:
255	// - for the first pattern character. [foo] !~ "barefoot"
256	// - after a gap. [pat] !~ "patnther"
257	if (Last == Miss) {
258	// We're banning matches outright, so conservatively accept some other cases
259	// where our segmentation might be wrong:
260	// - allow matching B in ABCDef (but not in NDEBUG)
261	// - we'd like to accept print in sprintf, but too many false positives
262	if (WordRole[W] == Tail &&
263	(Word[W] == LowWord[W] \|\| !(WordTypeSet & `1` << Lower)))
264	return false;
265	}
266	return true;
267	}
268
269	int FuzzyMatcher::skipPenalty(int W, Action Last) const {
270	if (W == `0`) // Skipping the first character.
271	return `3`;
272	if (WordRole[W] == Head) // Skipping a segment.
273	return `1`; // We want to keep this lower than a consecutive match bonus.
274	// Instead of penalizing non-consecutive matches, we give a bonus to a
275	// consecutive match in matchBonus. This produces a better score distribution
276	// than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'.
277	return `0`;
278	}
279
280	int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
281	assert(LowPat[P] == LowWord[W]);
282	int S = `1`;
283	bool IsPatSingleCase =
284	(PatTypeSet == `1` << Lower) \|\| (PatTypeSet == `1` << Upper);
285	// Bonus: case matches, or a Head in the pattern aligns with one in the word.
286	// Single-case patterns lack segmentation signals and we assume any character
287	// can be a head of a segment.
288	if (Pat[P] == Word[W] \|\|
289	(WordRole[W] == Head && (IsPatSingleCase \|\| PatRole[P] == Head)))
290	++S;
291	// Bonus: a consecutive match. First character match also gets a bonus to
292	// ensure prefix final match score normalizes to 1.0.
293	if (W == `0` \|\| Last == Match)
294	S += `2`;
295	// Penalty: matching inside a segment (and previous char wasn't matched).
296	if (WordRole[W] == Tail && P && Last == Miss)
297	S -= `3`;
298	// Penalty: a Head in the pattern matches in the middle of a word segment.
299	if (PatRole[P] == Head && WordRole[W] == Tail)
300	--S;
301	// Penalty: matching the first pattern character in the middle of a segment.
302	if (P == `0` && WordRole[W] == Tail)
303	S -= `4`;
304	assert(S <= PerfectBonus);
305	return S;
306	}
307
308	llvm::SmallString<`256`> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
309	llvm::SmallString<`256`> Result;
310	OS << "=== Match \"" << llvm::StringRef (Word, WordN) << "\" against ["
311	<< llvm::StringRef (Pat, PatN) << "] ===\n";
312	if (PatN == `0`) {
313	OS << "Pattern is empty: perfect match.\n";
314	return Result = llvm::StringRef (Word, WordN);
315	}
316	if (WordN == `0`) {
317	OS << "Word is empty: no match.\n";
318	return Result;
319	}
320	if (!WordContainsPattern) {
321	OS << "Substring check failed.\n";
322	return Result;
323	}
324	if (isAwful(S: std::max(a: Scores[PatN][WordN][Match].Score,
325	b: Scores[PatN][WordN][Miss].Score))) {
326	OS << "Substring check passed, but all matches are forbidden\n";
327	}
328	if (!(PatTypeSet & `1` << Upper))
329	OS << "Lowercase query, so scoring ignores case\n";
330
331	// Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
332	// The Score table has cumulative scores, subtracting along this path gives
333	// us the per-letter scores.
334	Action Last =
335	(Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
336	? Match
337	: Miss;
338	int S[MaxWord];
339	Action A[MaxWord];
340	for (int W = WordN - `1`, P = PatN - `1`; W >= `0`; --W) {
341	A[W] = Last;
342	const auto &Cell = Scores[P + `1`][W + `1`][Last];
343	if (Last == Match)
344	--P;
345	const auto &Prev = Scores[P + `1`][W][Cell.Prev];
346	S[W] = Cell.Score - Prev.Score;
347	Last = Cell.Prev;
348	}
349	for (int I = `0`; I < WordN; ++I) {
350	if (A[I] == Match && (I == `0` \|\| A[I - `1`] == Miss))
351	Result.push_back(Elt: `'['`);
352	if (A[I] == Miss && I > `0` && A[I - `1`] == Match)
353	Result.push_back(Elt: `']'`);
354	Result.push_back(Elt: Word[I]);
355	}
356	if (A[WordN - `1`] == Match)
357	Result.push_back(Elt: `']'`);
358
359	for (char C : llvm::StringRef (Word, WordN))
360	OS << " " << C << " ";
361	OS << "\n";
362	for (int I = `0`, J = `0`; I < WordN; I++)
363	OS << " " << (A[I] == Match ? Pat[J++] : `' '`) << " ";
364	OS << "\n";
365	for (int I = `0`; I < WordN; I++)
366	OS << llvm::format(Fmt: "%2d ", Vals: S[I]);
367	OS << "\n";
368
369	OS << "\nSegmentation:";
370	OS << "\n'" << llvm::StringRef (Word, WordN) << "'\n ";
371	for (int I = `0`; I < WordN; ++I)
372	OS << "?-+ "[static_cast<int>(WordRole[I])];
373	OS << "\n[" << llvm::StringRef (Pat, PatN) << "]\n ";
374	for (int I = `0`; I < PatN; ++I)
375	OS << "?-+ "[static_cast<int>(PatRole[I])];
376	OS << "\n";
377
378	OS << "\nScoring table (last-Miss, last-Match):\n";
379	OS << " \| ";
380	for (char C : llvm::StringRef (Word, WordN))
381	OS << " " << C << " ";
382	OS << "\n";
383	OS << "-+----" << std::string (WordN * `4`, `'-'`) << "\n";
384	for (int I = `0`; I <= PatN; ++I) {
385	for (Action A : {Miss, Match}) {
386	OS << ((I && A == Miss) ? Pat[I - `1`] : `' '`) << "\|";
387	for (int J = `0`; J <= WordN; ++J) {
388	if (!isAwful(S: Scores[I][J][A].Score))
389	OS << llvm::format(Fmt: "%3d%c", Vals: Scores[I][J][A].Score,
390	Vals: Scores[I][J][A].Prev == Match ? `'*'` : `' '`);
391	else
392	OS << " ";
393	}
394	OS << "\n";
395	}
396	}
397
398	return Result;
399	}
400
401	} // namespace clangd
402	} // namespace clang
403

source code of clang-tools-extra/clangd/FuzzyMatch.cpp