FuzzyMatch.h source code [clang-tools-extra/clangd/FuzzyMatch.h]

1	//===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements fuzzy-matching of strings against identifiers.
10	// It indicates both the existence and quality of a match:
11	// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
12	//
13	//===----------------------------------------------------------------------===//
14
15	#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
16	#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
17
18	#include "llvm/ADT/ArrayRef.h"
19	#include "llvm/ADT/SmallString.h"
20	#include "llvm/ADT/StringRef.h"
21	#include "llvm/Support/raw_ostream.h"
22	#include <optional>
23
24	namespace clang {
25	namespace clangd {
26
27	// Utilities for word segmentation.
28	// FuzzyMatcher already incorporates this logic, so most users don't need this.
29	//
30	// A name like "fooBar_baz" consists of several parts foo, bar, baz.
31	// Aligning segmentation of word and pattern improves the fuzzy-match.
32	// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
33	//
34	// First we classify each character into types (uppercase, lowercase, etc).
35	// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
36
37	// We distinguish the types of characters that affect segmentation.
38	// It's not obvious how to segment digits, we treat them as lowercase letters.
39	// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
40	// This means we require exact (case-sensitive) match for those characters.
// Classification of a single byte for the purpose of word segmentation.
// The underlying type is a single byte so that per-character arrays of
// CharType stay compact, and the values are small enough that 1<<CharType
// fits in an unsigned char bitmask.
enum CharType : unsigned char {
  Empty = 0,       // Before-the-start and after-the-end (and control chars).
  Lower = 1,       // Lowercase letters, digits, and non-ASCII bytes.
  Upper = 2,       // Uppercase letters.
  Punctuation = 3, // ASCII punctuation (including Space)
};
47	// A CharTypeSet is a bitfield representing all the character types in a word.
48	// Its bits are 1<<Empty, 1<<Lower, etc.
49	using CharTypeSet = unsigned char;
50
51	// Each character's Role is the Head or Tail of a segment, or a Separator.
52	// e.g. XMLHttpRequest_Async
53	//      +--+---+------ +----
54	//      ^Head   ^Tail ^Separator
// The role a character plays within the segmentation of an identifier.
// Stored as a single byte so that one CharRole per character is cheap.
enum CharRole : unsigned char {
  Unknown = 0,   // Stray control characters or impossible states.
  Tail = 1,      // Part of a word segment, but not the first character.
  Head = 2,      // The first character of a word segment.
  Separator = 3, // Punctuation characters that separate word segments.
};
61
62	// Compute segmentation of Text.
63	// Character roles are stored in Roles (Roles.size() must equal Text.size()).
64	// The set of character types encountered is returned, this may inform
65	// heuristics for dealing with poorly-segmented identifiers like "strndup".
66	CharTypeSet calculateRoles(llvm::StringRef Text,
67	llvm::MutableArrayRef<CharRole> Roles);
68
69	// A matcher capable of matching and scoring strings against a single pattern.
70	// It's optimized for matching against many strings - match() does not allocate.
71	class FuzzyMatcher {
72	public:
73	// Characters beyond MaxPat are ignored.
74	FuzzyMatcher(llvm::StringRef Pattern);
75
76	// If Word matches the pattern, return a score indicating the quality match.
77	// Scores usually fall in a [0,1] range, with 1 being a very good score.
78	// "Super" scores in (1,2] are possible if the pattern is the full word.
79	// Characters beyond MaxWord are ignored.
80	std::optional<float> match(llvm::StringRef Word);
81
82	llvm::StringRef pattern() const { return llvm::StringRef (Pat, PatN); }
83	bool empty() const { return PatN == `0`; }
84
85	// Dump internal state from the last match() to the stream, for debugging.
86	// Returns the pattern with [] around matched characters, e.g.
87	// [u_p] + "unique_ptr" --> "[u]nique[_p]tr"
88	llvm::SmallString<`256`> dumpLast(llvm::raw_ostream &) const;
89
90	private:
91	// We truncate the pattern and the word to bound the cost of matching.
92	constexpr static int MaxPat = `63`, MaxWord = `127`;
93	// Action describes how a word character was matched to the pattern.
94	// It should be an enum, but this causes bitfield problems:
95	// - for MSVC the enum type must be explicitly unsigned for correctness
96	// - GCC 4.8 complains not all values fit if the type is unsigned
97	using Action = bool;
98	constexpr static Action Miss = false; // Word character was skipped.
99	constexpr static Action Match = true; // Matched against a pattern character.
100
101	bool init(llvm::StringRef Word);
102	void buildGraph();
103	bool allowMatch(int P, int W, Action Last) const;
104	int skipPenalty(int W, Action Last) const;
105	int matchBonus(int P, int W, Action Last) const;
106
107	// Pattern data is initialized by the constructor, then constant.
108	char Pat[MaxPat]; // Pattern data
109	int PatN; // Length
110	char LowPat[MaxPat]; // Pattern in lowercase
111	CharRole PatRole[MaxPat]; // Pattern segmentation info
112	CharTypeSet PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters
113	float ScoreScale; // Normalizes scores for the pattern length.
114
115	// Word data is initialized on each call to match(), mostly by init().
116	char Word[MaxWord]; // Word data
117	int WordN; // Length
118	char LowWord[MaxWord]; // Word in lowercase
119	CharRole WordRole[MaxWord]; // Word segmentation info
120	CharTypeSet WordTypeSet; // Bitmask of 1<<CharType for all Word characters
121	bool WordContainsPattern; // Simple substring check
122
123	// Cumulative best-match score table.
124	// Boundary conditions are filled in by the constructor.
125	// The rest is repopulated for each match(), by buildGraph().
126	struct ScoreInfo {
127	signed int Score : `15`;
128	Action Prev : `1`;
129	};
130	ScoreInfo Scores[MaxPat + `1`][MaxWord + `1`][/ Last Action / `2`];
131	};
132
133	} // namespace clangd
134	} // namespace clang
135
136	#endif
137

Provided by KDAB

Learn to use CMake with our Intro Training

Find out more

Definitions

source code of clang-tools-extra/clangd/FuzzyMatch.h