1 | //===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements fuzzy-matching of strings against identifiers. |
10 | // It indicates both the existence and quality of a match: |
11 | // 'eb' matches both 'emplace_back' and 'embed', the former has a better score. |
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H |
16 | #define |
17 | |
18 | #include "llvm/ADT/ArrayRef.h" |
19 | #include "llvm/ADT/SmallString.h" |
20 | #include "llvm/ADT/StringRef.h" |
21 | #include "llvm/Support/raw_ostream.h" |
22 | #include <optional> |
23 | |
24 | namespace clang { |
25 | namespace clangd { |
26 | |
27 | // Utilities for word segmentation. |
28 | // FuzzyMatcher already incorporates this logic, so most users don't need this. |
29 | // |
30 | // A name like "fooBar_baz" consists of several parts foo, bar, baz. |
31 | // Aligning segmentation of word and pattern improves the fuzzy-match. |
32 | // For example: [lol] matches "LaughingOutLoud" better than "LionPopulation" |
33 | // |
34 | // First we classify each character into types (uppercase, lowercase, etc). |
35 | // Then we look at the sequence: e.g. [upper, lower] is the start of a segment. |
36 | |
// Character classes that influence segmentation.
// Digits have no obvious segmentation, so they are treated as lowercase
// letters. UTF-8 is not decoded, so bytes over 127 are treated as lowercase
// too; as a result those characters require an exact (case-sensitive) match.
enum CharType : unsigned char {
  // Before-the-start and after-the-end (and control chars).
  Empty = 0,
  // Lowercase letters, digits, and non-ASCII bytes.
  Lower = 1,
  // Uppercase letters.
  Upper = 2,
  // ASCII punctuation (including Space).
  Punctuation = 3,
};
// Bitfield recording every character type that occurs in a word.
// The bit for a given type is 1<<Type, i.e. 1<<Empty, 1<<Lower, and so on.
using CharTypeSet = unsigned char;
50 | |
// Every character has a Role: the Head or Tail of a word segment, or a
// Separator.
// e.g. XMLHttpRequest_Async
//      +--+---+------ +----
//      ^Head   ^Tail ^Separator
enum CharRole : unsigned char {
  // Stray control characters or impossible states.
  Unknown = 0,
  // Part of a word segment, but not the first character.
  Tail = 1,
  // The first character of a word segment.
  Head = 2,
  // Punctuation characters that separate word segments.
  Separator = 3,
};
61 | |
62 | // Compute segmentation of Text. |
63 | // Character roles are stored in Roles (Roles.size() must equal Text.size()). |
64 | // The set of character types encountered is returned, this may inform |
65 | // heuristics for dealing with poorly-segmented identifiers like "strndup". |
66 | CharTypeSet calculateRoles(llvm::StringRef Text, |
67 | llvm::MutableArrayRef<CharRole> Roles); |
68 | |
69 | // A matcher capable of matching and scoring strings against a single pattern. |
70 | // It's optimized for matching against many strings - match() does not allocate. |
71 | class FuzzyMatcher { |
72 | public: |
73 | // Characters beyond MaxPat are ignored. |
74 | FuzzyMatcher(llvm::StringRef Pattern); |
75 | |
76 | // If Word matches the pattern, return a score indicating the quality match. |
77 | // Scores usually fall in a [0,1] range, with 1 being a very good score. |
78 | // "Super" scores in (1,2] are possible if the pattern is the full word. |
79 | // Characters beyond MaxWord are ignored. |
80 | std::optional<float> match(llvm::StringRef Word); |
81 | |
82 | llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); } |
83 | bool empty() const { return PatN == 0; } |
84 | |
85 | // Dump internal state from the last match() to the stream, for debugging. |
86 | // Returns the pattern with [] around matched characters, e.g. |
87 | // [u_p] + "unique_ptr" --> "[u]nique[_p]tr" |
88 | llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const; |
89 | |
90 | private: |
91 | // We truncate the pattern and the word to bound the cost of matching. |
92 | constexpr static int MaxPat = 63, MaxWord = 127; |
93 | // Action describes how a word character was matched to the pattern. |
94 | // It should be an enum, but this causes bitfield problems: |
95 | // - for MSVC the enum type must be explicitly unsigned for correctness |
96 | // - GCC 4.8 complains not all values fit if the type is unsigned |
97 | using Action = bool; |
98 | constexpr static Action Miss = false; // Word character was skipped. |
99 | constexpr static Action Match = true; // Matched against a pattern character. |
100 | |
101 | bool init(llvm::StringRef Word); |
102 | void buildGraph(); |
103 | bool allowMatch(int P, int W, Action Last) const; |
104 | int skipPenalty(int W, Action Last) const; |
105 | int matchBonus(int P, int W, Action Last) const; |
106 | |
107 | // Pattern data is initialized by the constructor, then constant. |
108 | char Pat[MaxPat]; // Pattern data |
109 | int PatN; // Length |
110 | char LowPat[MaxPat]; // Pattern in lowercase |
111 | CharRole PatRole[MaxPat]; // Pattern segmentation info |
112 | CharTypeSet PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters |
113 | float ScoreScale; // Normalizes scores for the pattern length. |
114 | |
115 | // Word data is initialized on each call to match(), mostly by init(). |
116 | char Word[MaxWord]; // Word data |
117 | int WordN; // Length |
118 | char LowWord[MaxWord]; // Word in lowercase |
119 | CharRole WordRole[MaxWord]; // Word segmentation info |
120 | CharTypeSet WordTypeSet; // Bitmask of 1<<CharType for all Word characters |
121 | bool WordContainsPattern; // Simple substring check |
122 | |
123 | // Cumulative best-match score table. |
124 | // Boundary conditions are filled in by the constructor. |
125 | // The rest is repopulated for each match(), by buildGraph(). |
126 | struct ScoreInfo { |
127 | signed int Score : 15; |
128 | Action Prev : 1; |
129 | }; |
130 | ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2]; |
131 | }; |
132 | |
133 | } // namespace clangd |
134 | } // namespace clang |
135 | |
136 | #endif |
137 | |