| 1 | //===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements fuzzy-matching of strings against identifiers. |
| 10 | // It indicates both the existence and quality of a match: |
| 11 | // 'eb' matches both 'emplace_back' and 'embed', the former has a better score. |
| 12 | // |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H |
| 16 | #define |
| 17 | |
| 18 | #include "llvm/ADT/ArrayRef.h" |
| 19 | #include "llvm/ADT/SmallString.h" |
| 20 | #include "llvm/ADT/StringRef.h" |
| 21 | #include "llvm/Support/raw_ostream.h" |
| 22 | #include <optional> |
| 23 | |
| 24 | namespace clang { |
| 25 | namespace clangd { |
| 26 | |
| 27 | // Utilities for word segmentation. |
| 28 | // FuzzyMatcher already incorporates this logic, so most users don't need this. |
| 29 | // |
| 30 | // A name like "fooBar_baz" consists of several parts foo, bar, baz. |
| 31 | // Aligning segmentation of word and pattern improves the fuzzy-match. |
| 32 | // For example: [lol] matches "LaughingOutLoud" better than "LionPopulation" |
| 33 | // |
| 34 | // First we classify each character into types (uppercase, lowercase, etc). |
| 35 | // Then we look at the sequence: e.g. [upper, lower] is the start of a segment. |
| 36 | |
// Character classes that drive segmentation.
// Digits have no obvious segmentation rule, so they are classified together
// with lowercase letters. UTF-8 is not decoded: every byte above 127 is also
// classified as lowercase, which means such characters only match exactly
// (case-sensitively).
enum CharType : unsigned char {
  // Before-the-start and after-the-end (and control chars).
  Empty = 0,
  // Lowercase letters, digits, and non-ASCII bytes.
  Lower = 1,
  // Uppercase letters.
  Upper = 2,
  // ASCII punctuation (including Space).
  Punctuation = 3,
};

// Bitfield recording which character types occur in a word.
// The bit for a type T is 1<<T, i.e. 1<<Empty, 1<<Lower, etc.
using CharTypeSet = unsigned char;
| 50 | |
// The Role of a character is its position relative to word segments: the Head
// (first character) of a segment, the Tail (any later character) of a segment,
// or a Separator between segments.
// e.g. XMLHttpRequest_Async
//      +--+---+------ +----
//      ^Head   ^Tail ^Separator
enum CharRole : unsigned char {
  // Stray control characters or impossible states.
  Unknown = 0,
  // Part of a word segment, but not the first character.
  Tail = 1,
  // The first character of a word segment.
  Head = 2,
  // Punctuation characters that separate word segments.
  Separator = 3,
};
| 61 | |
| 62 | // Compute segmentation of Text. |
| 63 | // Character roles are stored in Roles (Roles.size() must equal Text.size()). |
| 64 | // The set of character types encountered is returned, this may inform |
| 65 | // heuristics for dealing with poorly-segmented identifiers like "strndup". |
| 66 | CharTypeSet calculateRoles(llvm::StringRef Text, |
| 67 | llvm::MutableArrayRef<CharRole> Roles); |
| 68 | |
| 69 | // A matcher capable of matching and scoring strings against a single pattern. |
| 70 | // It's optimized for matching against many strings - match() does not allocate. |
| 71 | class FuzzyMatcher { |
| 72 | public: |
| 73 | // Characters beyond MaxPat are ignored. |
| 74 | FuzzyMatcher(llvm::StringRef Pattern); |
| 75 | |
| 76 | // If Word matches the pattern, return a score indicating the quality match. |
| 77 | // Scores usually fall in a [0,1] range, with 1 being a very good score. |
| 78 | // "Super" scores in (1,2] are possible if the pattern is the full word. |
| 79 | // Characters beyond MaxWord are ignored. |
| 80 | std::optional<float> match(llvm::StringRef Word); |
| 81 | |
| 82 | llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); } |
| 83 | bool empty() const { return PatN == 0; } |
| 84 | |
| 85 | // Dump internal state from the last match() to the stream, for debugging. |
| 86 | // Returns the pattern with [] around matched characters, e.g. |
| 87 | // [u_p] + "unique_ptr" --> "[u]nique[_p]tr" |
| 88 | llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const; |
| 89 | |
| 90 | private: |
| 91 | // We truncate the pattern and the word to bound the cost of matching. |
| 92 | constexpr static int MaxPat = 63, MaxWord = 127; |
| 93 | // Action describes how a word character was matched to the pattern. |
| 94 | // It should be an enum, but this causes bitfield problems: |
| 95 | // - for MSVC the enum type must be explicitly unsigned for correctness |
| 96 | // - GCC 4.8 complains not all values fit if the type is unsigned |
| 97 | using Action = bool; |
| 98 | constexpr static Action Miss = false; // Word character was skipped. |
| 99 | constexpr static Action Match = true; // Matched against a pattern character. |
| 100 | |
| 101 | bool init(llvm::StringRef Word); |
| 102 | void buildGraph(); |
| 103 | bool allowMatch(int P, int W, Action Last) const; |
| 104 | int skipPenalty(int W, Action Last) const; |
| 105 | int matchBonus(int P, int W, Action Last) const; |
| 106 | |
| 107 | // Pattern data is initialized by the constructor, then constant. |
| 108 | char Pat[MaxPat]; // Pattern data |
| 109 | int PatN; // Length |
| 110 | char LowPat[MaxPat]; // Pattern in lowercase |
| 111 | CharRole PatRole[MaxPat]; // Pattern segmentation info |
| 112 | CharTypeSet PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters |
| 113 | float ScoreScale; // Normalizes scores for the pattern length. |
| 114 | |
| 115 | // Word data is initialized on each call to match(), mostly by init(). |
| 116 | char Word[MaxWord]; // Word data |
| 117 | int WordN; // Length |
| 118 | char LowWord[MaxWord]; // Word in lowercase |
| 119 | CharRole WordRole[MaxWord]; // Word segmentation info |
| 120 | CharTypeSet WordTypeSet; // Bitmask of 1<<CharType for all Word characters |
| 121 | bool WordContainsPattern; // Simple substring check |
| 122 | |
| 123 | // Cumulative best-match score table. |
| 124 | // Boundary conditions are filled in by the constructor. |
| 125 | // The rest is repopulated for each match(), by buildGraph(). |
| 126 | struct ScoreInfo { |
| 127 | signed int Score : 15; |
| 128 | Action Prev : 1; |
| 129 | }; |
| 130 | ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2]; |
| 131 | }; |
| 132 | |
| 133 | } // namespace clangd |
| 134 | } // namespace clang |
| 135 | |
| 136 | #endif |
| 137 | |