1 | //===--- Quality.h - Ranking alternatives for ambiguous queries --*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// Some operations such as code completion produce a set of candidates. |
10 | /// Usually the user can choose between them, but we should put the best options |
11 | /// at the top (they're easier to select, and more likely to be seen). |
12 | /// |
13 | /// This file defines building blocks for ranking candidates. |
14 | /// It's used by the features directly and also in the implementation of |
15 | /// indexes, as indexes also need to heuristically limit their results. |
16 | /// |
17 | /// The facilities here are: |
18 | /// - retrieving scoring signals from e.g. indexes, AST, CodeCompletionString |
19 | /// These are structured in a way that they can be debugged, and are fairly |
20 | /// consistent regardless of the source. |
21 | /// - compute scores from scoring signals. These are suitable for sorting. |
22 | /// - sorting utilities like the TopN container. |
23 | /// These could be split up further to isolate dependencies if we care. |
24 | /// |
25 | //===----------------------------------------------------------------------===// |
26 | |
27 | #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_QUALITY_H |
28 | #define |
29 | |
30 | #include "FileDistance.h" |
31 | #include "clang/Sema/CodeCompleteConsumer.h" |
32 | #include "llvm/ADT/StringRef.h" |
33 | #include "llvm/ADT/StringSet.h" |
34 | #include <algorithm> |
35 | #include <functional> |
36 | #include <optional> |
37 | #include <vector> |
38 | |
39 | namespace llvm { |
40 | class raw_ostream; |
41 | } // namespace llvm |
42 | |
43 | namespace clang { |
44 | class CodeCompletionResult; |
45 | |
46 | namespace clangd { |
47 | struct ASTSignals; |
48 | struct Symbol; |
49 | class URIDistance; |
50 | |
51 | // Signals structs are designed to be aggregated from 0 or more sources. |
52 | // A default instance has neutral signals, and sources are merged into it. |
53 | // They can be dumped for debugging, and evaluate()d into a score. |
54 | |
55 | /// Attributes of a symbol that affect how much we like it. |
56 | struct SymbolQualitySignals { |
57 | bool Deprecated = false; |
58 | bool ReservedName = false; // __foo, _Foo are usually implementation details. |
59 | // FIXME: make these findable once user types _. |
60 | bool ImplementationDetail = false; |
61 | unsigned References = 0; |
62 | |
63 | enum SymbolCategory { |
64 | Unknown = 0, |
65 | Variable, |
66 | Macro, |
67 | Type, |
68 | Function, |
69 | Constructor, |
70 | Destructor, |
71 | Namespace, |
72 | Keyword, |
73 | Operator, |
74 | } Category = Unknown; |
75 | |
76 | void merge(const CodeCompletionResult &SemaCCResult); |
77 | void merge(const Symbol &IndexResult); |
78 | |
79 | // Condense these signals down to a single number, higher is better. |
80 | float evaluateHeuristics() const; |
81 | }; |
82 | llvm::raw_ostream &operator<<(llvm::raw_ostream &, |
83 | const SymbolQualitySignals &); |
84 | |
85 | /// Attributes of a symbol-query pair that affect how much we like it. |
86 | struct SymbolRelevanceSignals { |
87 | /// The name of the symbol (for ContextWords). Must be explicitly assigned. |
88 | llvm::StringRef Name; |
89 | /// 0-1+ fuzzy-match score for unqualified name. Must be explicitly assigned. |
90 | float NameMatch = 1; |
91 | /// Lowercase words relevant to the context (e.g. near the completion point). |
92 | llvm::StringSet<>* ContextWords = nullptr; |
93 | bool Forbidden = false; // Unavailable (e.g const) or inaccessible (private). |
94 | /// Whether fixits needs to be applied for that completion or not. |
95 | bool NeedsFixIts = false; |
96 | bool InBaseClass = false; // A member from base class of the accessed class. |
97 | |
98 | URIDistance *FileProximityMatch = nullptr; |
99 | /// These are used to calculate proximity between the index symbol and the |
100 | /// query. |
101 | llvm::StringRef SymbolURI; |
102 | /// FIXME: unify with index proximity score - signals should be |
103 | /// source-independent. |
104 | /// Proximity between best declaration and the query. [0-1], 1 is closest. |
105 | float SemaFileProximityScore = 0; |
106 | |
107 | // Scope proximity is only considered (both index and sema) when this is set. |
108 | ScopeDistance *ScopeProximityMatch = nullptr; |
109 | std::optional<llvm::StringRef> SymbolScope; |
110 | // A symbol from sema should be accessible from the current scope. |
111 | bool SemaSaysInScope = false; |
112 | |
113 | // An approximate measure of where we expect the symbol to be used. |
114 | enum AccessibleScope { |
115 | FunctionScope, |
116 | ClassScope, |
117 | FileScope, |
118 | GlobalScope, |
119 | } Scope = GlobalScope; |
120 | |
121 | enum QueryType { |
122 | CodeComplete, |
123 | Generic, |
124 | } Query = Generic; |
125 | |
126 | CodeCompletionContext::Kind Context = CodeCompletionContext::CCC_Other; |
127 | |
128 | // Whether symbol is an instance member of a class. |
129 | bool IsInstanceMember = false; |
130 | |
131 | // Whether clang provided a preferred type in the completion context. |
132 | bool HadContextType = false; |
133 | // Whether a source completion item or a symbol had a type information. |
134 | bool HadSymbolType = false; |
135 | // Whether the item matches the type expected in the completion context. |
136 | bool TypeMatchesPreferred = false; |
137 | |
138 | /// Length of the unqualified partial name of Symbol typed in |
139 | /// CompletionPrefix. |
140 | unsigned FilterLength = 0; |
141 | |
142 | const ASTSignals *MainFileSignals = nullptr; |
143 | /// Number of references to the candidate in the main file. |
144 | unsigned MainFileRefs = 0; |
145 | /// Number of unique symbols in the main file which belongs to candidate's |
146 | /// namespace. This indicates how relevant the namespace is in the current |
147 | /// file. |
148 | unsigned ScopeRefsInFile = 0; |
149 | |
150 | /// Set of derived signals computed by calculateDerivedSignals(). Must not be |
151 | /// set explicitly. |
152 | struct DerivedSignals { |
153 | /// Whether Name contains some word from context. |
154 | bool NameMatchesContext = false; |
155 | /// Min distance between SymbolURI and all the headers included by the TU. |
156 | unsigned FileProximityDistance = FileDistance::Unreachable; |
157 | /// Min distance between SymbolScope and all the available scopes. |
158 | unsigned ScopeProximityDistance = FileDistance::Unreachable; |
159 | }; |
160 | |
161 | DerivedSignals calculateDerivedSignals() const; |
162 | |
163 | void merge(const CodeCompletionResult &SemaResult); |
164 | void merge(const Symbol &IndexResult); |
165 | void computeASTSignals(const CodeCompletionResult &SemaResult); |
166 | |
167 | // Condense these signals down to a single number, higher is better. |
168 | float evaluateHeuristics() const; |
169 | }; |
170 | llvm::raw_ostream &operator<<(llvm::raw_ostream &, |
171 | const SymbolRelevanceSignals &); |
172 | |
173 | /// Combine symbol quality and relevance into a single score. |
174 | float evaluateSymbolAndRelevance(float SymbolQuality, float SymbolRelevance); |
175 | |
176 | /// Same semantics as CodeComplete::Score. Quality score and Relevance score |
177 | /// have been removed since DecisionForest cannot assign individual scores to |
178 | /// Quality and Relevance signals. |
179 | struct DecisionForestScores { |
180 | float Total = 0.f; |
181 | float ExcludingName = 0.f; |
182 | }; |
183 | |
184 | DecisionForestScores |
185 | evaluateDecisionForest(const SymbolQualitySignals &Quality, |
186 | const SymbolRelevanceSignals &Relevance, float Base); |
187 | |
188 | /// TopN<T> is a lossy container that preserves only the "best" N elements. |
189 | template <typename T, typename Compare = std::greater<T>> class TopN { |
190 | public: |
191 | using value_type = T; |
192 | TopN(size_t N, Compare Greater = Compare()) |
193 | : N(N), Greater(std::move(Greater)) {} |
194 | |
195 | // Adds a candidate to the set. |
196 | // Returns true if a candidate was dropped to get back under N. |
197 | bool push(value_type &&V) { |
198 | bool Dropped = false; |
199 | if (Heap.size() >= N) { |
200 | Dropped = true; |
201 | if (N > 0 && Greater(V, Heap.front())) { |
202 | std::pop_heap(Heap.begin(), Heap.end(), Greater); |
203 | Heap.back() = std::move(V); |
204 | std::push_heap(Heap.begin(), Heap.end(), Greater); |
205 | } |
206 | } else { |
207 | Heap.push_back(std::move(V)); |
208 | std::push_heap(Heap.begin(), Heap.end(), Greater); |
209 | } |
210 | assert(Heap.size() <= N); |
211 | assert(std::is_heap(Heap.begin(), Heap.end(), Greater)); |
212 | return Dropped; |
213 | } |
214 | |
215 | // Returns candidates from best to worst. |
216 | std::vector<value_type> items() && { |
217 | std::sort_heap(Heap.begin(), Heap.end(), Greater); |
218 | assert(Heap.size() <= N); |
219 | return std::move(Heap); |
220 | } |
221 | |
222 | private: |
223 | const size_t N; |
224 | std::vector<value_type> Heap; // Min-heap, comparator is Greater. |
225 | Compare Greater; |
226 | }; |
227 | |
228 | /// Returns a string that sorts in the same order as (-Score, Tiebreak), for |
229 | /// LSP. (The highest score compares smallest so it sorts at the top). |
230 | std::string sortText(float Score, llvm::StringRef Tiebreak = "" ); |
231 | |
232 | struct SignatureQualitySignals { |
233 | uint32_t NumberOfParameters = 0; |
234 | uint32_t NumberOfOptionalParameters = 0; |
235 | CodeCompleteConsumer::OverloadCandidate::CandidateKind Kind = |
236 | CodeCompleteConsumer::OverloadCandidate::CandidateKind::CK_Function; |
237 | }; |
238 | llvm::raw_ostream &operator<<(llvm::raw_ostream &, |
239 | const SignatureQualitySignals &); |
240 | |
241 | } // namespace clangd |
242 | } // namespace clang |
243 | |
244 | #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_QUALITY_H |
245 | |