1 | //===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "Dex.h" |
10 | #include "FileDistance.h" |
11 | #include "FuzzyMatch.h" |
12 | #include "Quality.h" |
13 | #include "URI.h" |
14 | #include "index/Index.h" |
15 | #include "index/dex/Iterator.h" |
16 | #include "index/dex/Token.h" |
17 | #include "index/dex/Trigram.h" |
18 | #include "support/Logger.h" |
19 | #include "support/Trace.h" |
20 | #include "llvm/ADT/DenseMap.h" |
21 | #include "llvm/ADT/StringMap.h" |
22 | #include "llvm/ADT/StringSet.h" |
23 | #include "llvm/Support/Path.h" |
24 | #include "llvm/Support/ScopedPrinter.h" |
25 | #include <algorithm> |
26 | #include <optional> |
27 | #include <queue> |
28 | #include <utility> |
29 | #include <vector> |
30 | |
31 | namespace clang { |
32 | namespace clangd { |
33 | namespace dex { |
34 | |
35 | std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs, |
36 | RelationSlab Rels) { |
37 | auto Size = Symbols.bytes() + Refs.bytes(); |
38 | // There is no need to include "Rels" in Data because the relations are self- |
39 | // contained, without references into a backing store. |
40 | auto Data = std::make_pair(x: std::move(Symbols), y: std::move(Refs)); |
41 | return std::make_unique<Dex>(args&: Data.first, args&: Data.second, args&: Rels, args: std::move(Data), |
42 | args&: Size); |
43 | } |
44 | |
45 | namespace { |
46 | |
47 | // Mark symbols which are can be used for code completion. |
48 | const Token RestrictedForCodeCompletion = |
49 | Token(Token::Kind::Sentinel, "Restricted For Code Completion" ); |
50 | |
51 | // Helper to efficiently assemble the inverse index (token -> matching docs). |
52 | // The output is a nice uniform structure keyed on Token, but constructing |
53 | // the Token object every time we want to insert into the map is wasteful. |
54 | // Instead we have various maps keyed on things that are cheap to compute, |
55 | // and produce the Token keys once at the end. |
56 | class IndexBuilder { |
57 | llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs; |
58 | std::vector<DocID> RestrictedCCDocs; |
59 | llvm::StringMap<std::vector<DocID>> TypeDocs; |
60 | llvm::StringMap<std::vector<DocID>> ScopeDocs; |
61 | llvm::StringMap<std::vector<DocID>> ProximityDocs; |
62 | std::vector<Trigram> TrigramScratch; |
63 | |
64 | public: |
65 | // Add the tokens which are given symbol's characteristics. |
66 | // This includes fuzzy matching trigrams, symbol's scope, etc. |
67 | // FIXME(kbobyrev): Support more token types: |
68 | // * Namespace proximity |
69 | void add(const Symbol &Sym, DocID D) { |
70 | generateIdentifierTrigrams(Identifier: Sym.Name, Out&: TrigramScratch); |
71 | for (Trigram T : TrigramScratch) |
72 | TrigramDocs[T].push_back(x: D); |
73 | ScopeDocs[Sym.Scope].push_back(x: D); |
74 | if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty()) |
75 | for (const auto &ProximityURI : |
76 | generateProximityURIs(Sym.CanonicalDeclaration.FileURI)) |
77 | ProximityDocs[ProximityURI].push_back(x: D); |
78 | if (Sym.Flags & Symbol::IndexedForCodeCompletion) |
79 | RestrictedCCDocs.push_back(x: D); |
80 | if (!Sym.Type.empty()) |
81 | TypeDocs[Sym.Type].push_back(x: D); |
82 | } |
83 | |
84 | // Assemble the final compressed posting lists for the added symbols. |
85 | llvm::DenseMap<Token, PostingList> build() && { |
86 | llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/ |
87 | TrigramDocs.size() + |
88 | RestrictedCCDocs.size() + |
89 | TypeDocs.size() + |
90 | ScopeDocs.size() + |
91 | ProximityDocs.size()); |
92 | // Tear down intermediate structs as we go to reduce memory usage. |
93 | // Since we're trying to get rid of underlying allocations, clearing the |
94 | // containers is not enough. |
95 | auto CreatePostingList = |
96 | [&Result](Token::Kind TK, llvm::StringMap<std::vector<DocID>> &Docs) { |
97 | for (auto &E : Docs) { |
98 | Result.try_emplace(Key: Token(TK, E.first()), Args&: E.second); |
99 | E.second = {}; |
100 | } |
101 | Docs = {}; |
102 | }; |
103 | CreatePostingList(Token::Kind::Type, TypeDocs); |
104 | CreatePostingList(Token::Kind::Scope, ScopeDocs); |
105 | CreatePostingList(Token::Kind::ProximityURI, ProximityDocs); |
106 | |
107 | // TrigramDocs are stored in a DenseMap and RestrictedCCDocs is not even a |
108 | // map, treat them specially. |
109 | for (auto &E : TrigramDocs) { |
110 | Result.try_emplace(Key: Token(Token::Kind::Trigram, E.first.str()), Args&: E.second); |
111 | E.second = {}; |
112 | } |
113 | TrigramDocs = llvm::DenseMap<Trigram, std::vector<DocID>>{}; |
114 | if (!RestrictedCCDocs.empty()) |
115 | Result.try_emplace(Key: RestrictedForCodeCompletion, |
116 | Args: std::move(RestrictedCCDocs)); |
117 | return Result; |
118 | } |
119 | }; |
120 | |
121 | } // namespace |
122 | |
123 | void Dex::buildIndex() { |
124 | this->Corpus = dex::Corpus(Symbols.size()); |
125 | std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size()); |
126 | |
127 | for (size_t I = 0; I < Symbols.size(); ++I) { |
128 | const Symbol *Sym = Symbols[I]; |
129 | LookupTable[Sym->ID] = Sym; |
130 | ScoredSymbols[I] = {quality(S: *Sym), Sym}; |
131 | } |
132 | |
133 | // Symbols are sorted by symbol qualities so that items in the posting lists |
134 | // are stored in the descending order of symbol quality. |
135 | llvm::sort(C&: ScoredSymbols, Comp: std::greater<std::pair<float, const Symbol *>>()); |
136 | |
137 | // SymbolQuality was empty up until now. |
138 | SymbolQuality.resize(new_size: Symbols.size()); |
139 | // Populate internal storage using Symbol + Score pairs. |
140 | for (size_t I = 0; I < ScoredSymbols.size(); ++I) { |
141 | SymbolQuality[I] = ScoredSymbols[I].first; |
142 | Symbols[I] = ScoredSymbols[I].second; |
143 | } |
144 | |
145 | // Build posting lists for symbols. |
146 | IndexBuilder Builder; |
147 | for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) |
148 | Builder.add(Sym: *Symbols[SymbolRank], D: SymbolRank); |
149 | InvertedIndex = std::move(Builder).build(); |
150 | } |
151 | |
152 | std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const { |
153 | auto It = InvertedIndex.find(Val: Tok); |
154 | return It == InvertedIndex.end() ? Corpus.none() |
155 | : It->second.iterator(Tok: &It->first); |
156 | } |
157 | |
158 | // Constructs BOOST iterators for Path Proximities. |
159 | std::unique_ptr<Iterator> Dex::createFileProximityIterator( |
160 | llvm::ArrayRef<std::string> ProximityPaths) const { |
161 | std::vector<std::unique_ptr<Iterator>> BoostingIterators; |
162 | // Deduplicate parent URIs extracted from the ProximityPaths. |
163 | llvm::StringSet<> ParentURIs; |
164 | llvm::StringMap<SourceParams> Sources; |
165 | for (const auto &Path : ProximityPaths) { |
166 | Sources[Path] = SourceParams(); |
167 | auto PathURI = URI::create(AbsolutePath: Path).toString(); |
168 | const auto PathProximityURIs = generateProximityURIs(PathURI.c_str()); |
169 | for (const auto &ProximityURI : PathProximityURIs) |
170 | ParentURIs.insert(key: ProximityURI); |
171 | } |
172 | // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults |
173 | // for all parameters except for Proximity Path distance signal. |
174 | SymbolRelevanceSignals PathProximitySignals; |
175 | // DistanceCalculator will find the shortest distance from ProximityPaths to |
176 | // any URI extracted from the ProximityPaths. |
177 | URIDistance DistanceCalculator(Sources); |
178 | PathProximitySignals.FileProximityMatch = &DistanceCalculator; |
179 | // Try to build BOOST iterator for each Proximity Path provided by |
180 | // ProximityPaths. Boosting factor should depend on the distance to the |
181 | // Proximity Path: the closer processed path is, the higher boosting factor. |
182 | for (const auto &ParentURI : ParentURIs.keys()) { |
183 | // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator. |
184 | auto It = iterator(Tok: Token(Token::Kind::ProximityURI, ParentURI)); |
185 | if (It->kind() != Iterator::Kind::False) { |
186 | PathProximitySignals.SymbolURI = ParentURI; |
187 | BoostingIterators.push_back(x: Corpus.boost( |
188 | Child: std::move(It), Factor: PathProximitySignals.evaluateHeuristics())); |
189 | } |
190 | } |
191 | BoostingIterators.push_back(x: Corpus.all()); |
192 | return Corpus.unionOf(Children: std::move(BoostingIterators)); |
193 | } |
194 | |
195 | // Constructs BOOST iterators for preferred types. |
196 | std::unique_ptr<Iterator> |
197 | Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const { |
198 | std::vector<std::unique_ptr<Iterator>> BoostingIterators; |
199 | SymbolRelevanceSignals PreferredTypeSignals; |
200 | PreferredTypeSignals.TypeMatchesPreferred = true; |
201 | auto Boost = PreferredTypeSignals.evaluateHeuristics(); |
202 | for (const auto &T : Types) |
203 | BoostingIterators.push_back( |
204 | x: Corpus.boost(Child: iterator(Tok: Token(Token::Kind::Type, T)), Factor: Boost)); |
205 | BoostingIterators.push_back(x: Corpus.all()); |
206 | return Corpus.unionOf(Children: std::move(BoostingIterators)); |
207 | } |
208 | |
209 | /// Constructs iterators over tokens extracted from the query and exhausts it |
210 | /// while applying Callback to each symbol in the order of decreasing quality |
211 | /// of the matched symbols. |
212 | bool Dex::fuzzyFind(const FuzzyFindRequest &Req, |
213 | llvm::function_ref<void(const Symbol &)> Callback) const { |
214 | assert(!StringRef(Req.Query).contains("::" ) && |
215 | "There must be no :: in query." ); |
216 | trace::Span Tracer("Dex fuzzyFind" ); |
217 | FuzzyMatcher Filter(Req.Query); |
218 | // For short queries we use specialized trigrams that don't yield all results. |
219 | // Prevent clients from postfiltering them for longer queries. |
220 | bool More = !Req.Query.empty() && Req.Query.size() < 3; |
221 | |
222 | std::vector<std::unique_ptr<Iterator>> Criteria; |
223 | const auto TrigramTokens = generateQueryTrigrams(Query: Req.Query); |
224 | |
225 | // Generate query trigrams and construct AND iterator over all query |
226 | // trigrams. |
227 | std::vector<std::unique_ptr<Iterator>> TrigramIterators; |
228 | for (const auto &Trigram : TrigramTokens) |
229 | TrigramIterators.push_back(x: iterator(Tok: Trigram)); |
230 | Criteria.push_back(x: Corpus.intersect(Children: std::move(TrigramIterators))); |
231 | |
232 | // Generate scope tokens for search query. |
233 | std::vector<std::unique_ptr<Iterator>> ScopeIterators; |
234 | for (const auto &Scope : Req.Scopes) |
235 | ScopeIterators.push_back(x: iterator(Tok: Token(Token::Kind::Scope, Scope))); |
236 | if (Req.AnyScope) |
237 | ScopeIterators.push_back( |
238 | x: Corpus.boost(Child: Corpus.all(), Factor: ScopeIterators.empty() ? 1.0 : 0.2)); |
239 | Criteria.push_back(x: Corpus.unionOf(Children: std::move(ScopeIterators))); |
240 | |
241 | // Add proximity paths boosting (all symbols, some boosted). |
242 | Criteria.push_back(x: createFileProximityIterator(ProximityPaths: Req.ProximityPaths)); |
243 | // Add boosting for preferred types. |
244 | Criteria.push_back(x: createTypeBoostingIterator(Types: Req.PreferredTypes)); |
245 | |
246 | if (Req.RestrictForCodeCompletion) |
247 | Criteria.push_back(x: iterator(Tok: RestrictedForCodeCompletion)); |
248 | |
249 | // Use TRUE iterator if both trigrams and scopes from the query are not |
250 | // present in the symbol index. |
251 | auto Root = Corpus.intersect(Children: std::move(Criteria)); |
252 | // Retrieve more items than it was requested: some of the items with high |
253 | // final score might not be retrieved otherwise. |
254 | // FIXME(kbobyrev): Tune this ratio. |
255 | if (Req.Limit) |
256 | Root = Corpus.limit(Child: std::move(Root), Limit: *Req.Limit * 100); |
257 | SPAN_ATTACH(Tracer, "query" , llvm::to_string(*Root)); |
258 | vlog(Fmt: "Dex query tree: {0}" , Vals&: *Root); |
259 | |
260 | using IDAndScore = std::pair<DocID, float>; |
261 | std::vector<IDAndScore> IDAndScores = consume(It&: *Root); |
262 | |
263 | auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) { |
264 | return LHS.second > RHS.second; |
265 | }; |
266 | TopN<IDAndScore, decltype(Compare)> Top( |
267 | Req.Limit ? *Req.Limit : std::numeric_limits<size_t>::max(), Compare); |
268 | for (const auto &IDAndScore : IDAndScores) { |
269 | const DocID SymbolDocID = IDAndScore.first; |
270 | const auto *Sym = Symbols[SymbolDocID]; |
271 | const std::optional<float> Score = Filter.match(Word: Sym->Name); |
272 | if (!Score) |
273 | continue; |
274 | // Combine Fuzzy Matching score, precomputed symbol quality and boosting |
275 | // score for a cumulative final symbol score. |
276 | const float FinalScore = |
277 | (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second; |
278 | // If Top.push(...) returns true, it means that it had to pop an item. In |
279 | // this case, it is possible to retrieve more symbols. |
280 | if (Top.push(V: {SymbolDocID, FinalScore})) |
281 | More = true; |
282 | } |
283 | |
284 | // Apply callback to the top Req.Limit items in the descending |
285 | // order of cumulative score. |
286 | for (const auto &Item : std::move(Top).items()) |
287 | Callback(*Symbols[Item.first]); |
288 | return More; |
289 | } |
290 | |
291 | void Dex::lookup(const LookupRequest &Req, |
292 | llvm::function_ref<void(const Symbol &)> Callback) const { |
293 | trace::Span Tracer("Dex lookup" ); |
294 | for (const auto &ID : Req.IDs) { |
295 | auto I = LookupTable.find(Val: ID); |
296 | if (I != LookupTable.end()) |
297 | Callback(*I->second); |
298 | } |
299 | } |
300 | |
301 | bool Dex::refs(const RefsRequest &Req, |
302 | llvm::function_ref<void(const Ref &)> Callback) const { |
303 | trace::Span Tracer("Dex refs" ); |
304 | uint32_t Remaining = Req.Limit.value_or(u: std::numeric_limits<uint32_t>::max()); |
305 | for (const auto &ID : Req.IDs) |
306 | for (const auto &Ref : Refs.lookup(Val: ID)) { |
307 | if (!static_cast<int>(Req.Filter & Ref.Kind)) |
308 | continue; |
309 | if (Remaining == 0) |
310 | return true; // More refs were available. |
311 | --Remaining; |
312 | Callback(Ref); |
313 | } |
314 | return false; // We reported all refs. |
315 | } |
316 | |
317 | void Dex::relations( |
318 | const RelationsRequest &Req, |
319 | llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const { |
320 | trace::Span Tracer("Dex relations" ); |
321 | uint32_t Remaining = Req.Limit.value_or(u: std::numeric_limits<uint32_t>::max()); |
322 | for (const SymbolID &Subject : Req.Subjects) { |
323 | LookupRequest LookupReq; |
324 | auto It = Relations.find( |
325 | Val: std::make_pair(x: Subject, y: static_cast<uint8_t>(Req.Predicate))); |
326 | if (It != Relations.end()) { |
327 | for (const auto &Object : It->second) { |
328 | if (Remaining > 0) { |
329 | --Remaining; |
330 | LookupReq.IDs.insert(V: Object); |
331 | } |
332 | } |
333 | } |
334 | lookup(Req: LookupReq, Callback: [&](const Symbol &Object) { Callback(Subject, Object); }); |
335 | } |
336 | } |
337 | |
338 | llvm::unique_function<IndexContents(llvm::StringRef) const> |
339 | Dex::indexedFiles() const { |
340 | return [this](llvm::StringRef FileURI) { |
341 | return Files.contains(key: FileURI) ? IdxContents : IndexContents::None; |
342 | }; |
343 | } |
344 | |
345 | size_t Dex::estimateMemoryUsage() const { |
346 | size_t Bytes = Symbols.size() * sizeof(const Symbol *); |
347 | Bytes += SymbolQuality.size() * sizeof(float); |
348 | Bytes += LookupTable.getMemorySize(); |
349 | Bytes += InvertedIndex.getMemorySize(); |
350 | for (const auto &TokenToPostingList : InvertedIndex) |
351 | Bytes += TokenToPostingList.second.bytes(); |
352 | Bytes += Refs.getMemorySize(); |
353 | Bytes += Relations.getMemorySize(); |
354 | return Bytes + BackingDataSize; |
355 | } |
356 | |
357 | // Given foo://bar/one/two |
358 | // Returns ~~~~~~~~ (or empty for bad URI) |
359 | llvm::StringRef findPathInURI(llvm::StringRef S) { |
360 | S = S.split(Separator: ':').second; // Eat scheme. |
361 | if (S.consume_front(Prefix: "//" )) // Eat authority. |
362 | S = S.drop_until(F: [](char C) { return C == '/'; }); |
363 | return S; |
364 | } |
365 | |
366 | // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum |
367 | // size of resulting vector. Some projects might want to have higher limit if |
368 | // the file hierarchy is deeper. For the generic case, it would be useful to |
369 | // calculate Limit in the index build stage by calculating the maximum depth |
370 | // of the project source tree at runtime. |
371 | constexpr unsigned ProximityURILimit = 5; |
372 | |
373 | llvm::SmallVector<llvm::StringRef, ProximityURILimit> |
374 | generateProximityURIs(llvm::StringRef URI) { |
375 | // This function is hot when indexing, so don't parse/reserialize URIPath, |
376 | // just emit substrings of it instead. |
377 | // |
378 | // foo://bar/one/two |
379 | // ~~~~~~~~ |
380 | // Path |
381 | llvm::StringRef Path = findPathInURI(S: URI); |
382 | if (Path.empty()) |
383 | return {}; // Bad URI. |
384 | assert(Path.begin() >= URI.begin() && Path.begin() < URI.end() && |
385 | Path.end() == URI.end()); |
386 | |
387 | // The original is a proximity URI. |
388 | llvm::SmallVector<llvm::StringRef, ProximityURILimit> Result = {URI}; |
389 | // Form proximity URIs by chopping before each slash in the path part. |
390 | for (auto Slash = Path.rfind(C: '/'); Slash > 0 && Slash != StringRef::npos; |
391 | Slash = Path.rfind(C: '/')) { |
392 | Path = Path.substr(Start: 0, N: Slash); |
393 | Result.push_back(Elt: URI.substr(Start: 0, N: Path.end() - URI.data())); |
394 | if (Result.size() == ProximityURILimit) |
395 | return Result; |
396 | } |
397 | // The root foo://bar/ is a proximity URI. |
398 | if (Path.starts_with(Prefix: "/" )) |
399 | Result.push_back(Elt: URI.substr(Start: 0, N: Path.begin() + 1 - URI.data())); |
400 | return Result; |
401 | } |
402 | |
403 | } // namespace dex |
404 | } // namespace clangd |
405 | } // namespace clang |
406 | |