1 | //===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "Dex.h" |
10 | #include "FileDistance.h" |
11 | #include "FuzzyMatch.h" |
12 | #include "Quality.h" |
13 | #include "URI.h" |
14 | #include "index/Index.h" |
15 | #include "index/dex/Iterator.h" |
16 | #include "index/dex/Token.h" |
17 | #include "index/dex/Trigram.h" |
18 | #include "support/Logger.h" |
19 | #include "support/Trace.h" |
20 | #include "llvm/ADT/DenseMap.h" |
21 | #include "llvm/ADT/StringMap.h" |
22 | #include "llvm/ADT/StringSet.h" |
23 | #include "llvm/Support/Path.h" |
24 | #include "llvm/Support/ScopedPrinter.h" |
25 | #include <algorithm> |
26 | #include <optional> |
27 | #include <queue> |
28 | #include <utility> |
29 | #include <vector> |
30 | |
31 | namespace clang { |
32 | namespace clangd { |
33 | namespace dex { |
34 | |
35 | std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs, |
36 | RelationSlab Rels, |
37 | bool SupportContainedRefs) { |
38 | auto Size = Symbols.bytes() + Refs.bytes(); |
39 | // There is no need to include "Rels" in Data because the relations are self- |
40 | // contained, without references into a backing store. |
41 | auto Data = std::make_pair(x: std::move(Symbols), y: std::move(Refs)); |
42 | return std::make_unique<Dex>(args&: Data.first, args&: Data.second, args&: Rels, args: std::move(Data), |
43 | args&: Size, args&: SupportContainedRefs); |
44 | } |
45 | |
46 | namespace { |
47 | |
48 | // Mark symbols which are can be used for code completion. |
49 | const Token RestrictedForCodeCompletion = |
50 | Token(Token::Kind::Sentinel, "Restricted For Code Completion" ); |
51 | |
52 | // Helper to efficiently assemble the inverse index (token -> matching docs). |
53 | // The output is a nice uniform structure keyed on Token, but constructing |
54 | // the Token object every time we want to insert into the map is wasteful. |
55 | // Instead we have various maps keyed on things that are cheap to compute, |
56 | // and produce the Token keys once at the end. |
57 | class IndexBuilder { |
58 | llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs; |
59 | std::vector<DocID> RestrictedCCDocs; |
60 | llvm::StringMap<std::vector<DocID>> TypeDocs; |
61 | llvm::StringMap<std::vector<DocID>> ScopeDocs; |
62 | llvm::StringMap<std::vector<DocID>> ProximityDocs; |
63 | std::vector<Trigram> TrigramScratch; |
64 | |
65 | public: |
66 | // Add the tokens which are given symbol's characteristics. |
67 | // This includes fuzzy matching trigrams, symbol's scope, etc. |
68 | // FIXME(kbobyrev): Support more token types: |
69 | // * Namespace proximity |
70 | void add(const Symbol &Sym, DocID D) { |
71 | generateIdentifierTrigrams(Identifier: Sym.Name, Out&: TrigramScratch); |
72 | for (Trigram T : TrigramScratch) |
73 | TrigramDocs[T].push_back(x: D); |
74 | ScopeDocs[Sym.Scope].push_back(x: D); |
75 | if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty()) |
76 | for (const auto &ProximityURI : |
77 | generateProximityURIs(Sym.CanonicalDeclaration.FileURI)) |
78 | ProximityDocs[ProximityURI].push_back(x: D); |
79 | if (Sym.Flags & Symbol::IndexedForCodeCompletion) |
80 | RestrictedCCDocs.push_back(x: D); |
81 | if (!Sym.Type.empty()) |
82 | TypeDocs[Sym.Type].push_back(x: D); |
83 | } |
84 | |
85 | // Assemble the final compressed posting lists for the added symbols. |
86 | llvm::DenseMap<Token, PostingList> build() && { |
87 | llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/ |
88 | TrigramDocs.size() + |
89 | RestrictedCCDocs.size() + |
90 | TypeDocs.size() + |
91 | ScopeDocs.size() + |
92 | ProximityDocs.size()); |
93 | // Tear down intermediate structs as we go to reduce memory usage. |
94 | // Since we're trying to get rid of underlying allocations, clearing the |
95 | // containers is not enough. |
96 | auto CreatePostingList = |
97 | [&Result](Token::Kind TK, llvm::StringMap<std::vector<DocID>> &Docs) { |
98 | for (auto &E : Docs) { |
99 | Result.try_emplace(Key: Token(TK, E.first()), Args&: E.second); |
100 | E.second = {}; |
101 | } |
102 | Docs = {}; |
103 | }; |
104 | CreatePostingList(Token::Kind::Type, TypeDocs); |
105 | CreatePostingList(Token::Kind::Scope, ScopeDocs); |
106 | CreatePostingList(Token::Kind::ProximityURI, ProximityDocs); |
107 | |
108 | // TrigramDocs are stored in a DenseMap and RestrictedCCDocs is not even a |
109 | // map, treat them specially. |
110 | for (auto &E : TrigramDocs) { |
111 | Result.try_emplace(Key: Token(Token::Kind::Trigram, E.first.str()), Args&: E.second); |
112 | E.second = {}; |
113 | } |
114 | TrigramDocs = llvm::DenseMap<Trigram, std::vector<DocID>>{}; |
115 | if (!RestrictedCCDocs.empty()) |
116 | Result.try_emplace(Key: RestrictedForCodeCompletion, |
117 | Args: std::move(RestrictedCCDocs)); |
118 | return Result; |
119 | } |
120 | }; |
121 | |
122 | } // namespace |
123 | |
124 | void Dex::buildIndex(bool SupportContainedRefs) { |
125 | this->Corpus = dex::Corpus(Symbols.size()); |
126 | std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size()); |
127 | |
128 | for (size_t I = 0; I < Symbols.size(); ++I) { |
129 | const Symbol *Sym = Symbols[I]; |
130 | LookupTable[Sym->ID] = Sym; |
131 | ScoredSymbols[I] = {quality(S: *Sym), Sym}; |
132 | } |
133 | |
134 | // Symbols are sorted by symbol qualities so that items in the posting lists |
135 | // are stored in the descending order of symbol quality. |
136 | llvm::sort(C&: ScoredSymbols, Comp: std::greater<std::pair<float, const Symbol *>>()); |
137 | |
138 | // SymbolQuality was empty up until now. |
139 | SymbolQuality.resize(new_size: Symbols.size()); |
140 | // Populate internal storage using Symbol + Score pairs. |
141 | for (size_t I = 0; I < ScoredSymbols.size(); ++I) { |
142 | SymbolQuality[I] = ScoredSymbols[I].first; |
143 | Symbols[I] = ScoredSymbols[I].second; |
144 | } |
145 | |
146 | // Build posting lists for symbols. |
147 | IndexBuilder Builder; |
148 | for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) |
149 | Builder.add(Sym: *Symbols[SymbolRank], D: SymbolRank); |
150 | InvertedIndex = std::move(Builder).build(); |
151 | |
152 | // If the containedRefs() operation is supported, build the RevRefs |
153 | // data structure used to implement it. |
154 | if (!SupportContainedRefs) |
155 | return; |
156 | for (const auto &[ID, RefList] : Refs) |
157 | for (const auto &R : RefList) |
158 | if ((R.Kind & ContainedRefsRequest::SupportedRefKinds) != |
159 | RefKind::Unknown) |
160 | RevRefs.emplace_back(args: R, args: ID); |
161 | // Sort by container ID so we can use binary search for lookup. |
162 | llvm::sort(C&: RevRefs, Comp: [](const RevRef &A, const RevRef &B) { |
163 | return A.ref().Container < B.ref().Container; |
164 | }); |
165 | } |
166 | |
167 | std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const { |
168 | auto It = InvertedIndex.find(Val: Tok); |
169 | return It == InvertedIndex.end() ? Corpus.none() |
170 | : It->second.iterator(Tok: &It->first); |
171 | } |
172 | |
173 | // Constructs BOOST iterators for Path Proximities. |
174 | std::unique_ptr<Iterator> Dex::createFileProximityIterator( |
175 | llvm::ArrayRef<std::string> ProximityPaths) const { |
176 | std::vector<std::unique_ptr<Iterator>> BoostingIterators; |
177 | // Deduplicate parent URIs extracted from the ProximityPaths. |
178 | llvm::StringSet<> ParentURIs; |
179 | llvm::StringMap<SourceParams> Sources; |
180 | for (const auto &Path : ProximityPaths) { |
181 | Sources[Path] = SourceParams(); |
182 | auto PathURI = URI::create(AbsolutePath: Path).toString(); |
183 | const auto PathProximityURIs = generateProximityURIs(PathURI.c_str()); |
184 | ParentURIs.insert_range(R: PathProximityURIs); |
185 | } |
186 | // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults |
187 | // for all parameters except for Proximity Path distance signal. |
188 | SymbolRelevanceSignals PathProximitySignals; |
189 | // DistanceCalculator will find the shortest distance from ProximityPaths to |
190 | // any URI extracted from the ProximityPaths. |
191 | URIDistance DistanceCalculator(Sources); |
192 | PathProximitySignals.FileProximityMatch = &DistanceCalculator; |
193 | // Try to build BOOST iterator for each Proximity Path provided by |
194 | // ProximityPaths. Boosting factor should depend on the distance to the |
195 | // Proximity Path: the closer processed path is, the higher boosting factor. |
196 | for (const auto &ParentURI : ParentURIs.keys()) { |
197 | // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator. |
198 | auto It = iterator(Tok: Token(Token::Kind::ProximityURI, ParentURI)); |
199 | if (It->kind() != Iterator::Kind::False) { |
200 | PathProximitySignals.SymbolURI = ParentURI; |
201 | BoostingIterators.push_back(x: Corpus.boost( |
202 | Child: std::move(It), Factor: PathProximitySignals.evaluateHeuristics())); |
203 | } |
204 | } |
205 | BoostingIterators.push_back(x: Corpus.all()); |
206 | return Corpus.unionOf(Children: std::move(BoostingIterators)); |
207 | } |
208 | |
209 | // Constructs BOOST iterators for preferred types. |
210 | std::unique_ptr<Iterator> |
211 | Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const { |
212 | std::vector<std::unique_ptr<Iterator>> BoostingIterators; |
213 | SymbolRelevanceSignals PreferredTypeSignals; |
214 | PreferredTypeSignals.TypeMatchesPreferred = true; |
215 | auto Boost = PreferredTypeSignals.evaluateHeuristics(); |
216 | for (const auto &T : Types) |
217 | BoostingIterators.push_back( |
218 | x: Corpus.boost(Child: iterator(Tok: Token(Token::Kind::Type, T)), Factor: Boost)); |
219 | BoostingIterators.push_back(x: Corpus.all()); |
220 | return Corpus.unionOf(Children: std::move(BoostingIterators)); |
221 | } |
222 | |
223 | /// Constructs iterators over tokens extracted from the query and exhausts it |
224 | /// while applying Callback to each symbol in the order of decreasing quality |
225 | /// of the matched symbols. |
226 | bool Dex::fuzzyFind(const FuzzyFindRequest &Req, |
227 | llvm::function_ref<void(const Symbol &)> Callback) const { |
228 | assert(!StringRef(Req.Query).contains("::" ) && |
229 | "There must be no :: in query." ); |
230 | trace::Span Tracer("Dex fuzzyFind" ); |
231 | FuzzyMatcher Filter(Req.Query); |
232 | // For short queries we use specialized trigrams that don't yield all results. |
233 | // Prevent clients from postfiltering them for longer queries. |
234 | bool More = !Req.Query.empty() && Req.Query.size() < 3; |
235 | |
236 | std::vector<std::unique_ptr<Iterator>> Criteria; |
237 | const auto TrigramTokens = generateQueryTrigrams(Query: Req.Query); |
238 | |
239 | // Generate query trigrams and construct AND iterator over all query |
240 | // trigrams. |
241 | std::vector<std::unique_ptr<Iterator>> TrigramIterators; |
242 | for (const auto &Trigram : TrigramTokens) |
243 | TrigramIterators.push_back(x: iterator(Tok: Trigram)); |
244 | Criteria.push_back(x: Corpus.intersect(Children: std::move(TrigramIterators))); |
245 | |
246 | // Generate scope tokens for search query. |
247 | std::vector<std::unique_ptr<Iterator>> ScopeIterators; |
248 | for (const auto &Scope : Req.Scopes) |
249 | ScopeIterators.push_back(x: iterator(Tok: Token(Token::Kind::Scope, Scope))); |
250 | if (Req.AnyScope) |
251 | ScopeIterators.push_back( |
252 | x: Corpus.boost(Child: Corpus.all(), Factor: ScopeIterators.empty() ? 1.0 : 0.2)); |
253 | Criteria.push_back(x: Corpus.unionOf(Children: std::move(ScopeIterators))); |
254 | |
255 | // Add proximity paths boosting (all symbols, some boosted). |
256 | Criteria.push_back(x: createFileProximityIterator(ProximityPaths: Req.ProximityPaths)); |
257 | // Add boosting for preferred types. |
258 | Criteria.push_back(x: createTypeBoostingIterator(Types: Req.PreferredTypes)); |
259 | |
260 | if (Req.RestrictForCodeCompletion) |
261 | Criteria.push_back(x: iterator(Tok: RestrictedForCodeCompletion)); |
262 | |
263 | // Use TRUE iterator if both trigrams and scopes from the query are not |
264 | // present in the symbol index. |
265 | auto Root = Corpus.intersect(Children: std::move(Criteria)); |
266 | // Retrieve more items than it was requested: some of the items with high |
267 | // final score might not be retrieved otherwise. |
268 | // FIXME(kbobyrev): Tune this ratio. |
269 | if (Req.Limit) |
270 | Root = Corpus.limit(Child: std::move(Root), Limit: *Req.Limit * 100); |
271 | SPAN_ATTACH(Tracer, "query" , llvm::to_string(*Root)); |
272 | vlog(Fmt: "Dex query tree: {0}" , Vals&: *Root); |
273 | |
274 | using IDAndScore = std::pair<DocID, float>; |
275 | std::vector<IDAndScore> IDAndScores = consume(It&: *Root); |
276 | |
277 | auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) { |
278 | return LHS.second > RHS.second; |
279 | }; |
280 | TopN<IDAndScore, decltype(Compare)> Top( |
281 | Req.Limit.value_or(u: std::numeric_limits<size_t>::max()), Compare); |
282 | for (const auto &IDAndScore : IDAndScores) { |
283 | const DocID SymbolDocID = IDAndScore.first; |
284 | const auto *Sym = Symbols[SymbolDocID]; |
285 | const std::optional<float> Score = Filter.match(Word: Sym->Name); |
286 | if (!Score) |
287 | continue; |
288 | // Combine Fuzzy Matching score, precomputed symbol quality and boosting |
289 | // score for a cumulative final symbol score. |
290 | const float FinalScore = |
291 | (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second; |
292 | // If Top.push(...) returns true, it means that it had to pop an item. In |
293 | // this case, it is possible to retrieve more symbols. |
294 | if (Top.push(V: {SymbolDocID, FinalScore})) |
295 | More = true; |
296 | } |
297 | |
298 | // Apply callback to the top Req.Limit items in the descending |
299 | // order of cumulative score. |
300 | for (const auto &Item : std::move(Top).items()) |
301 | Callback(*Symbols[Item.first]); |
302 | return More; |
303 | } |
304 | |
305 | void Dex::lookup(const LookupRequest &Req, |
306 | llvm::function_ref<void(const Symbol &)> Callback) const { |
307 | trace::Span Tracer("Dex lookup" ); |
308 | for (const auto &ID : Req.IDs) { |
309 | auto I = LookupTable.find(Val: ID); |
310 | if (I != LookupTable.end()) |
311 | Callback(*I->second); |
312 | } |
313 | } |
314 | |
315 | bool Dex::refs(const RefsRequest &Req, |
316 | llvm::function_ref<void(const Ref &)> Callback) const { |
317 | trace::Span Tracer("Dex refs" ); |
318 | uint32_t Remaining = Req.Limit.value_or(u: std::numeric_limits<uint32_t>::max()); |
319 | for (const auto &ID : Req.IDs) |
320 | for (const auto &Ref : Refs.lookup(Val: ID)) { |
321 | if (!static_cast<int>(Req.Filter & Ref.Kind)) |
322 | continue; |
323 | if (Remaining == 0) |
324 | return true; // More refs were available. |
325 | --Remaining; |
326 | Callback(Ref); |
327 | } |
328 | return false; // We reported all refs. |
329 | } |
330 | |
331 | llvm::iterator_range<std::vector<Dex::RevRef>::const_iterator> |
332 | Dex::lookupRevRefs(const SymbolID &Container) const { |
333 | // equal_range() requires an element of the same type as the elements of the |
334 | // range, so construct a dummy RevRef with the container of interest. |
335 | Ref QueryRef; |
336 | QueryRef.Container = Container; |
337 | RevRef Query(QueryRef, SymbolID{}); |
338 | |
339 | auto ItPair = std::equal_range(first: RevRefs.cbegin(), last: RevRefs.cend(), val: Query, |
340 | comp: [](const RevRef &A, const RevRef &B) { |
341 | return A.ref().Container < B.ref().Container; |
342 | }); |
343 | return {ItPair.first, ItPair.second}; |
344 | } |
345 | |
346 | bool Dex::containedRefs( |
347 | const ContainedRefsRequest &Req, |
348 | llvm::function_ref<void(const ContainedRefsResult &)> Callback) const { |
349 | trace::Span Tracer("Dex reversed refs" ); |
350 | uint32_t Remaining = Req.Limit.value_or(u: std::numeric_limits<uint32_t>::max()); |
351 | for (const auto &Rev : lookupRevRefs(Container: Req.ID)) { |
352 | // RevRefs are already filtered to ContainedRefsRequest::SupportedRefKinds |
353 | if (Remaining == 0) |
354 | return true; // More refs were available. |
355 | --Remaining; |
356 | Callback(Rev.containedRefsResult()); |
357 | } |
358 | return false; // We reported all refs. |
359 | } |
360 | |
361 | void Dex::relations( |
362 | const RelationsRequest &Req, |
363 | llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const { |
364 | trace::Span Tracer("Dex relations" ); |
365 | uint32_t Remaining = Req.Limit.value_or(u: std::numeric_limits<uint32_t>::max()); |
366 | for (const SymbolID &Subject : Req.Subjects) { |
367 | LookupRequest LookupReq; |
368 | auto It = Relations.find( |
369 | Val: std::make_pair(x: Subject, y: static_cast<uint8_t>(Req.Predicate))); |
370 | if (It != Relations.end()) { |
371 | for (const auto &Object : It->second) { |
372 | if (Remaining > 0) { |
373 | --Remaining; |
374 | LookupReq.IDs.insert(V: Object); |
375 | } |
376 | } |
377 | } |
378 | lookup(Req: LookupReq, Callback: [&](const Symbol &Object) { Callback(Subject, Object); }); |
379 | } |
380 | } |
381 | |
382 | llvm::unique_function<IndexContents(llvm::StringRef) const> |
383 | Dex::indexedFiles() const { |
384 | return [this](llvm::StringRef FileURI) { |
385 | return Files.contains(key: FileURI) ? IdxContents : IndexContents::None; |
386 | }; |
387 | } |
388 | |
389 | size_t Dex::estimateMemoryUsage() const { |
390 | size_t Bytes = Symbols.size() * sizeof(const Symbol *); |
391 | Bytes += SymbolQuality.size() * sizeof(float); |
392 | Bytes += LookupTable.getMemorySize(); |
393 | Bytes += InvertedIndex.getMemorySize(); |
394 | for (const auto &TokenToPostingList : InvertedIndex) |
395 | Bytes += TokenToPostingList.second.bytes(); |
396 | Bytes += Refs.getMemorySize(); |
397 | Bytes += RevRefs.size() * sizeof(RevRef); |
398 | Bytes += Relations.getMemorySize(); |
399 | return Bytes + BackingDataSize; |
400 | } |
401 | |
402 | // Given foo://bar/one/two |
403 | // Returns ~~~~~~~~ (or empty for bad URI) |
404 | llvm::StringRef findPathInURI(llvm::StringRef S) { |
405 | S = S.split(Separator: ':').second; // Eat scheme. |
406 | if (S.consume_front(Prefix: "//" )) // Eat authority. |
407 | S = S.drop_until(F: [](char C) { return C == '/'; }); |
408 | return S; |
409 | } |
410 | |
411 | // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum |
412 | // size of resulting vector. Some projects might want to have higher limit if |
413 | // the file hierarchy is deeper. For the generic case, it would be useful to |
414 | // calculate Limit in the index build stage by calculating the maximum depth |
415 | // of the project source tree at runtime. |
416 | constexpr unsigned ProximityURILimit = 5; |
417 | |
418 | llvm::SmallVector<llvm::StringRef, ProximityURILimit> |
419 | generateProximityURIs(llvm::StringRef URI) { |
420 | // This function is hot when indexing, so don't parse/reserialize URIPath, |
421 | // just emit substrings of it instead. |
422 | // |
423 | // foo://bar/one/two |
424 | // ~~~~~~~~ |
425 | // Path |
426 | llvm::StringRef Path = findPathInURI(S: URI); |
427 | if (Path.empty()) |
428 | return {}; // Bad URI. |
429 | assert(Path.begin() >= URI.begin() && Path.begin() < URI.end() && |
430 | Path.end() == URI.end()); |
431 | |
432 | // The original is a proximity URI. |
433 | llvm::SmallVector<llvm::StringRef, ProximityURILimit> Result = {URI}; |
434 | // Form proximity URIs by chopping before each slash in the path part. |
435 | for (auto Slash = Path.rfind(C: '/'); Slash > 0 && Slash != StringRef::npos; |
436 | Slash = Path.rfind(C: '/')) { |
437 | Path = Path.substr(Start: 0, N: Slash); |
438 | Result.push_back(Elt: URI.substr(Start: 0, N: Path.end() - URI.data())); |
439 | if (Result.size() == ProximityURILimit) |
440 | return Result; |
441 | } |
442 | // The root foo://bar/ is a proximity URI. |
443 | if (Path.starts_with(Prefix: "/" )) |
444 | Result.push_back(Elt: URI.substr(Start: 0, N: Path.begin() + 1 - URI.data())); |
445 | return Result; |
446 | } |
447 | |
448 | } // namespace dex |
449 | } // namespace clangd |
450 | } // namespace clang |
451 | |