1//===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "Dex.h"
10#include "FileDistance.h"
11#include "FuzzyMatch.h"
12#include "Quality.h"
13#include "URI.h"
14#include "index/Index.h"
15#include "index/dex/Iterator.h"
16#include "index/dex/Token.h"
17#include "index/dex/Trigram.h"
18#include "support/Logger.h"
19#include "support/Trace.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/StringMap.h"
22#include "llvm/ADT/StringSet.h"
23#include "llvm/Support/Path.h"
24#include "llvm/Support/ScopedPrinter.h"
25#include <algorithm>
26#include <optional>
27#include <queue>
28#include <utility>
29#include <vector>
30
31namespace clang {
32namespace clangd {
33namespace dex {
34
35std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs,
36 RelationSlab Rels,
37 bool SupportContainedRefs) {
38 auto Size = Symbols.bytes() + Refs.bytes();
39 // There is no need to include "Rels" in Data because the relations are self-
40 // contained, without references into a backing store.
41 auto Data = std::make_pair(x: std::move(Symbols), y: std::move(Refs));
42 return std::make_unique<Dex>(args&: Data.first, args&: Data.second, args&: Rels, args: std::move(Data),
43 args&: Size, args&: SupportContainedRefs);
44}
45
46namespace {
47
48// Mark symbols which are can be used for code completion.
49const Token RestrictedForCodeCompletion =
50 Token(Token::Kind::Sentinel, "Restricted For Code Completion");
51
52// Helper to efficiently assemble the inverse index (token -> matching docs).
53// The output is a nice uniform structure keyed on Token, but constructing
54// the Token object every time we want to insert into the map is wasteful.
55// Instead we have various maps keyed on things that are cheap to compute,
56// and produce the Token keys once at the end.
57class IndexBuilder {
58 llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
59 std::vector<DocID> RestrictedCCDocs;
60 llvm::StringMap<std::vector<DocID>> TypeDocs;
61 llvm::StringMap<std::vector<DocID>> ScopeDocs;
62 llvm::StringMap<std::vector<DocID>> ProximityDocs;
63 std::vector<Trigram> TrigramScratch;
64
65public:
66 // Add the tokens which are given symbol's characteristics.
67 // This includes fuzzy matching trigrams, symbol's scope, etc.
68 // FIXME(kbobyrev): Support more token types:
69 // * Namespace proximity
70 void add(const Symbol &Sym, DocID D) {
71 generateIdentifierTrigrams(Identifier: Sym.Name, Out&: TrigramScratch);
72 for (Trigram T : TrigramScratch)
73 TrigramDocs[T].push_back(x: D);
74 ScopeDocs[Sym.Scope].push_back(x: D);
75 if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
76 for (const auto &ProximityURI :
77 generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
78 ProximityDocs[ProximityURI].push_back(x: D);
79 if (Sym.Flags & Symbol::IndexedForCodeCompletion)
80 RestrictedCCDocs.push_back(x: D);
81 if (!Sym.Type.empty())
82 TypeDocs[Sym.Type].push_back(x: D);
83 }
84
85 // Assemble the final compressed posting lists for the added symbols.
86 llvm::DenseMap<Token, PostingList> build() && {
87 llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/
88 TrigramDocs.size() +
89 RestrictedCCDocs.size() +
90 TypeDocs.size() +
91 ScopeDocs.size() +
92 ProximityDocs.size());
93 // Tear down intermediate structs as we go to reduce memory usage.
94 // Since we're trying to get rid of underlying allocations, clearing the
95 // containers is not enough.
96 auto CreatePostingList =
97 [&Result](Token::Kind TK, llvm::StringMap<std::vector<DocID>> &Docs) {
98 for (auto &E : Docs) {
99 Result.try_emplace(Key: Token(TK, E.first()), Args&: E.second);
100 E.second = {};
101 }
102 Docs = {};
103 };
104 CreatePostingList(Token::Kind::Type, TypeDocs);
105 CreatePostingList(Token::Kind::Scope, ScopeDocs);
106 CreatePostingList(Token::Kind::ProximityURI, ProximityDocs);
107
108 // TrigramDocs are stored in a DenseMap and RestrictedCCDocs is not even a
109 // map, treat them specially.
110 for (auto &E : TrigramDocs) {
111 Result.try_emplace(Key: Token(Token::Kind::Trigram, E.first.str()), Args&: E.second);
112 E.second = {};
113 }
114 TrigramDocs = llvm::DenseMap<Trigram, std::vector<DocID>>{};
115 if (!RestrictedCCDocs.empty())
116 Result.try_emplace(Key: RestrictedForCodeCompletion,
117 Args: std::move(RestrictedCCDocs));
118 return Result;
119 }
120};
121
122} // namespace
123
124void Dex::buildIndex(bool SupportContainedRefs) {
125 this->Corpus = dex::Corpus(Symbols.size());
126 std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
127
128 for (size_t I = 0; I < Symbols.size(); ++I) {
129 const Symbol *Sym = Symbols[I];
130 LookupTable[Sym->ID] = Sym;
131 ScoredSymbols[I] = {quality(S: *Sym), Sym};
132 }
133
134 // Symbols are sorted by symbol qualities so that items in the posting lists
135 // are stored in the descending order of symbol quality.
136 llvm::sort(C&: ScoredSymbols, Comp: std::greater<std::pair<float, const Symbol *>>());
137
138 // SymbolQuality was empty up until now.
139 SymbolQuality.resize(new_size: Symbols.size());
140 // Populate internal storage using Symbol + Score pairs.
141 for (size_t I = 0; I < ScoredSymbols.size(); ++I) {
142 SymbolQuality[I] = ScoredSymbols[I].first;
143 Symbols[I] = ScoredSymbols[I].second;
144 }
145
146 // Build posting lists for symbols.
147 IndexBuilder Builder;
148 for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
149 Builder.add(Sym: *Symbols[SymbolRank], D: SymbolRank);
150 InvertedIndex = std::move(Builder).build();
151
152 // If the containedRefs() operation is supported, build the RevRefs
153 // data structure used to implement it.
154 if (!SupportContainedRefs)
155 return;
156 for (const auto &[ID, RefList] : Refs)
157 for (const auto &R : RefList)
158 if ((R.Kind & ContainedRefsRequest::SupportedRefKinds) !=
159 RefKind::Unknown)
160 RevRefs.emplace_back(args: R, args: ID);
161 // Sort by container ID so we can use binary search for lookup.
162 llvm::sort(C&: RevRefs, Comp: [](const RevRef &A, const RevRef &B) {
163 return A.ref().Container < B.ref().Container;
164 });
165}
166
167std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
168 auto It = InvertedIndex.find(Val: Tok);
169 return It == InvertedIndex.end() ? Corpus.none()
170 : It->second.iterator(Tok: &It->first);
171}
172
173// Constructs BOOST iterators for Path Proximities.
174std::unique_ptr<Iterator> Dex::createFileProximityIterator(
175 llvm::ArrayRef<std::string> ProximityPaths) const {
176 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
177 // Deduplicate parent URIs extracted from the ProximityPaths.
178 llvm::StringSet<> ParentURIs;
179 llvm::StringMap<SourceParams> Sources;
180 for (const auto &Path : ProximityPaths) {
181 Sources[Path] = SourceParams();
182 auto PathURI = URI::create(AbsolutePath: Path).toString();
183 const auto PathProximityURIs = generateProximityURIs(PathURI.c_str());
184 ParentURIs.insert_range(R: PathProximityURIs);
185 }
186 // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults
187 // for all parameters except for Proximity Path distance signal.
188 SymbolRelevanceSignals PathProximitySignals;
189 // DistanceCalculator will find the shortest distance from ProximityPaths to
190 // any URI extracted from the ProximityPaths.
191 URIDistance DistanceCalculator(Sources);
192 PathProximitySignals.FileProximityMatch = &DistanceCalculator;
193 // Try to build BOOST iterator for each Proximity Path provided by
194 // ProximityPaths. Boosting factor should depend on the distance to the
195 // Proximity Path: the closer processed path is, the higher boosting factor.
196 for (const auto &ParentURI : ParentURIs.keys()) {
197 // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
198 auto It = iterator(Tok: Token(Token::Kind::ProximityURI, ParentURI));
199 if (It->kind() != Iterator::Kind::False) {
200 PathProximitySignals.SymbolURI = ParentURI;
201 BoostingIterators.push_back(x: Corpus.boost(
202 Child: std::move(It), Factor: PathProximitySignals.evaluateHeuristics()));
203 }
204 }
205 BoostingIterators.push_back(x: Corpus.all());
206 return Corpus.unionOf(Children: std::move(BoostingIterators));
207}
208
209// Constructs BOOST iterators for preferred types.
210std::unique_ptr<Iterator>
211Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const {
212 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
213 SymbolRelevanceSignals PreferredTypeSignals;
214 PreferredTypeSignals.TypeMatchesPreferred = true;
215 auto Boost = PreferredTypeSignals.evaluateHeuristics();
216 for (const auto &T : Types)
217 BoostingIterators.push_back(
218 x: Corpus.boost(Child: iterator(Tok: Token(Token::Kind::Type, T)), Factor: Boost));
219 BoostingIterators.push_back(x: Corpus.all());
220 return Corpus.unionOf(Children: std::move(BoostingIterators));
221}
222
223/// Constructs iterators over tokens extracted from the query and exhausts it
224/// while applying Callback to each symbol in the order of decreasing quality
225/// of the matched symbols.
226bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
227 llvm::function_ref<void(const Symbol &)> Callback) const {
228 assert(!StringRef(Req.Query).contains("::") &&
229 "There must be no :: in query.");
230 trace::Span Tracer("Dex fuzzyFind");
231 FuzzyMatcher Filter(Req.Query);
232 // For short queries we use specialized trigrams that don't yield all results.
233 // Prevent clients from postfiltering them for longer queries.
234 bool More = !Req.Query.empty() && Req.Query.size() < 3;
235
236 std::vector<std::unique_ptr<Iterator>> Criteria;
237 const auto TrigramTokens = generateQueryTrigrams(Query: Req.Query);
238
239 // Generate query trigrams and construct AND iterator over all query
240 // trigrams.
241 std::vector<std::unique_ptr<Iterator>> TrigramIterators;
242 for (const auto &Trigram : TrigramTokens)
243 TrigramIterators.push_back(x: iterator(Tok: Trigram));
244 Criteria.push_back(x: Corpus.intersect(Children: std::move(TrigramIterators)));
245
246 // Generate scope tokens for search query.
247 std::vector<std::unique_ptr<Iterator>> ScopeIterators;
248 for (const auto &Scope : Req.Scopes)
249 ScopeIterators.push_back(x: iterator(Tok: Token(Token::Kind::Scope, Scope)));
250 if (Req.AnyScope)
251 ScopeIterators.push_back(
252 x: Corpus.boost(Child: Corpus.all(), Factor: ScopeIterators.empty() ? 1.0 : 0.2));
253 Criteria.push_back(x: Corpus.unionOf(Children: std::move(ScopeIterators)));
254
255 // Add proximity paths boosting (all symbols, some boosted).
256 Criteria.push_back(x: createFileProximityIterator(ProximityPaths: Req.ProximityPaths));
257 // Add boosting for preferred types.
258 Criteria.push_back(x: createTypeBoostingIterator(Types: Req.PreferredTypes));
259
260 if (Req.RestrictForCodeCompletion)
261 Criteria.push_back(x: iterator(Tok: RestrictedForCodeCompletion));
262
263 // Use TRUE iterator if both trigrams and scopes from the query are not
264 // present in the symbol index.
265 auto Root = Corpus.intersect(Children: std::move(Criteria));
266 // Retrieve more items than it was requested: some of the items with high
267 // final score might not be retrieved otherwise.
268 // FIXME(kbobyrev): Tune this ratio.
269 if (Req.Limit)
270 Root = Corpus.limit(Child: std::move(Root), Limit: *Req.Limit * 100);
271 SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
272 vlog(Fmt: "Dex query tree: {0}", Vals&: *Root);
273
274 using IDAndScore = std::pair<DocID, float>;
275 std::vector<IDAndScore> IDAndScores = consume(It&: *Root);
276
277 auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) {
278 return LHS.second > RHS.second;
279 };
280 TopN<IDAndScore, decltype(Compare)> Top(
281 Req.Limit.value_or(u: std::numeric_limits<size_t>::max()), Compare);
282 for (const auto &IDAndScore : IDAndScores) {
283 const DocID SymbolDocID = IDAndScore.first;
284 const auto *Sym = Symbols[SymbolDocID];
285 const std::optional<float> Score = Filter.match(Word: Sym->Name);
286 if (!Score)
287 continue;
288 // Combine Fuzzy Matching score, precomputed symbol quality and boosting
289 // score for a cumulative final symbol score.
290 const float FinalScore =
291 (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
292 // If Top.push(...) returns true, it means that it had to pop an item. In
293 // this case, it is possible to retrieve more symbols.
294 if (Top.push(V: {SymbolDocID, FinalScore}))
295 More = true;
296 }
297
298 // Apply callback to the top Req.Limit items in the descending
299 // order of cumulative score.
300 for (const auto &Item : std::move(Top).items())
301 Callback(*Symbols[Item.first]);
302 return More;
303}
304
305void Dex::lookup(const LookupRequest &Req,
306 llvm::function_ref<void(const Symbol &)> Callback) const {
307 trace::Span Tracer("Dex lookup");
308 for (const auto &ID : Req.IDs) {
309 auto I = LookupTable.find(Val: ID);
310 if (I != LookupTable.end())
311 Callback(*I->second);
312 }
313}
314
315bool Dex::refs(const RefsRequest &Req,
316 llvm::function_ref<void(const Ref &)> Callback) const {
317 trace::Span Tracer("Dex refs");
318 uint32_t Remaining = Req.Limit.value_or(u: std::numeric_limits<uint32_t>::max());
319 for (const auto &ID : Req.IDs)
320 for (const auto &Ref : Refs.lookup(Val: ID)) {
321 if (!static_cast<int>(Req.Filter & Ref.Kind))
322 continue;
323 if (Remaining == 0)
324 return true; // More refs were available.
325 --Remaining;
326 Callback(Ref);
327 }
328 return false; // We reported all refs.
329}
330
331llvm::iterator_range<std::vector<Dex::RevRef>::const_iterator>
332Dex::lookupRevRefs(const SymbolID &Container) const {
333 // equal_range() requires an element of the same type as the elements of the
334 // range, so construct a dummy RevRef with the container of interest.
335 Ref QueryRef;
336 QueryRef.Container = Container;
337 RevRef Query(QueryRef, SymbolID{});
338
339 auto ItPair = std::equal_range(first: RevRefs.cbegin(), last: RevRefs.cend(), val: Query,
340 comp: [](const RevRef &A, const RevRef &B) {
341 return A.ref().Container < B.ref().Container;
342 });
343 return {ItPair.first, ItPair.second};
344}
345
346bool Dex::containedRefs(
347 const ContainedRefsRequest &Req,
348 llvm::function_ref<void(const ContainedRefsResult &)> Callback) const {
349 trace::Span Tracer("Dex reversed refs");
350 uint32_t Remaining = Req.Limit.value_or(u: std::numeric_limits<uint32_t>::max());
351 for (const auto &Rev : lookupRevRefs(Container: Req.ID)) {
352 // RevRefs are already filtered to ContainedRefsRequest::SupportedRefKinds
353 if (Remaining == 0)
354 return true; // More refs were available.
355 --Remaining;
356 Callback(Rev.containedRefsResult());
357 }
358 return false; // We reported all refs.
359}
360
361void Dex::relations(
362 const RelationsRequest &Req,
363 llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const {
364 trace::Span Tracer("Dex relations");
365 uint32_t Remaining = Req.Limit.value_or(u: std::numeric_limits<uint32_t>::max());
366 for (const SymbolID &Subject : Req.Subjects) {
367 LookupRequest LookupReq;
368 auto It = Relations.find(
369 Val: std::make_pair(x: Subject, y: static_cast<uint8_t>(Req.Predicate)));
370 if (It != Relations.end()) {
371 for (const auto &Object : It->second) {
372 if (Remaining > 0) {
373 --Remaining;
374 LookupReq.IDs.insert(V: Object);
375 }
376 }
377 }
378 lookup(Req: LookupReq, Callback: [&](const Symbol &Object) { Callback(Subject, Object); });
379 }
380}
381
382llvm::unique_function<IndexContents(llvm::StringRef) const>
383Dex::indexedFiles() const {
384 return [this](llvm::StringRef FileURI) {
385 return Files.contains(key: FileURI) ? IdxContents : IndexContents::None;
386 };
387}
388
389size_t Dex::estimateMemoryUsage() const {
390 size_t Bytes = Symbols.size() * sizeof(const Symbol *);
391 Bytes += SymbolQuality.size() * sizeof(float);
392 Bytes += LookupTable.getMemorySize();
393 Bytes += InvertedIndex.getMemorySize();
394 for (const auto &TokenToPostingList : InvertedIndex)
395 Bytes += TokenToPostingList.second.bytes();
396 Bytes += Refs.getMemorySize();
397 Bytes += RevRefs.size() * sizeof(RevRef);
398 Bytes += Relations.getMemorySize();
399 return Bytes + BackingDataSize;
400}
401
402// Given foo://bar/one/two
403// Returns ~~~~~~~~ (or empty for bad URI)
404llvm::StringRef findPathInURI(llvm::StringRef S) {
405 S = S.split(Separator: ':').second; // Eat scheme.
406 if (S.consume_front(Prefix: "//")) // Eat authority.
407 S = S.drop_until(F: [](char C) { return C == '/'; });
408 return S;
409}
410
411// FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
412// size of resulting vector. Some projects might want to have higher limit if
413// the file hierarchy is deeper. For the generic case, it would be useful to
414// calculate Limit in the index build stage by calculating the maximum depth
415// of the project source tree at runtime.
416constexpr unsigned ProximityURILimit = 5;
417
418llvm::SmallVector<llvm::StringRef, ProximityURILimit>
419generateProximityURIs(llvm::StringRef URI) {
420 // This function is hot when indexing, so don't parse/reserialize URIPath,
421 // just emit substrings of it instead.
422 //
423 // foo://bar/one/two
424 // ~~~~~~~~
425 // Path
426 llvm::StringRef Path = findPathInURI(S: URI);
427 if (Path.empty())
428 return {}; // Bad URI.
429 assert(Path.begin() >= URI.begin() && Path.begin() < URI.end() &&
430 Path.end() == URI.end());
431
432 // The original is a proximity URI.
433 llvm::SmallVector<llvm::StringRef, ProximityURILimit> Result = {URI};
434 // Form proximity URIs by chopping before each slash in the path part.
435 for (auto Slash = Path.rfind(C: '/'); Slash > 0 && Slash != StringRef::npos;
436 Slash = Path.rfind(C: '/')) {
437 Path = Path.substr(Start: 0, N: Slash);
438 Result.push_back(Elt: URI.substr(Start: 0, N: Path.end() - URI.data()));
439 if (Result.size() == ProximityURILimit)
440 return Result;
441 }
442 // The root foo://bar/ is a proximity URI.
443 if (Path.starts_with(Prefix: "/"))
444 Result.push_back(Elt: URI.substr(Start: 0, N: Path.begin() + 1 - URI.data()));
445 return Result;
446}
447
448} // namespace dex
449} // namespace clangd
450} // namespace clang
451

source code of clang-tools-extra/clangd/index/dex/Dex.cpp