Dex.cpp source code [clang-tools-extra/clangd/index/dex/Dex.cpp]

1	//===--- Dex.cpp - Dex Symbol Index Implementation --------------- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "Dex.h"
10	#include "FileDistance.h"
11	#include "FuzzyMatch.h"
12	#include "Quality.h"
13	#include "URI.h"
14	#include "index/Index.h"
15	#include "index/dex/Iterator.h"
16	#include "index/dex/Token.h"
17	#include "index/dex/Trigram.h"
18	#include "support/Logger.h"
19	#include "support/Trace.h"
20	#include "llvm/ADT/DenseMap.h"
21	#include "llvm/ADT/StringMap.h"
22	#include "llvm/ADT/StringSet.h"
23	#include "llvm/Support/Path.h"
24	#include "llvm/Support/ScopedPrinter.h"
25	#include <algorithm>
26	#include <optional>
27	#include <queue>
28	#include <utility>
29	#include <vector>
30
31	namespace clang {
32	namespace clangd {
33	namespace dex {
34
35	std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs,
36	RelationSlab Rels) {
37	auto Size = Symbols.bytes() + Refs.bytes();
38	// There is no need to include "Rels" in Data because the relations are self-
39	// contained, without references into a backing store.
40	auto Data = std::make_pair(x: std::move(Symbols), y: std::move(Refs));
41	return std::make_unique<Dex>(args&: Data.first, args&: Data.second, args&: Rels, args: std::move(Data),
42	args&: Size);
43	}
44
45	namespace {
46
47	// Mark symbols which are can be used for code completion.
48	const Token RestrictedForCodeCompletion =
49	Token (Token::Kind::Sentinel, "Restricted For Code Completion");
50
51	// Helper to efficiently assemble the inverse index (token -> matching docs).
52	// The output is a nice uniform structure keyed on Token, but constructing
53	// the Token object every time we want to insert into the map is wasteful.
54	// Instead we have various maps keyed on things that are cheap to compute,
55	// and produce the Token keys once at the end.
56	class IndexBuilder {
57	llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
58	std::vector<DocID> RestrictedCCDocs;
59	llvm::StringMap<std::vector<DocID>> TypeDocs;
60	llvm::StringMap<std::vector<DocID>> ScopeDocs;
61	llvm::StringMap<std::vector<DocID>> ProximityDocs;
62	std::vector<Trigram> TrigramScratch;
63
64	public:
65	// Add the tokens which are given symbol's characteristics.
66	// This includes fuzzy matching trigrams, symbol's scope, etc.
67	// FIXME(kbobyrev): Support more token types:
68	// Namespace proximity*
69	void add(const Symbol &Sym, DocID D) {
70	generateIdentifierTrigrams(Identifier: Sym.Name, Out&: TrigramScratch);
71	for (Trigram T : TrigramScratch)
72	TrigramDocs [T].push_back(x: D);
73	ScopeDocs [Sym.Scope].push_back(x: D);
74	if (!llvm::StringRef (Sym.CanonicalDeclaration.FileURI).empty())
75	for (const auto &ProximityURI :
76	generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
77	ProximityDocs [ProximityURI].push_back(x: D);
78	if (Sym.Flags & Symbol::IndexedForCodeCompletion)
79	RestrictedCCDocs.push_back(x: D);
80	if (!Sym.Type.empty())
81	TypeDocs [Sym.Type].push_back(x: D);
82	}
83
84	// Assemble the final compressed posting lists for the added symbols.
85	llvm::DenseMap<Token, PostingList> build() && {
86	llvm::DenseMap<Token, PostingList> Result(/InitialReserve=/
87	TrigramDocs.size() +
88	RestrictedCCDocs.size() +
89	TypeDocs.size() +
90	ScopeDocs.size() +
91	ProximityDocs.size());
92	// Tear down intermediate structs as we go to reduce memory usage.
93	// Since we're trying to get rid of underlying allocations, clearing the
94	// containers is not enough.
95	auto CreatePostingList =
96	[&Result](Token::Kind TK, llvm::StringMap<std::vector<DocID>> &Docs) {
97	for (auto &E : Docs) {
98	Result.try_emplace(Key: Token (TK, E.first()), Args&: E.second);
99	E.second = {};
100	}
101	Docs = {};
102	};
103	CreatePostingList(Token::Kind::Type, TypeDocs);
104	CreatePostingList(Token::Kind::Scope, ScopeDocs);
105	CreatePostingList(Token::Kind::ProximityURI, ProximityDocs);
106
107	// TrigramDocs are stored in a DenseMap and RestrictedCCDocs is not even a
108	// map, treat them specially.
109	for (auto &E : TrigramDocs) {
110	Result.try_emplace(Key: Token (Token::Kind::Trigram, E.first.str()), Args&: E.second);
111	E.second = {};
112	}
113	TrigramDocs = llvm::DenseMap<Trigram, std::vector<DocID>>{};
114	if (!RestrictedCCDocs.empty())
115	Result.try_emplace(Key: RestrictedForCodeCompletion,
116	Args: std::move(RestrictedCCDocs));
117	return Result;
118	}
119	};
120
121	} // namespace
122
123	void Dex::buildIndex() {
124	this->Corpus = dex::Corpus (Symbols.size());
125	std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
126
127	for (size_t I = `0`; I < Symbols.size(); ++I) {
128	const Symbol *Sym = Symbols [I];
129	LookupTable [Sym->ID] = Sym;
130	ScoredSymbols [I] = {quality(S: *Sym), Sym};
131	}
132
133	// Symbols are sorted by symbol qualities so that items in the posting lists
134	// are stored in the descending order of symbol quality.
135	llvm::sort(C&: ScoredSymbols, Comp: std::greater<std::pair<float, const Symbol *>>());
136
137	// SymbolQuality was empty up until now.
138	SymbolQuality.resize(new_size: Symbols.size());
139	// Populate internal storage using Symbol + Score pairs.
140	for (size_t I = `0`; I < ScoredSymbols.size(); ++I) {
141	SymbolQuality [I] = ScoredSymbols [I].first;
142	Symbols [I] = ScoredSymbols [I].second;
143	}
144
145	// Build posting lists for symbols.
146	IndexBuilder Builder;
147	for (DocID SymbolRank = `0`; SymbolRank < Symbols.size(); ++SymbolRank)
148	Builder.add(Sym: *Symbols [SymbolRank], D: SymbolRank);
149	InvertedIndex = std::move(Builder).build();
150	}
151
152	std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
153	auto It = InvertedIndex.find(Val: Tok);
154	return It == InvertedIndex.end() ? Corpus.none()
155	: It ->second.iterator(Tok: &It ->first);
156	}
157
158	// Constructs BOOST iterators for Path Proximities.
159	std::unique_ptr<Iterator> Dex::createFileProximityIterator(
160	llvm::ArrayRef<std::string> ProximityPaths) const {
161	std::vector<std::unique_ptr<Iterator>> BoostingIterators;
162	// Deduplicate parent URIs extracted from the ProximityPaths.
163	llvm::StringSet<> ParentURIs;
164	llvm::StringMap<SourceParams> Sources;
165	for (const auto &Path : ProximityPaths) {
166	Sources [Path] = SourceParams ();
167	auto PathURI = URI::create(AbsolutePath: Path).toString();
168	const auto PathProximityURIs = generateProximityURIs(PathURI.c_str());
169	for (const auto &ProximityURI : PathProximityURIs)
170	ParentURIs.insert(key: ProximityURI);
171	}
172	// Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults
173	// for all parameters except for Proximity Path distance signal.
174	SymbolRelevanceSignals PathProximitySignals;
175	// DistanceCalculator will find the shortest distance from ProximityPaths to
176	// any URI extracted from the ProximityPaths.
177	URIDistance DistanceCalculator(Sources);
178	PathProximitySignals.FileProximityMatch = &DistanceCalculator;
179	// Try to build BOOST iterator for each Proximity Path provided by
180	// ProximityPaths. Boosting factor should depend on the distance to the
181	// Proximity Path: the closer processed path is, the higher boosting factor.
182	for (const auto &ParentURI : ParentURIs.keys()) {
183	// FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
184	auto It = iterator(Tok: Token (Token::Kind::ProximityURI, ParentURI));
185	if (It ->kind() != Iterator::Kind::False) {
186	PathProximitySignals.SymbolURI = ParentURI;
187	BoostingIterators.push_back(x: Corpus.boost(
188	Child: std::move(It), Factor: PathProximitySignals.evaluateHeuristics()));
189	}
190	}
191	BoostingIterators.push_back(x: Corpus.all());
192	return Corpus.unionOf(Children: std::move(BoostingIterators));
193	}
194
195	// Constructs BOOST iterators for preferred types.
196	std::unique_ptr<Iterator>
197	Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const {
198	std::vector<std::unique_ptr<Iterator>> BoostingIterators;
199	SymbolRelevanceSignals PreferredTypeSignals;
200	PreferredTypeSignals.TypeMatchesPreferred = true;
201	auto Boost = PreferredTypeSignals.evaluateHeuristics();
202	for (const auto &T : Types)
203	BoostingIterators.push_back(
204	x: Corpus.boost(Child: iterator(Tok: Token (Token::Kind::Type, T)), Factor: Boost));
205	BoostingIterators.push_back(x: Corpus.all());
206	return Corpus.unionOf(Children: std::move(BoostingIterators));
207	}
208
209	/// Constructs iterators over tokens extracted from the query and exhausts it
210	/// while applying Callback to each symbol in the order of decreasing quality
211	/// of the matched symbols.
212	bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
213	llvm::function_ref<void(const Symbol &)> Callback) const {
214	assert(!StringRef(Req.Query).contains("::") &&
215	"There must be no :: in query.");
216	trace::Span Tracer("Dex fuzzyFind");
217	FuzzyMatcher Filter(Req.Query);
218	// For short queries we use specialized trigrams that don't yield all results.
219	// Prevent clients from postfiltering them for longer queries.
220	bool More = !Req.Query.empty() && Req.Query.size() < `3`;
221
222	std::vector<std::unique_ptr<Iterator>> Criteria;
223	const auto TrigramTokens = generateQueryTrigrams(Query: Req.Query);
224
225	// Generate query trigrams and construct AND iterator over all query
226	// trigrams.
227	std::vector<std::unique_ptr<Iterator>> TrigramIterators;
228	for (const auto &Trigram : TrigramTokens)
229	TrigramIterators.push_back(x: iterator(Tok: Trigram));
230	Criteria.push_back(x: Corpus.intersect(Children: std::move(TrigramIterators)));
231
232	// Generate scope tokens for search query.
233	std::vector<std::unique_ptr<Iterator>> ScopeIterators;
234	for (const auto &Scope : Req.Scopes)
235	ScopeIterators.push_back(x: iterator(Tok: Token (Token::Kind::Scope, Scope)));
236	if (Req.AnyScope)
237	ScopeIterators.push_back(
238	x: Corpus.boost(Child: Corpus.all(), Factor: ScopeIterators.empty() ? `1.0` : `0.2`));
239	Criteria.push_back(x: Corpus.unionOf(Children: std::move(ScopeIterators)));
240
241	// Add proximity paths boosting (all symbols, some boosted).
242	Criteria.push_back(x: createFileProximityIterator(ProximityPaths: Req.ProximityPaths));
243	// Add boosting for preferred types.
244	Criteria.push_back(x: createTypeBoostingIterator(Types: Req.PreferredTypes));
245
246	if (Req.RestrictForCodeCompletion)
247	Criteria.push_back(x: iterator(Tok: RestrictedForCodeCompletion));
248
249	// Use TRUE iterator if both trigrams and scopes from the query are not
250	// present in the symbol index.
251	auto Root = Corpus.intersect(Children: std::move(Criteria));
252	// Retrieve more items than it was requested: some of the items with high
253	// final score might not be retrieved otherwise.
254	// FIXME(kbobyrev): Tune this ratio.
255	if (Req.Limit)
256	Root = Corpus.limit(Child: std::move(Root), Limit: Req.Limit `100`);
257	SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
258	vlog(Fmt: "Dex query tree: {0}", Vals&: *Root);
259
260	using IDAndScore = std::pair<DocID, float>;
261	std::vector<IDAndScore> IDAndScores = consume(It&: *Root);
262
263	auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) {
264	return LHS.second > RHS.second;
265	};
266	TopN<IDAndScore, decltype(Compare)> Top(
267	Req.Limit ? *Req.Limit : std::numeric_limits<size_t>::max(), Compare);
268	for (const auto &IDAndScore : IDAndScores) {
269	const DocID SymbolDocID = IDAndScore.first;
270	const auto *Sym = Symbols [SymbolDocID];
271	const std::optional<float> Score = Filter.match(Word: Sym->Name);
272	if (!Score)
273	continue;
274	// Combine Fuzzy Matching score, precomputed symbol quality and boosting
275	// score for a cumulative final symbol score.
276	const float FinalScore =
277	(Score) SymbolQuality [SymbolDocID] * IDAndScore.second;
278	// If Top.push(...) returns true, it means that it had to pop an item. In
279	// this case, it is possible to retrieve more symbols.
280	if (Top.push(V: {SymbolDocID, FinalScore}))
281	More = true;
282	}
283
284	// Apply callback to the top Req.Limit items in the descending
285	// order of cumulative score.
286	for (const auto &Item : std::move(Top).items())
287	Callback (*Symbols [Item.first]);
288	return More;
289	}
290
291	void Dex::lookup(const LookupRequest &Req,
292	llvm::function_ref<void(const Symbol &)> Callback) const {
293	trace::Span Tracer("Dex lookup");
294	for (const auto &ID : Req.IDs) {
295	auto I = LookupTable.find(Val: ID);
296	if (I != LookupTable.end())
297	Callback (*I ->second);
298	}
299	}
300
301	bool Dex::refs(const RefsRequest &Req,
302	llvm::function_ref<void(const Ref &)> Callback) const {
303	trace::Span Tracer("Dex refs");
304	uint32_t Remaining = Req.Limit.value_or(u: std::numeric_limits<uint32_t>::max());
305	for (const auto &ID : Req.IDs)
306	for (const auto &Ref : Refs.lookup(Val: ID)) {
307	if (!static_cast<int>(Req.Filter & Ref.Kind))
308	continue;
309	if (Remaining == `0`)
310	return true; // More refs were available.
311	--Remaining;
312	Callback (Ref);
313	}
314	return false; // We reported all refs.
315	}
316
317	void Dex::relations(
318	const RelationsRequest &Req,
319	llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const {
320	trace::Span Tracer("Dex relations");
321	uint32_t Remaining = Req.Limit.value_or(u: std::numeric_limits<uint32_t>::max());
322	for (const SymbolID &Subject : Req.Subjects) {
323	LookupRequest LookupReq;
324	auto It = Relations.find(
325	Val: std::make_pair(x: Subject, y: static_cast<uint8_t>(Req.Predicate)));
326	if (It != Relations.end()) {
327	for (const auto &Object : It ->second) {
328	if (Remaining > `0`) {
329	--Remaining;
330	LookupReq.IDs.insert(V: Object);
331	}
332	}
333	}
334	lookup(Req: LookupReq, Callback: [&](const Symbol &Object) { Callback (Subject, Object); });
335	}
336	}
337
338	llvm::unique_function<IndexContents(llvm::StringRef) const>
339	Dex::indexedFiles() const {
340	return [this](llvm::StringRef FileURI) {
341	return Files.contains(key: FileURI) ? IdxContents : IndexContents::None;
342	};
343	}
344
345	size_t Dex::estimateMemoryUsage() const {
346	size_t Bytes = Symbols.size() * sizeof(const Symbol *);
347	Bytes += SymbolQuality.size() * sizeof(float);
348	Bytes += LookupTable.getMemorySize();
349	Bytes += InvertedIndex.getMemorySize();
350	for (const auto &TokenToPostingList : InvertedIndex)
351	Bytes += TokenToPostingList.second.bytes();
352	Bytes += Refs.getMemorySize();
353	Bytes += Relations.getMemorySize();
354	return Bytes + BackingDataSize;
355	}
356
357	// Given foo://bar/one/two
358	// Returns ~~~~~~~~ (or empty for bad URI)
359	llvm::StringRef findPathInURI(llvm::StringRef S) {
360	S = S.split(Separator: `':'`).second; // Eat scheme.
361	if (S.consume_front(Prefix: "//")) // Eat authority.
362	S = S.drop_until(F: [](char C) { return C == `'/'`; });
363	return S;
364	}
365
366	// FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
367	// size of resulting vector. Some projects might want to have higher limit if
368	// the file hierarchy is deeper. For the generic case, it would be useful to
369	// calculate Limit in the index build stage by calculating the maximum depth
370	// of the project source tree at runtime.
371	constexpr unsigned ProximityURILimit = `5`;
372
373	llvm::SmallVector<llvm::StringRef, ProximityURILimit>
374	generateProximityURIs(llvm::StringRef URI) {
375	// This function is hot when indexing, so don't parse/reserialize URIPath,
376	// just emit substrings of it instead.
377	//
378	// foo://bar/one/two
379	// ~~~~~~~~
380	// Path
381	llvm::StringRef Path = findPathInURI(S: URI);
382	if (Path.empty())
383	return {}; // Bad URI.
384	assert(Path.begin() >= URI.begin() && Path.begin() < URI.end() &&
385	Path.end() == URI.end());
386
387	// The original is a proximity URI.
388	llvm::SmallVector<llvm::StringRef, ProximityURILimit> Result = {URI};
389	// Form proximity URIs by chopping before each slash in the path part.
390	for (auto Slash = Path.rfind(C: `'/'`); Slash > `0` && Slash != StringRef::npos;
391	Slash = Path.rfind(C: `'/'`)) {
392	Path = Path.substr(Start: `0`, N: Slash);
393	Result.push_back(Elt: URI.substr(Start: `0`, N: Path.end() - URI.data()));
394	if (Result.size() == ProximityURILimit)
395	return Result;
396	}
397	// The root foo://bar/ is a proximity URI.
398	if (Path.starts_with(Prefix: "/"))
399	Result.push_back(Elt: URI.substr(Start: `0`, N: Path.begin() + `1` - URI.data()));
400	return Result;
401	}
402
403	} // namespace dex
404	} // namespace clangd
405	} // namespace clang
406

source code of clang-tools-extra/clangd/index/dex/Dex.cpp