Trigram.cpp source code [clang-tools-extra/clangd/index/dex/Trigram.cpp]

1	//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "Trigram.h"
10	#include "FuzzyMatch.h"
11	#include "index/dex/Token.h"
12	#include "llvm/ADT/ArrayRef.h"
13	#include "llvm/ADT/DenseSet.h"
14	#include "llvm/ADT/STLExtras.h"
15	#include "llvm/ADT/SmallVector.h"
16	#include "llvm/ADT/StringExtras.h"
17	#include "llvm/ADT/StringRef.h"
18	#include <cctype>
19	#include <limits>
20	#include <queue>
21	#include <string>
22	#include <utility>
23
24	namespace clang {
25	namespace clangd {
26	namespace dex {
27
28	// Produce trigrams (including duplicates) and pass them to Out().
29	template <typename Func>
30	static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
31	assert(!Identifier.empty());
32	// Apply fuzzy matching text segmentation.
33	llvm::SmallVector<CharRole> Roles(Identifier.size());
34	calculateRoles(Text: Identifier,
35	Roles: llvm::MutableArrayRef(Roles.data(), Identifier.size()));
36
37	std::string LowercaseIdentifier = Identifier.lower();
38
39	// For each character, store indices of the characters to which fuzzy matching
40	// algorithm can jump. There are 2 possible variants:
41	//
42	// Next Tail - next character from the same segment*
43	// Next Head - front character of the next segment*
44	//
45	// Next stores tuples of three indices in the presented order, if a variant is
46	// not available then 0 is stored.
47	llvm::SmallVector<std::array<unsigned, `2`>, `12`> Next(
48	LowercaseIdentifier.size());
49	unsigned NextTail = `0`, NextHead = `0`;
50	for (int I = LowercaseIdentifier.size() - `1`; I >= `0`; --I) {
51	Next [I] = {._M_elems: {NextTail, NextHead}};
52	NextTail = Roles [I] == Tail ? I : `0`;
53	if (Roles [I] == Head) {
54	NextHead = I;
55	}
56	}
57
58	// Iterate through valid sequences of three characters Fuzzy Matcher can
59	// process.
60	for (unsigned I = `0`; I < LowercaseIdentifier.size(); ++I) {
61	// Skip delimiters.
62	if (Roles [I] != Head && Roles [I] != Tail)
63	continue;
64	for (unsigned J : Next [I]) {
65	if (J == `0`)
66	continue;
67	for (unsigned K : Next [J]) {
68	if (K == `0`)
69	continue;
70	Out(Trigram (LowercaseIdentifier [I], LowercaseIdentifier [J],
71	LowercaseIdentifier [K]));
72	}
73	}
74	}
75	// Short queries semantics are different. When the user dosn't type enough
76	// symbols to form trigrams, we still want to serve meaningful results. To
77	// achieve that, we form incomplete trigrams (bi- and unigrams) for the
78	// identifiers and also generate short trigrams on the query side from what
79	// is available. We allow a small number of short trigram types in order to
80	// prevent excessive memory usage and increase the quality of the search.
81	// Only the first few symbols are allowed to be used in incomplete trigrams.
82	//
83	// Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams:
84	// "_", "_a", "a", "ab", "ad", "d", "de", "dg"
85	for (unsigned Position = `0`, HeadsSeen = `0`; HeadsSeen < `2`;) {
86	// The first symbol might be a separator, so the loop condition should be
87	// stopping as soon as there is no next head or we have seen two heads.
88	if (Roles [Position] == Head)
89	++HeadsSeen;
90	Out(Trigram (LowercaseIdentifier [Position]));
91	for (unsigned I : Next [Position])
92	if (I != `0`)
93	Out(Trigram (LowercaseIdentifier [Position], LowercaseIdentifier [I]));
94	Position = Next [Position][`1`];
95	if (Position == `0`)
96	break;
97	}
98	}
99
100	void generateIdentifierTrigrams(llvm::StringRef Identifier,
101	std::vector<Trigram> &Result) {
102	// Empirically, scanning for duplicates is faster with fewer trigrams, and
103	// a hashtable is faster with more. This is a hot path, so dispatch based on
104	// expected number of trigrams. Longer identifiers produce more trigrams.
105	// The magic number was tuned by running IndexBenchmark.DexBuild.
106	constexpr unsigned ManyTrigramsIdentifierThreshold = `14`;
107	Result.clear();
108	if (Identifier.empty())
109	return;
110
111	if (Identifier.size() < ManyTrigramsIdentifierThreshold) {
112	identifierTrigrams(Identifier, Out: [&](Trigram T) {
113	if (!llvm::is_contained(Range&: Result, Element: T))
114	Result.push_back(x: T);
115	});
116	} else {
117	identifierTrigrams(Identifier, Out: [&](Trigram T) { Result.push_back(x: T); });
118	llvm::sort(C&: Result);
119	Result.erase(first: std::unique(first: Result.begin(), last: Result.end()), last: Result.end());
120	}
121	}
122
123	std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
124	if (Query.empty())
125	return {};
126
127	// Apply fuzzy matching text segmentation.
128	llvm::SmallVector<CharRole> Roles(Query.size());
129	calculateRoles(Text: Query, Roles: llvm::MutableArrayRef(Roles.data(), Query.size()));
130
131	std::string LowercaseQuery = Query.lower();
132
133	llvm::DenseSet<Token> UniqueTrigrams;
134	std::string Chars;
135	for (unsigned I = `0`; I < LowercaseQuery.size(); ++I) {
136	if (Roles [I] != Head && Roles [I] != Tail)
137	continue; // Skip delimiters.
138	Chars.push_back(c: LowercaseQuery [I]);
139	if (Chars.size() > `3`)
140	Chars.erase(position: Chars.begin());
141	if (Chars.size() == `3`)
142	UniqueTrigrams.insert(V: Token (Token::Kind::Trigram, Chars));
143	}
144
145	// For queries with very few letters, generateIdentifierTrigrams emulates
146	// outputs of this function to match the semantics.
147	if (UniqueTrigrams.empty()) {
148	// If bigram can't be formed out of letters/numbers, we prepend separator.
149	std::string Result(`1`, LowercaseQuery.front());
150	for (unsigned I = `1`; I < LowercaseQuery.size(); ++I)
151	if (Roles [I] == Head \|\| Roles [I] == Tail)
152	Result += LowercaseQuery [I];
153	UniqueTrigrams.insert(
154	V: Token (Token::Kind::Trigram, llvm::StringRef (Result).take_back(N: `2`)));
155	}
156
157	return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
158	}
159
160	} // namespace dex
161	} // namespace clangd
162	} // namespace clang
163

source code of clang-tools-extra/clangd/index/dex/Trigram.cpp