PostingList.cpp source code [clang-tools-extra/clangd/index/dex/PostingList.cpp]

1	//===--- PostingList.cpp - Symbol identifiers storage interface -----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "PostingList.h"
10	#include "index/dex/Iterator.h"
11	#include "index/dex/Token.h"
12	#include "llvm/Support/MathExtras.h"
13	#include <optional>
14
15	namespace clang {
16	namespace clangd {
17	namespace dex {
18	namespace {
19
20	/// Implements iterator of PostingList chunks. This requires iterating over two
21	/// levels: the first level iterator iterates over the chunks and decompresses
22	/// them on-the-fly when the contents of chunk are to be seen.
23	class ChunkIterator : public Iterator {
24	public:
25	explicit ChunkIterator(const Token *Tok, llvm::ArrayRef<Chunk> Chunks)
26	: Tok(Tok), Chunks (Chunks), CurrentChunk(Chunks.begin()) {
27	if (!Chunks.empty()) {
28	DecompressedChunk = CurrentChunk->decompress();
29	CurrentID = DecompressedChunk.begin();
30	}
31	}
32
33	bool reachedEnd() const override { return CurrentChunk == Chunks.end(); }
34
35	/// Advances cursor to the next item.
36	void advance() override {
37	assert(!reachedEnd() &&
38	"Posting List iterator can't advance() at the end.");
39	++CurrentID;
40	normalizeCursor();
41	}
42
43	/// Applies binary search to advance cursor to the next item with DocID
44	/// equal or higher than the given one.
45	void advanceTo(DocID ID) override {
46	assert(!reachedEnd() &&
47	"Posting List iterator can't advance() at the end.");
48	if (ID <= peek())
49	return;
50	advanceToChunk(ID);
51	// Try to find ID within current chunk.
52	CurrentID = std::partition_point(first: CurrentID, last: DecompressedChunk.end(),
53	pred: [&](const DocID D) { return D < ID; });
54	normalizeCursor();
55	}
56
57	DocID peek() const override {
58	assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
59	return *CurrentID;
60	}
61
62	float consume() override {
63	assert(!reachedEnd() &&
64	"Posting List iterator can't consume() at the end.");
65	return `1`;
66	}
67
68	size_t estimateSize() const override {
69	return Chunks.size() * ApproxEntriesPerChunk;
70	}
71
72	private:
73	llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
74	if (Tok != nullptr)
75	return OS << *Tok;
76	OS << `'['`;
77	const char *Sep = "";
78	for (const Chunk &C : Chunks)
79	for (const DocID Doc : C.decompress()) {
80	OS << Sep << Doc;
81	Sep = " ";
82	}
83	return OS << `']'`;
84	}
85
86	/// If the cursor is at the end of a chunk, place it at the start of the next
87	/// chunk.
88	void normalizeCursor() {
89	// Invariant is already established if examined chunk is not exhausted.
90	if (CurrentID != std::end(cont&: DecompressedChunk))
91	return;
92	// Advance to next chunk if current one is exhausted.
93	++CurrentChunk;
94	if (CurrentChunk == Chunks.end()) // Reached the end of PostingList.
95	return;
96	DecompressedChunk = CurrentChunk->decompress();
97	CurrentID = DecompressedChunk.begin();
98	}
99
100	/// Advances CurrentChunk to the chunk which might contain ID.
101	void advanceToChunk(DocID ID) {
102	if ((CurrentChunk != Chunks.end() - `1`) &&
103	((CurrentChunk + `1`)->Head <= ID)) {
104	CurrentChunk =
105	std::partition_point(first: CurrentChunk + `1`, last: Chunks.end(),
106	pred: [&](const Chunk &C) { return C.Head < ID; });
107	--CurrentChunk;
108	DecompressedChunk = CurrentChunk->decompress();
109	CurrentID = DecompressedChunk.begin();
110	}
111	}
112
113	const Token *Tok;
114	llvm::ArrayRef<Chunk> Chunks;
115	/// Iterator over chunks.
116	/// If CurrentChunk is valid, then DecompressedChunk is
117	/// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator
118	/// into it.
119	decltype(Chunks)::const_iterator CurrentChunk;
120	llvm::SmallVector<DocID, Chunk::PayloadSize + `1`> DecompressedChunk;
121	/// Iterator over DecompressedChunk.
122	decltype(DecompressedChunk)::iterator CurrentID;
123
124	static constexpr size_t ApproxEntriesPerChunk = `15`;
125	};
126
127	static constexpr size_t BitsPerEncodingByte = `7`;
128
129	/// Writes a variable length DocID into the buffer and updates the buffer size.
130	/// If it doesn't fit, returns false and doesn't write to the buffer.
131	bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) {
132	assert(Delta != `0` && "0 is not a valid PostingList delta.");
133	// Calculate number of bytes Delta encoding would take by examining the
134	// meaningful bits.
135	unsigned Width = `1` + llvm::Log2_64(Value: Delta) / BitsPerEncodingByte;
136	if (Width > Payload.size())
137	return false;
138
139	do {
140	uint8_t Encoding = Delta & `0x7f`;
141	Delta >>= `7`;
142	Payload.front() = Delta ? Encoding \| `0x80` : Encoding;
143	Payload = Payload.drop_front();
144	} while (Delta != `0`);
145	return true;
146	}
147
148	/// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
149	/// DocIDs. The compression stores deltas (differences) between subsequent
150	/// DocIDs and encodes these deltas utilizing the least possible number of
151	/// bytes.
152	///
153	/// Each encoding byte consists of two parts: the first bit (continuation bit)
154	/// indicates whether this is the last byte (0 if this byte is the last) of
155	/// current encoding and seven bytes a piece of DocID (payload). DocID contains
156	/// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit
157	/// payloads and one 4-bit payload), but in practice it is expected that gaps
158	/// (deltas) between subsequent DocIDs are not large enough to require 5 bytes.
159	/// In very dense posting lists (with average gaps less than 128) this
160	/// representation would be 4 times more efficient than raw DocID array.
161	///
162	/// PostingList encoding example:
163	///
164	/// DocIDs 42 47 7000
165	/// gaps 5 6958
166	/// Encoding (raw number) 00000101 10110110 00101110
167	std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
168	assert(!Documents.empty() && "Can't encode empty sequence.");
169	std::vector<Chunk> Result;
170	Result.emplace_back();
171	DocID Last = Result.back().Head = Documents.front();
172	llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload;
173	for (DocID Doc : Documents.drop_front()) {
174	if (!encodeVByte(Delta: Doc - Last, Payload&: RemainingPayload)) { // didn't fit, flush chunk
175	Result.emplace_back();
176	Result.back().Head = Doc;
177	RemainingPayload = Result.back().Payload;
178	}
179	Last = Doc;
180	}
181	return std::vector<Chunk>(Result); // no move, shrink-to-fit
182	}
183
184	/// Reads variable length DocID from the buffer and updates the buffer size. If
185	/// the stream is terminated, return std::nullopt.
186	std::optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
187	if (Bytes.front() == `0` \|\| Bytes.empty())
188	return std::nullopt;
189	DocID Result = `0`;
190	bool HasNextByte = true;
191	for (size_t Length = `0`; HasNextByte && !Bytes.empty(); ++Length) {
192	assert(Length <= `5` && "Malformed VByte encoding sequence.");
193	// Write meaningful bits to the correct place in the document decoding.
194	Result \|= (Bytes.front() & `0x7f`) << (BitsPerEncodingByte * Length);
195	if ((Bytes.front() & `0x80`) == `0`)
196	HasNextByte = false;
197	Bytes = Bytes.drop_front();
198	}
199	return Result;
200	}
201
202	} // namespace
203
204	llvm::SmallVector<DocID, Chunk::PayloadSize + `1`> Chunk::decompress() const {
205	llvm::SmallVector<DocID, Chunk::PayloadSize + `1`> Result{Head};
206	llvm::ArrayRef<uint8_t> Bytes(Payload);
207	DocID Delta;
208	for (DocID Current = Head; !Bytes.empty(); Current += Delta) {
209	auto MaybeDelta = readVByte(Bytes);
210	if (!MaybeDelta)
211	break;
212	Delta = *MaybeDelta;
213	Result.push_back(Elt: Current + Delta);
214	}
215	return llvm::SmallVector<DocID, Chunk::PayloadSize + `1`>{Result};
216	}
217
218	PostingList::PostingList(llvm::ArrayRef<DocID> Documents)
219	: Chunks(encodeStream(Documents)) {}
220
221	std::unique_ptr<Iterator> PostingList::iterator(const Token Tok) const* {
222	return std::make_unique<ChunkIterator>(args&: Tok, args: Chunks);
223	}
224
225	} // namespace dex
226	} // namespace clangd
227	} // namespace clang
228

source code of clang-tools-extra/clangd/index/dex/PostingList.cpp