1 | //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file provides a hash table data structure suitable for incremental and |
10 | // distributed storage across a set of files. |
11 | // |
12 | // Multiple hash tables from different files are implicitly merged to improve |
13 | // performance, and on reload the merged table will override those from other |
14 | // files. |
15 | // |
16 | //===----------------------------------------------------------------------===// |
17 | |
18 | #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H |
19 | #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H |
20 | |
21 | #include "llvm/ADT/DenseMap.h" |
22 | #include "llvm/ADT/DenseSet.h" |
23 | #include "llvm/ADT/PointerUnion.h" |
24 | #include "llvm/ADT/STLExtras.h" |
25 | #include "llvm/ADT/SmallVector.h" |
26 | #include "llvm/ADT/TinyPtrVector.h" |
27 | #include "llvm/ADT/iterator_range.h" |
28 | #include "llvm/Support/Endian.h" |
29 | #include "llvm/Support/EndianStream.h" |
30 | #include "llvm/Support/OnDiskHashTable.h" |
31 | #include "llvm/Support/raw_ostream.h" |
32 | #include <algorithm> |
33 | #include <cstdint> |
34 | #include <vector> |
35 | |
36 | namespace clang { |
37 | namespace serialization { |
38 | |
39 | /// A collection of on-disk hash tables, merged when relevant for performance. |
40 | template<typename Info> class MultiOnDiskHashTable { |
41 | public: |
42 | /// A handle to a file, used when overriding tables. |
43 | using file_type = typename Info::file_type; |
44 | |
45 | /// A pointer to an on-disk representation of the hash table. |
46 | using storage_type = const unsigned char *; |
47 | |
48 | using external_key_type = typename Info::external_key_type; |
49 | using internal_key_type = typename Info::internal_key_type; |
50 | using data_type = typename Info::data_type; |
51 | using data_type_builder = typename Info::data_type_builder; |
52 | using hash_value_type = unsigned; |
53 | |
54 | private: |
55 | /// The generator is permitted to read our merged table. |
56 | template<typename ReaderInfo, typename WriterInfo> |
57 | friend class MultiOnDiskHashTableGenerator; |
58 | |
59 | /// A hash table stored on disk. |
60 | struct OnDiskTable { |
61 | using HashTable = llvm::OnDiskIterableChainedHashTable<Info>; |
62 | |
63 | file_type File; |
64 | HashTable Table; |
65 | |
66 | OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries, |
67 | storage_type Buckets, storage_type Payload, storage_type Base, |
68 | const Info &InfoObj) |
69 | : File(File), |
70 | Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {} |
71 | }; |
72 | |
73 | struct MergedTable { |
74 | std::vector<file_type> Files; |
75 | llvm::DenseMap<internal_key_type, data_type> Data; |
76 | }; |
77 | |
78 | using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>; |
79 | using TableVector = llvm::TinyPtrVector<void *>; |
80 | |
81 | /// The current set of on-disk and merged tables. |
82 | /// We manually store the opaque value of the Table because TinyPtrVector |
83 | /// can't cope with holding a PointerUnion directly. |
84 | /// There can be at most one MergedTable in this vector, and if present, |
85 | /// it is the first table. |
86 | TableVector Tables; |
87 | |
88 | /// Files corresponding to overridden tables that we've not yet |
89 | /// discarded. |
90 | llvm::TinyPtrVector<file_type> PendingOverrides; |
91 | |
92 | struct AsOnDiskTable { |
93 | using result_type = OnDiskTable *; |
94 | |
95 | result_type operator()(void *P) const { |
96 | return llvm::cast<OnDiskTable *>(Table::getFromOpaqueValue(P)); |
97 | } |
98 | }; |
99 | |
100 | using table_iterator = |
101 | llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>; |
102 | using table_range = llvm::iterator_range<table_iterator>; |
103 | |
104 | /// The current set of on-disk tables. |
105 | table_range tables() { |
106 | unsigned DropBegin = getMergedTable() ? 1 : 0; |
107 | return llvm::map_range(llvm::drop_begin(RangeOrContainer&: Tables, N: DropBegin), |
108 | AsOnDiskTable()); |
109 | } |
110 | |
111 | MergedTable *getMergedTable() const { |
112 | // If we already have a merged table, it's the first one. |
113 | return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin()) |
114 | .template dyn_cast<MergedTable*>(); |
115 | } |
116 | |
117 | /// Delete all our current on-disk tables. |
118 | void clear() { |
119 | for (auto *T : tables()) |
120 | delete T; |
121 | if (auto *M = getMergedTable()) |
122 | delete M; |
123 | Tables.clear(); |
124 | } |
125 | |
126 | void removeOverriddenTables() { |
127 | llvm::DenseSet<file_type> Files; |
128 | Files.insert_range(PendingOverrides); |
129 | // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug. |
130 | auto ShouldRemove = [&Files](void *T) -> bool { |
131 | auto *ODT = llvm::cast<OnDiskTable *>(Table::getFromOpaqueValue(T)); |
132 | bool Remove = Files.count(ODT->File); |
133 | if (Remove) |
134 | delete ODT; |
135 | return Remove; |
136 | }; |
137 | Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(), |
138 | ShouldRemove), |
139 | Tables.end()); |
140 | PendingOverrides.clear(); |
141 | } |
142 | |
143 | void condense() { |
144 | MergedTable *Merged = getMergedTable(); |
145 | if (!Merged) |
146 | Merged = new MergedTable; |
147 | |
148 | // Read in all the tables and merge them together. |
149 | // FIXME: Be smarter about which tables we merge. |
150 | for (auto *ODT : tables()) { |
151 | auto &HT = ODT->Table; |
152 | Info &InfoObj = HT.getInfoObj(); |
153 | |
154 | for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { |
155 | auto *LocalPtr = I.getItem(); |
156 | |
157 | // FIXME: Don't rely on the OnDiskHashTable format here. |
158 | auto L = InfoObj.ReadKeyDataLength(LocalPtr); |
159 | const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); |
160 | data_type_builder ValueBuilder(Merged->Data[Key]); |
161 | InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, |
162 | ValueBuilder); |
163 | } |
164 | |
165 | Merged->Files.push_back(ODT->File); |
166 | delete ODT; |
167 | } |
168 | |
169 | Tables.clear(); |
170 | Tables.push_back(NewVal: Table(Merged).getOpaqueValue()); |
171 | } |
172 | |
173 | public: |
174 | MultiOnDiskHashTable() = default; |
175 | |
176 | MultiOnDiskHashTable(MultiOnDiskHashTable &&O) |
177 | : Tables(std::move(O.Tables)), |
178 | PendingOverrides(std::move(O.PendingOverrides)) { |
179 | O.Tables.clear(); |
180 | } |
181 | |
182 | MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) { |
183 | if (&O == this) |
184 | return *this; |
185 | clear(); |
186 | Tables = std::move(O.Tables); |
187 | O.Tables.clear(); |
188 | PendingOverrides = std::move(O.PendingOverrides); |
189 | return *this; |
190 | } |
191 | |
192 | ~MultiOnDiskHashTable() { clear(); } |
193 | |
194 | /// Add the table \p Data loaded from file \p File. |
195 | void add(file_type File, storage_type Data, Info InfoObj = Info()) { |
196 | using namespace llvm::support; |
197 | |
198 | storage_type Ptr = Data; |
199 | |
200 | uint32_t BucketOffset = |
201 | endian::readNext<uint32_t, llvm::endianness::little>(memory&: Ptr); |
202 | |
203 | // Read the list of overridden files. |
204 | uint32_t NumFiles = |
205 | endian::readNext<uint32_t, llvm::endianness::little>(memory&: Ptr); |
206 | // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make |
207 | // an additional copy. |
208 | llvm::SmallVector<file_type, 16> OverriddenFiles; |
209 | OverriddenFiles.reserve(NumFiles); |
210 | for (/**/; NumFiles != 0; --NumFiles) |
211 | OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr)); |
212 | llvm::append_range(PendingOverrides, OverriddenFiles); |
213 | |
214 | // Read the OnDiskChainedHashTable header. |
215 | storage_type Buckets = Data + BucketOffset; |
216 | auto NumBucketsAndEntries = |
217 | OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets); |
218 | |
219 | // Register the table. |
220 | Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first, |
221 | NumBucketsAndEntries.second, |
222 | Buckets, Ptr, Data, std::move(InfoObj)); |
223 | Tables.push_back(NewVal: NewTable.getOpaqueValue()); |
224 | } |
225 | |
226 | /// Find and read the lookup results for \p EKey. |
227 | data_type find(const external_key_type &EKey) { |
228 | data_type Result; |
229 | |
230 | if (!PendingOverrides.empty()) |
231 | removeOverriddenTables(); |
232 | |
233 | if (Tables.size() > static_cast<unsigned>(Info::MaxTables)) |
234 | condense(); |
235 | |
236 | internal_key_type Key = Info::GetInternalKey(EKey); |
237 | auto KeyHash = Info::ComputeHash(Key); |
238 | |
239 | if (MergedTable *M = getMergedTable()) { |
240 | auto It = M->Data.find(Key); |
241 | if (It != M->Data.end()) |
242 | Result = It->second; |
243 | } |
244 | |
245 | data_type_builder ResultBuilder(Result); |
246 | |
247 | for (auto *ODT : tables()) { |
248 | auto &HT = ODT->Table; |
249 | auto It = HT.find_hashed(Key, KeyHash); |
250 | if (It != HT.end()) |
251 | HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(), |
252 | ResultBuilder); |
253 | } |
254 | |
255 | return Result; |
256 | } |
257 | |
258 | /// Read all the lookup results into a single value. This only makes |
259 | /// sense if merging values across keys is meaningful. |
260 | data_type findAll() { |
261 | data_type Result; |
262 | data_type_builder ResultBuilder(Result); |
263 | |
264 | if (!PendingOverrides.empty()) |
265 | removeOverriddenTables(); |
266 | |
267 | if (MergedTable *M = getMergedTable()) { |
268 | for (auto &KV : M->Data) |
269 | Info::MergeDataInto(KV.second, ResultBuilder); |
270 | } |
271 | |
272 | for (auto *ODT : tables()) { |
273 | auto &HT = ODT->Table; |
274 | Info &InfoObj = HT.getInfoObj(); |
275 | for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { |
276 | auto *LocalPtr = I.getItem(); |
277 | |
278 | // FIXME: Don't rely on the OnDiskHashTable format here. |
279 | auto L = InfoObj.ReadKeyDataLength(LocalPtr); |
280 | const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); |
281 | InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder); |
282 | } |
283 | } |
284 | |
285 | return Result; |
286 | } |
287 | }; |
288 | |
289 | /// Writer for the on-disk hash table. |
290 | template<typename ReaderInfo, typename WriterInfo> |
291 | class MultiOnDiskHashTableGenerator { |
292 | using BaseTable = MultiOnDiskHashTable<ReaderInfo>; |
293 | using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>; |
294 | |
295 | Generator Gen; |
296 | |
297 | public: |
298 | MultiOnDiskHashTableGenerator() : Gen() {} |
299 | |
300 | void insert(typename WriterInfo::key_type_ref Key, |
301 | typename WriterInfo::data_type_ref Data, WriterInfo &Info) { |
302 | Gen.insert(Key, Data, Info); |
303 | } |
304 | |
305 | void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info, |
306 | const BaseTable *Base) { |
307 | using namespace llvm::support; |
308 | |
309 | llvm::raw_svector_ostream OutStream(Out); |
310 | |
311 | // Write our header information. |
312 | { |
313 | endian::Writer Writer(OutStream, llvm::endianness::little); |
314 | |
315 | // Reserve four bytes for the bucket offset. |
316 | Writer.write<uint32_t>(Val: 0); |
317 | |
318 | if (auto *Merged = Base ? Base->getMergedTable() : nullptr) { |
319 | // Write list of overridden files. |
320 | Writer.write<uint32_t>(Merged->Files.size()); |
321 | for (const auto &F : Merged->Files) |
322 | Info.EmitFileRef(OutStream, F); |
323 | |
324 | // Add all merged entries from Base to the generator. |
325 | for (auto &KV : Merged->Data) { |
326 | if (!Gen.contains(KV.first, Info)) |
327 | Gen.insert(KV.first, Info.ImportData(KV.second), Info); |
328 | } |
329 | } else { |
330 | Writer.write<uint32_t>(Val: 0); |
331 | } |
332 | } |
333 | |
334 | // Write the table itself. |
335 | uint32_t BucketOffset = Gen.Emit(OutStream, Info); |
336 | |
337 | // Replace the first four bytes with the bucket offset. |
338 | endian::write32le(P: Out.data(), V: BucketOffset); |
339 | } |
340 | }; |
341 | |
342 | } // namespace serialization |
343 | } // namespace clang |
344 | |
345 | #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H |
346 | |