1 | //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file provides a hash table data structure suitable for incremental and |
10 | // distributed storage across a set of files. |
11 | // |
12 | // Multiple hash tables from different files are implicitly merged to improve |
13 | // performance, and on reload the merged table will override those from other |
14 | // files. |
15 | // |
16 | //===----------------------------------------------------------------------===// |
17 | |
18 | #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H |
19 | #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H |
20 | |
21 | #include "llvm/ADT/DenseMap.h" |
22 | #include "llvm/ADT/DenseSet.h" |
23 | #include "llvm/ADT/PointerUnion.h" |
24 | #include "llvm/ADT/STLExtras.h" |
25 | #include "llvm/ADT/SmallVector.h" |
26 | #include "llvm/ADT/TinyPtrVector.h" |
27 | #include "llvm/ADT/iterator_range.h" |
28 | #include "llvm/Support/Endian.h" |
29 | #include "llvm/Support/EndianStream.h" |
30 | #include "llvm/Support/OnDiskHashTable.h" |
31 | #include "llvm/Support/raw_ostream.h" |
32 | #include <algorithm> |
33 | #include <cstdint> |
34 | #include <vector> |
35 | |
36 | namespace clang { |
37 | namespace serialization { |
38 | |
39 | /// A collection of on-disk hash tables, merged when relevant for performance. |
40 | template<typename Info> class MultiOnDiskHashTable { |
41 | public: |
42 | /// A handle to a file, used when overriding tables. |
43 | using file_type = typename Info::file_type; |
44 | |
45 | /// A pointer to an on-disk representation of the hash table. |
46 | using storage_type = const unsigned char *; |
47 | |
48 | using external_key_type = typename Info::external_key_type; |
49 | using internal_key_type = typename Info::internal_key_type; |
50 | using data_type = typename Info::data_type; |
51 | using data_type_builder = typename Info::data_type_builder; |
52 | using hash_value_type = unsigned; |
53 | |
54 | private: |
55 | /// The generator is permitted to read our merged table. |
56 | template<typename ReaderInfo, typename WriterInfo> |
57 | friend class MultiOnDiskHashTableGenerator; |
58 | |
59 | /// A hash table stored on disk. |
60 | struct OnDiskTable { |
61 | using HashTable = llvm::OnDiskIterableChainedHashTable<Info>; |
62 | |
63 | file_type File; |
64 | HashTable Table; |
65 | |
66 | OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries, |
67 | storage_type Buckets, storage_type Payload, storage_type Base, |
68 | const Info &InfoObj) |
69 | : File(File), |
70 | Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {} |
71 | }; |
72 | |
73 | struct MergedTable { |
74 | std::vector<file_type> Files; |
75 | llvm::DenseMap<internal_key_type, data_type> Data; |
76 | }; |
77 | |
78 | using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>; |
79 | using TableVector = llvm::TinyPtrVector<void *>; |
80 | |
81 | /// The current set of on-disk and merged tables. |
82 | /// We manually store the opaque value of the Table because TinyPtrVector |
83 | /// can't cope with holding a PointerUnion directly. |
84 | /// There can be at most one MergedTable in this vector, and if present, |
85 | /// it is the first table. |
86 | TableVector Tables; |
87 | |
88 | /// Files corresponding to overridden tables that we've not yet |
89 | /// discarded. |
90 | llvm::TinyPtrVector<file_type> PendingOverrides; |
91 | |
92 | struct AsOnDiskTable { |
93 | using result_type = OnDiskTable *; |
94 | |
95 | result_type operator()(void *P) const { |
96 | return Table::getFromOpaqueValue(P).template get<OnDiskTable *>(); |
97 | } |
98 | }; |
99 | |
100 | using table_iterator = |
101 | llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>; |
102 | using table_range = llvm::iterator_range<table_iterator>; |
103 | |
104 | /// The current set of on-disk tables. |
105 | table_range tables() { |
106 | auto Begin = Tables.begin(), End = Tables.end(); |
107 | if (getMergedTable()) |
108 | ++Begin; |
109 | return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()), |
110 | llvm::map_iterator(End, AsOnDiskTable())); |
111 | } |
112 | |
113 | MergedTable *getMergedTable() const { |
114 | // If we already have a merged table, it's the first one. |
115 | return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin()) |
116 | .template dyn_cast<MergedTable*>(); |
117 | } |
118 | |
119 | /// Delete all our current on-disk tables. |
120 | void clear() { |
121 | for (auto *T : tables()) |
122 | delete T; |
123 | if (auto *M = getMergedTable()) |
124 | delete M; |
125 | Tables.clear(); |
126 | } |
127 | |
128 | void removeOverriddenTables() { |
129 | llvm::DenseSet<file_type> Files; |
130 | Files.insert(PendingOverrides.begin(), PendingOverrides.end()); |
131 | // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug. |
132 | auto ShouldRemove = [&Files](void *T) -> bool { |
133 | auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>(); |
134 | bool Remove = Files.count(ODT->File); |
135 | if (Remove) |
136 | delete ODT; |
137 | return Remove; |
138 | }; |
139 | Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(), |
140 | ShouldRemove), |
141 | Tables.end()); |
142 | PendingOverrides.clear(); |
143 | } |
144 | |
145 | void condense() { |
146 | MergedTable *Merged = getMergedTable(); |
147 | if (!Merged) |
148 | Merged = new MergedTable; |
149 | |
150 | // Read in all the tables and merge them together. |
151 | // FIXME: Be smarter about which tables we merge. |
152 | for (auto *ODT : tables()) { |
153 | auto &HT = ODT->Table; |
154 | Info &InfoObj = HT.getInfoObj(); |
155 | |
156 | for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { |
157 | auto *LocalPtr = I.getItem(); |
158 | |
159 | // FIXME: Don't rely on the OnDiskHashTable format here. |
160 | auto L = InfoObj.ReadKeyDataLength(LocalPtr); |
161 | const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); |
162 | data_type_builder ValueBuilder(Merged->Data[Key]); |
163 | InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, |
164 | ValueBuilder); |
165 | } |
166 | |
167 | Merged->Files.push_back(ODT->File); |
168 | delete ODT; |
169 | } |
170 | |
171 | Tables.clear(); |
172 | Tables.push_back(NewVal: Table(Merged).getOpaqueValue()); |
173 | } |
174 | |
175 | public: |
176 | MultiOnDiskHashTable() = default; |
177 | |
178 | MultiOnDiskHashTable(MultiOnDiskHashTable &&O) |
179 | : Tables(std::move(O.Tables)), |
180 | PendingOverrides(std::move(O.PendingOverrides)) { |
181 | O.Tables.clear(); |
182 | } |
183 | |
184 | MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) { |
185 | if (&O == this) |
186 | return *this; |
187 | clear(); |
188 | Tables = std::move(O.Tables); |
189 | O.Tables.clear(); |
190 | PendingOverrides = std::move(O.PendingOverrides); |
191 | return *this; |
192 | } |
193 | |
194 | ~MultiOnDiskHashTable() { clear(); } |
195 | |
196 | /// Add the table \p Data loaded from file \p File. |
197 | void add(file_type File, storage_type Data, Info InfoObj = Info()) { |
198 | using namespace llvm::support; |
199 | |
200 | storage_type Ptr = Data; |
201 | |
202 | uint32_t BucketOffset = |
203 | endian::readNext<uint32_t, llvm::endianness::little>(memory&: Ptr); |
204 | |
205 | // Read the list of overridden files. |
206 | uint32_t NumFiles = |
207 | endian::readNext<uint32_t, llvm::endianness::little>(memory&: Ptr); |
208 | // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make |
209 | // an additional copy. |
210 | llvm::SmallVector<file_type, 16> OverriddenFiles; |
211 | OverriddenFiles.reserve(NumFiles); |
212 | for (/**/; NumFiles != 0; --NumFiles) |
213 | OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr)); |
214 | PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(), |
215 | OverriddenFiles.end()); |
216 | |
217 | // Read the OnDiskChainedHashTable header. |
218 | storage_type Buckets = Data + BucketOffset; |
219 | auto NumBucketsAndEntries = |
220 | OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets); |
221 | |
222 | // Register the table. |
223 | Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first, |
224 | NumBucketsAndEntries.second, |
225 | Buckets, Ptr, Data, std::move(InfoObj)); |
226 | Tables.push_back(NewVal: NewTable.getOpaqueValue()); |
227 | } |
228 | |
229 | /// Find and read the lookup results for \p EKey. |
230 | data_type find(const external_key_type &EKey) { |
231 | data_type Result; |
232 | |
233 | if (!PendingOverrides.empty()) |
234 | removeOverriddenTables(); |
235 | |
236 | if (Tables.size() > static_cast<unsigned>(Info::MaxTables)) |
237 | condense(); |
238 | |
239 | internal_key_type Key = Info::GetInternalKey(EKey); |
240 | auto KeyHash = Info::ComputeHash(Key); |
241 | |
242 | if (MergedTable *M = getMergedTable()) { |
243 | auto It = M->Data.find(Key); |
244 | if (It != M->Data.end()) |
245 | Result = It->second; |
246 | } |
247 | |
248 | data_type_builder ResultBuilder(Result); |
249 | |
250 | for (auto *ODT : tables()) { |
251 | auto &HT = ODT->Table; |
252 | auto It = HT.find_hashed(Key, KeyHash); |
253 | if (It != HT.end()) |
254 | HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(), |
255 | ResultBuilder); |
256 | } |
257 | |
258 | return Result; |
259 | } |
260 | |
261 | /// Read all the lookup results into a single value. This only makes |
262 | /// sense if merging values across keys is meaningful. |
263 | data_type findAll() { |
264 | data_type Result; |
265 | data_type_builder ResultBuilder(Result); |
266 | |
267 | if (!PendingOverrides.empty()) |
268 | removeOverriddenTables(); |
269 | |
270 | if (MergedTable *M = getMergedTable()) { |
271 | for (auto &KV : M->Data) |
272 | Info::MergeDataInto(KV.second, ResultBuilder); |
273 | } |
274 | |
275 | for (auto *ODT : tables()) { |
276 | auto &HT = ODT->Table; |
277 | Info &InfoObj = HT.getInfoObj(); |
278 | for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { |
279 | auto *LocalPtr = I.getItem(); |
280 | |
281 | // FIXME: Don't rely on the OnDiskHashTable format here. |
282 | auto L = InfoObj.ReadKeyDataLength(LocalPtr); |
283 | const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); |
284 | InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder); |
285 | } |
286 | } |
287 | |
288 | return Result; |
289 | } |
290 | }; |
291 | |
292 | /// Writer for the on-disk hash table. |
293 | template<typename ReaderInfo, typename WriterInfo> |
294 | class MultiOnDiskHashTableGenerator { |
295 | using BaseTable = MultiOnDiskHashTable<ReaderInfo>; |
296 | using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>; |
297 | |
298 | Generator Gen; |
299 | |
300 | public: |
301 | MultiOnDiskHashTableGenerator() : Gen() {} |
302 | |
303 | void insert(typename WriterInfo::key_type_ref Key, |
304 | typename WriterInfo::data_type_ref Data, WriterInfo &Info) { |
305 | Gen.insert(Key, Data, Info); |
306 | } |
307 | |
308 | void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info, |
309 | const BaseTable *Base) { |
310 | using namespace llvm::support; |
311 | |
312 | llvm::raw_svector_ostream OutStream(Out); |
313 | |
314 | // Write our header information. |
315 | { |
316 | endian::Writer Writer(OutStream, llvm::endianness::little); |
317 | |
318 | // Reserve four bytes for the bucket offset. |
319 | Writer.write<uint32_t>(Val: 0); |
320 | |
321 | if (auto *Merged = Base ? Base->getMergedTable() : nullptr) { |
322 | // Write list of overridden files. |
323 | Writer.write<uint32_t>(Merged->Files.size()); |
324 | for (const auto &F : Merged->Files) |
325 | Info.EmitFileRef(OutStream, F); |
326 | |
327 | // Add all merged entries from Base to the generator. |
328 | for (auto &KV : Merged->Data) { |
329 | if (!Gen.contains(KV.first, Info)) |
330 | Gen.insert(KV.first, Info.ImportData(KV.second), Info); |
331 | } |
332 | } else { |
333 | Writer.write<uint32_t>(Val: 0); |
334 | } |
335 | } |
336 | |
337 | // Write the table itself. |
338 | uint32_t BucketOffset = Gen.Emit(OutStream, Info); |
339 | |
340 | // Replace the first four bytes with the bucket offset. |
341 | endian::write32le(P: Out.data(), V: BucketOffset); |
342 | } |
343 | }; |
344 | |
345 | } // namespace serialization |
346 | } // namespace clang |
347 | |
348 | #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H |
349 | |