1 | //===-- Serialization.cpp - Binary serialization of index data ------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "Serialization.h" |
10 | #include "Headers.h" |
11 | #include "RIFF.h" |
12 | #include "index/MemIndex.h" |
13 | #include "index/SymbolLocation.h" |
14 | #include "index/SymbolOrigin.h" |
15 | #include "index/dex/Dex.h" |
16 | #include "support/Logger.h" |
17 | #include "support/Trace.h" |
18 | #include "clang/Tooling/CompilationDatabase.h" |
19 | #include "llvm/ADT/StringRef.h" |
20 | #include "llvm/Support/Compiler.h" |
21 | #include "llvm/Support/Compression.h" |
22 | #include "llvm/Support/Endian.h" |
23 | #include "llvm/Support/Error.h" |
24 | #include "llvm/Support/raw_ostream.h" |
25 | #include <cstdint> |
26 | #include <vector> |
27 | |
28 | namespace clang { |
29 | namespace clangd { |
30 | namespace { |
31 | |
32 | // IO PRIMITIVES |
33 | // We use little-endian 32 bit ints, sometimes with variable-length encoding. |
34 | // |
35 | // Variable-length int encoding (varint) uses the bottom 7 bits of each byte |
36 | // to encode the number, and the top bit to indicate whether more bytes follow. |
37 | // e.g. 9a 2f means [0x1a and keep reading, 0x2f and stop]. |
38 | // This represents 0x1a | 0x2f<<7 = 6042. |
39 | // A 32-bit integer takes 1-5 bytes to encode; small numbers are more compact. |
40 | |
41 | // Reads binary data from a StringRef, and keeps track of position. |
42 | class Reader { |
43 | const char *Begin, *End; |
44 | bool Err = false; |
45 | |
46 | public: |
47 | Reader(llvm::StringRef Data) : Begin(Data.begin()), End(Data.end()) {} |
48 | // The "error" bit is set by reading past EOF or reading invalid data. |
49 | // When in an error state, reads may return zero values: callers should check. |
50 | bool err() const { return Err; } |
51 | // Did we read all the data, or encounter an error? |
52 | bool eof() const { return Begin == End || Err; } |
53 | // All the data we didn't read yet. |
54 | llvm::StringRef rest() const { return llvm::StringRef(Begin, End - Begin); } |
55 | |
56 | uint8_t consume8() { |
57 | if (LLVM_UNLIKELY(Begin == End)) { |
58 | Err = true; |
59 | return 0; |
60 | } |
61 | return *Begin++; |
62 | } |
63 | |
64 | uint32_t consume32() { |
65 | if (LLVM_UNLIKELY(Begin + 4 > End)) { |
66 | Err = true; |
67 | return 0; |
68 | } |
69 | auto Ret = llvm::support::endian::read32le(P: Begin); |
70 | Begin += 4; |
71 | return Ret; |
72 | } |
73 | |
74 | llvm::StringRef consume(int N) { |
75 | if (LLVM_UNLIKELY(Begin + N > End)) { |
76 | Err = true; |
77 | return llvm::StringRef(); |
78 | } |
79 | llvm::StringRef Ret(Begin, N); |
80 | Begin += N; |
81 | return Ret; |
82 | } |
83 | |
84 | uint32_t consumeVar() { |
85 | constexpr static uint8_t More = 1 << 7; |
86 | |
87 | // Use a 32 bit unsigned here to prevent promotion to signed int (unless int |
88 | // is wider than 32 bits). |
89 | uint32_t B = consume8(); |
90 | if (LLVM_LIKELY(!(B & More))) |
91 | return B; |
92 | uint32_t Val = B & ~More; |
93 | for (int Shift = 7; B & More && Shift < 32; Shift += 7) { |
94 | B = consume8(); |
95 | // 5th byte of a varint can only have lowest 4 bits set. |
96 | assert((Shift != 28 || B == (B & 0x0f)) && "Invalid varint encoding" ); |
97 | Val |= (B & ~More) << Shift; |
98 | } |
99 | return Val; |
100 | } |
101 | |
102 | llvm::StringRef consumeString(llvm::ArrayRef<llvm::StringRef> Strings) { |
103 | auto StringIndex = consumeVar(); |
104 | if (LLVM_UNLIKELY(StringIndex >= Strings.size())) { |
105 | Err = true; |
106 | return llvm::StringRef(); |
107 | } |
108 | return Strings[StringIndex]; |
109 | } |
110 | |
111 | SymbolID consumeID() { |
112 | llvm::StringRef Raw = consume(N: SymbolID::RawSize); // short if truncated. |
113 | return LLVM_UNLIKELY(err()) ? SymbolID() : SymbolID::fromRaw(Raw); |
114 | } |
115 | |
116 | // Read a varint (as consumeVar) and resize the container accordingly. |
117 | // If the size is invalid, return false and mark an error. |
118 | // (The caller should abort in this case). |
119 | template <typename T> [[nodiscard]] bool consumeSize(T &Container) { |
120 | auto Size = consumeVar(); |
121 | // Conservatively assume each element is at least one byte. |
122 | if (Size > (size_t)(End - Begin)) { |
123 | Err = true; |
124 | return false; |
125 | } |
126 | Container.resize(Size); |
127 | return true; |
128 | } |
129 | }; |
130 | |
131 | void write32(uint32_t I, llvm::raw_ostream &OS) { |
132 | char Buf[4]; |
133 | llvm::support::endian::write32le(P: Buf, V: I); |
134 | OS.write(Ptr: Buf, Size: sizeof(Buf)); |
135 | } |
136 | |
137 | void writeVar(uint32_t I, llvm::raw_ostream &OS) { |
138 | constexpr static uint8_t More = 1 << 7; |
139 | if (LLVM_LIKELY(I < 1 << 7)) { |
140 | OS.write(C: I); |
141 | return; |
142 | } |
143 | for (;;) { |
144 | OS.write(C: I | More); |
145 | I >>= 7; |
146 | if (I < 1 << 7) { |
147 | OS.write(C: I); |
148 | return; |
149 | } |
150 | } |
151 | } |
152 | |
153 | // STRING TABLE ENCODING |
154 | // Index data has many string fields, and many strings are identical. |
155 | // We store each string once, and refer to them by index. |
156 | // |
157 | // The string table's format is: |
158 | // - UncompressedSize : uint32 (or 0 for no compression) |
159 | // - CompressedData : byte[CompressedSize] |
160 | // |
161 | // CompressedData is a zlib-compressed byte[UncompressedSize]. |
162 | // It contains a sequence of null-terminated strings, e.g. "foo\0bar\0". |
163 | // These are sorted to improve compression. |
164 | |
165 | // Maps each string to a canonical representation. |
166 | // Strings remain owned externally (e.g. by SymbolSlab). |
167 | class StringTableOut { |
168 | llvm::DenseSet<llvm::StringRef> Unique; |
169 | std::vector<llvm::StringRef> Sorted; |
170 | // Since strings are interned, look up can be by pointer. |
171 | llvm::DenseMap<std::pair<const char *, size_t>, unsigned> Index; |
172 | |
173 | public: |
174 | StringTableOut() { |
175 | // Ensure there's at least one string in the table. |
176 | // Table size zero is reserved to indicate no compression. |
177 | Unique.insert(V: "" ); |
178 | } |
179 | // Add a string to the table. Overwrites S if an identical string exists. |
180 | void intern(llvm::StringRef &S) { S = *Unique.insert(V: S).first; }; |
181 | // Finalize the table and write it to OS. No more strings may be added. |
182 | void finalize(llvm::raw_ostream &OS) { |
183 | Sorted = {Unique.begin(), Unique.end()}; |
184 | llvm::sort(C&: Sorted); |
185 | for (unsigned I = 0; I < Sorted.size(); ++I) |
186 | Index.try_emplace(Key: {Sorted[I].data(), Sorted[I].size()}, Args&: I); |
187 | |
188 | std::string RawTable; |
189 | for (llvm::StringRef S : Sorted) { |
190 | RawTable.append(str: std::string(S)); |
191 | RawTable.push_back(c: 0); |
192 | } |
193 | if (llvm::compression::zlib::isAvailable()) { |
194 | llvm::SmallVector<uint8_t, 0> Compressed; |
195 | llvm::compression::zlib::compress(Input: llvm::arrayRefFromStringRef(Input: RawTable), |
196 | CompressedBuffer&: Compressed); |
197 | write32(I: RawTable.size(), OS); |
198 | OS << llvm::toStringRef(Input: Compressed); |
199 | } else { |
200 | write32(I: 0, OS); // No compression. |
201 | OS << RawTable; |
202 | } |
203 | } |
204 | // Get the ID of an string, which must be interned. Table must be finalized. |
205 | unsigned index(llvm::StringRef S) const { |
206 | assert(!Sorted.empty() && "table not finalized" ); |
207 | assert(Index.count({S.data(), S.size()}) && "string not interned" ); |
208 | return Index.find(Val: {S.data(), S.size()})->second; |
209 | } |
210 | }; |
211 | |
212 | struct StringTableIn { |
213 | llvm::BumpPtrAllocator Arena; |
214 | std::vector<llvm::StringRef> Strings; |
215 | }; |
216 | |
217 | llvm::Expected<StringTableIn> readStringTable(llvm::StringRef Data) { |
218 | Reader R(Data); |
219 | size_t UncompressedSize = R.consume32(); |
220 | if (R.err()) |
221 | return error(Fmt: "Truncated string table" ); |
222 | |
223 | llvm::StringRef Uncompressed; |
224 | llvm::SmallVector<uint8_t, 0> UncompressedStorage; |
225 | if (UncompressedSize == 0) // No compression |
226 | Uncompressed = R.rest(); |
227 | else if (llvm::compression::zlib::isAvailable()) { |
228 | // Don't allocate a massive buffer if UncompressedSize was corrupted |
229 | // This is effective for sharded index, but not big monolithic ones, as |
230 | // once compressed size reaches 4MB nothing can be ruled out. |
231 | // Theoretical max ratio from https://zlib.net/zlib_tech.html |
232 | constexpr int MaxCompressionRatio = 1032; |
233 | if (UncompressedSize / MaxCompressionRatio > R.rest().size()) |
234 | return error(Fmt: "Bad stri table: uncompress {0} -> {1} bytes is implausible" , |
235 | Vals: R.rest().size(), Vals&: UncompressedSize); |
236 | |
237 | if (llvm::Error E = llvm::compression::zlib::decompress( |
238 | Input: llvm::arrayRefFromStringRef(Input: R.rest()), Output&: UncompressedStorage, |
239 | UncompressedSize)) |
240 | return std::move(E); |
241 | Uncompressed = toStringRef(Input: UncompressedStorage); |
242 | } else |
243 | return error(Fmt: "Compressed string table, but zlib is unavailable" ); |
244 | |
245 | StringTableIn Table; |
246 | llvm::StringSaver Saver(Table.Arena); |
247 | R = Reader(Uncompressed); |
248 | for (Reader R(Uncompressed); !R.eof();) { |
249 | auto Len = R.rest().find(C: 0); |
250 | if (Len == llvm::StringRef::npos) |
251 | return error(Fmt: "Bad string table: not null terminated" ); |
252 | Table.Strings.push_back(x: Saver.save(S: R.consume(N: Len))); |
253 | R.consume8(); |
254 | } |
255 | if (R.err()) |
256 | return error(Fmt: "Truncated string table" ); |
257 | return std::move(Table); |
258 | } |
259 | |
260 | // SYMBOL ENCODING |
261 | // Each field of clangd::Symbol is encoded in turn (see implementation). |
262 | // - StringRef fields encode as varint (index into the string table) |
263 | // - enums encode as the underlying type |
264 | // - most numbers encode as varint |
265 | |
266 | void writeLocation(const SymbolLocation &Loc, const StringTableOut &Strings, |
267 | llvm::raw_ostream &OS) { |
268 | writeVar(I: Strings.index(S: Loc.FileURI), OS); |
269 | for (const auto &Endpoint : {Loc.Start, Loc.End}) { |
270 | writeVar(I: Endpoint.line(), OS); |
271 | writeVar(I: Endpoint.column(), OS); |
272 | } |
273 | } |
274 | |
275 | SymbolLocation readLocation(Reader &Data, |
276 | llvm::ArrayRef<llvm::StringRef> Strings) { |
277 | SymbolLocation Loc; |
278 | Loc.FileURI = Data.consumeString(Strings).data(); |
279 | for (auto *Endpoint : {&Loc.Start, &Loc.End}) { |
280 | Endpoint->setLine(Data.consumeVar()); |
281 | Endpoint->setColumn(Data.consumeVar()); |
282 | } |
283 | return Loc; |
284 | } |
285 | |
286 | IncludeGraphNode readIncludeGraphNode(Reader &Data, |
287 | llvm::ArrayRef<llvm::StringRef> Strings) { |
288 | IncludeGraphNode IGN; |
289 | IGN.Flags = static_cast<IncludeGraphNode::SourceFlag>(Data.consume8()); |
290 | IGN.URI = Data.consumeString(Strings); |
291 | llvm::StringRef Digest = Data.consume(N: IGN.Digest.size()); |
292 | std::copy(first: Digest.bytes_begin(), last: Digest.bytes_end(), result: IGN.Digest.begin()); |
293 | if (!Data.consumeSize(Container&: IGN.DirectIncludes)) |
294 | return IGN; |
295 | for (llvm::StringRef &Include : IGN.DirectIncludes) |
296 | Include = Data.consumeString(Strings); |
297 | return IGN; |
298 | } |
299 | |
300 | void writeIncludeGraphNode(const IncludeGraphNode &IGN, |
301 | const StringTableOut &Strings, |
302 | llvm::raw_ostream &OS) { |
303 | OS.write(C: static_cast<uint8_t>(IGN.Flags)); |
304 | writeVar(I: Strings.index(S: IGN.URI), OS); |
305 | llvm::StringRef Hash(reinterpret_cast<const char *>(IGN.Digest.data()), |
306 | IGN.Digest.size()); |
307 | OS << Hash; |
308 | writeVar(I: IGN.DirectIncludes.size(), OS); |
309 | for (llvm::StringRef Include : IGN.DirectIncludes) |
310 | writeVar(I: Strings.index(S: Include), OS); |
311 | } |
312 | |
313 | void writeSymbol(const Symbol &Sym, const StringTableOut &Strings, |
314 | llvm::raw_ostream &OS) { |
315 | OS << Sym.ID.raw(); // TODO: once we start writing xrefs and posting lists, |
316 | // symbol IDs should probably be in a string table. |
317 | OS.write(C: static_cast<uint8_t>(Sym.SymInfo.Kind)); |
318 | OS.write(C: static_cast<uint8_t>(Sym.SymInfo.Lang)); |
319 | writeVar(I: Strings.index(S: Sym.Name), OS); |
320 | writeVar(I: Strings.index(S: Sym.Scope), OS); |
321 | writeVar(I: Strings.index(S: Sym.TemplateSpecializationArgs), OS); |
322 | writeLocation(Loc: Sym.Definition, Strings, OS); |
323 | writeLocation(Loc: Sym.CanonicalDeclaration, Strings, OS); |
324 | writeVar(I: Sym.References, OS); |
325 | OS.write(C: static_cast<uint8_t>(Sym.Flags)); |
326 | writeVar(I: Strings.index(S: Sym.Signature), OS); |
327 | writeVar(I: Strings.index(S: Sym.CompletionSnippetSuffix), OS); |
328 | writeVar(I: Strings.index(S: Sym.Documentation), OS); |
329 | writeVar(I: Strings.index(S: Sym.ReturnType), OS); |
330 | writeVar(I: Strings.index(S: Sym.Type), OS); |
331 | |
332 | auto WriteInclude = [&](const Symbol::IncludeHeaderWithReferences &Include) { |
333 | writeVar(I: Strings.index(S: Include.IncludeHeader), OS); |
334 | writeVar(I: (Include.References << 2) | Include.SupportedDirectives, OS); |
335 | }; |
336 | writeVar(I: Sym.IncludeHeaders.size(), OS); |
337 | for (const auto &Include : Sym.IncludeHeaders) |
338 | WriteInclude(Include); |
339 | } |
340 | |
341 | Symbol readSymbol(Reader &Data, llvm::ArrayRef<llvm::StringRef> Strings, |
342 | SymbolOrigin Origin) { |
343 | Symbol Sym; |
344 | Sym.ID = Data.consumeID(); |
345 | Sym.SymInfo.Kind = static_cast<index::SymbolKind>(Data.consume8()); |
346 | Sym.SymInfo.Lang = static_cast<index::SymbolLanguage>(Data.consume8()); |
347 | Sym.Name = Data.consumeString(Strings); |
348 | Sym.Scope = Data.consumeString(Strings); |
349 | Sym.TemplateSpecializationArgs = Data.consumeString(Strings); |
350 | Sym.Definition = readLocation(Data, Strings); |
351 | Sym.CanonicalDeclaration = readLocation(Data, Strings); |
352 | Sym.References = Data.consumeVar(); |
353 | Sym.Flags = static_cast<Symbol::SymbolFlag>(Data.consume8()); |
354 | Sym.Origin = Origin; |
355 | Sym.Signature = Data.consumeString(Strings); |
356 | Sym.CompletionSnippetSuffix = Data.consumeString(Strings); |
357 | Sym.Documentation = Data.consumeString(Strings); |
358 | Sym.ReturnType = Data.consumeString(Strings); |
359 | Sym.Type = Data.consumeString(Strings); |
360 | if (!Data.consumeSize(Container&: Sym.IncludeHeaders)) |
361 | return Sym; |
362 | for (auto &I : Sym.IncludeHeaders) { |
363 | I.IncludeHeader = Data.consumeString(Strings); |
364 | uint32_t RefsWithDirectives = Data.consumeVar(); |
365 | I.References = RefsWithDirectives >> 2; |
366 | I.SupportedDirectives = RefsWithDirectives & 0x3; |
367 | } |
368 | return Sym; |
369 | } |
370 | |
371 | // REFS ENCODING |
372 | // A refs section has data grouped by Symbol. Each symbol has: |
373 | // - SymbolID: 8 bytes |
374 | // - NumRefs: varint |
375 | // - Ref[NumRefs] |
376 | // Fields of Ref are encoded in turn, see implementation. |
377 | |
378 | void writeRefs(const SymbolID &ID, llvm::ArrayRef<Ref> Refs, |
379 | const StringTableOut &Strings, llvm::raw_ostream &OS) { |
380 | OS << ID.raw(); |
381 | writeVar(I: Refs.size(), OS); |
382 | for (const auto &Ref : Refs) { |
383 | OS.write(C: static_cast<unsigned char>(Ref.Kind)); |
384 | writeLocation(Loc: Ref.Location, Strings, OS); |
385 | OS << Ref.Container.raw(); |
386 | } |
387 | } |
388 | |
389 | std::pair<SymbolID, std::vector<Ref>> |
390 | readRefs(Reader &Data, llvm::ArrayRef<llvm::StringRef> Strings) { |
391 | std::pair<SymbolID, std::vector<Ref>> Result; |
392 | Result.first = Data.consumeID(); |
393 | if (!Data.consumeSize(Container&: Result.second)) |
394 | return Result; |
395 | for (auto &Ref : Result.second) { |
396 | Ref.Kind = static_cast<RefKind>(Data.consume8()); |
397 | Ref.Location = readLocation(Data, Strings); |
398 | Ref.Container = Data.consumeID(); |
399 | } |
400 | return Result; |
401 | } |
402 | |
403 | // RELATIONS ENCODING |
404 | // A relations section is a flat list of relations. Each relation has: |
405 | // - SymbolID (subject): 8 bytes |
406 | // - relation kind (predicate): 1 byte |
407 | // - SymbolID (object): 8 bytes |
408 | // In the future, we might prefer a packed representation if the need arises. |
409 | |
410 | void writeRelation(const Relation &R, llvm::raw_ostream &OS) { |
411 | OS << R.Subject.raw(); |
412 | OS.write(C: static_cast<uint8_t>(R.Predicate)); |
413 | OS << R.Object.raw(); |
414 | } |
415 | |
416 | Relation readRelation(Reader &Data) { |
417 | SymbolID Subject = Data.consumeID(); |
418 | RelationKind Predicate = static_cast<RelationKind>(Data.consume8()); |
419 | SymbolID Object = Data.consumeID(); |
420 | return {.Subject: Subject, .Predicate: Predicate, .Object: Object}; |
421 | } |
422 | |
423 | struct InternedCompileCommand { |
424 | llvm::StringRef Directory; |
425 | std::vector<llvm::StringRef> CommandLine; |
426 | }; |
427 | |
428 | void writeCompileCommand(const InternedCompileCommand &Cmd, |
429 | const StringTableOut &Strings, |
430 | llvm::raw_ostream &CmdOS) { |
431 | writeVar(I: Strings.index(S: Cmd.Directory), OS&: CmdOS); |
432 | writeVar(I: Cmd.CommandLine.size(), OS&: CmdOS); |
433 | for (llvm::StringRef C : Cmd.CommandLine) |
434 | writeVar(I: Strings.index(S: C), OS&: CmdOS); |
435 | } |
436 | |
437 | InternedCompileCommand |
438 | readCompileCommand(Reader CmdReader, llvm::ArrayRef<llvm::StringRef> Strings) { |
439 | InternedCompileCommand Cmd; |
440 | Cmd.Directory = CmdReader.consumeString(Strings); |
441 | if (!CmdReader.consumeSize(Container&: Cmd.CommandLine)) |
442 | return Cmd; |
443 | for (llvm::StringRef &C : Cmd.CommandLine) |
444 | C = CmdReader.consumeString(Strings); |
445 | return Cmd; |
446 | } |
447 | |
448 | // FILE ENCODING |
449 | // A file is a RIFF chunk with type 'CdIx'. |
450 | // It contains the sections: |
451 | // - meta: version number |
452 | // - srcs: information related to include graph |
453 | // - stri: string table |
454 | // - symb: symbols |
455 | // - refs: references to symbols |
456 | |
457 | // The current versioning scheme is simple - non-current versions are rejected. |
458 | // If you make a breaking change, bump this version number to invalidate stored |
459 | // data. Later we may want to support some backward compatibility. |
460 | constexpr static uint32_t Version = 19; |
461 | |
462 | llvm::Expected<IndexFileIn> readRIFF(llvm::StringRef Data, |
463 | SymbolOrigin Origin) { |
464 | auto RIFF = riff::readFile(Stream: Data); |
465 | if (!RIFF) |
466 | return RIFF.takeError(); |
467 | if (RIFF->Type != riff::fourCC(Literal: "CdIx" )) |
468 | return error(Fmt: "wrong RIFF filetype: {0}" , Vals: riff::fourCCStr(Data: RIFF->Type)); |
469 | llvm::StringMap<llvm::StringRef> Chunks; |
470 | for (const auto &Chunk : RIFF->Chunks) |
471 | Chunks.try_emplace(Key: llvm::StringRef(Chunk.ID.data(), Chunk.ID.size()), |
472 | Args: Chunk.Data); |
473 | |
474 | if (!Chunks.count(Key: "meta" )) |
475 | return error(Fmt: "missing meta chunk" ); |
476 | Reader Meta(Chunks.lookup(Key: "meta" )); |
477 | auto SeenVersion = Meta.consume32(); |
478 | if (SeenVersion != Version) |
479 | return error(Fmt: "wrong version: want {0}, got {1}" , Vals: Version, Vals&: SeenVersion); |
480 | |
481 | // meta chunk is checked above, as we prefer the "version mismatch" error. |
482 | for (llvm::StringRef RequiredChunk : {"stri" }) |
483 | if (!Chunks.count(Key: RequiredChunk)) |
484 | return error(Fmt: "missing required chunk {0}" , Vals&: RequiredChunk); |
485 | |
486 | auto Strings = readStringTable(Data: Chunks.lookup(Key: "stri" )); |
487 | if (!Strings) |
488 | return Strings.takeError(); |
489 | |
490 | IndexFileIn Result; |
491 | if (Chunks.count(Key: "srcs" )) { |
492 | Reader SrcsReader(Chunks.lookup(Key: "srcs" )); |
493 | Result.Sources.emplace(); |
494 | while (!SrcsReader.eof()) { |
495 | auto IGN = readIncludeGraphNode(Data&: SrcsReader, Strings: Strings->Strings); |
496 | auto Entry = Result.Sources->try_emplace(Key: IGN.URI).first; |
497 | Entry->getValue() = std::move(IGN); |
498 | // We change all the strings inside the structure to point at the keys in |
499 | // the map, since it is the only copy of the string that's going to live. |
500 | Entry->getValue().URI = Entry->getKey(); |
501 | for (auto &Include : Entry->getValue().DirectIncludes) |
502 | Include = Result.Sources->try_emplace(Key: Include).first->getKey(); |
503 | } |
504 | if (SrcsReader.err()) |
505 | return error(Fmt: "malformed or truncated include uri" ); |
506 | } |
507 | |
508 | if (Chunks.count(Key: "symb" )) { |
509 | Reader SymbolReader(Chunks.lookup(Key: "symb" )); |
510 | SymbolSlab::Builder Symbols; |
511 | while (!SymbolReader.eof()) |
512 | Symbols.insert(S: readSymbol(Data&: SymbolReader, Strings: Strings->Strings, Origin)); |
513 | if (SymbolReader.err()) |
514 | return error(Fmt: "malformed or truncated symbol" ); |
515 | Result.Symbols = std::move(Symbols).build(); |
516 | } |
517 | if (Chunks.count(Key: "refs" )) { |
518 | Reader RefsReader(Chunks.lookup(Key: "refs" )); |
519 | RefSlab::Builder Refs; |
520 | while (!RefsReader.eof()) { |
521 | auto RefsBundle = readRefs(Data&: RefsReader, Strings: Strings->Strings); |
522 | for (const auto &Ref : RefsBundle.second) // FIXME: bulk insert? |
523 | Refs.insert(ID: RefsBundle.first, S: Ref); |
524 | } |
525 | if (RefsReader.err()) |
526 | return error(Fmt: "malformed or truncated refs" ); |
527 | Result.Refs = std::move(Refs).build(); |
528 | } |
529 | if (Chunks.count(Key: "rela" )) { |
530 | Reader RelationsReader(Chunks.lookup(Key: "rela" )); |
531 | RelationSlab::Builder Relations; |
532 | while (!RelationsReader.eof()) |
533 | Relations.insert(R: readRelation(Data&: RelationsReader)); |
534 | if (RelationsReader.err()) |
535 | return error(Fmt: "malformed or truncated relations" ); |
536 | Result.Relations = std::move(Relations).build(); |
537 | } |
538 | if (Chunks.count(Key: "cmdl" )) { |
539 | Reader CmdReader(Chunks.lookup(Key: "cmdl" )); |
540 | InternedCompileCommand Cmd = |
541 | readCompileCommand(CmdReader, Strings: Strings->Strings); |
542 | if (CmdReader.err()) |
543 | return error(Fmt: "malformed or truncated commandline section" ); |
544 | Result.Cmd.emplace(); |
545 | Result.Cmd->Directory = std::string(Cmd.Directory); |
546 | Result.Cmd->CommandLine.reserve(n: Cmd.CommandLine.size()); |
547 | for (llvm::StringRef C : Cmd.CommandLine) |
548 | Result.Cmd->CommandLine.emplace_back(args&: C); |
549 | } |
550 | return std::move(Result); |
551 | } |
552 | |
553 | template <class Callback> |
554 | void visitStrings(IncludeGraphNode &IGN, const Callback &CB) { |
555 | CB(IGN.URI); |
556 | for (llvm::StringRef &Include : IGN.DirectIncludes) |
557 | CB(Include); |
558 | } |
559 | |
560 | void writeRIFF(const IndexFileOut &Data, llvm::raw_ostream &OS) { |
561 | assert(Data.Symbols && "An index file without symbols makes no sense!" ); |
562 | riff::File RIFF; |
563 | RIFF.Type = riff::fourCC(Literal: "CdIx" ); |
564 | |
565 | llvm::SmallString<4> Meta; |
566 | { |
567 | llvm::raw_svector_ostream MetaOS(Meta); |
568 | write32(I: Version, OS&: MetaOS); |
569 | } |
570 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "meta" ), .Data: Meta}); |
571 | |
572 | StringTableOut Strings; |
573 | std::vector<Symbol> Symbols; |
574 | for (const auto &Sym : *Data.Symbols) { |
575 | Symbols.emplace_back(args: Sym); |
576 | visitStrings(S&: Symbols.back(), |
577 | CB: [&](llvm::StringRef &S) { Strings.intern(S); }); |
578 | } |
579 | std::vector<IncludeGraphNode> Sources; |
580 | if (Data.Sources) |
581 | for (const auto &Source : *Data.Sources) { |
582 | Sources.push_back(x: Source.getValue()); |
583 | visitStrings(IGN&: Sources.back(), |
584 | CB: [&](llvm::StringRef &S) { Strings.intern(S); }); |
585 | } |
586 | |
587 | std::vector<std::pair<SymbolID, std::vector<Ref>>> Refs; |
588 | if (Data.Refs) { |
589 | for (const auto &Sym : *Data.Refs) { |
590 | Refs.emplace_back(args: Sym); |
591 | for (auto &Ref : Refs.back().second) { |
592 | llvm::StringRef File = Ref.Location.FileURI; |
593 | Strings.intern(S&: File); |
594 | Ref.Location.FileURI = File.data(); |
595 | } |
596 | } |
597 | } |
598 | |
599 | std::vector<Relation> Relations; |
600 | if (Data.Relations) { |
601 | for (const auto &Relation : *Data.Relations) { |
602 | Relations.emplace_back(args: Relation); |
603 | // No strings to be interned in relations. |
604 | } |
605 | } |
606 | |
607 | InternedCompileCommand InternedCmd; |
608 | if (Data.Cmd) { |
609 | InternedCmd.CommandLine.reserve(n: Data.Cmd->CommandLine.size()); |
610 | InternedCmd.Directory = Data.Cmd->Directory; |
611 | Strings.intern(S&: InternedCmd.Directory); |
612 | for (llvm::StringRef C : Data.Cmd->CommandLine) { |
613 | InternedCmd.CommandLine.emplace_back(args&: C); |
614 | Strings.intern(S&: InternedCmd.CommandLine.back()); |
615 | } |
616 | } |
617 | |
618 | std::string StringSection; |
619 | { |
620 | llvm::raw_string_ostream StringOS(StringSection); |
621 | Strings.finalize(OS&: StringOS); |
622 | } |
623 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "stri" ), .Data: StringSection}); |
624 | |
625 | std::string SymbolSection; |
626 | { |
627 | llvm::raw_string_ostream SymbolOS(SymbolSection); |
628 | for (const auto &Sym : Symbols) |
629 | writeSymbol(Sym, Strings, OS&: SymbolOS); |
630 | } |
631 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "symb" ), .Data: SymbolSection}); |
632 | |
633 | std::string RefsSection; |
634 | if (Data.Refs) { |
635 | { |
636 | llvm::raw_string_ostream RefsOS(RefsSection); |
637 | for (const auto &Sym : Refs) |
638 | writeRefs(ID: Sym.first, Refs: Sym.second, Strings, OS&: RefsOS); |
639 | } |
640 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "refs" ), .Data: RefsSection}); |
641 | } |
642 | |
643 | std::string RelationSection; |
644 | if (Data.Relations) { |
645 | { |
646 | llvm::raw_string_ostream RelationOS{RelationSection}; |
647 | for (const auto &Relation : Relations) |
648 | writeRelation(R: Relation, OS&: RelationOS); |
649 | } |
650 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "rela" ), .Data: RelationSection}); |
651 | } |
652 | |
653 | std::string SrcsSection; |
654 | { |
655 | { |
656 | llvm::raw_string_ostream SrcsOS(SrcsSection); |
657 | for (const auto &SF : Sources) |
658 | writeIncludeGraphNode(IGN: SF, Strings, OS&: SrcsOS); |
659 | } |
660 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "srcs" ), .Data: SrcsSection}); |
661 | } |
662 | |
663 | std::string CmdlSection; |
664 | if (Data.Cmd) { |
665 | { |
666 | llvm::raw_string_ostream CmdOS(CmdlSection); |
667 | writeCompileCommand(Cmd: InternedCmd, Strings, CmdOS); |
668 | } |
669 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "cmdl" ), .Data: CmdlSection}); |
670 | } |
671 | |
672 | OS << RIFF; |
673 | } |
674 | |
675 | } // namespace |
676 | |
677 | // Defined in YAMLSerialization.cpp. |
678 | void writeYAML(const IndexFileOut &, llvm::raw_ostream &); |
679 | llvm::Expected<IndexFileIn> readYAML(llvm::StringRef, SymbolOrigin Origin); |
680 | |
681 | llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const IndexFileOut &O) { |
682 | switch (O.Format) { |
683 | case IndexFileFormat::RIFF: |
684 | writeRIFF(Data: O, OS); |
685 | break; |
686 | case IndexFileFormat::YAML: |
687 | writeYAML(O, OS); |
688 | break; |
689 | } |
690 | return OS; |
691 | } |
692 | |
693 | llvm::Expected<IndexFileIn> readIndexFile(llvm::StringRef Data, |
694 | SymbolOrigin Origin) { |
695 | if (Data.starts_with(Prefix: "RIFF" )) { |
696 | return readRIFF(Data, Origin); |
697 | } |
698 | if (auto YAMLContents = readYAML(Data, Origin)) { |
699 | return std::move(*YAMLContents); |
700 | } else { |
701 | return error(Fmt: "Not a RIFF file and failed to parse as YAML: {0}" , |
702 | Vals: YAMLContents.takeError()); |
703 | } |
704 | } |
705 | |
706 | std::unique_ptr<SymbolIndex> loadIndex(llvm::StringRef SymbolFilename, |
707 | SymbolOrigin Origin, bool UseDex) { |
708 | trace::Span OverallTracer("LoadIndex" ); |
709 | auto Buffer = llvm::MemoryBuffer::getFile(Filename: SymbolFilename); |
710 | if (!Buffer) { |
711 | elog(Fmt: "Can't open {0}: {1}" , Vals&: SymbolFilename, Vals: Buffer.getError().message()); |
712 | return nullptr; |
713 | } |
714 | |
715 | SymbolSlab Symbols; |
716 | RefSlab Refs; |
717 | RelationSlab Relations; |
718 | { |
719 | trace::Span Tracer("ParseIndex" ); |
720 | if (auto I = readIndexFile(Data: Buffer->get()->getBuffer(), Origin)) { |
721 | if (I->Symbols) |
722 | Symbols = std::move(*I->Symbols); |
723 | if (I->Refs) |
724 | Refs = std::move(*I->Refs); |
725 | if (I->Relations) |
726 | Relations = std::move(*I->Relations); |
727 | } else { |
728 | elog(Fmt: "Bad index file: {0}" , Vals: I.takeError()); |
729 | return nullptr; |
730 | } |
731 | } |
732 | |
733 | size_t NumSym = Symbols.size(); |
734 | size_t NumRefs = Refs.numRefs(); |
735 | size_t NumRelations = Relations.size(); |
736 | |
737 | trace::Span Tracer("BuildIndex" ); |
738 | auto Index = UseDex ? dex::Dex::build(std::move(Symbols), std::move(Refs), |
739 | std::move(Relations)) |
740 | : MemIndex::build(Symbols: std::move(Symbols), Refs: std::move(Refs), |
741 | Relations: std::move(Relations)); |
742 | vlog(Fmt: "Loaded {0} from {1} with estimated memory usage {2} bytes\n" |
743 | " - number of symbols: {3}\n" |
744 | " - number of refs: {4}\n" |
745 | " - number of relations: {5}" , |
746 | Vals: UseDex ? "Dex" : "MemIndex" , Vals&: SymbolFilename, |
747 | Vals: Index->estimateMemoryUsage(), Vals&: NumSym, Vals&: NumRefs, Vals&: NumRelations); |
748 | return Index; |
749 | } |
750 | |
751 | } // namespace clangd |
752 | } // namespace clang |
753 | |