| 1 | //===-- Serialization.cpp - Binary serialization of index data ------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "Serialization.h" |
| 10 | #include "Headers.h" |
| 11 | #include "RIFF.h" |
| 12 | #include "index/MemIndex.h" |
| 13 | #include "index/SymbolLocation.h" |
| 14 | #include "index/SymbolOrigin.h" |
| 15 | #include "index/dex/Dex.h" |
| 16 | #include "support/Logger.h" |
| 17 | #include "support/Trace.h" |
| 18 | #include "clang/Tooling/CompilationDatabase.h" |
| 19 | #include "llvm/ADT/StringRef.h" |
| 20 | #include "llvm/Support/Compiler.h" |
| 21 | #include "llvm/Support/Compression.h" |
| 22 | #include "llvm/Support/Endian.h" |
| 23 | #include "llvm/Support/Error.h" |
| 24 | #include "llvm/Support/raw_ostream.h" |
| 25 | #include <cstdint> |
| 26 | #include <vector> |
| 27 | |
| 28 | namespace clang { |
| 29 | namespace clangd { |
| 30 | namespace { |
| 31 | |
| 32 | // IO PRIMITIVES |
| 33 | // We use little-endian 32 bit ints, sometimes with variable-length encoding. |
| 34 | // |
| 35 | // Variable-length int encoding (varint) uses the bottom 7 bits of each byte |
| 36 | // to encode the number, and the top bit to indicate whether more bytes follow. |
| 37 | // e.g. 9a 2f means [0x1a and keep reading, 0x2f and stop]. |
| 38 | // This represents 0x1a | 0x2f<<7 = 6042. |
| 39 | // A 32-bit integer takes 1-5 bytes to encode; small numbers are more compact. |
| 40 | |
| 41 | // Reads binary data from a StringRef, and keeps track of position. |
| 42 | class Reader { |
| 43 | const char *Begin, *End; |
| 44 | bool Err = false; |
| 45 | |
| 46 | public: |
| 47 | Reader(llvm::StringRef Data) : Begin(Data.begin()), End(Data.end()) {} |
| 48 | // The "error" bit is set by reading past EOF or reading invalid data. |
| 49 | // When in an error state, reads may return zero values: callers should check. |
| 50 | bool err() const { return Err; } |
| 51 | // Did we read all the data, or encounter an error? |
| 52 | bool eof() const { return Begin == End || Err; } |
| 53 | // All the data we didn't read yet. |
| 54 | llvm::StringRef rest() const { return llvm::StringRef(Begin, End - Begin); } |
| 55 | |
| 56 | uint8_t consume8() { |
| 57 | if (LLVM_UNLIKELY(Begin == End)) { |
| 58 | Err = true; |
| 59 | return 0; |
| 60 | } |
| 61 | return *Begin++; |
| 62 | } |
| 63 | |
| 64 | uint32_t consume32() { |
| 65 | if (LLVM_UNLIKELY(Begin + 4 > End)) { |
| 66 | Err = true; |
| 67 | return 0; |
| 68 | } |
| 69 | auto Ret = llvm::support::endian::read32le(P: Begin); |
| 70 | Begin += 4; |
| 71 | return Ret; |
| 72 | } |
| 73 | |
| 74 | llvm::StringRef consume(int N) { |
| 75 | if (LLVM_UNLIKELY(Begin + N > End)) { |
| 76 | Err = true; |
| 77 | return llvm::StringRef(); |
| 78 | } |
| 79 | llvm::StringRef Ret(Begin, N); |
| 80 | Begin += N; |
| 81 | return Ret; |
| 82 | } |
| 83 | |
| 84 | uint32_t consumeVar() { |
| 85 | constexpr static uint8_t More = 1 << 7; |
| 86 | |
| 87 | // Use a 32 bit unsigned here to prevent promotion to signed int (unless int |
| 88 | // is wider than 32 bits). |
| 89 | uint32_t B = consume8(); |
| 90 | if (LLVM_LIKELY(!(B & More))) |
| 91 | return B; |
| 92 | uint32_t Val = B & ~More; |
| 93 | for (int Shift = 7; B & More && Shift < 32; Shift += 7) { |
| 94 | B = consume8(); |
| 95 | // 5th byte of a varint can only have lowest 4 bits set. |
| 96 | assert((Shift != 28 || B == (B & 0x0f)) && "Invalid varint encoding" ); |
| 97 | Val |= (B & ~More) << Shift; |
| 98 | } |
| 99 | return Val; |
| 100 | } |
| 101 | |
| 102 | llvm::StringRef consumeString(llvm::ArrayRef<llvm::StringRef> Strings) { |
| 103 | auto StringIndex = consumeVar(); |
| 104 | if (LLVM_UNLIKELY(StringIndex >= Strings.size())) { |
| 105 | Err = true; |
| 106 | return llvm::StringRef(); |
| 107 | } |
| 108 | return Strings[StringIndex]; |
| 109 | } |
| 110 | |
| 111 | SymbolID consumeID() { |
| 112 | llvm::StringRef Raw = consume(N: SymbolID::RawSize); // short if truncated. |
| 113 | return LLVM_UNLIKELY(err()) ? SymbolID() : SymbolID::fromRaw(Raw); |
| 114 | } |
| 115 | |
| 116 | // Read a varint (as consumeVar) and resize the container accordingly. |
| 117 | // If the size is invalid, return false and mark an error. |
| 118 | // (The caller should abort in this case). |
| 119 | template <typename T> [[nodiscard]] bool consumeSize(T &Container) { |
| 120 | auto Size = consumeVar(); |
| 121 | // Conservatively assume each element is at least one byte. |
| 122 | if (Size > (size_t)(End - Begin)) { |
| 123 | Err = true; |
| 124 | return false; |
| 125 | } |
| 126 | Container.resize(Size); |
| 127 | return true; |
| 128 | } |
| 129 | }; |
| 130 | |
| 131 | void write32(uint32_t I, llvm::raw_ostream &OS) { |
| 132 | char Buf[4]; |
| 133 | llvm::support::endian::write32le(P: Buf, V: I); |
| 134 | OS.write(Ptr: Buf, Size: sizeof(Buf)); |
| 135 | } |
| 136 | |
| 137 | void writeVar(uint32_t I, llvm::raw_ostream &OS) { |
| 138 | constexpr static uint8_t More = 1 << 7; |
| 139 | if (LLVM_LIKELY(I < 1 << 7)) { |
| 140 | OS.write(C: I); |
| 141 | return; |
| 142 | } |
| 143 | for (;;) { |
| 144 | OS.write(C: I | More); |
| 145 | I >>= 7; |
| 146 | if (I < 1 << 7) { |
| 147 | OS.write(C: I); |
| 148 | return; |
| 149 | } |
| 150 | } |
| 151 | } |
| 152 | |
| 153 | // STRING TABLE ENCODING |
| 154 | // Index data has many string fields, and many strings are identical. |
| 155 | // We store each string once, and refer to them by index. |
| 156 | // |
| 157 | // The string table's format is: |
| 158 | // - UncompressedSize : uint32 (or 0 for no compression) |
| 159 | // - CompressedData : byte[CompressedSize] |
| 160 | // |
| 161 | // CompressedData is a zlib-compressed byte[UncompressedSize]. |
| 162 | // It contains a sequence of null-terminated strings, e.g. "foo\0bar\0". |
| 163 | // These are sorted to improve compression. |
| 164 | |
| 165 | // Maps each string to a canonical representation. |
| 166 | // Strings remain owned externally (e.g. by SymbolSlab). |
| 167 | class StringTableOut { |
| 168 | llvm::DenseSet<llvm::StringRef> Unique; |
| 169 | std::vector<llvm::StringRef> Sorted; |
| 170 | // Since strings are interned, look up can be by pointer. |
| 171 | llvm::DenseMap<std::pair<const char *, size_t>, unsigned> Index; |
| 172 | |
| 173 | public: |
| 174 | StringTableOut() { |
| 175 | // Ensure there's at least one string in the table. |
| 176 | // Table size zero is reserved to indicate no compression. |
| 177 | Unique.insert(V: "" ); |
| 178 | } |
| 179 | // Add a string to the table. Overwrites S if an identical string exists. |
| 180 | void intern(llvm::StringRef &S) { S = *Unique.insert(V: S).first; }; |
| 181 | // Finalize the table and write it to OS. No more strings may be added. |
| 182 | void finalize(llvm::raw_ostream &OS) { |
| 183 | Sorted = {Unique.begin(), Unique.end()}; |
| 184 | llvm::sort(C&: Sorted); |
| 185 | for (unsigned I = 0; I < Sorted.size(); ++I) |
| 186 | Index.try_emplace(Key: {Sorted[I].data(), Sorted[I].size()}, Args&: I); |
| 187 | |
| 188 | std::string RawTable; |
| 189 | for (llvm::StringRef S : Sorted) { |
| 190 | RawTable.append(str: std::string(S)); |
| 191 | RawTable.push_back(c: 0); |
| 192 | } |
| 193 | if (llvm::compression::zlib::isAvailable()) { |
| 194 | llvm::SmallVector<uint8_t, 0> Compressed; |
| 195 | llvm::compression::zlib::compress(Input: llvm::arrayRefFromStringRef(Input: RawTable), |
| 196 | CompressedBuffer&: Compressed); |
| 197 | write32(I: RawTable.size(), OS); |
| 198 | OS << llvm::toStringRef(Input: Compressed); |
| 199 | } else { |
| 200 | write32(I: 0, OS); // No compression. |
| 201 | OS << RawTable; |
| 202 | } |
| 203 | } |
| 204 | // Get the ID of an string, which must be interned. Table must be finalized. |
| 205 | unsigned index(llvm::StringRef S) const { |
| 206 | assert(!Sorted.empty() && "table not finalized" ); |
| 207 | assert(Index.count({S.data(), S.size()}) && "string not interned" ); |
| 208 | return Index.find(Val: {S.data(), S.size()})->second; |
| 209 | } |
| 210 | }; |
| 211 | |
| 212 | struct StringTableIn { |
| 213 | llvm::BumpPtrAllocator Arena; |
| 214 | std::vector<llvm::StringRef> Strings; |
| 215 | }; |
| 216 | |
| 217 | llvm::Expected<StringTableIn> readStringTable(llvm::StringRef Data) { |
| 218 | Reader R(Data); |
| 219 | size_t UncompressedSize = R.consume32(); |
| 220 | if (R.err()) |
| 221 | return error(Fmt: "Truncated string table" ); |
| 222 | |
| 223 | llvm::StringRef Uncompressed; |
| 224 | llvm::SmallVector<uint8_t, 0> UncompressedStorage; |
| 225 | if (UncompressedSize == 0) // No compression |
| 226 | Uncompressed = R.rest(); |
| 227 | else if (llvm::compression::zlib::isAvailable()) { |
| 228 | // Don't allocate a massive buffer if UncompressedSize was corrupted |
| 229 | // This is effective for sharded index, but not big monolithic ones, as |
| 230 | // once compressed size reaches 4MB nothing can be ruled out. |
| 231 | // Theoretical max ratio from https://zlib.net/zlib_tech.html |
| 232 | constexpr int MaxCompressionRatio = 1032; |
| 233 | if (UncompressedSize / MaxCompressionRatio > R.rest().size()) |
| 234 | return error(Fmt: "Bad stri table: uncompress {0} -> {1} bytes is implausible" , |
| 235 | Vals: R.rest().size(), Vals&: UncompressedSize); |
| 236 | |
| 237 | if (llvm::Error E = llvm::compression::zlib::decompress( |
| 238 | Input: llvm::arrayRefFromStringRef(Input: R.rest()), Output&: UncompressedStorage, |
| 239 | UncompressedSize)) |
| 240 | return std::move(E); |
| 241 | Uncompressed = toStringRef(Input: UncompressedStorage); |
| 242 | } else |
| 243 | return error(Fmt: "Compressed string table, but zlib is unavailable" ); |
| 244 | |
| 245 | StringTableIn Table; |
| 246 | llvm::StringSaver Saver(Table.Arena); |
| 247 | R = Reader(Uncompressed); |
| 248 | for (Reader R(Uncompressed); !R.eof();) { |
| 249 | auto Len = R.rest().find(C: 0); |
| 250 | if (Len == llvm::StringRef::npos) |
| 251 | return error(Fmt: "Bad string table: not null terminated" ); |
| 252 | Table.Strings.push_back(x: Saver.save(S: R.consume(N: Len))); |
| 253 | R.consume8(); |
| 254 | } |
| 255 | if (R.err()) |
| 256 | return error(Fmt: "Truncated string table" ); |
| 257 | return std::move(Table); |
| 258 | } |
| 259 | |
| 260 | // SYMBOL ENCODING |
| 261 | // Each field of clangd::Symbol is encoded in turn (see implementation). |
| 262 | // - StringRef fields encode as varint (index into the string table) |
| 263 | // - enums encode as the underlying type |
| 264 | // - most numbers encode as varint |
| 265 | |
| 266 | void writeLocation(const SymbolLocation &Loc, const StringTableOut &Strings, |
| 267 | llvm::raw_ostream &OS) { |
| 268 | writeVar(I: Strings.index(S: Loc.FileURI), OS); |
| 269 | for (const auto &Endpoint : {Loc.Start, Loc.End}) { |
| 270 | writeVar(I: Endpoint.line(), OS); |
| 271 | writeVar(I: Endpoint.column(), OS); |
| 272 | } |
| 273 | } |
| 274 | |
| 275 | SymbolLocation readLocation(Reader &Data, |
| 276 | llvm::ArrayRef<llvm::StringRef> Strings) { |
| 277 | SymbolLocation Loc; |
| 278 | Loc.FileURI = Data.consumeString(Strings).data(); |
| 279 | for (auto *Endpoint : {&Loc.Start, &Loc.End}) { |
| 280 | Endpoint->setLine(Data.consumeVar()); |
| 281 | Endpoint->setColumn(Data.consumeVar()); |
| 282 | } |
| 283 | return Loc; |
| 284 | } |
| 285 | |
| 286 | IncludeGraphNode readIncludeGraphNode(Reader &Data, |
| 287 | llvm::ArrayRef<llvm::StringRef> Strings) { |
| 288 | IncludeGraphNode IGN; |
| 289 | IGN.Flags = static_cast<IncludeGraphNode::SourceFlag>(Data.consume8()); |
| 290 | IGN.URI = Data.consumeString(Strings); |
| 291 | llvm::StringRef Digest = Data.consume(N: IGN.Digest.size()); |
| 292 | std::copy(first: Digest.bytes_begin(), last: Digest.bytes_end(), result: IGN.Digest.begin()); |
| 293 | if (!Data.consumeSize(Container&: IGN.DirectIncludes)) |
| 294 | return IGN; |
| 295 | for (llvm::StringRef &Include : IGN.DirectIncludes) |
| 296 | Include = Data.consumeString(Strings); |
| 297 | return IGN; |
| 298 | } |
| 299 | |
| 300 | void writeIncludeGraphNode(const IncludeGraphNode &IGN, |
| 301 | const StringTableOut &Strings, |
| 302 | llvm::raw_ostream &OS) { |
| 303 | OS.write(C: static_cast<uint8_t>(IGN.Flags)); |
| 304 | writeVar(I: Strings.index(S: IGN.URI), OS); |
| 305 | llvm::StringRef Hash(reinterpret_cast<const char *>(IGN.Digest.data()), |
| 306 | IGN.Digest.size()); |
| 307 | OS << Hash; |
| 308 | writeVar(I: IGN.DirectIncludes.size(), OS); |
| 309 | for (llvm::StringRef Include : IGN.DirectIncludes) |
| 310 | writeVar(I: Strings.index(S: Include), OS); |
| 311 | } |
| 312 | |
| 313 | void writeSymbol(const Symbol &Sym, const StringTableOut &Strings, |
| 314 | llvm::raw_ostream &OS) { |
| 315 | OS << Sym.ID.raw(); // TODO: once we start writing xrefs and posting lists, |
| 316 | // symbol IDs should probably be in a string table. |
| 317 | OS.write(C: static_cast<uint8_t>(Sym.SymInfo.Kind)); |
| 318 | OS.write(C: static_cast<uint8_t>(Sym.SymInfo.Lang)); |
| 319 | writeVar(I: Strings.index(S: Sym.Name), OS); |
| 320 | writeVar(I: Strings.index(S: Sym.Scope), OS); |
| 321 | writeVar(I: Strings.index(S: Sym.TemplateSpecializationArgs), OS); |
| 322 | writeLocation(Loc: Sym.Definition, Strings, OS); |
| 323 | writeLocation(Loc: Sym.CanonicalDeclaration, Strings, OS); |
| 324 | writeVar(I: Sym.References, OS); |
| 325 | OS.write(C: static_cast<uint8_t>(Sym.Flags)); |
| 326 | writeVar(I: Strings.index(S: Sym.Signature), OS); |
| 327 | writeVar(I: Strings.index(S: Sym.CompletionSnippetSuffix), OS); |
| 328 | writeVar(I: Strings.index(S: Sym.Documentation), OS); |
| 329 | writeVar(I: Strings.index(S: Sym.ReturnType), OS); |
| 330 | writeVar(I: Strings.index(S: Sym.Type), OS); |
| 331 | |
| 332 | auto WriteInclude = [&](const Symbol::IncludeHeaderWithReferences &Include) { |
| 333 | writeVar(I: Strings.index(S: Include.IncludeHeader), OS); |
| 334 | writeVar(I: (Include.References << 2) | Include.SupportedDirectives, OS); |
| 335 | }; |
| 336 | writeVar(I: Sym.IncludeHeaders.size(), OS); |
| 337 | for (const auto &Include : Sym.IncludeHeaders) |
| 338 | WriteInclude(Include); |
| 339 | } |
| 340 | |
| 341 | Symbol readSymbol(Reader &Data, llvm::ArrayRef<llvm::StringRef> Strings, |
| 342 | SymbolOrigin Origin) { |
| 343 | Symbol Sym; |
| 344 | Sym.ID = Data.consumeID(); |
| 345 | Sym.SymInfo.Kind = static_cast<index::SymbolKind>(Data.consume8()); |
| 346 | Sym.SymInfo.Lang = static_cast<index::SymbolLanguage>(Data.consume8()); |
| 347 | Sym.Name = Data.consumeString(Strings); |
| 348 | Sym.Scope = Data.consumeString(Strings); |
| 349 | Sym.TemplateSpecializationArgs = Data.consumeString(Strings); |
| 350 | Sym.Definition = readLocation(Data, Strings); |
| 351 | Sym.CanonicalDeclaration = readLocation(Data, Strings); |
| 352 | Sym.References = Data.consumeVar(); |
| 353 | Sym.Flags = static_cast<Symbol::SymbolFlag>(Data.consume8()); |
| 354 | Sym.Origin = Origin; |
| 355 | Sym.Signature = Data.consumeString(Strings); |
| 356 | Sym.CompletionSnippetSuffix = Data.consumeString(Strings); |
| 357 | Sym.Documentation = Data.consumeString(Strings); |
| 358 | Sym.ReturnType = Data.consumeString(Strings); |
| 359 | Sym.Type = Data.consumeString(Strings); |
| 360 | if (!Data.consumeSize(Container&: Sym.IncludeHeaders)) |
| 361 | return Sym; |
| 362 | for (auto &I : Sym.IncludeHeaders) { |
| 363 | I.IncludeHeader = Data.consumeString(Strings); |
| 364 | uint32_t RefsWithDirectives = Data.consumeVar(); |
| 365 | I.References = RefsWithDirectives >> 2; |
| 366 | I.SupportedDirectives = RefsWithDirectives & 0x3; |
| 367 | } |
| 368 | return Sym; |
| 369 | } |
| 370 | |
| 371 | // REFS ENCODING |
| 372 | // A refs section has data grouped by Symbol. Each symbol has: |
| 373 | // - SymbolID: 8 bytes |
| 374 | // - NumRefs: varint |
| 375 | // - Ref[NumRefs] |
| 376 | // Fields of Ref are encoded in turn, see implementation. |
| 377 | |
| 378 | void writeRefs(const SymbolID &ID, llvm::ArrayRef<Ref> Refs, |
| 379 | const StringTableOut &Strings, llvm::raw_ostream &OS) { |
| 380 | OS << ID.raw(); |
| 381 | writeVar(I: Refs.size(), OS); |
| 382 | for (const auto &Ref : Refs) { |
| 383 | OS.write(C: static_cast<unsigned char>(Ref.Kind)); |
| 384 | writeLocation(Loc: Ref.Location, Strings, OS); |
| 385 | OS << Ref.Container.raw(); |
| 386 | } |
| 387 | } |
| 388 | |
| 389 | std::pair<SymbolID, std::vector<Ref>> |
| 390 | readRefs(Reader &Data, llvm::ArrayRef<llvm::StringRef> Strings) { |
| 391 | std::pair<SymbolID, std::vector<Ref>> Result; |
| 392 | Result.first = Data.consumeID(); |
| 393 | if (!Data.consumeSize(Container&: Result.second)) |
| 394 | return Result; |
| 395 | for (auto &Ref : Result.second) { |
| 396 | Ref.Kind = static_cast<RefKind>(Data.consume8()); |
| 397 | Ref.Location = readLocation(Data, Strings); |
| 398 | Ref.Container = Data.consumeID(); |
| 399 | } |
| 400 | return Result; |
| 401 | } |
| 402 | |
| 403 | // RELATIONS ENCODING |
| 404 | // A relations section is a flat list of relations. Each relation has: |
| 405 | // - SymbolID (subject): 8 bytes |
| 406 | // - relation kind (predicate): 1 byte |
| 407 | // - SymbolID (object): 8 bytes |
| 408 | // In the future, we might prefer a packed representation if the need arises. |
| 409 | |
| 410 | void writeRelation(const Relation &R, llvm::raw_ostream &OS) { |
| 411 | OS << R.Subject.raw(); |
| 412 | OS.write(C: static_cast<uint8_t>(R.Predicate)); |
| 413 | OS << R.Object.raw(); |
| 414 | } |
| 415 | |
| 416 | Relation readRelation(Reader &Data) { |
| 417 | SymbolID Subject = Data.consumeID(); |
| 418 | RelationKind Predicate = static_cast<RelationKind>(Data.consume8()); |
| 419 | SymbolID Object = Data.consumeID(); |
| 420 | return {.Subject: Subject, .Predicate: Predicate, .Object: Object}; |
| 421 | } |
| 422 | |
| 423 | struct InternedCompileCommand { |
| 424 | llvm::StringRef Directory; |
| 425 | std::vector<llvm::StringRef> CommandLine; |
| 426 | }; |
| 427 | |
| 428 | void writeCompileCommand(const InternedCompileCommand &Cmd, |
| 429 | const StringTableOut &Strings, |
| 430 | llvm::raw_ostream &CmdOS) { |
| 431 | writeVar(I: Strings.index(S: Cmd.Directory), OS&: CmdOS); |
| 432 | writeVar(I: Cmd.CommandLine.size(), OS&: CmdOS); |
| 433 | for (llvm::StringRef C : Cmd.CommandLine) |
| 434 | writeVar(I: Strings.index(S: C), OS&: CmdOS); |
| 435 | } |
| 436 | |
| 437 | InternedCompileCommand |
| 438 | readCompileCommand(Reader CmdReader, llvm::ArrayRef<llvm::StringRef> Strings) { |
| 439 | InternedCompileCommand Cmd; |
| 440 | Cmd.Directory = CmdReader.consumeString(Strings); |
| 441 | if (!CmdReader.consumeSize(Container&: Cmd.CommandLine)) |
| 442 | return Cmd; |
| 443 | for (llvm::StringRef &C : Cmd.CommandLine) |
| 444 | C = CmdReader.consumeString(Strings); |
| 445 | return Cmd; |
| 446 | } |
| 447 | |
| 448 | // FILE ENCODING |
| 449 | // A file is a RIFF chunk with type 'CdIx'. |
| 450 | // It contains the sections: |
| 451 | // - meta: version number |
| 452 | // - srcs: information related to include graph |
| 453 | // - stri: string table |
| 454 | // - symb: symbols |
| 455 | // - refs: references to symbols |
| 456 | |
| 457 | // The current versioning scheme is simple - non-current versions are rejected. |
| 458 | // If you make a breaking change, bump this version number to invalidate stored |
| 459 | // data. Later we may want to support some backward compatibility. |
| 460 | constexpr static uint32_t Version = 20; |
| 461 | |
| 462 | llvm::Expected<IndexFileIn> readRIFF(llvm::StringRef Data, |
| 463 | SymbolOrigin Origin) { |
| 464 | auto RIFF = riff::readFile(Stream: Data); |
| 465 | if (!RIFF) |
| 466 | return RIFF.takeError(); |
| 467 | if (RIFF->Type != riff::fourCC(Literal: "CdIx" )) |
| 468 | return error(Fmt: "wrong RIFF filetype: {0}" , Vals: riff::fourCCStr(Data: RIFF->Type)); |
| 469 | llvm::StringMap<llvm::StringRef> Chunks; |
| 470 | for (const auto &Chunk : RIFF->Chunks) |
| 471 | Chunks.try_emplace(Key: llvm::StringRef(Chunk.ID.data(), Chunk.ID.size()), |
| 472 | Args: Chunk.Data); |
| 473 | |
| 474 | if (!Chunks.count(Key: "meta" )) |
| 475 | return error(Fmt: "missing meta chunk" ); |
| 476 | Reader Meta(Chunks.lookup(Key: "meta" )); |
| 477 | auto SeenVersion = Meta.consume32(); |
| 478 | if (SeenVersion != Version) |
| 479 | return error(Fmt: "wrong version: want {0}, got {1}" , Vals: Version, Vals&: SeenVersion); |
| 480 | |
| 481 | // meta chunk is checked above, as we prefer the "version mismatch" error. |
| 482 | for (llvm::StringRef RequiredChunk : {"stri" }) |
| 483 | if (!Chunks.count(Key: RequiredChunk)) |
| 484 | return error(Fmt: "missing required chunk {0}" , Vals&: RequiredChunk); |
| 485 | |
| 486 | auto Strings = readStringTable(Data: Chunks.lookup(Key: "stri" )); |
| 487 | if (!Strings) |
| 488 | return Strings.takeError(); |
| 489 | |
| 490 | IndexFileIn Result; |
| 491 | if (Chunks.count(Key: "srcs" )) { |
| 492 | Reader SrcsReader(Chunks.lookup(Key: "srcs" )); |
| 493 | Result.Sources.emplace(); |
| 494 | while (!SrcsReader.eof()) { |
| 495 | auto IGN = readIncludeGraphNode(Data&: SrcsReader, Strings: Strings->Strings); |
| 496 | auto Entry = Result.Sources->try_emplace(Key: IGN.URI).first; |
| 497 | Entry->getValue() = std::move(IGN); |
| 498 | // We change all the strings inside the structure to point at the keys in |
| 499 | // the map, since it is the only copy of the string that's going to live. |
| 500 | Entry->getValue().URI = Entry->getKey(); |
| 501 | for (auto &Include : Entry->getValue().DirectIncludes) |
| 502 | Include = Result.Sources->try_emplace(Key: Include).first->getKey(); |
| 503 | } |
| 504 | if (SrcsReader.err()) |
| 505 | return error(Fmt: "malformed or truncated include uri" ); |
| 506 | } |
| 507 | |
| 508 | if (Chunks.count(Key: "symb" )) { |
| 509 | Reader SymbolReader(Chunks.lookup(Key: "symb" )); |
| 510 | SymbolSlab::Builder Symbols; |
| 511 | while (!SymbolReader.eof()) |
| 512 | Symbols.insert(S: readSymbol(Data&: SymbolReader, Strings: Strings->Strings, Origin)); |
| 513 | if (SymbolReader.err()) |
| 514 | return error(Fmt: "malformed or truncated symbol" ); |
| 515 | Result.Symbols = std::move(Symbols).build(); |
| 516 | } |
| 517 | if (Chunks.count(Key: "refs" )) { |
| 518 | Reader RefsReader(Chunks.lookup(Key: "refs" )); |
| 519 | RefSlab::Builder Refs; |
| 520 | while (!RefsReader.eof()) { |
| 521 | auto RefsBundle = readRefs(Data&: RefsReader, Strings: Strings->Strings); |
| 522 | for (const auto &Ref : RefsBundle.second) // FIXME: bulk insert? |
| 523 | Refs.insert(ID: RefsBundle.first, S: Ref); |
| 524 | } |
| 525 | if (RefsReader.err()) |
| 526 | return error(Fmt: "malformed or truncated refs" ); |
| 527 | Result.Refs = std::move(Refs).build(); |
| 528 | } |
| 529 | if (Chunks.count(Key: "rela" )) { |
| 530 | Reader RelationsReader(Chunks.lookup(Key: "rela" )); |
| 531 | RelationSlab::Builder Relations; |
| 532 | while (!RelationsReader.eof()) |
| 533 | Relations.insert(R: readRelation(Data&: RelationsReader)); |
| 534 | if (RelationsReader.err()) |
| 535 | return error(Fmt: "malformed or truncated relations" ); |
| 536 | Result.Relations = std::move(Relations).build(); |
| 537 | } |
| 538 | if (Chunks.count(Key: "cmdl" )) { |
| 539 | Reader CmdReader(Chunks.lookup(Key: "cmdl" )); |
| 540 | InternedCompileCommand Cmd = |
| 541 | readCompileCommand(CmdReader, Strings: Strings->Strings); |
| 542 | if (CmdReader.err()) |
| 543 | return error(Fmt: "malformed or truncated commandline section" ); |
| 544 | Result.Cmd.emplace(); |
| 545 | Result.Cmd->Directory = std::string(Cmd.Directory); |
| 546 | Result.Cmd->CommandLine.reserve(n: Cmd.CommandLine.size()); |
| 547 | for (llvm::StringRef C : Cmd.CommandLine) |
| 548 | Result.Cmd->CommandLine.emplace_back(args&: C); |
| 549 | } |
| 550 | return std::move(Result); |
| 551 | } |
| 552 | |
| 553 | template <class Callback> |
| 554 | void visitStrings(IncludeGraphNode &IGN, const Callback &CB) { |
| 555 | CB(IGN.URI); |
| 556 | for (llvm::StringRef &Include : IGN.DirectIncludes) |
| 557 | CB(Include); |
| 558 | } |
| 559 | |
| 560 | void writeRIFF(const IndexFileOut &Data, llvm::raw_ostream &OS) { |
| 561 | assert(Data.Symbols && "An index file without symbols makes no sense!" ); |
| 562 | riff::File RIFF; |
| 563 | RIFF.Type = riff::fourCC(Literal: "CdIx" ); |
| 564 | |
| 565 | llvm::SmallString<4> Meta; |
| 566 | { |
| 567 | llvm::raw_svector_ostream MetaOS(Meta); |
| 568 | write32(I: Version, OS&: MetaOS); |
| 569 | } |
| 570 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "meta" ), .Data: Meta}); |
| 571 | |
| 572 | StringTableOut Strings; |
| 573 | std::vector<Symbol> Symbols; |
| 574 | for (const auto &Sym : *Data.Symbols) { |
| 575 | Symbols.emplace_back(args: Sym); |
| 576 | visitStrings(S&: Symbols.back(), |
| 577 | CB: [&](llvm::StringRef &S) { Strings.intern(S); }); |
| 578 | } |
| 579 | std::vector<IncludeGraphNode> Sources; |
| 580 | if (Data.Sources) |
| 581 | for (const auto &Source : *Data.Sources) { |
| 582 | Sources.push_back(x: Source.getValue()); |
| 583 | visitStrings(IGN&: Sources.back(), |
| 584 | CB: [&](llvm::StringRef &S) { Strings.intern(S); }); |
| 585 | } |
| 586 | |
| 587 | std::vector<std::pair<SymbolID, std::vector<Ref>>> Refs; |
| 588 | if (Data.Refs) { |
| 589 | for (const auto &Sym : *Data.Refs) { |
| 590 | Refs.emplace_back(args: Sym); |
| 591 | for (auto &Ref : Refs.back().second) { |
| 592 | llvm::StringRef File = Ref.Location.FileURI; |
| 593 | Strings.intern(S&: File); |
| 594 | Ref.Location.FileURI = File.data(); |
| 595 | } |
| 596 | } |
| 597 | } |
| 598 | |
| 599 | std::vector<Relation> Relations; |
| 600 | if (Data.Relations) { |
| 601 | for (const auto &Relation : *Data.Relations) { |
| 602 | Relations.emplace_back(args: Relation); |
| 603 | // No strings to be interned in relations. |
| 604 | } |
| 605 | } |
| 606 | |
| 607 | InternedCompileCommand InternedCmd; |
| 608 | if (Data.Cmd) { |
| 609 | InternedCmd.CommandLine.reserve(n: Data.Cmd->CommandLine.size()); |
| 610 | InternedCmd.Directory = Data.Cmd->Directory; |
| 611 | Strings.intern(S&: InternedCmd.Directory); |
| 612 | for (llvm::StringRef C : Data.Cmd->CommandLine) { |
| 613 | InternedCmd.CommandLine.emplace_back(args&: C); |
| 614 | Strings.intern(S&: InternedCmd.CommandLine.back()); |
| 615 | } |
| 616 | } |
| 617 | |
| 618 | std::string StringSection; |
| 619 | { |
| 620 | llvm::raw_string_ostream StringOS(StringSection); |
| 621 | Strings.finalize(OS&: StringOS); |
| 622 | } |
| 623 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "stri" ), .Data: StringSection}); |
| 624 | |
| 625 | std::string SymbolSection; |
| 626 | { |
| 627 | llvm::raw_string_ostream SymbolOS(SymbolSection); |
| 628 | for (const auto &Sym : Symbols) |
| 629 | writeSymbol(Sym, Strings, OS&: SymbolOS); |
| 630 | } |
| 631 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "symb" ), .Data: SymbolSection}); |
| 632 | |
| 633 | std::string RefsSection; |
| 634 | if (Data.Refs) { |
| 635 | { |
| 636 | llvm::raw_string_ostream RefsOS(RefsSection); |
| 637 | for (const auto &Sym : Refs) |
| 638 | writeRefs(ID: Sym.first, Refs: Sym.second, Strings, OS&: RefsOS); |
| 639 | } |
| 640 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "refs" ), .Data: RefsSection}); |
| 641 | } |
| 642 | |
| 643 | std::string RelationSection; |
| 644 | if (Data.Relations) { |
| 645 | { |
| 646 | llvm::raw_string_ostream RelationOS{RelationSection}; |
| 647 | for (const auto &Relation : Relations) |
| 648 | writeRelation(R: Relation, OS&: RelationOS); |
| 649 | } |
| 650 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "rela" ), .Data: RelationSection}); |
| 651 | } |
| 652 | |
| 653 | std::string SrcsSection; |
| 654 | { |
| 655 | { |
| 656 | llvm::raw_string_ostream SrcsOS(SrcsSection); |
| 657 | for (const auto &SF : Sources) |
| 658 | writeIncludeGraphNode(IGN: SF, Strings, OS&: SrcsOS); |
| 659 | } |
| 660 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "srcs" ), .Data: SrcsSection}); |
| 661 | } |
| 662 | |
| 663 | std::string CmdlSection; |
| 664 | if (Data.Cmd) { |
| 665 | { |
| 666 | llvm::raw_string_ostream CmdOS(CmdlSection); |
| 667 | writeCompileCommand(Cmd: InternedCmd, Strings, CmdOS); |
| 668 | } |
| 669 | RIFF.Chunks.push_back(x: {.ID: riff::fourCC(Literal: "cmdl" ), .Data: CmdlSection}); |
| 670 | } |
| 671 | |
| 672 | OS << RIFF; |
| 673 | } |
| 674 | |
| 675 | } // namespace |
| 676 | |
| 677 | // Defined in YAMLSerialization.cpp. |
| 678 | void writeYAML(const IndexFileOut &, llvm::raw_ostream &); |
| 679 | llvm::Expected<IndexFileIn> readYAML(llvm::StringRef, SymbolOrigin Origin); |
| 680 | |
| 681 | llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const IndexFileOut &O) { |
| 682 | switch (O.Format) { |
| 683 | case IndexFileFormat::RIFF: |
| 684 | writeRIFF(Data: O, OS); |
| 685 | break; |
| 686 | case IndexFileFormat::YAML: |
| 687 | writeYAML(O, OS); |
| 688 | break; |
| 689 | } |
| 690 | return OS; |
| 691 | } |
| 692 | |
| 693 | llvm::Expected<IndexFileIn> readIndexFile(llvm::StringRef Data, |
| 694 | SymbolOrigin Origin) { |
| 695 | if (Data.starts_with(Prefix: "RIFF" )) { |
| 696 | return readRIFF(Data, Origin); |
| 697 | } |
| 698 | if (auto YAMLContents = readYAML(Data, Origin)) { |
| 699 | return std::move(*YAMLContents); |
| 700 | } else { |
| 701 | return error(Fmt: "Not a RIFF file and failed to parse as YAML: {0}" , |
| 702 | Vals: YAMLContents.takeError()); |
| 703 | } |
| 704 | } |
| 705 | |
| 706 | std::unique_ptr<SymbolIndex> loadIndex(llvm::StringRef SymbolFilename, |
| 707 | SymbolOrigin Origin, bool UseDex, |
| 708 | bool SupportContainedRefs) { |
| 709 | trace::Span OverallTracer("LoadIndex" ); |
| 710 | auto Buffer = llvm::MemoryBuffer::getFile(Filename: SymbolFilename); |
| 711 | if (!Buffer) { |
| 712 | elog(Fmt: "Can't open {0}: {1}" , Vals&: SymbolFilename, Vals: Buffer.getError().message()); |
| 713 | return nullptr; |
| 714 | } |
| 715 | |
| 716 | SymbolSlab Symbols; |
| 717 | RefSlab Refs; |
| 718 | RelationSlab Relations; |
| 719 | { |
| 720 | trace::Span Tracer("ParseIndex" ); |
| 721 | if (auto I = readIndexFile(Data: Buffer->get()->getBuffer(), Origin)) { |
| 722 | if (I->Symbols) |
| 723 | Symbols = std::move(*I->Symbols); |
| 724 | if (I->Refs) |
| 725 | Refs = std::move(*I->Refs); |
| 726 | if (I->Relations) |
| 727 | Relations = std::move(*I->Relations); |
| 728 | } else { |
| 729 | elog(Fmt: "Bad index file: {0}" , Vals: I.takeError()); |
| 730 | return nullptr; |
| 731 | } |
| 732 | } |
| 733 | |
| 734 | size_t NumSym = Symbols.size(); |
| 735 | size_t NumRefs = Refs.numRefs(); |
| 736 | size_t NumRelations = Relations.size(); |
| 737 | |
| 738 | trace::Span Tracer("BuildIndex" ); |
| 739 | auto Index = UseDex |
| 740 | ? dex::Dex::build(std::move(Symbols), std::move(Refs), |
| 741 | std::move(Relations), SupportContainedRefs) |
| 742 | : MemIndex::build(Symbols: std::move(Symbols), Refs: std::move(Refs), |
| 743 | Relations: std::move(Relations)); |
| 744 | vlog(Fmt: "Loaded {0} from {1} with estimated memory usage {2} bytes\n" |
| 745 | " - number of symbols: {3}\n" |
| 746 | " - number of refs: {4}\n" |
| 747 | " - number of relations: {5}" , |
| 748 | Vals: UseDex ? "Dex" : "MemIndex" , Vals&: SymbolFilename, |
| 749 | Vals: Index->estimateMemoryUsage(), Vals&: NumSym, Vals&: NumRefs, Vals&: NumRelations); |
| 750 | return Index; |
| 751 | } |
| 752 | |
| 753 | } // namespace clangd |
| 754 | } // namespace clang |
| 755 | |