| 1 | //===--- PostingList.h - Symbol identifiers storage interface --*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This defines posting list interface: a storage for identifiers of symbols |
| 11 | /// which can be characterized by a specific feature (such as fuzzy-find |
| 12 | /// trigram, scope, type or any other Search Token). Posting lists can be |
| 13 | /// traversed in order using an iterator and are values for inverted index, |
| 14 | /// which maps search tokens to corresponding posting lists. |
| 15 | /// |
| 16 | /// In order to decrease size of Index in-memory representation, Variable Byte |
| 17 | /// Encoding (VByte) is used for PostingLists compression. An overview of VByte |
| 18 | /// algorithm can be found in "Introduction to Information Retrieval" book: |
| 19 | /// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html |
| 20 | /// |
| 21 | //===----------------------------------------------------------------------===// |
| 22 | |
| 23 | #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H |
| 24 | #define |
| 25 | |
| 26 | #include "Iterator.h" |
| 27 | #include "llvm/ADT/ArrayRef.h" |
| 28 | #include "llvm/ADT/SmallVector.h" |
| 29 | #include <cstdint> |
| 30 | #include <vector> |
| 31 | |
| 32 | namespace clang { |
| 33 | namespace clangd { |
| 34 | namespace dex { |
| 35 | class Token; |
| 36 | |
| 37 | /// NOTE: This is an implementation detail. |
| 38 | /// |
| 39 | /// Chunk is a fixed-width piece of PostingList which contains the first DocID |
| 40 | /// in uncompressed format (Head) and delta-encoded Payload. It can be |
| 41 | /// decompressed upon request. |
| 42 | struct Chunk { |
| 43 | /// Keep sizeof(Chunk) == 32. |
| 44 | static constexpr size_t PayloadSize = 32 - sizeof(DocID); |
| 45 | |
| 46 | llvm::SmallVector<DocID, PayloadSize + 1> decompress() const; |
| 47 | |
| 48 | /// The first element of decompressed Chunk. |
| 49 | DocID Head; |
| 50 | /// VByte-encoded deltas. |
| 51 | std::array<uint8_t, PayloadSize> Payload; |
| 52 | }; |
| 53 | static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory." ); |
| 54 | |
| 55 | /// PostingList is the storage of DocIDs which can be inserted to the Query |
| 56 | /// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs |
| 57 | /// are stored in underlying chunks. Compression saves memory at a small cost |
| 58 | /// in access time, which is still fast enough in practice. |
| 59 | class PostingList { |
| 60 | public: |
| 61 | explicit PostingList(llvm::ArrayRef<DocID> Documents); |
| 62 | |
| 63 | /// Constructs DocumentIterator over given posting list. DocumentIterator will |
| 64 | /// go through the chunks and decompress them on-the-fly when necessary. |
| 65 | /// If given, Tok is only used for the string representation. |
| 66 | std::unique_ptr<Iterator> iterator(const Token *Tok = nullptr) const; |
| 67 | |
| 68 | /// Returns in-memory size of external storage. |
| 69 | size_t bytes() const { return Chunks.capacity() * sizeof(Chunk); } |
| 70 | |
| 71 | private: |
| 72 | const std::vector<Chunk> Chunks; |
| 73 | }; |
| 74 | |
| 75 | } // namespace dex |
| 76 | } // namespace clangd |
| 77 | } // namespace clang |
| 78 | |
| 79 | #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H |
| 80 | |