1 | //===--- PostingList.h - Symbol identifiers storage interface --*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This defines posting list interface: a storage for identifiers of symbols |
11 | /// which can be characterized by a specific feature (such as fuzzy-find |
12 | /// trigram, scope, type or any other Search Token). Posting lists can be |
13 | /// traversed in order using an iterator and are values for inverted index, |
14 | /// which maps search tokens to corresponding posting lists. |
15 | /// |
16 | /// In order to decrease size of Index in-memory representation, Variable Byte |
17 | /// Encoding (VByte) is used for PostingLists compression. An overview of VByte |
18 | /// algorithm can be found in "Introduction to Information Retrieval" book: |
19 | /// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html |
20 | /// |
21 | //===----------------------------------------------------------------------===// |
22 | |
23 | #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H |
24 | #define |
25 | |
26 | #include "Iterator.h" |
27 | #include "llvm/ADT/ArrayRef.h" |
28 | #include "llvm/ADT/SmallVector.h" |
29 | #include <cstdint> |
30 | #include <vector> |
31 | |
32 | namespace clang { |
33 | namespace clangd { |
34 | namespace dex { |
35 | class Token; |
36 | |
37 | /// NOTE: This is an implementation detail. |
38 | /// |
39 | /// Chunk is a fixed-width piece of PostingList which contains the first DocID |
40 | /// in uncompressed format (Head) and delta-encoded Payload. It can be |
41 | /// decompressed upon request. |
42 | struct Chunk { |
43 | /// Keep sizeof(Chunk) == 32. |
44 | static constexpr size_t PayloadSize = 32 - sizeof(DocID); |
45 | |
46 | llvm::SmallVector<DocID, PayloadSize + 1> decompress() const; |
47 | |
48 | /// The first element of decompressed Chunk. |
49 | DocID Head; |
50 | /// VByte-encoded deltas. |
51 | std::array<uint8_t, PayloadSize> Payload; |
52 | }; |
53 | static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory." ); |
54 | |
55 | /// PostingList is the storage of DocIDs which can be inserted to the Query |
56 | /// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs |
57 | /// are stored in underlying chunks. Compression saves memory at a small cost |
58 | /// in access time, which is still fast enough in practice. |
59 | class PostingList { |
60 | public: |
61 | explicit PostingList(llvm::ArrayRef<DocID> Documents); |
62 | |
63 | /// Constructs DocumentIterator over given posting list. DocumentIterator will |
64 | /// go through the chunks and decompress them on-the-fly when necessary. |
65 | /// If given, Tok is only used for the string representation. |
66 | std::unique_ptr<Iterator> iterator(const Token *Tok = nullptr) const; |
67 | |
68 | /// Returns in-memory size of external storage. |
69 | size_t bytes() const { return Chunks.capacity() * sizeof(Chunk); } |
70 | |
71 | private: |
72 | const std::vector<Chunk> Chunks; |
73 | }; |
74 | |
75 | } // namespace dex |
76 | } // namespace clangd |
77 | } // namespace clang |
78 | |
79 | #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H |
80 | |