| 1 | //===--- Iterator.h - Query Symbol Retrieval --------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// Symbol index queries consist of specific requirements for the requested |
| 11 | /// symbol, such as high fuzzy matching score, scope, type etc. The lists of all |
| 12 | /// symbols matching some criteria (e.g. belonging to "clang::clangd::" scope) |
| 13 | /// are expressed in a form of Search Tokens which are stored in the inverted |
| 14 | /// index. Inverted index maps these tokens to the posting lists - sorted (by |
| 15 | /// symbol quality) sequences of symbol IDs matching the token, e.g. scope token |
| 16 | /// "clangd::clangd::" is mapped to the list of IDs of all symbols which are |
| 17 | /// declared in this namespace. Search queries are build from a set of |
| 18 | /// requirements which can be combined with each other forming the query trees. |
| 19 | /// The leafs of such trees are posting lists, and the nodes are operations on |
| 20 | /// these posting lists, e.g. intersection or union. Efficient processing of |
| 21 | /// these multi-level queries is handled by Iterators. Iterators advance through |
| 22 | /// all leaf posting lists producing the result of search query, which preserves |
| 23 | /// the sorted order of IDs. Having the resulting IDs sorted is important, |
| 24 | /// because it allows receiving a certain number of the most valuable items |
| 25 | /// (e.g. symbols with highest quality which was the sorting key in the first |
| 26 | /// place) without processing all items with requested properties (this might |
| 27 | /// not be computationally effective if search request is not very restrictive). |
| 28 | // |
| 29 | //===----------------------------------------------------------------------===// |
| 30 | |
| 31 | #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H |
| 32 | #define |
| 33 | |
| 34 | #include "llvm/Support/raw_ostream.h" |
| 35 | #include <algorithm> |
| 36 | #include <memory> |
| 37 | #include <vector> |
| 38 | |
| 39 | namespace clang { |
| 40 | namespace clangd { |
| 41 | namespace dex { |
| 42 | |
| 43 | /// Symbol position in the list of all index symbols sorted by a pre-computed |
| 44 | /// symbol quality. |
| 45 | using DocID = uint32_t; |
| 46 | |
| 47 | /// Iterator is the interface for Query Tree node. The simplest type of Iterator |
| 48 | /// is DocumentIterator which is simply a wrapper around PostingList iterator |
| 49 | /// and serves as the Query Tree leaf. More sophisticated examples of iterators |
| 50 | /// can manage intersection, union of the elements produced by other iterators |
| 51 | /// (their children) to form a multi-level Query Tree. The interface is designed |
| 52 | /// to be extensible in order to support multiple types of iterators. |
| 53 | class Iterator { |
| 54 | public: |
| 55 | /// Returns true if all valid DocIDs were processed and hence the iterator is |
| 56 | /// exhausted. |
| 57 | virtual bool reachedEnd() const = 0; |
| 58 | /// Moves to next valid DocID. If it doesn't exist, the iterator is exhausted |
| 59 | /// and proceeds to the END. |
| 60 | /// |
| 61 | /// Note: reachedEnd() must be false. |
| 62 | virtual void advance() = 0; |
| 63 | /// Moves to the first valid DocID which is equal or higher than given ID. If |
| 64 | /// it doesn't exist, the iterator is exhausted and proceeds to the END. |
| 65 | /// |
| 66 | /// Note: reachedEnd() must be false. |
| 67 | virtual void advanceTo(DocID ID) = 0; |
| 68 | /// Returns the current element this iterator points to. |
| 69 | /// |
| 70 | /// Note: reachedEnd() must be false. |
| 71 | virtual DocID peek() const = 0; |
| 72 | /// Informs the iterator that the current document was consumed, and returns |
| 73 | /// its boost. |
| 74 | /// |
| 75 | /// Note: If this iterator has any child iterators that contain the document, |
| 76 | /// consume() should be called on those and their boosts incorporated. |
| 77 | /// consume() must *not* be called on children that don't contain the current |
| 78 | /// doc. |
| 79 | virtual float consume() = 0; |
| 80 | /// Returns an estimate of advance() calls before the iterator is exhausted. |
| 81 | virtual size_t estimateSize() const = 0; |
| 82 | |
| 83 | virtual ~Iterator() {} |
| 84 | |
| 85 | /// Prints a convenient human-readable iterator representation by recursively |
| 86 | /// dumping iterators in the following format: |
| 87 | /// |
| 88 | /// (Type Child1 Child2 ...) |
| 89 | /// |
| 90 | /// Where Type is the iterator type representation: "&" for And, "|" for Or, |
| 91 | /// ChildN is N-th iterator child. Raw iterators over PostingList are |
| 92 | /// represented as "[... CurID ...]" where CurID is the current PostingList |
| 93 | /// entry being inspected. |
| 94 | friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, |
| 95 | const Iterator &Iterator) { |
| 96 | return Iterator.dump(OS); |
| 97 | } |
| 98 | |
| 99 | /// Inspect iterator type, used internally for optimizing query trees. |
| 100 | enum class Kind { And, Or, True, False, Other }; |
| 101 | Kind kind() const { return MyKind; } |
| 102 | |
| 103 | protected: |
| 104 | Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {} |
| 105 | |
| 106 | private: |
| 107 | virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0; |
| 108 | Kind MyKind; |
| 109 | }; |
| 110 | |
| 111 | /// Advances the iterator until it is exhausted. Returns pairs of document IDs |
| 112 | /// with the corresponding boosting score. |
| 113 | /// |
| 114 | /// Boosting can be seen as a compromise between retrieving too many items and |
| 115 | /// calculating finals score for each of them (which might be very expensive) |
| 116 | /// and not retrieving enough items so that items with very high final score |
| 117 | /// would not be processed. Boosting score is a computationally efficient way |
| 118 | /// to acquire preliminary scores of requested items. |
| 119 | std::vector<std::pair<DocID, float>> consume(Iterator &It); |
| 120 | |
| 121 | namespace detail { |
| 122 | // Variadic template machinery. |
| 123 | inline void populateChildren(std::vector<std::unique_ptr<Iterator>> &) {} |
| 124 | template <typename... TailT> |
| 125 | void populateChildren(std::vector<std::unique_ptr<Iterator>> &Children, |
| 126 | std::unique_ptr<Iterator> Head, TailT... Tail) { |
| 127 | Children.push_back(x: std::move(Head)); |
| 128 | populateChildren(Children, std::move(Tail)...); |
| 129 | } |
| 130 | } // namespace detail |
| 131 | |
| 132 | // A corpus is a set of documents, and a factory for iterators over them. |
| 133 | class Corpus { |
| 134 | DocID Size; |
| 135 | |
| 136 | public: |
| 137 | explicit Corpus(DocID Size) : Size(Size) {} |
| 138 | |
| 139 | /// Returns AND Iterator which performs the intersection of the PostingLists |
| 140 | /// of its children. |
| 141 | /// |
| 142 | /// consume(): AND Iterator returns the product of Childrens' boosting |
| 143 | /// scores. |
| 144 | std::unique_ptr<Iterator> |
| 145 | intersect(std::vector<std::unique_ptr<Iterator>> Children) const; |
| 146 | |
| 147 | /// Returns OR Iterator which performs the union of the PostingLists of its |
| 148 | /// children. |
| 149 | /// |
| 150 | /// consume(): OR Iterator returns the highest boost value among children |
| 151 | /// containing the requested item. |
| 152 | std::unique_ptr<Iterator> |
| 153 | unionOf(std::vector<std::unique_ptr<Iterator>> Children) const; |
| 154 | |
| 155 | /// Returns TRUE Iterator which iterates over "virtual" PostingList |
| 156 | /// containing all items in range [0, Size) in an efficient manner. |
| 157 | std::unique_ptr<Iterator> all() const; |
| 158 | |
| 159 | /// Returns FALSE Iterator which iterates over no documents. |
| 160 | std::unique_ptr<Iterator> none() const; |
| 161 | |
| 162 | /// Returns BOOST iterator which multiplies the score of each item by given |
| 163 | /// factor. Boosting can be used as a computationally inexpensive filtering. |
| 164 | /// Users can return significantly more items using consumeAndBoost() and |
| 165 | /// then trim Top K using retrieval score. |
| 166 | std::unique_ptr<Iterator> boost(std::unique_ptr<Iterator> Child, |
| 167 | float Factor) const; |
| 168 | |
| 169 | /// Returns LIMIT iterator, which yields up to N elements of its child |
| 170 | /// iterator. Elements only count towards the limit if they are part of the |
| 171 | /// final result set. Therefore the following iterator (AND (2) (LIMIT (1 2) |
| 172 | /// 1)) yields (2), not (). |
| 173 | std::unique_ptr<Iterator> limit(std::unique_ptr<Iterator> Child, |
| 174 | size_t Limit) const; |
| 175 | |
| 176 | /// This allows intersect(create(...), create(...)) syntax. |
| 177 | template <typename... Args> |
| 178 | std::unique_ptr<Iterator> intersect(Args... args) const { |
| 179 | std::vector<std::unique_ptr<Iterator>> Children; |
| 180 | detail::populateChildren(Children, std::forward<Args>(args)...); |
| 181 | return intersect(Children: std::move(Children)); |
| 182 | } |
| 183 | |
| 184 | /// This allows unionOf(create(...), create(...)) syntax. |
| 185 | template <typename... Args> |
| 186 | std::unique_ptr<Iterator> unionOf(Args... args) const { |
| 187 | std::vector<std::unique_ptr<Iterator>> Children; |
| 188 | detail::populateChildren(Children, std::forward<Args>(args)...); |
| 189 | return unionOf(Children: std::move(Children)); |
| 190 | } |
| 191 | }; |
| 192 | |
| 193 | } // namespace dex |
| 194 | } // namespace clangd |
| 195 | } // namespace clang |
| 196 | |
| 197 | #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H |
| 198 | |