1 | //===--- Iterator.h - Query Symbol Retrieval --------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// Symbol index queries consist of specific requirements for the requested |
11 | /// symbol, such as high fuzzy matching score, scope, type etc. The lists of all |
12 | /// symbols matching some criteria (e.g. belonging to "clang::clangd::" scope) |
13 | /// are expressed in a form of Search Tokens which are stored in the inverted |
14 | /// index. Inverted index maps these tokens to the posting lists - sorted (by |
15 | /// symbol quality) sequences of symbol IDs matching the token, e.g. scope token |
16 | /// "clangd::clangd::" is mapped to the list of IDs of all symbols which are |
17 | /// declared in this namespace. Search queries are build from a set of |
18 | /// requirements which can be combined with each other forming the query trees. |
19 | /// The leafs of such trees are posting lists, and the nodes are operations on |
20 | /// these posting lists, e.g. intersection or union. Efficient processing of |
21 | /// these multi-level queries is handled by Iterators. Iterators advance through |
22 | /// all leaf posting lists producing the result of search query, which preserves |
23 | /// the sorted order of IDs. Having the resulting IDs sorted is important, |
24 | /// because it allows receiving a certain number of the most valuable items |
25 | /// (e.g. symbols with highest quality which was the sorting key in the first |
26 | /// place) without processing all items with requested properties (this might |
27 | /// not be computationally effective if search request is not very restrictive). |
28 | // |
29 | //===----------------------------------------------------------------------===// |
30 | |
31 | #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H |
32 | #define |
33 | |
34 | #include "llvm/Support/raw_ostream.h" |
35 | #include <algorithm> |
36 | #include <memory> |
37 | #include <vector> |
38 | |
39 | namespace clang { |
40 | namespace clangd { |
41 | namespace dex { |
42 | |
43 | /// Symbol position in the list of all index symbols sorted by a pre-computed |
44 | /// symbol quality. |
45 | using DocID = uint32_t; |
46 | |
47 | /// Iterator is the interface for Query Tree node. The simplest type of Iterator |
48 | /// is DocumentIterator which is simply a wrapper around PostingList iterator |
49 | /// and serves as the Query Tree leaf. More sophisticated examples of iterators |
50 | /// can manage intersection, union of the elements produced by other iterators |
51 | /// (their children) to form a multi-level Query Tree. The interface is designed |
52 | /// to be extensible in order to support multiple types of iterators. |
53 | class Iterator { |
54 | public: |
55 | /// Returns true if all valid DocIDs were processed and hence the iterator is |
56 | /// exhausted. |
57 | virtual bool reachedEnd() const = 0; |
58 | /// Moves to next valid DocID. If it doesn't exist, the iterator is exhausted |
59 | /// and proceeds to the END. |
60 | /// |
61 | /// Note: reachedEnd() must be false. |
62 | virtual void advance() = 0; |
63 | /// Moves to the first valid DocID which is equal or higher than given ID. If |
64 | /// it doesn't exist, the iterator is exhausted and proceeds to the END. |
65 | /// |
66 | /// Note: reachedEnd() must be false. |
67 | virtual void advanceTo(DocID ID) = 0; |
68 | /// Returns the current element this iterator points to. |
69 | /// |
70 | /// Note: reachedEnd() must be false. |
71 | virtual DocID peek() const = 0; |
72 | /// Informs the iterator that the current document was consumed, and returns |
73 | /// its boost. |
74 | /// |
75 | /// Note: If this iterator has any child iterators that contain the document, |
76 | /// consume() should be called on those and their boosts incorporated. |
77 | /// consume() must *not* be called on children that don't contain the current |
78 | /// doc. |
79 | virtual float consume() = 0; |
80 | /// Returns an estimate of advance() calls before the iterator is exhausted. |
81 | virtual size_t estimateSize() const = 0; |
82 | |
83 | virtual ~Iterator() {} |
84 | |
85 | /// Prints a convenient human-readable iterator representation by recursively |
86 | /// dumping iterators in the following format: |
87 | /// |
88 | /// (Type Child1 Child2 ...) |
89 | /// |
90 | /// Where Type is the iterator type representation: "&" for And, "|" for Or, |
91 | /// ChildN is N-th iterator child. Raw iterators over PostingList are |
92 | /// represented as "[... CurID ...]" where CurID is the current PostingList |
93 | /// entry being inspected. |
94 | friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, |
95 | const Iterator &Iterator) { |
96 | return Iterator.dump(OS); |
97 | } |
98 | |
99 | /// Inspect iterator type, used internally for optimizing query trees. |
100 | enum class Kind { And, Or, True, False, Other }; |
101 | Kind kind() const { return MyKind; } |
102 | |
103 | protected: |
104 | Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {} |
105 | |
106 | private: |
107 | virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0; |
108 | Kind MyKind; |
109 | }; |
110 | |
111 | /// Advances the iterator until it is exhausted. Returns pairs of document IDs |
112 | /// with the corresponding boosting score. |
113 | /// |
114 | /// Boosting can be seen as a compromise between retrieving too many items and |
115 | /// calculating finals score for each of them (which might be very expensive) |
116 | /// and not retrieving enough items so that items with very high final score |
117 | /// would not be processed. Boosting score is a computationally efficient way |
118 | /// to acquire preliminary scores of requested items. |
119 | std::vector<std::pair<DocID, float>> consume(Iterator &It); |
120 | |
121 | namespace detail { |
122 | // Variadic template machinery. |
123 | inline void populateChildren(std::vector<std::unique_ptr<Iterator>> &) {} |
124 | template <typename... TailT> |
125 | void populateChildren(std::vector<std::unique_ptr<Iterator>> &Children, |
126 | std::unique_ptr<Iterator> Head, TailT... Tail) { |
127 | Children.push_back(x: std::move(Head)); |
128 | populateChildren(Children, std::move(Tail)...); |
129 | } |
130 | } // namespace detail |
131 | |
132 | // A corpus is a set of documents, and a factory for iterators over them. |
133 | class Corpus { |
134 | DocID Size; |
135 | |
136 | public: |
137 | explicit Corpus(DocID Size) : Size(Size) {} |
138 | |
139 | /// Returns AND Iterator which performs the intersection of the PostingLists |
140 | /// of its children. |
141 | /// |
142 | /// consume(): AND Iterator returns the product of Childrens' boosting |
143 | /// scores. |
144 | std::unique_ptr<Iterator> |
145 | intersect(std::vector<std::unique_ptr<Iterator>> Children) const; |
146 | |
147 | /// Returns OR Iterator which performs the union of the PostingLists of its |
148 | /// children. |
149 | /// |
150 | /// consume(): OR Iterator returns the highest boost value among children |
151 | /// containing the requested item. |
152 | std::unique_ptr<Iterator> |
153 | unionOf(std::vector<std::unique_ptr<Iterator>> Children) const; |
154 | |
155 | /// Returns TRUE Iterator which iterates over "virtual" PostingList |
156 | /// containing all items in range [0, Size) in an efficient manner. |
157 | std::unique_ptr<Iterator> all() const; |
158 | |
159 | /// Returns FALSE Iterator which iterates over no documents. |
160 | std::unique_ptr<Iterator> none() const; |
161 | |
162 | /// Returns BOOST iterator which multiplies the score of each item by given |
163 | /// factor. Boosting can be used as a computationally inexpensive filtering. |
164 | /// Users can return significantly more items using consumeAndBoost() and |
165 | /// then trim Top K using retrieval score. |
166 | std::unique_ptr<Iterator> boost(std::unique_ptr<Iterator> Child, |
167 | float Factor) const; |
168 | |
169 | /// Returns LIMIT iterator, which yields up to N elements of its child |
170 | /// iterator. Elements only count towards the limit if they are part of the |
171 | /// final result set. Therefore the following iterator (AND (2) (LIMIT (1 2) |
172 | /// 1)) yields (2), not (). |
173 | std::unique_ptr<Iterator> limit(std::unique_ptr<Iterator> Child, |
174 | size_t Limit) const; |
175 | |
176 | /// This allows intersect(create(...), create(...)) syntax. |
177 | template <typename... Args> |
178 | std::unique_ptr<Iterator> intersect(Args... args) const { |
179 | std::vector<std::unique_ptr<Iterator>> Children; |
180 | detail::populateChildren(Children, std::forward<Args>(args)...); |
181 | return intersect(Children: std::move(Children)); |
182 | } |
183 | |
184 | /// This allows unionOf(create(...), create(...)) syntax. |
185 | template <typename... Args> |
186 | std::unique_ptr<Iterator> unionOf(Args... args) const { |
187 | std::vector<std::unique_ptr<Iterator>> Children; |
188 | detail::populateChildren(Children, std::forward<Args>(args)...); |
189 | return unionOf(Children: std::move(Children)); |
190 | } |
191 | }; |
192 | |
193 | } // namespace dex |
194 | } // namespace clangd |
195 | } // namespace clang |
196 | |
197 | #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H |
198 | |