1 | //===--- Iterator.cpp - Query Symbol Retrieval ------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "Iterator.h" |
10 | #include "llvm/ADT/STLExtras.h" |
11 | #include <algorithm> |
12 | #include <cassert> |
13 | #include <numeric> |
14 | |
15 | namespace clang { |
16 | namespace clangd { |
17 | namespace dex { |
18 | namespace { |
19 | |
20 | /// Implements Iterator over the intersection of other iterators. |
21 | /// |
22 | /// AndIterator iterates through common items among all children. It becomes |
23 | /// exhausted as soon as any child becomes exhausted. After each mutation, the |
24 | /// iterator restores the invariant: all children must point to the same item. |
25 | class AndIterator : public Iterator { |
26 | public: |
27 | explicit AndIterator(std::vector<std::unique_ptr<Iterator>> AllChildren) |
28 | : Iterator(Kind::And), Children(std::move(AllChildren)) { |
29 | assert(!Children.empty() && "AND iterator should have at least one child." ); |
30 | // Establish invariants. |
31 | for (const auto &Child : Children) |
32 | ReachedEnd |= Child->reachedEnd(); |
33 | sync(); |
34 | // When children are sorted by the estimateSize(), sync() calls are more |
35 | // effective. Each sync() starts with the first child and makes sure all |
36 | // children point to the same element. If any child is "above" the previous |
37 | // ones, the algorithm resets and advances the children to the next |
38 | // highest element starting from the front. When child iterators in the |
39 | // beginning have smaller estimated size, the sync() will have less restarts |
40 | // and become more effective. |
41 | llvm::sort(C&: Children, Comp: [](const std::unique_ptr<Iterator> &LHS, |
42 | const std::unique_ptr<Iterator> &RHS) { |
43 | return LHS->estimateSize() < RHS->estimateSize(); |
44 | }); |
45 | } |
46 | |
47 | bool reachedEnd() const override { return ReachedEnd; } |
48 | |
49 | /// Advances all children to the next common item. |
50 | void advance() override { |
51 | assert(!reachedEnd() && "AND iterator can't advance() at the end." ); |
52 | Children.front()->advance(); |
53 | sync(); |
54 | } |
55 | |
56 | /// Advances all children to the next common item with DocumentID >= ID. |
57 | void advanceTo(DocID ID) override { |
58 | assert(!reachedEnd() && "AND iterator can't advanceTo() at the end." ); |
59 | Children.front()->advanceTo(ID); |
60 | sync(); |
61 | } |
62 | |
63 | DocID peek() const override { return Children.front()->peek(); } |
64 | |
65 | float consume() override { |
66 | assert(!reachedEnd() && "AND iterator can't consume() at the end." ); |
67 | float Boost = 1; |
68 | for (const auto &Child : Children) |
69 | Boost *= Child->consume(); |
70 | return Boost; |
71 | } |
72 | |
73 | size_t estimateSize() const override { |
74 | return Children.front()->estimateSize(); |
75 | } |
76 | |
77 | private: |
78 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
79 | OS << "(& " ; |
80 | auto *Separator = "" ; |
81 | for (const auto &Child : Children) { |
82 | OS << Separator << *Child; |
83 | Separator = " " ; |
84 | } |
85 | OS << ')'; |
86 | return OS; |
87 | } |
88 | |
89 | /// Restores class invariants: each child will point to the same element after |
90 | /// sync. |
91 | void sync() { |
92 | ReachedEnd |= Children.front()->reachedEnd(); |
93 | if (ReachedEnd) |
94 | return; |
95 | auto SyncID = Children.front()->peek(); |
96 | // Indicates whether any child needs to be advanced to new SyncID. |
97 | bool NeedsAdvance = false; |
98 | do { |
99 | NeedsAdvance = false; |
100 | for (auto &Child : Children) { |
101 | Child->advanceTo(ID: SyncID); |
102 | ReachedEnd |= Child->reachedEnd(); |
103 | // If any child reaches end And iterator can not match any other items. |
104 | // In this case, just terminate the process. |
105 | if (ReachedEnd) |
106 | return; |
107 | // Cache the result so that peek() is not called again as it may be |
108 | // quite expensive in AND with large subtrees. |
109 | auto Candidate = Child->peek(); |
110 | // If any child goes beyond given ID (i.e. ID is not the common item), |
111 | // all children should be advanced to the next common item. |
112 | if (Candidate > SyncID) { |
113 | SyncID = Candidate; |
114 | NeedsAdvance = true; |
115 | // Reset and try to sync again. Sync starts with the first child as |
116 | // this is the cheapest (smallest size estimate). This way advanceTo |
117 | // is called on the large posting lists once the sync point is very |
118 | // likely. |
119 | break; |
120 | } |
121 | } |
122 | } while (NeedsAdvance); |
123 | } |
124 | |
125 | /// AndIterator owns its children and ensures that all of them point to the |
126 | /// same element. As soon as one child gets exhausted, AndIterator can no |
127 | /// longer advance and has reached its end. |
128 | std::vector<std::unique_ptr<Iterator>> Children; |
129 | /// Indicates whether any child is exhausted. It is cheaper to maintain and |
130 | /// update the field, rather than traversing the whole subtree in each |
131 | /// reachedEnd() call. |
132 | bool ReachedEnd = false; |
133 | friend Corpus; // For optimizations. |
134 | }; |
135 | |
136 | /// Implements Iterator over the union of other iterators. |
137 | /// |
138 | /// OrIterator iterates through all items which can be pointed to by at least |
139 | /// one child. To preserve the sorted order, this iterator always advances the |
140 | /// child with smallest Child->peek() value. OrIterator becomes exhausted as |
141 | /// soon as all of its children are exhausted. |
142 | class OrIterator : public Iterator { |
143 | public: |
144 | explicit OrIterator(std::vector<std::unique_ptr<Iterator>> AllChildren) |
145 | : Iterator(Kind::Or), Children(std::move(AllChildren)) { |
146 | assert(!Children.empty() && "OR iterator should have at least one child." ); |
147 | } |
148 | |
149 | /// Returns true if all children are exhausted. |
150 | bool reachedEnd() const override { |
151 | for (const auto &Child : Children) |
152 | if (!Child->reachedEnd()) |
153 | return false; |
154 | return true; |
155 | } |
156 | |
157 | /// Moves each child pointing to the smallest DocID to the next item. |
158 | void advance() override { |
159 | assert(!reachedEnd() && "OR iterator can't advance() at the end." ); |
160 | const auto SmallestID = peek(); |
161 | for (const auto &Child : Children) |
162 | if (!Child->reachedEnd() && Child->peek() == SmallestID) |
163 | Child->advance(); |
164 | } |
165 | |
166 | /// Advances each child to the next existing element with DocumentID >= ID. |
167 | void advanceTo(DocID ID) override { |
168 | assert(!reachedEnd() && "OR iterator can't advanceTo() at the end." ); |
169 | for (const auto &Child : Children) |
170 | if (!Child->reachedEnd()) |
171 | Child->advanceTo(ID); |
172 | } |
173 | |
174 | /// Returns the element under cursor of the child with smallest Child->peek() |
175 | /// value. |
176 | DocID peek() const override { |
177 | assert(!reachedEnd() && "OR iterator can't peek() at the end." ); |
178 | DocID Result = std::numeric_limits<DocID>::max(); |
179 | |
180 | for (const auto &Child : Children) |
181 | if (!Child->reachedEnd()) |
182 | Result = std::min(a: Result, b: Child->peek()); |
183 | |
184 | return Result; |
185 | } |
186 | |
187 | // Returns the maximum boosting score among all Children when iterator |
188 | // points to the current ID. |
189 | float consume() override { |
190 | assert(!reachedEnd() && "OR iterator can't consume() at the end." ); |
191 | const DocID ID = peek(); |
192 | float Boost = 1; |
193 | for (const auto &Child : Children) |
194 | if (!Child->reachedEnd() && Child->peek() == ID) |
195 | Boost = std::max(a: Boost, b: Child->consume()); |
196 | return Boost; |
197 | } |
198 | |
199 | size_t estimateSize() const override { |
200 | size_t Size = 0; |
201 | for (const auto &Child : Children) |
202 | Size = std::max(a: Size, b: Child->estimateSize()); |
203 | return Size; |
204 | } |
205 | |
206 | private: |
207 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
208 | OS << "(| " ; |
209 | auto *Separator = "" ; |
210 | for (const auto &Child : Children) { |
211 | OS << Separator << *Child; |
212 | Separator = " " ; |
213 | } |
214 | OS << ')'; |
215 | return OS; |
216 | } |
217 | |
218 | std::vector<std::unique_ptr<Iterator>> Children; |
219 | friend Corpus; // For optimizations. |
220 | }; |
221 | |
222 | /// TrueIterator handles PostingLists which contain all items of the index. It |
223 | /// stores size of the virtual posting list, and all operations are performed |
224 | /// in O(1). |
225 | class TrueIterator : public Iterator { |
226 | public: |
227 | explicit TrueIterator(DocID Size) : Iterator(Kind::True), Size(Size) {} |
228 | |
229 | bool reachedEnd() const override { return Index >= Size; } |
230 | |
231 | void advance() override { |
232 | assert(!reachedEnd() && "TRUE iterator can't advance() at the end." ); |
233 | ++Index; |
234 | } |
235 | |
236 | void advanceTo(DocID ID) override { |
237 | assert(!reachedEnd() && "TRUE iterator can't advanceTo() at the end." ); |
238 | Index = std::min(a: ID, b: Size); |
239 | } |
240 | |
241 | DocID peek() const override { |
242 | assert(!reachedEnd() && "TRUE iterator can't peek() at the end." ); |
243 | return Index; |
244 | } |
245 | |
246 | float consume() override { |
247 | assert(!reachedEnd() && "TRUE iterator can't consume() at the end." ); |
248 | return 1; |
249 | } |
250 | |
251 | size_t estimateSize() const override { return Size; } |
252 | |
253 | private: |
254 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
255 | return OS << "true" ; |
256 | } |
257 | |
258 | DocID Index = 0; |
259 | /// Size of the underlying virtual PostingList. |
260 | DocID Size; |
261 | }; |
262 | |
263 | /// FalseIterator yields no results. |
264 | class FalseIterator : public Iterator { |
265 | public: |
266 | FalseIterator() : Iterator(Kind::False) {} |
267 | bool reachedEnd() const override { return true; } |
268 | void advance() override { assert(false); } |
269 | void advanceTo(DocID ID) override { assert(false); } |
270 | DocID peek() const override { |
271 | assert(false); |
272 | return 0; |
273 | } |
274 | float consume() override { |
275 | assert(false); |
276 | return 1; |
277 | } |
278 | size_t estimateSize() const override { return 0; } |
279 | |
280 | private: |
281 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
282 | return OS << "false" ; |
283 | } |
284 | }; |
285 | |
286 | /// Boost iterator is a wrapper around its child which multiplies scores of |
287 | /// each retrieved item by a given factor. |
288 | class BoostIterator : public Iterator { |
289 | public: |
290 | BoostIterator(std::unique_ptr<Iterator> Child, float Factor) |
291 | : Child(std::move(Child)), Factor(Factor) {} |
292 | |
293 | bool reachedEnd() const override { return Child->reachedEnd(); } |
294 | |
295 | void advance() override { Child->advance(); } |
296 | |
297 | void advanceTo(DocID ID) override { Child->advanceTo(ID); } |
298 | |
299 | DocID peek() const override { return Child->peek(); } |
300 | |
301 | float consume() override { return Child->consume() * Factor; } |
302 | |
303 | size_t estimateSize() const override { return Child->estimateSize(); } |
304 | |
305 | private: |
306 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
307 | return OS << "(* " << Factor << ' ' << *Child << ')'; |
308 | } |
309 | |
310 | std::unique_ptr<Iterator> Child; |
311 | float Factor; |
312 | }; |
313 | |
314 | /// This iterator limits the number of items retrieved from the child iterator |
315 | /// on top of the query tree. To ensure that query tree with LIMIT iterators |
316 | /// inside works correctly, users have to call Root->consume(Root->peek()) each |
317 | /// time item is retrieved at the root of query tree. |
318 | class LimitIterator : public Iterator { |
319 | public: |
320 | LimitIterator(std::unique_ptr<Iterator> Child, size_t Limit) |
321 | : Child(std::move(Child)), Limit(Limit), ItemsLeft(Limit) {} |
322 | |
323 | bool reachedEnd() const override { |
324 | return ItemsLeft == 0 || Child->reachedEnd(); |
325 | } |
326 | |
327 | void advance() override { Child->advance(); } |
328 | |
329 | void advanceTo(DocID ID) override { Child->advanceTo(ID); } |
330 | |
331 | DocID peek() const override { return Child->peek(); } |
332 | |
333 | /// Decreases the limit in case the element consumed at top of the query tree |
334 | /// comes from the underlying iterator. |
335 | float consume() override { |
336 | assert(!reachedEnd() && "LimitIterator can't consume() at the end." ); |
337 | --ItemsLeft; |
338 | return Child->consume(); |
339 | } |
340 | |
341 | size_t estimateSize() const override { |
342 | return std::min(a: Child->estimateSize(), b: Limit); |
343 | } |
344 | |
345 | private: |
346 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
347 | return OS << "(LIMIT " << Limit << " " << *Child << ')'; |
348 | } |
349 | |
350 | std::unique_ptr<Iterator> Child; |
351 | size_t Limit; |
352 | size_t ItemsLeft; |
353 | }; |
354 | |
355 | } // end namespace |
356 | |
357 | std::vector<std::pair<DocID, float>> consume(Iterator &It) { |
358 | std::vector<std::pair<DocID, float>> Result; |
359 | for (; !It.reachedEnd(); It.advance()) |
360 | Result.emplace_back(args: It.peek(), args: It.consume()); |
361 | return Result; |
362 | } |
363 | |
364 | std::unique_ptr<Iterator> |
365 | Corpus::intersect(std::vector<std::unique_ptr<Iterator>> Children) const { |
366 | std::vector<std::unique_ptr<Iterator>> RealChildren; |
367 | for (auto &Child : Children) { |
368 | switch (Child->kind()) { |
369 | case Iterator::Kind::True: |
370 | break; // No effect, drop the iterator. |
371 | case Iterator::Kind::False: |
372 | return std::move(Child); // Intersection is empty. |
373 | case Iterator::Kind::And: { |
374 | // Inline nested AND into parent AND. |
375 | auto &NewChildren = static_cast<AndIterator *>(Child.get())->Children; |
376 | std::move(first: NewChildren.begin(), last: NewChildren.end(), |
377 | result: std::back_inserter(x&: RealChildren)); |
378 | break; |
379 | } |
380 | default: |
381 | RealChildren.push_back(x: std::move(Child)); |
382 | } |
383 | } |
384 | switch (RealChildren.size()) { |
385 | case 0: |
386 | return all(); |
387 | case 1: |
388 | return std::move(RealChildren.front()); |
389 | default: |
390 | return std::make_unique<AndIterator>(args: std::move(RealChildren)); |
391 | } |
392 | } |
393 | |
394 | std::unique_ptr<Iterator> |
395 | Corpus::unionOf(std::vector<std::unique_ptr<Iterator>> Children) const { |
396 | std::vector<std::unique_ptr<Iterator>> RealChildren; |
397 | for (auto &Child : Children) { |
398 | switch (Child->kind()) { |
399 | case Iterator::Kind::False: |
400 | break; // No effect, drop the iterator. |
401 | case Iterator::Kind::Or: { |
402 | // Inline nested OR into parent OR. |
403 | auto &NewChildren = static_cast<OrIterator *>(Child.get())->Children; |
404 | std::move(first: NewChildren.begin(), last: NewChildren.end(), |
405 | result: std::back_inserter(x&: RealChildren)); |
406 | break; |
407 | } |
408 | case Iterator::Kind::True: |
409 | // Don't return all(), which would discard sibling boosts. |
410 | default: |
411 | RealChildren.push_back(x: std::move(Child)); |
412 | } |
413 | } |
414 | switch (RealChildren.size()) { |
415 | case 0: |
416 | return none(); |
417 | case 1: |
418 | return std::move(RealChildren.front()); |
419 | default: |
420 | return std::make_unique<OrIterator>(args: std::move(RealChildren)); |
421 | } |
422 | } |
423 | |
424 | std::unique_ptr<Iterator> Corpus::all() const { |
425 | return std::make_unique<TrueIterator>(args: Size); |
426 | } |
427 | |
428 | std::unique_ptr<Iterator> Corpus::none() const { |
429 | return std::make_unique<FalseIterator>(); |
430 | } |
431 | |
432 | std::unique_ptr<Iterator> Corpus::boost(std::unique_ptr<Iterator> Child, |
433 | float Factor) const { |
434 | if (Factor == 1) |
435 | return Child; |
436 | if (Child->kind() == Iterator::Kind::False) |
437 | return Child; |
438 | return std::make_unique<BoostIterator>(args: std::move(Child), args&: Factor); |
439 | } |
440 | |
441 | std::unique_ptr<Iterator> Corpus::limit(std::unique_ptr<Iterator> Child, |
442 | size_t Limit) const { |
443 | if (Child->kind() == Iterator::Kind::False) |
444 | return Child; |
445 | return std::make_unique<LimitIterator>(args: std::move(Child), args&: Limit); |
446 | } |
447 | |
448 | } // namespace dex |
449 | } // namespace clangd |
450 | } // namespace clang |
451 | |