| 1 | //===--- Iterator.cpp - Query Symbol Retrieval ------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "Iterator.h" |
| 10 | #include "llvm/ADT/STLExtras.h" |
| 11 | #include <algorithm> |
| 12 | #include <cassert> |
| 13 | #include <numeric> |
| 14 | |
| 15 | namespace clang { |
| 16 | namespace clangd { |
| 17 | namespace dex { |
| 18 | namespace { |
| 19 | |
| 20 | /// Implements Iterator over the intersection of other iterators. |
| 21 | /// |
| 22 | /// AndIterator iterates through common items among all children. It becomes |
| 23 | /// exhausted as soon as any child becomes exhausted. After each mutation, the |
| 24 | /// iterator restores the invariant: all children must point to the same item. |
| 25 | class AndIterator : public Iterator { |
| 26 | public: |
| 27 | explicit AndIterator(std::vector<std::unique_ptr<Iterator>> AllChildren) |
| 28 | : Iterator(Kind::And), Children(std::move(AllChildren)) { |
| 29 | assert(!Children.empty() && "AND iterator should have at least one child." ); |
| 30 | // Establish invariants. |
| 31 | for (const auto &Child : Children) |
| 32 | ReachedEnd |= Child->reachedEnd(); |
| 33 | sync(); |
| 34 | // When children are sorted by the estimateSize(), sync() calls are more |
| 35 | // effective. Each sync() starts with the first child and makes sure all |
| 36 | // children point to the same element. If any child is "above" the previous |
| 37 | // ones, the algorithm resets and advances the children to the next |
| 38 | // highest element starting from the front. When child iterators in the |
| 39 | // beginning have smaller estimated size, the sync() will have less restarts |
| 40 | // and become more effective. |
| 41 | llvm::sort(C&: Children, Comp: [](const std::unique_ptr<Iterator> &LHS, |
| 42 | const std::unique_ptr<Iterator> &RHS) { |
| 43 | return LHS->estimateSize() < RHS->estimateSize(); |
| 44 | }); |
| 45 | } |
| 46 | |
| 47 | bool reachedEnd() const override { return ReachedEnd; } |
| 48 | |
| 49 | /// Advances all children to the next common item. |
| 50 | void advance() override { |
| 51 | assert(!reachedEnd() && "AND iterator can't advance() at the end." ); |
| 52 | Children.front()->advance(); |
| 53 | sync(); |
| 54 | } |
| 55 | |
| 56 | /// Advances all children to the next common item with DocumentID >= ID. |
| 57 | void advanceTo(DocID ID) override { |
| 58 | assert(!reachedEnd() && "AND iterator can't advanceTo() at the end." ); |
| 59 | Children.front()->advanceTo(ID); |
| 60 | sync(); |
| 61 | } |
| 62 | |
| 63 | DocID peek() const override { return Children.front()->peek(); } |
| 64 | |
| 65 | float consume() override { |
| 66 | assert(!reachedEnd() && "AND iterator can't consume() at the end." ); |
| 67 | float Boost = 1; |
| 68 | for (const auto &Child : Children) |
| 69 | Boost *= Child->consume(); |
| 70 | return Boost; |
| 71 | } |
| 72 | |
| 73 | size_t estimateSize() const override { |
| 74 | return Children.front()->estimateSize(); |
| 75 | } |
| 76 | |
| 77 | private: |
| 78 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
| 79 | OS << "(& " ; |
| 80 | auto *Separator = "" ; |
| 81 | for (const auto &Child : Children) { |
| 82 | OS << Separator << *Child; |
| 83 | Separator = " " ; |
| 84 | } |
| 85 | OS << ')'; |
| 86 | return OS; |
| 87 | } |
| 88 | |
| 89 | /// Restores class invariants: each child will point to the same element after |
| 90 | /// sync. |
| 91 | void sync() { |
| 92 | ReachedEnd |= Children.front()->reachedEnd(); |
| 93 | if (ReachedEnd) |
| 94 | return; |
| 95 | auto SyncID = Children.front()->peek(); |
| 96 | // Indicates whether any child needs to be advanced to new SyncID. |
| 97 | bool NeedsAdvance = false; |
| 98 | do { |
| 99 | NeedsAdvance = false; |
| 100 | for (auto &Child : Children) { |
| 101 | Child->advanceTo(ID: SyncID); |
| 102 | ReachedEnd |= Child->reachedEnd(); |
| 103 | // If any child reaches end And iterator can not match any other items. |
| 104 | // In this case, just terminate the process. |
| 105 | if (ReachedEnd) |
| 106 | return; |
| 107 | // Cache the result so that peek() is not called again as it may be |
| 108 | // quite expensive in AND with large subtrees. |
| 109 | auto Candidate = Child->peek(); |
| 110 | // If any child goes beyond given ID (i.e. ID is not the common item), |
| 111 | // all children should be advanced to the next common item. |
| 112 | if (Candidate > SyncID) { |
| 113 | SyncID = Candidate; |
| 114 | NeedsAdvance = true; |
| 115 | // Reset and try to sync again. Sync starts with the first child as |
| 116 | // this is the cheapest (smallest size estimate). This way advanceTo |
| 117 | // is called on the large posting lists once the sync point is very |
| 118 | // likely. |
| 119 | break; |
| 120 | } |
| 121 | } |
| 122 | } while (NeedsAdvance); |
| 123 | } |
| 124 | |
| 125 | /// AndIterator owns its children and ensures that all of them point to the |
| 126 | /// same element. As soon as one child gets exhausted, AndIterator can no |
| 127 | /// longer advance and has reached its end. |
| 128 | std::vector<std::unique_ptr<Iterator>> Children; |
| 129 | /// Indicates whether any child is exhausted. It is cheaper to maintain and |
| 130 | /// update the field, rather than traversing the whole subtree in each |
| 131 | /// reachedEnd() call. |
| 132 | bool ReachedEnd = false; |
| 133 | friend Corpus; // For optimizations. |
| 134 | }; |
| 135 | |
| 136 | /// Implements Iterator over the union of other iterators. |
| 137 | /// |
| 138 | /// OrIterator iterates through all items which can be pointed to by at least |
| 139 | /// one child. To preserve the sorted order, this iterator always advances the |
| 140 | /// child with smallest Child->peek() value. OrIterator becomes exhausted as |
| 141 | /// soon as all of its children are exhausted. |
| 142 | class OrIterator : public Iterator { |
| 143 | public: |
| 144 | explicit OrIterator(std::vector<std::unique_ptr<Iterator>> AllChildren) |
| 145 | : Iterator(Kind::Or), Children(std::move(AllChildren)) { |
| 146 | assert(!Children.empty() && "OR iterator should have at least one child." ); |
| 147 | } |
| 148 | |
| 149 | /// Returns true if all children are exhausted. |
| 150 | bool reachedEnd() const override { |
| 151 | for (const auto &Child : Children) |
| 152 | if (!Child->reachedEnd()) |
| 153 | return false; |
| 154 | return true; |
| 155 | } |
| 156 | |
| 157 | /// Moves each child pointing to the smallest DocID to the next item. |
| 158 | void advance() override { |
| 159 | assert(!reachedEnd() && "OR iterator can't advance() at the end." ); |
| 160 | const auto SmallestID = peek(); |
| 161 | for (const auto &Child : Children) |
| 162 | if (!Child->reachedEnd() && Child->peek() == SmallestID) |
| 163 | Child->advance(); |
| 164 | } |
| 165 | |
| 166 | /// Advances each child to the next existing element with DocumentID >= ID. |
| 167 | void advanceTo(DocID ID) override { |
| 168 | assert(!reachedEnd() && "OR iterator can't advanceTo() at the end." ); |
| 169 | for (const auto &Child : Children) |
| 170 | if (!Child->reachedEnd()) |
| 171 | Child->advanceTo(ID); |
| 172 | } |
| 173 | |
| 174 | /// Returns the element under cursor of the child with smallest Child->peek() |
| 175 | /// value. |
| 176 | DocID peek() const override { |
| 177 | assert(!reachedEnd() && "OR iterator can't peek() at the end." ); |
| 178 | DocID Result = std::numeric_limits<DocID>::max(); |
| 179 | |
| 180 | for (const auto &Child : Children) |
| 181 | if (!Child->reachedEnd()) |
| 182 | Result = std::min(a: Result, b: Child->peek()); |
| 183 | |
| 184 | return Result; |
| 185 | } |
| 186 | |
| 187 | // Returns the maximum boosting score among all Children when iterator |
| 188 | // points to the current ID. |
| 189 | float consume() override { |
| 190 | assert(!reachedEnd() && "OR iterator can't consume() at the end." ); |
| 191 | const DocID ID = peek(); |
| 192 | float Boost = 1; |
| 193 | for (const auto &Child : Children) |
| 194 | if (!Child->reachedEnd() && Child->peek() == ID) |
| 195 | Boost = std::max(a: Boost, b: Child->consume()); |
| 196 | return Boost; |
| 197 | } |
| 198 | |
| 199 | size_t estimateSize() const override { |
| 200 | size_t Size = 0; |
| 201 | for (const auto &Child : Children) |
| 202 | Size = std::max(a: Size, b: Child->estimateSize()); |
| 203 | return Size; |
| 204 | } |
| 205 | |
| 206 | private: |
| 207 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
| 208 | OS << "(| " ; |
| 209 | auto *Separator = "" ; |
| 210 | for (const auto &Child : Children) { |
| 211 | OS << Separator << *Child; |
| 212 | Separator = " " ; |
| 213 | } |
| 214 | OS << ')'; |
| 215 | return OS; |
| 216 | } |
| 217 | |
| 218 | std::vector<std::unique_ptr<Iterator>> Children; |
| 219 | friend Corpus; // For optimizations. |
| 220 | }; |
| 221 | |
| 222 | /// TrueIterator handles PostingLists which contain all items of the index. It |
| 223 | /// stores size of the virtual posting list, and all operations are performed |
| 224 | /// in O(1). |
| 225 | class TrueIterator : public Iterator { |
| 226 | public: |
| 227 | explicit TrueIterator(DocID Size) : Iterator(Kind::True), Size(Size) {} |
| 228 | |
| 229 | bool reachedEnd() const override { return Index >= Size; } |
| 230 | |
| 231 | void advance() override { |
| 232 | assert(!reachedEnd() && "TRUE iterator can't advance() at the end." ); |
| 233 | ++Index; |
| 234 | } |
| 235 | |
| 236 | void advanceTo(DocID ID) override { |
| 237 | assert(!reachedEnd() && "TRUE iterator can't advanceTo() at the end." ); |
| 238 | Index = std::min(a: ID, b: Size); |
| 239 | } |
| 240 | |
| 241 | DocID peek() const override { |
| 242 | assert(!reachedEnd() && "TRUE iterator can't peek() at the end." ); |
| 243 | return Index; |
| 244 | } |
| 245 | |
| 246 | float consume() override { |
| 247 | assert(!reachedEnd() && "TRUE iterator can't consume() at the end." ); |
| 248 | return 1; |
| 249 | } |
| 250 | |
| 251 | size_t estimateSize() const override { return Size; } |
| 252 | |
| 253 | private: |
| 254 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
| 255 | return OS << "true" ; |
| 256 | } |
| 257 | |
| 258 | DocID Index = 0; |
| 259 | /// Size of the underlying virtual PostingList. |
| 260 | DocID Size; |
| 261 | }; |
| 262 | |
| 263 | /// FalseIterator yields no results. |
| 264 | class FalseIterator : public Iterator { |
| 265 | public: |
| 266 | FalseIterator() : Iterator(Kind::False) {} |
| 267 | bool reachedEnd() const override { return true; } |
| 268 | void advance() override { assert(false); } |
| 269 | void advanceTo(DocID ID) override { assert(false); } |
| 270 | DocID peek() const override { |
| 271 | assert(false); |
| 272 | return 0; |
| 273 | } |
| 274 | float consume() override { |
| 275 | assert(false); |
| 276 | return 1; |
| 277 | } |
| 278 | size_t estimateSize() const override { return 0; } |
| 279 | |
| 280 | private: |
| 281 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
| 282 | return OS << "false" ; |
| 283 | } |
| 284 | }; |
| 285 | |
| 286 | /// Boost iterator is a wrapper around its child which multiplies scores of |
| 287 | /// each retrieved item by a given factor. |
| 288 | class BoostIterator : public Iterator { |
| 289 | public: |
| 290 | BoostIterator(std::unique_ptr<Iterator> Child, float Factor) |
| 291 | : Child(std::move(Child)), Factor(Factor) {} |
| 292 | |
| 293 | bool reachedEnd() const override { return Child->reachedEnd(); } |
| 294 | |
| 295 | void advance() override { Child->advance(); } |
| 296 | |
| 297 | void advanceTo(DocID ID) override { Child->advanceTo(ID); } |
| 298 | |
| 299 | DocID peek() const override { return Child->peek(); } |
| 300 | |
| 301 | float consume() override { return Child->consume() * Factor; } |
| 302 | |
| 303 | size_t estimateSize() const override { return Child->estimateSize(); } |
| 304 | |
| 305 | private: |
| 306 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
| 307 | return OS << "(* " << Factor << ' ' << *Child << ')'; |
| 308 | } |
| 309 | |
| 310 | std::unique_ptr<Iterator> Child; |
| 311 | float Factor; |
| 312 | }; |
| 313 | |
| 314 | /// This iterator limits the number of items retrieved from the child iterator |
| 315 | /// on top of the query tree. To ensure that query tree with LIMIT iterators |
| 316 | /// inside works correctly, users have to call Root->consume(Root->peek()) each |
| 317 | /// time item is retrieved at the root of query tree. |
| 318 | class LimitIterator : public Iterator { |
| 319 | public: |
| 320 | LimitIterator(std::unique_ptr<Iterator> Child, size_t Limit) |
| 321 | : Child(std::move(Child)), Limit(Limit), ItemsLeft(Limit) {} |
| 322 | |
| 323 | bool reachedEnd() const override { |
| 324 | return ItemsLeft == 0 || Child->reachedEnd(); |
| 325 | } |
| 326 | |
| 327 | void advance() override { Child->advance(); } |
| 328 | |
| 329 | void advanceTo(DocID ID) override { Child->advanceTo(ID); } |
| 330 | |
| 331 | DocID peek() const override { return Child->peek(); } |
| 332 | |
| 333 | /// Decreases the limit in case the element consumed at top of the query tree |
| 334 | /// comes from the underlying iterator. |
| 335 | float consume() override { |
| 336 | assert(!reachedEnd() && "LimitIterator can't consume() at the end." ); |
| 337 | --ItemsLeft; |
| 338 | return Child->consume(); |
| 339 | } |
| 340 | |
| 341 | size_t estimateSize() const override { |
| 342 | return std::min(a: Child->estimateSize(), b: Limit); |
| 343 | } |
| 344 | |
| 345 | private: |
| 346 | llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { |
| 347 | return OS << "(LIMIT " << Limit << " " << *Child << ')'; |
| 348 | } |
| 349 | |
| 350 | std::unique_ptr<Iterator> Child; |
| 351 | size_t Limit; |
| 352 | size_t ItemsLeft; |
| 353 | }; |
| 354 | |
| 355 | } // end namespace |
| 356 | |
| 357 | std::vector<std::pair<DocID, float>> consume(Iterator &It) { |
| 358 | std::vector<std::pair<DocID, float>> Result; |
| 359 | for (; !It.reachedEnd(); It.advance()) |
| 360 | Result.emplace_back(args: It.peek(), args: It.consume()); |
| 361 | return Result; |
| 362 | } |
| 363 | |
| 364 | std::unique_ptr<Iterator> |
| 365 | Corpus::intersect(std::vector<std::unique_ptr<Iterator>> Children) const { |
| 366 | std::vector<std::unique_ptr<Iterator>> RealChildren; |
| 367 | for (auto &Child : Children) { |
| 368 | switch (Child->kind()) { |
| 369 | case Iterator::Kind::True: |
| 370 | break; // No effect, drop the iterator. |
| 371 | case Iterator::Kind::False: |
| 372 | return std::move(Child); // Intersection is empty. |
| 373 | case Iterator::Kind::And: { |
| 374 | // Inline nested AND into parent AND. |
| 375 | auto &NewChildren = static_cast<AndIterator *>(Child.get())->Children; |
| 376 | std::move(first: NewChildren.begin(), last: NewChildren.end(), |
| 377 | result: std::back_inserter(x&: RealChildren)); |
| 378 | break; |
| 379 | } |
| 380 | default: |
| 381 | RealChildren.push_back(x: std::move(Child)); |
| 382 | } |
| 383 | } |
| 384 | switch (RealChildren.size()) { |
| 385 | case 0: |
| 386 | return all(); |
| 387 | case 1: |
| 388 | return std::move(RealChildren.front()); |
| 389 | default: |
| 390 | return std::make_unique<AndIterator>(args: std::move(RealChildren)); |
| 391 | } |
| 392 | } |
| 393 | |
| 394 | std::unique_ptr<Iterator> |
| 395 | Corpus::unionOf(std::vector<std::unique_ptr<Iterator>> Children) const { |
| 396 | std::vector<std::unique_ptr<Iterator>> RealChildren; |
| 397 | for (auto &Child : Children) { |
| 398 | switch (Child->kind()) { |
| 399 | case Iterator::Kind::False: |
| 400 | break; // No effect, drop the iterator. |
| 401 | case Iterator::Kind::Or: { |
| 402 | // Inline nested OR into parent OR. |
| 403 | auto &NewChildren = static_cast<OrIterator *>(Child.get())->Children; |
| 404 | std::move(first: NewChildren.begin(), last: NewChildren.end(), |
| 405 | result: std::back_inserter(x&: RealChildren)); |
| 406 | break; |
| 407 | } |
| 408 | case Iterator::Kind::True: |
| 409 | // Don't return all(), which would discard sibling boosts. |
| 410 | default: |
| 411 | RealChildren.push_back(x: std::move(Child)); |
| 412 | } |
| 413 | } |
| 414 | switch (RealChildren.size()) { |
| 415 | case 0: |
| 416 | return none(); |
| 417 | case 1: |
| 418 | return std::move(RealChildren.front()); |
| 419 | default: |
| 420 | return std::make_unique<OrIterator>(args: std::move(RealChildren)); |
| 421 | } |
| 422 | } |
| 423 | |
| 424 | std::unique_ptr<Iterator> Corpus::all() const { |
| 425 | return std::make_unique<TrueIterator>(args: Size); |
| 426 | } |
| 427 | |
| 428 | std::unique_ptr<Iterator> Corpus::none() const { |
| 429 | return std::make_unique<FalseIterator>(); |
| 430 | } |
| 431 | |
| 432 | std::unique_ptr<Iterator> Corpus::boost(std::unique_ptr<Iterator> Child, |
| 433 | float Factor) const { |
| 434 | if (Factor == 1) |
| 435 | return Child; |
| 436 | if (Child->kind() == Iterator::Kind::False) |
| 437 | return Child; |
| 438 | return std::make_unique<BoostIterator>(args: std::move(Child), args&: Factor); |
| 439 | } |
| 440 | |
| 441 | std::unique_ptr<Iterator> Corpus::limit(std::unique_ptr<Iterator> Child, |
| 442 | size_t Limit) const { |
| 443 | if (Child->kind() == Iterator::Kind::False) |
| 444 | return Child; |
| 445 | return std::make_unique<LimitIterator>(args: std::move(Child), args&: Limit); |
| 446 | } |
| 447 | |
| 448 | } // namespace dex |
| 449 | } // namespace clangd |
| 450 | } // namespace clang |
| 451 | |