| 1 | //===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements classes used by several basic block reordering |
| 10 | // algorithms. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "bolt/Passes/ReorderAlgorithm.h" |
| 15 | #include "bolt/Core/BinaryBasicBlock.h" |
| 16 | #include "bolt/Core/BinaryFunction.h" |
| 17 | #include "llvm/Support/CommandLine.h" |
| 18 | #include "llvm/Transforms/Utils/CodeLayout.h" |
| 19 | #include <queue> |
| 20 | #include <random> |
| 21 | #include <stack> |
| 22 | |
| 23 | #undef DEBUG_TYPE |
| 24 | #define DEBUG_TYPE "bolt" |
| 25 | |
| 26 | using namespace llvm; |
| 27 | using namespace bolt; |
| 28 | |
| 29 | namespace opts { |
| 30 | |
| 31 | extern cl::OptionCategory BoltOptCategory; |
| 32 | extern cl::opt<bool> NoThreads; |
| 33 | |
| 34 | static cl::opt<unsigned> ColdThreshold( |
| 35 | "cold-threshold" , |
| 36 | cl::desc("tenths of percents of main entry frequency to use as a " |
| 37 | "threshold when evaluating whether a basic block is cold " |
| 38 | "(0 means it is only considered cold if the block has zero " |
| 39 | "samples). Default: 0 " ), |
| 40 | cl::init(Val: 0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); |
| 41 | |
| 42 | static cl::opt<bool> PrintClusters("print-clusters" , cl::desc("print clusters" ), |
| 43 | cl::Hidden, cl::cat(BoltOptCategory)); |
| 44 | |
| 45 | cl::opt<uint32_t> RandomSeed("bolt-seed" , cl::desc("seed for randomization" ), |
| 46 | cl::init(Val: 42), cl::Hidden, |
| 47 | cl::cat(BoltOptCategory)); |
| 48 | |
| 49 | } // namespace opts |
| 50 | |
namespace {

/// Mix \p Val into the running hash state \p Seed (boost-style combiner).
template <class T> inline void hashCombine(size_t &Seed, const T &Val) {
  Seed ^= std::hash<T>()(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}

/// Hash functor for std::pair that combines the hashes of both members.
template <typename A, typename B> struct HashPair {
  size_t operator()(const std::pair<A, B> &Val) const {
    size_t Seed = std::hash<A>()(Val.first);
    hashCombine(Seed, Val.second);
    return Seed;
  }
};

} // namespace
| 68 | |
| 69 | void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) { |
| 70 | // Create a separate MCCodeEmitter to allow lock-free execution |
| 71 | BinaryContext::IndependentCodeEmitter Emitter; |
| 72 | if (!opts::NoThreads) |
| 73 | Emitter = BC.createIndependentMCCodeEmitter(); |
| 74 | |
| 75 | AvgFreq.resize(new_size: Clusters.size(), x: 0.0); |
| 76 | for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { |
| 77 | double Freq = 0.0; |
| 78 | uint64_t ClusterSize = 0; |
| 79 | for (const BinaryBasicBlock *BB : Clusters[I]) { |
| 80 | if (BB->getNumNonPseudos() > 0) { |
| 81 | Freq += BB->getExecutionCount(); |
| 82 | // Estimate the size of a block in bytes at run time |
| 83 | // NOTE: This might be inaccurate |
| 84 | ClusterSize += BB->estimateSize(Emitter: Emitter.MCE.get()); |
| 85 | } |
| 86 | } |
| 87 | AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize; |
| 88 | } |
| 89 | } |
| 90 | |
| 91 | void ClusterAlgorithm::printClusters() const { |
| 92 | for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { |
| 93 | errs() << "Cluster number " << I; |
| 94 | if (AvgFreq.size() == Clusters.size()) |
| 95 | errs() << " (frequency: " << AvgFreq[I] << ")" ; |
| 96 | errs() << " : " ; |
| 97 | const char *Sep = "" ; |
| 98 | for (const BinaryBasicBlock *BB : Clusters[I]) { |
| 99 | errs() << Sep << BB->getName(); |
| 100 | Sep = ", " ; |
| 101 | } |
| 102 | errs() << "\n" ; |
| 103 | } |
| 104 | } |
| 105 | |
| 106 | void ClusterAlgorithm::reset() { |
| 107 | Clusters.clear(); |
| 108 | ClusterEdges.clear(); |
| 109 | AvgFreq.clear(); |
| 110 | } |
| 111 | |
| 112 | void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const { |
| 113 | OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count; |
| 114 | } |
| 115 | |
| 116 | size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const { |
| 117 | HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher; |
| 118 | return Hasher(std::make_pair(x: E.Src, y: E.Dst)); |
| 119 | } |
| 120 | |
| 121 | bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A, |
| 122 | const EdgeTy &B) const { |
| 123 | return A.Src == B.Src && A.Dst == B.Dst; |
| 124 | } |
| 125 | |
| 126 | void GreedyClusterAlgorithm::clusterBasicBlocks(BinaryFunction &BF, |
| 127 | bool ComputeEdges) { |
| 128 | reset(); |
| 129 | |
| 130 | // Greedy heuristic implementation for the TSP, applied to BB layout. Try to |
| 131 | // maximize weight during a path traversing all BBs. In this way, we will |
| 132 | // convert the hottest branches into fall-throughs. |
| 133 | |
| 134 | // This is the queue of edges from which we will pop edges and use them to |
| 135 | // cluster basic blocks in a greedy fashion. |
| 136 | std::vector<EdgeTy> Queue; |
| 137 | |
| 138 | // Initialize inter-cluster weights. |
| 139 | if (ComputeEdges) |
| 140 | ClusterEdges.resize(new_size: BF.getLayout().block_size()); |
| 141 | |
| 142 | // Initialize clusters and edge queue. |
| 143 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| 144 | // Create a cluster for this BB. |
| 145 | uint32_t I = Clusters.size(); |
| 146 | Clusters.emplace_back(); |
| 147 | std::vector<BinaryBasicBlock *> &Cluster = Clusters.back(); |
| 148 | Cluster.push_back(x: BB); |
| 149 | BBToClusterMap[BB] = I; |
| 150 | // Populate priority queue with edges. |
| 151 | auto BI = BB->branch_info_begin(); |
| 152 | for (const BinaryBasicBlock *I : BB->successors()) { |
| 153 | assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && |
| 154 | "attempted reordering blocks of function with no profile data" ); |
| 155 | Queue.emplace_back(args: EdgeTy(BB, I, BI->Count)); |
| 156 | ++BI; |
| 157 | } |
| 158 | } |
| 159 | // Sort and adjust the edge queue. |
| 160 | initQueue(Queue, BF); |
| 161 | |
| 162 | // Grow clusters in a greedy fashion. |
| 163 | while (!Queue.empty()) { |
| 164 | EdgeTy E = Queue.back(); |
| 165 | Queue.pop_back(); |
| 166 | |
| 167 | const BinaryBasicBlock *SrcBB = E.Src; |
| 168 | const BinaryBasicBlock *DstBB = E.Dst; |
| 169 | |
| 170 | LLVM_DEBUG(dbgs() << "Popped edge " ; E.print(dbgs()); dbgs() << "\n" ); |
| 171 | |
| 172 | // Case 1: BBSrc and BBDst are the same. Ignore this edge |
| 173 | if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) { |
| 174 | LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n" ); |
| 175 | continue; |
| 176 | } |
| 177 | |
| 178 | int I = BBToClusterMap[SrcBB]; |
| 179 | int J = BBToClusterMap[DstBB]; |
| 180 | |
| 181 | // Case 2: If they are already allocated at the same cluster, just increase |
| 182 | // the weight of this cluster |
| 183 | if (I == J) { |
| 184 | if (ComputeEdges) |
| 185 | ClusterEdges[I][I] += E.Count; |
| 186 | LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n" ); |
| 187 | continue; |
| 188 | } |
| 189 | |
| 190 | std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; |
| 191 | std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; |
| 192 | if (areClustersCompatible(Front: ClusterA, Back: ClusterB, E)) { |
| 193 | // Case 3: SrcBB is at the end of a cluster and DstBB is at the start, |
| 194 | // allowing us to merge two clusters. |
| 195 | for (const BinaryBasicBlock *BB : ClusterB) |
| 196 | BBToClusterMap[BB] = I; |
| 197 | ClusterA.insert(position: ClusterA.end(), first: ClusterB.begin(), last: ClusterB.end()); |
| 198 | ClusterB.clear(); |
| 199 | if (ComputeEdges) { |
| 200 | // Increase the intra-cluster edge count of cluster A with the count of |
| 201 | // this edge as well as with the total count of previously visited edges |
| 202 | // from cluster B cluster A. |
| 203 | ClusterEdges[I][I] += E.Count; |
| 204 | ClusterEdges[I][I] += ClusterEdges[J][I]; |
| 205 | // Iterate through all inter-cluster edges and transfer edges targeting |
| 206 | // cluster B to cluster A. |
| 207 | for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) |
| 208 | ClusterEdges[K][I] += ClusterEdges[K][J]; |
| 209 | } |
| 210 | // Adjust the weights of the remaining edges and re-sort the queue. |
| 211 | adjustQueue(Queue, BF); |
| 212 | LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n" ); |
| 213 | } else { |
| 214 | // Case 4: Both SrcBB and DstBB are allocated in positions we cannot |
| 215 | // merge them. Add the count of this edge to the inter-cluster edge count |
| 216 | // between clusters A and B to help us decide ordering between these |
| 217 | // clusters. |
| 218 | if (ComputeEdges) |
| 219 | ClusterEdges[I][J] += E.Count; |
| 220 | LLVM_DEBUG( |
| 221 | dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n" ); |
| 222 | } |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | void GreedyClusterAlgorithm::reset() { |
| 227 | ClusterAlgorithm::reset(); |
| 228 | BBToClusterMap.clear(); |
| 229 | } |
| 230 | |
| 231 | void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, |
| 232 | const BinaryFunction &BF) { |
| 233 | // Define a comparison function to establish SWO between edges. |
| 234 | auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) { |
| 235 | // With equal weights, prioritize branches with lower index |
| 236 | // source/destination. This helps to keep original block order for blocks |
| 237 | // when optimal order cannot be deducted from a profile. |
| 238 | if (A.Count == B.Count) { |
| 239 | const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A: A.Src, B: B.Src); |
| 240 | return (SrcOrder != 0) |
| 241 | ? SrcOrder > 0 |
| 242 | : BF.getOriginalLayoutRelativeOrder(A: A.Dst, B: B.Dst) > 0; |
| 243 | } |
| 244 | return A.Count < B.Count; |
| 245 | }; |
| 246 | |
| 247 | // Sort edges in increasing profile count order. |
| 248 | llvm::sort(C&: Queue, Comp); |
| 249 | } |
| 250 | |
/// No-op: the PH greedy variant orders edges by their static profile counts
/// (see initQueue), which never change after a cluster merge, so the queue
/// needs no re-weighting or re-sorting.
void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                           const BinaryFunction &BF) {
  // Nothing to do.
}
| 255 | |
| 256 | bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front, |
| 257 | const ClusterTy &Back, |
| 258 | const EdgeTy &E) const { |
| 259 | return Front.back() == E.Src && Back.front() == E.Dst; |
| 260 | } |
| 261 | |
| 262 | int64_t MinBranchGreedyClusterAlgorithm::calculateWeight( |
| 263 | const EdgeTy &E, const BinaryFunction &BF) const { |
| 264 | const BinaryBasicBlock *SrcBB = E.Src; |
| 265 | const BinaryBasicBlock *DstBB = E.Dst; |
| 266 | |
| 267 | // Initial weight value. |
| 268 | int64_t W = (int64_t)E.Count; |
| 269 | |
| 270 | // Adjust the weight by taking into account other edges with the same source. |
| 271 | auto BI = SrcBB->branch_info_begin(); |
| 272 | for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { |
| 273 | assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && |
| 274 | "attempted reordering blocks of function with no profile data" ); |
| 275 | assert(BI->Count <= std::numeric_limits<int64_t>::max() && |
| 276 | "overflow detected" ); |
| 277 | // Ignore edges with same source and destination, edges that target the |
| 278 | // entry block as well as the edge E itself. |
| 279 | if (SuccBB != SrcBB && SuccBB != *BF.getLayout().block_begin() && |
| 280 | SuccBB != DstBB) |
| 281 | W -= (int64_t)BI->Count; |
| 282 | ++BI; |
| 283 | } |
| 284 | |
| 285 | // Adjust the weight by taking into account other edges with the same |
| 286 | // destination. |
| 287 | for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { |
| 288 | // Ignore edges with same source and destination as well as the edge E |
| 289 | // itself. |
| 290 | if (PredBB == DstBB || PredBB == SrcBB) |
| 291 | continue; |
| 292 | auto BI = PredBB->branch_info_begin(); |
| 293 | for (const BinaryBasicBlock *SuccBB : PredBB->successors()) { |
| 294 | if (SuccBB == DstBB) |
| 295 | break; |
| 296 | ++BI; |
| 297 | } |
| 298 | assert(BI != PredBB->branch_info_end() && "invalid control flow graph" ); |
| 299 | assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && |
| 300 | "attempted reordering blocks of function with no profile data" ); |
| 301 | assert(BI->Count <= std::numeric_limits<int64_t>::max() && |
| 302 | "overflow detected" ); |
| 303 | W -= (int64_t)BI->Count; |
| 304 | } |
| 305 | |
| 306 | return W; |
| 307 | } |
| 308 | |
| 309 | void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, |
| 310 | const BinaryFunction &BF) { |
| 311 | // Initialize edge weights. |
| 312 | for (const EdgeTy &E : Queue) |
| 313 | Weight.emplace(args: std::make_pair(x: E, y: calculateWeight(E, BF))); |
| 314 | |
| 315 | // Sort edges in increasing weight order. |
| 316 | adjustQueue(Queue, BF); |
| 317 | } |
| 318 | |
| 319 | void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue, |
| 320 | const BinaryFunction &BF) { |
| 321 | // Define a comparison function to establish SWO between edges. |
| 322 | auto Comp = [&](const EdgeTy &A, const EdgeTy &B) { |
| 323 | // With equal weights, prioritize branches with lower index |
| 324 | // source/destination. This helps to keep original block order for blocks |
| 325 | // when optimal order cannot be deduced from a profile. |
| 326 | if (Weight[A] == Weight[B]) { |
| 327 | const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A: A.Src, B: B.Src); |
| 328 | return (SrcOrder != 0) |
| 329 | ? SrcOrder > 0 |
| 330 | : BF.getOriginalLayoutRelativeOrder(A: A.Dst, B: B.Dst) > 0; |
| 331 | } |
| 332 | return Weight[A] < Weight[B]; |
| 333 | }; |
| 334 | |
| 335 | // Iterate through all remaining edges to find edges that have their |
| 336 | // source and destination in the same cluster. |
| 337 | std::vector<EdgeTy> NewQueue; |
| 338 | for (const EdgeTy &E : Queue) { |
| 339 | const BinaryBasicBlock *SrcBB = E.Src; |
| 340 | const BinaryBasicBlock *DstBB = E.Dst; |
| 341 | |
| 342 | // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore |
| 343 | // this edge. |
| 344 | if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) { |
| 345 | LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge " ; E.print(dbgs()); |
| 346 | dbgs() << " (same src, dst)\n" ); |
| 347 | continue; |
| 348 | } |
| 349 | |
| 350 | int I = BBToClusterMap[SrcBB]; |
| 351 | int J = BBToClusterMap[DstBB]; |
| 352 | std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; |
| 353 | std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; |
| 354 | |
| 355 | // Case 2: They are already allocated at the same cluster or incompatible |
| 356 | // clusters. Adjust the weights of edges with the same source or |
| 357 | // destination, so that this edge has no effect on them any more, and ignore |
| 358 | // this edge. Also increase the intra- (or inter-) cluster edge count. |
| 359 | if (I == J || !areClustersCompatible(Front: ClusterA, Back: ClusterB, E)) { |
| 360 | if (!ClusterEdges.empty()) |
| 361 | ClusterEdges[I][J] += E.Count; |
| 362 | LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge " ; E.print(dbgs()); |
| 363 | dbgs() << " (src, dst belong to same cluster or incompatible " |
| 364 | "clusters)\n" ); |
| 365 | for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { |
| 366 | if (SuccBB == DstBB) |
| 367 | continue; |
| 368 | auto WI = Weight.find(x: EdgeTy(SrcBB, SuccBB, 0)); |
| 369 | assert(WI != Weight.end() && "CFG edge not found in Weight map" ); |
| 370 | WI->second += (int64_t)E.Count; |
| 371 | } |
| 372 | for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { |
| 373 | if (PredBB == SrcBB) |
| 374 | continue; |
| 375 | auto WI = Weight.find(x: EdgeTy(PredBB, DstBB, 0)); |
| 376 | assert(WI != Weight.end() && "CFG edge not found in Weight map" ); |
| 377 | WI->second += (int64_t)E.Count; |
| 378 | } |
| 379 | continue; |
| 380 | } |
| 381 | |
| 382 | // Case 3: None of the previous cases is true, so just keep this edge in |
| 383 | // the queue. |
| 384 | NewQueue.emplace_back(args: E); |
| 385 | } |
| 386 | |
| 387 | // Sort remaining edges in increasing weight order. |
| 388 | Queue.swap(x&: NewQueue); |
| 389 | llvm::sort(C&: Queue, Comp); |
| 390 | } |
| 391 | |
| 392 | bool MinBranchGreedyClusterAlgorithm::areClustersCompatible( |
| 393 | const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { |
| 394 | return Front.back() == E.Src && Back.front() == E.Dst; |
| 395 | } |
| 396 | |
| 397 | void MinBranchGreedyClusterAlgorithm::reset() { |
| 398 | GreedyClusterAlgorithm::reset(); |
| 399 | Weight.clear(); |
| 400 | } |
| 401 | |
| 402 | void TSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF, |
| 403 | BasicBlockOrder &Order) const { |
| 404 | std::vector<std::vector<uint64_t>> Weight; |
| 405 | std::vector<BinaryBasicBlock *> IndexToBB; |
| 406 | |
| 407 | const size_t N = BF.getLayout().block_size(); |
| 408 | assert(N <= std::numeric_limits<uint64_t>::digits && |
| 409 | "cannot use TSP solution for sizes larger than bits in uint64_t" ); |
| 410 | |
| 411 | // Populating weight map and index map |
| 412 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| 413 | BB->setLayoutIndex(IndexToBB.size()); |
| 414 | IndexToBB.push_back(x: BB); |
| 415 | } |
| 416 | Weight.resize(new_size: N); |
| 417 | for (const BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| 418 | auto BI = BB->branch_info_begin(); |
| 419 | Weight[BB->getLayoutIndex()].resize(new_size: N); |
| 420 | for (BinaryBasicBlock *SuccBB : BB->successors()) { |
| 421 | if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) |
| 422 | Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count; |
| 423 | ++BI; |
| 424 | } |
| 425 | } |
| 426 | |
| 427 | std::vector<std::vector<int64_t>> DP; |
| 428 | DP.resize(new_size: static_cast<size_t>(1) << N); |
| 429 | for (std::vector<int64_t> &Elmt : DP) |
| 430 | Elmt.resize(new_size: N, x: -1); |
| 431 | |
| 432 | // Start with the entry basic block being allocated with cost zero |
| 433 | DP[1][0] = 0; |
| 434 | // Walk through TSP solutions using a bitmask to represent state (current set |
| 435 | // of BBs in the layout) |
| 436 | uint64_t BestSet = 1; |
| 437 | uint64_t BestLast = 0; |
| 438 | int64_t BestWeight = 0; |
| 439 | for (uint64_t Set = 1; Set < (1ULL << N); ++Set) { |
| 440 | // Traverse each possibility of Last BB visited in this layout |
| 441 | for (uint64_t Last = 0; Last < N; ++Last) { |
| 442 | // Case 1: There is no possible layout with this BB as Last |
| 443 | if (DP[Set][Last] == -1) |
| 444 | continue; |
| 445 | |
| 446 | // Case 2: There is a layout with this Set and this Last, and we try |
| 447 | // to expand this set with New |
| 448 | for (uint64_t New = 1; New < N; ++New) { |
| 449 | // Case 2a: BB "New" is already in this Set |
| 450 | if ((Set & (1ULL << New)) != 0) |
| 451 | continue; |
| 452 | |
| 453 | // Case 2b: BB "New" is not in this set and we add it to this Set and |
| 454 | // record total weight of this layout with "New" as the last BB. |
| 455 | uint64_t NewSet = (Set | (1ULL << New)); |
| 456 | if (DP[NewSet][New] == -1) |
| 457 | DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; |
| 458 | DP[NewSet][New] = std::max(a: DP[NewSet][New], |
| 459 | b: DP[Set][Last] + (int64_t)Weight[Last][New]); |
| 460 | |
| 461 | if (DP[NewSet][New] > BestWeight) { |
| 462 | BestWeight = DP[NewSet][New]; |
| 463 | BestSet = NewSet; |
| 464 | BestLast = New; |
| 465 | } |
| 466 | } |
| 467 | } |
| 468 | } |
| 469 | |
| 470 | // Define final function layout based on layout that maximizes weight |
| 471 | uint64_t Last = BestLast; |
| 472 | uint64_t Set = BestSet; |
| 473 | BitVector Visited; |
| 474 | Visited.resize(N); |
| 475 | Visited[Last] = true; |
| 476 | Order.push_back(Elt: IndexToBB[Last]); |
| 477 | Set = Set & ~(1ULL << Last); |
| 478 | while (Set != 0) { |
| 479 | int64_t Best = -1; |
| 480 | uint64_t NewLast; |
| 481 | for (uint64_t I = 0; I < N; ++I) { |
| 482 | if (DP[Set][I] == -1) |
| 483 | continue; |
| 484 | int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0; |
| 485 | if (DP[Set][I] + AdjWeight > Best) { |
| 486 | NewLast = I; |
| 487 | Best = DP[Set][I] + AdjWeight; |
| 488 | } |
| 489 | } |
| 490 | Last = NewLast; |
| 491 | Visited[Last] = true; |
| 492 | Order.push_back(Elt: IndexToBB[Last]); |
| 493 | Set = Set & ~(1ULL << Last); |
| 494 | } |
| 495 | std::reverse(first: Order.begin(), last: Order.end()); |
| 496 | |
| 497 | // Finalize layout with BBs that weren't assigned to the layout using the |
| 498 | // input layout. |
| 499 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) |
| 500 | if (Visited[BB->getLayoutIndex()] == false) |
| 501 | Order.push_back(Elt: BB); |
| 502 | } |
| 503 | |
| 504 | void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF, |
| 505 | BasicBlockOrder &Order) const { |
| 506 | if (BF.getLayout().block_empty()) |
| 507 | return; |
| 508 | |
| 509 | // Do not change layout of functions w/o profile information |
| 510 | if (!BF.hasValidProfile() || BF.getLayout().block_size() <= 2) { |
| 511 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) |
| 512 | Order.push_back(Elt: BB); |
| 513 | return; |
| 514 | } |
| 515 | |
| 516 | // Create a separate MCCodeEmitter to allow lock-free execution |
| 517 | BinaryContext::IndependentCodeEmitter Emitter; |
| 518 | if (!opts::NoThreads) |
| 519 | Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter(); |
| 520 | |
| 521 | // Initialize CFG nodes and their data |
| 522 | std::vector<uint64_t> BlockSizes; |
| 523 | std::vector<uint64_t> BlockCounts; |
| 524 | BasicBlockOrder OrigOrder; |
| 525 | BF.getLayout().updateLayoutIndices(); |
| 526 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| 527 | uint64_t Size = std::max<uint64_t>(a: BB->estimateSize(Emitter: Emitter.MCE.get()), b: 1); |
| 528 | BlockSizes.push_back(x: Size); |
| 529 | BlockCounts.push_back(x: BB->getKnownExecutionCount()); |
| 530 | OrigOrder.push_back(Elt: BB); |
| 531 | } |
| 532 | |
| 533 | // Initialize CFG edges |
| 534 | std::vector<codelayout::EdgeCount> JumpCounts; |
| 535 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| 536 | auto BI = BB->branch_info_begin(); |
| 537 | for (BinaryBasicBlock *SuccBB : BB->successors()) { |
| 538 | assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && |
| 539 | "missing profile for a jump" ); |
| 540 | JumpCounts.push_back( |
| 541 | x: {.src: BB->getLayoutIndex(), .dst: SuccBB->getLayoutIndex(), .count: BI->Count}); |
| 542 | ++BI; |
| 543 | } |
| 544 | } |
| 545 | |
| 546 | // Run the layout algorithm |
| 547 | auto Result = |
| 548 | codelayout::computeExtTspLayout(NodeSizes: BlockSizes, NodeCounts: BlockCounts, EdgeCounts: JumpCounts); |
| 549 | Order.reserve(N: BF.getLayout().block_size()); |
| 550 | for (uint64_t R : Result) |
| 551 | Order.push_back(Elt: OrigOrder[R]); |
| 552 | } |
| 553 | |
| 554 | void OptimizeReorderAlgorithm::reorderBasicBlocks( |
| 555 | BinaryFunction &BF, BasicBlockOrder &Order) const { |
| 556 | if (BF.getLayout().block_empty()) |
| 557 | return; |
| 558 | |
| 559 | // Cluster basic blocks. |
| 560 | CAlgo->clusterBasicBlocks(BF); |
| 561 | |
| 562 | if (opts::PrintClusters) |
| 563 | CAlgo->printClusters(); |
| 564 | |
| 565 | // Arrange basic blocks according to clusters. |
| 566 | for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters) |
| 567 | Order.insert(I: Order.end(), From: Cluster.begin(), To: Cluster.end()); |
| 568 | } |
| 569 | |
| 570 | void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( |
| 571 | BinaryFunction &BF, BasicBlockOrder &Order) const { |
| 572 | if (BF.getLayout().block_empty()) |
| 573 | return; |
| 574 | |
| 575 | // Cluster basic blocks. |
| 576 | CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true); |
| 577 | std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; |
| 578 | std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges = |
| 579 | CAlgo->ClusterEdges; |
| 580 | |
| 581 | // Compute clusters' average frequencies. |
| 582 | CAlgo->computeClusterAverageFrequency(BC: BF.getBinaryContext()); |
| 583 | std::vector<double> &AvgFreq = CAlgo->AvgFreq; |
| 584 | |
| 585 | if (opts::PrintClusters) |
| 586 | CAlgo->printClusters(); |
| 587 | |
| 588 | // Cluster layout order |
| 589 | std::vector<uint32_t> ClusterOrder; |
| 590 | |
| 591 | // Do a topological sort for clusters, prioritizing frequently-executed BBs |
| 592 | // during the traversal. |
| 593 | std::stack<uint32_t> Stack; |
| 594 | std::vector<uint32_t> Status; |
| 595 | std::vector<uint32_t> Parent; |
| 596 | Status.resize(new_size: Clusters.size(), x: 0); |
| 597 | Parent.resize(new_size: Clusters.size(), x: 0); |
| 598 | constexpr uint32_t STACKED = 1; |
| 599 | constexpr uint32_t VISITED = 2; |
| 600 | Status[0] = STACKED; |
| 601 | Stack.push(x: 0); |
| 602 | while (!Stack.empty()) { |
| 603 | uint32_t I = Stack.top(); |
| 604 | if (!(Status[I] & VISITED)) { |
| 605 | Status[I] |= VISITED; |
| 606 | // Order successors by weight |
| 607 | auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { |
| 608 | return ClusterEdges[I][A] > ClusterEdges[I][B]; |
| 609 | }; |
| 610 | std::priority_queue<uint32_t, std::vector<uint32_t>, |
| 611 | decltype(ClusterComp)> |
| 612 | SuccQueue(ClusterComp); |
| 613 | for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) { |
| 614 | if (Target.second > 0 && !(Status[Target.first] & STACKED) && |
| 615 | !Clusters[Target.first].empty()) { |
| 616 | Parent[Target.first] = I; |
| 617 | Status[Target.first] = STACKED; |
| 618 | SuccQueue.push(x: Target.first); |
| 619 | } |
| 620 | } |
| 621 | while (!SuccQueue.empty()) { |
| 622 | Stack.push(x: SuccQueue.top()); |
| 623 | SuccQueue.pop(); |
| 624 | } |
| 625 | continue; |
| 626 | } |
| 627 | // Already visited this node |
| 628 | Stack.pop(); |
| 629 | ClusterOrder.push_back(x: I); |
| 630 | } |
| 631 | std::reverse(first: ClusterOrder.begin(), last: ClusterOrder.end()); |
| 632 | // Put unreachable clusters at the end |
| 633 | for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) |
| 634 | if (!(Status[I] & VISITED) && !Clusters[I].empty()) |
| 635 | ClusterOrder.push_back(x: I); |
| 636 | |
| 637 | // Sort nodes with equal precedence |
| 638 | auto Beg = ClusterOrder.begin(); |
| 639 | // Don't reorder the first cluster, which contains the function entry point |
| 640 | ++Beg; |
| 641 | std::stable_sort(first: Beg, last: ClusterOrder.end(), |
| 642 | comp: [&AvgFreq, &Parent](uint32_t A, uint32_t B) { |
| 643 | uint32_t P = Parent[A]; |
| 644 | while (Parent[P] != 0) { |
| 645 | if (Parent[P] == B) |
| 646 | return false; |
| 647 | P = Parent[P]; |
| 648 | } |
| 649 | P = Parent[B]; |
| 650 | while (Parent[P] != 0) { |
| 651 | if (Parent[P] == A) |
| 652 | return true; |
| 653 | P = Parent[P]; |
| 654 | } |
| 655 | return AvgFreq[A] > AvgFreq[B]; |
| 656 | }); |
| 657 | |
| 658 | if (opts::PrintClusters) { |
| 659 | errs() << "New cluster order: " ; |
| 660 | const char *Sep = "" ; |
| 661 | for (uint32_t O : ClusterOrder) { |
| 662 | errs() << Sep << O; |
| 663 | Sep = ", " ; |
| 664 | } |
| 665 | errs() << '\n'; |
| 666 | } |
| 667 | |
| 668 | // Arrange basic blocks according to cluster order. |
| 669 | for (uint32_t ClusterIndex : ClusterOrder) { |
| 670 | ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; |
| 671 | Order.insert(I: Order.end(), From: Cluster.begin(), To: Cluster.end()); |
| 672 | } |
| 673 | } |
| 674 | |
| 675 | void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( |
| 676 | BinaryFunction &BF, BasicBlockOrder &Order) const { |
| 677 | if (BF.getLayout().block_empty()) |
| 678 | return; |
| 679 | |
| 680 | const uint64_t ColdThreshold = |
| 681 | opts::ColdThreshold * |
| 682 | (*BF.getLayout().block_begin())->getExecutionCount() / 1000; |
| 683 | |
| 684 | // Cluster basic blocks. |
| 685 | CAlgo->clusterBasicBlocks(BF); |
| 686 | std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; |
| 687 | |
| 688 | // Compute clusters' average frequencies. |
| 689 | CAlgo->computeClusterAverageFrequency(BC: BF.getBinaryContext()); |
| 690 | std::vector<double> &AvgFreq = CAlgo->AvgFreq; |
| 691 | |
| 692 | if (opts::PrintClusters) |
| 693 | CAlgo->printClusters(); |
| 694 | |
| 695 | // Cluster layout order |
| 696 | std::vector<uint32_t> ClusterOrder; |
| 697 | |
| 698 | // Order clusters based on average instruction execution frequency |
| 699 | for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) |
| 700 | if (!Clusters[I].empty()) |
| 701 | ClusterOrder.push_back(x: I); |
| 702 | // Don't reorder the first cluster, which contains the function entry point |
| 703 | std::stable_sort( |
| 704 | first: std::next(x: ClusterOrder.begin()), last: ClusterOrder.end(), |
| 705 | comp: [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; }); |
| 706 | |
| 707 | if (opts::PrintClusters) { |
| 708 | errs() << "New cluster order: " ; |
| 709 | const char *Sep = "" ; |
| 710 | for (uint32_t O : ClusterOrder) { |
| 711 | errs() << Sep << O; |
| 712 | Sep = ", " ; |
| 713 | } |
| 714 | errs() << '\n'; |
| 715 | } |
| 716 | |
| 717 | // Arrange basic blocks according to cluster order. |
| 718 | for (uint32_t ClusterIndex : ClusterOrder) { |
| 719 | ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; |
| 720 | Order.insert(I: Order.end(), From: Cluster.begin(), To: Cluster.end()); |
| 721 | // Force zero execution count on clusters that do not meet the cut off |
| 722 | // specified by --cold-threshold. |
| 723 | if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold)) |
| 724 | for (BinaryBasicBlock *BBPtr : Cluster) |
| 725 | BBPtr->setExecutionCount(0); |
| 726 | } |
| 727 | } |
| 728 | |
| 729 | void ReverseReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF, |
| 730 | BasicBlockOrder &Order) const { |
| 731 | if (BF.getLayout().block_empty()) |
| 732 | return; |
| 733 | |
| 734 | BinaryBasicBlock *FirstBB = *BF.getLayout().block_begin(); |
| 735 | Order.push_back(Elt: FirstBB); |
| 736 | for (auto RLI = BF.getLayout().block_rbegin(); *RLI != FirstBB; ++RLI) |
| 737 | Order.push_back(Elt: *RLI); |
| 738 | } |
| 739 | |
| 740 | void RandomClusterReorderAlgorithm::reorderBasicBlocks( |
| 741 | BinaryFunction &BF, BasicBlockOrder &Order) const { |
| 742 | if (BF.getLayout().block_empty()) |
| 743 | return; |
| 744 | |
| 745 | // Cluster basic blocks. |
| 746 | CAlgo->clusterBasicBlocks(BF); |
| 747 | std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; |
| 748 | |
| 749 | if (opts::PrintClusters) |
| 750 | CAlgo->printClusters(); |
| 751 | |
| 752 | // Cluster layout order |
| 753 | std::vector<uint32_t> ClusterOrder; |
| 754 | |
| 755 | // Order clusters based on average instruction execution frequency |
| 756 | for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) |
| 757 | if (!Clusters[I].empty()) |
| 758 | ClusterOrder.push_back(x: I); |
| 759 | |
| 760 | std::shuffle(first: std::next(x: ClusterOrder.begin()), last: ClusterOrder.end(), |
| 761 | g: std::default_random_engine(opts::RandomSeed.getValue())); |
| 762 | |
| 763 | if (opts::PrintClusters) { |
| 764 | errs() << "New cluster order: " ; |
| 765 | const char *Sep = "" ; |
| 766 | for (uint32_t O : ClusterOrder) { |
| 767 | errs() << Sep << O; |
| 768 | Sep = ", " ; |
| 769 | } |
| 770 | errs() << '\n'; |
| 771 | } |
| 772 | |
| 773 | // Arrange basic blocks according to cluster order. |
| 774 | for (uint32_t ClusterIndex : ClusterOrder) { |
| 775 | ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; |
| 776 | Order.insert(I: Order.end(), From: Cluster.begin(), To: Cluster.end()); |
| 777 | } |
| 778 | } |
| 779 | |