//===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements classes used by several basic block reordering
// algorithms.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/CodeLayout.h"
#include <queue>
#include <random>
#include <stack>

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> NoThreads;

static cl::opt<unsigned> ColdThreshold(
    "cold-threshold",
    cl::desc("tenths of percents of main entry frequency to use as a "
             "threshold when evaluating whether a basic block is cold "
             "(0 means it is only considered cold if the block has zero "
             "samples). Default: 0 "),
    cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool> PrintClusters("print-clusters", cl::desc("print clusters"),
                                   cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<uint32_t> RandomSeed("bolt-seed", cl::desc("seed for randomization"),
                             cl::init(42), cl::Hidden,
                             cl::cat(BoltOptCategory));

} // namespace opts

namespace {

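// Boost-style hash combining (cf. boost::hash_combine): 0x9e3779b9 is the
// 32-bit golden-ratio constant, and the shift terms spread the incoming
// hash across the seed's bits.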
template <class T> inline void hashCombine(size_t &Seed, const T &Val) {
  std::hash<T> Hasher;
  Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}

template <typename A, typename B> struct HashPair {
  size_t operator()(const std::pair<A, B> &Val) const {
    std::hash<A> Hasher;
    size_t Seed = Hasher(Val.first);
    hashCombine(Seed, Val.second);
    return Seed;
  }
};

} // namespace

void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
  // Create a separate MCCodeEmitter to allow lock-free execution.
  BinaryContext::IndependentCodeEmitter Emitter;
  if (!opts::NoThreads)
    Emitter = BC.createIndependentMCCodeEmitter();

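  // A cluster's average frequency is its total execution count divided by its
  // estimated code size in bytes, i.e. a hotness density rather than a raw
  // count.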
  AvgFreq.resize(Clusters.size(), 0.0);
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
    double Freq = 0.0;
    uint64_t ClusterSize = 0;
    for (const BinaryBasicBlock *BB : Clusters[I]) {
      if (BB->getNumNonPseudos() > 0) {
        Freq += BB->getExecutionCount();
        // Estimate the size of a block in bytes at run time.
        // NOTE: This might be inaccurate.
        ClusterSize += BB->estimateSize(Emitter.MCE.get());
      }
    }
    AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
  }
}

void ClusterAlgorithm::printClusters() const {
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
    errs() << "Cluster number " << I;
    if (AvgFreq.size() == Clusters.size())
      errs() << " (frequency: " << AvgFreq[I] << ")";
    errs() << " : ";
    const char *Sep = "";
    for (const BinaryBasicBlock *BB : Clusters[I]) {
      errs() << Sep << BB->getName();
      Sep = ", ";
    }
    errs() << "\n";
  }
}

void ClusterAlgorithm::reset() {
  Clusters.clear();
  ClusterEdges.clear();
  AvgFreq.clear();
}

void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
  OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
}

size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
  HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
  return Hasher(std::make_pair(E.Src, E.Dst));
}

bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A,
                                                   const EdgeTy &B) const {
  return A.Src == B.Src && A.Dst == B.Dst;
}

void GreedyClusterAlgorithm::clusterBasicBlocks(BinaryFunction &BF,
                                                bool ComputeEdges) {
  reset();

  // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
  // maximize weight along a path traversing all BBs. In this way, we will
  // convert the hottest branches into fall-throughs.
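  // For example, if the hottest edge in the function is A -> B with count 100,
  // it is popped first and A and B land in the same cluster, turning the
  // branch into a fall-through; a colder edge A -> C popped later finds A no
  // longer at the end of its cluster, so C stays in a separate cluster.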

  // This is the queue of edges from which we will pop edges and use them to
  // cluster basic blocks in a greedy fashion.
  std::vector<EdgeTy> Queue;

  // Initialize inter-cluster weights.
  if (ComputeEdges)
    ClusterEdges.resize(BF.getLayout().block_size());

  // Initialize clusters and edge queue.
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Create a cluster for this BB.
    uint32_t I = Clusters.size();
    Clusters.emplace_back();
    std::vector<BinaryBasicBlock *> &Cluster = Clusters.back();
    Cluster.push_back(BB);
    BBToClusterMap[BB] = I;
    // Populate priority queue with edges.
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *I : BB->successors()) {
      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
             "attempted reordering blocks of function with no profile data");
      Queue.emplace_back(EdgeTy(BB, I, BI->Count));
      ++BI;
    }
  }
  // Sort and adjust the edge queue.
  initQueue(Queue, BF);

  // Grow clusters in a greedy fashion.
  while (!Queue.empty()) {
    EdgeTy E = Queue.back();
    Queue.pop_back();

    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n");

    // Case 1: SrcBB and DstBB are the same, or DstBB is the entry block.
    // Ignore this edge.
    if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) {
      LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];

    // Case 2: If they are already allocated to the same cluster, just increase
    // the weight of this cluster.
    if (I == J) {
      if (ComputeEdges)
        ClusterEdges[I][I] += E.Count;
      LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
      continue;
    }

    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
    if (areClustersCompatible(ClusterA, ClusterB, E)) {
      // Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
      // allowing us to merge the two clusters.
      for (const BinaryBasicBlock *BB : ClusterB)
        BBToClusterMap[BB] = I;
      ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
      ClusterB.clear();
      if (ComputeEdges) {
        // Increase the intra-cluster edge count of cluster A with the count of
        // this edge as well as with the total count of previously visited
        // edges from cluster B to cluster A.
        ClusterEdges[I][I] += E.Count;
        ClusterEdges[I][I] += ClusterEdges[J][I];
        // Iterate through all inter-cluster edges and transfer edges targeting
        // cluster B to cluster A.
        for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
          ClusterEdges[K][I] += ClusterEdges[K][J];
      }
      // Adjust the weights of the remaining edges and re-sort the queue.
      adjustQueue(Queue, BF);
      LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
    } else {
      // Case 4: SrcBB and DstBB are allocated in positions that prevent us
      // from merging them. Add the count of this edge to the inter-cluster
      // edge count between clusters A and B to help us decide the ordering
      // between these clusters.
      if (ComputeEdges)
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(
          dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
    }
  }
}

void GreedyClusterAlgorithm::reset() {
  ClusterAlgorithm::reset();
  BBToClusterMap.clear();
}

void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
                                         const BinaryFunction &BF) {
  // Define a comparison function to establish a strict weak ordering (SWO)
  // between edges.
  auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) {
    // With equal counts, prioritize branches with lower index
    // source/destination. This helps to keep the original block order for
    // blocks when the optimal order cannot be deduced from a profile.
    if (A.Count == B.Count) {
      const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
      return (SrcOrder != 0)
                 ? SrcOrder > 0
                 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
    }
    return A.Count < B.Count;
  };

  // Sort edges in increasing profile count order. The queue is consumed from
  // the back, so the hottest edges are processed first.
  llvm::sort(Queue, Comp);
}

void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                           const BinaryFunction &BF) {
  // Nothing to do.
}

bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front,
                                                     const ClusterTy &Back,
                                                     const EdgeTy &E) const {
  return Front.back() == E.Src && Back.front() == E.Dst;
}

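// The weight of an edge E = (Src -> Dst) estimates the benefit of making E a
// fall-through: the count of E minus the counts of all competing edges, i.e.
// the other out-edges of Src and the other in-edges of Dst (ignoring
// self-loops and edges into the entry block). A negative weight means the
// competing branches are collectively hotter than E.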
int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
    const EdgeTy &E, const BinaryFunction &BF) const {
  const BinaryBasicBlock *SrcBB = E.Src;
  const BinaryBasicBlock *DstBB = E.Dst;

  // Initial weight value.
  int64_t W = (int64_t)E.Count;

  // Adjust the weight by taking into account other edges with the same source.
  auto BI = SrcBB->branch_info_begin();
  for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    // Ignore edges with the same source and destination, edges that target
    // the entry block, and the edge E itself.
    if (SuccBB != SrcBB && SuccBB != *BF.getLayout().block_begin() &&
        SuccBB != DstBB)
      W -= (int64_t)BI->Count;
    ++BI;
  }

  // Adjust the weight by taking into account other edges with the same
  // destination.
  for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
    // Ignore edges with the same source and destination as well as the edge E
    // itself.
    if (PredBB == DstBB || PredBB == SrcBB)
      continue;
    auto BI = PredBB->branch_info_begin();
    for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
      if (SuccBB == DstBB)
        break;
      ++BI;
    }
    assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    W -= (int64_t)BI->Count;
  }

  return W;
}

void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
                                                const BinaryFunction &BF) {
  // Initialize edge weights.
  for (const EdgeTy &E : Queue)
    Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));

  // Sort edges in increasing weight order.
  adjustQueue(Queue, BF);
}

void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                                  const BinaryFunction &BF) {
  // Define a comparison function to establish a strict weak ordering (SWO)
  // between edges.
  auto Comp = [&](const EdgeTy &A, const EdgeTy &B) {
    // With equal weights, prioritize branches with lower index
    // source/destination. This helps to keep the original block order for
    // blocks when the optimal order cannot be deduced from a profile.
    if (Weight[A] == Weight[B]) {
      const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
      return (SrcOrder != 0)
                 ? SrcOrder > 0
                 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
    }
    return Weight[A] < Weight[B];
  };

  // Iterate through all remaining edges to find edges that have their
  // source and destination in the same cluster.
  std::vector<EdgeTy> NewQueue;
  for (const EdgeTy &E : Queue) {
    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
    // this edge.
    if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) {
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];
    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];

    // Case 2: They are already allocated to the same cluster or to
    // incompatible clusters. Adjust the weights of edges with the same source
    // or destination so that this edge no longer affects them, and ignore this
    // edge. Also increase the intra- (or inter-) cluster edge count.
    if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
      if (!ClusterEdges.empty())
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (src, dst belong to same cluster or incompatible "
                           "clusters)\n");
      for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
        if (SuccBB == DstBB)
          continue;
        auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
        if (PredBB == SrcBB)
          continue;
        auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      continue;
    }

    // Case 3: None of the previous cases is true, so just keep this edge in
    // the queue.
    NewQueue.emplace_back(E);
  }

  // Sort the remaining edges in increasing weight order.
  Queue.swap(NewQueue);
  llvm::sort(Queue, Comp);
}

bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
    const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
  return Front.back() == E.Src && Back.front() == E.Dst;
}

void MinBranchGreedyClusterAlgorithm::reset() {
  GreedyClusterAlgorithm::reset();
  Weight.clear();
}

void TSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                             BasicBlockOrder &Order) const {
  std::vector<std::vector<uint64_t>> Weight;
  std::vector<BinaryBasicBlock *> IndexToBB;

  const size_t N = BF.getLayout().block_size();
  assert(N <= std::numeric_limits<uint64_t>::digits &&
         "cannot use TSP solution for sizes larger than bits in uint64_t");

  // Populate the weight matrix and the index-to-block map.
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    BB->setLayoutIndex(IndexToBB.size());
    IndexToBB.push_back(BB);
  }
  Weight.resize(N);
  for (const BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    auto BI = BB->branch_info_begin();
    Weight[BB->getLayoutIndex()].resize(N);
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
        Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count;
      ++BI;
    }
  }

  std::vector<std::vector<int64_t>> DP;
  DP.resize(static_cast<size_t>(1) << N);
  for (std::vector<int64_t> &Elmt : DP)
    Elmt.resize(N, -1);

  // Start with the entry basic block being allocated with cost zero.
  DP[1][0] = 0;
  // Walk through TSP solutions using a bitmask to represent state (the current
  // set of BBs already placed in the layout).
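  // This is a Held-Karp-style dynamic program maximizing fall-through weight:
  //   DP[Set | (1 << New)][New] =
  //       max(DP[Set | (1 << New)][New], DP[Set][Last] + Weight[Last][New])
  // for every Last already in Set, with DP[Set][Last] == -1 marking an
  // unreachable state. It is exponential in N, hence the assert above
  // limiting N to the bit width of uint64_t.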
  uint64_t BestSet = 1;
  uint64_t BestLast = 0;
  int64_t BestWeight = 0;
  for (uint64_t Set = 1; Set < (1ULL << N); ++Set) {
    // Try each possibility for the last BB visited in this layout.
    for (uint64_t Last = 0; Last < N; ++Last) {
      // Case 1: There is no possible layout with this BB as Last.
      if (DP[Set][Last] == -1)
        continue;

      // Case 2: There is a layout with this Set and this Last, and we try
      // to expand this set with New.
      for (uint64_t New = 1; New < N; ++New) {
        // Case 2a: BB "New" is already in this Set.
        if ((Set & (1ULL << New)) != 0)
          continue;

        // Case 2b: BB "New" is not in this Set, so we add it to the Set and
        // record the total weight of this layout with "New" as the last BB.
        uint64_t NewSet = (Set | (1ULL << New));
        if (DP[NewSet][New] == -1)
          DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
        DP[NewSet][New] = std::max(DP[NewSet][New],
                                   DP[Set][Last] + (int64_t)Weight[Last][New]);

        if (DP[NewSet][New] > BestWeight) {
          BestWeight = DP[NewSet][New];
          BestSet = NewSet;
          BestLast = New;
        }
      }
    }
  }

  // Reconstruct the final function layout by walking backwards from the best
  // (Set, Last) state, i.e. the layout that maximizes weight.
  uint64_t Last = BestLast;
  uint64_t Set = BestSet;
  BitVector Visited;
  Visited.resize(N);
  Visited[Last] = true;
  Order.push_back(IndexToBB[Last]);
  Set = Set & ~(1ULL << Last);
  while (Set != 0) {
    int64_t Best = -1;
    uint64_t NewLast;
    for (uint64_t I = 0; I < N; ++I) {
      if (DP[Set][I] == -1)
        continue;
      int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0;
      if (DP[Set][I] + AdjWeight > Best) {
        NewLast = I;
        Best = DP[Set][I] + AdjWeight;
      }
    }
    Last = NewLast;
    Visited[Last] = true;
    Order.push_back(IndexToBB[Last]);
    Set = Set & ~(1ULL << Last);
  }
  std::reverse(Order.begin(), Order.end());

  // Finalize the layout by appending, in their input order, any BBs that were
  // not assigned a position by the algorithm.
  for (BinaryBasicBlock *BB : BF.getLayout().blocks())
    if (!Visited[BB->getLayoutIndex()])
      Order.push_back(BB);
}

void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                                BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Do not change the layout of functions without valid profile information or
  // with too few blocks to benefit from reordering.
  if (!BF.hasValidProfile() || BF.getLayout().block_size() <= 2) {
    for (BinaryBasicBlock *BB : BF.getLayout().blocks())
      Order.push_back(BB);
    return;
  }

  // Create a separate MCCodeEmitter to allow lock-free execution.
  BinaryContext::IndependentCodeEmitter Emitter;
  if (!opts::NoThreads)
    Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter();

  // Initialize CFG nodes and their data.
  std::vector<uint64_t> BlockSizes;
  std::vector<uint64_t> BlockCounts;
  BasicBlockOrder OrigOrder;
  BF.getLayout().updateLayoutIndices();
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    uint64_t Size = std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1);
    BlockSizes.push_back(Size);
    BlockCounts.push_back(BB->getKnownExecutionCount());
    OrigOrder.push_back(BB);
  }

  // Initialize CFG edges.
  std::vector<codelayout::EdgeCount> JumpCounts;
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    auto BI = BB->branch_info_begin();
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
             "missing profile for a jump");
      JumpCounts.push_back(
          {BB->getLayoutIndex(), SuccBB->getLayoutIndex(), BI->Count});
      ++BI;
    }
  }

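  // computeExtTspLayout maximizes the ext-TSP score, which credits each jump
  // by kind and distance: fall-throughs score highest, while short forward
  // and backward jumps still earn partial credit (unlike plain TSP, which
  // only rewards fall-throughs).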
  // Run the layout algorithm.
  auto Result =
      codelayout::computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
  Order.reserve(BF.getLayout().block_size());
  for (uint64_t R : Result)
    Order.push_back(OrigOrder[R]);
}

void OptimizeReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Arrange basic blocks according to clusters.
  for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}

void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
  std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges =
      CAlgo->ClusterEdges;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order.
  std::vector<uint32_t> ClusterOrder;

  // Do a topological sort for clusters, prioritizing frequently-executed BBs
  // during the traversal.
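  // This is an iterative DFS from the entry cluster: successors are pushed so
  // that the hottest one (by inter-cluster edge count) ends up on top of the
  // stack and is explored first, and clusters are emitted in reverse
  // post-order.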
  std::stack<uint32_t> Stack;
  std::vector<uint32_t> Status;
  std::vector<uint32_t> Parent;
  Status.resize(Clusters.size(), 0);
  Parent.resize(Clusters.size(), 0);
  constexpr uint32_t STACKED = 1;
  constexpr uint32_t VISITED = 2;
  Status[0] = STACKED;
  Stack.push(0);
  while (!Stack.empty()) {
    uint32_t I = Stack.top();
    if (!(Status[I] & VISITED)) {
      Status[I] |= VISITED;
      // Order successors by weight.
      auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
        return ClusterEdges[I][A] > ClusterEdges[I][B];
      };
      std::priority_queue<uint32_t, std::vector<uint32_t>,
                          decltype(ClusterComp)>
          SuccQueue(ClusterComp);
      for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) {
        if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
            !Clusters[Target.first].empty()) {
          Parent[Target.first] = I;
          Status[Target.first] = STACKED;
          SuccQueue.push(Target.first);
        }
      }
      while (!SuccQueue.empty()) {
        Stack.push(SuccQueue.top());
        SuccQueue.pop();
      }
      continue;
    }
    // Already visited this node.
    Stack.pop();
    ClusterOrder.push_back(I);
  }
  std::reverse(ClusterOrder.begin(), ClusterOrder.end());
  // Put unreachable clusters at the end.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!(Status[I] & VISITED) && !Clusters[I].empty())
      ClusterOrder.push_back(I);

  // Sort nodes with equal precedence.
  auto Beg = ClusterOrder.begin();
  // Don't reorder the first cluster, which contains the function entry point.
  ++Beg;
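  // The comparator below keeps a cluster behind any of its ancestors in the
  // traversal tree (tracked via Parent); otherwise, clusters with higher
  // average frequency come first.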
  std::stable_sort(Beg, ClusterOrder.end(),
                   [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
                     uint32_t P = Parent[A];
                     while (Parent[P] != 0) {
                       if (Parent[P] == B)
                         return false;
                       P = Parent[P];
                     }
                     P = Parent[B];
                     while (Parent[P] != 0) {
                       if (Parent[P] == A)
                         return true;
                       P = Parent[P];
                     }
                     return AvgFreq[A] > AvgFreq[B];
                   });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}

void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

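  // --cold-threshold is expressed in tenths of a percent of the entry block's
  // execution count; e.g. (hypothetical numbers) --cold-threshold=5 with an
  // entry count of 100000 yields a cutoff of 5 * 100000 / 1000 = 500.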
  const uint64_t ColdThreshold =
      opts::ColdThreshold *
      (*BF.getLayout().block_begin())->getExecutionCount() / 1000;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order.
  std::vector<uint32_t> ClusterOrder;

  // Order clusters based on average instruction execution frequency.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);
  // Don't reorder the first cluster, which contains the function entry point.
  std::stable_sort(
      std::next(ClusterOrder.begin()), ClusterOrder.end(),
      [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
    // Force zero execution count on clusters that do not meet the cutoff
    // specified by --cold-threshold.
    if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold))
      for (BinaryBasicBlock *BBPtr : Cluster)
        BBPtr->setExecutionCount(0);
  }
}

void ReverseReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                                 BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  BinaryBasicBlock *FirstBB = *BF.getLayout().block_begin();
  Order.push_back(FirstBB);
  for (auto RLI = BF.getLayout().block_rbegin(); *RLI != FirstBB; ++RLI)
    Order.push_back(*RLI);
}

void RandomClusterReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order.
  std::vector<uint32_t> ClusterOrder;

  // Collect all non-empty clusters.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);

  // Randomize the order of all clusters but the first, which contains the
  // function entry point.
  std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
               std::default_random_engine(opts::RandomSeed.getValue()));

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}