//===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements classes used by several basic block reordering
// algorithms.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/CodeLayout.h"
#include <queue>
#include <random>
#include <stack>

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> NoThreads;

static cl::opt<unsigned> ColdThreshold(
    "cold-threshold",
    cl::desc("tenths of percents of main entry frequency to use as a "
             "threshold when evaluating whether a basic block is cold "
             "(0 means it is only considered cold if the block has zero "
             "samples). Default: 0"),
    cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool> PrintClusters("print-clusters", cl::desc("print clusters"),
                                   cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<uint32_t> RandomSeed("bolt-seed", cl::desc("seed for randomization"),
                             cl::init(42), cl::Hidden,
                             cl::cat(BoltOptCategory));

} // namespace opts

namespace {

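// Boost-style hash combiner: 0x9e3779b9 is the 32-bit golden ratio constant
// (floor(2^32 / phi)), and the shifts feed the previous seed's bits back into
// the new value so that hashed pairs stay well distributed even when their
// components collide.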
template <class T> inline void hashCombine(size_t &Seed, const T &Val) {
  std::hash<T> Hasher;
  Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}

template <typename A, typename B> struct HashPair {
  size_t operator()(const std::pair<A, B> &Val) const {
    std::hash<A> Hasher;
    size_t Seed = Hasher(Val.first);
    hashCombine(Seed, Val.second);
    return Seed;
  }
};

} // namespace

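// Compute each cluster's hotness density: the sum of its blocks' execution
// counts divided by the cluster's estimated code size in bytes.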
void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
  // Create a separate MCCodeEmitter to allow lock-free execution
  BinaryContext::IndependentCodeEmitter Emitter;
  if (!opts::NoThreads)
    Emitter = BC.createIndependentMCCodeEmitter();

  AvgFreq.resize(Clusters.size(), 0.0);
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
    double Freq = 0.0;
    uint64_t ClusterSize = 0;
    for (const BinaryBasicBlock *BB : Clusters[I]) {
      if (BB->getNumNonPseudos() > 0) {
        Freq += BB->getExecutionCount();
        // Estimate the size of a block in bytes at run time
        // NOTE: This might be inaccurate
        ClusterSize += BB->estimateSize(Emitter.MCE.get());
      }
    }
    AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
  }
}

void ClusterAlgorithm::printClusters() const {
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
    errs() << "Cluster number " << I;
    if (AvgFreq.size() == Clusters.size())
      errs() << " (frequency: " << AvgFreq[I] << ")";
    errs() << " : ";
    const char *Sep = "";
    for (const BinaryBasicBlock *BB : Clusters[I]) {
      errs() << Sep << BB->getName();
      Sep = ", ";
    }
    errs() << "\n";
  }
}

void ClusterAlgorithm::reset() {
  Clusters.clear();
  ClusterEdges.clear();
  AvgFreq.clear();
}

void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
  OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
}

size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
  HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
  return Hasher(std::make_pair(E.Src, E.Dst));
}

bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A,
                                                   const EdgeTy &B) const {
  return A.Src == B.Src && A.Dst == B.Dst;
}

void GreedyClusterAlgorithm::clusterBasicBlocks(BinaryFunction &BF,
                                                bool ComputeEdges) {
  reset();

  // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
  // maximize weight during a path traversing all BBs. In this way, we will
  // convert the hottest branches into fall-throughs.
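  // For example, given edges A->B with count 100 and A->C with count 10, the
  // hotter edge A->B is popped first and B is appended to A's cluster, so the
  // hot branch becomes a fall-through while A->C stays a taken branch.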

  // This is the queue of edges from which we will pop edges and use them to
  // cluster basic blocks in a greedy fashion.
  std::vector<EdgeTy> Queue;

  // Initialize inter-cluster weights.
  if (ComputeEdges)
    ClusterEdges.resize(BF.getLayout().block_size());

  // Initialize clusters and edge queue.
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Create a cluster for this BB.
    uint32_t I = Clusters.size();
    Clusters.emplace_back();
    std::vector<BinaryBasicBlock *> &Cluster = Clusters.back();
    Cluster.push_back(BB);
    BBToClusterMap[BB] = I;
    // Populate priority queue with edges.
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *SuccBB : BB->successors()) {
      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
             "attempted reordering blocks of function with no profile data");
      Queue.emplace_back(EdgeTy(BB, SuccBB, BI->Count));
      ++BI;
    }
  }
  // Sort and adjust the edge queue.
  initQueue(Queue, BF);

  // Grow clusters in a greedy fashion.
  while (!Queue.empty()) {
    EdgeTy E = Queue.back();
    Queue.pop_back();

    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n");

    // Case 1: SrcBB and DstBB are the same, or DstBB is the entry block.
    // Ignore this edge.
    if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) {
      LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];

    // Case 2: If they are already allocated at the same cluster, just increase
    // the weight of this cluster.
    if (I == J) {
      if (ComputeEdges)
        ClusterEdges[I][I] += E.Count;
      LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
      continue;
    }

    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
    if (areClustersCompatible(ClusterA, ClusterB, E)) {
      // Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
      // allowing us to merge the two clusters.
      for (const BinaryBasicBlock *BB : ClusterB)
        BBToClusterMap[BB] = I;
      ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
      ClusterB.clear();
      if (ComputeEdges) {
        // Increase the intra-cluster edge count of cluster A with the count of
        // this edge as well as with the total count of previously visited
        // edges from cluster B to cluster A.
        ClusterEdges[I][I] += E.Count;
        ClusterEdges[I][I] += ClusterEdges[J][I];
        // Iterate through all inter-cluster edges and transfer edges targeting
        // cluster B to cluster A.
        for (uint32_t K = 0, Sz = ClusterEdges.size(); K != Sz; ++K)
          ClusterEdges[K][I] += ClusterEdges[K][J];
      }
      // Adjust the weights of the remaining edges and re-sort the queue.
      adjustQueue(Queue, BF);
      LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
    } else {
      // Case 4: Both SrcBB and DstBB are allocated in positions where we
      // cannot merge them. Add the count of this edge to the inter-cluster
      // edge count between clusters A and B to help us decide the ordering
      // between these clusters.
      if (ComputeEdges)
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(
          dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
    }
  }
}

void GreedyClusterAlgorithm::reset() {
  ClusterAlgorithm::reset();
  BBToClusterMap.clear();
}

void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
                                         const BinaryFunction &BF) {
  // Define a comparison function to establish SWO between edges.
  auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) {
    // With equal weights, prioritize branches with lower index
    // source/destination. This helps to keep original block order for blocks
    // when optimal order cannot be deduced from a profile.
    if (A.Count == B.Count) {
      const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
      return (SrcOrder != 0)
                 ? SrcOrder > 0
                 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
    }
    return A.Count < B.Count;
  };

  // Sort edges in increasing profile count order.
  llvm::sort(Queue, Comp);
}

void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                           const BinaryFunction &BF) {
  // Nothing to do.
}

bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front,
                                                     const ClusterTy &Back,
                                                     const EdgeTy &E) const {
  return Front.back() == E.Src && Back.front() == E.Dst;
}

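// The weight of an edge E is its own profile count minus the counts of every
// competing edge that shares E's source or destination. Making E a
// fall-through saves Count(E) taken branches, but each competing edge that
// consequently stays a taken branch counts against that saving, so edges with
// a positive net balance are considered first.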
int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
    const EdgeTy &E, const BinaryFunction &BF) const {
  const BinaryBasicBlock *SrcBB = E.Src;
  const BinaryBasicBlock *DstBB = E.Dst;

  // Initial weight value.
  int64_t W = (int64_t)E.Count;

  // Adjust the weight by taking into account other edges with the same source.
  auto BI = SrcBB->branch_info_begin();
  for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    // Ignore edges with same source and destination, edges that target the
    // entry block as well as the edge E itself.
    if (SuccBB != SrcBB && SuccBB != *BF.getLayout().block_begin() &&
        SuccBB != DstBB)
      W -= (int64_t)BI->Count;
    ++BI;
  }

  // Adjust the weight by taking into account other edges with the same
  // destination.
  for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
    // Ignore edges with same source and destination as well as the edge E
    // itself.
    if (PredBB == DstBB || PredBB == SrcBB)
      continue;
    auto BI = PredBB->branch_info_begin();
    for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
      if (SuccBB == DstBB)
        break;
      ++BI;
    }
    assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    W -= (int64_t)BI->Count;
  }

  return W;
}

void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
                                                const BinaryFunction &BF) {
  // Initialize edge weights.
  for (const EdgeTy &E : Queue)
    Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));

  // Sort edges in increasing weight order.
  adjustQueue(Queue, BF);
}

void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                                  const BinaryFunction &BF) {
  // Define a comparison function to establish SWO between edges.
  auto Comp = [&](const EdgeTy &A, const EdgeTy &B) {
    // With equal weights, prioritize branches with lower index
    // source/destination. This helps to keep original block order for blocks
    // when optimal order cannot be deduced from a profile.
    if (Weight[A] == Weight[B]) {
      const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
      return (SrcOrder != 0)
                 ? SrcOrder > 0
                 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
    }
    return Weight[A] < Weight[B];
  };

  // Iterate through all remaining edges to find edges that have their
  // source and destination in the same cluster.
  std::vector<EdgeTy> NewQueue;
  for (const EdgeTy &E : Queue) {
    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
    // this edge.
    if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) {
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];
    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];

    // Case 2: They are already allocated at the same cluster or incompatible
    // clusters. Adjust the weights of edges with the same source or
    // destination, so that this edge has no effect on them anymore, and
    // ignore this edge. Also increase the intra- (or inter-) cluster edge
    // count.
    if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
      if (!ClusterEdges.empty())
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (src, dst belong to same cluster or incompatible "
                           "clusters)\n");
      for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
        if (SuccBB == DstBB)
          continue;
        auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
        if (PredBB == SrcBB)
          continue;
        auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      continue;
    }

    // Case 3: None of the previous cases is true, so just keep this edge in
    // the queue.
    NewQueue.emplace_back(E);
  }

  // Sort remaining edges in increasing weight order.
  Queue.swap(NewQueue);
  llvm::sort(Queue, Comp);
}

bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
    const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
  return Front.back() == E.Src && Back.front() == E.Dst;
}

void MinBranchGreedyClusterAlgorithm::reset() {
  GreedyClusterAlgorithm::reset();
  Weight.clear();
}

void TSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                             BasicBlockOrder &Order) const {
  std::vector<std::vector<uint64_t>> Weight;
  std::vector<BinaryBasicBlock *> IndexToBB;

  const size_t N = BF.getLayout().block_size();
  assert(N <= std::numeric_limits<uint64_t>::digits &&
         "cannot use TSP solution for sizes larger than bits in uint64_t");

  // Populate the weight matrix and the index-to-block map.
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    BB->setLayoutIndex(IndexToBB.size());
    IndexToBB.push_back(BB);
  }
  Weight.resize(N);
  for (const BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    auto BI = BB->branch_info_begin();
    Weight[BB->getLayoutIndex()].resize(N);
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
        Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count;
      ++BI;
    }
  }

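  // Held-Karp style dynamic program: DP[Set][Last] is the maximum total edge
  // weight over paths that visit exactly the blocks in bitmask Set and end at
  // block Last. The recurrence is
  //   DP[Set | (1 << New)][New] = max over Last in Set of
  //                               DP[Set][Last] + Weight[Last][New],
  // giving O(2^N * N^2) time and O(2^N * N) space, which is why this solver
  // is only practical for functions with very few blocks.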
  std::vector<std::vector<int64_t>> DP;
  DP.resize(static_cast<size_t>(1) << N);
  for (std::vector<int64_t> &Elmt : DP)
    Elmt.resize(N, -1);

  // Start with the entry basic block being allocated with cost zero
  DP[1][0] = 0;
  // Walk through TSP solutions using a bitmask to represent state (current set
  // of BBs in the layout)
  uint64_t BestSet = 1;
  uint64_t BestLast = 0;
  int64_t BestWeight = 0;
  for (uint64_t Set = 1; Set < (1ULL << N); ++Set) {
    // Traverse each possibility of Last BB visited in this layout
    for (uint64_t Last = 0; Last < N; ++Last) {
      // Case 1: There is no possible layout with this BB as Last
      if (DP[Set][Last] == -1)
        continue;

      // Case 2: There is a layout with this Set and this Last, and we try
      // to expand this set with New
      for (uint64_t New = 1; New < N; ++New) {
        // Case 2a: BB "New" is already in this Set
        if ((Set & (1ULL << New)) != 0)
          continue;

        // Case 2b: BB "New" is not in this Set, so add it to the Set and
        // record the total weight of this layout with "New" as the last BB.
        uint64_t NewSet = (Set | (1ULL << New));
        DP[NewSet][New] = std::max(DP[NewSet][New],
                                   DP[Set][Last] + (int64_t)Weight[Last][New]);

        if (DP[NewSet][New] > BestWeight) {
          BestWeight = DP[NewSet][New];
          BestSet = NewSet;
          BestLast = New;
        }
      }
    }
  }

  // Define the final function layout based on the path that maximizes weight,
  // reconstructed by walking the DP table backwards from the best end state.
  uint64_t Last = BestLast;
  uint64_t Set = BestSet;
  BitVector Visited;
  Visited.resize(N);
  Visited[Last] = true;
  Order.push_back(IndexToBB[Last]);
  Set = Set & ~(1ULL << Last);
  while (Set != 0) {
    int64_t Best = -1;
    uint64_t NewLast = 0;
    for (uint64_t I = 0; I < N; ++I) {
      if (DP[Set][I] == -1)
        continue;
      int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0;
      if (DP[Set][I] + AdjWeight > Best) {
        NewLast = I;
        Best = DP[Set][I] + AdjWeight;
      }
    }
    Last = NewLast;
    Visited[Last] = true;
    Order.push_back(IndexToBB[Last]);
    Set = Set & ~(1ULL << Last);
  }
  std::reverse(Order.begin(), Order.end());

  // Finalize the layout with BBs that weren't assigned to the layout, using
  // the input layout order.
  for (BinaryBasicBlock *BB : BF.getLayout().blocks())
    if (!Visited[BB->getLayoutIndex()])
      Order.push_back(BB);
}

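// ExtTSP extends the plain TSP objective: a fall-through jump earns full
// credit for its execution count, while short forward and backward jumps earn
// partial, distance-dependent credit. See Newell and Pupyrev, "Improved Basic
// Block Reordering" (2020), which describes the model behind
// computeExtTspLayout.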
void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                                BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Do not change the layout of functions without profile information or with
  // too few blocks to benefit from reordering.
  if (!BF.hasValidProfile() || BF.getLayout().block_size() <= 2) {
    for (BinaryBasicBlock *BB : BF.getLayout().blocks())
      Order.push_back(BB);
    return;
  }

  // Create a separate MCCodeEmitter to allow lock-free execution
  BinaryContext::IndependentCodeEmitter Emitter;
  if (!opts::NoThreads)
    Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter();

  // Initialize CFG nodes and their data
  std::vector<uint64_t> BlockSizes;
  std::vector<uint64_t> BlockCounts;
  BasicBlockOrder OrigOrder;
  BF.getLayout().updateLayoutIndices();
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    uint64_t Size = std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1);
    BlockSizes.push_back(Size);
    BlockCounts.push_back(BB->getKnownExecutionCount());
    OrigOrder.push_back(BB);
  }

  // Initialize CFG edges
  std::vector<codelayout::EdgeCount> JumpCounts;
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    auto BI = BB->branch_info_begin();
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
             "missing profile for a jump");
      JumpCounts.push_back(
          {BB->getLayoutIndex(), SuccBB->getLayoutIndex(), BI->Count});
      ++BI;
    }
  }

  // Run the layout algorithm
  auto Result =
      codelayout::computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
  Order.reserve(BF.getLayout().block_size());
  for (uint64_t R : Result)
    Order.push_back(OrigOrder[R]);
}

void OptimizeReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Arrange basic blocks according to clusters.
  for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}

void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
  std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges =
      CAlgo->ClusterEdges;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order
  std::vector<uint32_t> ClusterOrder;

  // Do a topological sort for clusters, prioritizing frequently-executed BBs
  // during the traversal.
  std::stack<uint32_t> Stack;
  std::vector<uint32_t> Status;
  std::vector<uint32_t> Parent;
  Status.resize(Clusters.size(), 0);
  Parent.resize(Clusters.size(), 0);
  constexpr uint32_t STACKED = 1;
  constexpr uint32_t VISITED = 2;
  Status[0] = STACKED;
  Stack.push(0);
  while (!Stack.empty()) {
    uint32_t I = Stack.top();
    if (!(Status[I] & VISITED)) {
      Status[I] |= VISITED;
      // Order successors by weight
      auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
        return ClusterEdges[I][A] > ClusterEdges[I][B];
      };
      std::priority_queue<uint32_t, std::vector<uint32_t>,
                          decltype(ClusterComp)>
          SuccQueue(ClusterComp);
      for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) {
        if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
            !Clusters[Target.first].empty()) {
          Parent[Target.first] = I;
          Status[Target.first] = STACKED;
          SuccQueue.push(Target.first);
        }
      }
      while (!SuccQueue.empty()) {
        Stack.push(SuccQueue.top());
        SuccQueue.pop();
      }
      continue;
    }
    // Already visited this node
    Stack.pop();
    ClusterOrder.push_back(I);
  }
  std::reverse(ClusterOrder.begin(), ClusterOrder.end());
  // Put unreachable clusters at the end
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!(Status[I] & VISITED) && !Clusters[I].empty())
      ClusterOrder.push_back(I);

  // Sort clusters by decreasing average frequency, but never place a cluster
  // before one of its ancestors in the traversal tree.
  auto Beg = ClusterOrder.begin();
  // Don't reorder the first cluster, which contains the function entry point
  ++Beg;
  std::stable_sort(Beg, ClusterOrder.end(),
                   [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
                     uint32_t P = Parent[A];
                     while (Parent[P] != 0) {
                       if (Parent[P] == B)
                         return false;
                       P = Parent[P];
                     }
                     P = Parent[B];
                     while (Parent[P] != 0) {
                       if (Parent[P] == A)
                         return true;
                       P = Parent[P];
                     }
                     return AvgFreq[A] > AvgFreq[B];
                   });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}

void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

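  // opts::ColdThreshold is expressed in tenths of a percent of the entry
  // block's execution count, hence the division by 1000.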
  const uint64_t ColdThreshold =
      opts::ColdThreshold *
      (*BF.getLayout().block_begin())->getExecutionCount() / 1000;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order
  std::vector<uint32_t> ClusterOrder;

  // Order clusters based on average instruction execution frequency
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);
  // Don't reorder the first cluster, which contains the function entry point
  std::stable_sort(
      std::next(ClusterOrder.begin()), ClusterOrder.end(),
      [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
    // Force zero execution count on clusters that do not meet the cut-off
    // specified by --cold-threshold.
    if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold))
      for (BinaryBasicBlock *BBPtr : Cluster)
        BBPtr->setExecutionCount(0);
  }
}

void ReverseReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                                 BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  BinaryBasicBlock *FirstBB = *BF.getLayout().block_begin();
  Order.push_back(FirstBB);
  for (auto RLI = BF.getLayout().block_rbegin(); *RLI != FirstBB; ++RLI)
    Order.push_back(*RLI);
}

void RandomClusterReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order
  std::vector<uint32_t> ClusterOrder;

  // Collect non-empty clusters, keeping the entry cluster first.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);

  // Randomly shuffle every cluster except the first one, which contains the
  // function entry point.
  std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
               std::default_random_engine(opts::RandomSeed.getValue()));

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}