| 1 | //===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements classes used by several basic block reordering |
| 10 | // algorithms. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "bolt/Passes/ReorderAlgorithm.h" |
| 15 | #include "bolt/Core/BinaryBasicBlock.h" |
| 16 | #include "bolt/Core/BinaryFunction.h" |
| 17 | #include "llvm/Support/CommandLine.h" |
| 18 | #include "llvm/Transforms/Utils/CodeLayout.h" |
| 19 | #include <queue> |
| 20 | #include <random> |
| 21 | #include <stack> |
| 22 | |
| 23 | #undef DEBUG_TYPE |
| 24 | #define DEBUG_TYPE "bolt" |
| 25 | |
| 26 | using namespace llvm; |
| 27 | using namespace bolt; |
| 28 | |
| 29 | namespace opts { |
| 30 | |
| 31 | extern cl::OptionCategory BoltOptCategory; |
| 32 | extern cl::opt<bool> NoThreads; |
| 33 | |
| 34 | static cl::opt<unsigned> ColdThreshold( |
| 35 | "cold-threshold" , |
| 36 | cl::desc("tenths of percents of main entry frequency to use as a " |
| 37 | "threshold when evaluating whether a basic block is cold " |
| 38 | "(0 means it is only considered cold if the block has zero " |
| 39 | "samples). Default: 0 " ), |
| 40 | cl::init(Val: 0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); |
| 41 | |
| 42 | static cl::opt<bool> PrintClusters("print-clusters" , cl::desc("print clusters" ), |
| 43 | cl::Hidden, cl::cat(BoltOptCategory)); |
| 44 | |
| 45 | cl::opt<uint32_t> RandomSeed("bolt-seed" , cl::desc("seed for randomization" ), |
| 46 | cl::init(Val: 42), cl::Hidden, |
| 47 | cl::cat(BoltOptCategory)); |
| 48 | |
| 49 | } // namespace opts |
| 50 | |
namespace {

/// Mix \p Val into the running hash state \p Seed (boost-style combiner).
template <class T> inline void hashCombine(size_t &Seed, const T &Val) {
  Seed ^= std::hash<T>()(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}

/// Hash functor for std::pair that combines the hashes of both members.
template <typename A, typename B> struct HashPair {
  size_t operator()(const std::pair<A, B> &Val) const {
    size_t Seed = std::hash<A>()(Val.first);
    hashCombine(Seed, Val.second);
    return Seed;
  }
};

} // namespace
| 68 | |
| 69 | void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) { |
| 70 | // Create a separate MCCodeEmitter to allow lock-free execution |
| 71 | BinaryContext::IndependentCodeEmitter Emitter; |
| 72 | if (!opts::NoThreads) |
| 73 | Emitter = BC.createIndependentMCCodeEmitter(); |
| 74 | |
| 75 | AvgFreq.resize(new_size: Clusters.size(), x: 0.0); |
| 76 | for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { |
| 77 | double Freq = 0.0; |
| 78 | uint64_t ClusterSize = 0; |
| 79 | for (const BinaryBasicBlock *BB : Clusters[I]) { |
| 80 | if (BB->getNumNonPseudos() > 0) { |
| 81 | Freq += BB->getExecutionCount(); |
| 82 | // Estimate the size of a block in bytes at run time |
| 83 | // NOTE: This might be inaccurate |
| 84 | ClusterSize += BB->estimateSize(Emitter: Emitter.MCE.get()); |
| 85 | } |
| 86 | } |
| 87 | AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize; |
| 88 | } |
| 89 | } |
| 90 | |
| 91 | void ClusterAlgorithm::printClusters() const { |
| 92 | for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { |
| 93 | errs() << "Cluster number " << I; |
| 94 | if (AvgFreq.size() == Clusters.size()) |
| 95 | errs() << " (frequency: " << AvgFreq[I] << ")" ; |
| 96 | errs() << " : " ; |
| 97 | const char *Sep = "" ; |
| 98 | for (const BinaryBasicBlock *BB : Clusters[I]) { |
| 99 | errs() << Sep << BB->getName(); |
| 100 | Sep = ", " ; |
| 101 | } |
| 102 | errs() << "\n" ; |
| 103 | } |
| 104 | } |
| 105 | |
| 106 | void ClusterAlgorithm::reset() { |
| 107 | Clusters.clear(); |
| 108 | ClusterEdges.clear(); |
| 109 | AvgFreq.clear(); |
| 110 | } |
| 111 | |
| 112 | void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const { |
| 113 | OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count; |
| 114 | } |
| 115 | |
| 116 | size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const { |
| 117 | HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher; |
| 118 | return Hasher(std::make_pair(x: E.Src, y: E.Dst)); |
| 119 | } |
| 120 | |
| 121 | bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A, |
| 122 | const EdgeTy &B) const { |
| 123 | return A.Src == B.Src && A.Dst == B.Dst; |
| 124 | } |
| 125 | |
| 126 | void GreedyClusterAlgorithm::clusterBasicBlocks(BinaryFunction &BF, |
| 127 | bool ComputeEdges) { |
| 128 | reset(); |
| 129 | |
| 130 | // Greedy heuristic implementation for the TSP, applied to BB layout. Try to |
| 131 | // maximize weight during a path traversing all BBs. In this way, we will |
| 132 | // convert the hottest branches into fall-throughs. |
| 133 | |
| 134 | // This is the queue of edges from which we will pop edges and use them to |
| 135 | // cluster basic blocks in a greedy fashion. |
| 136 | std::vector<EdgeTy> Queue; |
| 137 | |
| 138 | // Initialize inter-cluster weights. |
| 139 | if (ComputeEdges) |
| 140 | ClusterEdges.resize(new_size: BF.getLayout().block_size()); |
| 141 | |
| 142 | // Initialize clusters and edge queue. |
| 143 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| 144 | // Create a cluster for this BB. |
| 145 | uint32_t I = Clusters.size(); |
| 146 | Clusters.emplace_back(); |
| 147 | std::vector<BinaryBasicBlock *> &Cluster = Clusters.back(); |
| 148 | Cluster.push_back(x: BB); |
| 149 | BBToClusterMap[BB] = I; |
| 150 | // Populate priority queue with edges. |
| 151 | auto BI = BB->branch_info_begin(); |
| 152 | for (const BinaryBasicBlock *I : BB->successors()) { |
| 153 | assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && |
| 154 | "attempted reordering blocks of function with no profile data" ); |
| 155 | Queue.emplace_back(args: EdgeTy(BB, I, BI->Count)); |
| 156 | ++BI; |
| 157 | } |
| 158 | } |
| 159 | // Sort and adjust the edge queue. |
| 160 | initQueue(Queue, BF); |
| 161 | |
| 162 | // Grow clusters in a greedy fashion. |
| 163 | while (!Queue.empty()) { |
| 164 | EdgeTy E = Queue.back(); |
| 165 | Queue.pop_back(); |
| 166 | |
| 167 | const BinaryBasicBlock *SrcBB = E.Src; |
| 168 | const BinaryBasicBlock *DstBB = E.Dst; |
| 169 | |
| 170 | LLVM_DEBUG(dbgs() << "Popped edge " ; E.print(dbgs()); dbgs() << "\n" ); |
| 171 | |
| 172 | // Case 1: BBSrc and BBDst are the same. Ignore this edge |
| 173 | if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) { |
| 174 | LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n" ); |
| 175 | continue; |
| 176 | } |
| 177 | |
| 178 | int I = BBToClusterMap[SrcBB]; |
| 179 | int J = BBToClusterMap[DstBB]; |
| 180 | |
| 181 | // Case 2: If they are already allocated at the same cluster, just increase |
| 182 | // the weight of this cluster |
| 183 | if (I == J) { |
| 184 | if (ComputeEdges) |
| 185 | ClusterEdges[I][I] += E.Count; |
| 186 | LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n" ); |
| 187 | continue; |
| 188 | } |
| 189 | |
| 190 | std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; |
| 191 | std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; |
| 192 | if (areClustersCompatible(Front: ClusterA, Back: ClusterB, E)) { |
| 193 | // Case 3: SrcBB is at the end of a cluster and DstBB is at the start, |
| 194 | // allowing us to merge two clusters. |
| 195 | for (const BinaryBasicBlock *BB : ClusterB) |
| 196 | BBToClusterMap[BB] = I; |
| 197 | ClusterA.insert(position: ClusterA.end(), first: ClusterB.begin(), last: ClusterB.end()); |
| 198 | ClusterB.clear(); |
| 199 | if (ComputeEdges) { |
| 200 | // Increase the intra-cluster edge count of cluster A with the count of |
| 201 | // this edge as well as with the total count of previously visited edges |
| 202 | // from cluster B cluster A. |
| 203 | ClusterEdges[I][I] += E.Count; |
| 204 | ClusterEdges[I][I] += ClusterEdges[J][I]; |
| 205 | // Iterate through all inter-cluster edges and transfer edges targeting |
| 206 | // cluster B to cluster A. |
| 207 | for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) |
| 208 | ClusterEdges[K][I] += ClusterEdges[K][J]; |
| 209 | } |
| 210 | // Adjust the weights of the remaining edges and re-sort the queue. |
| 211 | adjustQueue(Queue, BF); |
| 212 | LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n" ); |
| 213 | } else { |
| 214 | // Case 4: Both SrcBB and DstBB are allocated in positions we cannot |
| 215 | // merge them. Add the count of this edge to the inter-cluster edge count |
| 216 | // between clusters A and B to help us decide ordering between these |
| 217 | // clusters. |
| 218 | if (ComputeEdges) |
| 219 | ClusterEdges[I][J] += E.Count; |
| 220 | LLVM_DEBUG( |
| 221 | dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n" ); |
| 222 | } |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | void GreedyClusterAlgorithm::reset() { |
| 227 | ClusterAlgorithm::reset(); |
| 228 | BBToClusterMap.clear(); |
| 229 | } |
| 230 | |
| 231 | void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, |
| 232 | const BinaryFunction &BF) { |
| 233 | // Define a comparison function to establish SWO between edges. |
| 234 | auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) { |
| 235 | // With equal weights, prioritize branches with lower index |
| 236 | // source/destination. This helps to keep original block order for blocks |
| 237 | // when optimal order cannot be deducted from a profile. |
| 238 | if (A.Count == B.Count) { |
| 239 | const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A: A.Src, B: B.Src); |
| 240 | return (SrcOrder != 0) |
| 241 | ? SrcOrder > 0 |
| 242 | : BF.getOriginalLayoutRelativeOrder(A: A.Dst, B: B.Dst) > 0; |
| 243 | } |
| 244 | return A.Count < B.Count; |
| 245 | }; |
| 246 | |
| 247 | // Sort edges in increasing profile count order. |
| 248 | llvm::sort(C&: Queue, Comp); |
| 249 | } |
| 250 | |
/// No-op: the PH greedy variant orders edges by their static profile counts
/// (see initQueue), which never change after a cluster merge, so the queue
/// needs no re-weighting or re-sorting.
void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                           const BinaryFunction &BF) {
  // Nothing to do.
}
| 255 | |
| 256 | bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front, |
| 257 | const ClusterTy &Back, |
| 258 | const EdgeTy &E) const { |
| 259 | return Front.back() == E.Src && Back.front() == E.Dst; |
| 260 | } |
| 261 | |
| 262 | int64_t MinBranchGreedyClusterAlgorithm::calculateWeight( |
| 263 | const EdgeTy &E, const BinaryFunction &BF) const { |
| 264 | const BinaryBasicBlock *SrcBB = E.Src; |
| 265 | const BinaryBasicBlock *DstBB = E.Dst; |
| 266 | |
| 267 | // Initial weight value. |
| 268 | int64_t W = (int64_t)E.Count; |
| 269 | |
| 270 | // Adjust the weight by taking into account other edges with the same source. |
| 271 | auto BI = SrcBB->branch_info_begin(); |
| 272 | for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { |
| 273 | assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && |
| 274 | "attempted reordering blocks of function with no profile data" ); |
| 275 | assert(BI->Count <= std::numeric_limits<int64_t>::max() && |
| 276 | "overflow detected" ); |
| 277 | // Ignore edges with same source and destination, edges that target the |
| 278 | // entry block as well as the edge E itself. |
| 279 | if (SuccBB != SrcBB && SuccBB != *BF.getLayout().block_begin() && |
| 280 | SuccBB != DstBB) |
| 281 | W -= (int64_t)BI->Count; |
| 282 | ++BI; |
| 283 | } |
| 284 | |
| 285 | // Adjust the weight by taking into account other edges with the same |
| 286 | // destination. |
| 287 | for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { |
| 288 | // Ignore edges with same source and destination as well as the edge E |
| 289 | // itself. |
| 290 | if (PredBB == DstBB || PredBB == SrcBB) |
| 291 | continue; |
| 292 | auto BI = PredBB->branch_info_begin(); |
| 293 | for (const BinaryBasicBlock *SuccBB : PredBB->successors()) { |
| 294 | if (SuccBB == DstBB) |
| 295 | break; |
| 296 | ++BI; |
| 297 | } |
| 298 | assert(BI != PredBB->branch_info_end() && "invalid control flow graph" ); |
| 299 | assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && |
| 300 | "attempted reordering blocks of function with no profile data" ); |
| 301 | assert(BI->Count <= std::numeric_limits<int64_t>::max() && |
| 302 | "overflow detected" ); |
| 303 | W -= (int64_t)BI->Count; |
| 304 | } |
| 305 | |
| 306 | return W; |
| 307 | } |
| 308 | |
| 309 | void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, |
| 310 | const BinaryFunction &BF) { |
| 311 | // Initialize edge weights. |
| 312 | for (const EdgeTy &E : Queue) |
| 313 | Weight.emplace(args: std::make_pair(x: E, y: calculateWeight(E, BF))); |
| 314 | |
| 315 | // Sort edges in increasing weight order. |
| 316 | adjustQueue(Queue, BF); |
| 317 | } |
| 318 | |
| 319 | void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue, |
| 320 | const BinaryFunction &BF) { |
| 321 | // Define a comparison function to establish SWO between edges. |
| 322 | auto Comp = [&](const EdgeTy &A, const EdgeTy &B) { |
| 323 | // With equal weights, prioritize branches with lower index |
| 324 | // source/destination. This helps to keep original block order for blocks |
| 325 | // when optimal order cannot be deduced from a profile. |
| 326 | if (Weight[A] == Weight[B]) { |
| 327 | const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A: A.Src, B: B.Src); |
| 328 | return (SrcOrder != 0) |
| 329 | ? SrcOrder > 0 |
| 330 | : BF.getOriginalLayoutRelativeOrder(A: A.Dst, B: B.Dst) > 0; |
| 331 | } |
| 332 | return Weight[A] < Weight[B]; |
| 333 | }; |
| 334 | |
| 335 | // Iterate through all remaining edges to find edges that have their |
| 336 | // source and destination in the same cluster. |
| 337 | std::vector<EdgeTy> NewQueue; |
| 338 | for (const EdgeTy &E : Queue) { |
| 339 | const BinaryBasicBlock *SrcBB = E.Src; |
| 340 | const BinaryBasicBlock *DstBB = E.Dst; |
| 341 | |
| 342 | // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore |
| 343 | // this edge. |
| 344 | if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) { |
| 345 | LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge " ; E.print(dbgs()); |
| 346 | dbgs() << " (same src, dst)\n" ); |
| 347 | continue; |
| 348 | } |
| 349 | |
| 350 | int I = BBToClusterMap[SrcBB]; |
| 351 | int J = BBToClusterMap[DstBB]; |
| 352 | std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; |
| 353 | std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; |
| 354 | |
| 355 | // Case 2: They are already allocated at the same cluster or incompatible |
| 356 | // clusters. Adjust the weights of edges with the same source or |
| 357 | // destination, so that this edge has no effect on them any more, and ignore |
| 358 | // this edge. Also increase the intra- (or inter-) cluster edge count. |
| 359 | if (I == J || !areClustersCompatible(Front: ClusterA, Back: ClusterB, E)) { |
| 360 | if (!ClusterEdges.empty()) |
| 361 | ClusterEdges[I][J] += E.Count; |
| 362 | LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge " ; E.print(dbgs()); |
| 363 | dbgs() << " (src, dst belong to same cluster or incompatible " |
| 364 | "clusters)\n" ); |
| 365 | for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { |
| 366 | if (SuccBB == DstBB) |
| 367 | continue; |
| 368 | auto WI = Weight.find(x: EdgeTy(SrcBB, SuccBB, 0)); |
| 369 | assert(WI != Weight.end() && "CFG edge not found in Weight map" ); |
| 370 | WI->second += (int64_t)E.Count; |
| 371 | } |
| 372 | for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { |
| 373 | if (PredBB == SrcBB) |
| 374 | continue; |
| 375 | auto WI = Weight.find(x: EdgeTy(PredBB, DstBB, 0)); |
| 376 | assert(WI != Weight.end() && "CFG edge not found in Weight map" ); |
| 377 | WI->second += (int64_t)E.Count; |
| 378 | } |
| 379 | continue; |
| 380 | } |
| 381 | |
| 382 | // Case 3: None of the previous cases is true, so just keep this edge in |
| 383 | // the queue. |
| 384 | NewQueue.emplace_back(args: E); |
| 385 | } |
| 386 | |
| 387 | // Sort remaining edges in increasing weight order. |
| 388 | Queue.swap(x&: NewQueue); |
| 389 | llvm::sort(C&: Queue, Comp); |
| 390 | } |
| 391 | |
| 392 | bool MinBranchGreedyClusterAlgorithm::areClustersCompatible( |
| 393 | const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { |
| 394 | return Front.back() == E.Src && Back.front() == E.Dst; |
| 395 | } |
| 396 | |
| 397 | void MinBranchGreedyClusterAlgorithm::reset() { |
| 398 | GreedyClusterAlgorithm::reset(); |
| 399 | Weight.clear(); |
| 400 | } |
| 401 | |
| 402 | void TSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF, |
| 403 | BasicBlockOrder &Order) const { |
| 404 | std::vector<std::vector<uint64_t>> Weight; |
| 405 | std::vector<BinaryBasicBlock *> IndexToBB; |
| 406 | |
| 407 | const size_t N = BF.getLayout().block_size(); |
| 408 | assert(N <= std::numeric_limits<uint64_t>::digits && |
| 409 | "cannot use TSP solution for sizes larger than bits in uint64_t" ); |
| 410 | |
| 411 | // Populating weight map and index map |
| 412 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| 413 | BB->setLayoutIndex(IndexToBB.size()); |
| 414 | IndexToBB.push_back(x: BB); |
| 415 | } |
| 416 | Weight.resize(new_size: N); |
| 417 | for (const BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| 418 | auto BI = BB->branch_info_begin(); |
| 419 | Weight[BB->getLayoutIndex()].resize(new_size: N); |
| 420 | for (BinaryBasicBlock *SuccBB : BB->successors()) { |
| 421 | if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) |
| 422 | Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count; |
| 423 | ++BI; |
| 424 | } |
| 425 | } |
| 426 | |
| 427 | std::vector<std::vector<int64_t>> DP; |
| 428 | DP.resize(new_size: static_cast<size_t>(1) << N); |
| 429 | for (std::vector<int64_t> &Elmt : DP) |
| 430 | Elmt.resize(new_size: N, x: -1); |
| 431 | |
| 432 | // Start with the entry basic block being allocated with cost zero |
| 433 | DP[1][0] = 0; |
| 434 | // Walk through TSP solutions using a bitmask to represent state (current set |
| 435 | // of BBs in the layout) |
| 436 | uint64_t BestSet = 1; |
| 437 | uint64_t BestLast = 0; |
| 438 | int64_t BestWeight = 0; |
| 439 | for (uint64_t Set = 1; Set < (1ULL << N); ++Set) { |
| 440 | // Traverse each possibility of Last BB visited in this layout |
| 441 | for (uint64_t Last = 0; Last < N; ++Last) { |
| 442 | // Case 1: There is no possible layout with this BB as Last |
| 443 | if (DP[Set][Last] == -1) |
| 444 | continue; |
| 445 | |
| 446 | // Case 2: There is a layout with this Set and this Last, and we try |
| 447 | // to expand this set with New |
| 448 | for (uint64_t New = 1; New < N; ++New) { |
| 449 | // Case 2a: BB "New" is already in this Set |
| 450 | if ((Set & (1ULL << New)) != 0) |
| 451 | continue; |
| 452 | |
| 453 | // Case 2b: BB "New" is not in this set and we add it to this Set and |
| 454 | // record total weight of this layout with "New" as the last BB. |
| 455 | uint64_t NewSet = (Set | (1ULL << New)); |
| 456 | if (DP[NewSet][New] == -1) |
| 457 | DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; |
| 458 | DP[NewSet][New] = std::max(a: DP[NewSet][New], |
| 459 | b: DP[Set][Last] + (int64_t)Weight[Last][New]); |
| 460 | |
| 461 | if (DP[NewSet][New] > BestWeight) { |
| 462 | BestWeight = DP[NewSet][New]; |
| 463 | BestSet = NewSet; |
| 464 | BestLast = New; |
| 465 | } |
| 466 | } |
| 467 | } |
| 468 | } |
| 469 | |
| 470 | // Define final function layout based on layout that maximizes weight |
| 471 | uint64_t Last = BestLast; |
| 472 | uint64_t Set = BestSet; |
| 473 | BitVector Visited; |
| 474 | Visited.resize(N); |
| 475 | Visited[Last] = true; |
| 476 | Order.push_back(Elt: IndexToBB[Last]); |
| 477 | Set = Set & ~(1ULL << Last); |
| 478 | while (Set != 0) { |
| 479 | int64_t Best = -1; |
| 480 | uint64_t NewLast; |
| 481 | for (uint64_t I = 0; I < N; ++I) { |
| 482 | if (DP[Set][I] == -1) |
| 483 | continue; |
| 484 | int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0; |
| 485 | if (DP[Set][I] + AdjWeight > Best) { |
| 486 | NewLast = I; |
| 487 | Best = DP[Set][I] + AdjWeight; |
| 488 | } |
| 489 | } |
| 490 | Last = NewLast; |
| 491 | Visited[Last] = true; |
| 492 | Order.push_back(Elt: IndexToBB[Last]); |
| 493 | Set = Set & ~(1ULL << Last); |
| 494 | } |
| 495 | std::reverse(first: Order.begin(), last: Order.end()); |
| 496 | |
| 497 | // Finalize layout with BBs that weren't assigned to the layout using the |
| 498 | // input layout. |
| 499 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) |
| 500 | if (Visited[BB->getLayoutIndex()] == false) |
| 501 | Order.push_back(Elt: BB); |
| 502 | } |
| 503 | |
| 504 | void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF, |
| 505 | BasicBlockOrder &Order) const { |
| 506 | if (BF.getLayout().block_empty()) |
| 507 | return; |
| 508 | |
| 509 | // Do not change layout of functions w/o profile information |
| 510 | if (!BF.hasValidProfile() || BF.getLayout().block_size() <= 2) { |
| 511 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) |
| 512 | Order.push_back(Elt: BB); |
| 513 | return; |
| 514 | } |
| 515 | |
| 516 | // Create a separate MCCodeEmitter to allow lock-free execution |
| 517 | BinaryContext::IndependentCodeEmitter Emitter; |
| 518 | if (!opts::NoThreads) |
| 519 | Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter(); |
| 520 | |
| 521 | // Initialize CFG nodes and their data |
| 522 | std::vector<uint64_t> BlockSizes; |
| 523 | std::vector<uint64_t> BlockCounts; |
| 524 | BasicBlockOrder OrigOrder; |
| 525 | BF.getLayout().updateLayoutIndices(); |
| 526 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| 527 | uint64_t Size = std::max<uint64_t>(a: BB->estimateSize(Emitter: Emitter.MCE.get()), b: 1); |
| 528 | BlockSizes.push_back(x: Size); |
| 529 | BlockCounts.push_back(x: BB->getKnownExecutionCount()); |
| 530 | OrigOrder.push_back(Elt: BB); |
| 531 | } |
| 532 | |
| 533 | // Initialize CFG edges |
| 534 | std::vector<codelayout::EdgeCount> JumpCounts; |
| 535 | for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| 536 | auto BI = BB->branch_info_begin(); |
| 537 | for (BinaryBasicBlock *SuccBB : BB->successors()) { |
| 538 | assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && |
| 539 | "missing profile for a jump" ); |
| 540 | JumpCounts.push_back( |
| 541 | x: {.src: BB->getLayoutIndex(), .dst: SuccBB->getLayoutIndex(), .count: BI->Count}); |
| 542 | ++BI; |
| 543 | } |
| 544 | } |
| 545 | |
| 546 | // Run the layout algorithm |
| 547 | auto Result = |
| 548 | codelayout::computeExtTspLayout(NodeSizes: BlockSizes, NodeCounts: BlockCounts, EdgeCounts: JumpCounts); |
| 549 | Order.reserve(N: BF.getLayout().block_size()); |
| 550 | for (uint64_t R : Result) |
| 551 | Order.push_back(Elt: OrigOrder[R]); |
| 552 | } |
| 553 | |
| 554 | void OptimizeReorderAlgorithm::reorderBasicBlocks( |
| 555 | BinaryFunction &BF, BasicBlockOrder &Order) const { |
| 556 | if (BF.getLayout().block_empty()) |
| 557 | return; |
| 558 | |
| 559 | // Cluster basic blocks. |
| 560 | CAlgo->clusterBasicBlocks(BF); |
| 561 | |
| 562 | if (opts::PrintClusters) |
| 563 | CAlgo->printClusters(); |
| 564 | |
| 565 | // Arrange basic blocks according to clusters. |
| 566 | for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters) |
| 567 | Order.insert(I: Order.end(), From: Cluster.begin(), To: Cluster.end()); |
| 568 | } |
| 569 | |
| 570 | void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( |
| 571 | BinaryFunction &BF, BasicBlockOrder &Order) const { |
| 572 | if (BF.getLayout().block_empty()) |
| 573 | return; |
| 574 | |
| 575 | // Cluster basic blocks. |
| 576 | CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true); |
| 577 | std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; |
| 578 | std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges = |
| 579 | CAlgo->ClusterEdges; |
| 580 | |
| 581 | // Compute clusters' average frequencies. |
| 582 | CAlgo->computeClusterAverageFrequency(BC: BF.getBinaryContext()); |
| 583 | std::vector<double> &AvgFreq = CAlgo->AvgFreq; |
| 584 | |
| 585 | if (opts::PrintClusters) |
| 586 | CAlgo->printClusters(); |
| 587 | |
| 588 | // Cluster layout order |
| 589 | std::vector<uint32_t> ClusterOrder; |
| 590 | |
| 591 | // Do a topological sort for clusters, prioritizing frequently-executed BBs |
| 592 | // during the traversal. |
| 593 | std::stack<uint32_t> Stack; |
| 594 | std::vector<uint32_t> Status; |
| 595 | std::vector<uint32_t> Parent; |
| 596 | Status.resize(new_size: Clusters.size(), x: 0); |
| 597 | Parent.resize(new_size: Clusters.size(), x: 0); |
| 598 | constexpr uint32_t STACKED = 1; |
| 599 | constexpr uint32_t VISITED = 2; |
| 600 | Status[0] = STACKED; |
| 601 | Stack.push(x: 0); |
| 602 | while (!Stack.empty()) { |
| 603 | uint32_t I = Stack.top(); |
| 604 | if (!(Status[I] & VISITED)) { |
| 605 | Status[I] |= VISITED; |
| 606 | // Order successors by weight |
| 607 | auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { |
| 608 | return ClusterEdges[I][A] > ClusterEdges[I][B]; |
| 609 | }; |
| 610 | std::priority_queue<uint32_t, std::vector<uint32_t>, |
| 611 | decltype(ClusterComp)> |
| 612 | SuccQueue(ClusterComp); |
| 613 | for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) { |
| 614 | if (Target.second > 0 && !(Status[Target.first] & STACKED) && |
| 615 | !Clusters[Target.first].empty()) { |
| 616 | Parent[Target.first] = I; |
| 617 | Status[Target.first] = STACKED; |
| 618 | SuccQueue.push(x: Target.first); |
| 619 | } |
| 620 | } |
| 621 | while (!SuccQueue.empty()) { |
| 622 | Stack.push(x: SuccQueue.top()); |
| 623 | SuccQueue.pop(); |
| 624 | } |
| 625 | continue; |
| 626 | } |
| 627 | // Already visited this node |
| 628 | Stack.pop(); |
| 629 | ClusterOrder.push_back(x: I); |
| 630 | } |
| 631 | std::reverse(first: ClusterOrder.begin(), last: ClusterOrder.end()); |
| 632 | // Put unreachable clusters at the end |
| 633 | for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) |
| 634 | if (!(Status[I] & VISITED) && !Clusters[I].empty()) |
| 635 | ClusterOrder.push_back(x: I); |
| 636 | |
| 637 | // Sort nodes with equal precedence |
| 638 | auto Beg = ClusterOrder.begin(); |
| 639 | // Don't reorder the first cluster, which contains the function entry point |
| 640 | ++Beg; |
| 641 | std::stable_sort(first: Beg, last: ClusterOrder.end(), |
| 642 | comp: [&AvgFreq, &Parent](uint32_t A, uint32_t B) { |
| 643 | uint32_t P = Parent[A]; |
| 644 | while (Parent[P] != 0) { |
| 645 | if (Parent[P] == B) |
| 646 | return false; |
| 647 | P = Parent[P]; |
| 648 | } |
| 649 | P = Parent[B]; |
| 650 | while (Parent[P] != 0) { |
| 651 | if (Parent[P] == A) |
| 652 | return true; |
| 653 | P = Parent[P]; |
| 654 | } |
| 655 | return AvgFreq[A] > AvgFreq[B]; |
| 656 | }); |
| 657 | |
| 658 | if (opts::PrintClusters) { |
| 659 | errs() << "New cluster order: " ; |
| 660 | const char *Sep = "" ; |
| 661 | for (uint32_t O : ClusterOrder) { |
| 662 | errs() << Sep << O; |
| 663 | Sep = ", " ; |
| 664 | } |
| 665 | errs() << '\n'; |
| 666 | } |
| 667 | |
| 668 | // Arrange basic blocks according to cluster order. |
| 669 | for (uint32_t ClusterIndex : ClusterOrder) { |
| 670 | ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; |
| 671 | Order.insert(I: Order.end(), From: Cluster.begin(), To: Cluster.end()); |
| 672 | } |
| 673 | } |
| 674 | |
| 675 | void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( |
| 676 | BinaryFunction &BF, BasicBlockOrder &Order) const { |
| 677 | if (BF.getLayout().block_empty()) |
| 678 | return; |
| 679 | |
| 680 | const uint64_t ColdThreshold = |
| 681 | opts::ColdThreshold * |
| 682 | (*BF.getLayout().block_begin())->getExecutionCount() / 1000; |
| 683 | |
| 684 | // Cluster basic blocks. |
| 685 | CAlgo->clusterBasicBlocks(BF); |
| 686 | std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; |
| 687 | |
| 688 | // Compute clusters' average frequencies. |
| 689 | CAlgo->computeClusterAverageFrequency(BC: BF.getBinaryContext()); |
| 690 | std::vector<double> &AvgFreq = CAlgo->AvgFreq; |
| 691 | |
| 692 | if (opts::PrintClusters) |
| 693 | CAlgo->printClusters(); |
| 694 | |
| 695 | // Cluster layout order |
| 696 | std::vector<uint32_t> ClusterOrder; |
| 697 | |
| 698 | // Order clusters based on average instruction execution frequency |
| 699 | for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) |
| 700 | if (!Clusters[I].empty()) |
| 701 | ClusterOrder.push_back(x: I); |
| 702 | // Don't reorder the first cluster, which contains the function entry point |
| 703 | std::stable_sort( |
| 704 | first: std::next(x: ClusterOrder.begin()), last: ClusterOrder.end(), |
| 705 | comp: [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; }); |
| 706 | |
| 707 | if (opts::PrintClusters) { |
| 708 | errs() << "New cluster order: " ; |
| 709 | const char *Sep = "" ; |
| 710 | for (uint32_t O : ClusterOrder) { |
| 711 | errs() << Sep << O; |
| 712 | Sep = ", " ; |
| 713 | } |
| 714 | errs() << '\n'; |
| 715 | } |
| 716 | |
| 717 | // Arrange basic blocks according to cluster order. |
| 718 | for (uint32_t ClusterIndex : ClusterOrder) { |
| 719 | ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; |
| 720 | Order.insert(I: Order.end(), From: Cluster.begin(), To: Cluster.end()); |
| 721 | // Force zero execution count on clusters that do not meet the cut off |
| 722 | // specified by --cold-threshold. |
| 723 | if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold)) |
| 724 | for (BinaryBasicBlock *BBPtr : Cluster) |
| 725 | BBPtr->setExecutionCount(0); |
| 726 | } |
| 727 | } |
| 728 | |
| 729 | void ReverseReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF, |
| 730 | BasicBlockOrder &Order) const { |
| 731 | if (BF.getLayout().block_empty()) |
| 732 | return; |
| 733 | |
| 734 | BinaryBasicBlock *FirstBB = *BF.getLayout().block_begin(); |
| 735 | Order.push_back(Elt: FirstBB); |
| 736 | for (auto RLI = BF.getLayout().block_rbegin(); *RLI != FirstBB; ++RLI) |
| 737 | Order.push_back(Elt: *RLI); |
| 738 | } |
| 739 | |
| 740 | void RandomClusterReorderAlgorithm::reorderBasicBlocks( |
| 741 | BinaryFunction &BF, BasicBlockOrder &Order) const { |
| 742 | if (BF.getLayout().block_empty()) |
| 743 | return; |
| 744 | |
| 745 | // Cluster basic blocks. |
| 746 | CAlgo->clusterBasicBlocks(BF); |
| 747 | std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; |
| 748 | |
| 749 | if (opts::PrintClusters) |
| 750 | CAlgo->printClusters(); |
| 751 | |
| 752 | // Cluster layout order |
| 753 | std::vector<uint32_t> ClusterOrder; |
| 754 | |
| 755 | // Order clusters based on average instruction execution frequency |
| 756 | for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) |
| 757 | if (!Clusters[I].empty()) |
| 758 | ClusterOrder.push_back(x: I); |
| 759 | |
| 760 | std::shuffle(first: std::next(x: ClusterOrder.begin()), last: ClusterOrder.end(), |
| 761 | g: std::default_random_engine(opts::RandomSeed.getValue())); |
| 762 | |
| 763 | if (opts::PrintClusters) { |
| 764 | errs() << "New cluster order: " ; |
| 765 | const char *Sep = "" ; |
| 766 | for (uint32_t O : ClusterOrder) { |
| 767 | errs() << Sep << O; |
| 768 | Sep = ", " ; |
| 769 | } |
| 770 | errs() << '\n'; |
| 771 | } |
| 772 | |
| 773 | // Arrange basic blocks according to cluster order. |
| 774 | for (uint32_t ClusterIndex : ClusterOrder) { |
| 775 | ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; |
| 776 | Order.insert(I: Order.end(), From: Cluster.begin(), To: Cluster.end()); |
| 777 | } |
| 778 | } |
| 779 | |